@mcptoolshop/research-os 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +210 -0
- package/README.es.md +26 -1
- package/README.fr.md +30 -5
- package/README.hi.md +45 -5
- package/README.it.md +26 -1
- package/README.ja.md +30 -5
- package/README.md +39 -1
- package/README.pt-BR.md +26 -1
- package/README.zh.md +26 -1
- package/dist/calibration/aggregate-receipt-schema.d.ts +509 -0
- package/dist/calibration/aggregate-receipt-schema.js +143 -0
- package/dist/calibration/aggregate-receipt-schema.js.map +1 -0
- package/dist/calibration/aggregate.d.ts +35 -0
- package/dist/calibration/aggregate.js +454 -0
- package/dist/calibration/aggregate.js.map +1 -0
- package/dist/calibration/receipt-schema.d.ts +317 -0
- package/dist/calibration/receipt-schema.js +68 -0
- package/dist/calibration/receipt-schema.js.map +1 -0
- package/dist/calibration/receipt.d.ts +31 -0
- package/dist/calibration/receipt.js +151 -0
- package/dist/calibration/receipt.js.map +1 -0
- package/dist/cli.js +1957 -1253
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +140 -4
- package/dist/index.js +1499 -1168
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
2
|
+
import { z as z2 } from "zod";
|
|
3
|
+
|
|
4
|
+
// src/calibration/receipt-schema.ts
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
// Status labels a calibration receipt may carry.
var StatusLabelSchema = z.enum([
  "trusted_baseline",
  "conditional_pass",
  "failed",
  "comparison_only"
]);
// Architectures a receipt may describe.
var ArchitectureSchema = z.enum(["single-pass", "two-pass"]);
// Recall measurement: non-negative integer counts plus a ratio limited to [0, 1].
// NOTE(review): the schema does not itself check that ratio equals matched / total.
var RecallSchema = z.object({
  matched: z.number().int().nonnegative(),
  total: z.number().int().nonnegative(),
  ratio: z.number().min(0).max(1)
});
// Recall measurements keyed by an arbitrary string (the category name).
var PerCategoryRecallSchema = z.record(z.string(), RecallSchema);
// Verdict per bar. Every bar is PASS/FAIL except latency_soft, which is PASS/WARN.
var PassFailSchema = z.object({
  fp_ceiling: z.enum(["PASS", "FAIL"]),
  any_flag_recall_floor: z.enum(["PASS", "FAIL"]),
  per_category_any_flag_floor: z.enum(["PASS", "FAIL"]),
  strict_recall_floor: z.enum(["PASS", "FAIL"]),
  decision_vocab_completeness: z.enum(["PASS", "FAIL"]),
  latency_soft: z.enum(["PASS", "WARN"]),
  latency_hard: z.enum(["PASS", "FAIL"]),
  empty_or_malformed: z.enum(["PASS", "FAIL"]),
  overall: z.enum(["PASS", "FAIL"])
});
// Decision-vocabulary bar: required count (positive integer) versus produced
// count (non-negative integer) for a given architecture, plus the verdict.
var DecisionVocabBarSchema = z.object({
  architecture: ArchitectureSchema,
  required: z.number().int().positive(),
  produced: z.number().int().nonnegative(),
  passed: z.boolean()
});
// Single-run calibration receipt. schema_version is pinned to the literal 1.
var CalibrationReceiptSchema = z.object({
  schema_version: z.literal(1),
  profile_name: z.string(),
  status: StatusLabelSchema,
  model: z.string(),
  architecture: ArchitectureSchema,
  fixture: z.string(),
  // Claim counts: total must be positive; good/bad may be zero.
  fixture_total_claims: z.number().int().positive(),
  fixture_good_claims: z.number().int().nonnegative(),
  fixture_bad_claims: z.number().int().nonnegative(),
  // Plain string; no date format is enforced by this schema.
  calibrated_at: z.string(),
  research_os_version: z.string(),
  runtime_ms: z.number().int().nonnegative(),
  good_fp_count: z.number().int().nonnegative(),
  any_flag_recall: RecallSchema,
  strict_recall: RecallSchema,
  per_category_any_flag: PerCategoryRecallSchema,
  per_category_strict: PerCategoryRecallSchema,
  // Decision name -> non-negative integer count.
  decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),
  decisions_produced_count: z.number().int().nonnegative(),
  decision_vocab_bar: DecisionVocabBarSchema,
  unreachable_decisions: z.array(z.string()),
  empty_or_malformed_responses: z.number().int().nonnegative(),
  pass_fail: PassFailSchema,
  notes: z.array(z.string())
});
|
|
62
|
+
|
|
63
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
64
|
+
// Summary of one numeric metric across runs.
var AggregateMetricSchema = z2.object({
  median: z2.number(),
  min: z2.number(),
  max: z2.number(),
  // per-run values in run order (run-001, run-002, ...)
  values: z2.array(z2.number())
});
// Per-category recall summary; the three summary ratios are limited to [0, 1].
var PerCategoryAggregateEntrySchema = z2.object({
  median_ratio: z2.number().min(0).max(1),
  min_ratio: z2.number().min(0).max(1),
  max_ratio: z2.number().min(0).max(1),
  // seed count — same across all runs
  total: z2.number().int().nonnegative(),
  // NOTE(review): unlike the summary ratios above, these entries carry no [0, 1] bound.
  per_run_ratios: z2.array(z2.number())
});
// Per-category summaries keyed by an arbitrary string (the category name).
var PerCategoryAggregateSchema = z2.record(z2.string(), PerCategoryAggregateEntrySchema);
// Verdict per bar. Every bar is PASS/FAIL except latency_soft, which is PASS/WARN.
var AggregatePassFailSchema = z2.object({
  fp_ceiling: z2.enum(["PASS", "FAIL"]),
  any_flag_recall_floor: z2.enum(["PASS", "FAIL"]),
  per_category_any_flag_floor: z2.enum(["PASS", "FAIL"]),
  strict_recall_floor: z2.enum(["PASS", "FAIL"]),
  decision_vocab_completeness: z2.enum(["PASS", "FAIL"]),
  latency_soft: z2.enum(["PASS", "WARN"]),
  latency_hard: z2.enum(["PASS", "FAIL"]),
  empty_or_malformed: z2.enum(["PASS", "FAIL"]),
  overall: z2.enum(["PASS", "FAIL"])
});
// Decision-vocabulary bar for an aggregate receipt.
var AggregateDecisionVocabBarSchema = z2.object({
  architecture: ArchitectureSchema,
  required: z2.number().int().positive(),
  // float — median of per-run decisions_produced_count
  median_produced: z2.number(),
  passed: z2.boolean()
});
// Multi-run (aggregate) calibration receipt. schema_version is pinned to 1.
var AggregateCalibrationReceiptSchema = z2.object({
  schema_version: z2.literal(1),
  // discriminates from single-run receipt
  receipt_kind: z2.literal("aggregate"),
  profile_name: z2.string(),
  status: StatusLabelSchema,
  model: z2.string(),
  architecture: ArchitectureSchema,
  fixture: z2.string(),
  fixture_total_claims: z2.number().int().positive(),
  fixture_good_claims: z2.number().int().nonnegative(),
  fixture_bad_claims: z2.number().int().nonnegative(),
  // An aggregate receipt must cover at least two runs.
  runs_count: z2.number().int().min(2),
  // relative paths: runs/run-001.json, etc.
  run_files: z2.array(z2.string()),
  // ISO 8601 (the format is not enforced here; any string validates)
  aggregated_at: z2.string(),
  research_os_version: z2.string(),
  // Aggregate metrics — median + min + max + per-run values in run order
  good_fp_count: AggregateMetricSchema,
  any_flag_recall_ratio: AggregateMetricSchema,
  strict_recall_ratio: AggregateMetricSchema,
  decisions_produced_count: AggregateMetricSchema,
  runtime_ms: AggregateMetricSchema,
  empty_or_malformed_responses: AggregateMetricSchema,
  per_category_any_flag: PerCategoryAggregateSchema,
  per_category_strict: PerCategoryAggregateSchema,
  // Decision vocabulary — union of all decisions seen across runs, median count each
  decision_vocabulary: z2.record(z2.string(), AggregateMetricSchema),
  decision_vocab_bar: AggregateDecisionVocabBarSchema,
  unreachable_decisions: z2.array(z2.string()),
  pass_fail: AggregatePassFailSchema,
  // Bars that FAILed in >= ceil(runs_count/2) individual runs.
  // Non-empty list demotes trusted_baseline to conditional_pass.
  recurring_bar_failures: z2.array(z2.string()),
  notes: z2.array(z2.string())
});
|
|
135
|
+
export {
|
|
136
|
+
AggregateCalibrationReceiptSchema,
|
|
137
|
+
AggregateDecisionVocabBarSchema,
|
|
138
|
+
AggregateMetricSchema,
|
|
139
|
+
AggregatePassFailSchema,
|
|
140
|
+
PerCategoryAggregateEntrySchema,
|
|
141
|
+
PerCategoryAggregateSchema
|
|
142
|
+
};
|
|
143
|
+
//# sourceMappingURL=aggregate-receipt-schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: 
z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const 
RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n});\n\nexport type StatusLabel = 
z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,SAAS;AAEX,IAAM,oBAAoB,EAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqB,EAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAe,EAAE,OAAO;AAAA,EACnC,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0B,EAAE,OAAO,EAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,YAAY,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAAS,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyB,EAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQ,EAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,2BAA2B,EAAE,OAAO;AAAA,EAC/C,gBAAgB,EAAE,QAAQ,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAO,EAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAAS,EAAE,OAAO;AAAA,EAClB,sBAAsB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAe,EAAE,OAAO;AAAA,EACxB,qBAAqB,EAAE,OAAO;AAAA,EAC9B,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAA
qB;AAAA,EACrB,qBAAqB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACzC,8BAA8B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;AAC3B,CAAC;;;AD5DM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B
,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,qBAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAC3B,CAAC;","names":["z","z"]}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { PerCategoryRecall, CalibrationReceipt, Architecture, StatusLabel, PassFail } from './receipt-schema.js';
|
|
2
|
+
import { AggregateMetric, PerCategoryAggregate, AggregateCalibrationReceipt, AggregatePassFail } from './aggregate-receipt-schema.js';
|
|
3
|
+
import 'zod';
|
|
4
|
+
|
|
5
|
+
/** Median of `values` (mean of the two middle values for an even count). The implementation throws on an empty array. */
declare function median(values: number[]): number;
/** Summarizes `values` as median, min and max, and carries the per-run values along. */
declare function aggregateMetric(values: number[]): AggregateMetric;
/**
 * Summarizes recall ratios per category across runs. A run that lacks a
 * category contributes a ratio of 0 for it.
 */
declare function aggregatePerCategoryRecall(perRunBuckets: PerCategoryRecall[]): PerCategoryAggregate;
/**
 * Summarizes decision counts across runs over the union of decision names.
 * A run that lacks a decision contributes a count of 0 for it.
 */
declare function aggregateDecisionVocabulary(perRunDicts: Record<string, number>[]): Record<string, AggregateMetric>;
/**
 * Derives the per-bar verdicts from aggregate metrics. `overall` is PASS only
 * when every bar other than `latency_soft` is PASS.
 */
declare function computeAggregatePassFail(input: {
    good_fp_count: AggregateMetric;
    any_flag_recall_ratio: AggregateMetric;
    per_category_any_flag: PerCategoryAggregate;
    strict_recall_ratio: AggregateMetric;
    decisions_produced_count: AggregateMetric;
    architecture: Architecture;
    runtime_ms: AggregateMetric;
    empty_or_malformed_responses: AggregateMetric;
}): AggregatePassFail;
/** Lists the hard bars whose per-run FAIL count reaches ceil(totalRuns / 2). */
declare function computeRecurringBarFailures(perRunPassFails: PassFail[], totalRuns: number): string[];
/**
 * Chooses the status label for an aggregate receipt. `modeOverride` wins over
 * every other input; otherwise the label follows from the profile name,
 * architecture, overall verdict, median good-claim FP count and recurring failures.
 */
declare function computeAggregateStatusLabel(input: {
    profileName: string;
    architecture: Architecture;
    aggregatePassFail: AggregatePassFail;
    medianGoodFpCount: number;
    recurringBarFailures: string[];
    modeOverride?: 'comparison_only';
}): StatusLabel;
/**
 * Combines single-run receipts into one aggregate receipt. The implementation
 * throws when `runs` is empty.
 * NOTE(review): `aggregatedAt` presumably overrides the timestamp written to
 * `aggregated_at` — confirm against the implementation.
 */
declare function aggregateReceipts(runs: CalibrationReceipt[], opts: {
    runFiles: string[];
    modeOverride?: 'comparison_only';
    aggregatedAt?: string;
}): AggregateCalibrationReceipt;
/** NOTE(review): presumably renders the aggregate receipt as Markdown text — implementation not visible here; confirm. */
declare function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string;
|
|
34
|
+
|
|
35
|
+
export { aggregateDecisionVocabulary, aggregateMetric, aggregatePerCategoryRecall, aggregateReceipts, buildAggregateReceiptMarkdown, computeAggregatePassFail, computeAggregateStatusLabel, computeRecurringBarFailures, median };
|
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
2
|
+
import { z as z2 } from "zod";
|
|
3
|
+
|
|
4
|
+
// src/calibration/receipt-schema.ts
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
// Status labels a calibration receipt may carry.
var StatusLabelSchema = z.enum([
  "trusted_baseline",
  "conditional_pass",
  "failed",
  "comparison_only"
]);
// Architectures a receipt may describe.
var ArchitectureSchema = z.enum(["single-pass", "two-pass"]);
// Recall measurement: non-negative integer counts plus a ratio limited to [0, 1].
// NOTE(review): the schema does not itself check that ratio equals matched / total.
var RecallSchema = z.object({
  matched: z.number().int().nonnegative(),
  total: z.number().int().nonnegative(),
  ratio: z.number().min(0).max(1)
});
// Recall measurements keyed by an arbitrary string (the category name).
var PerCategoryRecallSchema = z.record(z.string(), RecallSchema);
// Verdict per bar. Every bar is PASS/FAIL except latency_soft, which is PASS/WARN.
var PassFailSchema = z.object({
  fp_ceiling: z.enum(["PASS", "FAIL"]),
  any_flag_recall_floor: z.enum(["PASS", "FAIL"]),
  per_category_any_flag_floor: z.enum(["PASS", "FAIL"]),
  strict_recall_floor: z.enum(["PASS", "FAIL"]),
  decision_vocab_completeness: z.enum(["PASS", "FAIL"]),
  latency_soft: z.enum(["PASS", "WARN"]),
  latency_hard: z.enum(["PASS", "FAIL"]),
  empty_or_malformed: z.enum(["PASS", "FAIL"]),
  overall: z.enum(["PASS", "FAIL"])
});
// Decision-vocabulary bar: required count (positive integer) versus produced
// count (non-negative integer) for a given architecture, plus the verdict.
var DecisionVocabBarSchema = z.object({
  architecture: ArchitectureSchema,
  required: z.number().int().positive(),
  produced: z.number().int().nonnegative(),
  passed: z.boolean()
});
// Single-run calibration receipt. schema_version is pinned to the literal 1.
var CalibrationReceiptSchema = z.object({
  schema_version: z.literal(1),
  profile_name: z.string(),
  status: StatusLabelSchema,
  model: z.string(),
  architecture: ArchitectureSchema,
  fixture: z.string(),
  // Claim counts: total must be positive; good/bad may be zero.
  fixture_total_claims: z.number().int().positive(),
  fixture_good_claims: z.number().int().nonnegative(),
  fixture_bad_claims: z.number().int().nonnegative(),
  // Plain string; no date format is enforced by this schema.
  calibrated_at: z.string(),
  research_os_version: z.string(),
  runtime_ms: z.number().int().nonnegative(),
  good_fp_count: z.number().int().nonnegative(),
  any_flag_recall: RecallSchema,
  strict_recall: RecallSchema,
  per_category_any_flag: PerCategoryRecallSchema,
  per_category_strict: PerCategoryRecallSchema,
  // Decision name -> non-negative integer count.
  decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),
  decisions_produced_count: z.number().int().nonnegative(),
  decision_vocab_bar: DecisionVocabBarSchema,
  unreachable_decisions: z.array(z.string()),
  empty_or_malformed_responses: z.number().int().nonnegative(),
  pass_fail: PassFailSchema,
  notes: z.array(z.string())
});
|
|
62
|
+
|
|
63
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
64
|
+
// Summary of one numeric metric across runs.
var AggregateMetricSchema = z2.object({
  median: z2.number(),
  min: z2.number(),
  max: z2.number(),
  // per-run values in run order (run-001, run-002, ...)
  values: z2.array(z2.number())
});
// Per-category recall summary; the three summary ratios are limited to [0, 1].
var PerCategoryAggregateEntrySchema = z2.object({
  median_ratio: z2.number().min(0).max(1),
  min_ratio: z2.number().min(0).max(1),
  max_ratio: z2.number().min(0).max(1),
  // seed count — same across all runs
  total: z2.number().int().nonnegative(),
  // NOTE(review): unlike the summary ratios above, these entries carry no [0, 1] bound.
  per_run_ratios: z2.array(z2.number())
});
// Per-category summaries keyed by an arbitrary string (the category name).
var PerCategoryAggregateSchema = z2.record(z2.string(), PerCategoryAggregateEntrySchema);
// Verdict per bar. Every bar is PASS/FAIL except latency_soft, which is PASS/WARN.
var AggregatePassFailSchema = z2.object({
  fp_ceiling: z2.enum(["PASS", "FAIL"]),
  any_flag_recall_floor: z2.enum(["PASS", "FAIL"]),
  per_category_any_flag_floor: z2.enum(["PASS", "FAIL"]),
  strict_recall_floor: z2.enum(["PASS", "FAIL"]),
  decision_vocab_completeness: z2.enum(["PASS", "FAIL"]),
  latency_soft: z2.enum(["PASS", "WARN"]),
  latency_hard: z2.enum(["PASS", "FAIL"]),
  empty_or_malformed: z2.enum(["PASS", "FAIL"]),
  overall: z2.enum(["PASS", "FAIL"])
});
// Decision-vocabulary bar for an aggregate receipt.
var AggregateDecisionVocabBarSchema = z2.object({
  architecture: ArchitectureSchema,
  required: z2.number().int().positive(),
  // float — median of per-run decisions_produced_count
  median_produced: z2.number(),
  passed: z2.boolean()
});
// Multi-run (aggregate) calibration receipt. schema_version is pinned to 1.
var AggregateCalibrationReceiptSchema = z2.object({
  schema_version: z2.literal(1),
  // discriminates from single-run receipt
  receipt_kind: z2.literal("aggregate"),
  profile_name: z2.string(),
  status: StatusLabelSchema,
  model: z2.string(),
  architecture: ArchitectureSchema,
  fixture: z2.string(),
  fixture_total_claims: z2.number().int().positive(),
  fixture_good_claims: z2.number().int().nonnegative(),
  fixture_bad_claims: z2.number().int().nonnegative(),
  // An aggregate receipt must cover at least two runs.
  runs_count: z2.number().int().min(2),
  // relative paths: runs/run-001.json, etc.
  run_files: z2.array(z2.string()),
  // ISO 8601 (the format is not enforced here; any string validates)
  aggregated_at: z2.string(),
  research_os_version: z2.string(),
  // Aggregate metrics — median + min + max + per-run values in run order
  good_fp_count: AggregateMetricSchema,
  any_flag_recall_ratio: AggregateMetricSchema,
  strict_recall_ratio: AggregateMetricSchema,
  decisions_produced_count: AggregateMetricSchema,
  runtime_ms: AggregateMetricSchema,
  empty_or_malformed_responses: AggregateMetricSchema,
  per_category_any_flag: PerCategoryAggregateSchema,
  per_category_strict: PerCategoryAggregateSchema,
  // Decision vocabulary — union of all decisions seen across runs, median count each
  decision_vocabulary: z2.record(z2.string(), AggregateMetricSchema),
  decision_vocab_bar: AggregateDecisionVocabBarSchema,
  unreachable_decisions: z2.array(z2.string()),
  pass_fail: AggregatePassFailSchema,
  // Bars that FAILed in >= ceil(runs_count/2) individual runs.
  // Non-empty list demotes trusted_baseline to conditional_pass.
  recurring_bar_failures: z2.array(z2.string()),
  notes: z2.array(z2.string())
});
|
|
135
|
+
|
|
136
|
+
// src/calibration/aggregate.ts
|
|
137
|
+
/**
 * Returns the median of a list of numbers.
 *
 * For an odd count this is the middle element of the sorted list; for an even
 * count it is the arithmetic mean of the two middle elements.
 *
 * @param {number[]} values - Numbers to summarize; the array is not mutated.
 * @returns {number} The median value.
 * @throws {Error} When `values` is empty.
 */
function median(values) {
  const count = values.length;
  if (count === 0) throw new Error("median: empty array");

  // Work on a copy, with an explicit numeric comparator (the default sort is lexicographic).
  const ordered = Array.from(values);
  ordered.sort((left, right) => left - right);

  const upper = Math.floor(count / 2);
  const hasSingleMiddle = count % 2 === 1;
  return hasSingleMiddle ? ordered[upper] : (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
144
|
+
/**
 * Summarizes a list of per-run values as median, min and max.
 *
 * @param {number[]} values - Per-run values; the same array (not a copy) is
 *   returned under `values`.
 * @returns {{median: number, min: number, max: number, values: number[]}}
 * @throws {Error} When `values` is empty (raised by `median`).
 */
function aggregateMetric(values) {
  // median() runs first so an empty input fails before min/max are computed.
  const center = median(values);
  const smallest = Math.min(...values);
  const largest = Math.max(...values);
  return { median: center, min: smallest, max: largest, values };
}
|
|
153
|
+
/**
 * Summarizes per-category recall across runs.
 *
 * The result covers the union of category names seen in any run. A run that
 * does not contain a category contributes a ratio of 0 for it. `total` is
 * taken from the first run that contains the category (0 if none does).
 *
 * Only a run's own properties count as categories: a category whose name
 * matches an inherited member (for example "toString") is no longer mistaken
 * for being present in runs that lack it.
 *
 * @param {Array<Record<string, {matched: number, total: number, ratio: number}>>} perRunBuckets
 *   One per-category recall record per run, in run order.
 * @returns {Record<string, {median_ratio: number, min_ratio: number, max_ratio: number, total: number, per_run_ratios: number[]}>}
 */
function aggregatePerCategoryRecall(perRunBuckets) {
  const hasOwn = Object.prototype.hasOwnProperty;
  // Own-property lookup so inherited members are never read as buckets.
  const bucketOf = (run, cat) => (hasOwn.call(run, cat) ? run[cat] : void 0);

  const cats = /* @__PURE__ */ new Set();
  for (const run of perRunBuckets) {
    for (const cat of Object.keys(run)) cats.add(cat);
  }

  const result = {};
  for (const cat of cats) {
    const buckets = perRunBuckets.map((run) => bucketOf(run, cat));
    const ratios = buckets.map((bucket) => bucket?.ratio ?? 0);
    const total = buckets.find((bucket) => bucket !== void 0)?.total ?? 0;
    result[cat] = {
      median_ratio: median(ratios),
      min_ratio: Math.min(...ratios),
      max_ratio: Math.max(...ratios),
      total,
      per_run_ratios: ratios
    };
  }
  return result;
}
|
|
172
|
+
/**
 * Summarizes decision counts across runs.
 *
 * The result covers the union of decision names seen in any run; a run that
 * does not contain a decision contributes a count of 0 for it.
 *
 * Only a run's own properties are read: a decision whose name matches an
 * inherited member (for example "constructor") yields 0 for runs that lack
 * it, instead of the inherited function leaking into the numeric values.
 *
 * @param {Array<Record<string, number>>} perRunDicts - One decision-count
 *   record per run, in run order.
 * @returns {Record<string, {median: number, min: number, max: number, values: number[]}>}
 */
function aggregateDecisionVocabulary(perRunDicts) {
  const hasOwn = Object.prototype.hasOwnProperty;

  const decisions = /* @__PURE__ */ new Set();
  for (const run of perRunDicts) {
    for (const d of Object.keys(run)) decisions.add(d);
  }

  const result = {};
  for (const d of decisions) {
    // Own-property check first; `??` still maps an own null/undefined count to 0.
    const values = perRunDicts.map((run) => (hasOwn.call(run, d) ? run[d] ?? 0 : 0));
    result[d] = aggregateMetric(values);
  }
  return result;
}
|
|
184
|
+
/**
 * Derives the per-bar verdicts for an aggregate receipt.
 *
 * Bars, in the order they are evaluated:
 * - fp_ceiling: median good-claim FP count <= 1 and max <= 2
 * - any_flag_recall_floor: median any-flag recall ratio >= 0.65
 * - per_category_any_flag_floor: no category with total >= 2 has a median ratio < 0.5
 * - strict_recall_floor: median strict recall ratio >= 0.2
 * - decision_vocab_completeness: median decisions produced >= 3 ("two-pass") or >= 4 (otherwise)
 * - latency_soft: median runtime <= 600000 ms (WARN instead of FAIL)
 * - latency_hard: max runtime <= 1200000 ms
 * - empty_or_malformed: no run had an empty or malformed response (max === 0)
 *
 * `overall` is PASS only when every bar except `latency_soft` is PASS.
 *
 * @param {object} input - Aggregate metrics plus the architecture.
 * @returns {object} Verdict per bar plus `overall`.
 */
function computeAggregatePassFail(input) {
  const verdict = (ok, otherwise = "FAIL") => (ok ? "PASS" : otherwise);
  const hasWeakCategory = (buckets) =>
    Object.values(buckets).some((bucket) => bucket.total >= 2 && bucket.median_ratio < 0.5);

  const bars = {
    fp_ceiling: verdict(input.good_fp_count.median <= 1 && input.good_fp_count.max <= 2),
    any_flag_recall_floor: verdict(input.any_flag_recall_ratio.median >= 0.65),
    per_category_any_flag_floor: verdict(!hasWeakCategory(input.per_category_any_flag)),
    strict_recall_floor: verdict(input.strict_recall_ratio.median >= 0.2),
    decision_vocab_completeness: verdict(
      input.decisions_produced_count.median >= (input.architecture === "two-pass" ? 3 : 4)
    ),
    latency_soft: verdict(input.runtime_ms.median <= 600000, "WARN"),
    latency_hard: verdict(input.runtime_ms.max <= 1200000),
    empty_or_malformed: verdict(input.empty_or_malformed_responses.max === 0)
  };

  // latency_soft is advisory only; it never affects the overall verdict.
  const hardBarsAllPass = Object.keys(bars)
    .filter((bar) => bar !== "latency_soft")
    .every((bar) => bars[bar] === "PASS");

  return { ...bars, overall: verdict(hardBarsAllPass) };
}
|
|
222
|
+
/**
 * Lists the hard bars that failed in at least half of the individual runs.
 *
 * A bar is "recurring" when its per-run FAIL count reaches
 * ceil(totalRuns / 2). `latency_soft` and `overall` are not hard bars and are
 * never reported.
 *
 * The threshold is clamped to at least 1: with `totalRuns` of 0 the unclamped
 * threshold would be 0 and every bar would be reported as a recurring failure
 * even though nothing failed.
 *
 * @param {Array<object>} perRunPassFails - One pass/fail record per run.
 * @param {number} totalRuns - Number of runs used to derive the threshold.
 * @returns {string[]} Names of recurring failing bars, in hard-bar order.
 */
function computeRecurringBarFailures(perRunPassFails, totalRuns) {
  const HARD_BARS = [
    "fp_ceiling",
    "any_flag_recall_floor",
    "per_category_any_flag_floor",
    "strict_recall_floor",
    "decision_vocab_completeness",
    "latency_hard",
    "empty_or_malformed"
  ];
  // A bar must fail at least once to count, hence the lower bound of 1.
  const threshold = Math.max(1, Math.ceil(totalRuns / 2));
  return HARD_BARS.filter((bar) => {
    const failCount = perRunPassFails.filter((pf) => pf[bar] === "FAIL").length;
    return failCount >= threshold;
  });
}
|
|
240
|
+
/**
 * Chooses the status label for an aggregate receipt.
 *
 * Checks run in this order:
 *   1. modeOverride of "comparison_only"            -> "comparison_only"
 *   2. single-pass profile whose name matches /hermes/i -> "comparison_only"
 *   3. aggregate overall result is "FAIL"           -> "failed"
 *   4. two-pass hermes profile with a median good-FP count of 0 and no
 *      recurring bar failures                       -> "trusted_baseline"
 *   5. anything else                                -> "conditional_pass"
 *
 * @param {Object} input - Reads modeOverride, architecture, profileName,
 *   aggregatePassFail.overall, medianGoodFpCount and recurringBarFailures.
 * @returns {string} One of the labels listed above.
 */
function computeAggregateStatusLabel(input) {
  const { modeOverride, architecture, profileName } = input;
  if (modeOverride === "comparison_only") {
    return "comparison_only";
  }

  const namedHermes = /hermes/i.test(profileName);
  if (namedHermes && architecture === "single-pass") {
    return "comparison_only";
  }

  if (input.aggregatePassFail.overall === "FAIL") {
    return "failed";
  }

  // Anything that is not a two-pass hermes profile can only be conditional.
  if (!namedHermes || architecture !== "two-pass") {
    return "conditional_pass";
  }
  if (input.medianGoodFpCount !== 0) {
    return "conditional_pass";
  }
  return input.recurringBarFailures.length === 0 ? "trusted_baseline" : "conditional_pass";
}
|
|
252
|
+
/**
 * Folds a non-empty list of per-run calibration receipts into a single
 * aggregate receipt.
 *
 * Numeric fields are summarised across runs with aggregateMetric; the
 * per-category recall maps and the decision vocabulary go through their own
 * aggregators. Profile name, model, architecture, fixture details,
 * research_os_version and unreachable_decisions are copied from the first
 * run only.
 *
 * @param {Array<Object>} runs - Per-run receipts; must contain at least one.
 * @param {Object} opts - Reads runFiles, modeOverride and, optionally,
 *   aggregatedAt (defaults to the current time as an ISO string).
 * @returns {Object} The value returned by AggregateCalibrationReceiptSchema.parse.
 * @throws {Error} When `runs` is empty.
 */
function aggregateReceipts(runs, opts) {
  if (runs.length === 0) {
    throw new Error("aggregateReceipts: no runs provided");
  }
  const head = runs[0];
  const summarize = (select) => aggregateMetric(runs.map(select));

  const falsePositives = summarize((run) => run.good_fp_count);
  const anyFlagRatio = summarize((run) => run.any_flag_recall.ratio);
  const strictRatio = summarize((run) => run.strict_recall.ratio);
  const decisionsProduced = summarize((run) => run.decisions_produced_count);
  const runtime = summarize((run) => run.runtime_ms);
  const emptyOrMalformed = summarize((run) => run.empty_or_malformed_responses);

  const categoryAnyFlag = aggregatePerCategoryRecall(runs.map((run) => run.per_category_any_flag));
  const categoryStrict = aggregatePerCategoryRecall(runs.map((run) => run.per_category_strict));
  const vocabulary = aggregateDecisionVocabulary(runs.map((run) => run.decision_vocabulary));

  // Two-pass profiles need 3 distinct decisions, every other architecture 4.
  const requiredDecisions = head.architecture === "two-pass" ? 3 : 4;
  const vocabBar = {
    architecture: head.architecture,
    required: requiredDecisions,
    median_produced: decisionsProduced.median,
    passed: decisionsProduced.median >= requiredDecisions
  };

  const passFail = computeAggregatePassFail({
    good_fp_count: falsePositives,
    any_flag_recall_ratio: anyFlagRatio,
    per_category_any_flag: categoryAnyFlag,
    strict_recall_ratio: strictRatio,
    decisions_produced_count: decisionsProduced,
    architecture: head.architecture,
    runtime_ms: runtime,
    empty_or_malformed_responses: emptyOrMalformed
  });
  const recurring = computeRecurringBarFailures(
    runs.map((run) => run.pass_fail),
    runs.length
  );
  const status = computeAggregateStatusLabel({
    profileName: head.profile_name,
    architecture: head.architecture,
    aggregatePassFail: passFail,
    medianGoodFpCount: falsePositives.median,
    recurringBarFailures: recurring,
    modeOverride: opts.modeOverride
  });

  // Human-readable caveats, appended in a fixed order.
  const notes = [];
  if (passFail.latency_soft === "WARN") {
    const medianSeconds = (runtime.median / 1e3).toFixed(1);
    notes.push(`Latency warning: median ${medianSeconds}s exceeds soft limit of 600s`);
  }
  if (falsePositives.median > 0) {
    notes.push(`FP at ceiling: median ${falsePositives.median} false positive(s) on good claims`);
  }
  if (recurring.length > 0) {
    notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurring.join(", ")}`);
  }
  switch (status) {
    case "comparison_only":
      notes.push("comparison_only: architectural side-run, not a production admission candidate");
      break;
    case "conditional_pass":
      notes.push("conditional_pass: passes all bars but carries a production caution");
      break;
    default:
      break;
  }

  const aggregatedAt = opts.aggregatedAt ?? new Date().toISOString();
  return AggregateCalibrationReceiptSchema.parse({
    schema_version: 1,
    receipt_kind: "aggregate",
    profile_name: head.profile_name,
    status,
    model: head.model,
    architecture: head.architecture,
    fixture: head.fixture,
    fixture_total_claims: head.fixture_total_claims,
    fixture_good_claims: head.fixture_good_claims,
    fixture_bad_claims: head.fixture_bad_claims,
    runs_count: runs.length,
    run_files: opts.runFiles,
    aggregated_at: aggregatedAt,
    research_os_version: head.research_os_version,
    good_fp_count: falsePositives,
    any_flag_recall_ratio: anyFlagRatio,
    strict_recall_ratio: strictRatio,
    decisions_produced_count: decisionsProduced,
    runtime_ms: runtime,
    empty_or_malformed_responses: emptyOrMalformed,
    per_category_any_flag: categoryAnyFlag,
    per_category_strict: categoryStrict,
    decision_vocabulary: vocabulary,
    decision_vocab_bar: vocabBar,
    unreachable_decisions: head.unreachable_decisions,
    pass_fail: passFail,
    recurring_bar_failures: recurring,
    notes
  });
}
|
|
346
|
+
/**
 * Renders an aggregate calibration receipt as a Markdown report.
 *
 * Sections, in order: header bullet list, headline metrics, aggregate
 * PASS/FAIL table, recurring hard-bar failures, per-category recall table,
 * decision-vocabulary table, per-run summary table, and a Notes section that
 * is only emitted when `r.notes` is non-empty.
 *
 * @param {Object} r - Aggregate receipt. Reads the identity fields, the
 *   aggregated metric objects (median/min/max/values), per_category_any_flag,
 *   per_category_strict, decision_vocabulary, decision_vocab_bar,
 *   unreachable_decisions, pass_fail, recurring_bar_failures, run_files and
 *   notes.
 * @returns {string} The Markdown document.
 */
function buildAggregateReceiptMarkdown(r) {
  const pct = (ratio) => `${Math.round(ratio * 100)}%`;
  const secRounded = (ms) => `${(ms / 1e3).toFixed(1)}s`;
  const af = r.any_flag_recall_ratio;
  const sr = r.strict_recall_ratio;
  const fp = r.good_fp_count;
  const dec = r.decisions_produced_count;
  const rt = r.runtime_ms;
  const pf = r.pass_fail;
  const bar = r.decision_vocab_bar;

  // The file list is abbreviated as "first … last"; a lone file is shown
  // once instead of being repeated on both sides of the ellipsis.
  let runFileList = "(none)";
  if (r.run_files.length === 1) {
    runFileList = `${r.run_files[0]}`;
  } else if (r.run_files.length > 1) {
    runFileList = `${r.run_files[0]} \u2026 ${r.run_files[r.run_files.length - 1]}`;
  }

  // One row per any-flag category; strict columns fall back to dashes when
  // the strict map has no entry for that category.
  const perCatAnyFlagRows = Object.entries(r.per_category_any_flag).map(([cat, entry]) => {
    const st = r.per_category_strict[cat];
    const anyFlagCells = `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}\u2013${pct(entry.max_ratio)} | ${entry.total} |`;
    const strictCells = st ? ` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}\u2013${pct(st.max_ratio)} |` : " \u2014 | \u2014 |";
    return anyFlagCells + strictCells;
  }).join("\n");

  const ALL_DECISIONS = [
    "accepted_for_synthesis",
    "rejected",
    "needs_scope_repair",
    "needs_source_repair",
    "needs_contradiction_mapping",
    "needs_human_review"
  ];
  const dvRows = ALL_DECISIONS.map((d) => {
    const metric = r.decision_vocabulary[d];
    const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${r.fixture})` : "";
    // The unreachable note stays inside the Range cell in both branches so
    // every row has exactly three columns.
    if (!metric) return `| ${d} | \u2014 | \u2014${unreachable} |`;
    return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}\u2013${metric.max}${unreachable} |`;
  }).join("\n");

  // Per-run values are matched by index across the metric `values` arrays;
  // a missing or non-numeric entry is rendered as "?".
  const perRunRows = af.values.map((afr, i) => {
    const fp_i = fp.values[i] ?? "?";
    const sr_i = sr.values[i] ?? "?";
    const dec_i = dec.values[i] ?? "?";
    const rt_i = rt.values[i] ?? "?";
    return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === "number" ? pct(afr) : "?"} | ${typeof sr_i === "number" ? pct(sr_i) : "?"} | ${dec_i}/6 | ${typeof rt_i === "number" ? secRounded(rt_i) : "?"} |`;
  }).join("\n");

  const recurringSection = r.recurring_bar_failures.length > 0 ? r.recurring_bar_failures.map((b) => `- ${b}`).join("\n") : "None.";
  const notesSection = r.notes.length > 0 ? `
## Notes

${r.notes.map((n) => `- ${n}`).join("\n")}
` : "";

  return `# Calibration Receipt \u2014 ${r.profile_name} (aggregate, N=${r.runs_count} runs)

- **Model:** ${r.model}
- **Architecture:** ${r.architecture}
- **Status:** ${r.status}
- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)
- **Aggregated at:** ${r.aggregated_at}
- **Research-OS version:** ${r.research_os_version}
- **Run count:** ${r.runs_count}
- **Run files:** ${runFileList}

## Headline metrics (median across runs)

- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}\u2013${fp.max})
- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}\u2013${pct(af.max)})
- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}\u2013${pct(sr.max)})
- Decisions produced: median ${dec.median} / 6 (range ${dec.min}\u2013${dec.max})

## PASS / FAIL (aggregate)

| Bar | Rule | Result |
|---|---|---|
| FP ceiling | median=${fp.median}, max=${fp.max} (median \u22641 AND max \u22642) | ${pf.fp_ceiling} |
| Any-flag recall | median=${pct(af.median)} (\u226565%) | ${pf.any_flag_recall_floor} |
| Per-category any-flag | median \u226550% per cat (see below) | ${pf.per_category_any_flag_floor} |
| Strict recall | median=${pct(sr.median)} (\u226520%) | ${pf.strict_recall_floor} |
| Decision vocab | median=${dec.median} / 6 (${bar.architecture} \u2265${bar.required}) | ${pf.decision_vocab_completeness} |
| Latency soft | median=${secRounded(rt.median)} (\u2264600s, WARN only) | ${pf.latency_soft} |
| Latency hard | max=${secRounded(rt.max)} (every run \u22641200s) | ${pf.latency_hard} |
| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |
| **OVERALL** | | **${pf.overall}** |

## Recurring hard-bar failures

${recurringSection}

## Per-category recall (median across runs)

| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range |
|---|---|---|---|---|---|
${perCatAnyFlagRows}

## Decision vocabulary (median count across runs)

| Decision | Median | Range |
|---|---|---|
${dvRows}

## Per-run summary

| Run | FP | Any-flag | Strict | Decisions | Runtime |
|---|---|---|---|---|---|
${perRunRows}
${notesSection}`;
}
|
|
443
|
+
export {
|
|
444
|
+
aggregateDecisionVocabulary,
|
|
445
|
+
aggregateMetric,
|
|
446
|
+
aggregatePerCategoryRecall,
|
|
447
|
+
aggregateReceipts,
|
|
448
|
+
buildAggregateReceiptMarkdown,
|
|
449
|
+
computeAggregatePassFail,
|
|
450
|
+
computeAggregateStatusLabel,
|
|
451
|
+
computeRecurringBarFailures,
|
|
452
|
+
median
|
|
453
|
+
};
|
|
454
|
+
//# sourceMappingURL=aggregate.js.map
|