@mcptoolshop/research-os 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,118 +1,132 @@
1
1
  // src/calibration/aggregate-receipt-schema.ts
2
- import { z as z2 } from "zod";
2
+ import { z as z3 } from "zod";
3
3
 
4
4
  // src/calibration/receipt-schema.ts
5
+ import { z as z2 } from "zod";
6
+
7
+ // src/review/reviewer-options-schema.ts
5
8
  import { z } from "zod";
6
- var StatusLabelSchema = z.enum([
9
+ var ReviewerOptionsSchema = z.object({
10
+ num_ctx: z.number().int().positive().optional(),
11
+ temperature: z.number().min(0).max(2).optional(),
12
+ seed: z.number().int().optional(),
13
+ top_p: z.number().min(0).max(1).optional(),
14
+ top_k: z.number().int().nonnegative().optional(),
15
+ repeat_penalty: z.number().min(0).optional()
16
+ });
17
+
18
+ // src/calibration/receipt-schema.ts
19
+ var StatusLabelSchema = z2.enum([
7
20
  "trusted_baseline",
8
21
  "conditional_pass",
9
22
  "failed",
10
23
  "comparison_only"
11
24
  ]);
12
- var ArchitectureSchema = z.enum(["single-pass", "two-pass"]);
13
- var RecallSchema = z.object({
14
- matched: z.number().int().nonnegative(),
15
- total: z.number().int().nonnegative(),
16
- ratio: z.number().min(0).max(1)
25
+ var ArchitectureSchema = z2.enum(["single-pass", "two-pass"]);
26
+ var RecallSchema = z2.object({
27
+ matched: z2.number().int().nonnegative(),
28
+ total: z2.number().int().nonnegative(),
29
+ ratio: z2.number().min(0).max(1)
17
30
  });
18
- var PerCategoryRecallSchema = z.record(z.string(), RecallSchema);
19
- var PassFailSchema = z.object({
20
- fp_ceiling: z.enum(["PASS", "FAIL"]),
21
- any_flag_recall_floor: z.enum(["PASS", "FAIL"]),
22
- per_category_any_flag_floor: z.enum(["PASS", "FAIL"]),
23
- strict_recall_floor: z.enum(["PASS", "FAIL"]),
24
- decision_vocab_completeness: z.enum(["PASS", "FAIL"]),
25
- latency_soft: z.enum(["PASS", "WARN"]),
26
- latency_hard: z.enum(["PASS", "FAIL"]),
27
- empty_or_malformed: z.enum(["PASS", "FAIL"]),
28
- overall: z.enum(["PASS", "FAIL"])
31
+ var PerCategoryRecallSchema = z2.record(z2.string(), RecallSchema);
32
+ var PassFailSchema = z2.object({
33
+ fp_ceiling: z2.enum(["PASS", "FAIL"]),
34
+ any_flag_recall_floor: z2.enum(["PASS", "FAIL"]),
35
+ per_category_any_flag_floor: z2.enum(["PASS", "FAIL"]),
36
+ strict_recall_floor: z2.enum(["PASS", "FAIL"]),
37
+ decision_vocab_completeness: z2.enum(["PASS", "FAIL"]),
38
+ latency_soft: z2.enum(["PASS", "WARN"]),
39
+ latency_hard: z2.enum(["PASS", "FAIL"]),
40
+ empty_or_malformed: z2.enum(["PASS", "FAIL"]),
41
+ overall: z2.enum(["PASS", "FAIL"])
29
42
  });
30
- var DecisionVocabBarSchema = z.object({
43
+ var DecisionVocabBarSchema = z2.object({
31
44
  architecture: ArchitectureSchema,
32
- required: z.number().int().positive(),
33
- produced: z.number().int().nonnegative(),
34
- passed: z.boolean()
45
+ required: z2.number().int().positive(),
46
+ produced: z2.number().int().nonnegative(),
47
+ passed: z2.boolean()
35
48
  });
36
- var CalibrationReceiptSchema = z.object({
37
- schema_version: z.literal(1),
38
- profile_name: z.string(),
49
+ var CalibrationReceiptSchema = z2.object({
50
+ schema_version: z2.literal(1),
51
+ profile_name: z2.string(),
39
52
  status: StatusLabelSchema,
40
- model: z.string(),
53
+ model: z2.string(),
41
54
  architecture: ArchitectureSchema,
42
- fixture: z.string(),
43
- fixture_total_claims: z.number().int().positive(),
44
- fixture_good_claims: z.number().int().nonnegative(),
45
- fixture_bad_claims: z.number().int().nonnegative(),
46
- calibrated_at: z.string(),
47
- research_os_version: z.string(),
48
- runtime_ms: z.number().int().nonnegative(),
49
- good_fp_count: z.number().int().nonnegative(),
55
+ fixture: z2.string(),
56
+ fixture_total_claims: z2.number().int().positive(),
57
+ fixture_good_claims: z2.number().int().nonnegative(),
58
+ fixture_bad_claims: z2.number().int().nonnegative(),
59
+ calibrated_at: z2.string(),
60
+ research_os_version: z2.string(),
61
+ runtime_ms: z2.number().int().nonnegative(),
62
+ good_fp_count: z2.number().int().nonnegative(),
50
63
  any_flag_recall: RecallSchema,
51
64
  strict_recall: RecallSchema,
52
65
  per_category_any_flag: PerCategoryRecallSchema,
53
66
  per_category_strict: PerCategoryRecallSchema,
54
- decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),
55
- decisions_produced_count: z.number().int().nonnegative(),
67
+ decision_vocabulary: z2.record(z2.string(), z2.number().int().nonnegative()),
68
+ decisions_produced_count: z2.number().int().nonnegative(),
56
69
  decision_vocab_bar: DecisionVocabBarSchema,
57
- unreachable_decisions: z.array(z.string()),
58
- empty_or_malformed_responses: z.number().int().nonnegative(),
70
+ unreachable_decisions: z2.array(z2.string()),
71
+ empty_or_malformed_responses: z2.number().int().nonnegative(),
59
72
  pass_fail: PassFailSchema,
60
- notes: z.array(z.string())
73
+ notes: z2.array(z2.string()),
74
+ reviewer_options: ReviewerOptionsSchema.optional()
61
75
  });
62
76
 
63
77
  // src/calibration/aggregate-receipt-schema.ts
64
- var AggregateMetricSchema = z2.object({
65
- median: z2.number(),
66
- min: z2.number(),
67
- max: z2.number(),
68
- values: z2.array(z2.number())
78
+ var AggregateMetricSchema = z3.object({
79
+ median: z3.number(),
80
+ min: z3.number(),
81
+ max: z3.number(),
82
+ values: z3.array(z3.number())
69
83
  // per-run values in run order (run-001, run-002, ...)
70
84
  });
71
- var PerCategoryAggregateEntrySchema = z2.object({
72
- median_ratio: z2.number().min(0).max(1),
73
- min_ratio: z2.number().min(0).max(1),
74
- max_ratio: z2.number().min(0).max(1),
75
- total: z2.number().int().nonnegative(),
85
+ var PerCategoryAggregateEntrySchema = z3.object({
86
+ median_ratio: z3.number().min(0).max(1),
87
+ min_ratio: z3.number().min(0).max(1),
88
+ max_ratio: z3.number().min(0).max(1),
89
+ total: z3.number().int().nonnegative(),
76
90
  // seed count — same across all runs
77
- per_run_ratios: z2.array(z2.number())
91
+ per_run_ratios: z3.array(z3.number())
78
92
  });
79
- var PerCategoryAggregateSchema = z2.record(z2.string(), PerCategoryAggregateEntrySchema);
80
- var AggregatePassFailSchema = z2.object({
81
- fp_ceiling: z2.enum(["PASS", "FAIL"]),
82
- any_flag_recall_floor: z2.enum(["PASS", "FAIL"]),
83
- per_category_any_flag_floor: z2.enum(["PASS", "FAIL"]),
84
- strict_recall_floor: z2.enum(["PASS", "FAIL"]),
85
- decision_vocab_completeness: z2.enum(["PASS", "FAIL"]),
86
- latency_soft: z2.enum(["PASS", "WARN"]),
87
- latency_hard: z2.enum(["PASS", "FAIL"]),
88
- empty_or_malformed: z2.enum(["PASS", "FAIL"]),
89
- overall: z2.enum(["PASS", "FAIL"])
93
+ var PerCategoryAggregateSchema = z3.record(z3.string(), PerCategoryAggregateEntrySchema);
94
+ var AggregatePassFailSchema = z3.object({
95
+ fp_ceiling: z3.enum(["PASS", "FAIL"]),
96
+ any_flag_recall_floor: z3.enum(["PASS", "FAIL"]),
97
+ per_category_any_flag_floor: z3.enum(["PASS", "FAIL"]),
98
+ strict_recall_floor: z3.enum(["PASS", "FAIL"]),
99
+ decision_vocab_completeness: z3.enum(["PASS", "FAIL"]),
100
+ latency_soft: z3.enum(["PASS", "WARN"]),
101
+ latency_hard: z3.enum(["PASS", "FAIL"]),
102
+ empty_or_malformed: z3.enum(["PASS", "FAIL"]),
103
+ overall: z3.enum(["PASS", "FAIL"])
90
104
  });
91
- var AggregateDecisionVocabBarSchema = z2.object({
105
+ var AggregateDecisionVocabBarSchema = z3.object({
92
106
  architecture: ArchitectureSchema,
93
- required: z2.number().int().positive(),
94
- median_produced: z2.number(),
107
+ required: z3.number().int().positive(),
108
+ median_produced: z3.number(),
95
109
  // float — median of per-run decisions_produced_count
96
- passed: z2.boolean()
110
+ passed: z3.boolean()
97
111
  });
98
- var AggregateCalibrationReceiptSchema = z2.object({
99
- schema_version: z2.literal(1),
100
- receipt_kind: z2.literal("aggregate"),
112
+ var AggregateCalibrationReceiptSchema = z3.object({
113
+ schema_version: z3.literal(1),
114
+ receipt_kind: z3.literal("aggregate"),
101
115
  // discriminates from single-run receipt
102
- profile_name: z2.string(),
116
+ profile_name: z3.string(),
103
117
  status: StatusLabelSchema,
104
- model: z2.string(),
118
+ model: z3.string(),
105
119
  architecture: ArchitectureSchema,
106
- fixture: z2.string(),
107
- fixture_total_claims: z2.number().int().positive(),
108
- fixture_good_claims: z2.number().int().nonnegative(),
109
- fixture_bad_claims: z2.number().int().nonnegative(),
110
- runs_count: z2.number().int().min(2),
111
- run_files: z2.array(z2.string()),
120
+ fixture: z3.string(),
121
+ fixture_total_claims: z3.number().int().positive(),
122
+ fixture_good_claims: z3.number().int().nonnegative(),
123
+ fixture_bad_claims: z3.number().int().nonnegative(),
124
+ runs_count: z3.number().int().min(2),
125
+ run_files: z3.array(z3.string()),
112
126
  // relative paths: runs/run-001.json, etc.
113
- aggregated_at: z2.string(),
127
+ aggregated_at: z3.string(),
114
128
  // ISO 8601
115
- research_os_version: z2.string(),
129
+ research_os_version: z3.string(),
116
130
  // Aggregate metrics — median + min + max + per-run values in run order
117
131
  good_fp_count: AggregateMetricSchema,
118
132
  any_flag_recall_ratio: AggregateMetricSchema,
@@ -123,14 +137,17 @@ var AggregateCalibrationReceiptSchema = z2.object({
123
137
  per_category_any_flag: PerCategoryAggregateSchema,
124
138
  per_category_strict: PerCategoryAggregateSchema,
125
139
  // Decision vocabulary — union of all decisions seen across runs, median count each
126
- decision_vocabulary: z2.record(z2.string(), AggregateMetricSchema),
140
+ decision_vocabulary: z3.record(z3.string(), AggregateMetricSchema),
127
141
  decision_vocab_bar: AggregateDecisionVocabBarSchema,
128
- unreachable_decisions: z2.array(z2.string()),
142
+ unreachable_decisions: z3.array(z3.string()),
129
143
  pass_fail: AggregatePassFailSchema,
130
144
  // Bars that FAILed in >= ceil(runs_count/2) individual runs.
131
145
  // Non-empty list demotes trusted_baseline to conditional_pass.
132
- recurring_bar_failures: z2.array(z2.string()),
133
- notes: z2.array(z2.string())
146
+ recurring_bar_failures: z3.array(z3.string()),
147
+ notes: z3.array(z3.string()),
148
+ // schema_version: 1 — additive-optional (Exp6 Session 2):
149
+ // Same options object stamped on every per-run receipt. Absent = stochastic run.
150
+ reviewer_options: ReviewerOptionsSchema.optional()
134
151
  });
135
152
 
136
153
  // src/calibration/aggregate.ts
@@ -340,9 +357,30 @@ function aggregateReceipts(runs, opts) {
340
357
  unreachable_decisions: first.unreachable_decisions,
341
358
  pass_fail: aggregatePassFail,
342
359
  recurring_bar_failures: recurringBarFailures,
343
- notes
360
+ notes,
361
+ ...opts.reviewerOptions && Object.keys(opts.reviewerOptions).length > 0 && {
362
+ reviewer_options: opts.reviewerOptions
363
+ }
344
364
  });
345
365
  }
366
+ var REVIEWER_OPTIONS_KEY_ORDER = [
367
+ "num_ctx",
368
+ "temperature",
369
+ "seed",
370
+ "top_p",
371
+ "top_k",
372
+ "repeat_penalty"
373
+ ];
374
+ function buildReviewerOptionsSection(opts) {
375
+ if (!opts) return "";
376
+ const lines = REVIEWER_OPTIONS_KEY_ORDER.filter((k) => opts[k] !== void 0).map((k) => `- ${k}: ${opts[k]}`);
377
+ if (lines.length === 0) return "";
378
+ return `
379
+ ## Reviewer options
380
+
381
+ ${lines.join("\n")}
382
+ `;
383
+ }
346
384
  function buildAggregateReceiptMarkdown(r) {
347
385
  const pct = (ratio) => `${Math.round(ratio * 100)}%`;
348
386
  const secRounded = (ms) => `${(ms / 1e3).toFixed(1)}s`;
@@ -385,6 +423,7 @@ function buildAggregateReceiptMarkdown(r) {
385
423
 
386
424
  ${r.notes.map((n) => `- ${n}`).join("\n")}
387
425
  ` : "";
426
+ const reviewerOptionsSection = buildReviewerOptionsSection(r.reviewer_options);
388
427
  return `# Calibration Receipt \u2014 ${r.profile_name} (aggregate, N=${r.runs_count} runs)
389
428
 
390
429
  - **Model:** ${r.model}
@@ -395,7 +434,7 @@ ${r.notes.map((n) => `- ${n}`).join("\n")}
395
434
  - **Research-OS version:** ${r.research_os_version}
396
435
  - **Run count:** ${r.runs_count}
397
436
  - **Run files:** ${runFileList}
398
-
437
+ ${reviewerOptionsSection}
399
438
  ## Headline metrics (median across runs)
400
439
 
401
440
  - FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}\u2013${fp.max})
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts","../../src/calibration/aggregate.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import type { Architecture, CalibrationReceipt, PassFail, PerCategoryRecall } from './receipt-schema.js';\nimport {\n AggregateCalibrationReceiptSchema,\n type AggregateCalibrationReceipt,\n type AggregateMetric,\n type AggregatePassFail,\n type PerCategoryAggregate,\n} from './aggregate-receipt-schema.js';\nimport type { StatusLabel } from './receipt-schema.js';\n\n// Compute median of a sorted or unsorted array.\n// Throws on empty input — callers always have at least one run.\n// For even-length arrays: mean of two middle values (float, not rounded).\n// Integer-valued metrics (FP count, decisions) stay as floats here;\n// the caller's bar comparisons (>= 3, === 0) work correctly on exact floats\n// because the inputs are small integers.\nexport function median(values: number[]): number {\n if (values.length === 0) throw new Error('median: empty array');\n const sorted = [...values].sort((a, b) => a - b);\n const mid = Math.floor(sorted.length / 2);\n if (sorted.length % 2 === 1) return sorted[mid];\n return (sorted[mid - 1] + sorted[mid]) / 2;\n}\n\n// Aggregate a list of per-run scalar values into { median, min, max, values }.\n// values preserves input order (run-001, run-002, ...) for traceability.\nexport function aggregateMetric(values: number[]): AggregateMetric {\n const m = median(values);\n return {\n median: m,\n min: Math.min(...values),\n max: Math.max(...values),\n values,\n };\n}\n\n// Aggregate per-run per-category recall objects.\n// Each element of perRunBuckets is one run's PerCategoryRecall\n// (Record<category, { matched, total, ratio }>).\n// Returns PerCategoryAggregate: per-category median/min/max ratio + per-run ratios.\n// total is taken from the first run that has the category (same across runs —\n// SEEDS is static so category totals never change between runs).\nexport function aggregatePerCategoryRecall(\n perRunBuckets: PerCategoryRecall[],\n): PerCategoryAggregate {\n const cats = new Set<string>();\n for (const run of perRunBuckets) {\n for (const cat of Object.keys(run)) cats.add(cat);\n }\n\n const result: PerCategoryAggregate = {};\n for (const cat of cats) {\n const ratios = perRunBuckets.map((run) => run[cat]?.ratio ?? 0);\n const total = perRunBuckets.find((run) => run[cat] !== undefined)?.[cat]?.total ?? 0;\n result[cat] = {\n median_ratio: median(ratios),\n min_ratio: Math.min(...ratios),\n max_ratio: Math.max(...ratios),\n total,\n per_run_ratios: ratios,\n };\n }\n return result;\n}\n\n// Aggregate per-run decision vocabulary count dicts.\n// Each element is one run's decision_vocabulary (Record<decision, count>).\n// Returns Record<decision, AggregateMetric> with median count per decision.\nexport function aggregateDecisionVocabulary(\n perRunDicts: Record<string, number>[],\n): Record<string, AggregateMetric> {\n const decisions = new Set<string>();\n for (const run of perRunDicts) {\n for (const d of Object.keys(run)) decisions.add(d);\n }\n\n const result: Record<string, AggregateMetric> = {};\n for (const d of decisions) {\n const values = perRunDicts.map((run) => run[d] ?? 0);\n result[d] = aggregateMetric(values);\n }\n return result;\n}\n\n// Compute aggregate PASS/FAIL bars from aggregated metrics.\n//\n// Advisor-locked rules (gospel):\n// FP ceiling: median <= 1 AND max <= 2\n// Any-flag recall: median >= 0.65\n// Per-category: median_ratio >= 0.50 for categories with total >= 2\n// Strict recall: median >= 0.20\n// Decision vocab: median >= required (architecture-aware: two-pass=3, single-pass=4)\n// Latency soft: median <= 600_000 → WARN only, never FAIL\n// Latency hard: every-run rule — max <= 1_200_000\n// Empty/malformed: every-run rule — max === 0\nexport function computeAggregatePassFail(input: {\n good_fp_count: AggregateMetric;\n any_flag_recall_ratio: AggregateMetric;\n per_category_any_flag: PerCategoryAggregate;\n strict_recall_ratio: AggregateMetric;\n decisions_produced_count: AggregateMetric;\n architecture: Architecture;\n runtime_ms: AggregateMetric;\n empty_or_malformed_responses: AggregateMetric;\n}): AggregatePassFail {\n const fp_ceiling: 'PASS' | 'FAIL' =\n input.good_fp_count.median <= 1 && input.good_fp_count.max <= 2 ? 'PASS' : 'FAIL';\n\n const any_flag_recall_floor: 'PASS' | 'FAIL' =\n input.any_flag_recall_ratio.median >= 0.65 ? 'PASS' : 'FAIL';\n\n let per_category_any_flag_floor: 'PASS' | 'FAIL' = 'PASS';\n for (const entry of Object.values(input.per_category_any_flag)) {\n if (entry.total >= 2 && entry.median_ratio < 0.5) {\n per_category_any_flag_floor = 'FAIL';\n break;\n }\n }\n\n const strict_recall_floor: 'PASS' | 'FAIL' =\n input.strict_recall_ratio.median >= 0.2 ? 'PASS' : 'FAIL';\n\n const dvRequired = input.architecture === 'two-pass' ? 3 : 4;\n const decision_vocab_completeness: 'PASS' | 'FAIL' =\n input.decisions_produced_count.median >= dvRequired ? 'PASS' : 'FAIL';\n\n // Latency soft: WARN-only signal — no FAIL contribution\n const latency_soft: 'PASS' | 'WARN' =\n input.runtime_ms.median <= 600_000 ? 'PASS' : 'WARN';\n\n // Latency hard: every-run rule — enforced via max\n const latency_hard: 'PASS' | 'FAIL' =\n input.runtime_ms.max <= 1_200_000 ? 'PASS' : 'FAIL';\n\n // Empty/malformed: every-run rule — enforced via max\n const empty_or_malformed: 'PASS' | 'FAIL' =\n input.empty_or_malformed_responses.max === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: ('PASS' | 'FAIL')[] = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall: 'PASS' | 'FAIL' = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Compute which hard bars FAILed in >= ceil(N/2) individual runs.\n// A non-empty result means that bar was SYSTEMATICALLY unreliable —\n// not just a one-run outlier that happened to median-pass.\n// This is used by computeAggregateStatusLabel to prevent a profile from\n// earning trusted_baseline when one bar failed in the majority of runs.\n//\n// Hard bars checked (latency_soft and overall are excluded):\n// fp_ceiling, any_flag_recall_floor, per_category_any_flag_floor,\n// strict_recall_floor, decision_vocab_completeness, latency_hard, empty_or_malformed\nexport function computeRecurringBarFailures(\n perRunPassFails: PassFail[],\n totalRuns: number,\n): string[] {\n const threshold = Math.ceil(totalRuns / 2);\n const HARD_BARS: (keyof PassFail)[] = [\n 'fp_ceiling',\n 'any_flag_recall_floor',\n 'per_category_any_flag_floor',\n 'strict_recall_floor',\n 'decision_vocab_completeness',\n 'latency_hard',\n 'empty_or_malformed',\n ];\n\n const recurring: string[] = [];\n for (const bar of HARD_BARS) {\n const failCount = perRunPassFails.filter((pf) => pf[bar] === 'FAIL').length;\n if (failCount >= threshold) recurring.push(bar);\n }\n return recurring;\n}\n\n// Assign aggregate status label.\n//\n// Advisor-locked predicates (priority order):\n// 1. comparison_only — explicit mode flag OR single-pass Hermes (regardless of pass/fail)\n// 2. failed — aggregate pass_fail.overall === FAIL\n// 3. trusted_baseline — Hermes two-pass AND aggregate PASS AND median(FP) === 0\n// AND recurring_bar_failures.length === 0\n// The recurring-failure check prevents a profile from earning trusted_baseline\n// when any hard bar FAILed in >= ceil(N/2) runs even if the median still passed.\n// Intent: \"one lucky median cannot mask systemic bar weakness.\"\n// 4. conditional_pass — fallthrough (passes but doesn't earn trusted_baseline)\n// Mistral two-pass is capped at conditional_pass regardless of aggregate result.\nexport function computeAggregateStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n aggregatePassFail: AggregatePassFail;\n medianGoodFpCount: number;\n recurringBarFailures: string[];\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n if (input.aggregatePassFail.overall === 'FAIL') return 'failed';\n\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (\n isHermesTwoPass &&\n input.medianGoodFpCount === 0 &&\n input.recurringBarFailures.length === 0\n ) {\n return 'trusted_baseline';\n }\n\n return 'conditional_pass';\n}\n\n// Aggregate N single-run receipts into one AggregateCalibrationReceipt.\n// All receipts must be from the same profile/model/architecture.\n// opts.runFiles: relative paths for each run (e.g. 'runs/run-001.json').\n// opts.modeOverride: forward 'comparison_only' to status-label predicate.\n// opts.aggregatedAt: ISO timestamp (defaults to now).\nexport function aggregateReceipts(\n runs: CalibrationReceipt[],\n opts: {\n runFiles: string[];\n modeOverride?: 'comparison_only';\n aggregatedAt?: string;\n },\n): AggregateCalibrationReceipt {\n if (runs.length === 0) throw new Error('aggregateReceipts: no runs provided');\n const first = runs[0];\n\n const fpMetric = aggregateMetric(runs.map((r) => r.good_fp_count));\n const anyFlagRatioMetric = aggregateMetric(runs.map((r) => r.any_flag_recall.ratio));\n const strictRatioMetric = aggregateMetric(runs.map((r) => r.strict_recall.ratio));\n const decisionsMetric = aggregateMetric(runs.map((r) => r.decisions_produced_count));\n const runtimeMetric = aggregateMetric(runs.map((r) => r.runtime_ms));\n const emptyOrMalformedMetric = aggregateMetric(\n runs.map((r) => r.empty_or_malformed_responses),\n );\n\n const perCatAnyFlag = aggregatePerCategoryRecall(runs.map((r) => r.per_category_any_flag));\n const perCatStrict = aggregatePerCategoryRecall(runs.map((r) => r.per_category_strict));\n const decisionVocab = aggregateDecisionVocabulary(runs.map((r) => r.decision_vocabulary));\n\n const dvRequired = first.architecture === 'two-pass' ? 3 : 4;\n const decisionVocabBar = {\n architecture: first.architecture,\n required: dvRequired,\n median_produced: decisionsMetric.median,\n passed: decisionsMetric.median >= dvRequired,\n };\n\n const aggregatePassFail = computeAggregatePassFail({\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n per_category_any_flag: perCatAnyFlag,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n architecture: first.architecture,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n });\n\n const recurringBarFailures = computeRecurringBarFailures(\n runs.map((r) => r.pass_fail),\n runs.length,\n );\n\n const status = computeAggregateStatusLabel({\n profileName: first.profile_name,\n architecture: first.architecture,\n aggregatePassFail,\n medianGoodFpCount: fpMetric.median,\n recurringBarFailures,\n modeOverride: opts.modeOverride,\n });\n\n const notes: string[] = [];\n if (aggregatePassFail.latency_soft === 'WARN') {\n notes.push(\n `Latency warning: median ${(runtimeMetric.median / 1000).toFixed(1)}s exceeds soft limit of 600s`,\n );\n }\n if (fpMetric.median > 0) {\n notes.push(`FP at ceiling: median ${fpMetric.median} false positive(s) on good claims`);\n }\n if (recurringBarFailures.length > 0) {\n notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurringBarFailures.join(', ')}`);\n }\n if (status === 'comparison_only') {\n notes.push(\n 'comparison_only: architectural side-run, not a production admission candidate',\n );\n }\n if (status === 'conditional_pass') {\n notes.push('conditional_pass: passes all bars but carries a production caution');\n }\n\n return AggregateCalibrationReceiptSchema.parse({\n schema_version: 1,\n receipt_kind: 'aggregate',\n profile_name: first.profile_name,\n status,\n model: first.model,\n architecture: first.architecture,\n fixture: first.fixture,\n fixture_total_claims: first.fixture_total_claims,\n fixture_good_claims: first.fixture_good_claims,\n fixture_bad_claims: first.fixture_bad_claims,\n runs_count: runs.length,\n run_files: opts.runFiles,\n aggregated_at: opts.aggregatedAt ?? new Date().toISOString(),\n research_os_version: first.research_os_version,\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n per_category_any_flag: perCatAnyFlag,\n per_category_strict: perCatStrict,\n decision_vocabulary: decisionVocab,\n decision_vocab_bar: decisionVocabBar,\n unreachable_decisions: first.unreachable_decisions,\n pass_fail: aggregatePassFail,\n recurring_bar_failures: recurringBarFailures,\n notes,\n });\n}\n\n// Render the aggregate calibration receipt as compact Markdown.\n// Operator proof artifact — no prose.\nexport function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const secRounded = (ms: number) => `${(ms / 1000).toFixed(1)}s`;\n\n const af = r.any_flag_recall_ratio;\n const sr = r.strict_recall_ratio;\n const fp = r.good_fp_count;\n const dec = r.decisions_produced_count;\n const rt = r.runtime_ms;\n const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const runFileList =\n r.run_files.length > 0\n ? `${r.run_files[0]} … ${r.run_files[r.run_files.length - 1]}`\n : '(none)';\n\n const perCatAnyFlagRows = Object.entries(r.per_category_any_flag)\n .map(([cat, entry]) => {\n const st = r.per_category_strict[cat];\n return (\n `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}–${pct(entry.max_ratio)} | ${entry.total} |` +\n (st\n ? ` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}–${pct(st.max_ratio)} |`\n : ' — | — |')\n );\n })\n .join('\\n');\n\n const ALL_DECISIONS = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ];\n const dvRows = ALL_DECISIONS.map((d) => {\n const metric = r.decision_vocabulary[d];\n const unreachable = r.unreachable_decisions.includes(d)\n ? ` (unreachable from ${r.fixture})`\n : '';\n if (!metric) return `| ${d} | — | — |${unreachable}`;\n return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}–${metric.max}${unreachable} |`;\n }).join('\\n');\n\n // Per-run summary table — pulled from run_files labels for clarity\n const perRunRows = r.any_flag_recall_ratio.values\n .map((afr, i) => {\n const fp_i = r.good_fp_count.values[i] ?? '?';\n const sr_i = r.strict_recall_ratio.values[i] ?? '?';\n const dec_i = r.decisions_produced_count.values[i] ?? '?';\n const rt_i = r.runtime_ms.values[i] ?? '?';\n return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === 'number' ? pct(afr) : '?'} | ${typeof sr_i === 'number' ? pct(sr_i) : '?'} | ${dec_i}/6 | ${typeof rt_i === 'number' ? secRounded(rt_i) : '?'} |`;\n })\n .join('\\n');\n\n const recurringSection =\n r.recurring_bar_failures.length > 0\n ? r.recurring_bar_failures.map((b) => `- ${b}`).join('\\n')\n : 'None.';\n\n const notesSection =\n r.notes.length > 0 ? `\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n return `# Calibration Receipt — ${r.profile_name} (aggregate, N=${r.runs_count} runs)\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Aggregated at:** ${r.aggregated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Run count:** ${r.runs_count}\n- **Run files:** ${runFileList}\n\n## Headline metrics (median across runs)\n\n- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}–${fp.max})\n- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}–${pct(af.max)})\n- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}–${pct(sr.max)})\n- Decisions produced: median ${dec.median} / 6 (range ${dec.min}–${dec.max})\n\n## PASS / FAIL (aggregate)\n\n| Bar | Rule | Result |\n|---|---|---|\n| FP ceiling | median=${fp.median}, max=${fp.max} (median ≤1 AND max ≤2) | ${pf.fp_ceiling} |\n| Any-flag recall | median=${pct(af.median)} (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag | median ≥50% per cat (see below) | ${pf.per_category_any_flag_floor} |\n| Strict recall | median=${pct(sr.median)} (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab | median=${dec.median} / 6 (${bar.architecture} ≥${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft | median=${secRounded(rt.median)} (≤600s, WARN only) | ${pf.latency_soft} |\n| Latency hard | max=${secRounded(rt.max)} (every run ≤1200s) | ${pf.latency_hard} |\n| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |\n| **OVERALL** | | **${pf.overall}** |\n\n## Recurring hard-bar failures\n\n${recurringSection}\n\n## Per-category recall (median across runs)\n\n| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range |\n|---|---|---|---|---|---|\n${perCatAnyFlagRows}\n\n## Decision vocabulary (median count across runs)\n\n| Decision | Median | Range |\n|---|---|---|\n${dvRows}\n\n## Per-run summary\n\n| Run | FP | Any-flag | Strict | Decisions | Runtime |\n|---|---|---|---|---|---|\n${perRunRows}\n${notesSection}`;\n}\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,SAAS;AAEX,IAAM,oBAAoB,EAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqB,EAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAe,EAAE,OAAO;AAAA,EACnC,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0B,EAAE,OAAO,EAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,YAAY,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAAS,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyB,EAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQ,EAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,2BAA2B,EAAE,OAAO;AAAA,EAC/C,gBAAgB,EAAE,QAAQ,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAO,EAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAAS,EAAE,OAAO;AAAA,EAClB,sBAAsB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAe,EAAE,OAAO;AAAA,EACxB,qBAAqB,EAAE,OAAO;AAAA,EAC9B,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACzC,8BAA8B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;AAC3B,CAAC;;;AD5DM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,qBAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAC3B,CAAC;;;AE7DM,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,qBAAqB;AAC9D,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC/C,QAAM,MAAM,KAAK,MAAM,OAAO,SAAS,CAAC;AACxC,MAAI,OAAO,SAAS,MAAM,EAAG,QAAO,OAAO,GAAG;AAC9C,UAAQ,OAAO,MAAM,CAAC,IAAI,OAAO,GAAG,KAAK;AAC3C;AAIO,SAAS,gBAAgB,QAAmC;AACjE,QAAM,IAAI,OAAO,MAAM;AACvB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB;AAAA,EACF;AACF;AAQO,SAAS,2BACd,eACsB;AACtB,QAAM,OAAO,oBAAI,IAAY;AAC7B,aAAW,OAAO,eAAe;AAC/B,eAAW,OAAO,OAAO,KAAK,GAAG,EAAG,MAAK,IAAI,GAAG;AAAA,EAClD;AAEA,QAAM,SAA+B,CAAC;AACtC,aAAW,OAAO,MAAM;AACtB,UAAM,SAAS,cAAc,IAAI,CAAC,QAAQ,IAAI,GAAG,GAAG,SAAS,CAAC;AAC9D,UAAM,QAAQ,cAAc,KAAK,CAAC,QAAQ,IAAI,GAAG,MAAM,MAAS,IAAI,GAAG,GAAG,SAAS;AACnF,WAAO,GAAG,IAAI;AAAA,MACZ,cAAc,OAAO,MAAM;AAAA,MAC3B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B;AAAA,MACA,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,4BACd,aACiC;AACjC,QAAM,YAAY,oBAAI,IAAY;AAClC,aAAW,OAAO,aAAa;AAC7B,eAAW,KAAK,OAAO,KAAK,GAAG,EAAG,WAAU,IAAI,CAAC;AAAA,EACnD;AAEA,QAAM,SAA0C,CAAC;AACjD,aAAW,KAAK,WAAW;AACzB,UAAM,SAAS,YAAY,IAAI,CAAC,QAAQ,IAAI,CAAC,KAAK,CAAC;AACnD,WAAO,CAAC,IAAI,gBAAgB,MAAM;AAAA,EACpC;AACA,SAAO;AACT;AAaO,SAAS,yBAAyB,OASnB;AACpB,QAAM,aACJ,MAAM,cAAc,UAAU,KAAK,MAAM,cAAc,OAAO,IAAI,SAAS;AAE7E,QAAM,wBACJ,MAAM,sBAAsB,UAAU,OAAO,SAAS;AAExD,MAAI,8BAA+C;AACnD,aAAW,SAAS,OAAO,OAAO,MAAM,qBAAqB,GAAG;AAC9D,QAAI,MAAM,SAAS,KAAK,MAAM,eAAe,KAAK;AAChD,oCAA8B;AAC9B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBACJ,MAAM,oBAAoB,UAAU,MAAM,SAAS;AAErD,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,8BACJ,MAAM,yBAAyB,UAAU,aAAa,SAAS;AAGjE,QAAM,eACJ,MAAM,WAAW,UAAU,MAAU,SAAS;AAGhD,QAAM,eACJ,MAAM,WAAW,OAAO,OAAY,SAAS;AAG/C,QAAM,qBACJ,MAAM,6BAA6B,QAAQ,IAAI,SAAS;AAE1D,QAAM,WAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAA2B,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAEhF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAWO,SAAS,4BACd,iBACA,WACU;AACV,QAAM,YAAY,KAAK,KAAK,YAAY,CAAC;AACzC,QAAM,YAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,QAAM,YAAsB,CAAC;AAC7B,aAAW,OAAO,WAAW;AAC3B,UAAM,YAAY,gBAAgB,OAAO,CAAC,OAAO,GAAG,GAAG,MAAM,MAAM,EAAE;AACrE,QAAI,aAAa,UAAW,WAAU,KAAK,GAAG;AAAA,EAChD;AACA,SAAO;AACT;AAcO,SAAS,4BAA4B,OAO5B;AACd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAErD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,kBAAkB,YAAY,OAAQ,QAAO;AAEvD,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MACE,mBACA,MAAM,sBAAsB,KAC5B,MAAM,qBAAqB,WAAW,GACtC;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAOO,SAAS,kBACd,MACA,MAK6B;AAC7B,MAAI,KAAK,WAAW,EAAG,OAAM,IAAI,MAAM,qCAAqC;AAC5E,QAAM,QAAQ,KAAK,CAAC;AAEpB,QAAM,WAAW,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC;AACjE,QAAM,qBAAqB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,gBAAgB,KAAK,CAAC;AACnF,QAAM,oBAAoB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,cAAc,KAAK,CAAC;AAChF,QAAM,kBAAkB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,wBAAwB,CAAC;AACnF,QAAM,gBAAgB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,UAAU,CAAC;AACnE,QAAM,yBAAyB;AAAA,IAC7B,KAAK,IAAI,CAAC,MAAM,EAAE,4BAA4B;AAAA,EAChD;AAEA,QAAM,gBAAgB,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,qBAAqB,CAAC;AACzF,QAAM,eAAe,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AACtF,QAAM,gBAAgB,4BAA4B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AAExF,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,mBAAmB;AAAA,IACvB,cAAc,MAAM;AAAA,IACpB,UAAU;AAAA,IACV,iBAAiB,gBAAgB;AAAA,IACjC,QAAQ,gBAAgB,UAAU;AAAA,EACpC;AAEA,QAAM,oBAAoB,yBAAyB;AAAA,IACjD,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,cAAc,MAAM;AAAA,IACpB,YAAY;AAAA,IACZ,8BAA8B;AAAA,EAChC,CAAC;AAED,QAAM,uBAAuB;AAAA,IAC3B,KAAK,IAAI,CAAC,MAAM,EAAE,SAAS;AAAA,IAC3B,KAAK;AAAA,EACP;AAEA,QAAM,SAAS,4BAA4B;AAAA,IACzC,aAAa,MAAM;AAAA,IACnB,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,mBAAmB,SAAS;AAAA,IAC5B;AAAA,IACA,cAAc,KAAK;AAAA,EACrB,CAAC;AAED,QAAM,QAAkB,CAAC;AACzB,MAAI,kBAAkB,iBAAiB,QAAQ;AAC7C,UAAM;AAAA,MACJ,4BAA4B,cAAc,SAAS,KAAM,QAAQ,CAAC,CAAC;AAAA,IACrE;AAAA,EACF;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,yBAAyB,SAAS,MAAM,mCAAmC;AAAA,EACxF;AACA,MAAI,qBAAqB,SAAS,GAAG;AACnC,UAAM,KAAK,+CAA+C,qBAAqB,KAAK,IAAI,CAAC,EAAE;AAAA,EAC7F;AACA,MAAI,WAAW,mBAAmB;AAChC,UAAM;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACA,MAAI,WAAW,oBAAoB;AACjC,UAAM,KAAK,oEAAoE;AAAA,EACjF;AAEA,SAAO,kCAAkC,MAAM;AAAA,IAC7C,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,OAAO,MAAM;AAAA,IACb,cAAc,MAAM;AAAA,IACpB,SAAS,MAAM;AAAA,IACf,sBAAsB,MAAM;AAAA,IAC5B,qBAAqB,MAAM;AAAA,IAC3B,oBAAoB,MAAM;AAAA,IAC1B,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,eAAe,KAAK,iBAAgB,oBAAI,KAAK,GAAE,YAAY;AAAA,IAC3D,qBAAqB,MAAM;AAAA,IAC3B,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,YAAY;AAAA,IACZ,8BAA8B;AAAA,IAC9B,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,qBAAqB;AAAA,IACrB,oBAAoB;AAAA,IACpB,uBAAuB,MAAM;AAAA,IAC7B,WAAW;AAAA,IACX,wBAAwB;AAAA,IACxB;AAAA,EACF,CAAC;AACH;AAIO,SAAS,8BAA8B,GAAwC;AACpF,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,aAAa,CAAC,OAAe,IAAI,KAAK,KAAM,QAAQ,CAAC,CAAC;AAE5D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AACd,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,cACJ,EAAE,UAAU,SAAS,IACjB,GAAG,EAAE,UAAU,CAAC,CAAC,WAAM,EAAE,UAAU,EAAE,UAAU,SAAS,CAAC,CAAC,KAC1D;AAEN,QAAM,oBAAoB,OAAO,QAAQ,EAAE,qBAAqB,EAC7D,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM;AACrB,UAAM,KAAK,EAAE,oBAAoB,GAAG;AACpC,WACE,KAAK,GAAG,MAAM,IAAI,MAAM,YAAY,CAAC,MAAM,IAAI,MAAM,SAAS,CAAC,SAAI,IAAI,MAAM,SAAS,CAAC,MAAM,MAAM,KAAK,QACvG,KACG,IAAI,IAAI,GAAG,YAAY,CAAC,MAAM,IAAI,GAAG,SAAS,CAAC,SAAI,IAAI,GAAG,SAAS,CAAC,OACpE;AAAA,EAER,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,gBAAgB;AAAA,IACpB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,SAAS,cAAc,IAAI,CAAC,MAAM;AACtC,UAAM,SAAS,EAAE,oBAAoB,CAAC;AACtC,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAClD,sBAAsB,EAAE,OAAO,MAC/B;AACJ,QAAI,CAAC,OAAQ,QAAO,KAAK,CAAC,uBAAa,WAAW;AAClD,WAAO,KAAK,CAAC,MAAM,OAAO,OAAO,QAAQ,CAAC,CAAC,MAAM,OAAO,GAAG,SAAI,OAAO,GAAG,GAAG,WAAW;AAAA,EACzF,CAAC,EAAE,KAAK,IAAI;AAGZ,QAAM,aAAa,EAAE,sBAAsB,OACxC,IAAI,CAAC,KAAK,MAAM;AACf,UAAM,OAAO,EAAE,cAAc,OAAO,CAAC,KAAK;AAC1C,UAAM,OAAO,EAAE,oBAAoB,OAAO,CAAC,KAAK;AAChD,UAAM,QAAQ,EAAE,yBAAyB,OAAO,CAAC,KAAK;AACtD,UAAM,OAAO,EAAE,WAAW,OAAO,CAAC,KAAK;AACvC,WAAO,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,EAAE,mBAAmB,MAAM,OAAO,QAAQ,WAAW,IAAI,GAAG,IAAI,GAAG,MAAM,OAAO,SAAS,WAAW,IAAI,IAAI,IAAI,GAAG,MAAM,KAAK,QAAQ,OAAO,SAAS,WAAW,WAAW,IAAI,IAAI,GAAG;AAAA,EACxN,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,mBACJ,EAAE,uBAAuB,SAAS,IAC9B,EAAE,uBAAuB,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACvD;AAEN,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,SAAO,gCAA2B,EAAE,YAAY,kBAAkB,EAAE,UAAU;AAAA;AAAA,eAEjE,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,mBAC/B,EAAE,UAAU;AAAA,mBACZ,WAAW;AAAA;AAAA;AAAA;AAAA,eAIf,GAAG,MAAM,MAAM,EAAE,mBAAmB,WAAW,GAAG,GAAG,SAAI,GAAG,GAAG;AAAA,4BAClD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,0BACrD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,+BAC9C,IAAI,MAAM,eAAe,IAAI,GAAG,SAAI,IAAI,GAAG;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,wBAMlD,GAAG,MAAM,SAAS,GAAG,GAAG,uCAA6B,GAAG,UAAU;AAAA,6BAC7D,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,qBAAqB;AAAA,mEAClB,GAAG,2BAA2B;AAAA,2BACjE,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,mBAAmB;AAAA,4BAChD,IAAI,MAAM,SAAS,IAAI,YAAY,UAAK,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,0BAC3F,WAAW,GAAG,MAAM,CAAC,8BAAyB,GAAG,YAAY;AAAA,uBAChE,WAAW,GAAG,GAAG,CAAC,8BAAyB,GAAG,YAAY;AAAA,0BACvD,EAAE,6BAA6B,GAAG,qBAAqB,GAAG,kBAAkB;AAAA,sBAChF,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA,EAI9B,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMhB,iBAAiB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMjB,MAAM;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMN,UAAU;AAAA,EACV,YAAY;AACd;","names":["z","z"]}
1
+ {"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts","../../src/review/reviewer-options-schema.ts","../../src/calibration/aggregate.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema, ReviewerOptionsSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n\n // schema_version: 1 — additive-optional (Exp6 Session 2):\n // Same options object stamped on every per-run receipt. Absent = stochastic run.\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\nimport { ReviewerOptionsSchema } from '../review/reviewer-options-schema.js';\nexport { ReviewerOptionsSchema };\nexport type { ReviewerOptions } from '../review/reviewer-options-schema.js';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\n// schema_version: 1 — additive-optional additions (Exp6 Session 2):\n// reviewer_options: optional sampling params used during this calibration run.\n// Absent = stochastic run (pre-v0.6 compat preserved). Present = keys explicitly set.\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import { z } from 'zod';\n\n// Sampling parameters passed verbatim to the Ollama /api/chat `options` field.\n// Used by OllamaInternReviewer to control determinism. All fields optional —\n// omitted keys fall back to Ollama/model defaults. Introduced in Experiment 6\n// Session 2 to make reviewer conditions explicit in calibration receipts.\n//\n// LOAD-BEARING: temperature: 0 is valid and must not be dropped. All merges\n// in OllamaInternReviewer use `!== undefined` checks, NOT truthiness.\nexport const ReviewerOptionsSchema = z.object({\n num_ctx: z.number().int().positive().optional(),\n temperature: z.number().min(0).max(2).optional(),\n seed: z.number().int().optional(),\n top_p: z.number().min(0).max(1).optional(),\n top_k: z.number().int().nonnegative().optional(),\n repeat_penalty: z.number().min(0).optional(),\n});\n\nexport type ReviewerOptions = z.infer<typeof ReviewerOptionsSchema>;\n","import type { Architecture, CalibrationReceipt, PassFail, PerCategoryRecall, ReviewerOptions } from './receipt-schema.js';\nimport {\n AggregateCalibrationReceiptSchema,\n type AggregateCalibrationReceipt,\n type AggregateMetric,\n type AggregatePassFail,\n type PerCategoryAggregate,\n} from './aggregate-receipt-schema.js';\nimport type { StatusLabel } from './receipt-schema.js';\n\n// Compute median of a sorted or unsorted array.\n// Throws on empty input — callers always have at least one run.\n// For even-length arrays: mean of two middle values (float, not rounded).\n// Integer-valued metrics (FP count, decisions) stay as floats here;\n// the caller's bar comparisons (>= 3, === 0) work correctly on exact floats\n// because the inputs are small integers.\nexport function median(values: number[]): number {\n if (values.length === 0) throw new Error('median: empty array');\n const sorted = [...values].sort((a, b) => a - b);\n const mid = Math.floor(sorted.length / 2);\n if (sorted.length % 2 === 1) return sorted[mid];\n return (sorted[mid - 1] + sorted[mid]) / 2;\n}\n\n// Aggregate a list of per-run scalar values into { median, min, max, values }.\n// values preserves input order (run-001, run-002, ...) for traceability.\nexport function aggregateMetric(values: number[]): AggregateMetric {\n const m = median(values);\n return {\n median: m,\n min: Math.min(...values),\n max: Math.max(...values),\n values,\n };\n}\n\n// Aggregate per-run per-category recall objects.\n// Each element of perRunBuckets is one run's PerCategoryRecall\n// (Record<category, { matched, total, ratio }>).\n// Returns PerCategoryAggregate: per-category median/min/max ratio + per-run ratios.\n// total is taken from the first run that has the category (same across runs —\n// SEEDS is static so category totals never change between runs).\nexport function aggregatePerCategoryRecall(\n perRunBuckets: PerCategoryRecall[],\n): PerCategoryAggregate {\n const cats = new Set<string>();\n for (const run of perRunBuckets) {\n for (const cat of Object.keys(run)) cats.add(cat);\n }\n\n const result: PerCategoryAggregate = {};\n for (const cat of cats) {\n const ratios = perRunBuckets.map((run) => run[cat]?.ratio ?? 0);\n const total = perRunBuckets.find((run) => run[cat] !== undefined)?.[cat]?.total ?? 0;\n result[cat] = {\n median_ratio: median(ratios),\n min_ratio: Math.min(...ratios),\n max_ratio: Math.max(...ratios),\n total,\n per_run_ratios: ratios,\n };\n }\n return result;\n}\n\n// Aggregate per-run decision vocabulary count dicts.\n// Each element is one run's decision_vocabulary (Record<decision, count>).\n// Returns Record<decision, AggregateMetric> with median count per decision.\nexport function aggregateDecisionVocabulary(\n perRunDicts: Record<string, number>[],\n): Record<string, AggregateMetric> {\n const decisions = new Set<string>();\n for (const run of perRunDicts) {\n for (const d of Object.keys(run)) decisions.add(d);\n }\n\n const result: Record<string, AggregateMetric> = {};\n for (const d of decisions) {\n const values = perRunDicts.map((run) => run[d] ?? 0);\n result[d] = aggregateMetric(values);\n }\n return result;\n}\n\n// Compute aggregate PASS/FAIL bars from aggregated metrics.\n//\n// Advisor-locked rules (gospel):\n// FP ceiling: median <= 1 AND max <= 2\n// Any-flag recall: median >= 0.65\n// Per-category: median_ratio >= 0.50 for categories with total >= 2\n// Strict recall: median >= 0.20\n// Decision vocab: median >= required (architecture-aware: two-pass=3, single-pass=4)\n// Latency soft: median <= 600_000 → WARN only, never FAIL\n// Latency hard: every-run rule — max <= 1_200_000\n// Empty/malformed: every-run rule — max === 0\nexport function computeAggregatePassFail(input: {\n good_fp_count: AggregateMetric;\n any_flag_recall_ratio: AggregateMetric;\n per_category_any_flag: PerCategoryAggregate;\n strict_recall_ratio: AggregateMetric;\n decisions_produced_count: AggregateMetric;\n architecture: Architecture;\n runtime_ms: AggregateMetric;\n empty_or_malformed_responses: AggregateMetric;\n}): AggregatePassFail {\n const fp_ceiling: 'PASS' | 'FAIL' =\n input.good_fp_count.median <= 1 && input.good_fp_count.max <= 2 ? 'PASS' : 'FAIL';\n\n const any_flag_recall_floor: 'PASS' | 'FAIL' =\n input.any_flag_recall_ratio.median >= 0.65 ? 'PASS' : 'FAIL';\n\n let per_category_any_flag_floor: 'PASS' | 'FAIL' = 'PASS';\n for (const entry of Object.values(input.per_category_any_flag)) {\n if (entry.total >= 2 && entry.median_ratio < 0.5) {\n per_category_any_flag_floor = 'FAIL';\n break;\n }\n }\n\n const strict_recall_floor: 'PASS' | 'FAIL' =\n input.strict_recall_ratio.median >= 0.2 ? 'PASS' : 'FAIL';\n\n const dvRequired = input.architecture === 'two-pass' ? 3 : 4;\n const decision_vocab_completeness: 'PASS' | 'FAIL' =\n input.decisions_produced_count.median >= dvRequired ? 'PASS' : 'FAIL';\n\n // Latency soft: WARN-only signal — no FAIL contribution\n const latency_soft: 'PASS' | 'WARN' =\n input.runtime_ms.median <= 600_000 ? 'PASS' : 'WARN';\n\n // Latency hard: every-run rule — enforced via max\n const latency_hard: 'PASS' | 'FAIL' =\n input.runtime_ms.max <= 1_200_000 ? 'PASS' : 'FAIL';\n\n // Empty/malformed: every-run rule — enforced via max\n const empty_or_malformed: 'PASS' | 'FAIL' =\n input.empty_or_malformed_responses.max === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: ('PASS' | 'FAIL')[] = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall: 'PASS' | 'FAIL' = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Compute which hard bars FAILed in >= ceil(N/2) individual runs.\n// A non-empty result means that bar was SYSTEMATICALLY unreliable —\n// not just a one-run outlier that happened to median-pass.\n// This is used by computeAggregateStatusLabel to prevent a profile from\n// earning trusted_baseline when one bar failed in the majority of runs.\n//\n// Hard bars checked (latency_soft and overall are excluded):\n// fp_ceiling, any_flag_recall_floor, per_category_any_flag_floor,\n// strict_recall_floor, decision_vocab_completeness, latency_hard, empty_or_malformed\nexport function computeRecurringBarFailures(\n perRunPassFails: PassFail[],\n totalRuns: number,\n): string[] {\n const threshold = Math.ceil(totalRuns / 2);\n const HARD_BARS: (keyof PassFail)[] = [\n 'fp_ceiling',\n 'any_flag_recall_floor',\n 'per_category_any_flag_floor',\n 'strict_recall_floor',\n 'decision_vocab_completeness',\n 'latency_hard',\n 'empty_or_malformed',\n ];\n\n const recurring: string[] = [];\n for (const bar of HARD_BARS) {\n const failCount = perRunPassFails.filter((pf) => pf[bar] === 'FAIL').length;\n if (failCount >= threshold) recurring.push(bar);\n }\n return recurring;\n}\n\n// Assign aggregate status label.\n//\n// Advisor-locked predicates (priority order):\n// 1. comparison_only — explicit mode flag OR single-pass Hermes (regardless of pass/fail)\n// 2. failed — aggregate pass_fail.overall === FAIL\n// 3. trusted_baseline — Hermes two-pass AND aggregate PASS AND median(FP) === 0\n// AND recurring_bar_failures.length === 0\n// The recurring-failure check prevents a profile from earning trusted_baseline\n// when any hard bar FAILed in >= ceil(N/2) runs even if the median still passed.\n// Intent: \"one lucky median cannot mask systemic bar weakness.\"\n// 4. conditional_pass — fallthrough (passes but doesn't earn trusted_baseline)\n// Mistral two-pass is capped at conditional_pass regardless of aggregate result.\nexport function computeAggregateStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n aggregatePassFail: AggregatePassFail;\n medianGoodFpCount: number;\n recurringBarFailures: string[];\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n if (input.aggregatePassFail.overall === 'FAIL') return 'failed';\n\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (\n isHermesTwoPass &&\n input.medianGoodFpCount === 0 &&\n input.recurringBarFailures.length === 0\n ) {\n return 'trusted_baseline';\n }\n\n return 'conditional_pass';\n}\n\n// Aggregate N single-run receipts into one AggregateCalibrationReceipt.\n// All receipts must be from the same profile/model/architecture.\n// opts.runFiles: relative paths for each run (e.g. 'runs/run-001.json').\n// opts.modeOverride: forward 'comparison_only' to status-label predicate.\n// opts.aggregatedAt: ISO timestamp (defaults to now).\n// opts.reviewerOptions: reviewer sampling options stamped on each per-run receipt.\n// Captured once at harness startup and reused across all N runs. The aggregate\n// carries the same object so consumers can reproduce the exact invocation.\nexport function aggregateReceipts(\n runs: CalibrationReceipt[],\n opts: {\n runFiles: string[];\n modeOverride?: 'comparison_only';\n aggregatedAt?: string;\n reviewerOptions?: ReviewerOptions;\n },\n): AggregateCalibrationReceipt {\n if (runs.length === 0) throw new Error('aggregateReceipts: no runs provided');\n const first = runs[0];\n\n const fpMetric = aggregateMetric(runs.map((r) => r.good_fp_count));\n const anyFlagRatioMetric = aggregateMetric(runs.map((r) => r.any_flag_recall.ratio));\n const strictRatioMetric = aggregateMetric(runs.map((r) => r.strict_recall.ratio));\n const decisionsMetric = aggregateMetric(runs.map((r) => r.decisions_produced_count));\n const runtimeMetric = aggregateMetric(runs.map((r) => r.runtime_ms));\n const emptyOrMalformedMetric = aggregateMetric(\n runs.map((r) => r.empty_or_malformed_responses),\n );\n\n const perCatAnyFlag = aggregatePerCategoryRecall(runs.map((r) => r.per_category_any_flag));\n const perCatStrict = aggregatePerCategoryRecall(runs.map((r) => r.per_category_strict));\n const decisionVocab = aggregateDecisionVocabulary(runs.map((r) => r.decision_vocabulary));\n\n const dvRequired = first.architecture === 'two-pass' ? 3 : 4;\n const decisionVocabBar = {\n architecture: first.architecture,\n required: dvRequired,\n median_produced: decisionsMetric.median,\n passed: decisionsMetric.median >= dvRequired,\n };\n\n const aggregatePassFail = computeAggregatePassFail({\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n per_category_any_flag: perCatAnyFlag,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n architecture: first.architecture,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n });\n\n const recurringBarFailures = computeRecurringBarFailures(\n runs.map((r) => r.pass_fail),\n runs.length,\n );\n\n const status = computeAggregateStatusLabel({\n profileName: first.profile_name,\n architecture: first.architecture,\n aggregatePassFail,\n medianGoodFpCount: fpMetric.median,\n recurringBarFailures,\n modeOverride: opts.modeOverride,\n });\n\n const notes: string[] = [];\n if (aggregatePassFail.latency_soft === 'WARN') {\n notes.push(\n `Latency warning: median ${(runtimeMetric.median / 1000).toFixed(1)}s exceeds soft limit of 600s`,\n );\n }\n if (fpMetric.median > 0) {\n notes.push(`FP at ceiling: median ${fpMetric.median} false positive(s) on good claims`);\n }\n if (recurringBarFailures.length > 0) {\n notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurringBarFailures.join(', ')}`);\n }\n if (status === 'comparison_only') {\n notes.push(\n 'comparison_only: architectural side-run, not a production admission candidate',\n );\n }\n if (status === 'conditional_pass') {\n notes.push('conditional_pass: passes all bars but carries a production caution');\n }\n\n return AggregateCalibrationReceiptSchema.parse({\n schema_version: 1,\n receipt_kind: 'aggregate',\n profile_name: first.profile_name,\n status,\n model: first.model,\n architecture: first.architecture,\n fixture: first.fixture,\n fixture_total_claims: first.fixture_total_claims,\n fixture_good_claims: first.fixture_good_claims,\n fixture_bad_claims: first.fixture_bad_claims,\n runs_count: runs.length,\n run_files: opts.runFiles,\n aggregated_at: opts.aggregatedAt ?? new Date().toISOString(),\n research_os_version: first.research_os_version,\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n per_category_any_flag: perCatAnyFlag,\n per_category_strict: perCatStrict,\n decision_vocabulary: decisionVocab,\n decision_vocab_bar: decisionVocabBar,\n unreachable_decisions: first.unreachable_decisions,\n pass_fail: aggregatePassFail,\n recurring_bar_failures: recurringBarFailures,\n notes,\n ...(opts.reviewerOptions && Object.keys(opts.reviewerOptions).length > 0 && {\n reviewer_options: opts.reviewerOptions,\n }),\n });\n}\n\n// Stable key order for reviewer_options rendering (matches single-run receipt).\nconst REVIEWER_OPTIONS_KEY_ORDER = [\n 'num_ctx',\n 'temperature',\n 'seed',\n 'top_p',\n 'top_k',\n 'repeat_penalty',\n] as const;\n\nfunction buildReviewerOptionsSection(opts: AggregateCalibrationReceipt['reviewer_options']): string {\n if (!opts) return '';\n const lines = REVIEWER_OPTIONS_KEY_ORDER\n .filter((k) => opts[k] !== undefined)\n .map((k) => `- ${k}: ${opts[k]}`);\n if (lines.length === 0) return '';\n return `\\n## Reviewer options\\n\\n${lines.join('\\n')}\\n`;\n}\n\n// Render the aggregate calibration receipt as compact Markdown.\n// Operator proof artifact — no prose.\nexport function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const secRounded = (ms: number) => `${(ms / 1000).toFixed(1)}s`;\n\n const af = r.any_flag_recall_ratio;\n const sr = r.strict_recall_ratio;\n const fp = r.good_fp_count;\n const dec = r.decisions_produced_count;\n const rt = r.runtime_ms;\n const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const runFileList =\n r.run_files.length > 0\n ? `${r.run_files[0]} … ${r.run_files[r.run_files.length - 1]}`\n : '(none)';\n\n const perCatAnyFlagRows = Object.entries(r.per_category_any_flag)\n .map(([cat, entry]) => {\n const st = r.per_category_strict[cat];\n return (\n `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}–${pct(entry.max_ratio)} | ${entry.total} |` +\n (st\n ? ` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}–${pct(st.max_ratio)} |`\n : ' — | — |')\n );\n })\n .join('\\n');\n\n const ALL_DECISIONS = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ];\n const dvRows = ALL_DECISIONS.map((d) => {\n const metric = r.decision_vocabulary[d];\n const unreachable = r.unreachable_decisions.includes(d)\n ? ` (unreachable from ${r.fixture})`\n : '';\n if (!metric) return `| ${d} | — | — |${unreachable}`;\n return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}–${metric.max}${unreachable} |`;\n }).join('\\n');\n\n // Per-run summary table — pulled from run_files labels for clarity\n const perRunRows = r.any_flag_recall_ratio.values\n .map((afr, i) => {\n const fp_i = r.good_fp_count.values[i] ?? '?';\n const sr_i = r.strict_recall_ratio.values[i] ?? '?';\n const dec_i = r.decisions_produced_count.values[i] ?? '?';\n const rt_i = r.runtime_ms.values[i] ?? '?';\n return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === 'number' ? pct(afr) : '?'} | ${typeof sr_i === 'number' ? pct(sr_i) : '?'} | ${dec_i}/6 | ${typeof rt_i === 'number' ? secRounded(rt_i) : '?'} |`;\n })\n .join('\\n');\n\n const recurringSection =\n r.recurring_bar_failures.length > 0\n ? r.recurring_bar_failures.map((b) => `- ${b}`).join('\\n')\n : 'None.';\n\n const notesSection =\n r.notes.length > 0 ? `\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n const reviewerOptionsSection = buildReviewerOptionsSection(r.reviewer_options);\n\n return `# Calibration Receipt — ${r.profile_name} (aggregate, N=${r.runs_count} runs)\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Aggregated at:** ${r.aggregated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Run count:** ${r.runs_count}\n- **Run files:** ${runFileList}\n${reviewerOptionsSection}\n## Headline metrics (median across runs)\n\n- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}–${fp.max})\n- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}–${pct(af.max)})\n- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}–${pct(sr.max)})\n- Decisions produced: median ${dec.median} / 6 (range ${dec.min}–${dec.max})\n\n## PASS / FAIL (aggregate)\n\n| Bar | Rule | Result |\n|---|---|---|\n| FP ceiling | median=${fp.median}, max=${fp.max} (median ≤1 AND max ≤2) | ${pf.fp_ceiling} |\n| Any-flag recall | median=${pct(af.median)} (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag | median ≥50% per cat (see below) | ${pf.per_category_any_flag_floor} |\n| Strict recall | median=${pct(sr.median)} (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab | median=${dec.median} / 6 (${bar.architecture} ≥${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft | median=${secRounded(rt.median)} (≤600s, WARN only) | ${pf.latency_soft} |\n| Latency hard | max=${secRounded(rt.max)} (every run ≤1200s) | ${pf.latency_hard} |\n| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |\n| **OVERALL** | | **${pf.overall}** |\n\n## Recurring hard-bar failures\n\n${recurringSection}\n\n## Per-category recall (median across runs)\n\n| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range |\n|---|---|---|---|---|---|\n${perCatAnyFlagRows}\n\n## Decision vocabulary (median count across runs)\n\n| Decision | Median | Range |\n|---|---|---|\n${dvRows}\n\n## Per-run summary\n\n| Run | FP | Any-flag | Strict | Decisions | Runtime |\n|---|---|---|---|---|---|\n${perRunRows}\n${notesSection}`;\n}\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,KAAAC,UAAS;;;ACAlB,SAAS,SAAS;AASX,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS;AAAA,EAC9C,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EAC/C,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS;AAAA,EAC/C,gBAAgB,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS;AAC7C,CAAC;;;ADXM,IAAM,oBAAoBC,GAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqBA,GAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAeA,GAAE,OAAO;AAAA,EACnC,SAASA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAOA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0BA,GAAE,OAAOA,GAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyBA,GAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAKM,IAAM,2BAA2BA,GAAE,OAAO;AAAA,EAC/C,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAeA,GAAE,OAAO;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA,EAC9B,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAeA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzC,8BAA8BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;;;ADnEM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,qBAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA;AAAA,EAIzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;;;AGjEM,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,qBAAqB;AAC9D,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC/C,QAAM,MAAM,KAAK,MAAM,OAAO,SAAS,CAAC;AACxC,MAAI,OAAO,SAAS,MAAM,EAAG,QAAO,OAAO,GAAG;AAC9C,UAAQ,OAAO,MAAM,CAAC,IAAI,OAAO,GAAG,KAAK;AAC3C;AAIO,SAAS,gBAAgB,QAAmC;AACjE,QAAM,IAAI,OAAO,MAAM;AACvB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB;AAAA,EACF;AACF;AAQO,SAAS,2BACd,eACsB;AACtB,QAAM,OAAO,oBAAI,IAAY;AAC7B,aAAW,OAAO,eAAe;AAC/B,eAAW,OAAO,OAAO,KAAK,GAAG,EAAG,MAAK,IAAI,GAAG;AAAA,EAClD;AAEA,QAAM,SAA+B,CAAC;AACtC,aAAW,OAAO,MAAM;AACtB,UAAM,SAAS,cAAc,IAAI,CAAC,QAAQ,IAAI,GAAG,GAAG,SAAS,CAAC;AAC9D,UAAM,QAAQ,cAAc,KAAK,CAAC,QAAQ,IAAI,GAAG,MAAM,MAAS,IAAI,GAAG,GAAG,SAAS;AACnF,WAAO,GAAG,IAAI;AAAA,MACZ,cAAc,OAAO,MAAM;AAAA,MAC3B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B;AAAA,MACA,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,4BACd,aACiC;AACjC,QAAM,YAAY,oBAAI,IAAY;AAClC,aAAW,OAAO,aAAa;AAC7B,eAAW,KAAK,OAAO,KAAK,GAAG,EAAG,WAAU,IAAI,CAAC;AAAA,EACnD;AAEA,QAAM,SAA0C,CAAC;AACjD,aAAW,KAAK,WAAW;AACzB,UAAM,SAAS,YAAY,IAAI,CAAC,QAAQ,IAAI,CAAC,KAAK,CAAC;AACnD,WAAO,CAAC,IAAI,gBAAgB,MAAM;AAAA,EACpC;AACA,SAAO;AACT;AAaO,SAAS,yBAAyB,OASnB;AACpB,QAAM,aACJ,MAAM,cAAc,UAAU,KAAK,MAAM,cAAc,OAAO,IAAI,SAAS;AAE7E,QAAM,wBACJ,MAAM,sBAAsB,UAAU,OAAO,SAAS;AAExD,MAAI,8BAA+C;AACnD,aAAW,SAAS,OAAO,OAAO,MAAM,qBAAqB,GAAG;AAC9D,QAAI,MAAM,SAAS,KAAK,MAAM,eAAe,KAAK;AAChD,oCAA8B;AAC9B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBACJ,MAAM,oBAAoB,UAAU,MAAM,SAAS;AAErD,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,8BACJ,MAAM,yBAAyB,UAAU,aAAa,SAAS;AAGjE,QAAM,eACJ,MAAM,WAAW,UAAU,MAAU,SAAS;AAGhD,QAAM,eACJ,MAAM,WAAW,OAAO,OAAY,SAAS;AAG/C,QAAM,qBACJ,MAAM,6BAA6B,QAAQ,IAAI,SAAS;AAE1D,QAAM,WAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAA2B,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAEhF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAWO,SAAS,4BACd,iBACA,WACU;AACV,QAAM,YAAY,KAAK,KAAK,YAAY,CAAC;AACzC,QAAM,YAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,QAAM,YAAsB,CAAC;AAC7B,aAAW,OAAO,WAAW;AAC3B,UAAM,YAAY,gBAAgB,OAAO,CAAC,OAAO,GAAG,GAAG,MAAM,MAAM,EAAE;AACrE,QAAI,aAAa,UAAW,WAAU,KAAK,GAAG;AAAA,EAChD;AACA,SAAO;AACT;AAcO,SAAS,4BAA4B,OAO5B;AACd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAErD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,kBAAkB,YAAY,OAAQ,QAAO;AAEvD,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MACE,mBACA,MAAM,sBAAsB,KAC5B,MAAM,qBAAqB,WAAW,GACtC;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAUO,SAAS,kBACd,MACA,MAM6B;AAC7B,MAAI,KAAK,WAAW,EAAG,OAAM,IAAI,MAAM,qCAAqC;AAC5E,QAAM,QAAQ,KAAK,CAAC;AAEpB,QAAM,WAAW,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC;AACjE,QAAM,qBAAqB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,gBAAgB,KAAK,CAAC;AACnF,QAAM,oBAAoB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,cAAc,KAAK,CAAC;AAChF,QAAM,kBAAkB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,wBAAwB,CAAC;AACnF,QAAM,gBAAgB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,UAAU,CAAC;AACnE,QAAM,yBAAyB;AAAA,IAC7B,KAAK,IAAI,CAAC,MAAM,EAAE,4BAA4B;AAAA,EAChD;AAEA,QAAM,gBAAgB,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,qBAAqB,CAAC;AACzF,QAAM,eAAe,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AACtF,QAAM,gBAAgB,4BAA4B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AAExF,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,mBAAmB;AAAA,IACvB,cAAc,MAAM;AAAA,IACpB,UAAU;AAAA,IACV,iBAAiB,gBAAgB;AAAA,IACjC,QAAQ,gBAAgB,UAAU;AAAA,EACpC;AAEA,QAAM,oBAAoB,yBAAyB;AAAA,IACjD,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,cAAc,MAAM;AAAA,IACpB,YAAY;AAAA,IACZ,8BAA8B;AAAA,EAChC,CAAC;AAED,QAAM,uBAAuB;AAAA,IAC3B,KAAK,IAAI,CAAC,MAAM,EAAE,SAAS;AAAA,IAC3B,KAAK;AAAA,EACP;AAEA,QAAM,SAAS,4BAA4B;AAAA,IACzC,aAAa,MAAM;AAAA,IACnB,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,mBAAmB,SAAS;AAAA,IAC5B;AAAA,IACA,cAAc,KAAK;AAAA,EACrB,CAAC;AAED,QAAM,QAAkB,CAAC;AACzB,MAAI,kBAAkB,iBAAiB,QAAQ;AAC7C,UAAM;AAAA,MACJ,4BAA4B,cAAc,SAAS,KAAM,QAAQ,CAAC,CAAC;AAAA,IACrE;AAAA,EACF;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,yBAAyB,SAAS,MAAM,mCAAmC;AAAA,EACxF;AACA,MAAI,qBAAqB,SAAS,GAAG;AACnC,UAAM,KAAK,+CAA+C,qBAAqB,KAAK,IAAI,CAAC,EAAE;AAAA,EAC7F;AACA,MAAI,WAAW,mBAAmB;AAChC,UAAM;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACA,MAAI,WAAW,oBAAoB;AACjC,UAAM,KAAK,oEAAoE;AAAA,EACjF;AAEA,SAAO,kCAAkC,MAAM;AAAA,IAC7C,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,OAAO,MAAM;AAAA,IACb,cAAc,MAAM;AAAA,IACpB,SAAS,MAAM;AAAA,IACf,sBAAsB,MAAM;AAAA,IAC5B,qBAAqB,MAAM;AAAA,IAC3B,oBAAoB,MAAM;AAAA,IAC1B,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,eAAe,KAAK,iBAAgB,oBAAI,KAAK,GAAE,YAAY;AAAA,IAC3D,qBAAqB,MAAM;AAAA,IAC3B,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,YAAY;AAAA,IACZ,8BAA8B;AAAA,IAC9B,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,qBAAqB;AAAA,IACrB,oBAAoB;AAAA,IACpB,uBAAuB,MAAM;AAAA,IAC7B,WAAW;AAAA,IACX,wBAAwB;AAAA,IACxB;AAAA,IACA,GAAI,KAAK,mBAAmB,OAAO,KAAK,KAAK,eAAe,EAAE,SAAS,KAAK;AAAA,MAC1E,kBAAkB,KAAK;AAAA,IACzB;AAAA,EACF,CAAC;AACH;AAGA,IAAM,6BAA6B;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,4BAA4B,MAA+D;AAClG,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,QAAQ,2BACX,OAAO,CAAC,MAAM,KAAK,CAAC,MAAM,MAAS,EACnC,IAAI,CAAC,MAAM,KAAK,CAAC,KAAK,KAAK,CAAC,CAAC,EAAE;AAClC,MAAI,MAAM,WAAW,EAAG,QAAO;AAC/B,SAAO;AAAA;AAAA;AAAA,EAA4B,MAAM,KAAK,IAAI,CAAC;AAAA;AACrD;AAIO,SAAS,8BAA8B,GAAwC;AACpF,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,aAAa,CAAC,OAAe,IAAI,KAAK,KAAM,QAAQ,CAAC,CAAC;AAE5D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AACd,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,cACJ,EAAE,UAAU,SAAS,IACjB,GAAG,EAAE,UAAU,CAAC,CAAC,WAAM,EAAE,UAAU,EAAE,UAAU,SAAS,CAAC,CAAC,KAC1D;AAEN,QAAM,oBAAoB,OAAO,QAAQ,EAAE,qBAAqB,EAC7D,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM;AACrB,UAAM,KAAK,EAAE,oBAAoB,GAAG;AACpC,WACE,KAAK,GAAG,MAAM,IAAI,MAAM,YAAY,CAAC,MAAM,IAAI,MAAM,SAAS,CAAC,SAAI,IAAI,MAAM,SAAS,CAAC,MAAM,MAAM,KAAK,QACvG,KACG,IAAI,IAAI,GAAG,YAAY,CAAC,MAAM,IAAI,GAAG,SAAS,CAAC,SAAI,IAAI,GAAG,SAAS,CAAC,OACpE;AAAA,EAER,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,gBAAgB;AAAA,IACpB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,SAAS,cAAc,IAAI,CAAC,MAAM;AACtC,UAAM,SAAS,EAAE,oBAAoB,CAAC;AACtC,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAClD,sBAAsB,EAAE,OAAO,MAC/B;AACJ,QAAI,CAAC,OAAQ,QAAO,KAAK,CAAC,uBAAa,WAAW;AAClD,WAAO,KAAK,CAAC,MAAM,OAAO,OAAO,QAAQ,CAAC,CAAC,MAAM,OAAO,GAAG,SAAI,OAAO,GAAG,GAAG,WAAW;AAAA,EACzF,CAAC,EAAE,KAAK,IAAI;AAGZ,QAAM,aAAa,EAAE,sBAAsB,OACxC,IAAI,CAAC,KAAK,MAAM;AACf,UAAM,OAAO,EAAE,cAAc,OAAO,CAAC,KAAK;AAC1C,UAAM,OAAO,EAAE,oBAAoB,OAAO,CAAC,KAAK;AAChD,UAAM,QAAQ,EAAE,yBAAyB,OAAO,CAAC,KAAK;AACtD,UAAM,OAAO,EAAE,WAAW,OAAO,CAAC,KAAK;AACvC,WAAO,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,EAAE,mBAAmB,MAAM,OAAO,QAAQ,WAAW,IAAI,GAAG,IAAI,GAAG,MAAM,OAAO,SAAS,WAAW,IAAI,IAAI,IAAI,GAAG,MAAM,KAAK,QAAQ,OAAO,SAAS,WAAW,WAAW,IAAI,IAAI,GAAG;AAAA,EACxN,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,mBACJ,EAAE,uBAAuB,SAAS,IAC9B,EAAE,uBAAuB,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACvD;AAEN,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,QAAM,yBAAyB,4BAA4B,EAAE,gBAAgB;AAE7E,SAAO,gCAA2B,EAAE,YAAY,kBAAkB,EAAE,UAAU;AAAA;AAAA,eAEjE,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,mBAC/B,EAAE,UAAU;AAAA,mBACZ,WAAW;AAAA,EAC5B,sBAAsB;AAAA;AAAA;AAAA,eAGT,GAAG,MAAM,MAAM,EAAE,mBAAmB,WAAW,GAAG,GAAG,SAAI,GAAG,GAAG;AAAA,4BAClD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,0BACrD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,+BAC9C,IAAI,MAAM,eAAe,IAAI,GAAG,SAAI,IAAI,GAAG;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,wBAMlD,GAAG,MAAM,SAAS,GAAG,GAAG,uCAA6B,GAAG,UAAU;AAAA,6BAC7D,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,qBAAqB;AAAA,mEAClB,GAAG,2BAA2B;AAAA,2BACjE,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,mBAAmB;AAAA,4BAChD,IAAI,MAAM,SAAS,IAAI,YAAY,UAAK,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,0BAC3F,WAAW,GAAG,MAAM,CAAC,8BAAyB,GAAG,YAAY;AAAA,uBAChE,WAAW,GAAG,GAAG,CAAC,8BAAyB,GAAG,YAAY;AAAA,0BACvD,EAAE,6BAA6B,GAAG,qBAAqB,GAAG,kBAAkB;AAAA,sBAChF,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA,EAI9B,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMhB,iBAAiB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMjB,MAAM;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMN,UAAU;AAAA,EACV,YAAY;AACd;","names":["z","z","z","z"]}
@@ -1,4 +1,5 @@
1
1
  import { z } from 'zod';
2
+ export { R as ReviewerOptions, a as ReviewerOptionsSchema } from '../reviewer-options-schema-PZacF_MO.js';
2
3
 
3
4
  declare const StatusLabelSchema: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
4
5
  declare const ArchitectureSchema: z.ZodEnum<["single-pass", "two-pass"]>;
@@ -193,19 +194,40 @@ declare const CalibrationReceiptSchema: z.ZodObject<{
193
194
  overall: "PASS" | "FAIL";
194
195
  }>;
195
196
  notes: z.ZodArray<z.ZodString, "many">;
197
+ reviewer_options: z.ZodOptional<z.ZodObject<{
198
+ num_ctx: z.ZodOptional<z.ZodNumber>;
199
+ temperature: z.ZodOptional<z.ZodNumber>;
200
+ seed: z.ZodOptional<z.ZodNumber>;
201
+ top_p: z.ZodOptional<z.ZodNumber>;
202
+ top_k: z.ZodOptional<z.ZodNumber>;
203
+ repeat_penalty: z.ZodOptional<z.ZodNumber>;
204
+ }, "strip", z.ZodTypeAny, {
205
+ num_ctx?: number | undefined;
206
+ temperature?: number | undefined;
207
+ seed?: number | undefined;
208
+ top_p?: number | undefined;
209
+ top_k?: number | undefined;
210
+ repeat_penalty?: number | undefined;
211
+ }, {
212
+ num_ctx?: number | undefined;
213
+ temperature?: number | undefined;
214
+ seed?: number | undefined;
215
+ top_p?: number | undefined;
216
+ top_k?: number | undefined;
217
+ repeat_penalty?: number | undefined;
218
+ }>>;
196
219
  }, "strip", z.ZodTypeAny, {
197
- research_os_version: string;
198
220
  status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
199
- notes: string[];
221
+ architecture: "single-pass" | "two-pass";
200
222
  schema_version: 1;
201
223
  profile_name: string;
202
224
  model: string;
203
- architecture: "single-pass" | "two-pass";
204
225
  fixture: string;
205
226
  fixture_total_claims: number;
206
227
  fixture_good_claims: number;
207
228
  fixture_bad_claims: number;
208
229
  calibrated_at: string;
230
+ research_os_version: string;
209
231
  runtime_ms: number;
210
232
  good_fp_count: number;
211
233
  any_flag_recall: {
@@ -249,19 +271,27 @@ declare const CalibrationReceiptSchema: z.ZodObject<{
249
271
  empty_or_malformed: "PASS" | "FAIL";
250
272
  overall: "PASS" | "FAIL";
251
273
  };
274
+ notes: string[];
275
+ reviewer_options?: {
276
+ num_ctx?: number | undefined;
277
+ temperature?: number | undefined;
278
+ seed?: number | undefined;
279
+ top_p?: number | undefined;
280
+ top_k?: number | undefined;
281
+ repeat_penalty?: number | undefined;
282
+ } | undefined;
252
283
  }, {
253
- research_os_version: string;
254
284
  status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
255
- notes: string[];
285
+ architecture: "single-pass" | "two-pass";
256
286
  schema_version: 1;
257
287
  profile_name: string;
258
288
  model: string;
259
- architecture: "single-pass" | "two-pass";
260
289
  fixture: string;
261
290
  fixture_total_claims: number;
262
291
  fixture_good_claims: number;
263
292
  fixture_bad_claims: number;
264
293
  calibrated_at: string;
294
+ research_os_version: string;
265
295
  runtime_ms: number;
266
296
  good_fp_count: number;
267
297
  any_flag_recall: {
@@ -305,6 +335,15 @@ declare const CalibrationReceiptSchema: z.ZodObject<{
305
335
  empty_or_malformed: "PASS" | "FAIL";
306
336
  overall: "PASS" | "FAIL";
307
337
  };
338
+ notes: string[];
339
+ reviewer_options?: {
340
+ num_ctx?: number | undefined;
341
+ temperature?: number | undefined;
342
+ seed?: number | undefined;
343
+ top_p?: number | undefined;
344
+ top_k?: number | undefined;
345
+ repeat_penalty?: number | undefined;
346
+ } | undefined;
308
347
  }>;
309
348
  type StatusLabel = z.infer<typeof StatusLabelSchema>;
310
349
  type Architecture = z.infer<typeof ArchitectureSchema>;