@mcptoolshop/research-os 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts","../../src/calibration/aggregate.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n 
fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 
'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: 
z.array(z.string()),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import type { Architecture, CalibrationReceipt, PassFail, PerCategoryRecall } from './receipt-schema.js';\nimport {\n AggregateCalibrationReceiptSchema,\n type AggregateCalibrationReceipt,\n type AggregateMetric,\n type AggregatePassFail,\n type PerCategoryAggregate,\n} from './aggregate-receipt-schema.js';\nimport type { StatusLabel } from './receipt-schema.js';\n\n// Compute median of a sorted or unsorted array.\n// Throws on empty input — callers always have at least one run.\n// For even-length arrays: mean of two middle values (float, not rounded).\n// Integer-valued metrics (FP count, decisions) stay as floats here;\n// the caller's bar comparisons (>= 3, === 0) work correctly on exact floats\n// because the inputs are small integers.\nexport function median(values: number[]): number {\n if (values.length === 0) throw new Error('median: empty array');\n const sorted = [...values].sort((a, b) => a - b);\n const mid = Math.floor(sorted.length / 2);\n if (sorted.length % 2 === 1) return sorted[mid];\n return (sorted[mid - 1] + sorted[mid]) / 2;\n}\n\n// Aggregate a list of per-run scalar values into { median, min, max, values }.\n// values preserves input order (run-001, run-002, ...) 
for traceability.\nexport function aggregateMetric(values: number[]): AggregateMetric {\n const m = median(values);\n return {\n median: m,\n min: Math.min(...values),\n max: Math.max(...values),\n values,\n };\n}\n\n// Aggregate per-run per-category recall objects.\n// Each element of perRunBuckets is one run's PerCategoryRecall\n// (Record<category, { matched, total, ratio }>).\n// Returns PerCategoryAggregate: per-category median/min/max ratio + per-run ratios.\n// total is taken from the first run that has the category (same across runs —\n// SEEDS is static so category totals never change between runs).\nexport function aggregatePerCategoryRecall(\n perRunBuckets: PerCategoryRecall[],\n): PerCategoryAggregate {\n const cats = new Set<string>();\n for (const run of perRunBuckets) {\n for (const cat of Object.keys(run)) cats.add(cat);\n }\n\n const result: PerCategoryAggregate = {};\n for (const cat of cats) {\n const ratios = perRunBuckets.map((run) => run[cat]?.ratio ?? 0);\n const total = perRunBuckets.find((run) => run[cat] !== undefined)?.[cat]?.total ?? 0;\n result[cat] = {\n median_ratio: median(ratios),\n min_ratio: Math.min(...ratios),\n max_ratio: Math.max(...ratios),\n total,\n per_run_ratios: ratios,\n };\n }\n return result;\n}\n\n// Aggregate per-run decision vocabulary count dicts.\n// Each element is one run's decision_vocabulary (Record<decision, count>).\n// Returns Record<decision, AggregateMetric> with median count per decision.\nexport function aggregateDecisionVocabulary(\n perRunDicts: Record<string, number>[],\n): Record<string, AggregateMetric> {\n const decisions = new Set<string>();\n for (const run of perRunDicts) {\n for (const d of Object.keys(run)) decisions.add(d);\n }\n\n const result: Record<string, AggregateMetric> = {};\n for (const d of decisions) {\n const values = perRunDicts.map((run) => run[d] ?? 
0);\n result[d] = aggregateMetric(values);\n }\n return result;\n}\n\n// Compute aggregate PASS/FAIL bars from aggregated metrics.\n//\n// Advisor-locked rules (gospel):\n// FP ceiling: median <= 1 AND max <= 2\n// Any-flag recall: median >= 0.65\n// Per-category: median_ratio >= 0.50 for categories with total >= 2\n// Strict recall: median >= 0.20\n// Decision vocab: median >= required (architecture-aware: two-pass=3, single-pass=4)\n// Latency soft: median <= 600_000 → WARN only, never FAIL\n// Latency hard: every-run rule — max <= 1_200_000\n// Empty/malformed: every-run rule — max === 0\nexport function computeAggregatePassFail(input: {\n good_fp_count: AggregateMetric;\n any_flag_recall_ratio: AggregateMetric;\n per_category_any_flag: PerCategoryAggregate;\n strict_recall_ratio: AggregateMetric;\n decisions_produced_count: AggregateMetric;\n architecture: Architecture;\n runtime_ms: AggregateMetric;\n empty_or_malformed_responses: AggregateMetric;\n}): AggregatePassFail {\n const fp_ceiling: 'PASS' | 'FAIL' =\n input.good_fp_count.median <= 1 && input.good_fp_count.max <= 2 ? 'PASS' : 'FAIL';\n\n const any_flag_recall_floor: 'PASS' | 'FAIL' =\n input.any_flag_recall_ratio.median >= 0.65 ? 'PASS' : 'FAIL';\n\n let per_category_any_flag_floor: 'PASS' | 'FAIL' = 'PASS';\n for (const entry of Object.values(input.per_category_any_flag)) {\n if (entry.total >= 2 && entry.median_ratio < 0.5) {\n per_category_any_flag_floor = 'FAIL';\n break;\n }\n }\n\n const strict_recall_floor: 'PASS' | 'FAIL' =\n input.strict_recall_ratio.median >= 0.2 ? 'PASS' : 'FAIL';\n\n const dvRequired = input.architecture === 'two-pass' ? 3 : 4;\n const decision_vocab_completeness: 'PASS' | 'FAIL' =\n input.decisions_produced_count.median >= dvRequired ? 'PASS' : 'FAIL';\n\n // Latency soft: WARN-only signal — no FAIL contribution\n const latency_soft: 'PASS' | 'WARN' =\n input.runtime_ms.median <= 600_000 ? 
'PASS' : 'WARN';\n\n // Latency hard: every-run rule — enforced via max\n const latency_hard: 'PASS' | 'FAIL' =\n input.runtime_ms.max <= 1_200_000 ? 'PASS' : 'FAIL';\n\n // Empty/malformed: every-run rule — enforced via max\n const empty_or_malformed: 'PASS' | 'FAIL' =\n input.empty_or_malformed_responses.max === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: ('PASS' | 'FAIL')[] = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall: 'PASS' | 'FAIL' = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Compute which hard bars FAILed in >= ceil(N/2) individual runs.\n// A non-empty result means that bar was SYSTEMATICALLY unreliable —\n// not just a one-run outlier that happened to median-pass.\n// This is used by computeAggregateStatusLabel to prevent a profile from\n// earning trusted_baseline when one bar failed in the majority of runs.\n//\n// Hard bars checked (latency_soft and overall are excluded):\n// fp_ceiling, any_flag_recall_floor, per_category_any_flag_floor,\n// strict_recall_floor, decision_vocab_completeness, latency_hard, empty_or_malformed\nexport function computeRecurringBarFailures(\n perRunPassFails: PassFail[],\n totalRuns: number,\n): string[] {\n const threshold = Math.ceil(totalRuns / 2);\n const HARD_BARS: (keyof PassFail)[] = [\n 'fp_ceiling',\n 'any_flag_recall_floor',\n 'per_category_any_flag_floor',\n 'strict_recall_floor',\n 'decision_vocab_completeness',\n 'latency_hard',\n 'empty_or_malformed',\n ];\n\n const recurring: string[] = [];\n for (const bar of HARD_BARS) {\n const failCount = perRunPassFails.filter((pf) => pf[bar] === 'FAIL').length;\n if (failCount >= threshold) 
recurring.push(bar);\n }\n return recurring;\n}\n\n// Assign aggregate status label.\n//\n// Advisor-locked predicates (priority order):\n// 1. comparison_only — explicit mode flag OR single-pass Hermes (regardless of pass/fail)\n// 2. failed — aggregate pass_fail.overall === FAIL\n// 3. trusted_baseline — Hermes two-pass AND aggregate PASS AND median(FP) === 0\n// AND recurring_bar_failures.length === 0\n// The recurring-failure check prevents a profile from earning trusted_baseline\n// when any hard bar FAILed in >= ceil(N/2) runs even if the median still passed.\n// Intent: \"one lucky median cannot mask systemic bar weakness.\"\n// 4. conditional_pass — fallthrough (passes but doesn't earn trusted_baseline)\n// Mistral two-pass is capped at conditional_pass regardless of aggregate result.\nexport function computeAggregateStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n aggregatePassFail: AggregatePassFail;\n medianGoodFpCount: number;\n recurringBarFailures: string[];\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n if (input.aggregatePassFail.overall === 'FAIL') return 'failed';\n\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (\n isHermesTwoPass &&\n input.medianGoodFpCount === 0 &&\n input.recurringBarFailures.length === 0\n ) {\n return 'trusted_baseline';\n }\n\n return 'conditional_pass';\n}\n\n// Aggregate N single-run receipts into one AggregateCalibrationReceipt.\n// All receipts must be from the same profile/model/architecture.\n// opts.runFiles: relative paths for each run (e.g. 
'runs/run-001.json').\n// opts.modeOverride: forward 'comparison_only' to status-label predicate.\n// opts.aggregatedAt: ISO timestamp (defaults to now).\nexport function aggregateReceipts(\n runs: CalibrationReceipt[],\n opts: {\n runFiles: string[];\n modeOverride?: 'comparison_only';\n aggregatedAt?: string;\n },\n): AggregateCalibrationReceipt {\n if (runs.length === 0) throw new Error('aggregateReceipts: no runs provided');\n const first = runs[0];\n\n const fpMetric = aggregateMetric(runs.map((r) => r.good_fp_count));\n const anyFlagRatioMetric = aggregateMetric(runs.map((r) => r.any_flag_recall.ratio));\n const strictRatioMetric = aggregateMetric(runs.map((r) => r.strict_recall.ratio));\n const decisionsMetric = aggregateMetric(runs.map((r) => r.decisions_produced_count));\n const runtimeMetric = aggregateMetric(runs.map((r) => r.runtime_ms));\n const emptyOrMalformedMetric = aggregateMetric(\n runs.map((r) => r.empty_or_malformed_responses),\n );\n\n const perCatAnyFlag = aggregatePerCategoryRecall(runs.map((r) => r.per_category_any_flag));\n const perCatStrict = aggregatePerCategoryRecall(runs.map((r) => r.per_category_strict));\n const decisionVocab = aggregateDecisionVocabulary(runs.map((r) => r.decision_vocabulary));\n\n const dvRequired = first.architecture === 'two-pass' ? 
3 : 4;\n const decisionVocabBar = {\n architecture: first.architecture,\n required: dvRequired,\n median_produced: decisionsMetric.median,\n passed: decisionsMetric.median >= dvRequired,\n };\n\n const aggregatePassFail = computeAggregatePassFail({\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n per_category_any_flag: perCatAnyFlag,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n architecture: first.architecture,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n });\n\n const recurringBarFailures = computeRecurringBarFailures(\n runs.map((r) => r.pass_fail),\n runs.length,\n );\n\n const status = computeAggregateStatusLabel({\n profileName: first.profile_name,\n architecture: first.architecture,\n aggregatePassFail,\n medianGoodFpCount: fpMetric.median,\n recurringBarFailures,\n modeOverride: opts.modeOverride,\n });\n\n const notes: string[] = [];\n if (aggregatePassFail.latency_soft === 'WARN') {\n notes.push(\n `Latency warning: median ${(runtimeMetric.median / 1000).toFixed(1)}s exceeds soft limit of 600s`,\n );\n }\n if (fpMetric.median > 0) {\n notes.push(`FP at ceiling: median ${fpMetric.median} false positive(s) on good claims`);\n }\n if (recurringBarFailures.length > 0) {\n notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurringBarFailures.join(', ')}`);\n }\n if (status === 'comparison_only') {\n notes.push(\n 'comparison_only: architectural side-run, not a production admission candidate',\n );\n }\n if (status === 'conditional_pass') {\n notes.push('conditional_pass: passes all bars but carries a production caution');\n }\n\n return AggregateCalibrationReceiptSchema.parse({\n schema_version: 1,\n receipt_kind: 'aggregate',\n profile_name: first.profile_name,\n status,\n model: first.model,\n architecture: first.architecture,\n fixture: first.fixture,\n fixture_total_claims: first.fixture_total_claims,\n fixture_good_claims: 
first.fixture_good_claims,\n fixture_bad_claims: first.fixture_bad_claims,\n runs_count: runs.length,\n run_files: opts.runFiles,\n aggregated_at: opts.aggregatedAt ?? new Date().toISOString(),\n research_os_version: first.research_os_version,\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n per_category_any_flag: perCatAnyFlag,\n per_category_strict: perCatStrict,\n decision_vocabulary: decisionVocab,\n decision_vocab_bar: decisionVocabBar,\n unreachable_decisions: first.unreachable_decisions,\n pass_fail: aggregatePassFail,\n recurring_bar_failures: recurringBarFailures,\n notes,\n });\n}\n\n// Render the aggregate calibration receipt as compact Markdown.\n// Operator proof artifact — no prose.\nexport function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const secRounded = (ms: number) => `${(ms / 1000).toFixed(1)}s`;\n\n const af = r.any_flag_recall_ratio;\n const sr = r.strict_recall_ratio;\n const fp = r.good_fp_count;\n const dec = r.decisions_produced_count;\n const rt = r.runtime_ms;\n const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const runFileList =\n r.run_files.length > 0\n ? `${r.run_files[0]} … ${r.run_files[r.run_files.length - 1]}`\n : '(none)';\n\n const perCatAnyFlagRows = Object.entries(r.per_category_any_flag)\n .map(([cat, entry]) => {\n const st = r.per_category_strict[cat];\n return (\n `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}–${pct(entry.max_ratio)} | ${entry.total} |` +\n (st\n ? 
` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}–${pct(st.max_ratio)} |`\n : ' — | — |')\n );\n })\n .join('\\n');\n\n const ALL_DECISIONS = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ];\n const dvRows = ALL_DECISIONS.map((d) => {\n const metric = r.decision_vocabulary[d];\n const unreachable = r.unreachable_decisions.includes(d)\n ? ` (unreachable from ${r.fixture})`\n : '';\n if (!metric) return `| ${d} | — | — |${unreachable}`;\n return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}–${metric.max}${unreachable} |`;\n }).join('\\n');\n\n // Per-run summary table — pulled from run_files labels for clarity\n const perRunRows = r.any_flag_recall_ratio.values\n .map((afr, i) => {\n const fp_i = r.good_fp_count.values[i] ?? '?';\n const sr_i = r.strict_recall_ratio.values[i] ?? '?';\n const dec_i = r.decisions_produced_count.values[i] ?? '?';\n const rt_i = r.runtime_ms.values[i] ?? '?';\n return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === 'number' ? pct(afr) : '?'} | ${typeof sr_i === 'number' ? pct(sr_i) : '?'} | ${dec_i}/6 | ${typeof rt_i === 'number' ? secRounded(rt_i) : '?'} |`;\n })\n .join('\\n');\n\n const recurringSection =\n r.recurring_bar_failures.length > 0\n ? r.recurring_bar_failures.map((b) => `- ${b}`).join('\\n')\n : 'None.';\n\n const notesSection =\n r.notes.length > 0 ? 
`\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n return `# Calibration Receipt — ${r.profile_name} (aggregate, N=${r.runs_count} runs)\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Aggregated at:** ${r.aggregated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Run count:** ${r.runs_count}\n- **Run files:** ${runFileList}\n\n## Headline metrics (median across runs)\n\n- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}–${fp.max})\n- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}–${pct(af.max)})\n- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}–${pct(sr.max)})\n- Decisions produced: median ${dec.median} / 6 (range ${dec.min}–${dec.max})\n\n## PASS / FAIL (aggregate)\n\n| Bar | Rule | Result |\n|---|---|---|\n| FP ceiling | median=${fp.median}, max=${fp.max} (median ≤1 AND max ≤2) | ${pf.fp_ceiling} |\n| Any-flag recall | median=${pct(af.median)} (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag | median ≥50% per cat (see below) | ${pf.per_category_any_flag_floor} |\n| Strict recall | median=${pct(sr.median)} (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab | median=${dec.median} / 6 (${bar.architecture} ≥${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft | median=${secRounded(rt.median)} (≤600s, WARN only) | ${pf.latency_soft} |\n| Latency hard | max=${secRounded(rt.max)} (every run ≤1200s) | ${pf.latency_hard} |\n| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |\n| **OVERALL** | | **${pf.overall}** |\n\n## Recurring hard-bar failures\n\n${recurringSection}\n\n## Per-category recall (median across runs)\n\n| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range 
|\n|---|---|---|---|---|---|\n${perCatAnyFlagRows}\n\n## Decision vocabulary (median count across runs)\n\n| Decision | Median | Range |\n|---|---|---|\n${dvRows}\n\n## Per-run summary\n\n| Run | FP | Any-flag | Strict | Decisions | Runtime |\n|---|---|---|---|---|---|\n${perRunRows}\n${notesSection}`;\n}\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,SAAS;AAEX,IAAM,oBAAoB,EAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqB,EAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAe,EAAE,OAAO;AAAA,EACnC,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0B,EAAE,OAAO,EAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,YAAY,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAAS,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyB,EAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQ,EAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,2BAA2B,EAAE,OAAO;AAAA,EAC/C,gBAAgB,EAAE,QAAQ,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAO,EAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAAS,EAAE,OAAO;AAAA,EAClB,sBAAsB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAe,EAAE,OAAO;AAAA,EACxB,qBAAqB,EAAE,OAAO;AAAA,EAC9B,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0B,EAAE,OAAO,EAAE,IAAI,EAA
E,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACzC,8BAA8B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;AAC3B,CAAC;;;AD5DM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,q
BAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAC3B,CAAC;;;AE7DM,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,qBAAqB;AAC9D,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC/C,QAAM,MAAM,KAAK,MAAM,OAAO,SAAS,CAAC;AACxC,MAAI,OAAO,SAAS,MAAM,EAAG,QAAO,OAAO,GAAG;AAC9C,UAAQ,OAAO,MAAM,CAAC,IAAI,OAAO,GAAG,KAAK;AAC3C;AAIO,SAAS,gBAAgB,QAAmC;AACjE,QAAM,IAAI,OAAO,MAAM;AACvB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB;AAAA,EACF;AACF;AAQO,SAAS,2BACd,eACsB;AACtB,QAAM,OAAO,oBAAI,IAAY;AAC7B,aAAW,OAAO,eAAe;AAC/B,eAAW,OAAO,OAAO,KAAK,GAAG,EAAG,MAAK,IAAI,GAAG;AAAA,EAClD;AAEA,QAAM,SAA+B,CAAC;AACtC,aAAW,OAAO,MAAM;AACtB,UAAM,SAAS,cAAc,IAAI,CAAC,QAAQ,IAAI,GAAG,GAAG,SAAS,CAAC;AAC9D,UAAM,QAAQ,cAAc,KAAK,CAAC,QAAQ,IAAI,GAAG,MAAM,MAAS,IAAI,GAAG,GAAG,SAAS;AACnF,WAAO,GAAG,IAAI;AAAA,MACZ,cAAc,OAAO,MAAM;AAAA,MAC3B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B;AAAA,MACA,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,4BACd,aACiC;AACjC,QAAM,YAAY,oBAAI,IAAY;AAClC,aAAW,OAAO,aAAa;AAC7B,eAAW,KAAK,OAAO,KAAK,GAAG,EAAG,WAAU,IAAI,CAAC;AAAA,EACnD;AAEA,QAAM,SAA0C,CAAC;AACjD,aAAW,KAAK,WAAW;AACzB,UAAM,SAAS,YAAY,IAAI,CAAC,QAAQ,IAAI,CAAC,KAAK,CAAC;AACnD,WAAO,CAAC,IAAI,gBAAgB,MAAM;AAAA,EACpC;AACA,SAAO;AACT;AAaO,SAAS,yBAAyB,OASnB;AACpB,QAAM,aACJ,MAAM,cAAc,UAAU,KAAK,MAAM,cAAc,OAAO,IAAI,SAAS;AAE7E,QAAM,wBACJ,MAAM,sBAAsB,UAAU,OAAO,SAAS;AAExD,MAAI,8BAA+C;AACnD,aAAW,SAAS,OAAO,OAAO,MAAM,qBAAqB,GAAG;AAC9D,QAAI,MAAM,SAAS,KAAK,MAAM,eAAe,KAAK;AAChD,oCAA8B;AAC9B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBACJ,MAAM,oBAAoB,UAAU,MAAM,SAAS;AAErD,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,8BACJ,MAAM,yBAAyB,UAAU,aAAa,SAAS;AAGjE,QAAM,eACJ,MAAM,WAAW,UAAU,MAAU,SAAS;AAGhD,QAAM,eACJ,MAAM,WAAW,OAAO,OAAY,SAAS;AAG/C,QAAM,qBACJ,MAAM,6BAA6B,QAAQ,IAAI,SAAS
;AAE1D,QAAM,WAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAA2B,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAEhF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAWO,SAAS,4BACd,iBACA,WACU;AACV,QAAM,YAAY,KAAK,KAAK,YAAY,CAAC;AACzC,QAAM,YAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,QAAM,YAAsB,CAAC;AAC7B,aAAW,OAAO,WAAW;AAC3B,UAAM,YAAY,gBAAgB,OAAO,CAAC,OAAO,GAAG,GAAG,MAAM,MAAM,EAAE;AACrE,QAAI,aAAa,UAAW,WAAU,KAAK,GAAG;AAAA,EAChD;AACA,SAAO;AACT;AAcO,SAAS,4BAA4B,OAO5B;AACd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAErD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,kBAAkB,YAAY,OAAQ,QAAO;AAEvD,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MACE,mBACA,MAAM,sBAAsB,KAC5B,MAAM,qBAAqB,WAAW,GACtC;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAOO,SAAS,kBACd,MACA,MAK6B;AAC7B,MAAI,KAAK,WAAW,EAAG,OAAM,IAAI,MAAM,qCAAqC;AAC5E,QAAM,QAAQ,KAAK,CAAC;AAEpB,QAAM,WAAW,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC;AACjE,QAAM,qBAAqB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,gBAAgB,KAAK,CAAC;AACnF,QAAM,oBAAoB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,cAAc,KAAK,CAAC;AAChF,QAAM,kBAAkB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,wBAAwB,CAAC;AACnF,QAAM,gBAAgB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,UAAU,CAAC;AACnE,QAAM,yBAAyB;AAAA,IAC7B,KAAK,IAAI,CAAC,MAAM,EAAE,4BAA4B;AAAA,EAChD;AAEA,QAAM,gBAAgB,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,qBAAqB,CAAC;AACzF,QAAM,eAAe,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AACtF,QAAM,gBAAgB,4BAA4B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AAExF,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,mBAAmB;AAAA,IACvB,cAAc,MAAM;AAAA,IACpB,UAAU;AAAA,IACV,iBAAiB,gBAAgB;AAAA,IACjC,QAAQ,gBAAgB,UAAU;AAAA,EACpC;AAEA,QAAM,oBAAoB,yBAAyB;AAAA,IACjD,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,cAAc,MAAM;AAAA,IACpB,YAAY;AAAA,IACZ,8BAA8B;AAAA,EAChC,CAAC;AAED,QAAM,uBAAuB;AAAA,IAC3B,KAAK,IAAI,CAAC,MAAM,EAAE,SAAS;AAAA,IAC3B,KAAK;AAAA,E
ACP;AAEA,QAAM,SAAS,4BAA4B;AAAA,IACzC,aAAa,MAAM;AAAA,IACnB,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,mBAAmB,SAAS;AAAA,IAC5B;AAAA,IACA,cAAc,KAAK;AAAA,EACrB,CAAC;AAED,QAAM,QAAkB,CAAC;AACzB,MAAI,kBAAkB,iBAAiB,QAAQ;AAC7C,UAAM;AAAA,MACJ,4BAA4B,cAAc,SAAS,KAAM,QAAQ,CAAC,CAAC;AAAA,IACrE;AAAA,EACF;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,yBAAyB,SAAS,MAAM,mCAAmC;AAAA,EACxF;AACA,MAAI,qBAAqB,SAAS,GAAG;AACnC,UAAM,KAAK,+CAA+C,qBAAqB,KAAK,IAAI,CAAC,EAAE;AAAA,EAC7F;AACA,MAAI,WAAW,mBAAmB;AAChC,UAAM;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACA,MAAI,WAAW,oBAAoB;AACjC,UAAM,KAAK,oEAAoE;AAAA,EACjF;AAEA,SAAO,kCAAkC,MAAM;AAAA,IAC7C,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,OAAO,MAAM;AAAA,IACb,cAAc,MAAM;AAAA,IACpB,SAAS,MAAM;AAAA,IACf,sBAAsB,MAAM;AAAA,IAC5B,qBAAqB,MAAM;AAAA,IAC3B,oBAAoB,MAAM;AAAA,IAC1B,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,eAAe,KAAK,iBAAgB,oBAAI,KAAK,GAAE,YAAY;AAAA,IAC3D,qBAAqB,MAAM;AAAA,IAC3B,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,YAAY;AAAA,IACZ,8BAA8B;AAAA,IAC9B,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,qBAAqB;AAAA,IACrB,oBAAoB;AAAA,IACpB,uBAAuB,MAAM;AAAA,IAC7B,WAAW;AAAA,IACX,wBAAwB;AAAA,IACxB;AAAA,EACF,CAAC;AACH;AAIO,SAAS,8BAA8B,GAAwC;AACpF,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,aAAa,CAAC,OAAe,IAAI,KAAK,KAAM,QAAQ,CAAC,CAAC;AAE5D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AACd,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,cACJ,EAAE,UAAU,SAAS,IACjB,GAAG,EAAE,UAAU,CAAC,CAAC,WAAM,EAAE,UAAU,EAAE,UAAU,SAAS,CAAC,CAAC,KAC1D;AAEN,QAAM,oBAAoB,OAAO,QAAQ,EAAE,qBAAqB,EAC7D,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM;AACrB,UAAM,KAAK,EAAE,oBAAoB,GAAG;AACpC,WACE,KAAK,GAAG,MAAM,IAAI,MAAM,YAAY,CAAC,MAAM,IAAI,MAAM,SAAS,CAAC,SAAI,IAAI,MAAM,SAAS,CAAC,MAAM,MAAM,KAAK,QACvG,KACG,IAAI,IAAI,GAAG,YAAY,CAAC,MAAM,IAAI,GAAG,SAAS,CAAC,SAAI,IAAI,GAAG,SAAS,CAAC,OACpE;AAAA,EAER,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,gBAAgB;AAAA,IACpB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,SAAS,cAAc,IAAI,CAAC,MAAM;
AACtC,UAAM,SAAS,EAAE,oBAAoB,CAAC;AACtC,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAClD,sBAAsB,EAAE,OAAO,MAC/B;AACJ,QAAI,CAAC,OAAQ,QAAO,KAAK,CAAC,uBAAa,WAAW;AAClD,WAAO,KAAK,CAAC,MAAM,OAAO,OAAO,QAAQ,CAAC,CAAC,MAAM,OAAO,GAAG,SAAI,OAAO,GAAG,GAAG,WAAW;AAAA,EACzF,CAAC,EAAE,KAAK,IAAI;AAGZ,QAAM,aAAa,EAAE,sBAAsB,OACxC,IAAI,CAAC,KAAK,MAAM;AACf,UAAM,OAAO,EAAE,cAAc,OAAO,CAAC,KAAK;AAC1C,UAAM,OAAO,EAAE,oBAAoB,OAAO,CAAC,KAAK;AAChD,UAAM,QAAQ,EAAE,yBAAyB,OAAO,CAAC,KAAK;AACtD,UAAM,OAAO,EAAE,WAAW,OAAO,CAAC,KAAK;AACvC,WAAO,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,EAAE,mBAAmB,MAAM,OAAO,QAAQ,WAAW,IAAI,GAAG,IAAI,GAAG,MAAM,OAAO,SAAS,WAAW,IAAI,IAAI,IAAI,GAAG,MAAM,KAAK,QAAQ,OAAO,SAAS,WAAW,WAAW,IAAI,IAAI,GAAG;AAAA,EACxN,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,mBACJ,EAAE,uBAAuB,SAAS,IAC9B,EAAE,uBAAuB,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACvD;AAEN,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,SAAO,gCAA2B,EAAE,YAAY,kBAAkB,EAAE,UAAU;AAAA;AAAA,eAEjE,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,mBAC/B,EAAE,UAAU;AAAA,mBACZ,WAAW;AAAA;AAAA;AAAA;AAAA,eAIf,GAAG,MAAM,MAAM,EAAE,mBAAmB,WAAW,GAAG,GAAG,SAAI,GAAG,GAAG;AAAA,4BAClD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,0BACrD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,+BAC9C,IAAI,MAAM,eAAe,IAAI,GAAG,SAAI,IAAI,GAAG;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,wBAMlD,GAAG,MAAM,SAAS,GAAG,GAAG,uCAA6B,GAAG,UAAU;AAAA,6BAC7D,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,qBAAqB;AAAA,mEAClB,GAAG,2BAA2B;AAAA,2BACjE,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,mBAAmB;AAAA,4BAChD,IAAI,MAAM,SAAS,IAAI,YAAY,UAAK,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,0BAC3F,WAAW,GAAG,MAAM,CAAC,8BAAyB,GAAG,YAAY;AAAA,uBAChE,WAAW,GAAG,GAAG,CAAC,8BAAyB,GAAG,YAAY;AAAA,0BACvD,EAAE,6BAA6B,GAAG,qBAAqB,GAAG,kBAAkB;AAAA,sBAChF,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA,EAI9B,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,
EAMhB,iBAAiB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMjB,MAAM;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMN,UAAU;AAAA,EACV,YAAY;AACd;","names":["z","z"]}
@@ -0,0 +1,317 @@
1
+ import { z } from 'zod';
2
+
3
/** Zod enum of the four status labels a calibration receipt can carry (used for the receipt's `status` field). */
+ declare const StatusLabelSchema: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
4
/** Zod enum of the two architecture labels: "single-pass" and "two-pass". */
+ declare const ArchitectureSchema: z.ZodEnum<["single-pass", "two-pass"]>;
5
/**
 * Zod object schema for one recall measurement: `matched`, `total` and `ratio`.
 *
 * The generic arguments follow Zod 3's `ZodObject<Shape, UnknownKeys, Catchall, Output, Input>`:
 * the first object literal is the field shape, the second is the parsed (output) type and
 * the third is the accepted input type. Here output and input are identical.
 */
+ declare const RecallSchema: z.ZodObject<{
6
+ matched: z.ZodNumber;
7
+ total: z.ZodNumber;
8
+ ratio: z.ZodNumber;
9
+ }, "strip", z.ZodTypeAny, {
10
+ matched: number;
11
+ total: number;
12
+ ratio: number;
13
+ }, {
14
+ matched: number;
15
+ total: number;
16
+ ratio: number;
17
+ }>;
18
/**
 * Zod record schema mapping an arbitrary string key to a recall object
 * (`matched`, `total`, `ratio`). The value type has the same shape as `RecallSchema`.
 */
+ declare const PerCategoryRecallSchema: z.ZodRecord<z.ZodString, z.ZodObject<{
19
+ matched: z.ZodNumber;
20
+ total: z.ZodNumber;
21
+ ratio: z.ZodNumber;
22
+ }, "strip", z.ZodTypeAny, {
23
+ matched: number;
24
+ total: number;
25
+ ratio: number;
26
+ }, {
27
+ matched: number;
28
+ total: number;
29
+ ratio: number;
30
+ }>>;
31
/**
 * Zod object schema for the per-bar verdicts of a calibration run plus the `overall` verdict.
 *
 * Every field is "PASS" | "FAIL" except `latency_soft`, whose non-passing value is "WARN".
 * The three object literals are, in order, the field shape, the parsed (output) type and
 * the input type (Zod 3 `ZodObject` generic arguments 1, 4 and 5).
 */
+ declare const PassFailSchema: z.ZodObject<{
32
+ fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
33
+ any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
34
+ per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
35
+ strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
36
+ decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
37
+ latency_soft: z.ZodEnum<["PASS", "WARN"]>;
38
+ latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
39
+ empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
40
+ overall: z.ZodEnum<["PASS", "FAIL"]>;
41
+ }, "strip", z.ZodTypeAny, {
42
+ fp_ceiling: "PASS" | "FAIL";
43
+ any_flag_recall_floor: "PASS" | "FAIL";
44
+ per_category_any_flag_floor: "PASS" | "FAIL";
45
+ strict_recall_floor: "PASS" | "FAIL";
46
+ decision_vocab_completeness: "PASS" | "FAIL";
47
+ latency_soft: "PASS" | "WARN";
48
+ latency_hard: "PASS" | "FAIL";
49
+ empty_or_malformed: "PASS" | "FAIL";
50
+ overall: "PASS" | "FAIL";
51
+ }, {
52
+ fp_ceiling: "PASS" | "FAIL";
53
+ any_flag_recall_floor: "PASS" | "FAIL";
54
+ per_category_any_flag_floor: "PASS" | "FAIL";
55
+ strict_recall_floor: "PASS" | "FAIL";
56
+ decision_vocab_completeness: "PASS" | "FAIL";
57
+ latency_soft: "PASS" | "WARN";
58
+ latency_hard: "PASS" | "FAIL";
59
+ empty_or_malformed: "PASS" | "FAIL";
60
+ overall: "PASS" | "FAIL";
61
+ }>;
62
/**
 * Zod object schema for the decision-vocabulary bar: the architecture it applies to,
 * the `required` and `produced` counts, and whether the bar `passed`.
 *
 * Object literals, in order: field shape, parsed (output) type, input type.
 */
+ declare const DecisionVocabBarSchema: z.ZodObject<{
63
+ architecture: z.ZodEnum<["single-pass", "two-pass"]>;
64
+ required: z.ZodNumber;
65
+ produced: z.ZodNumber;
66
+ passed: z.ZodBoolean;
67
+ }, "strip", z.ZodTypeAny, {
68
+ required: number;
69
+ architecture: "single-pass" | "two-pass";
70
+ produced: number;
71
+ passed: boolean;
72
+ }, {
73
+ required: number;
74
+ architecture: "single-pass" | "two-pass";
75
+ produced: number;
76
+ passed: boolean;
77
+ }>;
78
/**
 * Zod object schema for a single-run calibration receipt.
 *
 * Layout of this declaration (Zod 3 `ZodObject<Shape, UnknownKeys, Catchall, Output, Input>`):
 *   1. the field shape, with the nested recall / vocab-bar / pass-fail schemas expanded inline;
 *   2. the parsed (output) object type;
 *   3. the accepted input object type, which lists the same fields as the output type.
 * `schema_version` is the literal `1`; `status` and `architecture` are string-literal unions.
 */
+ declare const CalibrationReceiptSchema: z.ZodObject<{
79
+ schema_version: z.ZodLiteral<1>;
80
+ profile_name: z.ZodString;
81
+ status: z.ZodEnum<["trusted_baseline", "conditional_pass", "failed", "comparison_only"]>;
82
+ model: z.ZodString;
83
+ architecture: z.ZodEnum<["single-pass", "two-pass"]>;
84
+ fixture: z.ZodString;
85
+ fixture_total_claims: z.ZodNumber;
86
+ fixture_good_claims: z.ZodNumber;
87
+ fixture_bad_claims: z.ZodNumber;
88
+ calibrated_at: z.ZodString;
89
+ research_os_version: z.ZodString;
90
+ runtime_ms: z.ZodNumber;
91
+ good_fp_count: z.ZodNumber;
92
+ any_flag_recall: z.ZodObject<{
93
+ matched: z.ZodNumber;
94
+ total: z.ZodNumber;
95
+ ratio: z.ZodNumber;
96
+ }, "strip", z.ZodTypeAny, {
97
+ matched: number;
98
+ total: number;
99
+ ratio: number;
100
+ }, {
101
+ matched: number;
102
+ total: number;
103
+ ratio: number;
104
+ }>;
105
+ strict_recall: z.ZodObject<{
106
+ matched: z.ZodNumber;
107
+ total: z.ZodNumber;
108
+ ratio: z.ZodNumber;
109
+ }, "strip", z.ZodTypeAny, {
110
+ matched: number;
111
+ total: number;
112
+ ratio: number;
113
+ }, {
114
+ matched: number;
115
+ total: number;
116
+ ratio: number;
117
+ }>;
118
+ per_category_any_flag: z.ZodRecord<z.ZodString, z.ZodObject<{
119
+ matched: z.ZodNumber;
120
+ total: z.ZodNumber;
121
+ ratio: z.ZodNumber;
122
+ }, "strip", z.ZodTypeAny, {
123
+ matched: number;
124
+ total: number;
125
+ ratio: number;
126
+ }, {
127
+ matched: number;
128
+ total: number;
129
+ ratio: number;
130
+ }>>;
131
+ per_category_strict: z.ZodRecord<z.ZodString, z.ZodObject<{
132
+ matched: z.ZodNumber;
133
+ total: z.ZodNumber;
134
+ ratio: z.ZodNumber;
135
+ }, "strip", z.ZodTypeAny, {
136
+ matched: number;
137
+ total: number;
138
+ ratio: number;
139
+ }, {
140
+ matched: number;
141
+ total: number;
142
+ ratio: number;
143
+ }>>;
144
+ decision_vocabulary: z.ZodRecord<z.ZodString, z.ZodNumber>;
145
+ decisions_produced_count: z.ZodNumber;
146
+ decision_vocab_bar: z.ZodObject<{
147
+ architecture: z.ZodEnum<["single-pass", "two-pass"]>;
148
+ required: z.ZodNumber;
149
+ produced: z.ZodNumber;
150
+ passed: z.ZodBoolean;
151
+ }, "strip", z.ZodTypeAny, {
152
+ required: number;
153
+ architecture: "single-pass" | "two-pass";
154
+ produced: number;
155
+ passed: boolean;
156
+ }, {
157
+ required: number;
158
+ architecture: "single-pass" | "two-pass";
159
+ produced: number;
160
+ passed: boolean;
161
+ }>;
162
+ unreachable_decisions: z.ZodArray<z.ZodString, "many">;
163
+ empty_or_malformed_responses: z.ZodNumber;
164
+ pass_fail: z.ZodObject<{
165
+ fp_ceiling: z.ZodEnum<["PASS", "FAIL"]>;
166
+ any_flag_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
167
+ per_category_any_flag_floor: z.ZodEnum<["PASS", "FAIL"]>;
168
+ strict_recall_floor: z.ZodEnum<["PASS", "FAIL"]>;
169
+ decision_vocab_completeness: z.ZodEnum<["PASS", "FAIL"]>;
170
+ latency_soft: z.ZodEnum<["PASS", "WARN"]>;
171
+ latency_hard: z.ZodEnum<["PASS", "FAIL"]>;
172
+ empty_or_malformed: z.ZodEnum<["PASS", "FAIL"]>;
173
+ overall: z.ZodEnum<["PASS", "FAIL"]>;
174
+ }, "strip", z.ZodTypeAny, {
175
+ fp_ceiling: "PASS" | "FAIL";
176
+ any_flag_recall_floor: "PASS" | "FAIL";
177
+ per_category_any_flag_floor: "PASS" | "FAIL";
178
+ strict_recall_floor: "PASS" | "FAIL";
179
+ decision_vocab_completeness: "PASS" | "FAIL";
180
+ latency_soft: "PASS" | "WARN";
181
+ latency_hard: "PASS" | "FAIL";
182
+ empty_or_malformed: "PASS" | "FAIL";
183
+ overall: "PASS" | "FAIL";
184
+ }, {
185
+ fp_ceiling: "PASS" | "FAIL";
186
+ any_flag_recall_floor: "PASS" | "FAIL";
187
+ per_category_any_flag_floor: "PASS" | "FAIL";
188
+ strict_recall_floor: "PASS" | "FAIL";
189
+ decision_vocab_completeness: "PASS" | "FAIL";
190
+ latency_soft: "PASS" | "WARN";
191
+ latency_hard: "PASS" | "FAIL";
192
+ empty_or_malformed: "PASS" | "FAIL";
193
+ overall: "PASS" | "FAIL";
194
+ }>;
195
+ notes: z.ZodArray<z.ZodString, "many">;
196
// End of the field shape. The object type that opens on the next line is the parsed (output) type.
+ }, "strip", z.ZodTypeAny, {
197
+ research_os_version: string;
198
+ status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
199
+ notes: string[];
200
+ schema_version: 1;
201
+ profile_name: string;
202
+ model: string;
203
+ architecture: "single-pass" | "two-pass";
204
+ fixture: string;
205
+ fixture_total_claims: number;
206
+ fixture_good_claims: number;
207
+ fixture_bad_claims: number;
208
+ calibrated_at: string;
209
+ runtime_ms: number;
210
+ good_fp_count: number;
211
+ any_flag_recall: {
212
+ matched: number;
213
+ total: number;
214
+ ratio: number;
215
+ };
216
+ strict_recall: {
217
+ matched: number;
218
+ total: number;
219
+ ratio: number;
220
+ };
221
+ per_category_any_flag: Record<string, {
222
+ matched: number;
223
+ total: number;
224
+ ratio: number;
225
+ }>;
226
+ per_category_strict: Record<string, {
227
+ matched: number;
228
+ total: number;
229
+ ratio: number;
230
+ }>;
231
+ decision_vocabulary: Record<string, number>;
232
+ decisions_produced_count: number;
233
+ decision_vocab_bar: {
234
+ required: number;
235
+ architecture: "single-pass" | "two-pass";
236
+ produced: number;
237
+ passed: boolean;
238
+ };
239
+ unreachable_decisions: string[];
240
+ empty_or_malformed_responses: number;
241
+ pass_fail: {
242
+ fp_ceiling: "PASS" | "FAIL";
243
+ any_flag_recall_floor: "PASS" | "FAIL";
244
+ per_category_any_flag_floor: "PASS" | "FAIL";
245
+ strict_recall_floor: "PASS" | "FAIL";
246
+ decision_vocab_completeness: "PASS" | "FAIL";
247
+ latency_soft: "PASS" | "WARN";
248
+ latency_hard: "PASS" | "FAIL";
249
+ empty_or_malformed: "PASS" | "FAIL";
250
+ overall: "PASS" | "FAIL";
251
+ };
252
// End of the output type. The object type that opens on the next line is the accepted input type.
+ }, {
253
+ research_os_version: string;
254
+ status: "trusted_baseline" | "conditional_pass" | "failed" | "comparison_only";
255
+ notes: string[];
256
+ schema_version: 1;
257
+ profile_name: string;
258
+ model: string;
259
+ architecture: "single-pass" | "two-pass";
260
+ fixture: string;
261
+ fixture_total_claims: number;
262
+ fixture_good_claims: number;
263
+ fixture_bad_claims: number;
264
+ calibrated_at: string;
265
+ runtime_ms: number;
266
+ good_fp_count: number;
267
+ any_flag_recall: {
268
+ matched: number;
269
+ total: number;
270
+ ratio: number;
271
+ };
272
+ strict_recall: {
273
+ matched: number;
274
+ total: number;
275
+ ratio: number;
276
+ };
277
+ per_category_any_flag: Record<string, {
278
+ matched: number;
279
+ total: number;
280
+ ratio: number;
281
+ }>;
282
+ per_category_strict: Record<string, {
283
+ matched: number;
284
+ total: number;
285
+ ratio: number;
286
+ }>;
287
+ decision_vocabulary: Record<string, number>;
288
+ decisions_produced_count: number;
289
+ decision_vocab_bar: {
290
+ required: number;
291
+ architecture: "single-pass" | "two-pass";
292
+ produced: number;
293
+ passed: boolean;
294
+ };
295
+ unreachable_decisions: string[];
296
+ empty_or_malformed_responses: number;
297
+ pass_fail: {
298
+ fp_ceiling: "PASS" | "FAIL";
299
+ any_flag_recall_floor: "PASS" | "FAIL";
300
+ per_category_any_flag_floor: "PASS" | "FAIL";
301
+ strict_recall_floor: "PASS" | "FAIL";
302
+ decision_vocab_completeness: "PASS" | "FAIL";
303
+ latency_soft: "PASS" | "WARN";
304
+ latency_hard: "PASS" | "FAIL";
305
+ empty_or_malformed: "PASS" | "FAIL";
306
+ overall: "PASS" | "FAIL";
307
+ };
308
+ }>;
309
// Static TypeScript types derived from each schema with `z.infer`, one alias per schema.
+ type StatusLabel = z.infer<typeof StatusLabelSchema>;
310
+ type Architecture = z.infer<typeof ArchitectureSchema>;
311
+ type Recall = z.infer<typeof RecallSchema>;
312
+ type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;
313
+ type PassFail = z.infer<typeof PassFailSchema>;
314
+ type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;
315
+ type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;
316
+
317
+ export { type Architecture, ArchitectureSchema, type CalibrationReceipt, CalibrationReceiptSchema, type DecisionVocabBar, DecisionVocabBarSchema, type PassFail, PassFailSchema, type PerCategoryRecall, PerCategoryRecallSchema, type Recall, RecallSchema, type StatusLabel, StatusLabelSchema };
@@ -0,0 +1,68 @@
1
+ // src/calibration/receipt-schema.ts
2
+ import { z } from "zod";
3
// Allowed values for a receipt's `status` field.
+ var StatusLabelSchema = z.enum([
4
+ "trusted_baseline",
5
+ "conditional_pass",
6
+ "failed",
7
+ "comparison_only"
8
+ ]);
9
// Allowed values for a receipt's `architecture` field.
+ var ArchitectureSchema = z.enum(["single-pass", "two-pass"]);
10
// One recall measurement. `matched` and `total` must be non-negative integers and
// `ratio` must lie in [0, 1]. The schema does not check that ratio equals matched / total.
+ var RecallSchema = z.object({
11
+ matched: z.number().int().nonnegative(),
12
+ total: z.number().int().nonnegative(),
13
+ ratio: z.number().min(0).max(1)
14
+ });
15
// Map from a string key (used as a category name by the receipt code) to a RecallSchema value.
+ var PerCategoryRecallSchema = z.record(z.string(), RecallSchema);
16
// Verdict per calibration bar plus the `overall` verdict.
// Every bar is PASS/FAIL except `latency_soft`, which is PASS/WARN.
+ var PassFailSchema = z.object({
17
+ fp_ceiling: z.enum(["PASS", "FAIL"]),
18
+ any_flag_recall_floor: z.enum(["PASS", "FAIL"]),
19
+ per_category_any_flag_floor: z.enum(["PASS", "FAIL"]),
20
+ strict_recall_floor: z.enum(["PASS", "FAIL"]),
21
+ decision_vocab_completeness: z.enum(["PASS", "FAIL"]),
22
+ latency_soft: z.enum(["PASS", "WARN"]),
23
+ latency_hard: z.enum(["PASS", "FAIL"]),
24
+ empty_or_malformed: z.enum(["PASS", "FAIL"]),
25
+ overall: z.enum(["PASS", "FAIL"])
26
+ });
27
// Decision-vocabulary bar: `required` must be a positive integer, `produced` a
// non-negative integer. The schema does not check `passed` against the two counts.
+ var DecisionVocabBarSchema = z.object({
28
+ architecture: ArchitectureSchema,
29
+ required: z.number().int().positive(),
30
+ produced: z.number().int().nonnegative(),
31
+ passed: z.boolean()
32
+ });
33
// Schema for a single-run calibration receipt: run identity, fixture sizes,
// headline metrics, per-category recall, decision vocabulary, verdicts and notes.
// Counts are validated as non-negative integers; only `fixture_total_claims` must be
// positive. No cross-field consistency (e.g. good + bad = total) is checked here.
+ var CalibrationReceiptSchema = z.object({
34
// Only version 1 is accepted.
+ schema_version: z.literal(1),
35
+ profile_name: z.string(),
36
+ status: StatusLabelSchema,
37
+ model: z.string(),
38
+ architecture: ArchitectureSchema,
39
+ fixture: z.string(),
40
+ fixture_total_claims: z.number().int().positive(),
41
+ fixture_good_claims: z.number().int().nonnegative(),
42
+ fixture_bad_claims: z.number().int().nonnegative(),
43
// Plain string: no date/time format validation is applied by this schema.
+ calibrated_at: z.string(),
44
+ research_os_version: z.string(),
45
+ runtime_ms: z.number().int().nonnegative(),
46
+ good_fp_count: z.number().int().nonnegative(),
47
+ any_flag_recall: RecallSchema,
48
+ strict_recall: RecallSchema,
49
+ per_category_any_flag: PerCategoryRecallSchema,
50
+ per_category_strict: PerCategoryRecallSchema,
51
// Map from decision name to a non-negative integer count.
+ decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),
52
+ decisions_produced_count: z.number().int().nonnegative(),
53
+ decision_vocab_bar: DecisionVocabBarSchema,
54
+ unreachable_decisions: z.array(z.string()),
55
+ empty_or_malformed_responses: z.number().int().nonnegative(),
56
+ pass_fail: PassFailSchema,
57
+ notes: z.array(z.string())
58
+ });
59
+ export {
60
+ ArchitectureSchema,
61
+ CalibrationReceiptSchema,
62
+ DecisionVocabBarSchema,
63
+ PassFailSchema,
64
+ PerCategoryRecallSchema,
65
+ RecallSchema,
66
+ StatusLabelSchema
67
+ };
68
+ //# sourceMappingURL=receipt-schema.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/calibration/receipt-schema.ts"],"sourcesContent":["import { z } from 'zod';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), 
z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n"],"mappings":";AAAA,SAAS,SAAS;AAEX,IAAM,oBAAoB,EAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqB,EAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAe,EAAE,OAAO;AAAA,EACnC,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0B,EAAE,OAAO,EAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,YAAY,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6B,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAc,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoB,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAAS,EAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyB,EAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQ,EAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,2BAA2B,EAAE,OAAO;AAAA,EAC/C,gBAAgB,EAAE,QAAQ,CAAC;AAAA,EAC3B,cAAc,EAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAO,EAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAAS,EAAE,OAAO;AAAA,EAClB,sBAAsB,EAAE,OAAO,EAAE,IAAI,
EAAE,SAAS;AAAA,EAChD,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoB,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAe,EAAE,OAAO;AAAA,EACxB,qBAAqB,EAAE,OAAO;AAAA,EAC9B,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EACzC,8BAA8B,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;AAC3B,CAAC;","names":[]}
@@ -0,0 +1,31 @@
1
+ import { CalibrationReceipt, Architecture, DecisionVocabBar, Recall, PerCategoryRecall, PassFail, StatusLabel } from './receipt-schema.js';
2
+ import 'zod';
3
+
4
/**
 * Builds the decision-vocabulary bar for a run: "two-pass" requires 3 produced
 * decisions, otherwise 4; `passed` is true when the produced count meets the requirement.
 */
+ declare function computeDecisionVocabBar(architecture: Architecture, decisionsProducedCount: number): DecisionVocabBar;
5
/**
 * Evaluates every calibration bar from a run's metrics and returns the per-bar verdicts
 * together with `overall`, which is "PASS" only when all hard bars pass
 * (`latency_soft` can only warn and does not affect `overall`).
 */
+ declare function computePassFail(input: {
6
+ good_fp_count: number;
7
+ any_flag_recall: Recall;
8
+ per_category_any_flag: PerCategoryRecall;
9
+ strict_recall: Recall;
10
+ decision_vocab_bar: DecisionVocabBar;
11
+ runtime_ms: number;
12
+ empty_or_malformed_responses: number;
13
+ }): PassFail;
14
/**
 * Derives the receipt status label from the profile name, architecture, verdicts and
 * good-claim false-positive count. Passing `modeOverride: 'comparison_only'` forces
 * the "comparison_only" label regardless of the other inputs.
 */
+ declare function computeStatusLabel(input: {
15
+ profileName: string;
16
+ architecture: Architecture;
17
+ passFail: PassFail;
18
+ goodFpCount: number;
19
+ modeOverride?: 'comparison_only';
20
+ }): StatusLabel;
21
/**
 * Condenses a receipt into short display strings of the form "matched/total (NN%)"
 * plus a one-line `notes` string.
 *
 * Every field is typed `string | null`; in the accompanying implementation only
 * `unsupported_claim_recall` is ever null (when the receipt has no
 * "unsupported_claim" category).
 */
+ declare function receiptToCalibrationSummary(receipt: CalibrationReceipt): {
22
+ fixture: string | null;
23
+ good_false_positive_rate: string | null;
24
+ bad_any_flag_recall: string | null;
25
+ strict_category_recall: string | null;
26
+ unsupported_claim_recall: string | null;
27
+ notes: string | null;
28
+ };
29
/** Renders a calibration receipt as a Markdown report (metadata, headline metrics, PASS/FAIL table, per-category recall, decision vocabulary, optional notes). */
+ declare function buildReceiptMarkdown(r: CalibrationReceipt): string;
30
+
31
+ export { buildReceiptMarkdown, computeDecisionVocabBar, computePassFail, computeStatusLabel, receiptToCalibrationSummary };
@@ -0,0 +1,151 @@
1
+ // src/calibration/receipt.ts
2
/**
 * Build the decision-vocabulary bar for one run.
 *
 * @param {string} architecture - Architecture label; copied into the result unchanged.
 * @param {number} decisionsProducedCount - Count compared against the requirement.
 * @returns {{architecture: string, required: number, produced: number, passed: boolean}}
 *   `passed` is true when `decisionsProducedCount >= required`.
 */
+ function computeDecisionVocabBar(architecture, decisionsProducedCount) {
3
// "two-pass" needs 3 produced decisions; every other value (including "single-pass") needs 4.
+ const required = architecture === "two-pass" ? 3 : 4;
4
+ return {
5
+ architecture,
6
+ required,
7
+ produced: decisionsProducedCount,
8
+ passed: decisionsProducedCount >= required
9
+ };
10
+ }
11
/**
 * Per-category any-flag floor.
 *
 * @param {Object<string, {total: number, ratio: number}>} perCategoryAnyFlag - Recall per category.
 * @returns {"PASS"|"FAIL"} "FAIL" as soon as one category with `total >= 2` has
 *   `ratio < 0.5`; "PASS" otherwise, including when the map is empty.
 */
+ function computePerCategoryFloor(perCategoryAnyFlag) {
12
+ for (const [, recall] of Object.entries(perCategoryAnyFlag)) {
13
// Categories with fewer than 2 items are exempt from the 50% floor.
+ if (recall.total >= 2 && recall.ratio < 0.5) return "FAIL";
14
+ }
15
+ return "PASS";
16
+ }
17
/**
 * Evaluate every calibration bar for a run.
 *
 * @param {object} input - Run metrics: `good_fp_count`, `any_flag_recall`,
 *   `per_category_any_flag`, `strict_recall`, `decision_vocab_bar`, `runtime_ms`,
 *   `empty_or_malformed_responses`.
 * @returns {object} One verdict per bar plus `overall`, which is "PASS" only when
 *   every hard bar is "PASS".
 */
+ function computePassFail(input) {
18
// At most one false positive on good claims.
+ const fp_ceiling = input.good_fp_count <= 1 ? "PASS" : "FAIL";
19
// Any-flag recall must be at least 65%.
+ const any_flag_recall_floor = input.any_flag_recall.ratio >= 0.65 ? "PASS" : "FAIL";
20
+ const per_category_any_flag_floor = computePerCategoryFloor(input.per_category_any_flag);
21
// Strict recall must be at least 20%.
+ const strict_recall_floor = input.strict_recall.ratio >= 0.2 ? "PASS" : "FAIL";
22
+ const decision_vocab_completeness = input.decision_vocab_bar.passed ? "PASS" : "FAIL";
23
// 6e5 ms = 10 minutes: exceeding it only warns.
+ const latency_soft = input.runtime_ms <= 6e5 ? "PASS" : "WARN";
24
// 12e5 ms = 20 minutes: exceeding it fails the run.
+ const latency_hard = input.runtime_ms <= 12e5 ? "PASS" : "FAIL";
25
+ const empty_or_malformed = input.empty_or_malformed_responses === 0 ? "PASS" : "FAIL";
26
// latency_soft is deliberately absent: a WARN must not fail the overall verdict.
+ const hardBars = [
27
+ fp_ceiling,
28
+ any_flag_recall_floor,
29
+ per_category_any_flag_floor,
30
+ strict_recall_floor,
31
+ decision_vocab_completeness,
32
+ latency_hard,
33
+ empty_or_malformed
34
+ ];
35
+ const overall = hardBars.every((v) => v === "PASS") ? "PASS" : "FAIL";
36
+ return {
37
+ fp_ceiling,
38
+ any_flag_recall_floor,
39
+ per_category_any_flag_floor,
40
+ strict_recall_floor,
41
+ decision_vocab_completeness,
42
+ latency_soft,
43
+ latency_hard,
44
+ empty_or_malformed,
45
+ overall
46
+ };
47
+ }
48
/**
 * Derive the receipt status label. Rules are checked in order; the first match wins:
 *   1. `modeOverride === "comparison_only"`                      -> "comparison_only"
 *   2. single-pass and profile name contains "hermes" (any case) -> "comparison_only"
 *   3. `passFail.overall === "FAIL"`                             -> "failed"
 *   4. two-pass "hermes" profile with zero good-claim FPs        -> "trusted_baseline"
 *   5. anything else                                             -> "conditional_pass"
 *
 * @param {object} input - `profileName`, `architecture`, `passFail`, `goodFpCount`,
 *   optional `modeOverride`.
 * @returns {string} One of the four status labels listed above.
 */
+ function computeStatusLabel(input) {
49
+ if (input.modeOverride === "comparison_only") return "comparison_only";
50
// Checked before the FAIL test, so a failing single-pass hermes run is still "comparison_only".
+ if (input.architecture === "single-pass" && /hermes/i.test(input.profileName)) {
51
+ return "comparison_only";
52
+ }
53
+ if (input.passFail.overall === "FAIL") return "failed";
54
+ const isHermesTwoPass = /hermes/i.test(input.profileName) && input.architecture === "two-pass";
55
+ if (isHermesTwoPass && input.goodFpCount === 0) return "trusted_baseline";
56
+ return "conditional_pass";
57
+ }
58
/**
 * Condense a receipt into short display strings.
 *
 * @param {object} receipt - A calibration receipt.
 * @returns {object} `fixture` plus "matched/total (NN%)" strings for the false-positive
 *   rate, any-flag recall and strict recall; `unsupported_claim_recall` is null when
 *   the receipt has no "unsupported_claim" category; `notes` is a one-line
 *   key=value digest.
 */
+ function receiptToCalibrationSummary(receipt) {
59
+ const fp = receipt.good_fp_count;
60
+ const fpTotal = receipt.fixture_good_claims;
61
// Report 0% instead of dividing by zero when the fixture has no good claims.
+ const fpPct = fpTotal > 0 ? Math.round(fp / fpTotal * 100) : 0;
62
+ const af = receipt.any_flag_recall;
63
+ const sr = receipt.strict_recall;
64
+ const unsupported = receipt.per_category_any_flag["unsupported_claim"];
65
+ return {
66
+ fixture: receipt.fixture,
67
+ good_false_positive_rate: `${fp}/${fpTotal} (${fpPct}%)`,
68
+ bad_any_flag_recall: `${af.matched}/${af.total} (${Math.round(af.ratio * 100)}%)`,
69
+ strict_category_recall: `${sr.matched}/${sr.total} (${Math.round(sr.ratio * 100)}%)`,
70
+ unsupported_claim_recall: unsupported ? `${unsupported.matched}/${unsupported.total} (${Math.round(unsupported.ratio * 100)}%)` : null,
71
// The "/6" denominator is a fixed literal in this string.
+ notes: `status=${receipt.status} model=${receipt.model} arch=${receipt.architecture} overall=${receipt.pass_fail.overall} decisions=${receipt.decisions_produced_count}/6`
72
+ };
73
+ }
74
/**
 * Render a calibration receipt as a Markdown report.
 *
 * Sections, in order: title and run metadata, headline metrics, the PASS/FAIL table,
 * per-category recall, decision vocabulary, and a "Notes" list that is emitted only
 * when `r.notes` is non-empty.
 *
 * @param {object} r - A calibration receipt.
 * @returns {string} The Markdown document.
 */
+ function buildReceiptMarkdown(r) {
75
// Whole-number percentage, e.g. 0.654 -> "65%".
+ const pct = (ratio) => `${Math.round(ratio * 100)}%`;
76
+ const runtimeSec = (r.runtime_ms / 1e3).toFixed(1);
77
// One table row per any-flag category; categories present only in per_category_strict are not listed.
+ const perCatRows = Object.entries(r.per_category_any_flag).map(([cat, af]) => {
78
// A category missing from the strict map is shown as 0 matched out of the any-flag total.
+ const st = r.per_category_strict[cat] ?? { matched: 0, total: af.total, ratio: 0 };
79
+ return `| ${cat} | ${af.matched}/${af.total} (${pct(af.ratio)}) | ${st.matched}/${st.total} (${pct(st.ratio)}) |`;
80
+ }).join("\n");
81
// Fixed list of six decision names, so every row appears even when its count is 0.
+ const dvRows = [
82
+ "accepted_for_synthesis",
83
+ "rejected",
84
+ "needs_scope_repair",
85
+ "needs_source_repair",
86
+ "needs_contradiction_mapping",
87
+ "needs_human_review"
88
+ ].map((d) => {
89
+ const count = r.decision_vocabulary[d] ?? 0;
90
+ const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${r.fixture})` : "";
91
+ return `| ${d} | ${count}${unreachable} |`;
92
+ }).join("\n");
93
+ const pf = r.pass_fail;
94
+ const bar = r.decision_vocab_bar;
95
// Optional trailing section; the empty string keeps the report free of an empty "Notes" heading.
+ const notesSection = r.notes.length > 0 ? `
96
+ ## Notes
97
+
98
+ ${r.notes.map((n) => `- ${n}`).join("\n")}
99
+ ` : "";
100
// Threshold text in the PASS/FAIL table and the "/ 6" denominator are literals in this
// template; only the decision-vocab row reads its requirement from the receipt.
+ return `# Calibration Receipt \u2014 ${r.profile_name}
101
+
102
+ - **Model:** ${r.model}
103
+ - **Architecture:** ${r.architecture}
104
+ - **Status:** ${r.status}
105
+ - **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)
106
+ - **Calibrated at:** ${r.calibrated_at}
107
+ - **Research-OS version:** ${r.research_os_version}
108
+ - **Runtime:** ${runtimeSec} seconds
109
+
110
+ ## Headline metrics
111
+
112
+ - FP: ${r.good_fp_count} / ${r.fixture_good_claims}
113
+ - Any-flag recall: ${r.any_flag_recall.matched} / ${r.any_flag_recall.total} (${pct(r.any_flag_recall.ratio)})
114
+ - Strict recall: ${r.strict_recall.matched} / ${r.strict_recall.total} (${pct(r.strict_recall.ratio)})
115
+ - Decisions produced: ${r.decisions_produced_count} / 6
116
+
117
+ ## PASS / FAIL
118
+
119
+ | Bar | Result |
120
+ |---|---|
121
+ | FP ceiling (\u22641) | ${pf.fp_ceiling} |
122
+ | Any-flag recall (\u226565%) | ${pf.any_flag_recall_floor} |
123
+ | Per-category any-flag (\u226550%) | ${pf.per_category_any_flag_floor} |
124
+ | Strict recall (\u226520%) | ${pf.strict_recall_floor} |
125
+ | Decision vocab (${bar.architecture} \u2265 ${bar.required}) | ${pf.decision_vocab_completeness} |
126
+ | Latency soft (\u226410 min) | ${pf.latency_soft} |
127
+ | Latency hard (\u226420 min) | ${pf.latency_hard} |
128
+ | Empty/malformed (=0) | ${pf.empty_or_malformed} |
129
+ | **OVERALL** | **${pf.overall}** |
130
+
131
+ ## Per-category recall
132
+
133
+ | Category | Any-flag | Strict |
134
+ |---|---|---|
135
+ ${perCatRows}
136
+
137
+ ## Decision vocabulary
138
+
139
+ | Decision | Count |
140
+ |---|---:|
141
+ ${dvRows}
142
+ ${notesSection}`;
143
+ }
144
+ export {
145
+ buildReceiptMarkdown,
146
+ computeDecisionVocabBar,
147
+ computePassFail,
148
+ computeStatusLabel,
149
+ receiptToCalibrationSummary
150
+ };
151
+ //# sourceMappingURL=receipt.js.map