@mcptoolshop/research-os 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +253 -0
- package/README.es.md +33 -2
- package/README.fr.md +32 -1
- package/README.hi.md +52 -1
- package/README.it.md +33 -2
- package/README.ja.md +32 -1
- package/README.md +53 -1
- package/README.pt-BR.md +32 -1
- package/README.zh.md +33 -2
- package/dist/calibration/aggregate-receipt-schema.d.ts +547 -0
- package/dist/calibration/aggregate-receipt-schema.js +160 -0
- package/dist/calibration/aggregate-receipt-schema.js.map +1 -0
- package/dist/calibration/aggregate.d.ts +37 -0
- package/dist/calibration/aggregate.js +493 -0
- package/dist/calibration/aggregate.js.map +1 -0
- package/dist/calibration/receipt-schema.d.ts +356 -0
- package/dist/calibration/receipt-schema.js +83 -0
- package/dist/calibration/receipt-schema.js.map +1 -0
- package/dist/calibration/receipt.d.ts +32 -0
- package/dist/calibration/receipt.js +170 -0
- package/dist/calibration/receipt.js.map +1 -0
- package/dist/cli.js +1041 -851
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +154 -49
- package/dist/index.js +881 -818
- package/dist/index.js.map +1 -1
- package/dist/reviewer-options-schema-PZacF_MO.d.ts +27 -0
- package/package.json +1 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts","../../src/review/reviewer-options-schema.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema, ReviewerOptionsSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: 
z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n\n // schema_version: 1 — additive-optional (Exp6 Session 2):\n // Same options object stamped on every per-run receipt. 
Absent = stochastic run.\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\nimport { ReviewerOptionsSchema } from '../review/reviewer-options-schema.js';\nexport { ReviewerOptionsSchema };\nexport type { ReviewerOptions } from '../review/reviewer-options-schema.js';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\n// schema_version: 1 — additive-optional additions (Exp6 Session 2):\n// 
reviewer_options: optional sampling params used during this calibration run.\n// Absent = stochastic run (pre-v0.6 compat preserved). Present = keys explicitly set.\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import { z } from 'zod';\n\n// Sampling parameters passed verbatim to the Ollama /api/chat `options` field.\n// Used by OllamaInternReviewer to control determinism. All fields optional —\n// omitted keys fall back to Ollama/model defaults. 
Introduced in Experiment 6\n// Session 2 to make reviewer conditions explicit in calibration receipts.\n//\n// LOAD-BEARING: temperature: 0 is valid and must not be dropped. All merges\n// in OllamaInternReviewer use `!== undefined` checks, NOT truthiness.\nexport const ReviewerOptionsSchema = z.object({\n num_ctx: z.number().int().positive().optional(),\n temperature: z.number().min(0).max(2).optional(),\n seed: z.number().int().optional(),\n top_p: z.number().min(0).max(1).optional(),\n top_k: z.number().int().nonnegative().optional(),\n repeat_penalty: z.number().min(0).optional(),\n});\n\nexport type ReviewerOptions = z.infer<typeof ReviewerOptionsSchema>;\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,KAAAC,UAAS;;;ACAlB,SAAS,SAAS;AASX,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS;AAAA,EAC9C,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EAC/C,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS;AAAA,EAC/C,gBAAgB,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS;AAC7C,CAAC;;;ADXM,IAAM,oBAAoBC,GAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqBA,GAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAeA,GAAE,OAAO;AAAA,EACnC,SAASA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAOA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0BA,GAAE,OAAOA,GAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyBA,GAAE,OAAO;AA
AA,EAC7C,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAKM,IAAM,2BAA2BA,GAAE,OAAO;AAAA,EAC/C,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAeA,GAAE,OAAO;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA,EAC9B,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAeA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzC,8BAA8BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;;;ADnEM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,Q
AAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,qBAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA;AAAA,EAIzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;","names":["z","z","z","z"]}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { PerCategoryRecall, CalibrationReceipt, Architecture, StatusLabel, PassFail } from './receipt-schema.js';
|
|
2
|
+
import { AggregateMetric, PerCategoryAggregate, AggregateCalibrationReceipt, AggregatePassFail } from './aggregate-receipt-schema.js';
|
|
3
|
+
import { R as ReviewerOptions } from '../reviewer-options-schema-PZacF_MO.js';
|
|
4
|
+
import 'zod';
|
|
5
|
+
|
|
6
|
+
/**
 * Returns the median of `values`. For an even number of entries the result is
 * the arithmetic mean of the two middle entries. The input array is not
 * mutated; the implementation sorts a copy.
 *
 * @throws {Error} "median: empty array" when `values` has no entries.
 */
declare function median(values: number[]): number;
|
|
7
|
+
/**
 * Summarises one metric across runs as `{ median, min, max, values }`, where
 * `values` holds the per-run numbers in the order they were supplied.
 *
 * @throws {Error} when `values` is empty (the median cannot be computed).
 */
declare function aggregateMetric(values: number[]): AggregateMetric;
|
|
8
|
+
/**
 * Aggregates per-category recall across runs. The result has one entry for
 * every category seen in any run; a run that does not report a category
 * contributes a ratio of 0 for it. `total` is taken from the first run that
 * reports the category.
 */
declare function aggregatePerCategoryRecall(perRunBuckets: PerCategoryRecall[]): PerCategoryAggregate;
|
|
9
|
+
/**
 * Aggregates per-run decision counts. The result has one `AggregateMetric`
 * for every decision label seen in any run; a run that never produced a
 * label contributes a count of 0 for it.
 */
declare function aggregateDecisionVocabulary(perRunDicts: Record<string, number>[]): Record<string, AggregateMetric>;
|
|
10
|
+
/**
 * Evaluates the calibration bars against aggregated (multi-run) metrics and
 * returns one verdict per bar. Every bar is "PASS" or "FAIL" except
 * `latency_soft`, which is "PASS" or "WARN". `overall` is "PASS" only when
 * every bar other than `latency_soft` is "PASS".
 *
 * The required decision-vocabulary size depends on `architecture`
 * (3 for "two-pass", otherwise 4).
 */
declare function computeAggregatePassFail(input: {
|
|
11
|
+
good_fp_count: AggregateMetric;
|
|
12
|
+
any_flag_recall_ratio: AggregateMetric;
|
|
13
|
+
per_category_any_flag: PerCategoryAggregate;
|
|
14
|
+
strict_recall_ratio: AggregateMetric;
|
|
15
|
+
decisions_produced_count: AggregateMetric;
|
|
16
|
+
architecture: Architecture;
|
|
17
|
+
runtime_ms: AggregateMetric;
|
|
18
|
+
empty_or_malformed_responses: AggregateMetric;
|
|
19
|
+
}): AggregatePassFail;
|
|
20
|
+
/**
 * Lists the hard bars (every bar except `latency_soft` and `overall`) that
 * were "FAIL" in at least `ceil(totalRuns / 2)` of the supplied per-run
 * verdicts.
 */
declare function computeRecurringBarFailures(perRunPassFails: PassFail[], totalRuns: number): string[];
|
|
21
|
+
/**
 * Derives the status label for an aggregate receipt. Rules are applied in
 * order:
 *  1. `modeOverride === 'comparison_only'` -> "comparison_only".
 *  2. single-pass architecture with a profile name containing "hermes"
 *     (case-insensitive) -> "comparison_only".
 *  3. aggregate `overall` is "FAIL" -> "failed".
 *  4. two-pass "hermes" profile with a median good-claim FP count of 0 and no
 *     recurring bar failures -> "trusted_baseline".
 *  5. anything else -> "conditional_pass".
 */
declare function computeAggregateStatusLabel(input: {
|
|
22
|
+
profileName: string;
|
|
23
|
+
architecture: Architecture;
|
|
24
|
+
aggregatePassFail: AggregatePassFail;
|
|
25
|
+
medianGoodFpCount: number;
|
|
26
|
+
recurringBarFailures: string[];
|
|
27
|
+
modeOverride?: 'comparison_only';
|
|
28
|
+
}): StatusLabel;
|
|
29
|
+
/**
 * Combines several single-run calibration receipts into one aggregate
 * receipt. Identity fields (profile, model, architecture, fixture, version)
 * are copied from the first run, and the result is validated against the
 * aggregate receipt schema before it is returned.
 *
 * `opts.runFiles` is stored as `run_files`; `opts.aggregatedAt` defaults to
 * the current time as an ISO string; `opts.modeOverride` is forwarded to the
 * status-label computation.
 * NOTE(review): how `opts.reviewerOptions` is applied is not visible in this
 * excerpt - confirm against the implementation.
 *
 * @throws {Error} "aggregateReceipts: no runs provided" when `runs` is empty.
 *   The aggregate schema also requires `runs_count >= 2`, so a single run is
 *   rejected by schema validation.
 */
declare function aggregateReceipts(runs: CalibrationReceipt[], opts: {
|
|
30
|
+
runFiles: string[];
|
|
31
|
+
modeOverride?: 'comparison_only';
|
|
32
|
+
aggregatedAt?: string;
|
|
33
|
+
reviewerOptions?: ReviewerOptions;
|
|
34
|
+
}): AggregateCalibrationReceipt;
|
|
35
|
+
/**
 * Takes an aggregate calibration receipt and returns a string.
 * NOTE(review): presumably a Markdown rendering of the receipt, judging by the
 * name; the implementation is not visible in this excerpt - confirm.
 */
declare function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string;
|
|
36
|
+
|
|
37
|
+
export { aggregateDecisionVocabulary, aggregateMetric, aggregatePerCategoryRecall, aggregateReceipts, buildAggregateReceiptMarkdown, computeAggregatePassFail, computeAggregateStatusLabel, computeRecurringBarFailures, median };
|
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
2
|
+
import { z as z3 } from "zod";
|
|
3
|
+
|
|
4
|
+
// src/calibration/receipt-schema.ts
|
|
5
|
+
import { z as z2 } from "zod";
|
|
6
|
+
|
|
7
|
+
// src/review/reviewer-options-schema.ts
|
|
8
|
+
import { z } from "zod";
|
|
9
|
+
// Sampling parameters recorded for a reviewer run. Per the upstream TypeScript
// source comment, these are passed verbatim as the Ollama /api/chat `options`
// object; every field is optional and omitted keys fall back to model defaults.
// LOAD-BEARING: `temperature: 0` is a valid value (min(0)), so any code merging
// these options must test `!== undefined`, not truthiness.
var ReviewerOptionsSchema = z.object({
|
|
10
|
+
num_ctx: z.number().int().positive().optional(),
|
|
11
|
+
temperature: z.number().min(0).max(2).optional(),
|
|
12
|
+
seed: z.number().int().optional(),
|
|
13
|
+
top_p: z.number().min(0).max(1).optional(),
|
|
14
|
+
top_k: z.number().int().nonnegative().optional(),
|
|
15
|
+
repeat_penalty: z.number().min(0).optional()
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// src/calibration/receipt-schema.ts
|
|
19
|
+
// The four status labels a calibration receipt (single-run or aggregate) may carry.
var StatusLabelSchema = z2.enum([
|
|
20
|
+
"trusted_baseline",
|
|
21
|
+
"conditional_pass",
|
|
22
|
+
"failed",
|
|
23
|
+
"comparison_only"
|
|
24
|
+
]);
|
|
25
|
+
// Reviewer architecture of the calibrated profile; it selects the required
// decision-vocabulary size in the pass/fail computation.
var ArchitectureSchema = z2.enum(["single-pass", "two-pass"]);
|
|
26
|
+
// A recall measurement: non-negative integer `matched` and `total` counts plus
// a `ratio` constrained to [0, 1].
var RecallSchema = z2.object({
|
|
27
|
+
matched: z2.number().int().nonnegative(),
|
|
28
|
+
total: z2.number().int().nonnegative(),
|
|
29
|
+
ratio: z2.number().min(0).max(1)
|
|
30
|
+
});
|
|
31
|
+
// Recall measurements keyed by category name (single-run receipts).
var PerCategoryRecallSchema = z2.record(z2.string(), RecallSchema);
|
|
32
|
+
// Per-bar verdicts for a single run. Every bar is PASS/FAIL except
// `latency_soft`, which can only be PASS or WARN.
var PassFailSchema = z2.object({
|
|
33
|
+
fp_ceiling: z2.enum(["PASS", "FAIL"]),
|
|
34
|
+
any_flag_recall_floor: z2.enum(["PASS", "FAIL"]),
|
|
35
|
+
per_category_any_flag_floor: z2.enum(["PASS", "FAIL"]),
|
|
36
|
+
strict_recall_floor: z2.enum(["PASS", "FAIL"]),
|
|
37
|
+
decision_vocab_completeness: z2.enum(["PASS", "FAIL"]),
|
|
38
|
+
latency_soft: z2.enum(["PASS", "WARN"]),
|
|
39
|
+
latency_hard: z2.enum(["PASS", "FAIL"]),
|
|
40
|
+
empty_or_malformed: z2.enum(["PASS", "FAIL"]),
|
|
41
|
+
overall: z2.enum(["PASS", "FAIL"])
|
|
42
|
+
});
|
|
43
|
+
// Decision-vocabulary bar for a single run: how many distinct decisions were
// `required` for the architecture, how many were `produced` (integer), and
// whether the bar `passed`.
var DecisionVocabBarSchema = z2.object({
|
|
44
|
+
architecture: ArchitectureSchema,
|
|
45
|
+
required: z2.number().int().positive(),
|
|
46
|
+
produced: z2.number().int().nonnegative(),
|
|
47
|
+
passed: z2.boolean()
|
|
48
|
+
});
|
|
49
|
+
// Single-run calibration receipt (schema_version 1). All metrics are plain
// per-run numbers or Recall objects.
// `reviewer_options` is an additive-optional field: per the upstream source
// comment, absent means a stochastic run (pre-v0.6 receipts stay valid) and
// present means those sampling keys were explicitly set.
var CalibrationReceiptSchema = z2.object({
|
|
50
|
+
schema_version: z2.literal(1),
|
|
51
|
+
profile_name: z2.string(),
|
|
52
|
+
status: StatusLabelSchema,
|
|
53
|
+
model: z2.string(),
|
|
54
|
+
architecture: ArchitectureSchema,
|
|
55
|
+
fixture: z2.string(),
|
|
56
|
+
fixture_total_claims: z2.number().int().positive(),
|
|
57
|
+
fixture_good_claims: z2.number().int().nonnegative(),
|
|
58
|
+
fixture_bad_claims: z2.number().int().nonnegative(),
|
|
59
|
+
calibrated_at: z2.string(),
|
|
60
|
+
research_os_version: z2.string(),
|
|
61
|
+
runtime_ms: z2.number().int().nonnegative(),
|
|
62
|
+
good_fp_count: z2.number().int().nonnegative(),
|
|
63
|
+
any_flag_recall: RecallSchema,
|
|
64
|
+
strict_recall: RecallSchema,
|
|
65
|
+
per_category_any_flag: PerCategoryRecallSchema,
|
|
66
|
+
per_category_strict: PerCategoryRecallSchema,
|
|
67
|
+
decision_vocabulary: z2.record(z2.string(), z2.number().int().nonnegative()),
|
|
68
|
+
decisions_produced_count: z2.number().int().nonnegative(),
|
|
69
|
+
decision_vocab_bar: DecisionVocabBarSchema,
|
|
70
|
+
unreachable_decisions: z2.array(z2.string()),
|
|
71
|
+
empty_or_malformed_responses: z2.number().int().nonnegative(),
|
|
72
|
+
pass_fail: PassFailSchema,
|
|
73
|
+
notes: z2.array(z2.string()),
|
|
74
|
+
reviewer_options: ReviewerOptionsSchema.optional()
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
// src/calibration/aggregate-receipt-schema.ts
|
|
78
|
+
// Summary of one numeric metric across several runs: median, min, max and the
// raw per-run values. None of the numbers are range-restricted here.
var AggregateMetricSchema = z3.object({
|
|
79
|
+
median: z3.number(),
|
|
80
|
+
min: z3.number(),
|
|
81
|
+
max: z3.number(),
|
|
82
|
+
values: z3.array(z3.number())
|
|
83
|
+
// per-run values in run order (run-001, run-002, ...)
|
|
84
|
+
});
|
|
85
|
+
// Multi-run recall summary for one category. The three summary ratios are
// constrained to [0, 1]; the entries of `per_run_ratios` are not range-checked.
var PerCategoryAggregateEntrySchema = z3.object({
|
|
86
|
+
median_ratio: z3.number().min(0).max(1),
|
|
87
|
+
min_ratio: z3.number().min(0).max(1),
|
|
88
|
+
max_ratio: z3.number().min(0).max(1),
|
|
89
|
+
total: z3.number().int().nonnegative(),
|
|
90
|
+
// seed count — same across all runs
|
|
91
|
+
per_run_ratios: z3.array(z3.number())
|
|
92
|
+
});
|
|
93
|
+
// Multi-run recall summaries keyed by category name (aggregate receipts).
var PerCategoryAggregateSchema = z3.record(z3.string(), PerCategoryAggregateEntrySchema);
|
|
94
|
+
// Per-bar verdicts for an aggregate receipt. It has the same keys and allowed
// values as the single-run PassFailSchema, but is declared separately.
var AggregatePassFailSchema = z3.object({
|
|
95
|
+
fp_ceiling: z3.enum(["PASS", "FAIL"]),
|
|
96
|
+
any_flag_recall_floor: z3.enum(["PASS", "FAIL"]),
|
|
97
|
+
per_category_any_flag_floor: z3.enum(["PASS", "FAIL"]),
|
|
98
|
+
strict_recall_floor: z3.enum(["PASS", "FAIL"]),
|
|
99
|
+
decision_vocab_completeness: z3.enum(["PASS", "FAIL"]),
|
|
100
|
+
latency_soft: z3.enum(["PASS", "WARN"]),
|
|
101
|
+
latency_hard: z3.enum(["PASS", "FAIL"]),
|
|
102
|
+
empty_or_malformed: z3.enum(["PASS", "FAIL"]),
|
|
103
|
+
overall: z3.enum(["PASS", "FAIL"])
|
|
104
|
+
});
|
|
105
|
+
// Decision-vocabulary bar for an aggregate receipt. Unlike the single-run bar,
// the produced count is a median and may therefore be fractional.
var AggregateDecisionVocabBarSchema = z3.object({
|
|
106
|
+
architecture: ArchitectureSchema,
|
|
107
|
+
required: z3.number().int().positive(),
|
|
108
|
+
median_produced: z3.number(),
|
|
109
|
+
// float — median of per-run decisions_produced_count
|
|
110
|
+
passed: z3.boolean()
|
|
111
|
+
});
|
|
112
|
+
// Aggregate (multi-run) calibration receipt. `receipt_kind: "aggregate"` is the
// field that distinguishes it from a single-run receipt, which shares
// schema_version 1. At least two runs are required (`runs_count` min(2)).
var AggregateCalibrationReceiptSchema = z3.object({
|
|
113
|
+
schema_version: z3.literal(1),
|
|
114
|
+
receipt_kind: z3.literal("aggregate"),
|
|
115
|
+
// discriminates from single-run receipt
|
|
116
|
+
profile_name: z3.string(),
|
|
117
|
+
status: StatusLabelSchema,
|
|
118
|
+
model: z3.string(),
|
|
119
|
+
architecture: ArchitectureSchema,
|
|
120
|
+
fixture: z3.string(),
|
|
121
|
+
fixture_total_claims: z3.number().int().positive(),
|
|
122
|
+
fixture_good_claims: z3.number().int().nonnegative(),
|
|
123
|
+
fixture_bad_claims: z3.number().int().nonnegative(),
|
|
124
|
+
runs_count: z3.number().int().min(2),
|
|
125
|
+
run_files: z3.array(z3.string()),
|
|
126
|
+
// relative paths: runs/run-001.json, etc.
|
|
127
|
+
aggregated_at: z3.string(),
|
|
128
|
+
// ISO 8601
|
|
129
|
+
research_os_version: z3.string(),
|
|
130
|
+
// Aggregate metrics — median + min + max + per-run values in run order
|
|
131
|
+
good_fp_count: AggregateMetricSchema,
|
|
132
|
+
any_flag_recall_ratio: AggregateMetricSchema,
|
|
133
|
+
strict_recall_ratio: AggregateMetricSchema,
|
|
134
|
+
decisions_produced_count: AggregateMetricSchema,
|
|
135
|
+
runtime_ms: AggregateMetricSchema,
|
|
136
|
+
empty_or_malformed_responses: AggregateMetricSchema,
|
|
137
|
+
per_category_any_flag: PerCategoryAggregateSchema,
|
|
138
|
+
per_category_strict: PerCategoryAggregateSchema,
|
|
139
|
+
// Decision vocabulary — union of all decisions seen across runs, median count each
|
|
140
|
+
decision_vocabulary: z3.record(z3.string(), AggregateMetricSchema),
|
|
141
|
+
decision_vocab_bar: AggregateDecisionVocabBarSchema,
|
|
142
|
+
unreachable_decisions: z3.array(z3.string()),
|
|
143
|
+
pass_fail: AggregatePassFailSchema,
|
|
144
|
+
// Bars that FAILed in >= ceil(runs_count/2) individual runs.
|
|
145
|
+
// Non-empty list demotes trusted_baseline to conditional_pass.
|
|
146
|
+
recurring_bar_failures: z3.array(z3.string()),
|
|
147
|
+
notes: z3.array(z3.string()),
|
|
148
|
+
// schema_version: 1 — additive-optional (Exp6 Session 2):
|
|
149
|
+
// Same options object stamped on every per-run receipt. Absent = stochastic run.
|
|
150
|
+
reviewer_options: ReviewerOptionsSchema.optional()
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
// src/calibration/aggregate.ts
|
|
154
|
+
/**
 * Compute the median of a list of numbers.
 *
 * The caller's array is left untouched: the numbers are copied and the copy
 * is sorted ascending. With an odd count the middle element is returned; with
 * an even count the two middle elements are averaged.
 *
 * @param {number[]} values - Numbers to summarise; must not be empty.
 * @returns {number} The median value.
 * @throws {Error} "median: empty array" when `values` has no entries.
 */
function median(values) {
  const count = values.length;
  if (count === 0) {
    throw new Error("median: empty array");
  }
  const ascending = Array.from(values);
  ascending.sort((left, right) => left - right);
  const upperMiddle = Math.floor(count / 2);
  const hasSingleMiddle = count % 2 === 1;
  return hasSingleMiddle
    ? ascending[upperMiddle]
    : (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
|
|
161
|
+
/**
 * Summarise one metric across runs.
 *
 * @param {number[]} values - Per-run values in run order; must not be empty.
 * @returns {{median: number, min: number, max: number, values: number[]}}
 *   The median, minimum and maximum, plus a copy of the per-run values.
 * @throws {Error} Propagated from `median` when `values` is empty.
 */
function aggregateMetric(values) {
  const m = median(values);
  // Snapshot the per-run values so later mutation of the caller's array cannot
  // silently change an already-built aggregate.
  const perRun = [...values];
  // Fold instead of spreading into Math.min/Math.max: spreading a very large
  // array can exceed the engine's argument limit. NaN still propagates.
  const min = perRun.reduce((lo, v) => Math.min(lo, v), Infinity);
  const max = perRun.reduce((hi, v) => Math.max(hi, v), -Infinity);
  return {
    median: m,
    min,
    max,
    values: perRun
  };
}
|
|
170
|
+
/**
 * Aggregate per-category recall across runs.
 *
 * The result has one entry for every category that appears as an own key in
 * any run. A run that does not report a category contributes a ratio of 0.
 * `total` is taken from the first run that reports the category.
 *
 * Only own properties are consulted, so a category whose name collides with
 * an Object.prototype member ("constructor", "toString", ...) is handled like
 * any other, and a "__proto__" category becomes a normal own key of the result
 * instead of replacing its prototype.
 *
 * @param {Array<Record<string, {matched: number, total: number, ratio: number}>>} perRunBuckets
 *   One category -> recall map per run, in run order.
 * @returns {Record<string, {median_ratio: number, min_ratio: number, max_ratio: number, total: number, per_run_ratios: number[]}>}
 */
function aggregatePerCategoryRecall(perRunBuckets) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const cats = /* @__PURE__ */ new Set();
  for (const run of perRunBuckets) {
    for (const cat of Object.keys(run)) cats.add(cat);
  }
  const entries = [];
  for (const cat of cats) {
    const ratios = perRunBuckets.map((run) => (hasOwn(run, cat) ? (run[cat]?.ratio ?? 0) : 0));
    const firstReporting = perRunBuckets.find((run) => hasOwn(run, cat) && run[cat] !== void 0);
    const total = firstReporting?.[cat]?.total ?? 0;
    entries.push([
      cat,
      {
        median_ratio: median(ratios),
        min_ratio: Math.min(...ratios),
        max_ratio: Math.max(...ratios),
        total,
        per_run_ratios: ratios
      }
    ]);
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is
  // stored as a key rather than assigned as the prototype.
  return Object.fromEntries(entries);
}
|
|
189
|
+
/**
 * Aggregate per-run decision counts into one metric per decision label.
 *
 * The result has one entry for every label that appears as an own key in any
 * run; a run that never produced a label contributes a count of 0.
 *
 * Only own properties are read, so labels that collide with Object.prototype
 * members (e.g. "toString") count as 0 for runs that lack them instead of
 * yielding the inherited function, and a "__proto__" label is kept as a
 * normal own key of the result.
 *
 * @param {Array<Record<string, number>>} perRunDicts - One label -> count map per run, in run order.
 * @returns {Record<string, {median: number, min: number, max: number, values: number[]}>}
 */
function aggregateDecisionVocabulary(perRunDicts) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const decisions = /* @__PURE__ */ new Set();
  for (const run of perRunDicts) {
    for (const d of Object.keys(run)) decisions.add(d);
  }
  const entries = [];
  for (const d of decisions) {
    const values = perRunDicts.map((run) => (hasOwn(run, d) ? (run[d] ?? 0) : 0));
    entries.push([d, aggregateMetric(values)]);
  }
  // Object.fromEntries defines own data properties, so "__proto__" cannot
  // replace the result's prototype.
  return Object.fromEntries(entries);
}
|
|
201
|
+
/**
 * Evaluate every calibration bar against the aggregated metrics.
 *
 * Bars:
 *  - fp_ceiling: median good-claim FP count <= 1 and max <= 2.
 *  - any_flag_recall_floor: median any-flag recall >= 0.65.
 *  - per_category_any_flag_floor: fails if any category with total >= 2 has a
 *    median ratio below 0.5.
 *  - strict_recall_floor: median strict recall >= 0.2.
 *  - decision_vocab_completeness: median decisions produced >= 3 (two-pass)
 *    or >= 4 (any other architecture).
 *  - latency_soft: WARN (never FAIL) unless median runtime <= 600000 ms.
 *  - latency_hard: max runtime <= 1200000 ms.
 *  - empty_or_malformed: max empty/malformed responses is exactly 0.
 *  - overall: PASS only when every bar except latency_soft is PASS.
 *
 * @param {object} input - Aggregated metrics plus the profile architecture.
 * @returns {object} One verdict string per bar plus `overall`.
 */
function computeAggregatePassFail(input) {
  const verdict = (ok) => (ok ? "PASS" : "FAIL");
  const isWeakCategory = (entry) => entry.total >= 2 && entry.median_ratio < 0.5;
  const fpStats = input.good_fp_count;
  const runtimeStats = input.runtime_ms;
  const requiredDecisions = input.architecture === "two-pass" ? 3 : 4;

  const bars = {
    fp_ceiling: verdict(fpStats.median <= 1 && fpStats.max <= 2),
    any_flag_recall_floor: verdict(input.any_flag_recall_ratio.median >= 0.65),
    per_category_any_flag_floor: verdict(
      !Object.values(input.per_category_any_flag).some(isWeakCategory)
    ),
    strict_recall_floor: verdict(input.strict_recall_ratio.median >= 0.2),
    decision_vocab_completeness: verdict(
      input.decisions_produced_count.median >= requiredDecisions
    ),
    // Soft latency only ever warns; it is excluded from `overall` below.
    latency_soft: runtimeStats.median <= 6e5 ? "PASS" : "WARN",
    latency_hard: verdict(runtimeStats.max <= 12e5),
    empty_or_malformed: verdict(input.empty_or_malformed_responses.max === 0)
  };

  const allHardBarsPass = Object.entries(bars).every(
    ([bar, result]) => bar === "latency_soft" || result === "PASS"
  );
  return { ...bars, overall: verdict(allHardBarsPass) };
}
|
|
239
|
+
/**
 * List the hard bars that failed in at least half of the runs.
 *
 * A bar is "recurring" when it is "FAIL" in at least `ceil(totalRuns / 2)` of
 * the supplied per-run verdicts. `latency_soft` and `overall` are not hard
 * bars and are never reported.
 *
 * @param {Array<Record<string, string>>} perRunPassFails - Per-run bar verdicts.
 * @param {number} totalRuns - Number of runs the threshold is based on.
 * @returns {string[]} Names of recurring failing bars, in hard-bar order.
 *   Empty when `totalRuns` is zero, negative or NaN.
 */
function computeRecurringBarFailures(perRunPassFails, totalRuns) {
  const threshold = Math.ceil(totalRuns / 2);
  // With no runs the threshold would be 0 and every bar would trivially meet
  // "failCount >= 0"; nothing can recur across zero runs, so report nothing.
  if (!(threshold >= 1)) return [];
  const HARD_BARS = [
    "fp_ceiling",
    "any_flag_recall_floor",
    "per_category_any_flag_floor",
    "strict_recall_floor",
    "decision_vocab_completeness",
    "latency_hard",
    "empty_or_malformed"
  ];
  return HARD_BARS.filter((bar) => {
    const failCount = perRunPassFails.filter((pf) => pf[bar] === "FAIL").length;
    return failCount >= threshold;
  });
}
|
|
257
|
+
/**
 * Decide the status label for an aggregate receipt.
 *
 * Rules, first match wins:
 *  1. An explicit `modeOverride` of "comparison_only" is honoured.
 *  2. A single-pass profile whose name contains "hermes" (case-insensitive)
 *     is always "comparison_only".
 *  3. A failing aggregate `overall` verdict yields "failed".
 *  4. A two-pass "hermes" profile with a median good-claim FP count of exactly
 *     0 and no recurring bar failures is a "trusted_baseline".
 *  5. Everything else is a "conditional_pass".
 *
 * @param {object} input - See the rule list for the fields consulted.
 * @returns {string} One of the four status labels.
 */
function computeAggregateStatusLabel(input) {
  const nameMentionsHermes = () => /hermes/i.test(input.profileName);

  if (input.modeOverride === "comparison_only") {
    return "comparison_only";
  }
  if (input.architecture === "single-pass" && nameMentionsHermes()) {
    return "comparison_only";
  }
  if (input.aggregatePassFail.overall === "FAIL") {
    return "failed";
  }

  const earnsTrust =
    nameMentionsHermes() &&
    input.architecture === "two-pass" &&
    input.medianGoodFpCount === 0 &&
    input.recurringBarFailures.length === 0;
  return earnsTrust ? "trusted_baseline" : "conditional_pass";
}
|
|
269
|
+
function aggregateReceipts(runs, opts) {
|
|
270
|
+
if (runs.length === 0) throw new Error("aggregateReceipts: no runs provided");
|
|
271
|
+
const first = runs[0];
|
|
272
|
+
const fpMetric = aggregateMetric(runs.map((r) => r.good_fp_count));
|
|
273
|
+
const anyFlagRatioMetric = aggregateMetric(runs.map((r) => r.any_flag_recall.ratio));
|
|
274
|
+
const strictRatioMetric = aggregateMetric(runs.map((r) => r.strict_recall.ratio));
|
|
275
|
+
const decisionsMetric = aggregateMetric(runs.map((r) => r.decisions_produced_count));
|
|
276
|
+
const runtimeMetric = aggregateMetric(runs.map((r) => r.runtime_ms));
|
|
277
|
+
const emptyOrMalformedMetric = aggregateMetric(
|
|
278
|
+
runs.map((r) => r.empty_or_malformed_responses)
|
|
279
|
+
);
|
|
280
|
+
const perCatAnyFlag = aggregatePerCategoryRecall(runs.map((r) => r.per_category_any_flag));
|
|
281
|
+
const perCatStrict = aggregatePerCategoryRecall(runs.map((r) => r.per_category_strict));
|
|
282
|
+
const decisionVocab = aggregateDecisionVocabulary(runs.map((r) => r.decision_vocabulary));
|
|
283
|
+
const dvRequired = first.architecture === "two-pass" ? 3 : 4;
|
|
284
|
+
const decisionVocabBar = {
|
|
285
|
+
architecture: first.architecture,
|
|
286
|
+
required: dvRequired,
|
|
287
|
+
median_produced: decisionsMetric.median,
|
|
288
|
+
passed: decisionsMetric.median >= dvRequired
|
|
289
|
+
};
|
|
290
|
+
const aggregatePassFail = computeAggregatePassFail({
|
|
291
|
+
good_fp_count: fpMetric,
|
|
292
|
+
any_flag_recall_ratio: anyFlagRatioMetric,
|
|
293
|
+
per_category_any_flag: perCatAnyFlag,
|
|
294
|
+
strict_recall_ratio: strictRatioMetric,
|
|
295
|
+
decisions_produced_count: decisionsMetric,
|
|
296
|
+
architecture: first.architecture,
|
|
297
|
+
runtime_ms: runtimeMetric,
|
|
298
|
+
empty_or_malformed_responses: emptyOrMalformedMetric
|
|
299
|
+
});
|
|
300
|
+
const recurringBarFailures = computeRecurringBarFailures(
|
|
301
|
+
runs.map((r) => r.pass_fail),
|
|
302
|
+
runs.length
|
|
303
|
+
);
|
|
304
|
+
const status = computeAggregateStatusLabel({
|
|
305
|
+
profileName: first.profile_name,
|
|
306
|
+
architecture: first.architecture,
|
|
307
|
+
aggregatePassFail,
|
|
308
|
+
medianGoodFpCount: fpMetric.median,
|
|
309
|
+
recurringBarFailures,
|
|
310
|
+
modeOverride: opts.modeOverride
|
|
311
|
+
});
|
|
312
|
+
const notes = [];
|
|
313
|
+
if (aggregatePassFail.latency_soft === "WARN") {
|
|
314
|
+
notes.push(
|
|
315
|
+
`Latency warning: median ${(runtimeMetric.median / 1e3).toFixed(1)}s exceeds soft limit of 600s`
|
|
316
|
+
);
|
|
317
|
+
}
|
|
318
|
+
if (fpMetric.median > 0) {
|
|
319
|
+
notes.push(`FP at ceiling: median ${fpMetric.median} false positive(s) on good claims`);
|
|
320
|
+
}
|
|
321
|
+
if (recurringBarFailures.length > 0) {
|
|
322
|
+
notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurringBarFailures.join(", ")}`);
|
|
323
|
+
}
|
|
324
|
+
if (status === "comparison_only") {
|
|
325
|
+
notes.push(
|
|
326
|
+
"comparison_only: architectural side-run, not a production admission candidate"
|
|
327
|
+
);
|
|
328
|
+
}
|
|
329
|
+
if (status === "conditional_pass") {
|
|
330
|
+
notes.push("conditional_pass: passes all bars but carries a production caution");
|
|
331
|
+
}
|
|
332
|
+
return AggregateCalibrationReceiptSchema.parse({
|
|
333
|
+
schema_version: 1,
|
|
334
|
+
receipt_kind: "aggregate",
|
|
335
|
+
profile_name: first.profile_name,
|
|
336
|
+
status,
|
|
337
|
+
model: first.model,
|
|
338
|
+
architecture: first.architecture,
|
|
339
|
+
fixture: first.fixture,
|
|
340
|
+
fixture_total_claims: first.fixture_total_claims,
|
|
341
|
+
fixture_good_claims: first.fixture_good_claims,
|
|
342
|
+
fixture_bad_claims: first.fixture_bad_claims,
|
|
343
|
+
runs_count: runs.length,
|
|
344
|
+
run_files: opts.runFiles,
|
|
345
|
+
aggregated_at: opts.aggregatedAt ?? (/* @__PURE__ */ new Date()).toISOString(),
|
|
346
|
+
research_os_version: first.research_os_version,
|
|
347
|
+
good_fp_count: fpMetric,
|
|
348
|
+
any_flag_recall_ratio: anyFlagRatioMetric,
|
|
349
|
+
strict_recall_ratio: strictRatioMetric,
|
|
350
|
+
decisions_produced_count: decisionsMetric,
|
|
351
|
+
runtime_ms: runtimeMetric,
|
|
352
|
+
empty_or_malformed_responses: emptyOrMalformedMetric,
|
|
353
|
+
per_category_any_flag: perCatAnyFlag,
|
|
354
|
+
per_category_strict: perCatStrict,
|
|
355
|
+
decision_vocabulary: decisionVocab,
|
|
356
|
+
decision_vocab_bar: decisionVocabBar,
|
|
357
|
+
unreachable_decisions: first.unreachable_decisions,
|
|
358
|
+
pass_fail: aggregatePassFail,
|
|
359
|
+
recurring_bar_failures: recurringBarFailures,
|
|
360
|
+
notes,
|
|
361
|
+
...opts.reviewerOptions && Object.keys(opts.reviewerOptions).length > 0 && {
|
|
362
|
+
reviewer_options: opts.reviewerOptions
|
|
363
|
+
}
|
|
364
|
+
});
|
|
365
|
+
}
|
|
366
|
+
/**
 * Fixed order in which reviewer option keys are rendered, so the Markdown
 * output is stable regardless of the key order of the incoming object.
 * Frozen because it is shared module state that must never be mutated.
 */
const REVIEWER_OPTIONS_KEY_ORDER = Object.freeze([
  "num_ctx",
  "temperature",
  "seed",
  "top_p",
  "top_k",
  "repeat_penalty",
]);

/**
 * Render the "## Reviewer options" Markdown section of a receipt.
 *
 * @param {object | undefined | null} opts - Reviewer options object; only the
 *   keys listed in REVIEWER_OPTIONS_KEY_ORDER are rendered.
 * @returns {string} The section (leading and trailing newline included), or
 *   an empty string when `opts` is missing or none of the known keys is set.
 */
function buildReviewerOptionsSection(opts) {
  if (!opts) return "";
  // Compare against undefined rather than testing truthiness: a value of 0
  // (e.g. temperature: 0) is a real setting and must still be rendered.
  const lines = REVIEWER_OPTIONS_KEY_ORDER
    .filter((key) => opts[key] !== undefined)
    .map((key) => `- ${key}: ${opts[key]}`);
  if (lines.length === 0) return "";
  return `\n## Reviewer options\n\n${lines.join("\n")}\n`;
}
|
|
384
|
+
/**
 * Render an aggregate calibration receipt as compact Markdown.
 *
 * Sections, in order: title and metadata bullets, optional reviewer options,
 * headline metrics, aggregate PASS/FAIL table, recurring hard-bar failures,
 * per-category recall table, decision-vocabulary table, per-run summary
 * table, and an optional notes section.
 *
 * @param {object} r - Aggregate receipt. Reads the metric objects
 *   (`median`/`min`/`max`/`values`), `pass_fail`, `decision_vocab_bar`,
 *   `per_category_any_flag`, `per_category_strict`, `decision_vocabulary`,
 *   `unreachable_decisions`, `recurring_bar_failures`, `notes`, `run_files`,
 *   the optional `reviewer_options`, and the scalar metadata fields.
 * @returns {string} The Markdown document.
 */
function buildAggregateReceiptMarkdown(r) {
  const pct = (ratio) => `${Math.round(ratio * 100)}%`;
  const secRounded = (ms) => `${(ms / 1e3).toFixed(1)}s`;

  const af = r.any_flag_recall_ratio;
  const sr = r.strict_recall_ratio;
  const fp = r.good_fp_count;
  const dec = r.decisions_produced_count;
  const rt = r.runtime_ms;
  const pf = r.pass_fail;
  const bar = r.decision_vocab_bar;

  // Summarise the run files as "first … last"; a single file is shown once
  // instead of being repeated on both sides of the ellipsis.
  let runFileList = "(none)";
  if (r.run_files.length === 1) {
    runFileList = r.run_files[0];
  } else if (r.run_files.length > 1) {
    runFileList = `${r.run_files[0]} \u2026 ${r.run_files[r.run_files.length - 1]}`;
  }

  // One row per any-flag category; the strict columns fall back to dashes
  // when the category has no strict entry.
  const perCatAnyFlagRows = Object.entries(r.per_category_any_flag).map(([cat, entry]) => {
    const st = r.per_category_strict[cat];
    return `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}\u2013${pct(entry.max_ratio)} | ${entry.total} |` + (st ? ` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}\u2013${pct(st.max_ratio)} |` : " \u2014 | \u2014 |");
  }).join("\n");

  const ALL_DECISIONS = [
    "accepted_for_synthesis",
    "rejected",
    "needs_scope_repair",
    "needs_source_repair",
    "needs_contradiction_mapping",
    "needs_human_review",
  ];
  const dvRows = ALL_DECISIONS.map((d) => {
    const metric = r.decision_vocabulary[d];
    const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${r.fixture})` : "";
    // The annotation belongs inside the Range cell in both branches so that
    // every row has exactly three cells and a closing pipe.
    if (!metric) return `| ${d} | \u2014 | \u2014${unreachable} |`;
    return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}\u2013${metric.max}${unreachable} |`;
  }).join("\n");

  // Per-run rows are driven by the any-flag values; the other metrics are
  // looked up by the same index and shown as "?" when absent.
  const perRunRows = r.any_flag_recall_ratio.values.map((afr, i) => {
    const fp_i = r.good_fp_count.values[i] ?? "?";
    const sr_i = r.strict_recall_ratio.values[i] ?? "?";
    const dec_i = r.decisions_produced_count.values[i] ?? "?";
    const rt_i = r.runtime_ms.values[i] ?? "?";
    return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === "number" ? pct(afr) : "?"} | ${typeof sr_i === "number" ? pct(sr_i) : "?"} | ${dec_i}/6 | ${typeof rt_i === "number" ? secRounded(rt_i) : "?"} |`;
  }).join("\n");

  const recurringSection = r.recurring_bar_failures.length > 0
    ? r.recurring_bar_failures.map((b) => `- ${b}`).join("\n")
    : "None.";

  const notesSection = r.notes.length > 0
    ? `\n## Notes\n\n${r.notes.map((n) => `- ${n}`).join("\n")}\n`
    : "";

  const reviewerOptionsSection = buildReviewerOptionsSection(r.reviewer_options);

  // The template below is whitespace-sensitive: its lines must stay at
  // column 0 so the emitted Markdown is not indented.
  return `# Calibration Receipt \u2014 ${r.profile_name} (aggregate, N=${r.runs_count} runs)

- **Model:** ${r.model}
- **Architecture:** ${r.architecture}
- **Status:** ${r.status}
- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)
- **Aggregated at:** ${r.aggregated_at}
- **Research-OS version:** ${r.research_os_version}
- **Run count:** ${r.runs_count}
- **Run files:** ${runFileList}
${reviewerOptionsSection}
## Headline metrics (median across runs)

- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}\u2013${fp.max})
- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}\u2013${pct(af.max)})
- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}\u2013${pct(sr.max)})
- Decisions produced: median ${dec.median} / 6 (range ${dec.min}\u2013${dec.max})

## PASS / FAIL (aggregate)

| Bar | Rule | Result |
|---|---|---|
| FP ceiling | median=${fp.median}, max=${fp.max} (median \u22641 AND max \u22642) | ${pf.fp_ceiling} |
| Any-flag recall | median=${pct(af.median)} (\u226565%) | ${pf.any_flag_recall_floor} |
| Per-category any-flag | median \u226550% per cat (see below) | ${pf.per_category_any_flag_floor} |
| Strict recall | median=${pct(sr.median)} (\u226520%) | ${pf.strict_recall_floor} |
| Decision vocab | median=${dec.median} / 6 (${bar.architecture} \u2265${bar.required}) | ${pf.decision_vocab_completeness} |
| Latency soft | median=${secRounded(rt.median)} (\u2264600s, WARN only) | ${pf.latency_soft} |
| Latency hard | max=${secRounded(rt.max)} (every run \u22641200s) | ${pf.latency_hard} |
| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |
| **OVERALL** | | **${pf.overall}** |

## Recurring hard-bar failures

${recurringSection}

## Per-category recall (median across runs)

| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range |
|---|---|---|---|---|---|
${perCatAnyFlagRows}

## Decision vocabulary (median count across runs)

| Decision | Median | Range |
|---|---|---|
${dvRows}

## Per-run summary

| Run | FP | Any-flag | Strict | Decisions | Runtime |
|---|---|---|---|---|---|
${perRunRows}
${notesSection}`;
}
|
|
482
|
+
export {
|
|
483
|
+
aggregateDecisionVocabulary,
|
|
484
|
+
aggregateMetric,
|
|
485
|
+
aggregatePerCategoryRecall,
|
|
486
|
+
aggregateReceipts,
|
|
487
|
+
buildAggregateReceiptMarkdown,
|
|
488
|
+
computeAggregatePassFail,
|
|
489
|
+
computeAggregateStatusLabel,
|
|
490
|
+
computeRecurringBarFailures,
|
|
491
|
+
median
|
|
492
|
+
};
|
|
493
|
+
//# sourceMappingURL=aggregate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/aggregate-receipt-schema.ts","../../src/calibration/receipt-schema.ts","../../src/review/reviewer-options-schema.ts","../../src/calibration/aggregate.ts"],"sourcesContent":["import { z } from 'zod';\nimport { StatusLabelSchema, ArchitectureSchema, ReviewerOptionsSchema } from './receipt-schema.js';\n\nexport const AggregateMetricSchema = z.object({\n median: z.number(),\n min: z.number(),\n max: z.number(),\n values: z.array(z.number()), // per-run values in run order (run-001, run-002, ...)\n});\n\nexport const PerCategoryAggregateEntrySchema = z.object({\n median_ratio: z.number().min(0).max(1),\n min_ratio: z.number().min(0).max(1),\n max_ratio: z.number().min(0).max(1),\n total: z.number().int().nonnegative(), // seed count — same across all runs\n per_run_ratios: z.array(z.number()),\n});\n\nexport const PerCategoryAggregateSchema = z.record(z.string(), PerCategoryAggregateEntrySchema);\n\nexport const AggregatePassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const AggregateDecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n median_produced: z.number(), // float — median of per-run decisions_produced_count\n passed: z.boolean(),\n});\n\nexport const AggregateCalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n receipt_kind: z.literal('aggregate'), // discriminates from single-run receipt\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n 
fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n runs_count: z.number().int().min(2),\n run_files: z.array(z.string()), // relative paths: runs/run-001.json, etc.\n aggregated_at: z.string(), // ISO 8601\n research_os_version: z.string(),\n\n // Aggregate metrics — median + min + max + per-run values in run order\n good_fp_count: AggregateMetricSchema,\n any_flag_recall_ratio: AggregateMetricSchema,\n strict_recall_ratio: AggregateMetricSchema,\n decisions_produced_count: AggregateMetricSchema,\n runtime_ms: AggregateMetricSchema,\n empty_or_malformed_responses: AggregateMetricSchema,\n\n per_category_any_flag: PerCategoryAggregateSchema,\n per_category_strict: PerCategoryAggregateSchema,\n\n // Decision vocabulary — union of all decisions seen across runs, median count each\n decision_vocabulary: z.record(z.string(), AggregateMetricSchema),\n decision_vocab_bar: AggregateDecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n\n pass_fail: AggregatePassFailSchema,\n // Bars that FAILed in >= ceil(runs_count/2) individual runs.\n // Non-empty list demotes trusted_baseline to conditional_pass.\n recurring_bar_failures: z.array(z.string()),\n\n notes: z.array(z.string()),\n\n // schema_version: 1 — additive-optional (Exp6 Session 2):\n // Same options object stamped on every per-run receipt. 
Absent = stochastic run.\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type AggregateMetric = z.infer<typeof AggregateMetricSchema>;\nexport type PerCategoryAggregateEntry = z.infer<typeof PerCategoryAggregateEntrySchema>;\nexport type PerCategoryAggregate = z.infer<typeof PerCategoryAggregateSchema>;\nexport type AggregatePassFail = z.infer<typeof AggregatePassFailSchema>;\nexport type AggregateDecisionVocabBar = z.infer<typeof AggregateDecisionVocabBarSchema>;\nexport type AggregateCalibrationReceipt = z.infer<typeof AggregateCalibrationReceiptSchema>;\n","import { z } from 'zod';\nimport { ReviewerOptionsSchema } from '../review/reviewer-options-schema.js';\nexport { ReviewerOptionsSchema };\nexport type { ReviewerOptions } from '../review/reviewer-options-schema.js';\n\nexport const StatusLabelSchema = z.enum([\n 'trusted_baseline',\n 'conditional_pass',\n 'failed',\n 'comparison_only',\n]);\n\nexport const ArchitectureSchema = z.enum(['single-pass', 'two-pass']);\n\nexport const RecallSchema = z.object({\n matched: z.number().int().nonnegative(),\n total: z.number().int().nonnegative(),\n ratio: z.number().min(0).max(1),\n});\n\nexport const PerCategoryRecallSchema = z.record(z.string(), RecallSchema);\n\nexport const PassFailSchema = z.object({\n fp_ceiling: z.enum(['PASS', 'FAIL']),\n any_flag_recall_floor: z.enum(['PASS', 'FAIL']),\n per_category_any_flag_floor: z.enum(['PASS', 'FAIL']),\n strict_recall_floor: z.enum(['PASS', 'FAIL']),\n decision_vocab_completeness: z.enum(['PASS', 'FAIL']),\n latency_soft: z.enum(['PASS', 'WARN']),\n latency_hard: z.enum(['PASS', 'FAIL']),\n empty_or_malformed: z.enum(['PASS', 'FAIL']),\n overall: z.enum(['PASS', 'FAIL']),\n});\n\nexport const DecisionVocabBarSchema = z.object({\n architecture: ArchitectureSchema,\n required: z.number().int().positive(),\n produced: z.number().int().nonnegative(),\n passed: z.boolean(),\n});\n\n// schema_version: 1 — additive-optional additions (Exp6 Session 2):\n// 
reviewer_options: optional sampling params used during this calibration run.\n// Absent = stochastic run (pre-v0.6 compat preserved). Present = keys explicitly set.\nexport const CalibrationReceiptSchema = z.object({\n schema_version: z.literal(1),\n profile_name: z.string(),\n status: StatusLabelSchema,\n model: z.string(),\n architecture: ArchitectureSchema,\n fixture: z.string(),\n fixture_total_claims: z.number().int().positive(),\n fixture_good_claims: z.number().int().nonnegative(),\n fixture_bad_claims: z.number().int().nonnegative(),\n calibrated_at: z.string(),\n research_os_version: z.string(),\n runtime_ms: z.number().int().nonnegative(),\n good_fp_count: z.number().int().nonnegative(),\n any_flag_recall: RecallSchema,\n strict_recall: RecallSchema,\n per_category_any_flag: PerCategoryRecallSchema,\n per_category_strict: PerCategoryRecallSchema,\n decision_vocabulary: z.record(z.string(), z.number().int().nonnegative()),\n decisions_produced_count: z.number().int().nonnegative(),\n decision_vocab_bar: DecisionVocabBarSchema,\n unreachable_decisions: z.array(z.string()),\n empty_or_malformed_responses: z.number().int().nonnegative(),\n pass_fail: PassFailSchema,\n notes: z.array(z.string()),\n reviewer_options: ReviewerOptionsSchema.optional(),\n});\n\nexport type StatusLabel = z.infer<typeof StatusLabelSchema>;\nexport type Architecture = z.infer<typeof ArchitectureSchema>;\nexport type Recall = z.infer<typeof RecallSchema>;\nexport type PerCategoryRecall = z.infer<typeof PerCategoryRecallSchema>;\nexport type PassFail = z.infer<typeof PassFailSchema>;\nexport type DecisionVocabBar = z.infer<typeof DecisionVocabBarSchema>;\nexport type CalibrationReceipt = z.infer<typeof CalibrationReceiptSchema>;\n","import { z } from 'zod';\n\n// Sampling parameters passed verbatim to the Ollama /api/chat `options` field.\n// Used by OllamaInternReviewer to control determinism. All fields optional —\n// omitted keys fall back to Ollama/model defaults. 
Introduced in Experiment 6\n// Session 2 to make reviewer conditions explicit in calibration receipts.\n//\n// LOAD-BEARING: temperature: 0 is valid and must not be dropped. All merges\n// in OllamaInternReviewer use `!== undefined` checks, NOT truthiness.\nexport const ReviewerOptionsSchema = z.object({\n num_ctx: z.number().int().positive().optional(),\n temperature: z.number().min(0).max(2).optional(),\n seed: z.number().int().optional(),\n top_p: z.number().min(0).max(1).optional(),\n top_k: z.number().int().nonnegative().optional(),\n repeat_penalty: z.number().min(0).optional(),\n});\n\nexport type ReviewerOptions = z.infer<typeof ReviewerOptionsSchema>;\n","import type { Architecture, CalibrationReceipt, PassFail, PerCategoryRecall, ReviewerOptions } from './receipt-schema.js';\nimport {\n AggregateCalibrationReceiptSchema,\n type AggregateCalibrationReceipt,\n type AggregateMetric,\n type AggregatePassFail,\n type PerCategoryAggregate,\n} from './aggregate-receipt-schema.js';\nimport type { StatusLabel } from './receipt-schema.js';\n\n// Compute median of a sorted or unsorted array.\n// Throws on empty input — callers always have at least one run.\n// For even-length arrays: mean of two middle values (float, not rounded).\n// Integer-valued metrics (FP count, decisions) stay as floats here;\n// the caller's bar comparisons (>= 3, === 0) work correctly on exact floats\n// because the inputs are small integers.\nexport function median(values: number[]): number {\n if (values.length === 0) throw new Error('median: empty array');\n const sorted = [...values].sort((a, b) => a - b);\n const mid = Math.floor(sorted.length / 2);\n if (sorted.length % 2 === 1) return sorted[mid];\n return (sorted[mid - 1] + sorted[mid]) / 2;\n}\n\n// Aggregate a list of per-run scalar values into { median, min, max, values }.\n// values preserves input order (run-001, run-002, ...) 
for traceability.\nexport function aggregateMetric(values: number[]): AggregateMetric {\n const m = median(values);\n return {\n median: m,\n min: Math.min(...values),\n max: Math.max(...values),\n values,\n };\n}\n\n// Aggregate per-run per-category recall objects.\n// Each element of perRunBuckets is one run's PerCategoryRecall\n// (Record<category, { matched, total, ratio }>).\n// Returns PerCategoryAggregate: per-category median/min/max ratio + per-run ratios.\n// total is taken from the first run that has the category (same across runs —\n// SEEDS is static so category totals never change between runs).\nexport function aggregatePerCategoryRecall(\n perRunBuckets: PerCategoryRecall[],\n): PerCategoryAggregate {\n const cats = new Set<string>();\n for (const run of perRunBuckets) {\n for (const cat of Object.keys(run)) cats.add(cat);\n }\n\n const result: PerCategoryAggregate = {};\n for (const cat of cats) {\n const ratios = perRunBuckets.map((run) => run[cat]?.ratio ?? 0);\n const total = perRunBuckets.find((run) => run[cat] !== undefined)?.[cat]?.total ?? 0;\n result[cat] = {\n median_ratio: median(ratios),\n min_ratio: Math.min(...ratios),\n max_ratio: Math.max(...ratios),\n total,\n per_run_ratios: ratios,\n };\n }\n return result;\n}\n\n// Aggregate per-run decision vocabulary count dicts.\n// Each element is one run's decision_vocabulary (Record<decision, count>).\n// Returns Record<decision, AggregateMetric> with median count per decision.\nexport function aggregateDecisionVocabulary(\n perRunDicts: Record<string, number>[],\n): Record<string, AggregateMetric> {\n const decisions = new Set<string>();\n for (const run of perRunDicts) {\n for (const d of Object.keys(run)) decisions.add(d);\n }\n\n const result: Record<string, AggregateMetric> = {};\n for (const d of decisions) {\n const values = perRunDicts.map((run) => run[d] ?? 
0);\n result[d] = aggregateMetric(values);\n }\n return result;\n}\n\n// Compute aggregate PASS/FAIL bars from aggregated metrics.\n//\n// Advisor-locked rules (gospel):\n// FP ceiling: median <= 1 AND max <= 2\n// Any-flag recall: median >= 0.65\n// Per-category: median_ratio >= 0.50 for categories with total >= 2\n// Strict recall: median >= 0.20\n// Decision vocab: median >= required (architecture-aware: two-pass=3, single-pass=4)\n// Latency soft: median <= 600_000 → WARN only, never FAIL\n// Latency hard: every-run rule — max <= 1_200_000\n// Empty/malformed: every-run rule — max === 0\nexport function computeAggregatePassFail(input: {\n good_fp_count: AggregateMetric;\n any_flag_recall_ratio: AggregateMetric;\n per_category_any_flag: PerCategoryAggregate;\n strict_recall_ratio: AggregateMetric;\n decisions_produced_count: AggregateMetric;\n architecture: Architecture;\n runtime_ms: AggregateMetric;\n empty_or_malformed_responses: AggregateMetric;\n}): AggregatePassFail {\n const fp_ceiling: 'PASS' | 'FAIL' =\n input.good_fp_count.median <= 1 && input.good_fp_count.max <= 2 ? 'PASS' : 'FAIL';\n\n const any_flag_recall_floor: 'PASS' | 'FAIL' =\n input.any_flag_recall_ratio.median >= 0.65 ? 'PASS' : 'FAIL';\n\n let per_category_any_flag_floor: 'PASS' | 'FAIL' = 'PASS';\n for (const entry of Object.values(input.per_category_any_flag)) {\n if (entry.total >= 2 && entry.median_ratio < 0.5) {\n per_category_any_flag_floor = 'FAIL';\n break;\n }\n }\n\n const strict_recall_floor: 'PASS' | 'FAIL' =\n input.strict_recall_ratio.median >= 0.2 ? 'PASS' : 'FAIL';\n\n const dvRequired = input.architecture === 'two-pass' ? 3 : 4;\n const decision_vocab_completeness: 'PASS' | 'FAIL' =\n input.decisions_produced_count.median >= dvRequired ? 'PASS' : 'FAIL';\n\n // Latency soft: WARN-only signal — no FAIL contribution\n const latency_soft: 'PASS' | 'WARN' =\n input.runtime_ms.median <= 600_000 ? 
'PASS' : 'WARN';\n\n // Latency hard: every-run rule — enforced via max\n const latency_hard: 'PASS' | 'FAIL' =\n input.runtime_ms.max <= 1_200_000 ? 'PASS' : 'FAIL';\n\n // Empty/malformed: every-run rule — enforced via max\n const empty_or_malformed: 'PASS' | 'FAIL' =\n input.empty_or_malformed_responses.max === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: ('PASS' | 'FAIL')[] = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall: 'PASS' | 'FAIL' = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Compute which hard bars FAILed in >= ceil(N/2) individual runs.\n// A non-empty result means that bar was SYSTEMATICALLY unreliable —\n// not just a one-run outlier that happened to median-pass.\n// This is used by computeAggregateStatusLabel to prevent a profile from\n// earning trusted_baseline when one bar failed in the majority of runs.\n//\n// Hard bars checked (latency_soft and overall are excluded):\n// fp_ceiling, any_flag_recall_floor, per_category_any_flag_floor,\n// strict_recall_floor, decision_vocab_completeness, latency_hard, empty_or_malformed\nexport function computeRecurringBarFailures(\n perRunPassFails: PassFail[],\n totalRuns: number,\n): string[] {\n const threshold = Math.ceil(totalRuns / 2);\n const HARD_BARS: (keyof PassFail)[] = [\n 'fp_ceiling',\n 'any_flag_recall_floor',\n 'per_category_any_flag_floor',\n 'strict_recall_floor',\n 'decision_vocab_completeness',\n 'latency_hard',\n 'empty_or_malformed',\n ];\n\n const recurring: string[] = [];\n for (const bar of HARD_BARS) {\n const failCount = perRunPassFails.filter((pf) => pf[bar] === 'FAIL').length;\n if (failCount >= threshold) 
recurring.push(bar);\n }\n return recurring;\n}\n\n// Assign aggregate status label.\n//\n// Advisor-locked predicates (priority order):\n// 1. comparison_only — explicit mode flag OR single-pass Hermes (regardless of pass/fail)\n// 2. failed — aggregate pass_fail.overall === FAIL\n// 3. trusted_baseline — Hermes two-pass AND aggregate PASS AND median(FP) === 0\n// AND recurring_bar_failures.length === 0\n// The recurring-failure check prevents a profile from earning trusted_baseline\n// when any hard bar FAILed in >= ceil(N/2) runs even if the median still passed.\n// Intent: \"one lucky median cannot mask systemic bar weakness.\"\n// 4. conditional_pass — fallthrough (passes but doesn't earn trusted_baseline)\n// Mistral two-pass is capped at conditional_pass regardless of aggregate result.\nexport function computeAggregateStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n aggregatePassFail: AggregatePassFail;\n medianGoodFpCount: number;\n recurringBarFailures: string[];\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n if (input.aggregatePassFail.overall === 'FAIL') return 'failed';\n\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (\n isHermesTwoPass &&\n input.medianGoodFpCount === 0 &&\n input.recurringBarFailures.length === 0\n ) {\n return 'trusted_baseline';\n }\n\n return 'conditional_pass';\n}\n\n// Aggregate N single-run receipts into one AggregateCalibrationReceipt.\n// All receipts must be from the same profile/model/architecture.\n// opts.runFiles: relative paths for each run (e.g. 
'runs/run-001.json').\n// opts.modeOverride: forward 'comparison_only' to status-label predicate.\n// opts.aggregatedAt: ISO timestamp (defaults to now).\n// opts.reviewerOptions: reviewer sampling options stamped on each per-run receipt.\n// Captured once at harness startup and reused across all N runs. The aggregate\n// carries the same object so consumers can reproduce the exact invocation.\nexport function aggregateReceipts(\n runs: CalibrationReceipt[],\n opts: {\n runFiles: string[];\n modeOverride?: 'comparison_only';\n aggregatedAt?: string;\n reviewerOptions?: ReviewerOptions;\n },\n): AggregateCalibrationReceipt {\n if (runs.length === 0) throw new Error('aggregateReceipts: no runs provided');\n const first = runs[0];\n\n const fpMetric = aggregateMetric(runs.map((r) => r.good_fp_count));\n const anyFlagRatioMetric = aggregateMetric(runs.map((r) => r.any_flag_recall.ratio));\n const strictRatioMetric = aggregateMetric(runs.map((r) => r.strict_recall.ratio));\n const decisionsMetric = aggregateMetric(runs.map((r) => r.decisions_produced_count));\n const runtimeMetric = aggregateMetric(runs.map((r) => r.runtime_ms));\n const emptyOrMalformedMetric = aggregateMetric(\n runs.map((r) => r.empty_or_malformed_responses),\n );\n\n const perCatAnyFlag = aggregatePerCategoryRecall(runs.map((r) => r.per_category_any_flag));\n const perCatStrict = aggregatePerCategoryRecall(runs.map((r) => r.per_category_strict));\n const decisionVocab = aggregateDecisionVocabulary(runs.map((r) => r.decision_vocabulary));\n\n const dvRequired = first.architecture === 'two-pass' ? 
3 : 4;\n const decisionVocabBar = {\n architecture: first.architecture,\n required: dvRequired,\n median_produced: decisionsMetric.median,\n passed: decisionsMetric.median >= dvRequired,\n };\n\n const aggregatePassFail = computeAggregatePassFail({\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n per_category_any_flag: perCatAnyFlag,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n architecture: first.architecture,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n });\n\n const recurringBarFailures = computeRecurringBarFailures(\n runs.map((r) => r.pass_fail),\n runs.length,\n );\n\n const status = computeAggregateStatusLabel({\n profileName: first.profile_name,\n architecture: first.architecture,\n aggregatePassFail,\n medianGoodFpCount: fpMetric.median,\n recurringBarFailures,\n modeOverride: opts.modeOverride,\n });\n\n const notes: string[] = [];\n if (aggregatePassFail.latency_soft === 'WARN') {\n notes.push(\n `Latency warning: median ${(runtimeMetric.median / 1000).toFixed(1)}s exceeds soft limit of 600s`,\n );\n }\n if (fpMetric.median > 0) {\n notes.push(`FP at ceiling: median ${fpMetric.median} false positive(s) on good claims`);\n }\n if (recurringBarFailures.length > 0) {\n notes.push(`Recurring bar failures (>= ceil(N/2) runs): ${recurringBarFailures.join(', ')}`);\n }\n if (status === 'comparison_only') {\n notes.push(\n 'comparison_only: architectural side-run, not a production admission candidate',\n );\n }\n if (status === 'conditional_pass') {\n notes.push('conditional_pass: passes all bars but carries a production caution');\n }\n\n return AggregateCalibrationReceiptSchema.parse({\n schema_version: 1,\n receipt_kind: 'aggregate',\n profile_name: first.profile_name,\n status,\n model: first.model,\n architecture: first.architecture,\n fixture: first.fixture,\n fixture_total_claims: first.fixture_total_claims,\n fixture_good_claims: 
first.fixture_good_claims,\n fixture_bad_claims: first.fixture_bad_claims,\n runs_count: runs.length,\n run_files: opts.runFiles,\n aggregated_at: opts.aggregatedAt ?? new Date().toISOString(),\n research_os_version: first.research_os_version,\n good_fp_count: fpMetric,\n any_flag_recall_ratio: anyFlagRatioMetric,\n strict_recall_ratio: strictRatioMetric,\n decisions_produced_count: decisionsMetric,\n runtime_ms: runtimeMetric,\n empty_or_malformed_responses: emptyOrMalformedMetric,\n per_category_any_flag: perCatAnyFlag,\n per_category_strict: perCatStrict,\n decision_vocabulary: decisionVocab,\n decision_vocab_bar: decisionVocabBar,\n unreachable_decisions: first.unreachable_decisions,\n pass_fail: aggregatePassFail,\n recurring_bar_failures: recurringBarFailures,\n notes,\n ...(opts.reviewerOptions && Object.keys(opts.reviewerOptions).length > 0 && {\n reviewer_options: opts.reviewerOptions,\n }),\n });\n}\n\n// Stable key order for reviewer_options rendering (matches single-run receipt).\nconst REVIEWER_OPTIONS_KEY_ORDER = [\n 'num_ctx',\n 'temperature',\n 'seed',\n 'top_p',\n 'top_k',\n 'repeat_penalty',\n] as const;\n\nfunction buildReviewerOptionsSection(opts: AggregateCalibrationReceipt['reviewer_options']): string {\n if (!opts) return '';\n const lines = REVIEWER_OPTIONS_KEY_ORDER\n .filter((k) => opts[k] !== undefined)\n .map((k) => `- ${k}: ${opts[k]}`);\n if (lines.length === 0) return '';\n return `\\n## Reviewer options\\n\\n${lines.join('\\n')}\\n`;\n}\n\n// Render the aggregate calibration receipt as compact Markdown.\n// Operator proof artifact — no prose.\nexport function buildAggregateReceiptMarkdown(r: AggregateCalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const secRounded = (ms: number) => `${(ms / 1000).toFixed(1)}s`;\n\n const af = r.any_flag_recall_ratio;\n const sr = r.strict_recall_ratio;\n const fp = r.good_fp_count;\n const dec = r.decisions_produced_count;\n const rt = r.runtime_ms;\n 
const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const runFileList =\n r.run_files.length > 0\n ? `${r.run_files[0]} … ${r.run_files[r.run_files.length - 1]}`\n : '(none)';\n\n const perCatAnyFlagRows = Object.entries(r.per_category_any_flag)\n .map(([cat, entry]) => {\n const st = r.per_category_strict[cat];\n return (\n `| ${cat} | ${pct(entry.median_ratio)} | ${pct(entry.min_ratio)}–${pct(entry.max_ratio)} | ${entry.total} |` +\n (st\n ? ` ${pct(st.median_ratio)} | ${pct(st.min_ratio)}–${pct(st.max_ratio)} |`\n : ' — | — |')\n );\n })\n .join('\\n');\n\n const ALL_DECISIONS = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ];\n const dvRows = ALL_DECISIONS.map((d) => {\n const metric = r.decision_vocabulary[d];\n const unreachable = r.unreachable_decisions.includes(d)\n ? ` (unreachable from ${r.fixture})`\n : '';\n if (!metric) return `| ${d} | — | — |${unreachable}`;\n return `| ${d} | ${metric.median.toFixed(1)} | ${metric.min}–${metric.max}${unreachable} |`;\n }).join('\\n');\n\n // Per-run summary table — pulled from run_files labels for clarity\n const perRunRows = r.any_flag_recall_ratio.values\n .map((afr, i) => {\n const fp_i = r.good_fp_count.values[i] ?? '?';\n const sr_i = r.strict_recall_ratio.values[i] ?? '?';\n const dec_i = r.decisions_produced_count.values[i] ?? '?';\n const rt_i = r.runtime_ms.values[i] ?? '?';\n return `| ${i + 1} | ${fp_i}/${r.fixture_good_claims} | ${typeof afr === 'number' ? pct(afr) : '?'} | ${typeof sr_i === 'number' ? pct(sr_i) : '?'} | ${dec_i}/6 | ${typeof rt_i === 'number' ? secRounded(rt_i) : '?'} |`;\n })\n .join('\\n');\n\n const recurringSection =\n r.recurring_bar_failures.length > 0\n ? r.recurring_bar_failures.map((b) => `- ${b}`).join('\\n')\n : 'None.';\n\n const notesSection =\n r.notes.length > 0 ? 
`\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n const reviewerOptionsSection = buildReviewerOptionsSection(r.reviewer_options);\n\n return `# Calibration Receipt — ${r.profile_name} (aggregate, N=${r.runs_count} runs)\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Aggregated at:** ${r.aggregated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Run count:** ${r.runs_count}\n- **Run files:** ${runFileList}\n${reviewerOptionsSection}\n## Headline metrics (median across runs)\n\n- FP: median ${fp.median} / ${r.fixture_good_claims} (range ${fp.min}–${fp.max})\n- Any-flag recall: median ${pct(af.median)} (range ${pct(af.min)}–${pct(af.max)})\n- Strict recall: median ${pct(sr.median)} (range ${pct(sr.min)}–${pct(sr.max)})\n- Decisions produced: median ${dec.median} / 6 (range ${dec.min}–${dec.max})\n\n## PASS / FAIL (aggregate)\n\n| Bar | Rule | Result |\n|---|---|---|\n| FP ceiling | median=${fp.median}, max=${fp.max} (median ≤1 AND max ≤2) | ${pf.fp_ceiling} |\n| Any-flag recall | median=${pct(af.median)} (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag | median ≥50% per cat (see below) | ${pf.per_category_any_flag_floor} |\n| Strict recall | median=${pct(sr.median)} (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab | median=${dec.median} / 6 (${bar.architecture} ≥${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft | median=${secRounded(rt.median)} (≤600s, WARN only) | ${pf.latency_soft} |\n| Latency hard | max=${secRounded(rt.max)} (every run ≤1200s) | ${pf.latency_hard} |\n| Empty/malformed | max=${r.empty_or_malformed_responses.max} (every run =0) | ${pf.empty_or_malformed} |\n| **OVERALL** | | **${pf.overall}** |\n\n## Recurring hard-bar failures\n\n${recurringSection}\n\n## Per-category recall (median across 
runs)\n\n| Category | Any-flag median | Any-flag range | Total | Strict median | Strict range |\n|---|---|---|---|---|---|\n${perCatAnyFlagRows}\n\n## Decision vocabulary (median count across runs)\n\n| Decision | Median | Range |\n|---|---|---|\n${dvRows}\n\n## Per-run summary\n\n| Run | FP | Any-flag | Strict | Decisions | Runtime |\n|---|---|---|---|---|---|\n${perRunRows}\n${notesSection}`;\n}\n"],"mappings":";AAAA,SAAS,KAAAA,UAAS;;;ACAlB,SAAS,KAAAC,UAAS;;;ACAlB,SAAS,SAAS;AASX,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,SAAS,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS;AAAA,EAC9C,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EAC/C,MAAM,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,SAAS;AAAA,EAC/C,gBAAgB,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS;AAC7C,CAAC;;;ADXM,IAAM,oBAAoBC,GAAE,KAAK;AAAA,EACtC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,IAAM,qBAAqBA,GAAE,KAAK,CAAC,eAAe,UAAU,CAAC;AAE7D,IAAM,eAAeA,GAAE,OAAO;AAAA,EACnC,SAASA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACtC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACpC,OAAOA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAChC,CAAC;AAEM,IAAM,0BAA0BA,GAAE,OAAOA,GAAE,OAAO,GAAG,YAAY;AAEjE,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,yBAAyBA,GAAE,OAAO;AAAA,EAC7C,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvC,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAKM,IAAM,2BAA2BA,GAAE,OAAO;AAAA,EAC/C,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GA
AE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,eAAeA,GAAE,OAAO;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA,EAC9B,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACzC,eAAeA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC5C,iBAAiB;AAAA,EACjB,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY,CAAC;AAAA,EACxE,0BAA0BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACvD,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzC,8BAA8BA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAC3D,WAAW;AAAA,EACX,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EACzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;;;ADnEM,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,QAAQA,GAAE,OAAO;AAAA,EACjB,KAAKA,GAAE,OAAO;AAAA,EACd,KAAKA,GAAE,OAAO;AAAA,EACd,QAAQA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAC5B,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAcA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EACrC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAClC,OAAOA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA;AAAA,EACpC,gBAAgBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AACpC,CAAC;AAEM,IAAM,6BAA6BA,GAAE,OAAOA,GAAE,OAAO,GAAG,+BAA+B;AAEvF,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,YAAYA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACnC,uBAAuBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC9C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,qBAAqBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC5C,6BAA6BA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACpD,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,cAAcA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EACrC,oBAAoBA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,EAC3C,SAASA,GAAE,KAAK,CAAC,QAAQ,MAAM,CAAC;AAClC,CAAC;AAEM,IAAM,kCAAkCA,GAAE,OAAO;AAAA,EACtD,cAAc;AAAA,EACd,UAAUA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACpC,iBAAiBA,GAAE,OAAO;AAAA;AAAA,EAC1B,QAAQA,GAAE,QAAQ;AACpB,CAAC;AAEM,IAAM,oCAAoCA,GAAE,OAAO;AAAA,EACxD,gBAAgBA,GAAE,QAAQ,CAAC;AAAA,EAC3B,cAAcA,GA
AE,QAAQ,WAAW;AAAA;AAAA,EACnC,cAAcA,GAAE,OAAO;AAAA,EACvB,QAAQ;AAAA,EACR,OAAOA,GAAE,OAAO;AAAA,EAChB,cAAc;AAAA,EACd,SAASA,GAAE,OAAO;AAAA,EAClB,sBAAsBA,GAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAChD,qBAAqBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EAClD,oBAAoBA,GAAE,OAAO,EAAE,IAAI,EAAE,YAAY;AAAA,EACjD,YAAYA,GAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC;AAAA,EAClC,WAAWA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA,EAC7B,eAAeA,GAAE,OAAO;AAAA;AAAA,EACxB,qBAAqBA,GAAE,OAAO;AAAA;AAAA,EAG9B,eAAe;AAAA,EACf,uBAAuB;AAAA,EACvB,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,YAAY;AAAA,EACZ,8BAA8B;AAAA,EAE9B,uBAAuB;AAAA,EACvB,qBAAqB;AAAA;AAAA,EAGrB,qBAAqBA,GAAE,OAAOA,GAAE,OAAO,GAAG,qBAAqB;AAAA,EAC/D,oBAAoB;AAAA,EACpB,uBAAuBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAEzC,WAAW;AAAA;AAAA;AAAA,EAGX,wBAAwBA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA,EAE1C,OAAOA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAAA;AAAA;AAAA,EAIzB,kBAAkB,sBAAsB,SAAS;AACnD,CAAC;;;AGjEM,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,qBAAqB;AAC9D,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC/C,QAAM,MAAM,KAAK,MAAM,OAAO,SAAS,CAAC;AACxC,MAAI,OAAO,SAAS,MAAM,EAAG,QAAO,OAAO,GAAG;AAC9C,UAAQ,OAAO,MAAM,CAAC,IAAI,OAAO,GAAG,KAAK;AAC3C;AAIO,SAAS,gBAAgB,QAAmC;AACjE,QAAM,IAAI,OAAO,MAAM;AACvB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB,KAAK,KAAK,IAAI,GAAG,MAAM;AAAA,IACvB;AAAA,EACF;AACF;AAQO,SAAS,2BACd,eACsB;AACtB,QAAM,OAAO,oBAAI,IAAY;AAC7B,aAAW,OAAO,eAAe;AAC/B,eAAW,OAAO,OAAO,KAAK,GAAG,EAAG,MAAK,IAAI,GAAG;AAAA,EAClD;AAEA,QAAM,SAA+B,CAAC;AACtC,aAAW,OAAO,MAAM;AACtB,UAAM,SAAS,cAAc,IAAI,CAAC,QAAQ,IAAI,GAAG,GAAG,SAAS,CAAC;AAC9D,UAAM,QAAQ,cAAc,KAAK,CAAC,QAAQ,IAAI,GAAG,MAAM,MAAS,IAAI,GAAG,GAAG,SAAS;AACnF,WAAO,GAAG,IAAI;AAAA,MACZ,cAAc,OAAO,MAAM;AAAA,MAC3B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B,WAAW,KAAK,IAAI,GAAG,MAAM;AAAA,MAC7B;AAAA,MACA,gBAAgB;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,4BACd,aACiC;AACjC,QAAM,YAAY,oBAAI,IAAY;AAClC,aAAW,OAAO,aAAa;AAC7B,eAAW,KAAK,OAAO,KAAK,GAAG,EAAG,WAAU,IAAI,CAAC;AAAA,EACnD;AAEA,QAAM,SAA0C,CAAC;AACjD,aAAW,KAAK,WAAW;AACzB,UAAM,SAAS,YAAY
,IAAI,CAAC,QAAQ,IAAI,CAAC,KAAK,CAAC;AACnD,WAAO,CAAC,IAAI,gBAAgB,MAAM;AAAA,EACpC;AACA,SAAO;AACT;AAaO,SAAS,yBAAyB,OASnB;AACpB,QAAM,aACJ,MAAM,cAAc,UAAU,KAAK,MAAM,cAAc,OAAO,IAAI,SAAS;AAE7E,QAAM,wBACJ,MAAM,sBAAsB,UAAU,OAAO,SAAS;AAExD,MAAI,8BAA+C;AACnD,aAAW,SAAS,OAAO,OAAO,MAAM,qBAAqB,GAAG;AAC9D,QAAI,MAAM,SAAS,KAAK,MAAM,eAAe,KAAK;AAChD,oCAA8B;AAC9B;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBACJ,MAAM,oBAAoB,UAAU,MAAM,SAAS;AAErD,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,8BACJ,MAAM,yBAAyB,UAAU,aAAa,SAAS;AAGjE,QAAM,eACJ,MAAM,WAAW,UAAU,MAAU,SAAS;AAGhD,QAAM,eACJ,MAAM,WAAW,OAAO,OAAY,SAAS;AAG/C,QAAM,qBACJ,MAAM,6BAA6B,QAAQ,IAAI,SAAS;AAE1D,QAAM,WAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAA2B,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAEhF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAWO,SAAS,4BACd,iBACA,WACU;AACV,QAAM,YAAY,KAAK,KAAK,YAAY,CAAC;AACzC,QAAM,YAAgC;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,QAAM,YAAsB,CAAC;AAC7B,aAAW,OAAO,WAAW;AAC3B,UAAM,YAAY,gBAAgB,OAAO,CAAC,OAAO,GAAG,GAAG,MAAM,MAAM,EAAE;AACrE,QAAI,aAAa,UAAW,WAAU,KAAK,GAAG;AAAA,EAChD;AACA,SAAO;AACT;AAcO,SAAS,4BAA4B,OAO5B;AACd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAErD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,kBAAkB,YAAY,OAAQ,QAAO;AAEvD,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MACE,mBACA,MAAM,sBAAsB,KAC5B,MAAM,qBAAqB,WAAW,GACtC;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAUO,SAAS,kBACd,MACA,MAM6B;AAC7B,MAAI,KAAK,WAAW,EAAG,OAAM,IAAI,MAAM,qCAAqC;AAC5E,QAAM,QAAQ,KAAK,CAAC;AAEpB,QAAM,WAAW,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,aAAa,CAAC;AACjE,QAAM,qBAAqB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,gBAAgB,KAAK,CAAC;AACnF,QAAM,oBAAoB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,cAAc,KAAK,CAAC;AAChF,QAAM,kBAAkB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,wBAAwB,CAAC;AACnF,QAAM,gBAAgB,gBAAgB,KAAK,IAAI,CAAC,MAAM,EAAE,UAAU,CAAC;AACnE,QAAM,yBAAyB;AAAA,IAC7B,KAAK,IAAI,CAAC,MAAM,EAAE,4BAA
4B;AAAA,EAChD;AAEA,QAAM,gBAAgB,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,qBAAqB,CAAC;AACzF,QAAM,eAAe,2BAA2B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AACtF,QAAM,gBAAgB,4BAA4B,KAAK,IAAI,CAAC,MAAM,EAAE,mBAAmB,CAAC;AAExF,QAAM,aAAa,MAAM,iBAAiB,aAAa,IAAI;AAC3D,QAAM,mBAAmB;AAAA,IACvB,cAAc,MAAM;AAAA,IACpB,UAAU;AAAA,IACV,iBAAiB,gBAAgB;AAAA,IACjC,QAAQ,gBAAgB,UAAU;AAAA,EACpC;AAEA,QAAM,oBAAoB,yBAAyB;AAAA,IACjD,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,cAAc,MAAM;AAAA,IACpB,YAAY;AAAA,IACZ,8BAA8B;AAAA,EAChC,CAAC;AAED,QAAM,uBAAuB;AAAA,IAC3B,KAAK,IAAI,CAAC,MAAM,EAAE,SAAS;AAAA,IAC3B,KAAK;AAAA,EACP;AAEA,QAAM,SAAS,4BAA4B;AAAA,IACzC,aAAa,MAAM;AAAA,IACnB,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,mBAAmB,SAAS;AAAA,IAC5B;AAAA,IACA,cAAc,KAAK;AAAA,EACrB,CAAC;AAED,QAAM,QAAkB,CAAC;AACzB,MAAI,kBAAkB,iBAAiB,QAAQ;AAC7C,UAAM;AAAA,MACJ,4BAA4B,cAAc,SAAS,KAAM,QAAQ,CAAC,CAAC;AAAA,IACrE;AAAA,EACF;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,yBAAyB,SAAS,MAAM,mCAAmC;AAAA,EACxF;AACA,MAAI,qBAAqB,SAAS,GAAG;AACnC,UAAM,KAAK,+CAA+C,qBAAqB,KAAK,IAAI,CAAC,EAAE;AAAA,EAC7F;AACA,MAAI,WAAW,mBAAmB;AAChC,UAAM;AAAA,MACJ;AAAA,IACF;AAAA,EACF;AACA,MAAI,WAAW,oBAAoB;AACjC,UAAM,KAAK,oEAAoE;AAAA,EACjF;AAEA,SAAO,kCAAkC,MAAM;AAAA,IAC7C,gBAAgB;AAAA,IAChB,cAAc;AAAA,IACd,cAAc,MAAM;AAAA,IACpB;AAAA,IACA,OAAO,MAAM;AAAA,IACb,cAAc,MAAM;AAAA,IACpB,SAAS,MAAM;AAAA,IACf,sBAAsB,MAAM;AAAA,IAC5B,qBAAqB,MAAM;AAAA,IAC3B,oBAAoB,MAAM;AAAA,IAC1B,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,eAAe,KAAK,iBAAgB,oBAAI,KAAK,GAAE,YAAY;AAAA,IAC3D,qBAAqB,MAAM;AAAA,IAC3B,eAAe;AAAA,IACf,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,0BAA0B;AAAA,IAC1B,YAAY;AAAA,IACZ,8BAA8B;AAAA,IAC9B,uBAAuB;AAAA,IACvB,qBAAqB;AAAA,IACrB,qBAAqB;AAAA,IACrB,oBAAoB;AAAA,IACpB,uBAAuB,MAAM;AAAA,IAC7B,WAAW;AAAA,IACX,wBAAwB;AAAA,IACxB;AAAA,IACA,GAAI,KAAK,mBAAmB,OAAO,KAAK,KAAK,eAAe,EAAE,SAAS,KAAK;AAAA,MAC1E,kBAAkB,KAAK;AAAA,IACzB;AAAA,EACF,CAAC;AACH;AAGA,IAAM,6BAA6B;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,4BAA4B,MAA+D;AAClG,MAAI,CAAC,KAAM,QAAO;AAClB,
QAAM,QAAQ,2BACX,OAAO,CAAC,MAAM,KAAK,CAAC,MAAM,MAAS,EACnC,IAAI,CAAC,MAAM,KAAK,CAAC,KAAK,KAAK,CAAC,CAAC,EAAE;AAClC,MAAI,MAAM,WAAW,EAAG,QAAO;AAC/B,SAAO;AAAA;AAAA;AAAA,EAA4B,MAAM,KAAK,IAAI,CAAC;AAAA;AACrD;AAIO,SAAS,8BAA8B,GAAwC;AACpF,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,aAAa,CAAC,OAAe,IAAI,KAAK,KAAM,QAAQ,CAAC,CAAC;AAE5D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AACd,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,cACJ,EAAE,UAAU,SAAS,IACjB,GAAG,EAAE,UAAU,CAAC,CAAC,WAAM,EAAE,UAAU,EAAE,UAAU,SAAS,CAAC,CAAC,KAC1D;AAEN,QAAM,oBAAoB,OAAO,QAAQ,EAAE,qBAAqB,EAC7D,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM;AACrB,UAAM,KAAK,EAAE,oBAAoB,GAAG;AACpC,WACE,KAAK,GAAG,MAAM,IAAI,MAAM,YAAY,CAAC,MAAM,IAAI,MAAM,SAAS,CAAC,SAAI,IAAI,MAAM,SAAS,CAAC,MAAM,MAAM,KAAK,QACvG,KACG,IAAI,IAAI,GAAG,YAAY,CAAC,MAAM,IAAI,GAAG,SAAS,CAAC,SAAI,IAAI,GAAG,SAAS,CAAC,OACpE;AAAA,EAER,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,gBAAgB;AAAA,IACpB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,SAAS,cAAc,IAAI,CAAC,MAAM;AACtC,UAAM,SAAS,EAAE,oBAAoB,CAAC;AACtC,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAClD,sBAAsB,EAAE,OAAO,MAC/B;AACJ,QAAI,CAAC,OAAQ,QAAO,KAAK,CAAC,uBAAa,WAAW;AAClD,WAAO,KAAK,CAAC,MAAM,OAAO,OAAO,QAAQ,CAAC,CAAC,MAAM,OAAO,GAAG,SAAI,OAAO,GAAG,GAAG,WAAW;AAAA,EACzF,CAAC,EAAE,KAAK,IAAI;AAGZ,QAAM,aAAa,EAAE,sBAAsB,OACxC,IAAI,CAAC,KAAK,MAAM;AACf,UAAM,OAAO,EAAE,cAAc,OAAO,CAAC,KAAK;AAC1C,UAAM,OAAO,EAAE,oBAAoB,OAAO,CAAC,KAAK;AAChD,UAAM,QAAQ,EAAE,yBAAyB,OAAO,CAAC,KAAK;AACtD,UAAM,OAAO,EAAE,WAAW,OAAO,CAAC,KAAK;AACvC,WAAO,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,EAAE,mBAAmB,MAAM,OAAO,QAAQ,WAAW,IAAI,GAAG,IAAI,GAAG,MAAM,OAAO,SAAS,WAAW,IAAI,IAAI,IAAI,GAAG,MAAM,KAAK,QAAQ,OAAO,SAAS,WAAW,WAAW,IAAI,IAAI,GAAG;AAAA,EACxN,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,mBACJ,EAAE,uBAAuB,SAAS,IAC9B,EAAE,uBAAuB,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACvD;AAEN,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,QAAM,yBAAyB,4BA
A4B,EAAE,gBAAgB;AAE7E,SAAO,gCAA2B,EAAE,YAAY,kBAAkB,EAAE,UAAU;AAAA;AAAA,eAEjE,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,mBAC/B,EAAE,UAAU;AAAA,mBACZ,WAAW;AAAA,EAC5B,sBAAsB;AAAA;AAAA;AAAA,eAGT,GAAG,MAAM,MAAM,EAAE,mBAAmB,WAAW,GAAG,GAAG,SAAI,GAAG,GAAG;AAAA,4BAClD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,0BACrD,IAAI,GAAG,MAAM,CAAC,WAAW,IAAI,GAAG,GAAG,CAAC,SAAI,IAAI,GAAG,GAAG,CAAC;AAAA,+BAC9C,IAAI,MAAM,eAAe,IAAI,GAAG,SAAI,IAAI,GAAG;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,wBAMlD,GAAG,MAAM,SAAS,GAAG,GAAG,uCAA6B,GAAG,UAAU;AAAA,6BAC7D,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,qBAAqB;AAAA,mEAClB,GAAG,2BAA2B;AAAA,2BACjE,IAAI,GAAG,MAAM,CAAC,kBAAa,GAAG,mBAAmB;AAAA,4BAChD,IAAI,MAAM,SAAS,IAAI,YAAY,UAAK,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,0BAC3F,WAAW,GAAG,MAAM,CAAC,8BAAyB,GAAG,YAAY;AAAA,uBAChE,WAAW,GAAG,GAAG,CAAC,8BAAyB,GAAG,YAAY;AAAA,0BACvD,EAAE,6BAA6B,GAAG,qBAAqB,GAAG,kBAAkB;AAAA,sBAChF,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA,EAI9B,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMhB,iBAAiB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMjB,MAAM;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMN,UAAU;AAAA,EACV,YAAY;AACd;","names":["z","z","z","z"]}
|