@mcptoolshop/research-os 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +210 -0
- package/README.es.md +26 -1
- package/README.fr.md +30 -5
- package/README.hi.md +45 -5
- package/README.it.md +26 -1
- package/README.ja.md +30 -5
- package/README.md +39 -1
- package/README.pt-BR.md +26 -1
- package/README.zh.md +26 -1
- package/dist/calibration/aggregate-receipt-schema.d.ts +509 -0
- package/dist/calibration/aggregate-receipt-schema.js +143 -0
- package/dist/calibration/aggregate-receipt-schema.js.map +1 -0
- package/dist/calibration/aggregate.d.ts +35 -0
- package/dist/calibration/aggregate.js +454 -0
- package/dist/calibration/aggregate.js.map +1 -0
- package/dist/calibration/receipt-schema.d.ts +317 -0
- package/dist/calibration/receipt-schema.js +68 -0
- package/dist/calibration/receipt-schema.js.map +1 -0
- package/dist/calibration/receipt.d.ts +31 -0
- package/dist/calibration/receipt.js +151 -0
- package/dist/calibration/receipt.js.map +1 -0
- package/dist/cli.js +1957 -1253
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +140 -4
- package/dist/index.js +1499 -1168
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/calibration/receipt.ts"],"sourcesContent":["import type {\n Architecture,\n CalibrationReceipt,\n DecisionVocabBar,\n PassFail,\n PerCategoryRecall,\n Recall,\n StatusLabel,\n} from './receipt-schema.js';\n\n// Architecture-aware decision-vocab bar.\n// single-pass: narrow_critic pass is absent, so model uses full 6-decision\n// vocabulary. Bar: >= 4.\n// two-pass: narrow_critic collapses needs_human_review into harder decisions,\n// reducing diversity. Bar: >= 3 (F-49 resolution).\nexport function computeDecisionVocabBar(\n architecture: Architecture,\n decisionsProducedCount: number,\n): DecisionVocabBar {\n const required = architecture === 'two-pass' ? 3 : 4;\n return {\n architecture,\n required,\n produced: decisionsProducedCount,\n passed: decisionsProducedCount >= required,\n };\n}\n\n// Per-category any-flag floor: seeded categories with total >= 2 must have\n// ratio >= 0.50. Categories with fewer than 2 seeds are excluded (not enough\n// signal to enforce a floor — e.g. a 1-seed category with 0 misses is fine).\nfunction computePerCategoryFloor(perCategoryAnyFlag: PerCategoryRecall): 'PASS' | 'FAIL' {\n for (const [, recall] of Object.entries(perCategoryAnyFlag)) {\n if (recall.total >= 2 && recall.ratio < 0.5) return 'FAIL';\n }\n return 'PASS';\n}\n\nexport function computePassFail(input: {\n good_fp_count: number;\n any_flag_recall: Recall;\n per_category_any_flag: PerCategoryRecall;\n strict_recall: Recall;\n decision_vocab_bar: DecisionVocabBar;\n runtime_ms: number;\n empty_or_malformed_responses: number;\n}): PassFail {\n const fp_ceiling = input.good_fp_count <= 1 ? 'PASS' : 'FAIL';\n const any_flag_recall_floor = input.any_flag_recall.ratio >= 0.65 ? 'PASS' : 'FAIL';\n const per_category_any_flag_floor = computePerCategoryFloor(input.per_category_any_flag);\n const strict_recall_floor = input.strict_recall.ratio >= 0.2 ? 'PASS' : 'FAIL';\n const decision_vocab_completeness = input.decision_vocab_bar.passed ? 'PASS' : 'FAIL';\n // Latency soft: warn-only, never FAIL\n const latency_soft = input.runtime_ms <= 600_000 ? 'PASS' : 'WARN';\n const latency_hard = input.runtime_ms <= 1_200_000 ? 'PASS' : 'FAIL';\n const empty_or_malformed = input.empty_or_malformed_responses === 0 ? 'PASS' : 'FAIL';\n\n const hardBars: Array<'PASS' | 'FAIL'> = [\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_hard,\n empty_or_malformed,\n ];\n const overall = hardBars.every((v) => v === 'PASS') ? 'PASS' : 'FAIL';\n\n return {\n fp_ceiling,\n any_flag_recall_floor,\n per_category_any_flag_floor,\n strict_recall_floor,\n decision_vocab_completeness,\n latency_soft,\n latency_hard,\n empty_or_malformed,\n overall,\n };\n}\n\n// Status-label assignment (advisor-locked predicates).\n//\n// Priority order:\n// 1. comparison_only — explicit flag OR single-pass Hermes (architectural side-run)\n// 2. failed — any hard bar FAIL\n// 3. trusted_baseline — canonical Hermes two-pass with PASS + FP=0\n// 4. conditional_pass — everything else that passes bars\n//\n// trusted_baseline encodes the canonical Hermes two-pass admission: the profile\n// is a named Hermes model run in two-pass architecture, all bars pass, and FP=0.\n// Hermes family is detected by case-insensitive substring match on profile name.\n//\n// conditional_pass is the admission status for non-baseline profiles that pass\n// all hard bars but carry a caution (FP at ceiling, non-hermes model, etc.).\n// mistral-nemo:12b two-pass = conditional_pass (FP=1, passes recalibrated bars).\nexport function computeStatusLabel(input: {\n profileName: string;\n architecture: Architecture;\n passFail: PassFail;\n goodFpCount: number;\n modeOverride?: 'comparison_only';\n}): StatusLabel {\n // comparison_only: explicit operator flag\n if (input.modeOverride === 'comparison_only') return 'comparison_only';\n\n // comparison_only: single-pass Hermes is an architectural side-run by design\n // (the canonical profile is two-pass; single-pass exists only for comparison)\n if (input.architecture === 'single-pass' && /hermes/i.test(input.profileName)) {\n return 'comparison_only';\n }\n\n // failed: any hard bar fails (latency_soft is WARN-only, never blocks)\n if (input.passFail.overall === 'FAIL') return 'failed';\n\n // trusted_baseline: canonical Hermes two-pass profile with perfect FP\n // Predicate: profile name contains \"hermes\" (case-insensitive) AND\n // architecture is two-pass AND all bars pass AND FP = 0\n const isHermesTwoPass =\n /hermes/i.test(input.profileName) && input.architecture === 'two-pass';\n if (isHermesTwoPass && input.goodFpCount === 0) return 'trusted_baseline';\n\n // conditional_pass: passes all bars but carries caution\n // (FP at ceiling, non-baseline profile, or non-hermes model)\n return 'conditional_pass';\n}\n\n// Map a receipt to the PromotionCalibrationSummary string shape used by\n// review-active.json. Called by the review-promote CLI when auto-populating\n// calibration_summary from a persisted receipt.\nexport function receiptToCalibrationSummary(receipt: CalibrationReceipt): {\n fixture: string | null;\n good_false_positive_rate: string | null;\n bad_any_flag_recall: string | null;\n strict_category_recall: string | null;\n unsupported_claim_recall: string | null;\n notes: string | null;\n} {\n const fp = receipt.good_fp_count;\n const fpTotal = receipt.fixture_good_claims;\n const fpPct = fpTotal > 0 ? Math.round((fp / fpTotal) * 100) : 0;\n\n const af = receipt.any_flag_recall;\n const sr = receipt.strict_recall;\n const unsupported = receipt.per_category_any_flag['unsupported_claim'];\n\n return {\n fixture: receipt.fixture,\n good_false_positive_rate: `${fp}/${fpTotal} (${fpPct}%)`,\n bad_any_flag_recall: `${af.matched}/${af.total} (${Math.round(af.ratio * 100)}%)`,\n strict_category_recall: `${sr.matched}/${sr.total} (${Math.round(sr.ratio * 100)}%)`,\n unsupported_claim_recall: unsupported\n ? `${unsupported.matched}/${unsupported.total} (${Math.round(unsupported.ratio * 100)}%)`\n : null,\n notes: `status=${receipt.status} model=${receipt.model} arch=${receipt.architecture} overall=${receipt.pass_fail.overall} decisions=${receipt.decisions_produced_count}/6`,\n };\n}\n\n// Render a compact Markdown receipt. Operator proof artifact — no prose.\nexport function buildReceiptMarkdown(r: CalibrationReceipt): string {\n const pct = (ratio: number) => `${Math.round(ratio * 100)}%`;\n const runtimeSec = (r.runtime_ms / 1000).toFixed(1);\n\n const perCatRows = Object.entries(r.per_category_any_flag)\n .map(([cat, af]) => {\n const st = r.per_category_strict[cat] ?? { matched: 0, total: af.total, ratio: 0 };\n return `| ${cat} | ${af.matched}/${af.total} (${pct(af.ratio)}) | ${st.matched}/${st.total} (${pct(st.ratio)}) |`;\n })\n .join('\\n');\n\n const dvRows = [\n 'accepted_for_synthesis',\n 'rejected',\n 'needs_scope_repair',\n 'needs_source_repair',\n 'needs_contradiction_mapping',\n 'needs_human_review',\n ]\n .map((d) => {\n const count = r.decision_vocabulary[d] ?? 0;\n const unreachable = r.unreachable_decisions.includes(d) ? ` (unreachable from ${r.fixture})` : '';\n return `| ${d} | ${count}${unreachable} |`;\n })\n .join('\\n');\n\n const pf = r.pass_fail;\n const bar = r.decision_vocab_bar;\n\n const notesSection =\n r.notes.length > 0 ? `\\n## Notes\\n\\n${r.notes.map((n) => `- ${n}`).join('\\n')}\\n` : '';\n\n return `# Calibration Receipt — ${r.profile_name}\n\n- **Model:** ${r.model}\n- **Architecture:** ${r.architecture}\n- **Status:** ${r.status}\n- **Fixture:** ${r.fixture} (${r.fixture_total_claims} claims = ${r.fixture_good_claims} good + ${r.fixture_bad_claims} bad)\n- **Calibrated at:** ${r.calibrated_at}\n- **Research-OS version:** ${r.research_os_version}\n- **Runtime:** ${runtimeSec} seconds\n\n## Headline metrics\n\n- FP: ${r.good_fp_count} / ${r.fixture_good_claims}\n- Any-flag recall: ${r.any_flag_recall.matched} / ${r.any_flag_recall.total} (${pct(r.any_flag_recall.ratio)})\n- Strict recall: ${r.strict_recall.matched} / ${r.strict_recall.total} (${pct(r.strict_recall.ratio)})\n- Decisions produced: ${r.decisions_produced_count} / 6\n\n## PASS / FAIL\n\n| Bar | Result |\n|---|---|\n| FP ceiling (≤1) | ${pf.fp_ceiling} |\n| Any-flag recall (≥65%) | ${pf.any_flag_recall_floor} |\n| Per-category any-flag (≥50%) | ${pf.per_category_any_flag_floor} |\n| Strict recall (≥20%) | ${pf.strict_recall_floor} |\n| Decision vocab (${bar.architecture} ≥ ${bar.required}) | ${pf.decision_vocab_completeness} |\n| Latency soft (≤10 min) | ${pf.latency_soft} |\n| Latency hard (≤20 min) | ${pf.latency_hard} |\n| Empty/malformed (=0) | ${pf.empty_or_malformed} |\n| **OVERALL** | **${pf.overall}** |\n\n## Per-category recall\n\n| Category | Any-flag | Strict |\n|---|---|---|\n${perCatRows}\n\n## Decision vocabulary\n\n| Decision | Count |\n|---|---:|\n${dvRows}\n${notesSection}`;\n}\n"],"mappings":";AAeO,SAAS,wBACd,cACA,wBACkB;AAClB,QAAM,WAAW,iBAAiB,aAAa,IAAI;AACnD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,UAAU;AAAA,IACV,QAAQ,0BAA0B;AAAA,EACpC;AACF;AAKA,SAAS,wBAAwB,oBAAwD;AACvF,aAAW,CAAC,EAAE,MAAM,KAAK,OAAO,QAAQ,kBAAkB,GAAG;AAC3D,QAAI,OAAO,SAAS,KAAK,OAAO,QAAQ,IAAK,QAAO;AAAA,EACtD;AACA,SAAO;AACT;AAEO,SAAS,gBAAgB,OAQnB;AACX,QAAM,aAAa,MAAM,iBAAiB,IAAI,SAAS;AACvD,QAAM,wBAAwB,MAAM,gBAAgB,SAAS,OAAO,SAAS;AAC7E,QAAM,8BAA8B,wBAAwB,MAAM,qBAAqB;AACvF,QAAM,sBAAsB,MAAM,cAAc,SAAS,MAAM,SAAS;AACxE,QAAM,8BAA8B,MAAM,mBAAmB,SAAS,SAAS;AAE/E,QAAM,eAAe,MAAM,cAAc,MAAU,SAAS;AAC5D,QAAM,eAAe,MAAM,cAAc,OAAY,SAAS;AAC9D,QAAM,qBAAqB,MAAM,iCAAiC,IAAI,SAAS;AAE/E,QAAM,WAAmC;AAAA,IACvC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,QAAM,UAAU,SAAS,MAAM,CAAC,MAAM,MAAM,MAAM,IAAI,SAAS;AAE/D,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAiBO,SAAS,mBAAmB,OAMnB;AAEd,MAAI,MAAM,iBAAiB,kBAAmB,QAAO;AAIrD,MAAI,MAAM,iBAAiB,iBAAiB,UAAU,KAAK,MAAM,WAAW,GAAG;AAC7E,WAAO;AAAA,EACT;AAGA,MAAI,MAAM,SAAS,YAAY,OAAQ,QAAO;AAK9C,QAAM,kBACJ,UAAU,KAAK,MAAM,WAAW,KAAK,MAAM,iBAAiB;AAC9D,MAAI,mBAAmB,MAAM,gBAAgB,EAAG,QAAO;AAIvD,SAAO;AACT;AAKO,SAAS,4BAA4B,SAO1C;AACA,QAAM,KAAK,QAAQ;AACnB,QAAM,UAAU,QAAQ;AACxB,QAAM,QAAQ,UAAU,IAAI,KAAK,MAAO,KAAK,UAAW,GAAG,IAAI;AAE/D,QAAM,KAAK,QAAQ;AACnB,QAAM,KAAK,QAAQ;AACnB,QAAM,cAAc,QAAQ,sBAAsB,mBAAmB;AAErE,SAAO;AAAA,IACL,SAAS,QAAQ;AAAA,IACjB,0BAA0B,GAAG,EAAE,IAAI,OAAO,KAAK,KAAK;AAAA,IACpD,qBAAqB,GAAG,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,KAAK,MAAM,GAAG,QAAQ,GAAG,CAAC;AAAA,IAC7E,wBAAwB,GAAG,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,KAAK,MAAM,GAAG,QAAQ,GAAG,CAAC;AAAA,IAChF,0BAA0B,cACtB,GAAG,YAAY,OAAO,IAAI,YAAY,KAAK,KAAK,KAAK,MAAM,YAAY,QAAQ,GAAG,CAAC,OACnF;AAAA,IACJ,OAAO,UAAU,QAAQ,MAAM,UAAU,QAAQ,KAAK,SAAS,QAAQ,YAAY,YAAY,QAAQ,UAAU,OAAO,cAAc,QAAQ,wBAAwB;AAAA,EACxK;AACF;AAGO,SAAS,qBAAqB,GAA+B;AAClE,QAAM,MAAM,CAAC,UAAkB,GAAG,KAAK,MAAM,QAAQ,GAAG,CAAC;AACzD,QAAM,cAAc,EAAE,aAAa,KAAM,QAAQ,CAAC;AAElD,QAAM,aAAa,OAAO,QAAQ,EAAE,qBAAqB,EACtD,IAAI,CAAC,CAAC,KAAK,EAAE,MAAM;AAClB,UAAM,KAAK,EAAE,oBAAoB,GAAG,KAAK,EAAE,SAAS,GAAG,OAAO,GAAG,OAAO,OAAO,EAAE;AACjF,WAAO,KAAK,GAAG,MAAM,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,CAAC;AAAA,EAC9G,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EACG,IAAI,CAAC,MAAM;AACV,UAAM,QAAQ,EAAE,oBAAoB,CAAC,KAAK;AAC1C,UAAM,cAAc,EAAE,sBAAsB,SAAS,CAAC,IAAI,sBAAsB,EAAE,OAAO,MAAM;AAC/F,WAAO,KAAK,CAAC,MAAM,KAAK,GAAG,WAAW;AAAA,EACxC,CAAC,EACA,KAAK,IAAI;AAEZ,QAAM,KAAK,EAAE;AACb,QAAM,MAAM,EAAE;AAEd,QAAM,eACJ,EAAE,MAAM,SAAS,IAAI;AAAA;AAAA;AAAA,EAAiB,EAAE,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC;AAAA,IAAO;AAEtF,SAAO,gCAA2B,EAAE,YAAY;AAAA;AAAA,eAEnC,EAAE,KAAK;AAAA,sBACA,EAAE,YAAY;AAAA,gBACpB,EAAE,MAAM;AAAA,iBACP,EAAE,OAAO,KAAK,EAAE,oBAAoB,aAAa,EAAE,mBAAmB,WAAW,EAAE,kBAAkB;AAAA,uBAC/F,EAAE,aAAa;AAAA,6BACT,EAAE,mBAAmB;AAAA,iBACjC,UAAU;AAAA;AAAA;AAAA;AAAA,QAInB,EAAE,aAAa,MAAM,EAAE,mBAAmB;AAAA,qBAC7B,EAAE,gBAAgB,OAAO,MAAM,EAAE,gBAAgB,KAAK,KAAK,IAAI,EAAE,gBAAgB,KAAK,CAAC;AAAA,mBACzF,EAAE,cAAc,OAAO,MAAM,EAAE,cAAc,KAAK,KAAK,IAAI,EAAE,cAAc,KAAK,CAAC;AAAA,wBAC5E,EAAE,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,2BAM5B,GAAG,UAAU;AAAA,kCACN,GAAG,qBAAqB;AAAA,wCAClB,GAAG,2BAA2B;AAAA,gCACtC,GAAG,mBAAmB;AAAA,oBAC7B,IAAI,YAAY,WAAM,IAAI,QAAQ,OAAO,GAAG,2BAA2B;AAAA,kCAC9D,GAAG,YAAY;AAAA,kCACf,GAAG,YAAY;AAAA,2BACjB,GAAG,kBAAkB;AAAA,oBAC5B,GAAG,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAM5B,UAAU;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAMV,MAAM;AAAA,EACN,YAAY;AACd;","names":[]}
|