npm - onto-mcp - Versions diffs - 0.4.11 → 0.4.12 - Mend

onto-mcp 0.4.11 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/.onto/authority/supported-models.yaml CHANGED Viewed

@@ -25,3 +25,19 @@ supported_models:
       Completed a full reconstruct pipeline run in the live medium baseline
       (1 of 6 attempts completed end to end); support verified. Performance
       evidence is PRELIMINARY in that record — support, not a performance claim.
+  - provider: anthropic
+    model: claude-opus-4-8
+    verified_at: "2026-06-15"
+    benchmark_evidence_refs:
+      - development-records/benchmark/reconstruct-pipeline-live-claude-20260615.json
+    notes: >-
+      Completed a full reconstruct pipeline run end to end via the Anthropic
+      OAuth Claude Code CLI worker (execution_adapter=claude_code):
+      record_stage=completed with final_output present and provenance valid; the
+      maturation track ran to a `blocked` continuation decision (no actionable
+      ontology emitted). The CLI worker path (codex_cli/claude_code) uses the
+      600000ms DEFAULT_WORKER_TIMEOUT_MS — the slow opus seed-authoring call
+      exceeds the 120000ms SDK default — so support holds under default settings
+      (no unencoded env override required). Single-run completion proof
+      (PRELIMINARY for any performance claim, INV-BENCH-1 needs >=3 reps /
+      >=2 fixtures); support — model completes the pipeline — verified.

package/.onto/processes/reconstruct/reconstruct-contract-registry.yaml CHANGED Viewed

@@ -1124,6 +1124,12 @@ artifact_authorities:
   answer_support_ledger_validation:
     authority_ref: answer-support-ledger-validation.yaml
     validation_ref: null
+  answer_support_judgment:
+    authority_ref: answer-support-judgment.yaml
+    validation_ref: answer-support-judgment-validation.yaml
+  answer_support_judgment_validation:
+    authority_ref: answer-support-judgment-validation.yaml
+    validation_ref: null
   maturation_authority_response:
     authority_ref: maturation-authority-response.yaml
     validation_ref: maturation-authority-response-validation.yaml
@@ -1229,14 +1235,6 @@ planned_artifact_authorities:
     authority_ref: required-when-evaluation-validation.yaml
     validation_ref: null
     activation_condition: registry_predicate_evaluator_runtime_is_implemented
-  answer_support_judgment:
-    authority_ref: answer-support-judgment.yaml
-    validation_ref: answer-support-judgment-validation.yaml
-    activation_condition: answer_support_judge_runtime_is_implemented
-  answer_support_judgment_validation:
-    authority_ref: answer-support-judgment-validation.yaml
-    validation_ref: null
-    activation_condition: answer_support_judge_runtime_is_implemented
 validation_gate_catalog:
   - gate_id: reconstruct_run_control_gate
@@ -1339,6 +1337,9 @@ validation_gate_catalog:
   - gate_id: answer_support_gate
     validation_artifact_ref: answer-support-ledger-validation.yaml
     required_when: answer_support_ledger_exists
+  - gate_id: answer_support_judgment_gate
+    validation_artifact_ref: answer-support-judgment-validation.yaml
+    required_when: answer_support_judgment_required_minimal
   - gate_id: maturation_answer_claim_gate
     validation_artifact_ref: maturation-answer-claims-validation.yaml
     required_when: maturation_answer_claims_exist
@@ -1396,12 +1397,6 @@ planned_validation_gate_catalog:
     validation_artifact_ref: required-when-evaluation-validation.yaml
     required_when: always
     activation_condition: registry_predicate_evaluator_runtime_is_implemented
-  - gate_id: answer_support_judgment_gate
-    validation_artifact_ref: answer-support-judgment-validation.yaml
-    required_when: answer_support_judgment_required
-    activation_condition: answer_support_judge_runtime_is_implemented
-    activation_prerequisites:
-      - answer_support_ledger_validation_is_valid
 required_when_predicate_family_catalog:
   - predicate_family_id: frontier_observation_use_by_downstream_artifact
@@ -1529,20 +1524,6 @@ required_when_predicate_catalog:
     truth_expression: "source_observation_delta_validation.validation_status == valid and answer_support_ledger_refs_delta_observation_ids"
     unknown_projection: not_applicable
     explanation_template: "Answer support ledger cites observation ids from a frontier-triggered observation delta."
-  - predicate_id: answer_support_judgment_uses_frontier_observation
-    predicate_family_id: frontier_observation_use_by_downstream_artifact
-    gate_instance_scope: per_round
-    downstream_artifact_ref: answer-support-judgment.yaml
-    downstream_validation_ref: answer-support-judgment-validation.yaml
-    input_authority_refs: [rounds/<round-id>/source-observation-delta.yaml, rounds/<round-id>/source-observation-delta-validation.yaml, answer-support-judgment.yaml]
-    truth_expression: "source_observation_delta_validation.validation_status == valid and answer_support_judgment_refs_delta_observation_ids"
-    unknown_projection: not_applicable
-    explanation_template: "Answer support judgment cites observation ids from a frontier-triggered observation delta."
-  - predicate_id: answer_support_judgment_required
-    input_authority_refs: [answer-support-ledger.yaml, answer-support-ledger-validation.yaml]
-    truth_expression: "artifact_exists(answer-support-ledger.yaml) and answer_support_ledger_has_convergent_source_evidence_cluster"
-    unknown_projection: not_applicable
-    explanation_template: "A judge confirmation is required when answer support uses convergent source evidence."
   - predicate_id: maturation_answer_claims_use_frontier_observation
     predicate_family_id: frontier_observation_use_by_downstream_artifact
     gate_instance_scope: per_round
@@ -1738,6 +1719,11 @@ required_when_predicate_catalog:
     truth_expression: "artifact_exists(answer-support-ledger.yaml)"
     unknown_projection: not_applicable
     explanation_template: "An answer support ledger exists and requires support-ledger validation."
+  - predicate_id: answer_support_judgment_required_minimal
+    input_authority_refs: [answer-support-judgment.yaml]
+    truth_expression: "artifact_exists(answer-support-judgment.yaml)"
+    unknown_projection: not_applicable
+    explanation_template: "An answer support judgment exists and requires judgment validation. Convergent-source-evidence necessity and sufficiency are enforced by the maturation-answer-claims validator (B-6)."
   - predicate_id: maturation_authority_response_exists
     input_authority_refs: [maturation-authority-response.yaml]
     truth_expression: "artifact_exists(maturation-authority-response.yaml)"
@@ -2692,6 +2678,10 @@ validator_records:
       - ontology-seed.yaml
       - reconstruct-contract-registry.yaml
     conditional_input_authority_refs:
+      - artifact_ref: answer-support-judgment.yaml
+        activation_condition: answer_support_judge_runtime_is_implemented
+        consumed_for:
+          - require_convergent_source_evidence_claims_to_have_two_independent_judge_confirmed_supports
       - artifact_ref: answer-support-judgment-validation.yaml
         activation_condition: answer_support_judge_runtime_is_implemented
         consumed_for:
@@ -2704,8 +2694,22 @@ validator_records:
     conditional_validation_obligations:
       - obligation_id: require_convergent_source_evidence_claims_to_have_two_independent_judge_confirmed_supports
         activation_condition: answer_support_judge_runtime_is_implemented
-        input_authority_refs: [answer-support-judgment-validation.yaml]
+        input_authority_refs: [answer-support-judgment.yaml, answer-support-judgment-validation.yaml]
     output_ref: maturation-answer-claims-validation.yaml
+  - validator_id: answer-support-judgment-validator
+    gate_ids: [answer_support_judgment_gate]
+    validator_version: 1
+    input_authority_refs:
+      - answer-support-judgment.yaml
+      - answer-support-ledger.yaml
+      - answer-support-ledger-validation.yaml
+      - reconstruct-contract-registry.yaml
+    validation_obligations:
+      - validate_judgment_refs_resolve_to_answer_support_ledger_clusters_and_evidence
+      - require_supports_enum_for_each_judgment
+      - require_rationale_ref_for_each_judgment
+      - require_convergent_clusters_to_judge_every_cited_evidence_ref
+    output_ref: answer-support-judgment-validation.yaml
   - validator_id: ontology-expansion-validator
     gate_ids: [ontology_expansion_gate]
     validator_version: 1
@@ -2943,18 +2947,6 @@ validator_records:
     output_ref: handoff-decision-validation.yaml
 planned_validator_records:
-  - validator_id: answer-support-judgment-validator
-    gate_ids: [answer_support_judgment_gate]
-    validator_version: 1
-    input_authority_refs:
-      - answer-support-judgment.yaml
-      - answer-support-ledger-validation.yaml
-      - reconstruct-contract-registry.yaml
-    validation_obligations:
-      - validate_judgment_refs_resolve_to_answer_support_ledger_clusters_and_evidence
-      - require_supports_enum_for_each_judgment
-      - require_rationale_ref_for_each_judgment
-    output_ref: answer-support-judgment-validation.yaml
   - validator_id: maturation-promotion-request-validator
     gate_ids: [maturation_promotion_request_gate]
     validator_version: 1

package/.onto/processes/review/nesting-batch-worker-contract.md CHANGED Viewed

@@ -36,7 +36,7 @@ outer의 유일한 역할: **script를 `bash -s`로 실행하고 stdout을 verba
 | brand | spawn | 비고 |
 |---|---|---|
 | codex (`codex-nesting-batch-worker.ts`) | `codex exec --sandbox danger-full-access --ephemeral`, prompt는 stdin | outer가 subprocess를 spawn해야 하므로 non-seatbelt; inner unit executor는 자체 read-only sandbox 유지 |
-| claude (`claude-nesting-batch-worker.ts`) | `claude -p <prompt positional> --allowedTools Bash --permission-mode bypassPermissions --strict-mcp-config`(빈 MCP) | prompt는 **positional**(stdin 무시됨). `--effort` 지원, service_tier 표면 없음(API 전용). `ONTO_CLAUDE_BIN` 오버라이드 |
+| claude (`claude-nesting-batch-worker.ts`) | `claude -p <prompt positional> --allowedTools Bash --permission-mode bypassPermissions --strict-mcp-config`(빈 MCP) | prompt는 **positional**(stdin 무시됨). `--effort` 지원, service_tier 표면 없음(API 전용). 바이너리는 `resolveClaudeBin()`로 해석(`ONTO_CLAUDE_BIN` 오버라이드 → PATH → 일반 설치 위치) |
 outer(teamlead seat) model/effort는 settings `review.execution.teamlead.llm`에서 brand adapter(codex_cli/claude_code) 일치 시에만 해석된다. **inner unit의 LLM 설정은 outer 설정이 아니라 호출자가 구성한 inner argv에 실린다**(flat 동등).

package/dist/core-api/reconstruct-api.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { createDirectCallReconstructConfirmationProvider, createDirectCallRecons
 import { RECONSTRUCT_MOCK_AUTHOR_ID, RECONSTRUCT_MOCK_CONFIRMATION_PROVIDER_ID, callReconstructMockLlm, isReconstructMockLlmRealizationEnabled, } from "../core-runtime/reconstruct/mock-llm-realization.js";
 import { assertSettingsModelsSupported, resolveSettingsChain, resolveReconstructActorLlmSettings, } from "../core-runtime/discovery/settings-chain.js";
 import { resolveOntoHome, } from "../core-runtime/discovery/onto-home.js";
+import { isSupportedModelRoute, loadSupportedModelRegistry, } from "../core-runtime/discovery/supported-models.js";
 import { resolveLlmProviderConfig, } from "../core-runtime/llm/llm-caller.js";
 import { writeOntologySeedValidationArtifact, writeCandidateDispositionValidationArtifact, } from "../core-runtime/reconstruct/ontology-seed-validation.js";
 import { writeSourceObservationDirectiveValidationArtifact, } from "../core-runtime/reconstruct/directive-validation.js";
@@ -35,6 +36,57 @@ function resolveFromBase(basePath, maybeRelativePath) {
         ? path.resolve(maybeRelativePath)
         : path.resolve(basePath, maybeRelativePath);
 }
+/**
+ * Pure adopt-vs-degrade decision for the opt-in answer-support judge config.
+ * The judge keeps the semantic author's config except for the requested
+ * overrides. A judgeModelCandidate (already resolved on the author's provider,
+ * so its credentials/adapter match the author) is adopted only when it is a
+ * benchmark-verified route (INV-MODEL-1) AND keeps the author's provider; any
+ * other case degrades to the author model with a recorded note. Effort always
+ * INHERITS the author's effective effort (e.g. a pinned `--effort`) unless
+ * judgeLlmEffort explicitly overrides it — never the model candidate's raw
+ * settings effort, which could otherwise silently run the judge weaker than the
+ * author. Returns `undefined` config when nothing was requested (caller inherits
+ * the author config — zero change).
+ *
+ * @param {object} args
+ * @param {object} args.authorLlmConfig - The semantic author's LLM config. It
+ *   is shallow-copied (never mutated) as the judge's starting config; its
+ *   `provider` and `reasoning_effort` fields are read here.
+ * @param {string} [args.judgeLlmEffort] - Explicit judge effort. When truthy it
+ *   becomes the judge's `reasoning_effort`.
+ * @param {object|null} [args.judgeModelCandidate] - Candidate config for the
+ *   judge model; its `model_id` and `provider` fields are read. When adopted,
+ *   all of its own enumerable fields are copied over the author copy.
+ * @param {string} [args.judgeModelProvider] - Provider key used for the
+ *   supported-route lookup and in the degrade note; rendered as
+ *   "(unresolved provider)" in the note when nullish.
+ * @param {object} args.registry - Supported-model registry handed to
+ *   `isSupportedModelRoute`. Only consulted when a candidate is present.
+ * @returns {{ judgeLlmConfig: (object|undefined), note: (string|null) }}
+ *   `judgeLlmConfig` is `undefined` only when neither an effort nor a candidate
+ *   was supplied. Otherwise it is a new object: the author copy, overlaid with
+ *   the candidate when adopted, with `reasoning_effort` set to the explicit
+ *   judge effort, else the author's effort, else removed. On a degrade the
+ *   config is still returned (author model) and `note` carries the reason;
+ *   `note` is `null` when nothing was degraded.
+ */
+export function resolveJudgeLlmConfig(args) {
+    if (!args.judgeLlmEffort && !args.judgeModelCandidate) {
+        return { judgeLlmConfig: undefined, note: null };
+    }
+    const judge = { ...args.authorLlmConfig };
+    const authorEffort = args.authorLlmConfig.reasoning_effort;
+    let note = null;
+    const candidate = args.judgeModelCandidate;
+    if (candidate) {
+        // INV-MODEL-1 is keyed by MODEL provider (e.g. openai/gpt-5.5), not the
+        // runtime adapter provider (OpenAI OAuth normalizes to codex). Check the
+        // model provider so a supported judge model is not spuriously degraded.
+        const supported = isSupportedModelRoute(args.judgeModelProvider, candidate.model_id, args.registry);
+        // Credential safety: the candidate resolves on the author's provider, so its
+        // runtime provider must match the author's (guarantees api_key_env/adapter
+        // never cross providers). Uses the runtime provider, not the model provider.
+        const sameProvider = candidate.provider === args.authorLlmConfig.provider;
+        if (supported && sameProvider) {
+            Object.assign(judge, candidate);
+        }
+        else {
+            note = `answer-support judge model override (${args.judgeModelProvider ?? "(unresolved provider)"}/${candidate.model_id ?? "(unresolved model)"}) ${supported
+                ? "requires a different provider than the semantic author"
+                : "is not a benchmark-verified route"}; degraded to the semantic-author model`;
+        }
+    }
+    // Effort = explicit judge override, else the author's effective effort. This
+    // wins over any reasoning_effort Object.assign copied from the model candidate
+    // (the candidate is resolved without the author's effort pin, so its raw
+    // settings effort can diverge from the author's pinned effort).
+    if (args.judgeLlmEffort)
+        judge.reasoning_effort = args.judgeLlmEffort;
+    else if (authorEffort !== undefined)
+        judge.reasoning_effort = authorEffort;
+    else
+        delete judge.reasoning_effort;
+    return { judgeLlmConfig: judge, note };
+}
 function dateStamp() {
     return new Date().toISOString().slice(0, 10).replace(/-/g, "");
 }
@@ -277,8 +329,51 @@ export function createOntoReconstructCoreApi(options = {}) {
                     },
                     ...(llmEffortOverride ? { cliOverrides: llmEffortOverride } : {}),
                 });
+            // Opt-in per-stage JUDGE config (semantic-independence lever). Default =
+            // inherit the semantic-author config (judgeLlmConfig undefined → no change,
+            // zero regression). A judgeModel override resolves ON THE AUTHOR'S PROVIDER
+            // (same credentials/route), so it is adopted only when the resulting
+            // (author provider, judgeModel) pair is benchmark-verified, otherwise it
+            // degrades. resolveJudgeLlmConfig owns the adopt-vs-degrade decision.
+            const judgeOverrideRequested = Boolean(request.judgeLlmEffort || request.judgeModel);
+            let judgeConfigNote = null;
+            if (judgeOverrideRequested && mockRealizationEnabled) {
+                judgeConfigNote =
+                    "answer-support judge override ignored under mock realization (no real provider calls)";
+            }
+            // A judgeModel candidate is resolved on the SAME actor settings as the
+            // author (no provider override), so api_key_env / execution_adapter /
+            // base_url stay the author provider's — consistent, never cross-provider.
+            const judgeAuthorActorLlm = !mockRealizationEnabled && request.judgeModel
+                ? resolveReconstructActorLlmSettings(settings, "semantic_author")
+                : null;
+            const judgeModelCandidate = judgeAuthorActorLlm
+                ? resolveLlmProviderConfig({
+                    config: { llm: judgeAuthorActorLlm },
+                    cliOverrides: { model: request.judgeModel },
+                })
+                : null;
+            const judgeResolution = mockRealizationEnabled
+                ? { judgeLlmConfig: undefined, note: judgeConfigNote }
+                : resolveJudgeLlmConfig({
+                    authorLlmConfig: semanticAuthorLlmConfig,
+                    ...(request.judgeLlmEffort
+                        ? { judgeLlmEffort: request.judgeLlmEffort }
+                        : {}),
+                    judgeModelCandidate,
+                    // Registry key is the MODEL provider (e.g. openai), not the runtime
+                    // adapter (openai OAuth → codex). The judge uses the author's provider.
+                    ...(judgeAuthorActorLlm?.provider
+                        ? { judgeModelProvider: judgeAuthorActorLlm.provider }
+                        : {}),
+                    registry: loadSupportedModelRegistry(),
+                });
+            const judgeLlmConfig = judgeResolution.judgeLlmConfig;
+            if (!mockRealizationEnabled)
+                judgeConfigNote = judgeResolution.note;
             const directiveAuthor = createDirectCallReconstructDirectiveAuthor({
                 llmConfig: semanticAuthorLlmConfig,
+                ...(judgeLlmConfig ? { judgeLlmConfig } : {}),
                 ...(mockRealizationEnabled
                     ? {
                         llmCall: callReconstructMockLlm,
@@ -304,6 +399,21 @@ export function createOntoReconstructCoreApi(options = {}) {
                     : "reconstruct session starting",
                 stageId: "start",
             });
+            if (judgeConfigNote) {
+                // Honest accounting: the operator opted into a judge override that was
+                // not used (unsupported model degraded to the author model, or ignored
+                // under mock), so the rubber-stamp mitigation did NOT take effect. The
+                // judge's actual model/effort is independently recorded in the judge
+                // step execution telemetry. Emitted BEFORE the run so it survives a
+                // run failure — the degrade decision is independent of the run outcome.
+                appendRuntimeStatusEventSync({
+                    pipeline: "reconstruct",
+                    sessionRoot,
+                    sourceLabel: "onto_reconstruct",
+                    message: judgeConfigNote,
+                    stageId: "answer_support_judgment",
+                });
+            }
             const watcherResult = spawnRuntimeWatcherPane(projectRoot, sessionRoot, ontoHome);
             appendRuntimeStatusEventSync({
                 pipeline: "reconstruct",

package/dist/core-runtime/artifact-io.js ADDED Viewed

@@ -0,0 +1,59 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { stringify as stringifyYaml } from "yaml";
+/**
+ * Atomic artifact writes shared across the review and reconstruct runtimes.
+ *
+ * Pipeline artifacts (canonical `source-observations.yaml`, validation
+ * artifacts, the review record) are rewritten every round and trusted on read.
+ * A plain `mkdir` + `writeFile` is not atomic: a crash or full disk mid-write
+ * leaves a truncated-but-parseable file that the YAML parser silently accepts
+ * (a half-written `validation_status: valid` reads as a clean pass; an empty
+ * file reads as `null`). Writing to a same-directory temp file and renaming it
+ * into place makes the final path flip atomically — readers see either the
+ * prior complete file or the new complete file, never a torn one.
+ */
+// Process-monotonic counter guarantees temp-path uniqueness even when the same
+// target path is rewritten concurrently within one process.
+let tempWriteCounter = 0;
+/**
+ * Write `contents` to `filePath` atomically: create parent dirs, write to a
+ * unique same-directory temp file, then rename it into place. On any failure
+ * the temp file is removed and no partial file is left at `filePath`.
+ *
+ * The temp file is named `<filePath>.<pid>.<counter>.tmp`, so it always sits
+ * next to the target. The counter is incremented and read synchronously (no
+ * `await` in between), so each call in this process gets its own temp path.
+ *
+ * @param {string} filePath - Final destination path; missing parent
+ *   directories are created recursively.
+ * @param {string} contents - Data to write; written with "utf8" encoding.
+ * @returns {Promise<void>} Resolves after the rename has completed.
+ * @throws Rethrows the error from the temp-file write or the rename after
+ *   attempting to remove the temp file. An error from the initial `mkdir`
+ *   propagates directly (it happens before the try block).
+ *   NOTE(review): the cleanup `fs.rm` is awaited inside the catch block before
+ *   the rethrow, so if the cleanup itself rejects, that rejection propagates
+ *   instead of the original write/rename error — confirm this is acceptable.
+ */
+export async function atomicWriteFile(filePath, contents) {
+    await fs.mkdir(path.dirname(filePath), { recursive: true });
+    tempWriteCounter += 1;
+    const tempPath = `${filePath}.${process.pid}.${tempWriteCounter}.tmp`;
+    try {
+        await fs.writeFile(tempPath, contents, "utf8");
+        // Same-filesystem rename is atomic; the target flips in one step.
+        await fs.rename(tempPath, filePath);
+    }
+    catch (error) {
+        await fs.rm(tempPath, { force: true });
+        throw error;
+    }
+}
+/**
+ * Serialize `value` to YAML and write it atomically. Byte-for-byte identical
+ * output to a direct `stringifyYaml(value)` write — only the write mechanism
+ * changes.
+ *
+ * @param {string} filePath - Destination path, forwarded unchanged to
+ *   `atomicWriteFile`.
+ * @param {*} value - Value to serialize with the `yaml` package's `stringify`
+ *   using its default options (no options are passed here).
+ * @returns {Promise<void>} Resolves once `atomicWriteFile` has completed;
+ *   rejects with whatever serialization or the atomic write throws.
+ */
+export async function atomicWriteYamlDocument(filePath, value) {
+    await atomicWriteFile(filePath, stringifyYaml(value));
+}
+/**
+ * Fail-closed shape guard for trusted artifact reads. The pipeline reads its
+ * own artifacts and trusts them on read; a malformed artifact (e.g. a required
+ * array field that is missing, null, or a scalar — from a torn write or
+ * out-of-band tampering) would otherwise crash deep inside a validator with an
+ * uncontextualized `TypeError: ... is not iterable`. This throws an integrity
+ * error that names the artifact and field instead, so the run halts with an
+ * actionable message rather than continuing on misread data.
+ *
+ * @param {*} value - The field value to check; passes only when
+ *   `Array.isArray(value)` is true (an empty array passes).
+ * @param {string} artifactLabel - Artifact name interpolated into the error
+ *   message.
+ * @param {string} fieldName - Field name interpolated (single-quoted) into the
+ *   error message.
+ * @returns {void} Returns nothing when the value is an array.
+ * @throws {Error} When `value` is not an array. The message ends with the
+ *   observed kind: the literal "null" for `null`, otherwise `typeof value`
+ *   (so a missing field reports "undefined" and a mapping reports "object").
+ */
+export function assertArrayField(value, artifactLabel, fieldName) {
+    if (!Array.isArray(value)) {
+        throw new Error(`artifact integrity: ${artifactLabel} field '${fieldName}' must be an array, got ${value === null ? "null" : typeof value}`);
+    }
+}

package/dist/core-runtime/cli/claude-code-review-unit-executor.js CHANGED Viewed

@@ -7,6 +7,7 @@ import { parseArgs } from "node:util";
 import { pathToFileURL } from "node:url";
 import { appendRuntimeStreamChunkSync, appendRuntimeStreamEventSync, } from "../observability/runtime-stream-observation.js";
 import { semanticQualityEvidenceForArtifactGeneration } from "../review/artifact-generation-realization.js";
+import { resolveClaudeBin } from "../llm/claude-bin.js";
 import { buildBoundedPrompt, buildWorkerSubmitSchema, coerceStructuredPayload, parseOutputFormat, requireString, writeLensSidecarArtifactFromPayload, writeRuntimeSubmitArtifactFromPayload, } from "./worker-structured-output.js";
 import { SALVAGE_INCOMPLETE_SENTINEL, buildDeltaRowsSalvagePrompt, buildTranscriptionSalvagePrompt, classifySalvageMode, mergeMissingStanceRows, salvageInputPathFor, } from "./submit-salvage.js";
 /**
@@ -27,7 +28,7 @@ import { SALVAGE_INCOMPLETE_SENTINEL, buildDeltaRowsSalvagePrompt, buildTranscri
  * path, never by the worker.
  */
 const CLAUDE_READONLY_ALLOWED_TOOLS = ["Read", "Grep", "Glob"];
-const CLAUDE_BIN = process.env.ONTO_CLAUDE_BIN?.trim() || "claude";
+const CLAUDE_BIN = resolveClaudeBin();
 /**
  * Embed the submit-tool JSON Schema into the bounded prompt. Claude Code's
  * `--json-schema` flag silently rejects the runtime's complex submit schemas,

package/dist/core-runtime/cli/claude-nesting-batch-worker.js CHANGED Viewed

@@ -22,7 +22,8 @@
  *     servers load in the bounded outer.
  *   - effort maps to `--effort`; `service_tier` is API-only and NOT
  *     supported on `claude -p`/OAuth — deliberately absent here.
- *   - `ONTO_CLAUDE_BIN` overrides the binary (matches the unit executor).
+ *   - The binary is resolved via `resolveClaudeBin()` (ONTO_CLAUDE_BIN override
+ *     → PATH → common install locations); matches the unit executor.
  *
  * # How it relates
  *
@@ -33,7 +34,8 @@
 import { spawn } from "node:child_process";
 import fs from "node:fs";
 import { buildNestingBatchWorkerPrompt, parseNestingBatchSummary, reconcileNestingBatchOutcomes, } from "../review/nesting-batch.js";
-const CLAUDE_BIN = process.env.ONTO_CLAUDE_BIN ?? "claude";
+import { resolveClaudeBin } from "../llm/claude-bin.js";
+const CLAUDE_BIN = resolveClaudeBin();
 /**
  * Start the outer Claude worker with the batch prompt as the positional
  * arg. Isolated from `runClaudeNestingBatchWorker` so tests can stub it.

package/dist/core-runtime/discovery/supported-models.js CHANGED Viewed

@@ -163,6 +163,17 @@ export function collectModelSelections(settings) {
     visit(settings, "");
     return out;
 }
+/** Non-throwing membership check: is (provider, model) a benchmark-verified
+ * supported route? Reuses the same verified-pair set as
+ * {@link assertSupportedModelRoutes}, but returns a boolean so opt-in callers
+ * (e.g. the answer-support judge per-stage model override) can DEGRADE to the
+ * inherited config when an override is unsupported, instead of failing the run.
+ * An unresolved provider or model is not verified.
+ *
+ * @param {string|undefined} provider - Provider key to look up. Only
+ *   `undefined` is short-circuited to `false`; any other value (including
+ *   `null`) is compared with strict equality against registry entries.
+ * @param {string|undefined} model - Model id to look up; same `undefined`
+ *   handling as `provider`.
+ * @param {object} registry - Registry object whose `supported_models` array
+ *   holds entries with `provider` and `model` fields. It is not read when
+ *   either lookup key is `undefined`.
+ * @returns {boolean} `true` when some entry matches both `provider` and
+ *   `model` exactly (case-sensitive strict equality); otherwise `false`.
+ */
+export function isSupportedModelRoute(provider, model, registry) {
+    if (provider === undefined || model === undefined)
+        return false;
+    return registry.supported_models.some((entry) => entry.provider === provider && entry.model === model);
+}
 /** Throws if any effective route is not a benchmark-verified (provider, model)
  * pair. A route whose effective provider OR model could not be resolved is
  * rejected (fail-loud) rather than leniently accepted — the route must resolve

package/dist/core-runtime/effort-calibration-ingest.js ADDED Viewed

@@ -0,0 +1,191 @@
+/**
+ * Effort-calibration ingestion (P4) — turn an existing benchmark report's JSON
+ * into the `EffortSweepRun[]` the sweep aggregator consumes. Pure: no LLM calls
+ * and no IO. The benchmark scripts already run the (paid, live) sweep and write
+ * the report; this module only re-reads that record, so the calibration report
+ * is produced deterministically and is unit-testable against captured output.
+ *
+ * Honesty rule (carried from the sweep aggregator): a run that produced no
+ * quality verdict — a failed run with no gate — is emitted as an unjudged
+ * (passed=null, qualityScore=null) sweep run, NOT dropped. Dropping failures
+ * would let an effort look viable on a single surviving pass; counting them as
+ * non-passing in the denominator keeps the quorum honest.
+ *
+ * Pipeline asymmetry, mirrored from the two benchmark harnesses:
+ *  - review: one unit-sweep invocation varies one unit's effort internally, so
+ *    each run self-describes its (unit, effort); the baseline run (all units at
+ *    base_effort) is the shared base-effort point for every swept unit.
+ *  - reconstruct: one invocation pins one effort knob (global author --effort or
+ *    --judge-effort), so the (stage, effort) of a report is the knob that was
+ *    pinned; the sweep is several invocations, one report per effort point.
+ *
+ * Cost is intentionally NOT populated here. Per-stage cost is reporting-only
+ * (the frontier decision ignores it) and correct attribution needs a verified
+ * mapping that this stage cannot prove: review's swept stage ids live in the
+ * execution namespace (e.g. `finding_ledger`, `synthesis_response`, `lens`)
+ * while per-unit telemetry uses runtime ids (`finding-ledger`, `synthesize`,
+ * per-lens ids), and a reconstruct report's `totals` is the whole-pipeline cost,
+ * not the swept stage's. The optional `cost` field stays in the sweep/frontier
+ * layer; populating it correctly is deferred to P4b, when real multi-effort
+ * sweep data exists to verify the per-stage unit-telemetry attribution.
+ */
+import { reviewRunGateSignal } from "./effort-calibration-review.js";
+import { reconstructRunGateSignal } from "./effort-calibration-reconstruct.js";
+const UNJUDGED = { passed: null, qualityScore: null }; // gate signal for a run with no quality verdict; spread-copied at each use so emitted runs never share this object
+/** Only a unit-sweep baseline case is the shared base-effort point per unit.
+ * Matched as a `case_id` prefix (`startsWith`), so any case id beginning with
+ * this string counts as a unit-sweep baseline. */
+const UNIT_SWEEP_BASE_PREFIX = "unit-sweep-base";
+/**
+ * Ingest a review unit-sweep benchmark report into sweep runs. Candidate runs
+ * map to their swept unit at its varied effort; each unit-sweep BASELINE run
+ * becomes the base-effort point for EVERY swept unit (the units observed across
+ * candidate runs), since "unit X at base, others at base" is exactly the
+ * baseline. Only `unit-sweep-base-*` runs are treated as baselines: a report may
+ * also carry non-unit-sweep cases (e.g. `all-high`) that share `base_effort` but
+ * vary every unit at once, and replicating those would contaminate the
+ * single-variable frontier. Runs without a quality gate (failures) are emitted
+ * unjudged; runs that are neither a candidate nor a unit-sweep baseline are
+ * skipped (nothing single-variable to attribute).
+ *
+ * Two rules the code enforces that are easy to miss:
+ *  - An ungated (failed) run is emitted only when its (stage, effort) point
+ *    also has at least one gated run somewhere in the report; failures at a
+ *    point with no completed evidence are omitted from the output.
+ *  - The baseline-presence check looks only at `base_effort` and the `case_id`
+ *    prefix, not at the gate, so an ungated baseline run still satisfies it.
+ *
+ * Output order follows the order of `report.runs`; a baseline run expands to
+ * one entry per swept unit, all sharing that run's gate signal object.
+ *
+ * @param report Parsed review benchmark report. `runs` is optional (treated
+ *   as empty); each run is read for `varied_unit_id`, `varied_effort`,
+ *   `base_effort`, `case_id` and `semantic_quality_gate`.
+ * @returns Array of `{ stage, effort, gate }` sweep runs, where `gate` is the
+ *   result of `reviewRunGateSignal` for gated runs and a fresh unjudged
+ *   `{ passed: null, qualityScore: null }` copy otherwise.
+ * @throws {Error} When at least one swept unit is present but no run is a
+ *   unit-sweep baseline (truthy `base_effort` plus the baseline `case_id`
+ *   prefix).
+ */
+export function ingestReviewReport(report) {
+    const runs = report.runs ?? [];
+    const sweptUnits = [
+        ...new Set(runs
+            .map((r) => r.varied_unit_id)
+            .filter((id) => typeof id === "string")),
+    ];
+    // The base-effort sample is the frontier's reference point; a candidate-only
+    // report (e.g. run with --unit-sweep-candidate-only) omits it, so the frontier
+    // would recommend an effort purely because the baseline was never measured.
+    const hasBaseline = runs.some((r) => r.base_effort && r.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX));
+    if (sweptUnits.length > 0 && !hasBaseline) {
+        throw new Error("review report has unit-sweep candidates but no unit-sweep baseline (base-effort sample missing; was it run with --unit-sweep-candidate-only?)");
+    }
+    // Points (stage|effort) with at least one completed (gated) run. Failed runs
+    // carry no quality gate or route telemetry, so they are attributed only to a
+    // point that also has completed evidence — three failures alone must not stamp
+    // a unit-effort on a route no retained run proved executed there.
+    const completedPoints = new Set();
+    for (const run of runs) {
+        if (!run.semantic_quality_gate)
+            continue;
+        if (run.varied_unit_id && run.varied_effort) {
+            completedPoints.add(`${run.varied_unit_id}|${run.varied_effort}`);
+        }
+        else if (run.base_effort && run.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX)) {
+            for (const unit of sweptUnits)
+                completedPoints.add(`${unit}|${run.base_effort}`);
+        }
+    }
+    const out = [];
+    for (const run of runs) {
+        const gated = Boolean(run.semantic_quality_gate);
+        const gate = run.semantic_quality_gate
+            ? reviewRunGateSignal(run.semantic_quality_gate)
+            : { ...UNJUDGED };
+        if (run.varied_unit_id && run.varied_effort) {
+            if (gated || completedPoints.has(`${run.varied_unit_id}|${run.varied_effort}`)) {
+                out.push({ stage: run.varied_unit_id, effort: run.varied_effort, gate });
+            }
+        }
+        else if (run.base_effort && run.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX)) {
+            for (const unit of sweptUnits) {
+                if (gated || completedPoints.has(`${unit}|${run.base_effort}`)) {
+                    out.push({ stage: unit, effort: run.base_effort, gate });
+                }
+            }
+        }
+    }
+    return out;
+}
+/** The answer-support judge's LLM step; no-call-exempt, so it can early-exit.
+ * Compared against each telemetry unit's `step_id` when checking whether the
+ * judge actually ran. */
+export const JUDGE_STEP_ID = "answer_support_judgment";
+/**
+ * Whether the answer-support judge actually ran an LLM call at `effort` in this
+ * run. The judge is no-call-exempt: it early-exits (no convergent evidence
+ * clusters) or degrades to an inherited config, in which case it leaves no
+ * `answer_support_judgment` telemetry at the requested effort — so a judge
+ * sample must be backed by real judge telemetry, not just the requested knob.
+ *
+ * @param run Benchmark run; its optional `units` array (treated as empty when
+ *   absent) is scanned for `step_id`, `effort` and `llm_call_count`.
+ * @param effort Effort value a unit's `effort` must strictly equal.
+ * @returns `true` when at least one unit has the judge step id, the given
+ *   effort, and an `llm_call_count` of 1 or more (a missing count is
+ *   treated as 0).
+ */
+function judgeExercisedAt(run, effort) {
+    return (run.units ?? []).some((u) => u.step_id === JUDGE_STEP_ID &&
+        u.effort === effort &&
+        (u.llm_call_count ?? 0) >= 1);
+}
+/**
+ * Whether a completed run's telemetry shows the swept stage actually ran at
+ * `effort` — the requested knob is never trusted over telemetry. The author
+ * stage always runs, so its `metadata.applied_effort` must equal the point (a
+ * route that ignored the pin, or a recovery de-escalation, is not an `effort`
+ * sample). The judge stage must show an answer_support_judgment call at that
+ * effort. Runs that don't match are not evidence for this frontier point.
+ *
+ * @param run Benchmark run whose `units` (judge) or `metadata.applied_effort`
+ *   (author) telemetry is inspected.
+ * @param stage Swept stage. Only the exact string "judge" takes the judge
+ *   check; every other value is handled on the author path.
+ * @param effort Effort value the telemetry must strictly equal.
+ * @returns `true` when the telemetry for that stage matches `effort`. A run
+ *   with no `metadata` yields `false` on the author path unless `effort` is
+ *   itself `undefined`.
+ */
+function appliedEffortMatches(run, stage, effort) {
+    return stage === "judge"
+        ? judgeExercisedAt(run, effort)
+        : run.metadata?.applied_effort === effort;
+}
+/**
+ * Derive the (stage, effort) a reconstruct report pinned. A judge override marks
+ * a judge-stage report ONLY when it pins a judge EFFORT; a model-only override
+ * (effort null/absent) varied the judge model, not an effort, so there is no
+ * effort point to calibrate and this returns null (the caller must pass an
+ * explicit tag) rather than mislabeling it as author. Without a judge override
+ * it is an author-stage report at the pinned `requested_effort`, falling back to
+ * the first run's telemetry `applied_effort` when the settings chain governed
+ * the effort. Returns null when no effort can be attributed.
+ *
+ * Every check is a truthiness test, so an empty-string effort counts as
+ * absent. Any truthy `requested_judge_override` short-circuits the author
+ * fallbacks, even when it yields null.
+ *
+ * @param report Parsed reconstruct benchmark report; read for
+ *   `requested_judge_override`, `requested_effort` and
+ *   `runs[0].metadata.applied_effort`.
+ * @returns `{ stage: "judge", effort }`, `{ stage: "author", effort }`, or
+ *   `null` when no effort can be attributed.
+ */
+export function deriveReconstructTag(report) {
+    const judgeOverride = report.requested_judge_override;
+    if (judgeOverride) {
+        return judgeOverride.effort
+            ? { stage: "judge", effort: judgeOverride.effort }
+            : null;
+    }
+    if (report.requested_effort) {
+        return { stage: "author", effort: report.requested_effort };
+    }
+    const applied = report.runs?.[0]?.metadata?.applied_effort;
+    if (applied)
+        return { stage: "author", effort: applied };
+    return null;
+}
+/**
+ * Ingest a reconstruct benchmark report into sweep runs for one (stage, effort)
+ * point. The point is the explicit `tag` when given, else derived from the
+ * report's pinned knob. A completed run only contributes a sample when its
+ * telemetry shows the swept stage actually ran at that effort (author:
+ * applied_effort matches; judge: an answer_support_judgment call at that effort),
+ * so a pin the route ignored, a recovery de-escalation, or a judge early-exit
+ * does not fabricate a sample. Each retained run's `quality_gate` is then
+ * distilled via `reconstructRunGateSignal`. Failed runs (no telemetry) are
+ * emitted unjudged for the author stage only — they cannot prove the judge ran —
+ * and only when at least one completed run was retained at this point.
+ *
+ * @param report Parsed reconstruct benchmark report; read for `runs`
+ *   (optional, treated as empty) and `reconstruct_extension.failed_runs`
+ *   (only its length is used).
+ * @param tag Optional explicit `{ stage, effort }` point. When provided
+ *   (non-nullish) it is used as-is and no derivation from the report happens.
+ * @returns Array of `{ stage, effort, gate }` sweep runs: retained completed
+ *   runs first, then one unjudged entry per failed run (author stage only).
+ *   May be empty when no run's telemetry matches the point.
+ * @throws {Error} When no `tag` is given and none can be derived from the
+ *   report.
+ */
+export function ingestReconstructReport(report, tag) {
+    const point = tag ?? deriveReconstructTag(report);
+    if (!point) {
+        throw new Error("reconstruct report pins no effort (no judge override, requested_effort, or applied_effort); pass an explicit stage:effort tag");
+    }
+    const out = [];
+    for (const run of report.runs ?? []) {
+        if (!appliedEffortMatches(run, point.stage, point.effort)) {
+            continue; // telemetry doesn't show this stage running at this effort
+        }
+        out.push({
+            stage: point.stage,
+            effort: point.effort,
+            gate: reconstructRunGateSignal(run.quality_gate),
+        });
+    }
+    // Failed runs carry no telemetry, so attribute them only to the author stage
+    // (which always runs) AND only when at least one completed run was retained at
+    // this point — that retained run's telemetry is what proves the model/route/
+    // effort. A source whose runs all failed proves no route/effort, so its
+    // failures are not turned into route-keyed samples.
+    if (point.stage === "author" && out.length > 0) {
+        const failedCount = report.reconstruct_extension?.failed_runs?.length ?? 0;
+        for (let i = 0; i < failedCount; i++) {
+            out.push({ stage: "author", effort: point.effort, gate: { ...UNJUDGED } });
+        }
+    }
+    return out;
+}