onto-mcp 0.4.11 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/.onto/authority/supported-models.yaml +16 -0
  2. package/.onto/processes/reconstruct/reconstruct-contract-registry.yaml +33 -41
  3. package/.onto/processes/review/nesting-batch-worker-contract.md +1 -1
  4. package/dist/core-api/reconstruct-api.js +110 -0
  5. package/dist/core-runtime/artifact-io.js +59 -0
  6. package/dist/core-runtime/cli/claude-code-review-unit-executor.js +2 -1
  7. package/dist/core-runtime/cli/claude-nesting-batch-worker.js +4 -2
  8. package/dist/core-runtime/discovery/supported-models.js +11 -0
  9. package/dist/core-runtime/effort-calibration-ingest.js +191 -0
  10. package/dist/core-runtime/effort-calibration-reconstruct.js +47 -0
  11. package/dist/core-runtime/effort-calibration-review.js +22 -0
  12. package/dist/core-runtime/effort-calibration-sweep.js +110 -0
  13. package/dist/core-runtime/effort-frontier.js +134 -0
  14. package/dist/core-runtime/llm/claude-bin.js +77 -0
  15. package/dist/core-runtime/llm/llm-caller.js +169 -29
  16. package/dist/core-runtime/reconstruct/artifact-types.js +2 -0
  17. package/dist/core-runtime/reconstruct/benchmark-evidence.js +16 -0
  18. package/dist/core-runtime/reconstruct/claim-projection-validation.js +2 -5
  19. package/dist/core-runtime/reconstruct/directive-validation.js +4 -3
  20. package/dist/core-runtime/reconstruct/execution-telemetry.js +1 -0
  21. package/dist/core-runtime/reconstruct/material-admission-validation.js +2 -5
  22. package/dist/core-runtime/reconstruct/material-profile-validation.js +10 -6
  23. package/dist/core-runtime/reconstruct/materialize-preparation.js +91 -20
  24. package/dist/core-runtime/reconstruct/maturation-validation.js +296 -19
  25. package/dist/core-runtime/reconstruct/mock-llm-realization.js +15 -0
  26. package/dist/core-runtime/reconstruct/ontology-seed-validation.js +12 -7
  27. package/dist/core-runtime/reconstruct/pipeline-execution-ledger.js +25 -2
  28. package/dist/core-runtime/reconstruct/post-seed-validation.js +4 -6
  29. package/dist/core-runtime/reconstruct/proof-authority-validation.js +2 -6
  30. package/dist/core-runtime/reconstruct/purpose-authority-validation.js +2 -5
  31. package/dist/core-runtime/reconstruct/record.js +7 -3
  32. package/dist/core-runtime/reconstruct/registry-verification-validation.js +25 -5
  33. package/dist/core-runtime/reconstruct/run-control-validation.js +9 -4
  34. package/dist/core-runtime/reconstruct/run.js +248 -18
  35. package/dist/core-runtime/reconstruct/seed-authoring-readiness-validation.js +2 -5
  36. package/dist/core-runtime/reconstruct/semantic-quality-gate.js +7 -5
  37. package/dist/core-runtime/reconstruct/source-observation-delta-validation.js +12 -5
  38. package/dist/core-runtime/reconstruct/source-safety-validation.js +6 -5
  39. package/dist/core-runtime/reconstruct/source-scout-pack-validation.js +13 -5
  40. package/dist/core-runtime/reconstruct/terminal-validation.js +13 -11
  41. package/dist/core-runtime/review/review-artifact-utils.js +2 -2
  42. package/dist/mcp/server.js +15 -0
  43. package/dist/mcp/tool-schemas.js +8 -0
  44. package/package.json +3 -1
@@ -25,3 +25,19 @@ supported_models:
25
25
  Completed a full reconstruct pipeline run in the live medium baseline
26
26
  (1 of 6 attempts completed end to end); support verified. Performance
27
27
  evidence is PRELIMINARY in that record — support, not a performance claim.
28
+ - provider: anthropic
29
+ model: claude-opus-4-8
30
+ verified_at: "2026-06-15"
31
+ benchmark_evidence_refs:
32
+ - development-records/benchmark/reconstruct-pipeline-live-claude-20260615.json
33
+ notes: >-
34
+ Completed a full reconstruct pipeline run end to end via the Anthropic
35
+ OAuth Claude Code CLI worker (execution_adapter=claude_code):
36
+ record_stage=completed with final_output present and provenance valid; the
37
+ maturation track ran to a `blocked` continuation decision (no actionable
38
+ ontology emitted). The CLI worker path (codex_cli/claude_code) uses the
39
+ 600000ms DEFAULT_WORKER_TIMEOUT_MS — the slow opus seed-authoring call
40
+ exceeds the 120000ms SDK default — so support holds under default settings
41
+ (no unencoded env override required). Single-run completion proof
42
+ (PRELIMINARY for any performance claim, INV-BENCH-1 needs >=3 reps /
43
+ >=2 fixtures); support — model completes the pipeline — verified.
@@ -1124,6 +1124,12 @@ artifact_authorities:
1124
1124
  answer_support_ledger_validation:
1125
1125
  authority_ref: answer-support-ledger-validation.yaml
1126
1126
  validation_ref: null
1127
+ answer_support_judgment:
1128
+ authority_ref: answer-support-judgment.yaml
1129
+ validation_ref: answer-support-judgment-validation.yaml
1130
+ answer_support_judgment_validation:
1131
+ authority_ref: answer-support-judgment-validation.yaml
1132
+ validation_ref: null
1127
1133
  maturation_authority_response:
1128
1134
  authority_ref: maturation-authority-response.yaml
1129
1135
  validation_ref: maturation-authority-response-validation.yaml
@@ -1229,14 +1235,6 @@ planned_artifact_authorities:
1229
1235
  authority_ref: required-when-evaluation-validation.yaml
1230
1236
  validation_ref: null
1231
1237
  activation_condition: registry_predicate_evaluator_runtime_is_implemented
1232
- answer_support_judgment:
1233
- authority_ref: answer-support-judgment.yaml
1234
- validation_ref: answer-support-judgment-validation.yaml
1235
- activation_condition: answer_support_judge_runtime_is_implemented
1236
- answer_support_judgment_validation:
1237
- authority_ref: answer-support-judgment-validation.yaml
1238
- validation_ref: null
1239
- activation_condition: answer_support_judge_runtime_is_implemented
1240
1238
 
1241
1239
  validation_gate_catalog:
1242
1240
  - gate_id: reconstruct_run_control_gate
@@ -1339,6 +1337,9 @@ validation_gate_catalog:
1339
1337
  - gate_id: answer_support_gate
1340
1338
  validation_artifact_ref: answer-support-ledger-validation.yaml
1341
1339
  required_when: answer_support_ledger_exists
1340
+ - gate_id: answer_support_judgment_gate
1341
+ validation_artifact_ref: answer-support-judgment-validation.yaml
1342
+ required_when: answer_support_judgment_required_minimal
1342
1343
  - gate_id: maturation_answer_claim_gate
1343
1344
  validation_artifact_ref: maturation-answer-claims-validation.yaml
1344
1345
  required_when: maturation_answer_claims_exist
@@ -1396,12 +1397,6 @@ planned_validation_gate_catalog:
1396
1397
  validation_artifact_ref: required-when-evaluation-validation.yaml
1397
1398
  required_when: always
1398
1399
  activation_condition: registry_predicate_evaluator_runtime_is_implemented
1399
- - gate_id: answer_support_judgment_gate
1400
- validation_artifact_ref: answer-support-judgment-validation.yaml
1401
- required_when: answer_support_judgment_required
1402
- activation_condition: answer_support_judge_runtime_is_implemented
1403
- activation_prerequisites:
1404
- - answer_support_ledger_validation_is_valid
1405
1400
 
1406
1401
  required_when_predicate_family_catalog:
1407
1402
  - predicate_family_id: frontier_observation_use_by_downstream_artifact
@@ -1529,20 +1524,6 @@ required_when_predicate_catalog:
1529
1524
  truth_expression: "source_observation_delta_validation.validation_status == valid and answer_support_ledger_refs_delta_observation_ids"
1530
1525
  unknown_projection: not_applicable
1531
1526
  explanation_template: "Answer support ledger cites observation ids from a frontier-triggered observation delta."
1532
- - predicate_id: answer_support_judgment_uses_frontier_observation
1533
- predicate_family_id: frontier_observation_use_by_downstream_artifact
1534
- gate_instance_scope: per_round
1535
- downstream_artifact_ref: answer-support-judgment.yaml
1536
- downstream_validation_ref: answer-support-judgment-validation.yaml
1537
- input_authority_refs: [rounds/<round-id>/source-observation-delta.yaml, rounds/<round-id>/source-observation-delta-validation.yaml, answer-support-judgment.yaml]
1538
- truth_expression: "source_observation_delta_validation.validation_status == valid and answer_support_judgment_refs_delta_observation_ids"
1539
- unknown_projection: not_applicable
1540
- explanation_template: "Answer support judgment cites observation ids from a frontier-triggered observation delta."
1541
- - predicate_id: answer_support_judgment_required
1542
- input_authority_refs: [answer-support-ledger.yaml, answer-support-ledger-validation.yaml]
1543
- truth_expression: "artifact_exists(answer-support-ledger.yaml) and answer_support_ledger_has_convergent_source_evidence_cluster"
1544
- unknown_projection: not_applicable
1545
- explanation_template: "A judge confirmation is required when answer support uses convergent source evidence."
1546
1527
  - predicate_id: maturation_answer_claims_use_frontier_observation
1547
1528
  predicate_family_id: frontier_observation_use_by_downstream_artifact
1548
1529
  gate_instance_scope: per_round
@@ -1738,6 +1719,11 @@ required_when_predicate_catalog:
1738
1719
  truth_expression: "artifact_exists(answer-support-ledger.yaml)"
1739
1720
  unknown_projection: not_applicable
1740
1721
  explanation_template: "An answer support ledger exists and requires support-ledger validation."
1722
+ - predicate_id: answer_support_judgment_required_minimal
1723
+ input_authority_refs: [answer-support-judgment.yaml]
1724
+ truth_expression: "artifact_exists(answer-support-judgment.yaml)"
1725
+ unknown_projection: not_applicable
1726
+ explanation_template: "An answer support judgment exists and requires judgment validation. Convergent-source-evidence necessity and sufficiency are enforced by the maturation-answer-claims validator (B-6)."
1741
1727
  - predicate_id: maturation_authority_response_exists
1742
1728
  input_authority_refs: [maturation-authority-response.yaml]
1743
1729
  truth_expression: "artifact_exists(maturation-authority-response.yaml)"
@@ -2692,6 +2678,10 @@ validator_records:
2692
2678
  - ontology-seed.yaml
2693
2679
  - reconstruct-contract-registry.yaml
2694
2680
  conditional_input_authority_refs:
2681
+ - artifact_ref: answer-support-judgment.yaml
2682
+ activation_condition: answer_support_judge_runtime_is_implemented
2683
+ consumed_for:
2684
+ - require_convergent_source_evidence_claims_to_have_two_independent_judge_confirmed_supports
2695
2685
  - artifact_ref: answer-support-judgment-validation.yaml
2696
2686
  activation_condition: answer_support_judge_runtime_is_implemented
2697
2687
  consumed_for:
@@ -2704,8 +2694,22 @@ validator_records:
2704
2694
  conditional_validation_obligations:
2705
2695
  - obligation_id: require_convergent_source_evidence_claims_to_have_two_independent_judge_confirmed_supports
2706
2696
  activation_condition: answer_support_judge_runtime_is_implemented
2707
- input_authority_refs: [answer-support-judgment-validation.yaml]
2697
+ input_authority_refs: [answer-support-judgment.yaml, answer-support-judgment-validation.yaml]
2708
2698
  output_ref: maturation-answer-claims-validation.yaml
2699
+ - validator_id: answer-support-judgment-validator
2700
+ gate_ids: [answer_support_judgment_gate]
2701
+ validator_version: 1
2702
+ input_authority_refs:
2703
+ - answer-support-judgment.yaml
2704
+ - answer-support-ledger.yaml
2705
+ - answer-support-ledger-validation.yaml
2706
+ - reconstruct-contract-registry.yaml
2707
+ validation_obligations:
2708
+ - validate_judgment_refs_resolve_to_answer_support_ledger_clusters_and_evidence
2709
+ - require_supports_enum_for_each_judgment
2710
+ - require_rationale_ref_for_each_judgment
2711
+ - require_convergent_clusters_to_judge_every_cited_evidence_ref
2712
+ output_ref: answer-support-judgment-validation.yaml
2709
2713
  - validator_id: ontology-expansion-validator
2710
2714
  gate_ids: [ontology_expansion_gate]
2711
2715
  validator_version: 1
@@ -2943,18 +2947,6 @@ validator_records:
2943
2947
  output_ref: handoff-decision-validation.yaml
2944
2948
 
2945
2949
  planned_validator_records:
2946
- - validator_id: answer-support-judgment-validator
2947
- gate_ids: [answer_support_judgment_gate]
2948
- validator_version: 1
2949
- input_authority_refs:
2950
- - answer-support-judgment.yaml
2951
- - answer-support-ledger-validation.yaml
2952
- - reconstruct-contract-registry.yaml
2953
- validation_obligations:
2954
- - validate_judgment_refs_resolve_to_answer_support_ledger_clusters_and_evidence
2955
- - require_supports_enum_for_each_judgment
2956
- - require_rationale_ref_for_each_judgment
2957
- output_ref: answer-support-judgment-validation.yaml
2958
2950
  - validator_id: maturation-promotion-request-validator
2959
2951
  gate_ids: [maturation_promotion_request_gate]
2960
2952
  validator_version: 1
@@ -36,7 +36,7 @@ outer의 유일한 역할: **script를 `bash -s`로 실행하고 stdout을 verba
36
36
  | brand | spawn | 비고 |
37
37
  |---|---|---|
38
38
  | codex (`codex-nesting-batch-worker.ts`) | `codex exec --sandbox danger-full-access --ephemeral`, prompt는 stdin | outer가 subprocess를 spawn해야 하므로 non-seatbelt; inner unit executor는 자체 read-only sandbox 유지 |
39
- | claude (`claude-nesting-batch-worker.ts`) | `claude -p <prompt positional> --allowedTools Bash --permission-mode bypassPermissions --strict-mcp-config`(빈 MCP) | prompt는 **positional**(stdin 무시됨). `--effort` 지원, service_tier 표면 없음(API 전용). `ONTO_CLAUDE_BIN` 오버라이드 |
39
+ | claude (`claude-nesting-batch-worker.ts`) | `claude -p <prompt positional> --allowedTools Bash --permission-mode bypassPermissions --strict-mcp-config`(빈 MCP) | prompt는 **positional**(stdin 무시됨). `--effort` 지원, service_tier 표면 없음(API 전용). 바이너리는 `resolveClaudeBin()`로 해석(`ONTO_CLAUDE_BIN` 오버라이드 → PATH → 일반 설치 위치) |
40
40
 
41
41
  outer(teamlead seat) model/effort는 settings `review.execution.teamlead.llm`에서 brand adapter(codex_cli/claude_code) 일치 시에만 해석된다. **inner unit의 LLM 설정은 outer 설정이 아니라 호출자가 구성한 inner argv에 실린다**(flat 동등).
42
42
 
@@ -13,6 +13,7 @@ import { createDirectCallReconstructConfirmationProvider, createDirectCallRecons
13
13
  import { RECONSTRUCT_MOCK_AUTHOR_ID, RECONSTRUCT_MOCK_CONFIRMATION_PROVIDER_ID, callReconstructMockLlm, isReconstructMockLlmRealizationEnabled, } from "../core-runtime/reconstruct/mock-llm-realization.js";
14
14
  import { assertSettingsModelsSupported, resolveSettingsChain, resolveReconstructActorLlmSettings, } from "../core-runtime/discovery/settings-chain.js";
15
15
  import { resolveOntoHome, } from "../core-runtime/discovery/onto-home.js";
16
+ import { isSupportedModelRoute, loadSupportedModelRegistry, } from "../core-runtime/discovery/supported-models.js";
16
17
  import { resolveLlmProviderConfig, } from "../core-runtime/llm/llm-caller.js";
17
18
  import { writeOntologySeedValidationArtifact, writeCandidateDispositionValidationArtifact, } from "../core-runtime/reconstruct/ontology-seed-validation.js";
18
19
  import { writeSourceObservationDirectiveValidationArtifact, } from "../core-runtime/reconstruct/directive-validation.js";
@@ -35,6 +36,57 @@ function resolveFromBase(basePath, maybeRelativePath) {
35
36
  ? path.resolve(maybeRelativePath)
36
37
  : path.resolve(basePath, maybeRelativePath);
37
38
  }
/**
 * Pure adopt-vs-degrade decision for the opt-in answer-support judge config.
 *
 * The judge starts from a shallow copy of the semantic author's config and only
 * diverges for the requested overrides:
 *  - Model: `judgeModelCandidate` is merged over the author config only when
 *    `isSupportedModelRoute(judgeModelProvider, candidate.model_id, registry)`
 *    is true AND `candidate.provider` equals the author's `provider`. In every
 *    other case the author model is kept and a human-readable note explains
 *    why the override was degraded.
 *  - Effort: `judgeLlmEffort` wins when given; otherwise the author's
 *    `reasoning_effort` is re-applied (or the key is removed when the author
 *    has none), so an effort copied in from the candidate never survives.
 *
 * @param {object} args
 * @param {object} args.authorLlmConfig Semantic author's resolved LLM config;
 *   its `provider` and `reasoning_effort` fields are read here.
 * @param {string} [args.judgeLlmEffort] Explicit judge effort override.
 * @param {object|null} [args.judgeModelCandidate] Candidate judge config; its
 *   `provider` and `model_id` fields are read here.
 * @param {string} [args.judgeModelProvider] Provider key used for the
 *   supported-route lookup.
 * @param {object} args.registry Supported-model registry handed to
 *   `isSupportedModelRoute`.
 * @returns {{ judgeLlmConfig: object|undefined, note: string|null }}
 *   `judgeLlmConfig` is `undefined` when nothing was requested (the caller
 *   inherits the author config); `note` is non-null only when a model override
 *   was degraded.
 */
export function resolveJudgeLlmConfig(args) {
    if (!args.judgeLlmEffort && !args.judgeModelCandidate) {
        return { judgeLlmConfig: undefined, note: null };
    }
    const judge = { ...args.authorLlmConfig };
    const authorEffort = args.authorLlmConfig.reasoning_effort;
    let note = null;
    const candidate = args.judgeModelCandidate;
    if (candidate) {
        // INV-MODEL-1 is keyed by MODEL provider (e.g. openai/gpt-5.5), not the
        // runtime adapter provider (OpenAI OAuth normalizes to codex). Check the
        // model provider so a supported judge model is not spuriously degraded.
        const supported = isSupportedModelRoute(args.judgeModelProvider, candidate.model_id, args.registry);
        // Credential safety: the candidate resolves on the author's provider, so its
        // runtime provider must match the author's (guarantees api_key_env/adapter
        // never cross providers). Uses the runtime provider, not the model provider.
        const sameProvider = candidate.provider === args.authorLlmConfig.provider;
        if (supported && sameProvider) {
            Object.assign(judge, candidate);
        }
        else {
            note = `answer-support judge model override (${args.judgeModelProvider ?? "(unresolved provider)"}/${candidate.model_id ?? "(unresolved model)"}) ${supported
                ? "requires a different provider than the semantic author"
                : "is not a benchmark-verified route"}; degraded to the semantic-author model`;
        }
    }
    // Effort = explicit judge override, else the author's effective effort. This
    // wins over any reasoning_effort Object.assign copied from the model candidate
    // (the candidate is resolved without the author's effort pin, so its raw
    // settings effort can diverge from the author's pinned effort).
    if (args.judgeLlmEffort)
        judge.reasoning_effort = args.judgeLlmEffort;
    else if (authorEffort !== undefined)
        judge.reasoning_effort = authorEffort;
    else
        delete judge.reasoning_effort;
    return { judgeLlmConfig: judge, note };
}
38
90
  function dateStamp() {
39
91
  return new Date().toISOString().slice(0, 10).replace(/-/g, "");
40
92
  }
@@ -277,8 +329,51 @@ export function createOntoReconstructCoreApi(options = {}) {
277
329
  },
278
330
  ...(llmEffortOverride ? { cliOverrides: llmEffortOverride } : {}),
279
331
  });
332
+ // Opt-in per-stage JUDGE config (semantic-independence lever). Default =
333
+ // inherit the semantic-author config (judgeLlmConfig undefined → no change,
334
+ // zero regression). A judgeModel override resolves ON THE AUTHOR'S PROVIDER
335
+ // (same credentials/route), so it is adopted only when the resulting
336
+ // (author provider, judgeModel) pair is benchmark-verified, otherwise it
337
+ // degrades. resolveJudgeLlmConfig owns the adopt-vs-degrade decision.
338
+ const judgeOverrideRequested = Boolean(request.judgeLlmEffort || request.judgeModel);
339
+ let judgeConfigNote = null;
340
+ if (judgeOverrideRequested && mockRealizationEnabled) {
341
+ judgeConfigNote =
342
+ "answer-support judge override ignored under mock realization (no real provider calls)";
343
+ }
344
+ // A judgeModel candidate is resolved on the SAME actor settings as the
345
+ // author (no provider override), so api_key_env / execution_adapter /
346
+ // base_url stay the author provider's — consistent, never cross-provider.
347
+ const judgeAuthorActorLlm = !mockRealizationEnabled && request.judgeModel
348
+ ? resolveReconstructActorLlmSettings(settings, "semantic_author")
349
+ : null;
350
+ const judgeModelCandidate = judgeAuthorActorLlm
351
+ ? resolveLlmProviderConfig({
352
+ config: { llm: judgeAuthorActorLlm },
353
+ cliOverrides: { model: request.judgeModel },
354
+ })
355
+ : null;
356
+ const judgeResolution = mockRealizationEnabled
357
+ ? { judgeLlmConfig: undefined, note: judgeConfigNote }
358
+ : resolveJudgeLlmConfig({
359
+ authorLlmConfig: semanticAuthorLlmConfig,
360
+ ...(request.judgeLlmEffort
361
+ ? { judgeLlmEffort: request.judgeLlmEffort }
362
+ : {}),
363
+ judgeModelCandidate,
364
+ // Registry key is the MODEL provider (e.g. openai), not the runtime
365
+ // adapter (openai OAuth → codex). The judge uses the author's provider.
366
+ ...(judgeAuthorActorLlm?.provider
367
+ ? { judgeModelProvider: judgeAuthorActorLlm.provider }
368
+ : {}),
369
+ registry: loadSupportedModelRegistry(),
370
+ });
371
+ const judgeLlmConfig = judgeResolution.judgeLlmConfig;
372
+ if (!mockRealizationEnabled)
373
+ judgeConfigNote = judgeResolution.note;
280
374
  const directiveAuthor = createDirectCallReconstructDirectiveAuthor({
281
375
  llmConfig: semanticAuthorLlmConfig,
376
+ ...(judgeLlmConfig ? { judgeLlmConfig } : {}),
282
377
  ...(mockRealizationEnabled
283
378
  ? {
284
379
  llmCall: callReconstructMockLlm,
@@ -304,6 +399,21 @@ export function createOntoReconstructCoreApi(options = {}) {
304
399
  : "reconstruct session starting",
305
400
  stageId: "start",
306
401
  });
402
+ if (judgeConfigNote) {
403
+ // Honest accounting: the operator opted into a judge override that was
404
+ // not used (unsupported model degraded to the author model, or ignored
405
+ // under mock), so the rubber-stamp mitigation did NOT take effect. The
406
+ // judge's actual model/effort is independently recorded in the judge
407
+ // step execution telemetry. Emitted BEFORE the run so it survives a
408
+ // run failure — the degrade decision is independent of the run outcome.
409
+ appendRuntimeStatusEventSync({
410
+ pipeline: "reconstruct",
411
+ sessionRoot,
412
+ sourceLabel: "onto_reconstruct",
413
+ message: judgeConfigNote,
414
+ stageId: "answer_support_judgment",
415
+ });
416
+ }
307
417
  const watcherResult = spawnRuntimeWatcherPane(projectRoot, sessionRoot, ontoHome);
308
418
  appendRuntimeStatusEventSync({
309
419
  pipeline: "reconstruct",
@@ -0,0 +1,59 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { stringify as stringifyYaml } from "yaml";
4
+ /**
5
+ * Atomic artifact writes shared across the review and reconstruct runtimes.
6
+ *
7
+ * Pipeline artifacts (canonical `source-observations.yaml`, validation
8
+ * artifacts, the review record) are rewritten every round and trusted on read.
9
+ * A plain `mkdir` + `writeFile` is not atomic: a crash or full disk mid-write
10
+ * leaves a truncated-but-parseable file that the YAML parser silently accepts
11
+ * (a half-written `validation_status: valid` reads as a clean pass; an empty
12
+ * file reads as `null`). Writing to a same-directory temp file and renaming it
13
+ * into place makes the final path flip atomically — readers see either the
14
+ * prior complete file or the new complete file, never a torn one.
15
+ */
// Process-monotonic counter guarantees temp-path uniqueness even when the same
// target path is rewritten concurrently within one process.
let tempWriteCounter = 0;
/**
 * Write `contents` to `filePath` via a temp file: create parent dirs, write to
 * a unique same-directory temp file (`<filePath>.<pid>.<counter>.tmp`), then
 * rename it into place.
 *
 * On a write or rename failure the temp file is removed on a best-effort basis
 * and the ORIGINAL error is rethrown. A failure of the cleanup itself is
 * deliberately ignored so it can never replace the error that explains why the
 * write failed.
 *
 * @param {string} filePath Final destination path.
 * @param {string} contents Text to write (encoded as UTF-8).
 * @returns {Promise<void>}
 * @throws Whatever `fs.mkdir`, `fs.writeFile` or `fs.rename` rejected with.
 */
export async function atomicWriteFile(filePath, contents) {
    await fs.mkdir(path.dirname(filePath), { recursive: true });
    tempWriteCounter += 1;
    const tempPath = `${filePath}.${process.pid}.${tempWriteCounter}.tmp`;
    try {
        await fs.writeFile(tempPath, contents, "utf8");
        // Same-filesystem rename is atomic; the target flips in one step.
        await fs.rename(tempPath, filePath);
    }
    catch (error) {
        try {
            await fs.rm(tempPath, { force: true });
        }
        catch {
            // Best-effort cleanup: a leftover temp file is preferable to
            // masking the root-cause error thrown below.
        }
        throw error;
    }
}
/**
 * Serialize `value` with `stringifyYaml` and hand the resulting text to
 * `atomicWriteFile`. The serialized bytes are exactly what a direct
 * `stringifyYaml(value)` write would produce — only the write mechanism
 * differs.
 *
 * @param {string} filePath Destination path, forwarded to `atomicWriteFile`.
 * @param {*} value Value to serialize to YAML.
 * @returns {Promise<void>}
 */
export async function atomicWriteYamlDocument(filePath, value) {
    await atomicWriteFile(filePath, stringifyYaml(value));
}
/**
 * Fail-closed shape guard for trusted artifact reads. The pipeline reads its
 * own artifacts and trusts them on read; a malformed artifact (e.g. a required
 * array field that is missing, null, or a scalar — from a torn write or
 * out-of-band tampering) would otherwise crash deep inside a validator with an
 * uncontextualized `TypeError: ... is not iterable`. This throws an integrity
 * error that names the artifact and field instead, so the run halts with an
 * actionable message rather than continuing on misread data.
 *
 * @param {*} value Field value to check.
 * @param {string} artifactLabel Artifact name interpolated into the error.
 * @param {string} fieldName Field name interpolated into the error.
 * @returns {void} Returns normally only when `value` is an array.
 * @throws {Error} When `Array.isArray(value)` is false; the message reports
 *   `null` for null and the `typeof` result for anything else.
 */
export function assertArrayField(value, artifactLabel, fieldName) {
    if (!Array.isArray(value)) {
        throw new Error(`artifact integrity: ${artifactLabel} field '${fieldName}' must be an array, got ${value === null ? "null" : typeof value}`);
    }
}
@@ -7,6 +7,7 @@ import { parseArgs } from "node:util";
7
7
  import { pathToFileURL } from "node:url";
8
8
  import { appendRuntimeStreamChunkSync, appendRuntimeStreamEventSync, } from "../observability/runtime-stream-observation.js";
9
9
  import { semanticQualityEvidenceForArtifactGeneration } from "../review/artifact-generation-realization.js";
10
+ import { resolveClaudeBin } from "../llm/claude-bin.js";
10
11
  import { buildBoundedPrompt, buildWorkerSubmitSchema, coerceStructuredPayload, parseOutputFormat, requireString, writeLensSidecarArtifactFromPayload, writeRuntimeSubmitArtifactFromPayload, } from "./worker-structured-output.js";
11
12
  import { SALVAGE_INCOMPLETE_SENTINEL, buildDeltaRowsSalvagePrompt, buildTranscriptionSalvagePrompt, classifySalvageMode, mergeMissingStanceRows, salvageInputPathFor, } from "./submit-salvage.js";
12
13
  /**
@@ -27,7 +28,7 @@ import { SALVAGE_INCOMPLETE_SENTINEL, buildDeltaRowsSalvagePrompt, buildTranscri
27
28
  * path, never by the worker.
28
29
  */
29
30
  const CLAUDE_READONLY_ALLOWED_TOOLS = ["Read", "Grep", "Glob"];
30
- const CLAUDE_BIN = process.env.ONTO_CLAUDE_BIN?.trim() || "claude";
31
+ const CLAUDE_BIN = resolveClaudeBin();
31
32
  /**
32
33
  * Embed the submit-tool JSON Schema into the bounded prompt. Claude Code's
33
34
  * `--json-schema` flag silently rejects the runtime's complex submit schemas,
@@ -22,7 +22,8 @@
22
22
  * servers load in the bounded outer.
23
23
  * - effort maps to `--effort`; `service_tier` is API-only and NOT
24
24
  * supported on `claude -p`/OAuth — deliberately absent here.
25
- * - `ONTO_CLAUDE_BIN` overrides the binary (matches the unit executor).
25
+ * - The binary is resolved via `resolveClaudeBin()` (ONTO_CLAUDE_BIN override
26
+ * → PATH → common install locations); matches the unit executor.
26
27
  *
27
28
  * # How it relates
28
29
  *
@@ -33,7 +34,8 @@
33
34
  import { spawn } from "node:child_process";
34
35
  import fs from "node:fs";
35
36
  import { buildNestingBatchWorkerPrompt, parseNestingBatchSummary, reconcileNestingBatchOutcomes, } from "../review/nesting-batch.js";
36
- const CLAUDE_BIN = process.env.ONTO_CLAUDE_BIN ?? "claude";
37
+ import { resolveClaudeBin } from "../llm/claude-bin.js";
38
+ const CLAUDE_BIN = resolveClaudeBin();
37
39
  /**
38
40
  * Start the outer Claude worker with the batch prompt as the positional
39
41
  * arg. Isolated from `runClaudeNestingBatchWorker` so tests can stub it.
@@ -163,6 +163,17 @@ export function collectModelSelections(settings) {
163
163
  visit(settings, "");
164
164
  return out;
165
165
  }
/**
 * Non-throwing membership check: does `registry.supported_models` contain an
 * entry whose `provider` and `model` both equal the given pair?
 *
 * Returns a boolean rather than throwing so opt-in callers (e.g. the
 * answer-support judge per-stage model override) can DEGRADE to the inherited
 * config when an override is unsupported, instead of failing the run.
 * An unresolved provider or model — `undefined` or `null` — is never verified.
 *
 * @param {string|null|undefined} provider Model provider key.
 * @param {string|null|undefined} model Model identifier.
 * @param {{ supported_models: Array<{ provider: string, model: string }> }} registry
 *   Supported-model registry to search.
 * @returns {boolean} True only for an exact (provider, model) match.
 */
export function isSupportedModelRoute(provider, model, registry) {
    // `== null` covers both undefined and null, so an unresolved half of the
    // pair short-circuits before the registry is scanned.
    if (provider == null || model == null)
        return false;
    return registry.supported_models.some((entry) => entry.provider === provider && entry.model === model);
}
166
177
  /** Throws if any effective route is not a benchmark-verified (provider, model)
167
178
  * pair. A route whose effective provider OR model could not be resolved is
168
179
  * rejected (fail-loud) rather than leniently accepted — the route must resolve
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Effort-calibration ingestion (P4) — turn an existing benchmark report's JSON
3
+ * into the `EffortSweepRun[]` the sweep aggregator consumes. Pure: no LLM calls
4
+ * and no IO. The benchmark scripts already run the (paid, live) sweep and write
5
+ * the report; this module only re-reads that record, so the calibration report
6
+ * is produced deterministically and is unit-testable against captured output.
7
+ *
8
+ * Honesty rule (carried from the sweep aggregator): a run that produced no
9
+ * quality verdict — a failed run with no gate — is emitted as an unjudged
10
+ * (passed=null, qualityScore=null) sweep run, NOT dropped. Dropping failures
11
+ * would let an effort look viable on a single surviving pass; counting them as
12
+ * non-passing in the denominator keeps the quorum honest.
13
+ *
14
+ * Pipeline asymmetry, mirrored from the two benchmark harnesses:
15
+ * - review: one unit-sweep invocation varies one unit's effort internally, so
16
+ * each run self-describes its (unit, effort); the baseline run (all units at
17
+ * base_effort) is the shared base-effort point for every swept unit.
18
+ * - reconstruct: one invocation pins one effort knob (global author --effort or
19
+ * --judge-effort), so the (stage, effort) of a report is the knob that was
20
+ * pinned; the sweep is several invocations, one report per effort point.
21
+ *
22
+ * Cost is intentionally NOT populated here. Per-stage cost is reporting-only
23
+ * (the frontier decision ignores it) and correct attribution needs a verified
24
+ * mapping that this stage cannot prove: review's swept stage ids live in the
25
+ * execution namespace (e.g. `finding_ledger`, `synthesis_response`, `lens`)
26
+ * while per-unit telemetry uses runtime ids (`finding-ledger`, `synthesize`,
27
+ * per-lens ids), and a reconstruct report's `totals` is the whole-pipeline cost,
28
+ * not the swept stage's. The optional `cost` field stays in the sweep/frontier
29
+ * layer; populating it correctly is deferred to P4b, when real multi-effort
30
+ * sweep data exists to verify the per-stage unit-telemetry attribution.
31
+ */
32
+ import { reviewRunGateSignal } from "./effort-calibration-review.js";
33
+ import { reconstructRunGateSignal } from "./effort-calibration-reconstruct.js";
34
+ const UNJUDGED = { passed: null, qualityScore: null };
35
+ /** Only a unit-sweep baseline case is the shared base-effort point per unit. */
36
+ const UNIT_SWEEP_BASE_PREFIX = "unit-sweep-base";
37
+ /**
38
+ * Ingest a review unit-sweep benchmark report into sweep runs. Candidate runs
39
+ * map to their swept unit at its varied effort; each unit-sweep BASELINE run
40
+ * becomes the base-effort point for EVERY swept unit (the units observed across
41
+ * candidate runs), since "unit X at base, others at base" is exactly the
42
+ * baseline. Only `unit-sweep-base-*` runs are treated as baselines: a report may
43
+ * also carry non-unit-sweep cases (e.g. `all-high`) that share `base_effort` but
44
+ * vary every unit at once, and replicating those would contaminate the
45
+ * single-variable frontier. Runs without a quality gate (failures) are emitted
46
+ * unjudged; runs that are neither a candidate nor a unit-sweep baseline are
47
+ * skipped (nothing single-variable to attribute).
48
+ */
49
+ export function ingestReviewReport(report) {
50
+ const runs = report.runs ?? [];
51
+ const sweptUnits = [
52
+ ...new Set(runs
53
+ .map((r) => r.varied_unit_id)
54
+ .filter((id) => typeof id === "string")),
55
+ ];
56
+ // The base-effort sample is the frontier's reference point; a candidate-only
57
+ // report (e.g. run with --unit-sweep-candidate-only) omits it, so the frontier
58
+ // would recommend an effort purely because the baseline was never measured.
59
+ const hasBaseline = runs.some((r) => r.base_effort && r.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX));
60
+ if (sweptUnits.length > 0 && !hasBaseline) {
61
+ throw new Error("review report has unit-sweep candidates but no unit-sweep baseline (base-effort sample missing; was it run with --unit-sweep-candidate-only?)");
62
+ }
63
+ // Points (stage|effort) with at least one completed (gated) run. Failed runs
64
+ // carry no quality gate or route telemetry, so they are attributed only to a
65
+ // point that also has completed evidence — three failures alone must not stamp
66
+ // a unit-effort on a route no retained run proved executed there.
67
+ const completedPoints = new Set();
68
+ for (const run of runs) {
69
+ if (!run.semantic_quality_gate)
70
+ continue;
71
+ if (run.varied_unit_id && run.varied_effort) {
72
+ completedPoints.add(`${run.varied_unit_id}|${run.varied_effort}`);
73
+ }
74
+ else if (run.base_effort && run.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX)) {
75
+ for (const unit of sweptUnits)
76
+ completedPoints.add(`${unit}|${run.base_effort}`);
77
+ }
78
+ }
79
+ const out = [];
80
+ for (const run of runs) {
81
+ const gated = Boolean(run.semantic_quality_gate);
82
+ const gate = run.semantic_quality_gate
83
+ ? reviewRunGateSignal(run.semantic_quality_gate)
84
+ : { ...UNJUDGED };
85
+ if (run.varied_unit_id && run.varied_effort) {
86
+ if (gated || completedPoints.has(`${run.varied_unit_id}|${run.varied_effort}`)) {
87
+ out.push({ stage: run.varied_unit_id, effort: run.varied_effort, gate });
88
+ }
89
+ }
90
+ else if (run.base_effort && run.case_id?.startsWith(UNIT_SWEEP_BASE_PREFIX)) {
91
+ for (const unit of sweptUnits) {
92
+ if (gated || completedPoints.has(`${unit}|${run.base_effort}`)) {
93
+ out.push({ stage: unit, effort: run.base_effort, gate });
94
+ }
95
+ }
96
+ }
97
+ }
98
+ return out;
99
+ }
100
+ /** The answer-support judge's LLM step; no-call-exempt, so it can early-exit. */
101
+ export const JUDGE_STEP_ID = "answer_support_judgment";
102
+ /**
103
+ * Whether the answer-support judge actually ran an LLM call at `effort` in this
104
+ * run. The judge is no-call-exempt: it early-exits (no convergent evidence
105
+ * clusters) or degrades to an inherited config, in which case it leaves no
106
+ * `answer_support_judgment` telemetry at the requested effort — so a judge
107
+ * sample must be backed by real judge telemetry, not just the requested knob.
108
+ */
109
+ function judgeExercisedAt(run, effort) {
110
+ return (run.units ?? []).some((u) => u.step_id === JUDGE_STEP_ID &&
111
+ u.effort === effort &&
112
+ (u.llm_call_count ?? 0) >= 1);
113
+ }
114
+ /**
115
+ * Whether a completed run's telemetry shows the swept stage actually ran at
116
+ * `effort` — the requested knob is never trusted over telemetry. The author
117
+ * stage always runs, so its `metadata.applied_effort` must equal the point (a
118
+ * route that ignored the pin, or a recovery de-escalation, is not an `effort`
119
+ * sample). The judge stage must show an answer_support_judgment call at that
120
+ * effort. Runs that don't match are not evidence for this frontier point.
121
+ */
122
+ function appliedEffortMatches(run, stage, effort) {
123
+ return stage === "judge"
124
+ ? judgeExercisedAt(run, effort)
125
+ : run.metadata?.applied_effort === effort;
126
+ }
127
+ /**
128
+ * Derive the (stage, effort) a reconstruct report pinned. A judge override marks
129
+ * a judge-stage report ONLY when it pins a judge EFFORT; a model-only override
130
+ * (effort null/absent) varied the judge model, not an effort, so there is no
131
+ * effort point to calibrate and this returns null (the caller must pass an
132
+ * explicit tag) rather than mislabeling it as author. Without a judge override
133
+ * it is an author-stage report at the pinned `requested_effort`, falling back to
134
+ * the first run's telemetry `applied_effort` when the settings chain governed
135
+ * the effort. Returns null when no effort can be attributed.
136
+ */
137
+ export function deriveReconstructTag(report) {
138
+ const judgeOverride = report.requested_judge_override;
139
+ if (judgeOverride) {
140
+ return judgeOverride.effort
141
+ ? { stage: "judge", effort: judgeOverride.effort }
142
+ : null;
143
+ }
144
+ if (report.requested_effort) {
145
+ return { stage: "author", effort: report.requested_effort };
146
+ }
147
+ const applied = report.runs?.[0]?.metadata?.applied_effort;
148
+ if (applied)
149
+ return { stage: "author", effort: applied };
150
+ return null;
151
+ }
152
+ /**
153
+ * Ingest a reconstruct benchmark report into sweep runs for one (stage, effort)
154
+ * point. The point is the explicit `tag` when given, else derived from the
155
+ * report's pinned knob. A completed run only contributes a sample when its
156
+ * telemetry shows the swept stage actually ran at that effort (author:
157
+ * applied_effort matches; judge: an answer_support_judgment call at that effort),
158
+ * so a pin the route ignored, a recovery de-escalation, or a judge early-exit
159
+ * does not fabricate a sample. Their golden gate is then distilled. Failed runs
160
+ * (no telemetry) are emitted unjudged for the author stage only — they cannot
161
+ * prove the judge ran. Throws when no tag can be determined.
162
+ */
163
+ export function ingestReconstructReport(report, tag) {
164
+ const point = tag ?? deriveReconstructTag(report);
165
+ if (!point) {
166
+ throw new Error("reconstruct report pins no effort (no judge override, requested_effort, or applied_effort); pass an explicit stage:effort tag");
167
+ }
168
+ const out = [];
169
+ for (const run of report.runs ?? []) {
170
+ if (!appliedEffortMatches(run, point.stage, point.effort)) {
171
+ continue; // telemetry doesn't show this stage running at this effort
172
+ }
173
+ out.push({
174
+ stage: point.stage,
175
+ effort: point.effort,
176
+ gate: reconstructRunGateSignal(run.quality_gate),
177
+ });
178
+ }
179
+ // Failed runs carry no telemetry, so attribute them only to the author stage
180
+ // (which always runs) AND only when at least one completed run was retained at
181
+ // this point — that retained run's telemetry is what proves the model/route/
182
+ // effort. A source whose runs all failed proves no route/effort, so its
183
+ // failures are not turned into route-keyed samples.
184
+ if (point.stage === "author" && out.length > 0) {
185
+ const failedCount = report.reconstruct_extension?.failed_runs?.length ?? 0;
186
+ for (let i = 0; i < failedCount; i++) {
187
+ out.push({ stage: "author", effort: point.effort, gate: { ...UNJUDGED } });
188
+ }
189
+ }
190
+ return out;
191
+ }