@lcv-ideas-software/cross-review 4.2.5 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,36 @@ standard `v00.00.00`; npm package versions remain SemVer.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [v04.03.00] — 2026-06-05
11
+
12
+ **Minor — P1/P2/P3 audit follow-up.** This release closes the first concrete
13
+ items from the post-v4.2.5 runtime/session audit: unresolved evidence is harder
14
+ to miss at finalization time, fixture-level regressions can be evaluated
15
+ offline, and operators get a read-only peer reliability report without changing
16
+ peer selection.
17
+
18
+ ### Added
19
+
20
+ - Added `session_peer_reliability_report`, a read-only MCP tool that aggregates
21
+ per-peer parser warnings, decision quality, rejected/provider failures,
22
+ evidence checklist dispositions, fabrication-related events, latency and
23
+ cost.
24
+ - Added `npm run eval:fixtures`, an offline fixture harness for truthfulness
25
+ preflight, parser diagnostics and report rendering contracts. It does not
26
+ start provider sessions or call reviewers.
27
+ - `session_report` now includes an **Unresolved Evidence Disposition** section
28
+ when checklist items remain `open` or `not_resurfaced`.
29
+
30
+ ### Changed
31
+
32
+ - Automatic convergence with unresolved checklist items now finalizes with
33
+ `unanimous_ready_with_unresolved_evidence` or
34
+ `recovered_unanimity_with_unresolved_evidence` instead of a plain success
35
+ reason.
36
+ - Finalization now emits `session.evidence_checklist_unresolved_on_finalize`
37
+ with unresolved counts and item summaries when a session closes while
38
+ evidence asks are still open or only inferred as not resurfaced.
39
+
10
40
  ## [v04.02.05] — 2026-06-05
11
41
 
12
42
  **Patch — session audit hardening.** This release closes follow-ups from the
package/README.md CHANGED
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
24
24
  npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
25
25
  ```
26
26
 
27
- **Status.** Stable. Current release: **v04.02.05** (npm package `4.2.5`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
27
+ **Status.** Stable. Current release: **v04.03.00** (npm package `4.3.0`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
28
28
 
29
29
  > **Project renamed 2026-05-15.** This project was previously published as
30
30
  > [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
@@ -38,6 +38,7 @@ The version history at a glance:
38
38
 
39
39
  | Release | Scope |
40
40
  | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
41
+ | **`v04.03.00`** | Minor — P1/P2/P3 follow-up with unresolved-evidence close-out visibility, an offline fixture eval harness, and a read-only peer reliability report. |
41
42
  | **`v04.02.05`** | Patch — harden session auditability with terminal events, cost split reporting, `not_resurfaced` visibility, and relator provenance checks for session IDs/GitHub URLs. |
42
43
  | **`v04.02.04`** | Patch — harden truthfulness preflight auditability, add a read-only preflight retest tool, and reduce false parser warnings for attached/log evidence. |
43
44
  | **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
@@ -211,6 +212,7 @@ these environment variables before running real sessions (example):
211
212
  - `session_metrics`
212
213
  - `session_doctor`
213
214
  - `session_report`
215
+ - `session_peer_reliability_report`
214
216
  - `session_check_convergence`
215
217
  - `session_truthfulness_preflight_check`
216
218
  - `session_attach_evidence`
@@ -228,7 +230,12 @@ these environment variables before running real sessions (example):
228
230
  lack terminal events, and reports peer-call cost separately from generation
229
231
  artifact cost. `session_report` uses the same split and calls out
230
232
  `not_resurfaced` evidence checklist items as inference-only, not proof that the
231
- requested evidence was satisfied.
233
+ requested evidence was satisfied. If a session otherwise reaches unanimity with
234
+ open or `not_resurfaced` checklist items, finalization records an
235
+ `*_with_unresolved_evidence` outcome reason and emits a durable unresolved
236
+ evidence event. `session_peer_reliability_report` is read-only and aggregates
237
+ per-peer parser warnings, evidence ask status, provider failures, cost and
238
+ latency.
232
239
 
233
240
  ## Repository conventions
234
241
 
@@ -0,0 +1,54 @@
1
/**
 * Fixture table for truthfulness checks. Each entry carries a task statement,
 * the runtime facts it is compared against, whether attachments are present,
 * and the expected pass/fail result. Failing entries also name the issue class
 * that is expected to be reported.
 */
export declare const truthfulnessCases: readonly [{
    // Task claims 4.2.4 as the current runtime while the facts say 4.2.5.
    readonly name: "current runtime contradiction is blocked";
    readonly input: {
        readonly task: "The current cross-review runtime is 4.2.4.";
        readonly runtimeFacts: {
            readonly runtime_version: "4.2.5";
            readonly release_date: "2026-06-05";
        };
        readonly attachmentsPresent: false;
    };
    readonly expectPass: false;
    readonly expectIssueClass: "runtime_contradiction";
}, {
    // Task and runtime facts agree on 4.2.5; no issue class is expected.
    readonly name: "matching current runtime facts pass";
    readonly input: {
        readonly task: "server_info shows current cross-review runtime 4.2.5.";
        readonly runtimeFacts: {
            readonly runtime_version: "4.2.5";
            readonly release_date: "2026-06-05";
        };
        readonly attachmentsPresent: false;
    };
    readonly expectPass: true;
}, {
    // Past-tense version claim with no attachments present.
    readonly name: "historical timing claim needs snapshot evidence";
    readonly input: {
        readonly task: "When the audit began, cross-review was running 4.2.4.";
        readonly runtimeFacts: {
            readonly runtime_version: "4.2.5";
            readonly release_date: "2026-06-05";
        };
        readonly attachmentsPresent: false;
    };
    readonly expectPass: false;
    readonly expectIssueClass: "unsupported_historical_claim";
}];
37
/**
 * Fixture table for peer-status parsing. `text` is the raw peer reply,
 * `expectStatus` the status the parser should report, and exactly one of
 * `expectWarning` (must be present) or `absentWarning` (must not be present)
 * names the parser warning being pinned.
 */
export declare const parserCases: readonly [{
    readonly name: "verified with empty evidence gets empty-evidence warning";
    readonly text: string;
    readonly expectStatus: "READY";
    readonly expectWarning: "verified_without_evidence_sources";
}, {
    readonly name: "verified with attached evidence path is concrete";
    readonly text: string;
    readonly expectStatus: "READY";
    readonly absentWarning: "verified_without_evidence_sources";
}];
48
/**
 * Fixture table for report rendering. The cost fields are USD amounts
 * (`totalCost` equals `peerCost + generationCost`), and `unresolvedAsk` is the
 * evidence request text that the rendered report is expected to contain.
 */
export declare const reportCases: readonly [{
    readonly name: "cost split and unresolved evidence are surfaced";
    readonly peerCost: 14.652426;
    readonly generationCost: 1.876718;
    readonly totalCost: 16.529144;
    readonly unresolvedAsk: "attach raw npm test output";
}];
@@ -0,0 +1,216 @@
1
+ import assert from "node:assert/strict";
2
+ import fs from "node:fs";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import { loadConfig } from "../src/core/config.js";
6
+ import { truthfulnessPreflight } from "../src/core/orchestrator.js";
7
+ import { sessionReportMarkdown } from "../src/core/reports.js";
8
+ import { SessionStore } from "../src/core/session-store.js";
9
+ import { parsePeerStatus } from "../src/core/status.js";
10
/**
 * Creates a new, uniquely named directory under the OS temp directory.
 *
 * @param label Short tag embedded in the directory name prefix
 *   (`cross-review-eval-<label>-`).
 * @returns The path of the directory that was created.
 */
function evalTmpDir(label) {
  const prefix = path.join(os.tmpdir(), `cross-review-eval-${label}-`);
  return fs.mkdtempSync(prefix);
}
13
/**
 * Truthfulness fixtures: a task statement plus the runtime facts it is checked
 * against, the expected pass/fail result and, for failing cases, the issue
 * class that must be reported.
 */
export const truthfulnessCases = (() => {
  // All fixtures are evaluated against the same reported runtime. A fresh
  // object is produced per case so no two fixtures share a reference.
  const runtimeAt425 = () => ({ runtime_version: "4.2.5", release_date: "2026-06-05" });

  const contradiction = {
    name: "current runtime contradiction is blocked",
    input: {
      task: "The current cross-review runtime is 4.2.4.",
      runtimeFacts: runtimeAt425(),
      attachmentsPresent: false,
    },
    expectPass: false,
    expectIssueClass: "runtime_contradiction",
  };

  const matching = {
    name: "matching current runtime facts pass",
    input: {
      task: "server_info shows current cross-review runtime 4.2.5.",
      runtimeFacts: runtimeAt425(),
      attachmentsPresent: false,
    },
    expectPass: true,
  };

  const historical = {
    name: "historical timing claim needs snapshot evidence",
    input: {
      task: "When the audit began, cross-review was running 4.2.4.",
      runtimeFacts: runtimeAt425(),
      attachmentsPresent: false,
    },
    expectPass: false,
    expectIssueClass: "unsupported_historical_claim",
  };

  return [contradiction, matching, historical];
})();
44
/**
 * Peer-status parser fixtures. Each case supplies a serialised peer reply, the
 * status the parser must report, and one warning that must either be present
 * (`expectWarning`) or absent (`absentWarning`).
 */
export const parserCases = (() => {
  // Reply that claims "verified" confidence but lists no evidence sources.
  const replyWithoutEvidence = {
    status: "READY",
    summary: "ok",
    confidence: "verified",
    evidence_sources: [],
    caller_requests: [],
    follow_ups: [],
  };

  // Same reply, but citing an attached evidence file.
  const replyWithAttachedEvidence = {
    status: "READY",
    summary: "ok",
    confidence: "verified",
    evidence_sources: ["evidence/2026-06-05T00-00-00Z-raw-smoke.txt: npm test 42 passed"],
    caller_requests: [],
    follow_ups: [],
  };

  return [
    {
      name: "verified with empty evidence gets empty-evidence warning",
      text: JSON.stringify(replyWithoutEvidence),
      expectStatus: "READY",
      expectWarning: "verified_without_evidence_sources",
    },
    {
      name: "verified with attached evidence path is concrete",
      text: JSON.stringify(replyWithAttachedEvidence),
      expectStatus: "READY",
      absentWarning: "verified_without_evidence_sources",
    },
  ];
})();
72
// Single report fixture: a peer/generation cost split whose parts add up to
// the total, plus one evidence ask that the rendered report must mention.
const costSplitWithUnresolvedAsk = {
  name: "cost split and unresolved evidence are surfaced",
  peerCost: 14.652426,
  generationCost: 1.876718,
  totalCost: 16.529144,
  unresolvedAsk: "attach raw npm test output",
};

/** Report-rendering fixtures consumed by the report evaluation loop. */
export const reportCases = [costSplitWithUnresolvedAsk];
81
// Run every truthfulness fixture through the preflight and compare the
// pass/fail result; fixtures that name an issue class must also see it reported.
truthfulnessCases.forEach((fixture) => {
  const { task, runtimeFacts, attachmentsPresent } = fixture.input;
  const verdict = truthfulnessPreflight({ task, runtimeFacts, attachmentsPresent });
  assert.equal(verdict.pass, fixture.expectPass, fixture.name);
  if (!("expectIssueClass" in fixture)) {
    return;
  }
  assert.ok(verdict.issue_classes.includes(fixture.expectIssueClass), fixture.name);
});
92
// Parse every fixture reply and pin both the reported status and the presence
// or absence of the warning the fixture names.
parserCases.forEach((fixture) => {
  const parsed = parsePeerStatus(fixture.text);
  assert.equal(parsed.status, fixture.expectStatus, fixture.name);
  if ("expectWarning" in fixture) {
    const raised = parsed.parser_warnings.includes(fixture.expectWarning);
    assert.ok(raised, fixture.name);
  }
  if ("absentWarning" in fixture) {
    const raised = parsed.parser_warnings.includes(fixture.absentWarning);
    assert.ok(!raised, fixture.name);
  }
});
102
// Report fixtures: for each case, initialise a session in a throw-away data
// directory, overwrite its metadata with a one-round fixture (one READY peer,
// one generation artifact, one `not_resurfaced` checklist item), persist it,
// then render the Markdown report and assert on its contents.
//
// NOTE(review): the temp data directory created by evalTmpDir() is not removed
// afterwards; consider cleaning it up once the store's handle lifetime is
// confirmed.
for (const testCase of reportCases) {
  const store = new SessionStore({
    ...loadConfig(),
    data_dir: evalTmpDir("report"),
  });
  const session = await store.init(`eval report fixture: ${testCase.name}`, "operator", []);
  const meta = store.read(session.session_id);
  const ts = new Date().toISOString();
  meta.rounds = [
    {
      round: 1,
      started_at: ts,
      completed_at: ts,
      caller_status: "READY",
      prompt_file: "agent-runs/round-1-prompt.md",
      peers: [
        {
          peer: "codex",
          provider: "openai",
          model: "gpt-5.5",
          status: "READY",
          structured: {
            status: "READY",
            summary: "ready",
            confidence: "verified",
            evidence_sources: ["server_info: version 4.2.5"],
            caller_requests: [],
            follow_ups: [],
          },
          text: "{}",
          raw: { fixture: true },
          decision_quality: "clean",
          parser_warnings: [],
          attempts: 1,
          latency_ms: 1,
          usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
          // Peer-call share of the cost split.
          cost: {
            currency: "USD",
            estimated: false,
            source: "configured-rate",
            total_cost: testCase.peerCost,
          },
        },
      ],
      rejected: [],
      convergence: {
        converged: true,
        reason: "fixture",
        ready_peers: ["codex"],
        not_ready_peers: [],
        needs_evidence_peers: [],
        rejected_peers: [],
        skipped_peers: [],
        decision_quality: {
          codex: "clean",
          claude: "clean",
          gemini: "clean",
          deepseek: "clean",
          grok: "clean",
          perplexity: "clean",
        },
        blocking_details: [],
      },
    },
  ];
  meta.generation_files = [
    {
      round: 0,
      peer: "codex",
      label: "initial_draft",
      path: "agent-runs/round-0-initial-draft.md",
      ts,
      usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
      // Generation-artifact share of the cost split.
      cost: {
        currency: "USD",
        estimated: false,
        source: "configured-rate",
        total_cost: testCase.generationCost,
      },
    },
  ];
  meta.totals.cost = {
    currency: "USD",
    estimated: false,
    source: "configured-rate",
    total_cost: testCase.totalCost,
  };
  // NOTE(review): addressed_at_round is 2 although only round 1 is defined in
  // this fixture; presumably the round in which the ask was not repeated —
  // confirm against the checklist semantics.
  meta.evidence_checklist = [
    {
      id: "eval-1",
      peer: "codex",
      first_round: 1,
      last_round: 1,
      round_count: 1,
      ask: testCase.unresolvedAsk,
      first_seen_at: ts,
      last_seen_at: ts,
      status: "not_resurfaced",
      addressed_at_round: 2,
      address_method: "resurfacing",
    },
  ];
  // Persist the hand-built metadata, then render from a fresh read so the
  // report is produced from what is actually on disk.
  fs.writeFileSync(store.metaPath(session.session_id), JSON.stringify(meta));
  const report = sessionReportMarkdown(store.read(session.session_id), []);
  // Build the expected cost line from the fixture instead of hard-coding one
  // case's numbers, so additional reportCases entries are checked correctly.
  // NOTE(review): assumes the report prints USD amounts with six decimals, as
  // the previously hard-coded expectation did — confirm against the renderer.
  const expectedCostLine = `$${testCase.totalCost.toFixed(6)} USD = $${testCase.peerCost.toFixed(6)} peer + $${testCase.generationCost.toFixed(6)} generation`;
  assert.ok(
    report.includes(expectedCostLine),
    `${testCase.name}: report must contain cost split "${expectedCostLine}"`,
  );
  assert.ok(
    report.includes("## Unresolved Evidence Disposition"),
    `${testCase.name}: report must contain the Unresolved Evidence Disposition section`,
  );
  assert.ok(
    report.includes(testCase.unresolvedAsk),
    `${testCase.name}: report must contain the unresolved ask "${testCase.unresolvedAsk}"`,
  );
}
210
// Emit a one-line JSON summary with the number of fixtures in each table.
// Reaching this point means every assertion above passed.
const evalSummary = {
  ok: true,
  truthfulness_cases: truthfulnessCases.length,
  parser_cases: parserCases.length,
  report_cases: reportCases.length,
};
console.log(JSON.stringify(evalSummary));
216
+ //# sourceMappingURL=eval-fixtures.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-fixtures.js","sourceRoot":"","sources":["../../scripts/eval-fixtures.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,qBAAqB,KAAK,GAAG,CAAC,CAAC,CAAC;AAC/E,CAAC;AAED,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B;QACE,IAAI,EAAE,0CAA0C;QAChD,KAAK,EAAE;YACL,IAAI,EAAE,4CAA4C;YAClD,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,uBAAuB;KAC1C;IACD;QACE,IAAI,EAAE,qCAAqC;QAC3C,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,IAAI;KACjB;IACD;QACE,IAAI,EAAE,iDAAiD;QACvD,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,8BAA8B;KACjD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,0DAA0D;QAChE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,EAAE;YACpB,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;IACD;QACE,IAAI,EAAE,kDAAkD;QACxD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,CAAC,iEAAiE,CAAC;YACrF,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,iDAAiD;QACvD,QAAQ,EAAE,SAAS;QACnB,cAAc,EAAE,QAAQ;QACxB,SAAS,EAAE,SAAS;QACpB,aAAa,EAAE,4BAA4B;KAC5C;CACO,CAAC;AAEX,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC
;IACzC,MAAM,MAAM,GAAG,qBAAqB,CAAC;QACnC,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,IAAI;QACzB,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,YAAY;QACzC,kBAAkB,EAAE,QAAQ,CAAC,KAAK,CAAC,kBAAkB;KACtD,CAAC,CAAC;IACH,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9D,IAAI,kBAAkB,IAAI,QAAQ,EAAE,CAAC;QACnC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAClE,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACpF,CAAC;IACD,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC;QAC7B,GAAG,UAAU,EAAE;QACf,QAAQ,EAAE,UAAU,CAAC,QAAQ,CAAC;KAC/B,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,EAAE,CAAC,CAAC;IAC1F,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IAC5C,MAAM,EAAE,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACpC,IAAI,CAAC,MAAM,GAAG;QACZ;YACE,KAAK,EAAE,CAAC;YACR,UAAU,EAAE,EAAE;YACd,YAAY,EAAE,EAAE;YAChB,aAAa,EAAE,OAAO;YACtB,WAAW,EAAE,8BAA8B;YAC3C,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,OAAO;oBACb,QAAQ,EAAE,QAAQ;oBAClB,KAAK,EAAE,SAAS;oBAChB,MAAM,EAAE,OAAO;oBACf,UAAU,EAAE;wBACV,MAAM,EAAE,OAAO;wBACf,OAAO,EAAE,OAAO;wBAChB,UAAU,EAAE,UAAU;wBACtB,gBAAgB,EAAE,CAAC,4BAA4B,CAAC;wBAChD,eAAe,EAAE,EAAE;wBACnB,UAAU,EAAE,EAAE;qBACf;oBACD,IAAI,EAAE,IAAI;oBACV,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE;oBACtB,gBAAgB,EAAE,OAAO;oBACzB,eAAe,EAAE,EAAE;oBACnB,QAAQ,EAAE,CAAC;oBACX,UAAU,EAAE,CAAC;oBACb,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;oBAC7D,IAAI,EAAE;wBACJ,QAAQ,EAAE,KAAK;wBACf,SA
AS,EAAE,KAAK;wBAChB,MAAM,EAAE,iBAAiB;wBACzB,UAAU,EAAE,QAAQ,CAAC,QAAQ;qBAC9B;iBACF;aACF;YACD,QAAQ,EAAE,EAAE;YACZ,WAAW,EAAE;gBACX,SAAS,EAAE,IAAI;gBACf,MAAM,EAAE,SAAS;gBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;gBACtB,eAAe,EAAE,EAAE;gBACnB,oBAAoB,EAAE,EAAE;gBACxB,cAAc,EAAE,EAAE;gBAClB,aAAa,EAAE,EAAE;gBACjB,gBAAgB,EAAE;oBAChB,KAAK,EAAE,OAAO;oBACd,MAAM,EAAE,OAAO;oBACf,MAAM,EAAE,OAAO;oBACf,QAAQ,EAAE,OAAO;oBACjB,IAAI,EAAE,OAAO;oBACb,UAAU,EAAE,OAAO;iBACpB;gBACD,gBAAgB,EAAE,EAAE;aACrB;SACF;KACF,CAAC;IACF,IAAI,CAAC,gBAAgB,GAAG;QACtB;YACE,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,eAAe;YACtB,IAAI,EAAE,qCAAqC;YAC3C,EAAE;YACF,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;YAC7D,IAAI,EAAE;gBACJ,QAAQ,EAAE,KAAK;gBACf,SAAS,EAAE,KAAK;gBAChB,MAAM,EAAE,iBAAiB;gBACzB,UAAU,EAAE,QAAQ,CAAC,cAAc;aACpC;SACF;KACF,CAAC;IACF,IAAI,CAAC,MAAM,CAAC,IAAI,GAAG;QACjB,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,KAAK;QAChB,MAAM,EAAE,iBAAiB;QACzB,UAAU,EAAE,QAAQ,CAAC,SAAS;KAC/B,CAAC;IACF,IAAI,CAAC,kBAAkB,GAAG;QACxB;YACE,EAAE,EAAE,QAAQ;YACZ,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC;YACd,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,CAAC;YACd,GAAG,EAAE,QAAQ,CAAC,aAAa;YAC3B,aAAa,EAAE,EAAE;YACjB,YAAY,EAAE,EAAE;YAChB,MAAM,EAAE,gBAAgB;YACxB,kBAAkB,EAAE,CAAC;YACrB,cAAc,EAAE,aAAa;SAC9B;KACF,CAAC;IACF,EAAE,CAAC,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;IAC3E,MAAM,MAAM,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,CAAC,CAAC;IACzE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,yDAAyD,CAAC,CAAC,CAAC;IACtF,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,oCAAoC,CAAC,CAAC,CAAC;IACjE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;AACrD,CAAC;AAED,OAAO,CAAC,GAAG,CACT,IAAI,CAAC,SAAS,CAAC;IACb,EAAE,EAAE,IAAI;IACR,kBAAkB,EAAE,iBAAiB,CAAC,MAAM;IAC5C,YAAY,EAAE,WAAW,CAAC,MAAM;IAChC,YAAY,EAAE,WAAW,CAAC,MAAM;CACjC,CAAC,CACH,CAAC"}
@@ -1248,6 +1248,193 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
1248
1248
  assert.ok(notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."), "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics");
1249
1249
  console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
1250
1250
  }
1251
+ // v4.3.0 / P1: unanimous READY with unresolved evidence must not look like a
1252
+ // plain unanimous_ready close-out. `not_resurfaced` is inference-only: it may
1253
+ // allow convergence, but the final metadata/report must keep that disposition
1254
+ // visible for operators.
1255
{
  // Two rounds against a single peer: the first draft carries the
  // FORCE_NEEDS_EVIDENCE marker, the second is a clean revision. The session is
  // then expected to converge with the "unresolved evidence" outcome reason,
  // emit the matching audit event type, and render the disposition section.
  const { sessionReportMarkdown } = await import("../src/core/reports.js");
  const emittedTypes = [];
  const orchestrator = new CrossReviewOrchestrator(
    {
      ...loadConfig(),
      data_dir: smokeTmpDir("unresolved-evidence-finalize"),
      // Cost limits raised to 10000 for this fixture.
      budget: {
        ...loadConfig().budget,
        max_session_cost_usd: 10000,
        preflight_max_round_cost_usd: 10000,
        until_stopped_max_cost_usd: 10000,
      },
      // Evidence-judge autowire switched off for this fixture.
      evidence_judge_autowire: {
        ...loadConfig().evidence_judge_autowire,
        mode: "off",
        active: false,
      },
    },
    // Only the event type is recorded; the assertions below check for one type.
    (event) => emittedTypes.push(event.type),
  );
  const fixtureTask = "P1 unresolved evidence finalization guard fixture.";
  const firstRound = await orchestrator.askPeers({
    task: fixtureTask,
    draft: "FORCE_NEEDS_EVIDENCE",
    caller: "operator",
    peers: ["claude"],
  });
  const secondRound = await orchestrator.askPeers({
    session_id: firstRound.session.session_id,
    task: fixtureTask,
    draft: "Clean revised draft, no test marker present.",
    caller: "operator",
    peers: ["claude"],
  });

  assert.equal(secondRound.converged, true);
  assert.equal(secondRound.session.outcome, "converged");
  assert.equal(
    secondRound.session.outcome_reason,
    "unanimous_ready_with_unresolved_evidence",
    "v4.3.0 / P1: convergence with not_resurfaced evidence must not finalize as plain unanimous_ready",
  );
  assert.ok(
    emittedTypes.includes("session.evidence_checklist_unresolved_on_finalize"),
    "v4.3.0 / P1: unresolved evidence close-out must emit an audit event",
  );

  const sessionId = secondRound.session.session_id;
  const renderedReport = sessionReportMarkdown(
    orchestrator.store.read(sessionId),
    orchestrator.store.readEvents(sessionId),
  );
  assert.ok(
    renderedReport.includes("## Unresolved Evidence Disposition"),
    "v4.3.0 / P1: session_report must include unresolved-evidence disposition table",
  );
  assert.ok(
    renderedReport.includes("not_resurfaced"),
    "v4.3.0 / P1: session_report must name not_resurfaced unresolved items",
  );
  console.log("[smoke] unresolved_evidence_finalization_guard_test: PASS");
}
1296
+ // v4.3.0 / P3: read-only peer reliability telemetry. This is deliberately
1297
+ // observational; it must not change peer selection or mutate sessions.
1298
{
  // Builds one session by hand (a NEEDS_EVIDENCE peer, a READY peer with a
  // parser warning, a rejected peer, one not_resurfaced ask and one fabrication
  // event), persists it, and checks the per-peer counters that
  // peerReliabilityReport() returns.
  const { SessionStore } = await import("../src/core/session-store.js");
  const store = new SessionStore({
    ...config,
    data_dir: smokeTmpDir("peer-reliability"),
  });
  const created = await store.init("peer reliability report fixture", "operator", []);
  const sessionId = created.session_id;
  const meta = store.read(sessionId);
  const ts = new Date().toISOString();
  const rawLogAsk = "attach raw npm test output";

  // Peer that asks for evidence; clean decision, no parser warnings.
  const claudeResult = {
    peer: "claude",
    provider: "anthropic",
    model: "claude-opus-4-8",
    status: "NEEDS_EVIDENCE",
    structured: {
      status: "NEEDS_EVIDENCE",
      summary: "needs log",
      confidence: "verified",
      evidence_sources: ["src/core/session-store.ts:1"],
      caller_requests: [rawLogAsk],
      follow_ups: [],
    },
    text: "{}",
    raw: { fixture: true },
    decision_quality: "clean",
    parser_warnings: [],
    attempts: 1,
    latency_ms: 50,
    usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
    cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 1 },
  };

  // READY peer carrying exactly one parser warning.
  const grokResult = {
    peer: "grok",
    provider: "xai",
    model: "grok-4.3",
    status: "READY",
    structured: {
      status: "READY",
      summary: "ready",
      confidence: "verified",
      evidence_sources: ["server_info: version 4.2.5"],
      caller_requests: [],
      follow_ups: [],
    },
    text: "{}",
    raw: { fixture: true },
    decision_quality: "format_warning",
    parser_warnings: ["verified_without_concrete_evidence_sources"],
    attempts: 1,
    latency_ms: 100,
    usage: { input_tokens: 20, output_tokens: 10, total_tokens: 30 },
    cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 2 },
  };

  // Peer whose call failed with a provider error.
  const perplexityRejection = {
    peer: "perplexity",
    provider: "perplexity",
    model: "sonar-reasoning-pro",
    failure_class: "provider_error",
    message: "fixture provider error",
    retryable: false,
    attempts: 1,
    latency_ms: 0,
  };

  meta.rounds = [
    {
      round: 1,
      started_at: ts,
      completed_at: ts,
      caller_status: "READY",
      prompt_file: "agent-runs/round-1-prompt.md",
      peers: [claudeResult, grokResult],
      rejected: [perplexityRejection],
      convergence: {
        converged: false,
        reason: "fixture",
        ready_peers: ["grok"],
        not_ready_peers: [],
        needs_evidence_peers: ["claude"],
        rejected_peers: ["perplexity"],
        skipped_peers: [],
        decision_quality: {
          codex: "clean",
          claude: "clean",
          gemini: "clean",
          deepseek: "clean",
          grok: "format_warning",
          perplexity: "failed",
        },
        blocking_details: ["claude:NEEDS_EVIDENCE", "perplexity:provider_error"],
      },
    },
  ];
  meta.evidence_checklist = [
    {
      id: "rel-1",
      peer: "claude",
      first_round: 1,
      last_round: 1,
      round_count: 1,
      ask: rawLogAsk,
      first_seen_at: ts,
      last_seen_at: ts,
      status: "not_resurfaced",
      addressed_at_round: 2,
      address_method: "resurfacing",
    },
  ];

  // Persist the fixture metadata, then append one fabrication event for grok.
  fs.writeFileSync(store.metaPath(sessionId), JSON.stringify(meta));
  await store.appendEvent({
    ts,
    type: "session.lead_meta_audit_fabrication_detected",
    session_id: sessionId,
    message: "fixture fabrication event",
    data: { peer: "grok" },
  });

  const report = store.peerReliabilityReport();
  const byPeer = report.by_peer;
  assert.equal(report.scope, "all");
  assert.equal(byPeer.claude?.needs_evidence, 1);
  assert.equal(byPeer.claude?.not_resurfaced_asks, 1);
  assert.equal(byPeer.grok?.ready, 1);
  assert.equal(byPeer.grok?.parser_warnings_total, 1);
  assert.equal(byPeer.grok?.fabrication_events, 1);
  assert.equal(byPeer.perplexity?.provider_errors, 1);
  console.log("[smoke] peer_reliability_report_test: PASS");
}
1425
+ // v4.3.0 / P2: offline declarative eval harness. This pins the existence of a
1426
+ // no-provider-call fixture runner so regressions found in real sessions can be
1427
+ // replayed without growing the ad hoc smoke body indefinitely.
1428
{
  // Static contract check: package.json must expose the eval script, and the
  // harness source must mention all three case tables while containing none of
  // the provider-review entry points. Both paths are read relative to the
  // current working directory.
  const manifest = JSON.parse(fs.readFileSync("package.json", "utf8"));
  assert.equal(
    manifest.scripts?.["eval:fixtures"],
    "tsx scripts/eval-fixtures.ts",
    "v4.3.0 / P2: package.json must expose the offline fixture eval runner",
  );

  const harnessSource = fs.readFileSync("scripts/eval-fixtures.ts", "utf8");
  const requiredTables = [/truthfulnessCases/, /parserCases/, /reportCases/];
  assert.ok(
    requiredTables.every((table) => table.test(harnessSource)),
    "v4.3.0 / P2: eval-fixtures must use declarative truthfulness/parser/report case tables",
  );

  const providerEntryPoints = /askPeers\(|runUntilUnanimous\(|session_start_round/;
  assert.ok(
    !providerEntryPoints.test(harnessSource),
    "v4.3.0 / P2: eval-fixtures must stay offline and avoid provider-review entry points",
  );
  console.log("[smoke] offline_fixture_eval_contract_test: PASS");
}
1251
1438
  // v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
1252
1439
  // orchestrator emits a one-shot warning when cumulative cost crosses
1253
1440
  // 75% of cost_ceiling_usd; the budget_warning_emitted flag persists