@lcv-ideas-software/cross-review 4.2.4 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1096,6 +1096,345 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
1096
1096
  assert.equal(entry?.chronic_blockers?.[0], "ask-codex-1", "chronic_blockers must contain the codex round_count=4 item id");
1097
1097
  console.log("[smoke] evidence_checklist_drilldown_test: PASS");
1098
1098
  }
// v4.2.5 — audit visibility for terminal events, the peer/generation cost
// split, and not_resurfaced checklist items.
// Background: the 2026-06-05 disk audit found sessions whose meta.json carried
// a terminal outcome with no matching terminal event, cost reports that mixed
// peer calls with lead-generation artifacts, and converged sessions holding
// not_resurfaced checklist items that were too easy to misread as satisfied.
{
  const { SessionStore } = await import("../src/core/session-store.js");
  const { sessionReportMarkdown } = await import("../src/core/reports.js");
  const store = new SessionStore({
    ...config,
    data_dir: smokeTmpDir("terminal-cost-evidence-audit"),
  });

  // Scenario-local helpers; they only wrap calls this block repeats.
  const openSession = (title) => store.init(title, "operator", []);
  const overwriteMeta = (sessionId, meta) => {
    fs.writeFileSync(store.metaPath(sessionId), JSON.stringify(meta));
  };
  const sessionHasEvent = (sessionId, matches) => store.readEvents(sessionId).some(matches);
  const configuredUsd = (total_cost) => ({
    currency: "USD",
    estimated: false,
    source: "configured-rate",
    total_cost,
  });

  // --- terminal events: finalize() ---
  const { session_id: finalizedId } = await openSession("terminal event fixture");
  await store.finalize(finalizedId, "aborted", "smoke_terminal_abort");
  assert.ok(
    sessionHasEvent(
      finalizedId,
      ({ type, data }) =>
        type === "session.finalized" &&
        data?.outcome === "aborted" &&
        data?.reason === "smoke_terminal_abort",
    ),
    "v4.2.5 / terminal_events: finalize() must persist a session.finalized event",
  );

  // --- terminal events: markCancelled() ---
  const { session_id: cancelledId } = await openSession("cancelled terminal event fixture");
  await store.markCancelled(cancelledId, "session_cancelled");
  assert.ok(
    sessionHasEvent(
      cancelledId,
      ({ type, data }) => type === "session.cancelled" && data?.reason === "session_cancelled",
    ),
    "v4.2.5 / terminal_events: markCancelled() must persist a session.cancelled event",
  );

  // --- terminal events: sweepIdle() ---
  // Backdate updated_at by 25 hours on disk so the sweep sees a stale session.
  const { session_id: sweptId } = await openSession("idle sweep terminal event fixture");
  const staleMeta = store.read(sweptId);
  const twentyFiveHoursMs = 25 * 60 * 60 * 1000;
  staleMeta.updated_at = new Date(Date.now() - twentyFiveHoursMs).toISOString();
  overwriteMeta(sweptId, staleMeta);
  const sweptSessions = await store.sweepIdle(0, "aborted", "smoke_idle_sweep");
  assert.equal(
    sweptSessions.some(({ session_id }) => session_id === sweptId),
    true,
    "v4.2.5 / terminal_events: sweepIdle() must finalize stale sessions",
  );
  assert.ok(
    sessionHasEvent(
      sweptId,
      ({ type, data }) =>
        type === "session.finalized" &&
        data?.outcome === "aborted" &&
        data?.reason === "smoke_idle_sweep" &&
        typeof data?.idle_ms === "number",
    ),
    "v4.2.5 / terminal_events: sweepIdle() must persist a session.finalized event",
  );

  // --- cost split: one peer call plus one generation artifact ---
  const { session_id: reportId } = await openSession("cost split report fixture");
  const costMeta = store.read(reportId);
  const nowIso = new Date().toISOString();
  const everyPeerClean = Object.fromEntries(
    ["codex", "claude", "gemini", "deepseek", "grok", "perplexity"].map((peer) => [peer, "clean"]),
  );
  const codexReadyResult = {
    peer: "codex",
    provider: "openai",
    model: "gpt-5",
    status: "READY",
    structured: { status: "READY", summary: "ready", confidence: "verified" },
    text: '{"status":"READY","summary":"ready","confidence":"verified"}',
    raw: { fixture: true },
    decision_quality: "clean",
    parser_warnings: [],
    attempts: 1,
    latency_ms: 10,
    usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
    cost: configuredUsd(14.652426),
  };
  const unanimousConvergence = {
    converged: true,
    reason: "unanimous",
    ready_peers: ["codex"],
    not_ready_peers: [],
    needs_evidence_peers: [],
    rejected_peers: [],
    skipped_peers: [],
    decision_quality: everyPeerClean,
    blocking_details: [],
  };
  costMeta.rounds = [
    {
      round: 1,
      started_at: nowIso,
      completed_at: nowIso,
      caller_status: "READY",
      prompt_file: "agent-runs/round-1-prompt.md",
      peers: [codexReadyResult],
      rejected: [],
      convergence: unanimousConvergence,
    },
  ];
  costMeta.generation_files = [
    {
      round: 0,
      peer: "gemini",
      label: "initial_draft",
      path: "agent-runs/round-0-initial-draft.md",
      ts: nowIso,
      usage: { input_tokens: 6, output_tokens: 4, total_tokens: 10 },
      cost: configuredUsd(1.876718),
    },
  ];
  // Session total is the peer-call cost plus the generation cost above.
  costMeta.totals.cost = configuredUsd(16.529144);
  overwriteMeta(reportId, costMeta);
  const costReport = sessionReportMarkdown(store.read(reportId), []);
  assert.ok(
    costReport.includes("- Peer call cost: $14.652426 USD"),
    "v4.2.5 / cost_split: session report must show peer-call cost separately",
  );
  assert.ok(
    costReport.includes("- Generation cost: $1.876718 USD"),
    "v4.2.5 / cost_split: session report must show lead-generation cost separately",
  );
  assert.ok(
    costReport.includes("$16.529144 USD = $14.652426 peer + $1.876718 generation"),
    "v4.2.5 / cost_split: session report must explicitly reconcile total = peer + generation",
  );

  // --- not_resurfaced visibility fixture ---
  const { session_id: notResurfacedId } = await openSession("not resurfaced visibility fixture");
  const notResurfacedMeta = store.read(notResurfacedId);
  notResurfacedMeta.evidence_checklist = [
    {
      id: "nr-1",
      peer: "deepseek",
      first_round: 1,
      last_round: 1,
      round_count: 1,
      ask: "attach raw npm ci output",
      first_seen_at: nowIso,
      last_seen_at: nowIso,
      status: "not_resurfaced",
      addressed_at_round: 2,
      address_method: "resurfacing",
    },
  ];
  overwriteMeta(notResurfacedId, notResurfacedMeta);

  // --- legacy gap fixture: terminal outcome written to meta.json directly,
  // so no terminal event is appended for it by this block ---
  const { session_id: legacyGapId } = await openSession("legacy terminal event gap fixture");
  const legacyMeta = store.read(legacyGapId);
  Object.assign(legacyMeta, {
    outcome: "aborted",
    outcome_reason: "legacy_without_terminal_event",
    updated_at: nowIso,
    convergence_health: {
      state: "stale",
      last_event_at: nowIso,
      detail: "legacy_without_terminal_event",
    },
  });
  overwriteMeta(legacyGapId, legacyMeta);

  // --- session_doctor must surface both conditions ---
  const { totals: doctorTotals, findings: doctorFindings } = await store.sessionDoctor(20);
  assert.equal(
    doctorTotals.not_resurfaced_evidence_sessions,
    1,
    "v4.2.5 / not_resurfaced: session_doctor totals must count not_resurfaced sessions separately",
  );
  assert.equal(
    doctorFindings.not_resurfaced_evidence_sessions[0]?.session_id,
    notResurfacedId,
    "v4.2.5 / not_resurfaced: session_doctor must enumerate not_resurfaced sessions",
  );
  assert.equal(
    doctorTotals.terminal_event_missing_sessions,
    1,
    "v4.2.5 / terminal_event_missing: session_doctor totals must count legacy terminal-event gaps",
  );
  assert.equal(
    doctorFindings.terminal_event_missing_sessions[0]?.session_id,
    legacyGapId,
    "v4.2.5 / terminal_event_missing: session_doctor must enumerate legacy terminal-event gaps",
  );
  assert.equal(
    doctorFindings.terminal_event_missing_sessions[0]?.terminal_event_expected,
    "session.finalized",
    "v4.2.5 / terminal_event_missing: session_doctor must report the expected terminal event",
  );

  // --- the session report must spell out what not_resurfaced means ---
  const notResurfacedReport = sessionReportMarkdown(store.read(notResurfacedId), []);
  assert.ok(
    notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."),
    "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics",
  );
  console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
}
// v4.3.0 / P1: unanimous READY with unresolved evidence must not look like a
// plain unanimous_ready close-out. `not_resurfaced` is inference-only: it may
// allow convergence, but the final metadata/report must keep that disposition
// visible for operators.
{
  const { sessionReportMarkdown } = await import("../src/core/reports.js");
  // Collects the type of every event the orchestrator emits to its callback.
  const unresolvedEvents = [];
  // Load the base config once so the top-level, budget and autowire spreads
  // all derive from the same snapshot instead of three separate loads.
  const baseConfig = loadConfig();
  const unresolvedConfig = {
    ...baseConfig,
    data_dir: smokeTmpDir("unresolved-evidence-finalize"),
    // NOTE(review): the 10000 caps presumably keep budget guards from
    // interfering with this fixture — confirm against the budget logic.
    budget: {
      ...baseConfig.budget,
      max_session_cost_usd: 10000,
      preflight_max_round_cost_usd: 10000,
      until_stopped_max_cost_usd: 10000,
    },
    // Evidence-judge autowire is switched off for this scenario.
    evidence_judge_autowire: {
      ...baseConfig.evidence_judge_autowire,
      mode: "off",
      active: false,
    },
  };
  const unresolvedOrch = new CrossReviewOrchestrator(unresolvedConfig, (event) => unresolvedEvents.push(event.type));
  // Round 1: the draft carries the FORCE_NEEDS_EVIDENCE marker
  // (presumably recognised by the stub peer — verify).
  const unresolvedR1 = await unresolvedOrch.askPeers({
    task: "P1 unresolved evidence finalization guard fixture.",
    draft: "FORCE_NEEDS_EVIDENCE",
    caller: "operator",
    peers: ["claude"],
  });
  // Round 2: same session, revised draft without the marker.
  const unresolvedR2 = await unresolvedOrch.askPeers({
    session_id: unresolvedR1.session.session_id,
    task: "P1 unresolved evidence finalization guard fixture.",
    draft: "Clean revised draft, no test marker present.",
    caller: "operator",
    peers: ["claude"],
  });
  assert.equal(unresolvedR2.converged, true, "v4.3.0 / P1: round 2 must report converged=true");
  assert.equal(unresolvedR2.session.outcome, "converged", "v4.3.0 / P1: session outcome must be converged after round 2");
  assert.equal(unresolvedR2.session.outcome_reason, "unanimous_ready_with_unresolved_evidence", "v4.3.0 / P1: convergence with not_resurfaced evidence must not finalize as plain unanimous_ready");
  assert.ok(unresolvedEvents.includes("session.evidence_checklist_unresolved_on_finalize"), "v4.3.0 / P1: unresolved evidence close-out must emit an audit event");
  const unresolvedReport = sessionReportMarkdown(unresolvedOrch.store.read(unresolvedR2.session.session_id), unresolvedOrch.store.readEvents(unresolvedR2.session.session_id));
  assert.ok(unresolvedReport.includes("## Unresolved Evidence Disposition"), "v4.3.0 / P1: session_report must include unresolved-evidence disposition table");
  assert.ok(unresolvedReport.includes("not_resurfaced"), "v4.3.0 / P1: session_report must name not_resurfaced unresolved items");
  console.log("[smoke] unresolved_evidence_finalization_guard_test: PASS");
}
// v4.3.0 / P3: read-only peer reliability telemetry. This is deliberately
// observational; it must not change peer selection or mutate sessions.
{
  const { SessionStore } = await import("../src/core/session-store.js");
  const reliabilityStore = new SessionStore({
    ...config,
    data_dir: smokeTmpDir("peer-reliability"),
  });
  const reliabilitySession = await reliabilityStore.init("peer reliability report fixture", "operator", []);
  const reliabilityMeta = reliabilityStore.read(reliabilitySession.session_id);
  const ts = new Date().toISOString();
  // One round with three distinct peer outcomes: claude asks for evidence,
  // grok is READY with a parser warning, perplexity is rejected with a
  // provider error.
  reliabilityMeta.rounds = [
    {
      round: 1,
      started_at: ts,
      completed_at: ts,
      caller_status: "READY",
      prompt_file: "agent-runs/round-1-prompt.md",
      peers: [
        {
          peer: "claude",
          provider: "anthropic",
          model: "claude-opus-4-8",
          status: "NEEDS_EVIDENCE",
          structured: {
            status: "NEEDS_EVIDENCE",
            summary: "needs log",
            confidence: "verified",
            evidence_sources: ["src/core/session-store.ts:1"],
            caller_requests: ["attach raw npm test output"],
            follow_ups: [],
          },
          text: "{}",
          raw: { fixture: true },
          decision_quality: "clean",
          parser_warnings: [],
          attempts: 1,
          latency_ms: 50,
          usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
          cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 1 },
        },
        {
          peer: "grok",
          provider: "xai",
          model: "grok-4.3",
          status: "READY",
          structured: {
            status: "READY",
            summary: "ready",
            confidence: "verified",
            evidence_sources: ["server_info: version 4.2.5"],
            caller_requests: [],
            follow_ups: [],
          },
          text: "{}",
          raw: { fixture: true },
          decision_quality: "format_warning",
          parser_warnings: ["verified_without_concrete_evidence_sources"],
          attempts: 1,
          latency_ms: 100,
          usage: { input_tokens: 20, output_tokens: 10, total_tokens: 30 },
          cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 2 },
        },
      ],
      rejected: [
        {
          peer: "perplexity",
          provider: "perplexity",
          model: "sonar-reasoning-pro",
          failure_class: "provider_error",
          message: "fixture provider error",
          retryable: false,
          attempts: 1,
          latency_ms: 0,
        },
      ],
      convergence: {
        converged: false,
        reason: "fixture",
        ready_peers: ["grok"],
        not_ready_peers: [],
        needs_evidence_peers: ["claude"],
        rejected_peers: ["perplexity"],
        skipped_peers: [],
        decision_quality: {
          codex: "clean",
          claude: "clean",
          gemini: "clean",
          deepseek: "clean",
          grok: "format_warning",
          perplexity: "failed",
        },
        blocking_details: ["claude:NEEDS_EVIDENCE", "perplexity:provider_error"],
      },
    },
  ];
  // Claude's round-1 ask is recorded as not_resurfaced.
  reliabilityMeta.evidence_checklist = [
    {
      id: "rel-1",
      peer: "claude",
      first_round: 1,
      last_round: 1,
      round_count: 1,
      ask: "attach raw npm test output",
      first_seen_at: ts,
      last_seen_at: ts,
      status: "not_resurfaced",
      addressed_at_round: 2,
      address_method: "resurfacing",
    },
  ];
  fs.writeFileSync(reliabilityStore.metaPath(reliabilitySession.session_id), JSON.stringify(reliabilityMeta));
  // A fabrication event attributed to grok via data.peer.
  await reliabilityStore.appendEvent({
    ts,
    type: "session.lead_meta_audit_fabrication_detected",
    session_id: reliabilitySession.session_id,
    message: "fixture fabrication event",
    data: { peer: "grok" },
  });
  const reliability = reliabilityStore.peerReliabilityReport();
  assert.equal(reliability.scope, "all", "v4.3.0 / P3: peerReliabilityReport() called without arguments must report scope=all");
  assert.equal(reliability.by_peer.claude?.needs_evidence, 1, "v4.3.0 / P3: claude's NEEDS_EVIDENCE response must be counted");
  assert.equal(reliability.by_peer.claude?.not_resurfaced_asks, 1, "v4.3.0 / P3: claude's not_resurfaced checklist item must be counted");
  assert.equal(reliability.by_peer.grok?.ready, 1, "v4.3.0 / P3: grok's READY response must be counted");
  assert.equal(reliability.by_peer.grok?.parser_warnings_total, 1, "v4.3.0 / P3: grok's parser warning must be counted");
  assert.equal(reliability.by_peer.grok?.fabrication_events, 1, "v4.3.0 / P3: the fabrication event with data.peer=grok must be attributed to grok");
  assert.equal(reliability.by_peer.perplexity?.provider_errors, 1, "v4.3.0 / P3: perplexity's rejected provider_error must be counted");
  console.log("[smoke] peer_reliability_report_test: PASS");
}
// v4.3.0 / P2: contract check for the offline declarative eval harness. It
// pins that a fixture runner making no provider calls exists, so regressions
// seen in real sessions can be replayed there instead of growing this ad hoc
// smoke body indefinitely.
{
  // The npm script must point at the fixture runner.
  const manifest = JSON.parse(fs.readFileSync("package.json", "utf8"));
  assert.equal(
    manifest.scripts?.["eval:fixtures"],
    "tsx scripts/eval-fixtures.ts",
    "v4.3.0 / P2: package.json must expose the offline fixture eval runner",
  );

  // The runner source must mention all three declarative case tables.
  const harnessSource = fs.readFileSync("scripts/eval-fixtures.ts", "utf8");
  const requiredTables = [/truthfulnessCases/, /parserCases/, /reportCases/];
  assert.ok(
    requiredTables.every((pattern) => pattern.test(harnessSource)),
    "v4.3.0 / P2: eval-fixtures must use declarative truthfulness/parser/report case tables",
  );

  // The runner source must not reference any provider-review entry point.
  const providerEntryPoints = /askPeers\(|runUntilUnanimous\(|session_start_round/;
  assert.ok(
    !providerEntryPoints.test(harnessSource),
    "v4.3.0 / P2: eval-fixtures must stay offline and avoid provider-review entry points",
  );
  console.log("[smoke] offline_fixture_eval_contract_test: PASS");
}
1099
1438
  // v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
1100
1439
  // orchestrator emits a one-shot warning when cumulative cost crosses
1101
1440
  // 75% of cost_ceiling_usd; the budget_warning_emitted flag persists
@@ -5264,6 +5603,16 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
5264
5603
  narrativeCorpus: "",
5265
5604
  });
5266
5605
  assert.equal(genericConfirmation.fabricated, false, "v4.2.2 / truthfulness_guardrails: generic 'confirmed' prose without a dispatch/authorization claim must not trip fabrication detection");
// v4.2.5 / fabrication_lock: the relator text cites two session IDs and two
// GitHub URLs that appear in none of the supplied corpora, so the detector is
// expected to flag the text as fabricated.
const fabricatedReviewReferences = detectFabricatedEvidence(
  "R2 evidence confirms sessions 604dcecc-df8d-483c-b598-733b8cbb64b0 and 37929ed7-3b71-454c-8231-5e1657ad17af." +
    "\n" +
    "The external comparison used https://github.com/qhjqhj00/GossipCat and https://github.com/alibaba/mira.",
  {
    provenanceCorpus: "",
    priorDraftCorpus: "The prior artifact did not contain session IDs or GitHub repository URLs.",
    narrativeCorpus: "Audit cross-review improvements.",
  },
);
{
  // Nested block keeps the destructured names out of the enclosing scope.
  const { fabricated, suspicious_assertion_count: suspiciousCount } = fabricatedReviewReferences;
  assert.ok(
    fabricated === true && suspiciousCount >= 2,
    `v4.2.5 / fabrication_lock: net-new session IDs and GitHub URLs in relator text must trip fabricated=true (got count=${suspiciousCount}, fabricated=${fabricated})`,
  );
}
5267
5616
  // Source-level: threshold constants pinned at the documented values.
5268
5617
  assert.ok(/FABRICATED_NET_NEW_HEX_THRESHOLD\s*=\s*3/.test(orchSrc), "v2.24.0 / fabrication_lock: net-new hex threshold pinned at 3");
5269
5618
  assert.ok(/FABRICATED_SUSPICIOUS_ASSERTION_THRESHOLD\s*=\s*2/.test(orchSrc), "v2.24.0 / fabrication_lock: suspicious assertion threshold pinned at 2");