@lcv-ideas-software/cross-review 4.2.4 → 4.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,36 @@ standard `v00.00.00`; npm package versions remain SemVer.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [v04.02.05] — 2026-06-05
11
+
12
+ **Patch — session audit hardening.** This release closes follow-ups from the
13
+ 2026-06-05 GitHub/tooling and on-disk session audit: terminal events are now
14
+ durably recorded at the store boundary, cost reporting separates peer calls from
15
+ lead-generation artifacts, and evidence/checklist diagnostics make
16
+ `not_resurfaced` and relator provenance risks harder to misread.
17
+
18
+ ### Added
19
+
20
+ - `SessionStore.finalize`, `markCancelled`, and idle sweeps now persist
21
+ terminal events (`session.finalized` / `session.cancelled`) alongside
22
+ `meta.json` outcome changes.
23
+ - `session_doctor` now reports real-vs-stub session counts, aggregate cost
24
+ breakdown (`total_cost_usd`, `peer_call_cost_usd`, `generation_cost_usd`),
25
+ sessions missing terminal events, and sessions carrying
26
+ `not_resurfaced` evidence checklist items.
27
+ - `session_report` now shows total cost as peer-call cost plus generation cost
28
+ and includes an Evidence Checklist section explaining that
29
+ `not_resurfaced` is inference-only, not proof of satisfaction.
30
+
31
+ ### Changed
32
+
33
+ - Truthfulness preflight failure reasons now include `source_marker_found` and
34
+ `runtime_facts_available` in addition to attachment/structured-evidence
35
+ visibility.
36
+ - The relator evidence-provenance detector now treats net-new session UUIDs and
37
+ GitHub URLs as provenance-bound operational references, preventing final text
38
+ from introducing unverified session/repository evidence.
39
+
10
40
  ## [v04.02.04] — 2026-06-05
11
41
 
12
42
  **Patch — truthfulness preflight auditability.** This release tightens the
package/README.md CHANGED
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
24
24
  npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
25
25
  ```
26
26
 
27
- **Status.** Stable. Current release: **v04.02.04** (npm package `4.2.4`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
27
+ **Status.** Stable. Current release: **v04.02.05** (npm package `4.2.5`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
28
28
 
29
29
  > **Project renamed 2026-05-15.** This project was previously published as
30
30
  > [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
@@ -38,6 +38,7 @@ The version history at a glance:
38
38
 
39
39
  | Release | Scope |
40
40
  | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
41
+ | **`v04.02.05`** | Patch — harden session auditability with terminal events, cost-split reporting, `not_resurfaced` visibility, and relator provenance checks for session IDs/GitHub URLs. |
41
42
  | **`v04.02.04`** | Patch — harden truthfulness preflight auditability, add a read-only preflight retest tool, and reduce false parser warnings for attached/log evidence. |
42
43
  | **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
43
44
  | **`v04.02.02`** | Patch — provider-doc refresh, Perplexity probe repair, current model pins, and rate-card guidance. |
@@ -223,6 +224,12 @@ these environment variables before running real sessions (example):
223
224
  - `session_sweep`
224
225
  - `session_finalize`
225
226
 
227
+ `session_doctor` separates real and stub sessions, flags terminal outcomes that
228
+ lack terminal events, and reports peer-call cost separately from generation
229
+ artifact cost. `session_report` uses the same split and calls out
230
+ `not_resurfaced` evidence checklist items as inference-only, not proof that the
231
+ requested evidence was satisfied.
232
+
226
233
  ## Repository conventions
227
234
 
228
235
  - **License**: [Apache-2.0](./LICENSE). See [NOTICE](./NOTICE) and [THIRDPARTY](./THIRDPARTY.md).
@@ -1096,6 +1096,158 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
1096
1096
  assert.equal(entry?.chronic_blockers?.[0], "ask-codex-1", "chronic_blockers must contain the codex round_count=4 item id");
1097
1097
  console.log("[smoke] evidence_checklist_drilldown_test: PASS");
1098
1098
  }
1099
+ // v4.2.5 — terminal events, cost split, and not_resurfaced audit visibility.
1100
+ // The 2026-06-05 disk audit found sessions whose meta.json had a terminal
1101
+ // outcome without a matching terminal event, cost reports that blurred peer
1102
+ // calls with lead-generation artifacts, and converged sessions with
1103
+ // not_resurfaced checklist items that were too easy to misread as satisfied.
1104
+ {
1105
+ const { SessionStore } = await import("../src/core/session-store.js");
1106
+ const { sessionReportMarkdown } = await import("../src/core/reports.js");
1107
+ const auditStore = new SessionStore({
1108
+ ...config,
1109
+ data_dir: smokeTmpDir("terminal-cost-evidence-audit"),
1110
+ });
1111
+ const finalizedSession = await auditStore.init("terminal event fixture", "operator", []);
1112
+ await auditStore.finalize(finalizedSession.session_id, "aborted", "smoke_terminal_abort");
1113
+ const finalizedEvents = auditStore.readEvents(finalizedSession.session_id);
1114
+ assert.ok(finalizedEvents.some((event) => event.type === "session.finalized" &&
1115
+ event.data?.outcome === "aborted" &&
1116
+ event.data?.reason === "smoke_terminal_abort"), "v4.2.5 / terminal_events: finalize() must persist a session.finalized event");
1117
+ const cancelledSession = await auditStore.init("cancelled terminal event fixture", "operator", []);
1118
+ await auditStore.markCancelled(cancelledSession.session_id, "session_cancelled");
1119
+ const cancelledEvents = auditStore.readEvents(cancelledSession.session_id);
1120
+ assert.ok(cancelledEvents.some((event) => event.type === "session.cancelled" && event.data?.reason === "session_cancelled"), "v4.2.5 / terminal_events: markCancelled() must persist a session.cancelled event");
1121
+ const sweptSession = await auditStore.init("idle sweep terminal event fixture", "operator", []);
1122
+ const sweptMeta = auditStore.read(sweptSession.session_id);
1123
+ sweptMeta.updated_at = new Date(Date.now() - 25 * 60 * 60 * 1000).toISOString();
1124
+ fs.writeFileSync(auditStore.metaPath(sweptSession.session_id), JSON.stringify(sweptMeta));
1125
+ const swept = await auditStore.sweepIdle(0, "aborted", "smoke_idle_sweep");
1126
+ assert.equal(swept.some((session) => session.session_id === sweptSession.session_id), true, "v4.2.5 / terminal_events: sweepIdle() must finalize stale sessions");
1127
+ const sweptEvents = auditStore.readEvents(sweptSession.session_id);
1128
+ assert.ok(sweptEvents.some((event) => event.type === "session.finalized" &&
1129
+ event.data?.outcome === "aborted" &&
1130
+ event.data?.reason === "smoke_idle_sweep" &&
1131
+ typeof event.data?.idle_ms === "number"), "v4.2.5 / terminal_events: sweepIdle() must persist a session.finalized event");
1132
+ const reportSession = await auditStore.init("cost split report fixture", "operator", []);
1133
+ const reportMeta = auditStore.read(reportSession.session_id);
1134
+ const nowIso = new Date().toISOString();
1135
+ reportMeta.rounds = [
1136
+ {
1137
+ round: 1,
1138
+ started_at: nowIso,
1139
+ completed_at: nowIso,
1140
+ caller_status: "READY",
1141
+ prompt_file: "agent-runs/round-1-prompt.md",
1142
+ peers: [
1143
+ {
1144
+ peer: "codex",
1145
+ provider: "openai",
1146
+ model: "gpt-5",
1147
+ status: "READY",
1148
+ structured: { status: "READY", summary: "ready", confidence: "verified" },
1149
+ text: '{"status":"READY","summary":"ready","confidence":"verified"}',
1150
+ raw: { fixture: true },
1151
+ decision_quality: "clean",
1152
+ parser_warnings: [],
1153
+ attempts: 1,
1154
+ latency_ms: 10,
1155
+ usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
1156
+ cost: {
1157
+ currency: "USD",
1158
+ estimated: false,
1159
+ source: "configured-rate",
1160
+ total_cost: 14.652426,
1161
+ },
1162
+ },
1163
+ ],
1164
+ rejected: [],
1165
+ convergence: {
1166
+ converged: true,
1167
+ reason: "unanimous",
1168
+ ready_peers: ["codex"],
1169
+ not_ready_peers: [],
1170
+ needs_evidence_peers: [],
1171
+ rejected_peers: [],
1172
+ skipped_peers: [],
1173
+ decision_quality: {
1174
+ codex: "clean",
1175
+ claude: "clean",
1176
+ gemini: "clean",
1177
+ deepseek: "clean",
1178
+ grok: "clean",
1179
+ perplexity: "clean",
1180
+ },
1181
+ blocking_details: [],
1182
+ },
1183
+ },
1184
+ ];
1185
+ reportMeta.generation_files = [
1186
+ {
1187
+ round: 0,
1188
+ peer: "gemini",
1189
+ label: "initial_draft",
1190
+ path: "agent-runs/round-0-initial-draft.md",
1191
+ ts: nowIso,
1192
+ usage: { input_tokens: 6, output_tokens: 4, total_tokens: 10 },
1193
+ cost: {
1194
+ currency: "USD",
1195
+ estimated: false,
1196
+ source: "configured-rate",
1197
+ total_cost: 1.876718,
1198
+ },
1199
+ },
1200
+ ];
1201
+ reportMeta.totals.cost = {
1202
+ currency: "USD",
1203
+ estimated: false,
1204
+ source: "configured-rate",
1205
+ total_cost: 16.529144,
1206
+ };
1207
+ fs.writeFileSync(auditStore.metaPath(reportSession.session_id), JSON.stringify(reportMeta));
1208
+ const reportMarkdown = sessionReportMarkdown(auditStore.read(reportSession.session_id), []);
1209
+ assert.ok(reportMarkdown.includes("- Peer call cost: $14.652426 USD"), "v4.2.5 / cost_split: session report must show peer-call cost separately");
1210
+ assert.ok(reportMarkdown.includes("- Generation cost: $1.876718 USD"), "v4.2.5 / cost_split: session report must show lead-generation cost separately");
1211
+ assert.ok(reportMarkdown.includes("$16.529144 USD = $14.652426 peer + $1.876718 generation"), "v4.2.5 / cost_split: session report must explicitly reconcile total = peer + generation");
1212
+ const notResurfacedSession = await auditStore.init("not resurfaced visibility fixture", "operator", []);
1213
+ const notResurfacedMeta = auditStore.read(notResurfacedSession.session_id);
1214
+ notResurfacedMeta.evidence_checklist = [
1215
+ {
1216
+ id: "nr-1",
1217
+ peer: "deepseek",
1218
+ first_round: 1,
1219
+ last_round: 1,
1220
+ round_count: 1,
1221
+ ask: "attach raw npm ci output",
1222
+ first_seen_at: nowIso,
1223
+ last_seen_at: nowIso,
1224
+ status: "not_resurfaced",
1225
+ addressed_at_round: 2,
1226
+ address_method: "resurfacing",
1227
+ },
1228
+ ];
1229
+ fs.writeFileSync(auditStore.metaPath(notResurfacedSession.session_id), JSON.stringify(notResurfacedMeta));
1230
+ const legacyGapSession = await auditStore.init("legacy terminal event gap fixture", "operator", []);
1231
+ const legacyGapMeta = auditStore.read(legacyGapSession.session_id);
1232
+ legacyGapMeta.outcome = "aborted";
1233
+ legacyGapMeta.outcome_reason = "legacy_without_terminal_event";
1234
+ legacyGapMeta.updated_at = nowIso;
1235
+ legacyGapMeta.convergence_health = {
1236
+ state: "stale",
1237
+ last_event_at: nowIso,
1238
+ detail: "legacy_without_terminal_event",
1239
+ };
1240
+ fs.writeFileSync(auditStore.metaPath(legacyGapSession.session_id), JSON.stringify(legacyGapMeta));
1241
+ const notResurfacedDoctor = await auditStore.sessionDoctor(20);
1242
+ assert.equal(notResurfacedDoctor.totals.not_resurfaced_evidence_sessions, 1, "v4.2.5 / not_resurfaced: session_doctor totals must count not_resurfaced sessions separately");
1243
+ assert.equal(notResurfacedDoctor.findings.not_resurfaced_evidence_sessions[0]?.session_id, notResurfacedSession.session_id, "v4.2.5 / not_resurfaced: session_doctor must enumerate not_resurfaced sessions");
1244
+ assert.equal(notResurfacedDoctor.totals.terminal_event_missing_sessions, 1, "v4.2.5 / terminal_event_missing: session_doctor totals must count legacy terminal-event gaps");
1245
+ assert.equal(notResurfacedDoctor.findings.terminal_event_missing_sessions[0]?.session_id, legacyGapSession.session_id, "v4.2.5 / terminal_event_missing: session_doctor must enumerate legacy terminal-event gaps");
1246
+ assert.equal(notResurfacedDoctor.findings.terminal_event_missing_sessions[0]?.terminal_event_expected, "session.finalized", "v4.2.5 / terminal_event_missing: session_doctor must report the expected terminal event");
1247
+ const notResurfacedReport = sessionReportMarkdown(auditStore.read(notResurfacedSession.session_id), []);
1248
+ assert.ok(notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."), "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics");
1249
+ console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
1250
+ }
1099
1251
  // v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
1100
1252
  // orchestrator emits a one-shot warning when cumulative cost crosses
1101
1253
  // 75% of cost_ceiling_usd; the budget_warning_emitted flag persists
@@ -5264,6 +5416,16 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
5264
5416
  narrativeCorpus: "",
5265
5417
  });
5266
5418
  assert.equal(genericConfirmation.fabricated, false, "v4.2.2 / truthfulness_guardrails: generic 'confirmed' prose without a dispatch/authorization claim must not trip fabrication detection");
5419
+ const fabricatedReviewReferences = detectFabricatedEvidence([
5420
+ "R2 evidence confirms sessions 604dcecc-df8d-483c-b598-733b8cbb64b0 and 37929ed7-3b71-454c-8231-5e1657ad17af.",
5421
+ "The external comparison used https://github.com/qhjqhj00/GossipCat and https://github.com/alibaba/mira.",
5422
+ ].join("\n"), {
5423
+ provenanceCorpus: "",
5424
+ priorDraftCorpus: "The prior artifact did not contain session IDs or GitHub repository URLs.",
5425
+ narrativeCorpus: "Audit cross-review improvements.",
5426
+ });
5427
+ assert.ok(fabricatedReviewReferences.fabricated === true &&
5428
+ fabricatedReviewReferences.suspicious_assertion_count >= 2, `v4.2.5 / fabrication_lock: net-new session IDs and GitHub URLs in relator text must trip fabricated=true (got count=${fabricatedReviewReferences.suspicious_assertion_count}, fabricated=${fabricatedReviewReferences.fabricated})`);
5267
5429
  // Source-level: threshold constants pinned at the documented values.
5268
5430
  assert.ok(/FABRICATED_NET_NEW_HEX_THRESHOLD\s*=\s*3/.test(orchSrc), "v2.24.0 / fabrication_lock: net-new hex threshold pinned at 3");
5269
5431
  assert.ok(/FABRICATED_SUSPICIOUS_ASSERTION_THRESHOLD\s*=\s*2/.test(orchSrc), "v2.24.0 / fabrication_lock: suspicious assertion threshold pinned at 2");