@lcv-ideas-software/cross-review 4.2.3 → 4.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +57 -0
- package/README.md +10 -1
- package/dist/scripts/smoke.js +215 -0
- package/dist/scripts/smoke.js.map +1 -1
- package/dist/src/core/config.d.ts +2 -2
- package/dist/src/core/config.js +2 -2
- package/dist/src/core/orchestrator.d.ts +2 -0
- package/dist/src/core/orchestrator.js +140 -6
- package/dist/src/core/orchestrator.js.map +1 -1
- package/dist/src/core/reports.d.ts +6 -0
- package/dist/src/core/reports.js +74 -4
- package/dist/src/core/reports.js.map +1 -1
- package/dist/src/core/session-store.d.ts +3 -0
- package/dist/src/core/session-store.js +181 -16
- package/dist/src/core/session-store.js.map +1 -1
- package/dist/src/core/status.js +9 -3
- package/dist/src/core/status.js.map +1 -1
- package/dist/src/core/types.d.ts +15 -0
- package/dist/src/mcp/server.js +56 -1
- package/dist/src/mcp/server.js.map +1 -1
- package/docs/apresentacao-cross-review.md +30 -27
- package/docs/apresentacao.md +29 -18
- package/docs/architecture.md +17 -1
- package/docs/costs.md +6 -0
- package/docs/evidence-preflight.md +34 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,63 @@ standard `v00.00.00`; npm package versions remain SemVer.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v04.02.05] — 2026-06-05
|
|
11
|
+
|
|
12
|
+
**Patch — session audit hardening.** This release closes follow-ups from the
|
|
13
|
+
2026-06-05 GitHub/tooling and on-disk session audit: terminal events are now
|
|
14
|
+
durably recorded at the store boundary, cost reporting separates peer calls from
|
|
15
|
+
lead-generation artifacts, and evidence/checklist diagnostics make
|
|
16
|
+
`not_resurfaced` and relator provenance risks harder to misread.
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- `SessionStore.finalize`, `markCancelled`, and idle sweeps now persist
|
|
21
|
+
terminal events (`session.finalized` / `session.cancelled`) alongside
|
|
22
|
+
`meta.json` outcome changes.
|
|
23
|
+
- `session_doctor` now reports real-vs-stub session counts, aggregate cost
|
|
24
|
+
breakdown (`total_cost_usd`, `peer_call_cost_usd`, `generation_cost_usd`),
|
|
25
|
+
sessions missing terminal events, and sessions carrying
|
|
26
|
+
`not_resurfaced` evidence checklist items.
|
|
27
|
+
- `session_report` now shows total cost as peer-call cost plus generation cost
|
|
28
|
+
and includes an Evidence Checklist section explaining that
|
|
29
|
+
`not_resurfaced` is inference-only, not proof of satisfaction.
|
|
30
|
+
|
|
31
|
+
### Changed
|
|
32
|
+
|
|
33
|
+
- Truthfulness preflight failure reasons now include `source_marker_found` and
|
|
34
|
+
`runtime_facts_available` in addition to attachment/structured-evidence
|
|
35
|
+
visibility.
|
|
36
|
+
- The relator evidence-provenance detector now treats net-new session UUIDs and
|
|
37
|
+
GitHub URLs as provenance-bound operational references, preventing final text
|
|
38
|
+
from introducing unverified session/repository evidence.
|
|
39
|
+
|
|
40
|
+
## [v04.02.04] — 2026-06-05
|
|
41
|
+
|
|
42
|
+
**Patch — truthfulness preflight auditability.** This release tightens the
|
|
43
|
+
guardrails added after the v4.2.x session audit so unsupported runtime/history
|
|
44
|
+
claims fail with clearer classes and can be retested after evidence is attached.
|
|
45
|
+
|
|
46
|
+
### Added
|
|
47
|
+
|
|
48
|
+
- Added `session_truthfulness_preflight_check`, a read-only MCP tool that
|
|
49
|
+
re-runs the local truthfulness preflight for an existing session without
|
|
50
|
+
calling providers.
|
|
51
|
+
- Added `issue_classes` to truthfulness preflight results and abort events for
|
|
52
|
+
`runtime_contradiction`, `unsupported_current_state_claim`,
|
|
53
|
+
`unsupported_historical_claim`, and `fabrication_pattern`.
|
|
54
|
+
- Added durable `failed_attempts` metadata for `run_until_unanimous` preflight
|
|
55
|
+
aborts that happen before a peer-review round is appended.
|
|
56
|
+
|
|
57
|
+
### Changed
|
|
58
|
+
|
|
59
|
+
- Re-runs truthfulness preflight on lead-generated initial drafts and revisions
|
|
60
|
+
before dispatching reviewer peer calls, blocking unsupported generated
|
|
61
|
+
runtime claims before they propagate through the panel.
|
|
62
|
+
- Parser diagnostics now distinguish empty verified `evidence_sources` from
|
|
63
|
+
non-empty but generic evidence sources, and recognize attached-evidence
|
|
64
|
+
labels, `evidence/` paths, log lines, line labels, and command/test-output
|
|
65
|
+
citations as concrete evidence markers.
|
|
66
|
+
|
|
10
67
|
## [v04.02.03] — 2026-06-03
|
|
11
68
|
|
|
12
69
|
**Patch — Gemini replacement pin and rate-card refresh.** This release follows
|
package/README.md
CHANGED
|
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
|
|
|
24
24
|
npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Status.** Stable. Current release: **v04.02.03** (npm package `4.2.3`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
27
|
+
**Status.** Stable. Current release: **v04.02.05** (npm package `4.2.5`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
28
28
|
|
|
29
29
|
> **Project renamed 2026-05-15.** This project was previously published as
|
|
30
30
|
> [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
|
|
@@ -38,6 +38,8 @@ The version history at a glance:
|
|
|
38
38
|
|
|
39
39
|
| Release | Scope |
|
|
40
40
|
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
41
|
+
| **`v04.02.05`** | Patch — harden session auditability with terminal events, cost split reporting, `not_resurfaced` visibility, and relator provenance checks for session IDs/GitHub URLs. |
|
|
42
|
+
| **`v04.02.04`** | Patch — harden truthfulness preflight auditability, add a read-only preflight retest tool, and reduce false parser warnings for attached/log evidence. |
|
|
41
43
|
| **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
|
|
42
44
|
| **`v04.02.02`** | Patch — provider-doc refresh, Perplexity probe repair, current model pins, and rate-card guidance. |
|
|
43
45
|
| **`v04.02.01`** | Patch — publish the workspace hard-gate cleanup as a package release. |
|
|
@@ -210,6 +212,7 @@ these environment variables before running real sessions (example):
|
|
|
210
212
|
- `session_doctor`
|
|
211
213
|
- `session_report`
|
|
212
214
|
- `session_check_convergence`
|
|
215
|
+
- `session_truthfulness_preflight_check`
|
|
213
216
|
- `session_attach_evidence`
|
|
214
217
|
- `session_evidence_checklist_update`
|
|
215
218
|
- `session_evidence_judge_pass`
|
|
@@ -221,6 +224,12 @@ these environment variables before running real sessions (example):
|
|
|
221
224
|
- `session_sweep`
|
|
222
225
|
- `session_finalize`
|
|
223
226
|
|
|
227
|
+
`session_doctor` separates real and stub sessions, flags terminal outcomes that
|
|
228
|
+
lack terminal events, and reports peer-call cost separately from generation
|
|
229
|
+
artifact cost. `session_report` uses the same split and calls out
|
|
230
|
+
`not_resurfaced` evidence checklist items as inference-only, not proof that the
|
|
231
|
+
requested evidence was satisfied.
|
|
232
|
+
|
|
224
233
|
## Repository conventions
|
|
225
234
|
|
|
226
235
|
- **License**: [Apache-2.0](./LICENSE). See [NOTICE](./NOTICE) and [THIRDPARTY](./THIRDPARTY.md).
|
package/dist/scripts/smoke.js
CHANGED
|
@@ -1096,6 +1096,158 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1096
1096
|
assert.equal(entry?.chronic_blockers?.[0], "ask-codex-1", "chronic_blockers must contain the codex round_count=4 item id");
|
|
1097
1097
|
console.log("[smoke] evidence_checklist_drilldown_test: PASS");
|
|
1098
1098
|
}
|
|
1099
|
+
// v4.2.5 — terminal events, cost split and not_resurfaced audit visibility.
|
|
1100
|
+
// The 2026-06-05 disk audit found sessions whose meta.json had a terminal
|
|
1101
|
+
// outcome without a matching terminal event, cost reports that blurred peer
|
|
1102
|
+
// calls with lead-generation artifacts, and converged sessions with
|
|
1103
|
+
// not_resurfaced checklist items that were too easy to misread as satisfied.
|
|
1104
|
+
{
|
|
1105
|
+
const { SessionStore } = await import("../src/core/session-store.js");
|
|
1106
|
+
const { sessionReportMarkdown } = await import("../src/core/reports.js");
|
|
1107
|
+
const auditStore = new SessionStore({
|
|
1108
|
+
...config,
|
|
1109
|
+
data_dir: smokeTmpDir("terminal-cost-evidence-audit"),
|
|
1110
|
+
});
|
|
1111
|
+
const finalizedSession = await auditStore.init("terminal event fixture", "operator", []);
|
|
1112
|
+
await auditStore.finalize(finalizedSession.session_id, "aborted", "smoke_terminal_abort");
|
|
1113
|
+
const finalizedEvents = auditStore.readEvents(finalizedSession.session_id);
|
|
1114
|
+
assert.ok(finalizedEvents.some((event) => event.type === "session.finalized" &&
|
|
1115
|
+
event.data?.outcome === "aborted" &&
|
|
1116
|
+
event.data?.reason === "smoke_terminal_abort"), "v4.2.5 / terminal_events: finalize() must persist a session.finalized event");
|
|
1117
|
+
const cancelledSession = await auditStore.init("cancelled terminal event fixture", "operator", []);
|
|
1118
|
+
await auditStore.markCancelled(cancelledSession.session_id, "session_cancelled");
|
|
1119
|
+
const cancelledEvents = auditStore.readEvents(cancelledSession.session_id);
|
|
1120
|
+
assert.ok(cancelledEvents.some((event) => event.type === "session.cancelled" && event.data?.reason === "session_cancelled"), "v4.2.5 / terminal_events: markCancelled() must persist a session.cancelled event");
|
|
1121
|
+
const sweptSession = await auditStore.init("idle sweep terminal event fixture", "operator", []);
|
|
1122
|
+
const sweptMeta = auditStore.read(sweptSession.session_id);
|
|
1123
|
+
sweptMeta.updated_at = new Date(Date.now() - 25 * 60 * 60 * 1000).toISOString();
|
|
1124
|
+
fs.writeFileSync(auditStore.metaPath(sweptSession.session_id), JSON.stringify(sweptMeta));
|
|
1125
|
+
const swept = await auditStore.sweepIdle(0, "aborted", "smoke_idle_sweep");
|
|
1126
|
+
assert.equal(swept.some((session) => session.session_id === sweptSession.session_id), true, "v4.2.5 / terminal_events: sweepIdle() must finalize stale sessions");
|
|
1127
|
+
const sweptEvents = auditStore.readEvents(sweptSession.session_id);
|
|
1128
|
+
assert.ok(sweptEvents.some((event) => event.type === "session.finalized" &&
|
|
1129
|
+
event.data?.outcome === "aborted" &&
|
|
1130
|
+
event.data?.reason === "smoke_idle_sweep" &&
|
|
1131
|
+
typeof event.data?.idle_ms === "number"), "v4.2.5 / terminal_events: sweepIdle() must persist a session.finalized event");
|
|
1132
|
+
const reportSession = await auditStore.init("cost split report fixture", "operator", []);
|
|
1133
|
+
const reportMeta = auditStore.read(reportSession.session_id);
|
|
1134
|
+
const nowIso = new Date().toISOString();
|
|
1135
|
+
reportMeta.rounds = [
|
|
1136
|
+
{
|
|
1137
|
+
round: 1,
|
|
1138
|
+
started_at: nowIso,
|
|
1139
|
+
completed_at: nowIso,
|
|
1140
|
+
caller_status: "READY",
|
|
1141
|
+
prompt_file: "agent-runs/round-1-prompt.md",
|
|
1142
|
+
peers: [
|
|
1143
|
+
{
|
|
1144
|
+
peer: "codex",
|
|
1145
|
+
provider: "openai",
|
|
1146
|
+
model: "gpt-5",
|
|
1147
|
+
status: "READY",
|
|
1148
|
+
structured: { status: "READY", summary: "ready", confidence: "verified" },
|
|
1149
|
+
text: '{"status":"READY","summary":"ready","confidence":"verified"}',
|
|
1150
|
+
raw: { fixture: true },
|
|
1151
|
+
decision_quality: "clean",
|
|
1152
|
+
parser_warnings: [],
|
|
1153
|
+
attempts: 1,
|
|
1154
|
+
latency_ms: 10,
|
|
1155
|
+
usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
|
|
1156
|
+
cost: {
|
|
1157
|
+
currency: "USD",
|
|
1158
|
+
estimated: false,
|
|
1159
|
+
source: "configured-rate",
|
|
1160
|
+
total_cost: 14.652426,
|
|
1161
|
+
},
|
|
1162
|
+
},
|
|
1163
|
+
],
|
|
1164
|
+
rejected: [],
|
|
1165
|
+
convergence: {
|
|
1166
|
+
converged: true,
|
|
1167
|
+
reason: "unanimous",
|
|
1168
|
+
ready_peers: ["codex"],
|
|
1169
|
+
not_ready_peers: [],
|
|
1170
|
+
needs_evidence_peers: [],
|
|
1171
|
+
rejected_peers: [],
|
|
1172
|
+
skipped_peers: [],
|
|
1173
|
+
decision_quality: {
|
|
1174
|
+
codex: "clean",
|
|
1175
|
+
claude: "clean",
|
|
1176
|
+
gemini: "clean",
|
|
1177
|
+
deepseek: "clean",
|
|
1178
|
+
grok: "clean",
|
|
1179
|
+
perplexity: "clean",
|
|
1180
|
+
},
|
|
1181
|
+
blocking_details: [],
|
|
1182
|
+
},
|
|
1183
|
+
},
|
|
1184
|
+
];
|
|
1185
|
+
reportMeta.generation_files = [
|
|
1186
|
+
{
|
|
1187
|
+
round: 0,
|
|
1188
|
+
peer: "gemini",
|
|
1189
|
+
label: "initial_draft",
|
|
1190
|
+
path: "agent-runs/round-0-initial-draft.md",
|
|
1191
|
+
ts: nowIso,
|
|
1192
|
+
usage: { input_tokens: 6, output_tokens: 4, total_tokens: 10 },
|
|
1193
|
+
cost: {
|
|
1194
|
+
currency: "USD",
|
|
1195
|
+
estimated: false,
|
|
1196
|
+
source: "configured-rate",
|
|
1197
|
+
total_cost: 1.876718,
|
|
1198
|
+
},
|
|
1199
|
+
},
|
|
1200
|
+
];
|
|
1201
|
+
reportMeta.totals.cost = {
|
|
1202
|
+
currency: "USD",
|
|
1203
|
+
estimated: false,
|
|
1204
|
+
source: "configured-rate",
|
|
1205
|
+
total_cost: 16.529144,
|
|
1206
|
+
};
|
|
1207
|
+
fs.writeFileSync(auditStore.metaPath(reportSession.session_id), JSON.stringify(reportMeta));
|
|
1208
|
+
const reportMarkdown = sessionReportMarkdown(auditStore.read(reportSession.session_id), []);
|
|
1209
|
+
assert.ok(reportMarkdown.includes("- Peer call cost: $14.652426 USD"), "v4.2.5 / cost_split: session report must show peer-call cost separately");
|
|
1210
|
+
assert.ok(reportMarkdown.includes("- Generation cost: $1.876718 USD"), "v4.2.5 / cost_split: session report must show lead-generation cost separately");
|
|
1211
|
+
assert.ok(reportMarkdown.includes("$16.529144 USD = $14.652426 peer + $1.876718 generation"), "v4.2.5 / cost_split: session report must explicitly reconcile total = peer + generation");
|
|
1212
|
+
const notResurfacedSession = await auditStore.init("not resurfaced visibility fixture", "operator", []);
|
|
1213
|
+
const notResurfacedMeta = auditStore.read(notResurfacedSession.session_id);
|
|
1214
|
+
notResurfacedMeta.evidence_checklist = [
|
|
1215
|
+
{
|
|
1216
|
+
id: "nr-1",
|
|
1217
|
+
peer: "deepseek",
|
|
1218
|
+
first_round: 1,
|
|
1219
|
+
last_round: 1,
|
|
1220
|
+
round_count: 1,
|
|
1221
|
+
ask: "attach raw npm ci output",
|
|
1222
|
+
first_seen_at: nowIso,
|
|
1223
|
+
last_seen_at: nowIso,
|
|
1224
|
+
status: "not_resurfaced",
|
|
1225
|
+
addressed_at_round: 2,
|
|
1226
|
+
address_method: "resurfacing",
|
|
1227
|
+
},
|
|
1228
|
+
];
|
|
1229
|
+
fs.writeFileSync(auditStore.metaPath(notResurfacedSession.session_id), JSON.stringify(notResurfacedMeta));
|
|
1230
|
+
const legacyGapSession = await auditStore.init("legacy terminal event gap fixture", "operator", []);
|
|
1231
|
+
const legacyGapMeta = auditStore.read(legacyGapSession.session_id);
|
|
1232
|
+
legacyGapMeta.outcome = "aborted";
|
|
1233
|
+
legacyGapMeta.outcome_reason = "legacy_without_terminal_event";
|
|
1234
|
+
legacyGapMeta.updated_at = nowIso;
|
|
1235
|
+
legacyGapMeta.convergence_health = {
|
|
1236
|
+
state: "stale",
|
|
1237
|
+
last_event_at: nowIso,
|
|
1238
|
+
detail: "legacy_without_terminal_event",
|
|
1239
|
+
};
|
|
1240
|
+
fs.writeFileSync(auditStore.metaPath(legacyGapSession.session_id), JSON.stringify(legacyGapMeta));
|
|
1241
|
+
const notResurfacedDoctor = await auditStore.sessionDoctor(20);
|
|
1242
|
+
assert.equal(notResurfacedDoctor.totals.not_resurfaced_evidence_sessions, 1, "v4.2.5 / not_resurfaced: session_doctor totals must count not_resurfaced sessions separately");
|
|
1243
|
+
assert.equal(notResurfacedDoctor.findings.not_resurfaced_evidence_sessions[0]?.session_id, notResurfacedSession.session_id, "v4.2.5 / not_resurfaced: session_doctor must enumerate not_resurfaced sessions");
|
|
1244
|
+
assert.equal(notResurfacedDoctor.totals.terminal_event_missing_sessions, 1, "v4.2.5 / terminal_event_missing: session_doctor totals must count legacy terminal-event gaps");
|
|
1245
|
+
assert.equal(notResurfacedDoctor.findings.terminal_event_missing_sessions[0]?.session_id, legacyGapSession.session_id, "v4.2.5 / terminal_event_missing: session_doctor must enumerate legacy terminal-event gaps");
|
|
1246
|
+
assert.equal(notResurfacedDoctor.findings.terminal_event_missing_sessions[0]?.terminal_event_expected, "session.finalized", "v4.2.5 / terminal_event_missing: session_doctor must report the expected terminal event");
|
|
1247
|
+
const notResurfacedReport = sessionReportMarkdown(auditStore.read(notResurfacedSession.session_id), []);
|
|
1248
|
+
assert.ok(notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."), "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics");
|
|
1249
|
+
console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
|
|
1250
|
+
}
|
|
1099
1251
|
// v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
|
|
1100
1252
|
// orchestrator emits a one-shot warning when cumulative cost crosses
|
|
1101
1253
|
// 75% of cost_ceiling_usd; the budget_warning_emitted flag persists
|
|
@@ -1374,6 +1526,19 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1374
1526
|
follow_ups: [],
|
|
1375
1527
|
}));
|
|
1376
1528
|
assert.ok(!grounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.2 / truthfulness_guardrails: concrete evidence_sources must satisfy verified confidence");
|
|
1529
|
+
const attachedEvidenceGrounded = parseStatusForTruth(JSON.stringify({
|
|
1530
|
+
status: "READY",
|
|
1531
|
+
summary: "The raw gate proves the fix.",
|
|
1532
|
+
confidence: "verified",
|
|
1533
|
+
evidence_sources: [
|
|
1534
|
+
"Attachment: RAW clean-room CI-equivalent gate (Node 24.14.0): npm ci exit 0; npm test 22 passed.",
|
|
1535
|
+
"evidence/2026-06-05T09-55-29-249Z-RAW-clean-room-CI-equivalent-gate.txt: Test Files 4 passed (4)",
|
|
1536
|
+
"L7001 jsdom dependency undici ^7.25.0; L9544 resolved undici 6.24.0",
|
|
1537
|
+
],
|
|
1538
|
+
caller_requests: [],
|
|
1539
|
+
follow_ups: [],
|
|
1540
|
+
}));
|
|
1541
|
+
assert.ok(!attachedEvidenceGrounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.4 / truthfulness_guardrails: attachment paths, raw gate logs, and line-number labels are evidence_sources, not empty-evidence warnings");
|
|
1377
1542
|
assert.ok(/confidence.*verified[\s\S]+evidence_sources/i.test(statusInstruction()), "v4.2.2 / truthfulness_guardrails: statusInstruction must tie verified confidence to concrete evidence_sources");
|
|
1378
1543
|
console.log("[smoke] verified_requires_evidence_sources_test: PASS");
|
|
1379
1544
|
}
|
|
@@ -5251,6 +5416,16 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5251
5416
|
narrativeCorpus: "",
|
|
5252
5417
|
});
|
|
5253
5418
|
assert.equal(genericConfirmation.fabricated, false, "v4.2.2 / truthfulness_guardrails: generic 'confirmed' prose without a dispatch/authorization claim must not trip fabrication detection");
|
|
5419
|
+
const fabricatedReviewReferences = detectFabricatedEvidence([
|
|
5420
|
+
"R2 evidence confirms sessions 604dcecc-df8d-483c-b598-733b8cbb64b0 and 37929ed7-3b71-454c-8231-5e1657ad17af.",
|
|
5421
|
+
"The external comparison used https://github.com/qhjqhj00/GossipCat and https://github.com/alibaba/mira.",
|
|
5422
|
+
].join("\n"), {
|
|
5423
|
+
provenanceCorpus: "",
|
|
5424
|
+
priorDraftCorpus: "The prior artifact did not contain session IDs or GitHub repository URLs.",
|
|
5425
|
+
narrativeCorpus: "Audit cross-review improvements.",
|
|
5426
|
+
});
|
|
5427
|
+
assert.ok(fabricatedReviewReferences.fabricated === true &&
|
|
5428
|
+
fabricatedReviewReferences.suspicious_assertion_count >= 2, `v4.2.5 / fabrication_lock: net-new session IDs and GitHub URLs in relator text must trip fabricated=true (got count=${fabricatedReviewReferences.suspicious_assertion_count}, fabricated=${fabricatedReviewReferences.fabricated})`);
|
|
5254
5429
|
// Source-level: threshold constants pinned at the documented values.
|
|
5255
5430
|
assert.ok(/FABRICATED_NET_NEW_HEX_THRESHOLD\s*=\s*3/.test(orchSrc), "v2.24.0 / fabrication_lock: net-new hex threshold pinned at 3");
|
|
5256
5431
|
assert.ok(/FABRICATED_SUSPICIOUS_ASSERTION_THRESHOLD\s*=\s*2/.test(orchSrc), "v2.24.0 / fabrication_lock: suspicious assertion threshold pinned at 2");
|
|
@@ -5740,6 +5915,7 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5740
5915
|
});
|
|
5741
5916
|
assert.equal(contradictedByRuntime.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime version claim contradicting runtime facts must trip even when server_info text is present");
|
|
5742
5917
|
assert.ok(contradictedByRuntime.contradictions.some((item) => item.includes("4.2.0")), "v4.2.2 / truthfulness_preflight: mismatch diagnostics must include the contradicted version token");
|
|
5918
|
+
assert.ok(contradictedByRuntime.issue_classes?.includes("runtime_contradiction"), "v4.2.4 / truthfulness_preflight: runtime contradictions must surface issue_classes=runtime_contradiction");
|
|
5743
5919
|
const backedByRuntime = truthfulnessPreflight({
|
|
5744
5920
|
task: "Audit all sessions generated with the current cross-review version.",
|
|
5745
5921
|
initialDraft: 'Live server_info: {"version":"4.2.1","release_date":"2026-05-21"}\nAudit report for cross-review v4.2.1 current production, released 2026-05-21.',
|
|
@@ -5754,6 +5930,7 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5754
5930
|
attachmentsPresent: false,
|
|
5755
5931
|
});
|
|
5756
5932
|
assert.equal(unsupportedCurrentState.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime claim without runtime facts or source evidence must trip");
|
|
5933
|
+
assert.ok(unsupportedCurrentState.issue_classes?.includes("unsupported_current_state_claim"), "v4.2.4 / truthfulness_preflight: unsupported current-state claims must have their own issue class");
|
|
5757
5934
|
const historicalChangelog = truthfulnessPreflight({
|
|
5758
5935
|
task: "Review this changelog text.",
|
|
5759
5936
|
initialDraft: "v4.2.0 was released on 2026-05-17. v4.2.1 was released on 2026-05-21.",
|
|
@@ -5768,6 +5945,17 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5768
5945
|
attachmentsPresent: false,
|
|
5769
5946
|
});
|
|
5770
5947
|
assert.equal(fabricatedTiming.pass, false, "v4.2.2 / truthfulness_preflight: historical runtime timing narrative without snapshot evidence must trip");
|
|
5948
|
+
assert.ok(fabricatedTiming.issue_classes?.includes("unsupported_historical_claim"), "v4.2.4 / truthfulness_preflight: historical timing claims without snapshot evidence must surface unsupported_historical_claim");
|
|
5949
|
+
assert.ok(/attachments_present=false/.test(fabricatedTiming.reason) &&
|
|
5950
|
+
/session_attach_evidence/.test(fabricatedTiming.reason), "v4.2.4 / truthfulness_preflight: failure reason must tell operators that no attachment was visible and how to fix it");
|
|
5951
|
+
const fabricatedWorkflowClaim = truthfulnessPreflight({
|
|
5952
|
+
task: "Summarize the release closure.",
|
|
5953
|
+
initialDraft: "I triggered the workflow dispatch after operator authorization and confirmed the remote deployment succeeded.",
|
|
5954
|
+
runtimeFacts,
|
|
5955
|
+
attachmentsPresent: false,
|
|
5956
|
+
});
|
|
5957
|
+
assert.equal(fabricatedWorkflowClaim.pass, false, "v4.2.4 / truthfulness_preflight: fabricated workflow or authorization claims must trip before paid calls");
|
|
5958
|
+
assert.ok(fabricatedWorkflowClaim.issue_classes?.includes("fabrication_pattern"), "v4.2.4 / truthfulness_preflight: fabricated workflow/authorization claims must surface issue_classes=fabrication_pattern");
|
|
5771
5959
|
const withStructuredEvidence = truthfulnessPreflight({
|
|
5772
5960
|
task: "Explain why the report said v4.2.0.",
|
|
5773
5961
|
initialDraft: "When the workflow began, cross-review was running v4.2.0. It was bumped to v4.2.1 between R1 and R3.",
|
|
@@ -5782,9 +5970,36 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5782
5970
|
assert.ok(/truthfulness_preflight_enabled/.test(orchSrcTruth) &&
|
|
5783
5971
|
/askPeers[\s\S]+truthfulnessPreflight/.test(orchSrcTruth) &&
|
|
5784
5972
|
/runUntilUnanimous[\s\S]+truthfulnessPreflight/.test(orchSrcTruth), "v4.2.2 / truthfulness_preflight: both askPeers and runUntilUnanimous must gate on config.truthfulness_preflight_enabled");
|
|
5973
|
+
assert.ok(/recordPreflightFailure/.test(orchSrcTruth), "v4.2.4 / truthfulness_preflight: preflight aborts without rounds must still persist failed_attempts metadata");
|
|
5785
5974
|
assert.ok(/boolEnv\("CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT", true\)/.test(configSrcTruth), "v4.2.2 / truthfulness_preflight: CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT env var must default ON");
|
|
5786
5975
|
console.log("[smoke] truthfulness_preflight_test: PASS");
|
|
5787
5976
|
}
|
|
5977
|
+
// v4.2.4 — truthfulness_preflight_runtime_contract_test.
|
|
5978
|
+
// A failed preflight should be inspectable without scraping events, and
|
|
5979
|
+
// operators should be able to re-run the same read-only preflight after
|
|
5980
|
+
// attaching evidence instead of starting duplicate sessions.
|
|
5981
|
+
{
|
|
5982
|
+
const orchSrcTruth = fs.readFileSync(new URL("../src/core/orchestrator.ts", import.meta.url), "utf8");
|
|
5983
|
+
const storeSrcTruth = fs.readFileSync(new URL("../src/core/session-store.ts", import.meta.url), "utf8");
|
|
5984
|
+
const serverSrcTruth = fs.readFileSync(new URL("../src/mcp/server.ts", import.meta.url), "utf8");
|
|
5985
|
+
assert.ok(/recordPreflightFailure/.test(storeSrcTruth) &&
|
|
5986
|
+
/failed_attempts/.test(storeSrcTruth) &&
|
|
5987
|
+
/truthfulness_preflight/.test(storeSrcTruth), "v4.2.4 / truthfulness_preflight: SessionStore must persist preflight failed_attempts even when no round is appended");
|
|
5988
|
+
const runUntilIndex = orchSrcTruth.indexOf("async runUntilUnanimous");
|
|
5989
|
+
const truthfulnessIndex = orchSrcTruth.indexOf("const truthfulness = truthfulnessPreflight", runUntilIndex);
|
|
5990
|
+
const evidenceIndex = orchSrcTruth.indexOf("const preflight = evidencePreflight", runUntilIndex);
|
|
5991
|
+
const leadGenerationIndex = orchSrcTruth.indexOf("const generation = await adapters[leadPeer].generate", runUntilIndex);
|
|
5992
|
+
assert.ok(runUntilIndex >= 0 &&
|
|
5993
|
+
truthfulnessIndex > runUntilIndex &&
|
|
5994
|
+
evidenceIndex > truthfulnessIndex &&
|
|
5995
|
+
leadGenerationIndex > evidenceIndex, "v4.2.4 / truthfulness_preflight: runUntilUnanimous must run truthfulness/evidence preflight before paid lead generation");
|
|
5996
|
+
assert.ok(/"session_truthfulness_preflight_check"/.test(serverSrcTruth) &&
|
|
5997
|
+
/readEvidenceAttachments/.test(serverSrcTruth) &&
|
|
5998
|
+
/truthfulnessPreflight/.test(serverSrcTruth), "v4.2.4 / truthfulness_preflight: MCP must expose a read-only session_truthfulness_preflight_check retest tool");
|
|
5999
|
+
assert.ok(/"session_truthfulness_preflight_check"/.test(serverSrcTruth) &&
|
|
6000
|
+
/TOOL_NAMES[\s\S]*session_truthfulness_preflight_check/.test(serverSrcTruth), "v4.2.4 / truthfulness_preflight: server_info tool list must include session_truthfulness_preflight_check");
|
|
6001
|
+
console.log("[smoke] truthfulness_preflight_runtime_contract_test: PASS");
|
|
6002
|
+
}
|
|
5788
6003
|
// v3.5.0 (CRV2-1 + CRV2-6) — budget + max_rounds traceability.
|
|
5789
6004
|
//
|
|
5790
6005
|
// setSessionTraceability persists requested-vs-effective max_rounds and
|