@lcv-ideas-software/cross-review 4.2.5 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +30 -0
- package/README.md +9 -2
- package/dist/scripts/eval-fixtures.d.ts +54 -0
- package/dist/scripts/eval-fixtures.js +216 -0
- package/dist/scripts/eval-fixtures.js.map +1 -0
- package/dist/scripts/smoke.js +187 -0
- package/dist/scripts/smoke.js.map +1 -1
- package/dist/src/core/config.d.ts +1 -1
- package/dist/src/core/config.js +1 -1
- package/dist/src/core/orchestrator.js +27 -2
- package/dist/src/core/orchestrator.js.map +1 -1
- package/dist/src/core/reports.d.ts +2 -1
- package/dist/src/core/reports.js +30 -0
- package/dist/src/core/reports.js.map +1 -1
- package/dist/src/core/session-store.d.ts +2 -1
- package/dist/src/core/session-store.js +141 -0
- package/dist/src/core/session-store.js.map +1 -1
- package/dist/src/core/types.d.ts +30 -0
- package/dist/src/mcp/server.js +15 -0
- package/dist/src/mcp/server.js.map +1 -1
- package/docs/apresentacao-cross-review.md +10 -8
- package/docs/apresentacao.md +23 -20
- package/docs/architecture.md +5 -0
- package/package.json +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,36 @@ standard `v00.00.00`; npm package versions remain SemVer.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v04.03.00] — 2026-06-05
|
|
11
|
+
|
|
12
|
+
**Minor — P1/P2/P3 audit follow-up.** This release closes the first concrete
|
|
13
|
+
items from the post-v4.2.5 runtime/session audit: unresolved evidence is harder
|
|
14
|
+
to miss at finalization time, fixture-level regressions can be evaluated
|
|
15
|
+
offline, and operators get a read-only peer reliability report without changing
|
|
16
|
+
peer selection.
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- Added `session_peer_reliability_report`, a read-only MCP tool that aggregates
|
|
21
|
+
per-peer parser warnings, decision quality, rejected/provider failures,
|
|
22
|
+
evidence checklist dispositions, fabrication-related events, latency and
|
|
23
|
+
cost.
|
|
24
|
+
- Added `npm run eval:fixtures`, an offline fixture harness for truthfulness
|
|
25
|
+
preflight, parser diagnostics and report rendering contracts. It does not
|
|
26
|
+
start provider sessions or call reviewers.
|
|
27
|
+
- `session_report` now includes an **Unresolved Evidence Disposition** section
|
|
28
|
+
when checklist items remain `open` or `not_resurfaced`.
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
|
|
32
|
+
- Automatic convergence with unresolved checklist items now finalizes with
|
|
33
|
+
`unanimous_ready_with_unresolved_evidence` or
|
|
34
|
+
`recovered_unanimity_with_unresolved_evidence` instead of a plain success
|
|
35
|
+
reason.
|
|
36
|
+
- Finalization now emits `session.evidence_checklist_unresolved_on_finalize`
|
|
37
|
+
with unresolved counts and item summaries when a session closes while
|
|
38
|
+
evidence asks are still open or only inferred as not resurfaced.
|
|
39
|
+
|
|
10
40
|
## [v04.02.05] — 2026-06-05
|
|
11
41
|
|
|
12
42
|
**Patch — session audit hardening.** This release closes follow-ups from the
|
package/README.md
CHANGED
|
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
|
|
|
24
24
|
npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Status.** Stable. Current release: **v04.02.05** (npm package `4.2.5`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
27
|
+
**Status.** Stable. Current release: **v04.03.00** (npm package `4.3.0`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
28
28
|
|
|
29
29
|
> **Project renamed 2026-05-15.** This project was previously published as
|
|
30
30
|
> [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
|
|
@@ -38,6 +38,7 @@ The version history at a glance:
|
|
|
38
38
|
|
|
39
39
|
| Release | Scope |
|
|
40
40
|
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
41
|
+
| **`v04.03.00`** | Minor — P1/P2/P3 follow-up with unresolved-evidence close-out visibility, an offline fixture eval harness, and a read-only peer reliability report. |
|
|
41
42
|
| **`v04.02.05`** | Patch — harden session auditability with terminal events, cost split reporting, `not_resurfaced` visibility, and relator provenance checks for session IDs/GitHub URLs. |
|
|
42
43
|
| **`v04.02.04`** | Patch — harden truthfulness preflight auditability, add a read-only preflight retest tool, and reduce false parser warnings for attached/log evidence. |
|
|
43
44
|
| **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
|
|
@@ -211,6 +212,7 @@ these environment variables before running real sessions (example):
|
|
|
211
212
|
- `session_metrics`
|
|
212
213
|
- `session_doctor`
|
|
213
214
|
- `session_report`
|
|
215
|
+
- `session_peer_reliability_report`
|
|
214
216
|
- `session_check_convergence`
|
|
215
217
|
- `session_truthfulness_preflight_check`
|
|
216
218
|
- `session_attach_evidence`
|
|
@@ -228,7 +230,12 @@ these environment variables before running real sessions (example):
|
|
|
228
230
|
lack terminal events, and reports peer-call cost separately from generation
|
|
229
231
|
artifact cost. `session_report` uses the same split and calls out
|
|
230
232
|
`not_resurfaced` evidence checklist items as inference-only, not proof that the
|
|
231
|
-
requested evidence was satisfied.
|
|
233
|
+
requested evidence was satisfied. If a session otherwise reaches unanimity with
|
|
234
|
+
open or `not_resurfaced` checklist items, finalization records an
|
|
235
|
+
`*_with_unresolved_evidence` outcome reason and emits a durable unresolved
|
|
236
|
+
evidence event. `session_peer_reliability_report` is read-only and aggregates
|
|
237
|
+
per-peer parser warnings, evidence ask status, provider failures, cost and
|
|
238
|
+
latency.
|
|
232
239
|
|
|
233
240
|
## Repository conventions
|
|
234
241
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
export declare const truthfulnessCases: readonly [{
|
|
2
|
+
readonly name: "current runtime contradiction is blocked";
|
|
3
|
+
readonly input: {
|
|
4
|
+
readonly task: "The current cross-review runtime is 4.2.4.";
|
|
5
|
+
readonly runtimeFacts: {
|
|
6
|
+
readonly runtime_version: "4.2.5";
|
|
7
|
+
readonly release_date: "2026-06-05";
|
|
8
|
+
};
|
|
9
|
+
readonly attachmentsPresent: false;
|
|
10
|
+
};
|
|
11
|
+
readonly expectPass: false;
|
|
12
|
+
readonly expectIssueClass: "runtime_contradiction";
|
|
13
|
+
}, {
|
|
14
|
+
readonly name: "matching current runtime facts pass";
|
|
15
|
+
readonly input: {
|
|
16
|
+
readonly task: "server_info shows current cross-review runtime 4.2.5.";
|
|
17
|
+
readonly runtimeFacts: {
|
|
18
|
+
readonly runtime_version: "4.2.5";
|
|
19
|
+
readonly release_date: "2026-06-05";
|
|
20
|
+
};
|
|
21
|
+
readonly attachmentsPresent: false;
|
|
22
|
+
};
|
|
23
|
+
readonly expectPass: true;
|
|
24
|
+
}, {
|
|
25
|
+
readonly name: "historical timing claim needs snapshot evidence";
|
|
26
|
+
readonly input: {
|
|
27
|
+
readonly task: "When the audit began, cross-review was running 4.2.4.";
|
|
28
|
+
readonly runtimeFacts: {
|
|
29
|
+
readonly runtime_version: "4.2.5";
|
|
30
|
+
readonly release_date: "2026-06-05";
|
|
31
|
+
};
|
|
32
|
+
readonly attachmentsPresent: false;
|
|
33
|
+
};
|
|
34
|
+
readonly expectPass: false;
|
|
35
|
+
readonly expectIssueClass: "unsupported_historical_claim";
|
|
36
|
+
}];
|
|
37
|
+
export declare const parserCases: readonly [{
|
|
38
|
+
readonly name: "verified with empty evidence gets empty-evidence warning";
|
|
39
|
+
readonly text: string;
|
|
40
|
+
readonly expectStatus: "READY";
|
|
41
|
+
readonly expectWarning: "verified_without_evidence_sources";
|
|
42
|
+
}, {
|
|
43
|
+
readonly name: "verified with attached evidence path is concrete";
|
|
44
|
+
readonly text: string;
|
|
45
|
+
readonly expectStatus: "READY";
|
|
46
|
+
readonly absentWarning: "verified_without_evidence_sources";
|
|
47
|
+
}];
|
|
48
|
+
export declare const reportCases: readonly [{
|
|
49
|
+
readonly name: "cost split and unresolved evidence are surfaced";
|
|
50
|
+
readonly peerCost: 14.652426;
|
|
51
|
+
readonly generationCost: 1.876718;
|
|
52
|
+
readonly totalCost: 16.529144;
|
|
53
|
+
readonly unresolvedAsk: "attach raw npm test output";
|
|
54
|
+
}];
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { loadConfig } from "../src/core/config.js";
|
|
6
|
+
import { truthfulnessPreflight } from "../src/core/orchestrator.js";
|
|
7
|
+
import { sessionReportMarkdown } from "../src/core/reports.js";
|
|
8
|
+
import { SessionStore } from "../src/core/session-store.js";
|
|
9
|
+
import { parsePeerStatus } from "../src/core/status.js";
|
|
10
|
+
/**
 * Creates a new, uniquely named directory under the OS temp directory and
 * returns its path. The directory name starts with
 * `cross-review-eval-<label>-`; `fs.mkdtempSync` appends the random suffix.
 * Nothing in this script removes the directory afterwards.
 *
 * @param label - Short tag embedded in the directory name.
 * @returns Path of the directory that was created.
 */
function evalTmpDir(label) {
|
|
11
|
+
return fs.mkdtempSync(path.join(os.tmpdir(), `cross-review-eval-${label}-`));
|
|
12
|
+
}
|
|
13
|
+
export const truthfulnessCases = [
|
|
14
|
+
{
|
|
15
|
+
name: "current runtime contradiction is blocked",
|
|
16
|
+
input: {
|
|
17
|
+
task: "The current cross-review runtime is 4.2.4.",
|
|
18
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
19
|
+
attachmentsPresent: false,
|
|
20
|
+
},
|
|
21
|
+
expectPass: false,
|
|
22
|
+
expectIssueClass: "runtime_contradiction",
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
name: "matching current runtime facts pass",
|
|
26
|
+
input: {
|
|
27
|
+
task: "server_info shows current cross-review runtime 4.2.5.",
|
|
28
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
29
|
+
attachmentsPresent: false,
|
|
30
|
+
},
|
|
31
|
+
expectPass: true,
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
name: "historical timing claim needs snapshot evidence",
|
|
35
|
+
input: {
|
|
36
|
+
task: "When the audit began, cross-review was running 4.2.4.",
|
|
37
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
38
|
+
attachmentsPresent: false,
|
|
39
|
+
},
|
|
40
|
+
expectPass: false,
|
|
41
|
+
expectIssueClass: "unsupported_historical_claim",
|
|
42
|
+
},
|
|
43
|
+
];
|
|
44
|
+
export const parserCases = [
|
|
45
|
+
{
|
|
46
|
+
name: "verified with empty evidence gets empty-evidence warning",
|
|
47
|
+
text: JSON.stringify({
|
|
48
|
+
status: "READY",
|
|
49
|
+
summary: "ok",
|
|
50
|
+
confidence: "verified",
|
|
51
|
+
evidence_sources: [],
|
|
52
|
+
caller_requests: [],
|
|
53
|
+
follow_ups: [],
|
|
54
|
+
}),
|
|
55
|
+
expectStatus: "READY",
|
|
56
|
+
expectWarning: "verified_without_evidence_sources",
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
name: "verified with attached evidence path is concrete",
|
|
60
|
+
text: JSON.stringify({
|
|
61
|
+
status: "READY",
|
|
62
|
+
summary: "ok",
|
|
63
|
+
confidence: "verified",
|
|
64
|
+
evidence_sources: ["evidence/2026-06-05T00-00-00Z-raw-smoke.txt: npm test 42 passed"],
|
|
65
|
+
caller_requests: [],
|
|
66
|
+
follow_ups: [],
|
|
67
|
+
}),
|
|
68
|
+
expectStatus: "READY",
|
|
69
|
+
absentWarning: "verified_without_evidence_sources",
|
|
70
|
+
},
|
|
71
|
+
];
|
|
72
|
+
export const reportCases = [
|
|
73
|
+
{
|
|
74
|
+
name: "cost split and unresolved evidence are surfaced",
|
|
75
|
+
peerCost: 14.652426,
|
|
76
|
+
generationCost: 1.876718,
|
|
77
|
+
totalCost: 16.529144,
|
|
78
|
+
unresolvedAsk: "attach raw npm test output",
|
|
79
|
+
},
|
|
80
|
+
];
|
|
81
|
+
// Truthfulness fixtures: each case's task, runtimeFacts and attachmentsPresent
// are passed to truthfulnessPreflight, and result.pass must equal the case's
// expectPass. Cases that declare expectIssueClass additionally require that
// class to appear in result.issue_classes. The fixture name is used as the
// assertion message so a failure identifies the case.
for (const testCase of truthfulnessCases) {
|
|
82
|
+
const result = truthfulnessPreflight({
|
|
83
|
+
task: testCase.input.task,
|
|
84
|
+
runtimeFacts: testCase.input.runtimeFacts,
|
|
85
|
+
attachmentsPresent: testCase.input.attachmentsPresent,
|
|
86
|
+
});
|
|
87
|
+
assert.equal(result.pass, testCase.expectPass, testCase.name);
|
|
88
|
+
if ("expectIssueClass" in testCase) {
|
|
89
|
+
assert.ok(result.issue_classes.includes(testCase.expectIssueClass), testCase.name);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
// Parser fixtures: each case's JSON text is passed to parsePeerStatus and the
// parsed status must equal expectStatus. A case may also declare expectWarning
// (must be present in result.parser_warnings) or absentWarning (must not be
// present). The fixture name is used as the assertion message.
for (const testCase of parserCases) {
|
|
93
|
+
const result = parsePeerStatus(testCase.text);
|
|
94
|
+
assert.equal(result.status, testCase.expectStatus, testCase.name);
|
|
95
|
+
if ("expectWarning" in testCase) {
|
|
96
|
+
assert.ok(result.parser_warnings.includes(testCase.expectWarning), testCase.name);
|
|
97
|
+
}
|
|
98
|
+
if ("absentWarning" in testCase) {
|
|
99
|
+
assert.ok(!result.parser_warnings.includes(testCase.absentWarning), testCase.name);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
for (const testCase of reportCases) {
|
|
103
|
+
const store = new SessionStore({
|
|
104
|
+
...loadConfig(),
|
|
105
|
+
data_dir: evalTmpDir("report"),
|
|
106
|
+
});
|
|
107
|
+
const session = await store.init(`eval report fixture: ${testCase.name}`, "operator", []);
|
|
108
|
+
const meta = store.read(session.session_id);
|
|
109
|
+
const ts = new Date().toISOString();
|
|
110
|
+
meta.rounds = [
|
|
111
|
+
{
|
|
112
|
+
round: 1,
|
|
113
|
+
started_at: ts,
|
|
114
|
+
completed_at: ts,
|
|
115
|
+
caller_status: "READY",
|
|
116
|
+
prompt_file: "agent-runs/round-1-prompt.md",
|
|
117
|
+
peers: [
|
|
118
|
+
{
|
|
119
|
+
peer: "codex",
|
|
120
|
+
provider: "openai",
|
|
121
|
+
model: "gpt-5.5",
|
|
122
|
+
status: "READY",
|
|
123
|
+
structured: {
|
|
124
|
+
status: "READY",
|
|
125
|
+
summary: "ready",
|
|
126
|
+
confidence: "verified",
|
|
127
|
+
evidence_sources: ["server_info: version 4.2.5"],
|
|
128
|
+
caller_requests: [],
|
|
129
|
+
follow_ups: [],
|
|
130
|
+
},
|
|
131
|
+
text: "{}",
|
|
132
|
+
raw: { fixture: true },
|
|
133
|
+
decision_quality: "clean",
|
|
134
|
+
parser_warnings: [],
|
|
135
|
+
attempts: 1,
|
|
136
|
+
latency_ms: 1,
|
|
137
|
+
usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
|
|
138
|
+
cost: {
|
|
139
|
+
currency: "USD",
|
|
140
|
+
estimated: false,
|
|
141
|
+
source: "configured-rate",
|
|
142
|
+
total_cost: testCase.peerCost,
|
|
143
|
+
},
|
|
144
|
+
},
|
|
145
|
+
],
|
|
146
|
+
rejected: [],
|
|
147
|
+
convergence: {
|
|
148
|
+
converged: true,
|
|
149
|
+
reason: "fixture",
|
|
150
|
+
ready_peers: ["codex"],
|
|
151
|
+
not_ready_peers: [],
|
|
152
|
+
needs_evidence_peers: [],
|
|
153
|
+
rejected_peers: [],
|
|
154
|
+
skipped_peers: [],
|
|
155
|
+
decision_quality: {
|
|
156
|
+
codex: "clean",
|
|
157
|
+
claude: "clean",
|
|
158
|
+
gemini: "clean",
|
|
159
|
+
deepseek: "clean",
|
|
160
|
+
grok: "clean",
|
|
161
|
+
perplexity: "clean",
|
|
162
|
+
},
|
|
163
|
+
blocking_details: [],
|
|
164
|
+
},
|
|
165
|
+
},
|
|
166
|
+
];
|
|
167
|
+
meta.generation_files = [
|
|
168
|
+
{
|
|
169
|
+
round: 0,
|
|
170
|
+
peer: "codex",
|
|
171
|
+
label: "initial_draft",
|
|
172
|
+
path: "agent-runs/round-0-initial-draft.md",
|
|
173
|
+
ts,
|
|
174
|
+
usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
|
|
175
|
+
cost: {
|
|
176
|
+
currency: "USD",
|
|
177
|
+
estimated: false,
|
|
178
|
+
source: "configured-rate",
|
|
179
|
+
total_cost: testCase.generationCost,
|
|
180
|
+
},
|
|
181
|
+
},
|
|
182
|
+
];
|
|
183
|
+
meta.totals.cost = {
|
|
184
|
+
currency: "USD",
|
|
185
|
+
estimated: false,
|
|
186
|
+
source: "configured-rate",
|
|
187
|
+
total_cost: testCase.totalCost,
|
|
188
|
+
};
|
|
189
|
+
meta.evidence_checklist = [
|
|
190
|
+
{
|
|
191
|
+
id: "eval-1",
|
|
192
|
+
peer: "codex",
|
|
193
|
+
first_round: 1,
|
|
194
|
+
last_round: 1,
|
|
195
|
+
round_count: 1,
|
|
196
|
+
ask: testCase.unresolvedAsk,
|
|
197
|
+
first_seen_at: ts,
|
|
198
|
+
last_seen_at: ts,
|
|
199
|
+
status: "not_resurfaced",
|
|
200
|
+
addressed_at_round: 2,
|
|
201
|
+
address_method: "resurfacing",
|
|
202
|
+
},
|
|
203
|
+
];
|
|
204
|
+
fs.writeFileSync(store.metaPath(session.session_id), JSON.stringify(meta));
|
|
205
|
+
const report = sessionReportMarkdown(store.read(session.session_id), []);
|
|
206
|
+
assert.ok(report.includes("$16.529144 USD = $14.652426 peer + $1.876718 generation"));
|
|
207
|
+
assert.ok(report.includes("## Unresolved Evidence Disposition"));
|
|
208
|
+
assert.ok(report.includes(testCase.unresolvedAsk));
|
|
209
|
+
}
|
|
210
|
+
console.log(JSON.stringify({
|
|
211
|
+
ok: true,
|
|
212
|
+
truthfulness_cases: truthfulnessCases.length,
|
|
213
|
+
parser_cases: parserCases.length,
|
|
214
|
+
report_cases: reportCases.length,
|
|
215
|
+
}));
|
|
216
|
+
//# sourceMappingURL=eval-fixtures.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-fixtures.js","sourceRoot":"","sources":["../../scripts/eval-fixtures.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,qBAAqB,KAAK,GAAG,CAAC,CAAC,CAAC;AAC/E,CAAC;AAED,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B;QACE,IAAI,EAAE,0CAA0C;QAChD,KAAK,EAAE;YACL,IAAI,EAAE,4CAA4C;YAClD,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,uBAAuB;KAC1C;IACD;QACE,IAAI,EAAE,qCAAqC;QAC3C,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,IAAI;KACjB;IACD;QACE,IAAI,EAAE,iDAAiD;QACvD,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,8BAA8B;KACjD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,0DAA0D;QAChE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,EAAE;YACpB,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;IACD;QACE,IAAI,EAAE,kDAAkD;QACxD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,CAAC,iEAAiE,CAAC;YACrF,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,iDAAiD;QACvD,QAAQ,EAAE,SAAS;QACnB,cAAc,EAAE,QAAQ;QACxB,SAAS,EAAE,SAAS;QACpB,aAAa,EAAE,4BAA4B;KAC5C;CACO,CAAC;AAEX,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;I
ACzC,MAAM,MAAM,GAAG,qBAAqB,CAAC;QACnC,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,IAAI;QACzB,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,YAAY;QACzC,kBAAkB,EAAE,QAAQ,CAAC,KAAK,CAAC,kBAAkB;KACtD,CAAC,CAAC;IACH,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9D,IAAI,kBAAkB,IAAI,QAAQ,EAAE,CAAC;QACnC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAClE,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACpF,CAAC;IACD,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC;QAC7B,GAAG,UAAU,EAAE;QACf,QAAQ,EAAE,UAAU,CAAC,QAAQ,CAAC;KAC/B,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,EAAE,CAAC,CAAC;IAC1F,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IAC5C,MAAM,EAAE,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACpC,IAAI,CAAC,MAAM,GAAG;QACZ;YACE,KAAK,EAAE,CAAC;YACR,UAAU,EAAE,EAAE;YACd,YAAY,EAAE,EAAE;YAChB,aAAa,EAAE,OAAO;YACtB,WAAW,EAAE,8BAA8B;YAC3C,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,OAAO;oBACb,QAAQ,EAAE,QAAQ;oBAClB,KAAK,EAAE,SAAS;oBAChB,MAAM,EAAE,OAAO;oBACf,UAAU,EAAE;wBACV,MAAM,EAAE,OAAO;wBACf,OAAO,EAAE,OAAO;wBAChB,UAAU,EAAE,UAAU;wBACtB,gBAAgB,EAAE,CAAC,4BAA4B,CAAC;wBAChD,eAAe,EAAE,EAAE;wBACnB,UAAU,EAAE,EAAE;qBACf;oBACD,IAAI,EAAE,IAAI;oBACV,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE;oBACtB,gBAAgB,EAAE,OAAO;oBACzB,eAAe,EAAE,EAAE;oBACnB,QAAQ,EAAE,CAAC;oBACX,UAAU,EAAE,CAAC;oBACb,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;oBAC7D,IAAI,EAAE;wBACJ,QAAQ,EAAE,KAAK;wBACf,SAAS
,EAAE,KAAK;wBAChB,MAAM,EAAE,iBAAiB;wBACzB,UAAU,EAAE,QAAQ,CAAC,QAAQ;qBAC9B;iBACF;aACF;YACD,QAAQ,EAAE,EAAE;YACZ,WAAW,EAAE;gBACX,SAAS,EAAE,IAAI;gBACf,MAAM,EAAE,SAAS;gBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;gBACtB,eAAe,EAAE,EAAE;gBACnB,oBAAoB,EAAE,EAAE;gBACxB,cAAc,EAAE,EAAE;gBAClB,aAAa,EAAE,EAAE;gBACjB,gBAAgB,EAAE;oBAChB,KAAK,EAAE,OAAO;oBACd,MAAM,EAAE,OAAO;oBACf,MAAM,EAAE,OAAO;oBACf,QAAQ,EAAE,OAAO;oBACjB,IAAI,EAAE,OAAO;oBACb,UAAU,EAAE,OAAO;iBACpB;gBACD,gBAAgB,EAAE,EAAE;aACrB;SACF;KACF,CAAC;IACF,IAAI,CAAC,gBAAgB,GAAG;QACtB;YACE,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,eAAe;YACtB,IAAI,EAAE,qCAAqC;YAC3C,EAAE;YACF,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;YAC7D,IAAI,EAAE;gBACJ,QAAQ,EAAE,KAAK;gBACf,SAAS,EAAE,KAAK;gBAChB,MAAM,EAAE,iBAAiB;gBACzB,UAAU,EAAE,QAAQ,CAAC,cAAc;aACpC;SACF;KACF,CAAC;IACF,IAAI,CAAC,MAAM,CAAC,IAAI,GAAG;QACjB,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,KAAK;QAChB,MAAM,EAAE,iBAAiB;QACzB,UAAU,EAAE,QAAQ,CAAC,SAAS;KAC/B,CAAC;IACF,IAAI,CAAC,kBAAkB,GAAG;QACxB;YACE,EAAE,EAAE,QAAQ;YACZ,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC;YACd,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,CAAC;YACd,GAAG,EAAE,QAAQ,CAAC,aAAa;YAC3B,aAAa,EAAE,EAAE;YACjB,YAAY,EAAE,EAAE;YAChB,MAAM,EAAE,gBAAgB;YACxB,kBAAkB,EAAE,CAAC;YACrB,cAAc,EAAE,aAAa;SAC9B;KACF,CAAC;IACF,EAAE,CAAC,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;IAC3E,MAAM,MAAM,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,CAAC,CAAC;IACzE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,yDAAyD,CAAC,CAAC,CAAC;IACtF,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,oCAAoC,CAAC,CAAC,CAAC;IACjE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;AACrD,CAAC;AAED,OAAO,CAAC,GAAG,CACT,IAAI,CAAC,SAAS,CAAC;IACb,EAAE,EAAE,IAAI;IACR,kBAAkB,EAAE,iBAAiB,CAAC,MAAM;IAC5C,YAAY,EAAE,WAAW,CAAC,MAAM;IAChC,YAAY,EAAE,WAAW,CAAC,MAAM;CACjC,CAAC,CACH,CAAC"}
|
package/dist/scripts/smoke.js
CHANGED
|
@@ -1248,6 +1248,193 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1248
1248
|
assert.ok(notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."), "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics");
|
|
1249
1249
|
console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
|
|
1250
1250
|
}
|
|
1251
|
+
// v4.3.0 / P1: unanimous READY with unresolved evidence must not look like a
|
|
1252
|
+
// plain unanimous_ready close-out. `not_resurfaced` is inference-only: it may
|
|
1253
|
+
// allow convergence, but the final metadata/report must keep that disposition
|
|
1254
|
+
// visible for operators.
|
|
1255
|
+
{
|
|
1256
|
+
const { sessionReportMarkdown } = await import("../src/core/reports.js");
|
|
1257
|
+
const unresolvedEvents = [];
|
|
1258
|
+
const unresolvedConfig = {
|
|
1259
|
+
...loadConfig(),
|
|
1260
|
+
data_dir: smokeTmpDir("unresolved-evidence-finalize"),
|
|
1261
|
+
budget: {
|
|
1262
|
+
...loadConfig().budget,
|
|
1263
|
+
max_session_cost_usd: 10000,
|
|
1264
|
+
preflight_max_round_cost_usd: 10000,
|
|
1265
|
+
until_stopped_max_cost_usd: 10000,
|
|
1266
|
+
},
|
|
1267
|
+
evidence_judge_autowire: {
|
|
1268
|
+
...loadConfig().evidence_judge_autowire,
|
|
1269
|
+
mode: "off",
|
|
1270
|
+
active: false,
|
|
1271
|
+
},
|
|
1272
|
+
};
|
|
1273
|
+
const unresolvedOrch = new CrossReviewOrchestrator(unresolvedConfig, (event) => unresolvedEvents.push(event.type));
|
|
1274
|
+
const unresolvedR1 = await unresolvedOrch.askPeers({
|
|
1275
|
+
task: "P1 unresolved evidence finalization guard fixture.",
|
|
1276
|
+
draft: "FORCE_NEEDS_EVIDENCE",
|
|
1277
|
+
caller: "operator",
|
|
1278
|
+
peers: ["claude"],
|
|
1279
|
+
});
|
|
1280
|
+
const unresolvedR2 = await unresolvedOrch.askPeers({
|
|
1281
|
+
session_id: unresolvedR1.session.session_id,
|
|
1282
|
+
task: "P1 unresolved evidence finalization guard fixture.",
|
|
1283
|
+
draft: "Clean revised draft, no test marker present.",
|
|
1284
|
+
caller: "operator",
|
|
1285
|
+
peers: ["claude"],
|
|
1286
|
+
});
|
|
1287
|
+
assert.equal(unresolvedR2.converged, true);
|
|
1288
|
+
assert.equal(unresolvedR2.session.outcome, "converged");
|
|
1289
|
+
assert.equal(unresolvedR2.session.outcome_reason, "unanimous_ready_with_unresolved_evidence", "v4.3.0 / P1: convergence with not_resurfaced evidence must not finalize as plain unanimous_ready");
|
|
1290
|
+
assert.ok(unresolvedEvents.includes("session.evidence_checklist_unresolved_on_finalize"), "v4.3.0 / P1: unresolved evidence close-out must emit an audit event");
|
|
1291
|
+
const unresolvedReport = sessionReportMarkdown(unresolvedOrch.store.read(unresolvedR2.session.session_id), unresolvedOrch.store.readEvents(unresolvedR2.session.session_id));
|
|
1292
|
+
assert.ok(unresolvedReport.includes("## Unresolved Evidence Disposition"), "v4.3.0 / P1: session_report must include unresolved-evidence disposition table");
|
|
1293
|
+
assert.ok(unresolvedReport.includes("not_resurfaced"), "v4.3.0 / P1: session_report must name not_resurfaced unresolved items");
|
|
1294
|
+
console.log("[smoke] unresolved_evidence_finalization_guard_test: PASS");
|
|
1295
|
+
}
|
|
1296
|
+
// v4.3.0 / P3: read-only peer reliability telemetry. This is deliberately
|
|
1297
|
+
// observational; it must not change peer selection or mutate sessions.
|
|
1298
|
+
{
|
|
1299
|
+
const { SessionStore } = await import("../src/core/session-store.js");
|
|
1300
|
+
const reliabilityStore = new SessionStore({
|
|
1301
|
+
...config,
|
|
1302
|
+
data_dir: smokeTmpDir("peer-reliability"),
|
|
1303
|
+
});
|
|
1304
|
+
const reliabilitySession = await reliabilityStore.init("peer reliability report fixture", "operator", []);
|
|
1305
|
+
const reliabilityMeta = reliabilityStore.read(reliabilitySession.session_id);
|
|
1306
|
+
const ts = new Date().toISOString();
|
|
1307
|
+
reliabilityMeta.rounds = [
|
|
1308
|
+
{
|
|
1309
|
+
round: 1,
|
|
1310
|
+
started_at: ts,
|
|
1311
|
+
completed_at: ts,
|
|
1312
|
+
caller_status: "READY",
|
|
1313
|
+
prompt_file: "agent-runs/round-1-prompt.md",
|
|
1314
|
+
peers: [
|
|
1315
|
+
{
|
|
1316
|
+
peer: "claude",
|
|
1317
|
+
provider: "anthropic",
|
|
1318
|
+
model: "claude-opus-4-8",
|
|
1319
|
+
status: "NEEDS_EVIDENCE",
|
|
1320
|
+
structured: {
|
|
1321
|
+
status: "NEEDS_EVIDENCE",
|
|
1322
|
+
summary: "needs log",
|
|
1323
|
+
confidence: "verified",
|
|
1324
|
+
evidence_sources: ["src/core/session-store.ts:1"],
|
|
1325
|
+
caller_requests: ["attach raw npm test output"],
|
|
1326
|
+
follow_ups: [],
|
|
1327
|
+
},
|
|
1328
|
+
text: "{}",
|
|
1329
|
+
raw: { fixture: true },
|
|
1330
|
+
decision_quality: "clean",
|
|
1331
|
+
parser_warnings: [],
|
|
1332
|
+
attempts: 1,
|
|
1333
|
+
latency_ms: 50,
|
|
1334
|
+
usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
|
|
1335
|
+
cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 1 },
|
|
1336
|
+
},
|
|
1337
|
+
{
|
|
1338
|
+
peer: "grok",
|
|
1339
|
+
provider: "xai",
|
|
1340
|
+
model: "grok-4.3",
|
|
1341
|
+
status: "READY",
|
|
1342
|
+
structured: {
|
|
1343
|
+
status: "READY",
|
|
1344
|
+
summary: "ready",
|
|
1345
|
+
confidence: "verified",
|
|
1346
|
+
evidence_sources: ["server_info: version 4.2.5"],
|
|
1347
|
+
caller_requests: [],
|
|
1348
|
+
follow_ups: [],
|
|
1349
|
+
},
|
|
1350
|
+
text: "{}",
|
|
1351
|
+
raw: { fixture: true },
|
|
1352
|
+
decision_quality: "format_warning",
|
|
1353
|
+
parser_warnings: ["verified_without_concrete_evidence_sources"],
|
|
1354
|
+
attempts: 1,
|
|
1355
|
+
latency_ms: 100,
|
|
1356
|
+
usage: { input_tokens: 20, output_tokens: 10, total_tokens: 30 },
|
|
1357
|
+
cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 2 },
|
|
1358
|
+
},
|
|
1359
|
+
],
|
|
1360
|
+
rejected: [
|
|
1361
|
+
{
|
|
1362
|
+
peer: "perplexity",
|
|
1363
|
+
provider: "perplexity",
|
|
1364
|
+
model: "sonar-reasoning-pro",
|
|
1365
|
+
failure_class: "provider_error",
|
|
1366
|
+
message: "fixture provider error",
|
|
1367
|
+
retryable: false,
|
|
1368
|
+
attempts: 1,
|
|
1369
|
+
latency_ms: 0,
|
|
1370
|
+
},
|
|
1371
|
+
],
|
|
1372
|
+
convergence: {
|
|
1373
|
+
converged: false,
|
|
1374
|
+
reason: "fixture",
|
|
1375
|
+
ready_peers: ["grok"],
|
|
1376
|
+
not_ready_peers: [],
|
|
1377
|
+
needs_evidence_peers: ["claude"],
|
|
1378
|
+
rejected_peers: ["perplexity"],
|
|
1379
|
+
skipped_peers: [],
|
|
1380
|
+
decision_quality: {
|
|
1381
|
+
codex: "clean",
|
|
1382
|
+
claude: "clean",
|
|
1383
|
+
gemini: "clean",
|
|
1384
|
+
deepseek: "clean",
|
|
1385
|
+
grok: "format_warning",
|
|
1386
|
+
perplexity: "failed",
|
|
1387
|
+
},
|
|
1388
|
+
blocking_details: ["claude:NEEDS_EVIDENCE", "perplexity:provider_error"],
|
|
1389
|
+
},
|
|
1390
|
+
},
|
|
1391
|
+
];
|
|
1392
|
+
reliabilityMeta.evidence_checklist = [
|
|
1393
|
+
{
|
|
1394
|
+
id: "rel-1",
|
|
1395
|
+
peer: "claude",
|
|
1396
|
+
first_round: 1,
|
|
1397
|
+
last_round: 1,
|
|
1398
|
+
round_count: 1,
|
|
1399
|
+
ask: "attach raw npm test output",
|
|
1400
|
+
first_seen_at: ts,
|
|
1401
|
+
last_seen_at: ts,
|
|
1402
|
+
status: "not_resurfaced",
|
|
1403
|
+
addressed_at_round: 2,
|
|
1404
|
+
address_method: "resurfacing",
|
|
1405
|
+
},
|
|
1406
|
+
];
|
|
1407
|
+
fs.writeFileSync(reliabilityStore.metaPath(reliabilitySession.session_id), JSON.stringify(reliabilityMeta));
|
|
1408
|
+
await reliabilityStore.appendEvent({
|
|
1409
|
+
ts,
|
|
1410
|
+
type: "session.lead_meta_audit_fabrication_detected",
|
|
1411
|
+
session_id: reliabilitySession.session_id,
|
|
1412
|
+
message: "fixture fabrication event",
|
|
1413
|
+
data: { peer: "grok" },
|
|
1414
|
+
});
|
|
1415
|
+
const reliability = reliabilityStore.peerReliabilityReport();
|
|
1416
|
+
assert.equal(reliability.scope, "all");
|
|
1417
|
+
assert.equal(reliability.by_peer.claude?.needs_evidence, 1);
|
|
1418
|
+
assert.equal(reliability.by_peer.claude?.not_resurfaced_asks, 1);
|
|
1419
|
+
assert.equal(reliability.by_peer.grok?.ready, 1);
|
|
1420
|
+
assert.equal(reliability.by_peer.grok?.parser_warnings_total, 1);
|
|
1421
|
+
assert.equal(reliability.by_peer.grok?.fabrication_events, 1);
|
|
1422
|
+
assert.equal(reliability.by_peer.perplexity?.provider_errors, 1);
|
|
1423
|
+
console.log("[smoke] peer_reliability_report_test: PASS");
|
|
1424
|
+
}
|
|
1425
|
+
// v4.3.0 / P2: offline declarative eval harness. This pins the existence of a
|
|
1426
|
+
// no-provider-call fixture runner so regressions found in real sessions can be
|
|
1427
|
+
// replayed without growing the ad hoc smoke body indefinitely.
|
|
1428
|
+
// Static contract check: reads package.json and scripts/eval-fixtures.ts as
// text and asserts on their contents; the harness itself is not executed here.
// NOTE(review): both paths are relative, so they resolve against the process
// working directory - presumably the repository root; confirm how smoke is run.
// The final check is a textual regex scan, so it only detects the literal
// spellings of the provider-review entry points named in the pattern.
{
|
|
1429
|
+
const pkg = JSON.parse(fs.readFileSync("package.json", "utf8"));
|
|
1430
|
+
assert.equal(pkg.scripts?.["eval:fixtures"], "tsx scripts/eval-fixtures.ts", "v4.3.0 / P2: package.json must expose the offline fixture eval runner");
|
|
1431
|
+
const evalHarness = fs.readFileSync("scripts/eval-fixtures.ts", "utf8");
|
|
1432
|
+
assert.ok(/truthfulnessCases/.test(evalHarness) &&
|
|
1433
|
+
/parserCases/.test(evalHarness) &&
|
|
1434
|
+
/reportCases/.test(evalHarness), "v4.3.0 / P2: eval-fixtures must use declarative truthfulness/parser/report case tables");
|
|
1435
|
+
assert.ok(!/askPeers\(|runUntilUnanimous\(|session_start_round/.test(evalHarness), "v4.3.0 / P2: eval-fixtures must stay offline and avoid provider-review entry points");
|
|
1436
|
+
console.log("[smoke] offline_fixture_eval_contract_test: PASS");
|
|
1437
|
+
}
|
|
1251
1438
|
// v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
|
|
1252
1439
|
// orchestrator emits a one-shot warning when cumulative cost crosses
|
|
1253
1440
|
// 75% of cost_ceiling_usd; the budget_warning_emitted flag persists
|