@lcv-ideas-software/cross-review 4.2.5 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/README.md +10 -2
- package/dist/scripts/eval-fixtures.d.ts +54 -0
- package/dist/scripts/eval-fixtures.js +216 -0
- package/dist/scripts/eval-fixtures.js.map +1 -0
- package/dist/scripts/smoke.js +195 -3
- package/dist/scripts/smoke.js.map +1 -1
- package/dist/src/core/config.d.ts +1 -1
- package/dist/src/core/config.js +1 -1
- package/dist/src/core/convergence.js +5 -3
- package/dist/src/core/convergence.js.map +1 -1
- package/dist/src/core/orchestrator.js +31 -3
- package/dist/src/core/orchestrator.js.map +1 -1
- package/dist/src/core/reports.d.ts +2 -1
- package/dist/src/core/reports.js +30 -0
- package/dist/src/core/reports.js.map +1 -1
- package/dist/src/core/session-store.d.ts +2 -1
- package/dist/src/core/session-store.js +141 -0
- package/dist/src/core/session-store.js.map +1 -1
- package/dist/src/core/types.d.ts +30 -0
- package/dist/src/mcp/server.js +15 -0
- package/dist/src/mcp/server.js.map +1 -1
- package/dist/src/peers/errors.js +5 -2
- package/dist/src/peers/errors.js.map +1 -1
- package/docs/apresentacao-cross-review.md +11 -8
- package/docs/apresentacao.md +24 -20
- package/docs/architecture.md +10 -0
- package/package.json +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,54 @@ standard `v00.00.00`; npm package versions remain SemVer.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v04.03.01] — 2026-06-05
|
|
11
|
+
|
|
12
|
+
**Patch — provider skip classification hotfix.** This release follows up on a
|
|
13
|
+
real hard-gate incident where Claude/Anthropic was skipped after provider
|
|
14
|
+
overload. The immediate provider cause was Anthropic `overloaded_error`; the
|
|
15
|
+
runtime issue was that any `provider_error`, including non-retryable provider
|
|
16
|
+
400-style failures, could be treated as skippable.
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
|
|
20
|
+
- `provider_error` is now skippable only when classified as retryable, so
|
|
21
|
+
non-retryable provider payload/schema rejections block convergence instead of
|
|
22
|
+
being silently removed from the panel.
|
|
23
|
+
- Anthropic `overloaded_error` without preserved HTTP status text is now treated
|
|
24
|
+
as retryable, matching HTTP 529 overload behavior.
|
|
25
|
+
- `session.peer_skipped_unavailable` events now include retryability, recovery
|
|
26
|
+
hint, and a redacted provider error preview in event data.
|
|
27
|
+
|
|
28
|
+
## [v04.03.00] — 2026-06-05
|
|
29
|
+
|
|
30
|
+
**Minor — P1/P2/P3 audit follow-up.** This release closes the first concrete
|
|
31
|
+
items from the post-v4.2.5 runtime/session audit: unresolved evidence is harder
|
|
32
|
+
to miss at finalization time, fixture-level regressions can be evaluated
|
|
33
|
+
offline, and operators get a read-only peer reliability report without changing
|
|
34
|
+
peer selection.
|
|
35
|
+
|
|
36
|
+
### Added
|
|
37
|
+
|
|
38
|
+
- Added `session_peer_reliability_report`, a read-only MCP tool that aggregates
|
|
39
|
+
per-peer parser warnings, decision quality, rejected/provider failures,
|
|
40
|
+
evidence checklist dispositions, fabrication-related events, latency and
|
|
41
|
+
cost.
|
|
42
|
+
- Added `npm run eval:fixtures`, an offline fixture harness for truthfulness
|
|
43
|
+
preflight, parser diagnostics and report rendering contracts. It does not
|
|
44
|
+
start provider sessions or call reviewers.
|
|
45
|
+
- `session_report` now includes an **Unresolved Evidence Disposition** section
|
|
46
|
+
when checklist items remain `open` or `not_resurfaced`.
|
|
47
|
+
|
|
48
|
+
### Changed
|
|
49
|
+
|
|
50
|
+
- Automatic convergence with unresolved checklist items now finalizes with
|
|
51
|
+
`unanimous_ready_with_unresolved_evidence` or
|
|
52
|
+
`recovered_unanimity_with_unresolved_evidence` instead of a plain success
|
|
53
|
+
reason.
|
|
54
|
+
- Finalization now emits `session.evidence_checklist_unresolved_on_finalize`
|
|
55
|
+
with unresolved counts and item summaries when a session closes while
|
|
56
|
+
evidence asks are still open or only inferred as not resurfaced.
|
|
57
|
+
|
|
10
58
|
## [v04.02.05] — 2026-06-05
|
|
11
59
|
|
|
12
60
|
**Patch — session audit hardening.** This release closes follow-ups from the
|
package/README.md
CHANGED
|
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
|
|
|
24
24
|
npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Status.** Stable. Current release: **v04.02.05** (npm package `4.2.5`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
27
|
+
**Status.** Stable. Current release: **v04.03.01** (npm package `4.3.1`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
28
28
|
|
|
29
29
|
> **Project renamed 2026-05-15.** This project was previously published as
|
|
30
30
|
> [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
|
|
@@ -38,6 +38,8 @@ The version history at a glance:
|
|
|
38
38
|
|
|
39
39
|
| Release | Scope |
|
|
40
40
|
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
41
|
+
| **`v04.03.01`** | Patch — tighten skip-peer classification so non-retryable provider errors block, while Anthropic overload events remain retryable and better surfaced in skip diagnostics. |
|
|
42
|
+
| **`v04.03.00`** | Minor — P1/P2/P3 follow-up with unresolved-evidence close-out visibility, an offline fixture eval harness, and a read-only peer reliability report. |
|
|
41
43
|
| **`v04.02.05`** | Patch — harden session auditability with terminal events, cost split reporting, `not_resurfaced` visibility, and relator provenance checks for session IDs/GitHub URLs. |
|
|
42
44
|
| **`v04.02.04`** | Patch — harden truthfulness preflight auditability, add a read-only preflight retest tool, and reduce false parser warnings for attached/log evidence. |
|
|
43
45
|
| **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
|
|
@@ -211,6 +213,7 @@ these environment variables before running real sessions (example):
|
|
|
211
213
|
- `session_metrics`
|
|
212
214
|
- `session_doctor`
|
|
213
215
|
- `session_report`
|
|
216
|
+
- `session_peer_reliability_report`
|
|
214
217
|
- `session_check_convergence`
|
|
215
218
|
- `session_truthfulness_preflight_check`
|
|
216
219
|
- `session_attach_evidence`
|
|
@@ -228,7 +231,12 @@ these environment variables before running real sessions (example):
|
|
|
228
231
|
lack terminal events, and reports peer-call cost separately from generation
|
|
229
232
|
artifact cost. `session_report` uses the same split and calls out
|
|
230
233
|
`not_resurfaced` evidence checklist items as inference-only, not proof that the
|
|
231
|
-
requested evidence was satisfied.
|
|
234
|
+
requested evidence was satisfied. If a session otherwise reaches unanimity with
|
|
235
|
+
open or `not_resurfaced` checklist items, finalization records an
|
|
236
|
+
`*_with_unresolved_evidence` outcome reason and emits a durable unresolved
|
|
237
|
+
evidence event. `session_peer_reliability_report` is read-only and aggregates
|
|
238
|
+
per-peer parser warnings, evidence ask status, provider failures, cost and
|
|
239
|
+
latency.
|
|
232
240
|
|
|
233
241
|
## Repository conventions
|
|
234
242
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
export declare const truthfulnessCases: readonly [{
|
|
2
|
+
readonly name: "current runtime contradiction is blocked";
|
|
3
|
+
readonly input: {
|
|
4
|
+
readonly task: "The current cross-review runtime is 4.2.4.";
|
|
5
|
+
readonly runtimeFacts: {
|
|
6
|
+
readonly runtime_version: "4.2.5";
|
|
7
|
+
readonly release_date: "2026-06-05";
|
|
8
|
+
};
|
|
9
|
+
readonly attachmentsPresent: false;
|
|
10
|
+
};
|
|
11
|
+
readonly expectPass: false;
|
|
12
|
+
readonly expectIssueClass: "runtime_contradiction";
|
|
13
|
+
}, {
|
|
14
|
+
readonly name: "matching current runtime facts pass";
|
|
15
|
+
readonly input: {
|
|
16
|
+
readonly task: "server_info shows current cross-review runtime 4.2.5.";
|
|
17
|
+
readonly runtimeFacts: {
|
|
18
|
+
readonly runtime_version: "4.2.5";
|
|
19
|
+
readonly release_date: "2026-06-05";
|
|
20
|
+
};
|
|
21
|
+
readonly attachmentsPresent: false;
|
|
22
|
+
};
|
|
23
|
+
readonly expectPass: true;
|
|
24
|
+
}, {
|
|
25
|
+
readonly name: "historical timing claim needs snapshot evidence";
|
|
26
|
+
readonly input: {
|
|
27
|
+
readonly task: "When the audit began, cross-review was running 4.2.4.";
|
|
28
|
+
readonly runtimeFacts: {
|
|
29
|
+
readonly runtime_version: "4.2.5";
|
|
30
|
+
readonly release_date: "2026-06-05";
|
|
31
|
+
};
|
|
32
|
+
readonly attachmentsPresent: false;
|
|
33
|
+
};
|
|
34
|
+
readonly expectPass: false;
|
|
35
|
+
readonly expectIssueClass: "unsupported_historical_claim";
|
|
36
|
+
}];
|
|
37
|
+
export declare const parserCases: readonly [{
|
|
38
|
+
readonly name: "verified with empty evidence gets empty-evidence warning";
|
|
39
|
+
readonly text: string;
|
|
40
|
+
readonly expectStatus: "READY";
|
|
41
|
+
readonly expectWarning: "verified_without_evidence_sources";
|
|
42
|
+
}, {
|
|
43
|
+
readonly name: "verified with attached evidence path is concrete";
|
|
44
|
+
readonly text: string;
|
|
45
|
+
readonly expectStatus: "READY";
|
|
46
|
+
readonly absentWarning: "verified_without_evidence_sources";
|
|
47
|
+
}];
|
|
48
|
+
export declare const reportCases: readonly [{
|
|
49
|
+
readonly name: "cost split and unresolved evidence are surfaced";
|
|
50
|
+
readonly peerCost: 14.652426;
|
|
51
|
+
readonly generationCost: 1.876718;
|
|
52
|
+
readonly totalCost: 16.529144;
|
|
53
|
+
readonly unresolvedAsk: "attach raw npm test output";
|
|
54
|
+
}];
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { loadConfig } from "../src/core/config.js";
|
|
6
|
+
import { truthfulnessPreflight } from "../src/core/orchestrator.js";
|
|
7
|
+
import { sessionReportMarkdown } from "../src/core/reports.js";
|
|
8
|
+
import { SessionStore } from "../src/core/session-store.js";
|
|
9
|
+
import { parsePeerStatus } from "../src/core/status.js";
|
|
10
|
+
function evalTmpDir(label) {
|
|
11
|
+
return fs.mkdtempSync(path.join(os.tmpdir(), `cross-review-eval-${label}-`));
|
|
12
|
+
}
|
|
13
|
+
export const truthfulnessCases = [
|
|
14
|
+
{
|
|
15
|
+
name: "current runtime contradiction is blocked",
|
|
16
|
+
input: {
|
|
17
|
+
task: "The current cross-review runtime is 4.2.4.",
|
|
18
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
19
|
+
attachmentsPresent: false,
|
|
20
|
+
},
|
|
21
|
+
expectPass: false,
|
|
22
|
+
expectIssueClass: "runtime_contradiction",
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
name: "matching current runtime facts pass",
|
|
26
|
+
input: {
|
|
27
|
+
task: "server_info shows current cross-review runtime 4.2.5.",
|
|
28
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
29
|
+
attachmentsPresent: false,
|
|
30
|
+
},
|
|
31
|
+
expectPass: true,
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
name: "historical timing claim needs snapshot evidence",
|
|
35
|
+
input: {
|
|
36
|
+
task: "When the audit began, cross-review was running 4.2.4.",
|
|
37
|
+
runtimeFacts: { runtime_version: "4.2.5", release_date: "2026-06-05" },
|
|
38
|
+
attachmentsPresent: false,
|
|
39
|
+
},
|
|
40
|
+
expectPass: false,
|
|
41
|
+
expectIssueClass: "unsupported_historical_claim",
|
|
42
|
+
},
|
|
43
|
+
];
|
|
44
|
+
export const parserCases = [
|
|
45
|
+
{
|
|
46
|
+
name: "verified with empty evidence gets empty-evidence warning",
|
|
47
|
+
text: JSON.stringify({
|
|
48
|
+
status: "READY",
|
|
49
|
+
summary: "ok",
|
|
50
|
+
confidence: "verified",
|
|
51
|
+
evidence_sources: [],
|
|
52
|
+
caller_requests: [],
|
|
53
|
+
follow_ups: [],
|
|
54
|
+
}),
|
|
55
|
+
expectStatus: "READY",
|
|
56
|
+
expectWarning: "verified_without_evidence_sources",
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
name: "verified with attached evidence path is concrete",
|
|
60
|
+
text: JSON.stringify({
|
|
61
|
+
status: "READY",
|
|
62
|
+
summary: "ok",
|
|
63
|
+
confidence: "verified",
|
|
64
|
+
evidence_sources: ["evidence/2026-06-05T00-00-00Z-raw-smoke.txt: npm test 42 passed"],
|
|
65
|
+
caller_requests: [],
|
|
66
|
+
follow_ups: [],
|
|
67
|
+
}),
|
|
68
|
+
expectStatus: "READY",
|
|
69
|
+
absentWarning: "verified_without_evidence_sources",
|
|
70
|
+
},
|
|
71
|
+
];
|
|
72
|
+
export const reportCases = [
|
|
73
|
+
{
|
|
74
|
+
name: "cost split and unresolved evidence are surfaced",
|
|
75
|
+
peerCost: 14.652426,
|
|
76
|
+
generationCost: 1.876718,
|
|
77
|
+
totalCost: 16.529144,
|
|
78
|
+
unresolvedAsk: "attach raw npm test output",
|
|
79
|
+
},
|
|
80
|
+
];
|
|
81
|
+
for (const testCase of truthfulnessCases) {
|
|
82
|
+
const result = truthfulnessPreflight({
|
|
83
|
+
task: testCase.input.task,
|
|
84
|
+
runtimeFacts: testCase.input.runtimeFacts,
|
|
85
|
+
attachmentsPresent: testCase.input.attachmentsPresent,
|
|
86
|
+
});
|
|
87
|
+
assert.equal(result.pass, testCase.expectPass, testCase.name);
|
|
88
|
+
if ("expectIssueClass" in testCase) {
|
|
89
|
+
assert.ok(result.issue_classes.includes(testCase.expectIssueClass), testCase.name);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
for (const testCase of parserCases) {
|
|
93
|
+
const result = parsePeerStatus(testCase.text);
|
|
94
|
+
assert.equal(result.status, testCase.expectStatus, testCase.name);
|
|
95
|
+
if ("expectWarning" in testCase) {
|
|
96
|
+
assert.ok(result.parser_warnings.includes(testCase.expectWarning), testCase.name);
|
|
97
|
+
}
|
|
98
|
+
if ("absentWarning" in testCase) {
|
|
99
|
+
assert.ok(!result.parser_warnings.includes(testCase.absentWarning), testCase.name);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
for (const testCase of reportCases) {
|
|
103
|
+
const store = new SessionStore({
|
|
104
|
+
...loadConfig(),
|
|
105
|
+
data_dir: evalTmpDir("report"),
|
|
106
|
+
});
|
|
107
|
+
const session = await store.init(`eval report fixture: ${testCase.name}`, "operator", []);
|
|
108
|
+
const meta = store.read(session.session_id);
|
|
109
|
+
const ts = new Date().toISOString();
|
|
110
|
+
meta.rounds = [
|
|
111
|
+
{
|
|
112
|
+
round: 1,
|
|
113
|
+
started_at: ts,
|
|
114
|
+
completed_at: ts,
|
|
115
|
+
caller_status: "READY",
|
|
116
|
+
prompt_file: "agent-runs/round-1-prompt.md",
|
|
117
|
+
peers: [
|
|
118
|
+
{
|
|
119
|
+
peer: "codex",
|
|
120
|
+
provider: "openai",
|
|
121
|
+
model: "gpt-5.5",
|
|
122
|
+
status: "READY",
|
|
123
|
+
structured: {
|
|
124
|
+
status: "READY",
|
|
125
|
+
summary: "ready",
|
|
126
|
+
confidence: "verified",
|
|
127
|
+
evidence_sources: ["server_info: version 4.2.5"],
|
|
128
|
+
caller_requests: [],
|
|
129
|
+
follow_ups: [],
|
|
130
|
+
},
|
|
131
|
+
text: "{}",
|
|
132
|
+
raw: { fixture: true },
|
|
133
|
+
decision_quality: "clean",
|
|
134
|
+
parser_warnings: [],
|
|
135
|
+
attempts: 1,
|
|
136
|
+
latency_ms: 1,
|
|
137
|
+
usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
|
|
138
|
+
cost: {
|
|
139
|
+
currency: "USD",
|
|
140
|
+
estimated: false,
|
|
141
|
+
source: "configured-rate",
|
|
142
|
+
total_cost: testCase.peerCost,
|
|
143
|
+
},
|
|
144
|
+
},
|
|
145
|
+
],
|
|
146
|
+
rejected: [],
|
|
147
|
+
convergence: {
|
|
148
|
+
converged: true,
|
|
149
|
+
reason: "fixture",
|
|
150
|
+
ready_peers: ["codex"],
|
|
151
|
+
not_ready_peers: [],
|
|
152
|
+
needs_evidence_peers: [],
|
|
153
|
+
rejected_peers: [],
|
|
154
|
+
skipped_peers: [],
|
|
155
|
+
decision_quality: {
|
|
156
|
+
codex: "clean",
|
|
157
|
+
claude: "clean",
|
|
158
|
+
gemini: "clean",
|
|
159
|
+
deepseek: "clean",
|
|
160
|
+
grok: "clean",
|
|
161
|
+
perplexity: "clean",
|
|
162
|
+
},
|
|
163
|
+
blocking_details: [],
|
|
164
|
+
},
|
|
165
|
+
},
|
|
166
|
+
];
|
|
167
|
+
meta.generation_files = [
|
|
168
|
+
{
|
|
169
|
+
round: 0,
|
|
170
|
+
peer: "codex",
|
|
171
|
+
label: "initial_draft",
|
|
172
|
+
path: "agent-runs/round-0-initial-draft.md",
|
|
173
|
+
ts,
|
|
174
|
+
usage: { input_tokens: 1, output_tokens: 1, total_tokens: 2 },
|
|
175
|
+
cost: {
|
|
176
|
+
currency: "USD",
|
|
177
|
+
estimated: false,
|
|
178
|
+
source: "configured-rate",
|
|
179
|
+
total_cost: testCase.generationCost,
|
|
180
|
+
},
|
|
181
|
+
},
|
|
182
|
+
];
|
|
183
|
+
meta.totals.cost = {
|
|
184
|
+
currency: "USD",
|
|
185
|
+
estimated: false,
|
|
186
|
+
source: "configured-rate",
|
|
187
|
+
total_cost: testCase.totalCost,
|
|
188
|
+
};
|
|
189
|
+
meta.evidence_checklist = [
|
|
190
|
+
{
|
|
191
|
+
id: "eval-1",
|
|
192
|
+
peer: "codex",
|
|
193
|
+
first_round: 1,
|
|
194
|
+
last_round: 1,
|
|
195
|
+
round_count: 1,
|
|
196
|
+
ask: testCase.unresolvedAsk,
|
|
197
|
+
first_seen_at: ts,
|
|
198
|
+
last_seen_at: ts,
|
|
199
|
+
status: "not_resurfaced",
|
|
200
|
+
addressed_at_round: 2,
|
|
201
|
+
address_method: "resurfacing",
|
|
202
|
+
},
|
|
203
|
+
];
|
|
204
|
+
fs.writeFileSync(store.metaPath(session.session_id), JSON.stringify(meta));
|
|
205
|
+
const report = sessionReportMarkdown(store.read(session.session_id), []);
|
|
206
|
+
assert.ok(report.includes("$16.529144 USD = $14.652426 peer + $1.876718 generation"));
|
|
207
|
+
assert.ok(report.includes("## Unresolved Evidence Disposition"));
|
|
208
|
+
assert.ok(report.includes(testCase.unresolvedAsk));
|
|
209
|
+
}
|
|
210
|
+
console.log(JSON.stringify({
|
|
211
|
+
ok: true,
|
|
212
|
+
truthfulness_cases: truthfulnessCases.length,
|
|
213
|
+
parser_cases: parserCases.length,
|
|
214
|
+
report_cases: reportCases.length,
|
|
215
|
+
}));
|
|
216
|
+
//# sourceMappingURL=eval-fixtures.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-fixtures.js","sourceRoot":"","sources":["../../scripts/eval-fixtures.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,qBAAqB,KAAK,GAAG,CAAC,CAAC,CAAC;AAC/E,CAAC;AAED,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B;QACE,IAAI,EAAE,0CAA0C;QAChD,KAAK,EAAE;YACL,IAAI,EAAE,4CAA4C;YAClD,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,uBAAuB;KAC1C;IACD;QACE,IAAI,EAAE,qCAAqC;QAC3C,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,IAAI;KACjB;IACD;QACE,IAAI,EAAE,iDAAiD;QACvD,KAAK,EAAE;YACL,IAAI,EAAE,uDAAuD;YAC7D,YAAY,EAAE,EAAE,eAAe,EAAE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE;YACtE,kBAAkB,EAAE,KAAK;SAC1B;QACD,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,8BAA8B;KACjD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,0DAA0D;QAChE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,EAAE;YACpB,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;IACD;QACE,IAAI,EAAE,kDAAkD;QACxD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,UAAU;YACtB,gBAAgB,EAAE,CAAC,iEAAiE,CAAC;YACrF,eAAe,EAAE,EAAE;YACnB,UAAU,EAAE,EAAE;SACf,CAAC;QACF,YAAY,EAAE,OAAO;QACrB,aAAa,EAAE,mCAAmC;KACnD;CACO,CAAC;AAEX,MAAM,CAAC,MAAM,WAAW,GAAG;IACzB;QACE,IAAI,EAAE,iDAAiD;QACvD,QAAQ,EAAE,SAAS;QACnB,cAAc,EAAE,QAAQ;QACxB,SAAS,EAAE,SAAS;QACpB,aAAa,EAAE,4BAA4B;KAC5C;CACO,CAAC;AAEX,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;I
ACzC,MAAM,MAAM,GAAG,qBAAqB,CAAC;QACnC,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,IAAI;QACzB,YAAY,EAAE,QAAQ,CAAC,KAAK,CAAC,YAAY;QACzC,kBAAkB,EAAE,QAAQ,CAAC,KAAK,CAAC,kBAAkB;KACtD,CAAC,CAAC;IACH,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,QAAQ,CAAC,UAAU,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9D,IAAI,kBAAkB,IAAI,QAAQ,EAAE,CAAC;QACnC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,aAAa,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC9C,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IAClE,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACpF,CAAC;IACD,IAAI,eAAe,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;IACrF,CAAC;AACH,CAAC;AAED,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC;QAC7B,GAAG,UAAU,EAAE;QACf,QAAQ,EAAE,UAAU,CAAC,QAAQ,CAAC;KAC/B,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,IAAI,CAAC,wBAAwB,QAAQ,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,EAAE,CAAC,CAAC;IAC1F,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IAC5C,MAAM,EAAE,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IACpC,IAAI,CAAC,MAAM,GAAG;QACZ;YACE,KAAK,EAAE,CAAC;YACR,UAAU,EAAE,EAAE;YACd,YAAY,EAAE,EAAE;YAChB,aAAa,EAAE,OAAO;YACtB,WAAW,EAAE,8BAA8B;YAC3C,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,OAAO;oBACb,QAAQ,EAAE,QAAQ;oBAClB,KAAK,EAAE,SAAS;oBAChB,MAAM,EAAE,OAAO;oBACf,UAAU,EAAE;wBACV,MAAM,EAAE,OAAO;wBACf,OAAO,EAAE,OAAO;wBAChB,UAAU,EAAE,UAAU;wBACtB,gBAAgB,EAAE,CAAC,4BAA4B,CAAC;wBAChD,eAAe,EAAE,EAAE;wBACnB,UAAU,EAAE,EAAE;qBACf;oBACD,IAAI,EAAE,IAAI;oBACV,GAAG,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE;oBACtB,gBAAgB,EAAE,OAAO;oBACzB,eAAe,EAAE,EAAE;oBACnB,QAAQ,EAAE,CAAC;oBACX,UAAU,EAAE,CAAC;oBACb,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;oBAC7D,IAAI,EAAE;wBACJ,QAAQ,EAAE,KAAK;wBACf,SAAS
,EAAE,KAAK;wBAChB,MAAM,EAAE,iBAAiB;wBACzB,UAAU,EAAE,QAAQ,CAAC,QAAQ;qBAC9B;iBACF;aACF;YACD,QAAQ,EAAE,EAAE;YACZ,WAAW,EAAE;gBACX,SAAS,EAAE,IAAI;gBACf,MAAM,EAAE,SAAS;gBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;gBACtB,eAAe,EAAE,EAAE;gBACnB,oBAAoB,EAAE,EAAE;gBACxB,cAAc,EAAE,EAAE;gBAClB,aAAa,EAAE,EAAE;gBACjB,gBAAgB,EAAE;oBAChB,KAAK,EAAE,OAAO;oBACd,MAAM,EAAE,OAAO;oBACf,MAAM,EAAE,OAAO;oBACf,QAAQ,EAAE,OAAO;oBACjB,IAAI,EAAE,OAAO;oBACb,UAAU,EAAE,OAAO;iBACpB;gBACD,gBAAgB,EAAE,EAAE;aACrB;SACF;KACF,CAAC;IACF,IAAI,CAAC,gBAAgB,GAAG;QACtB;YACE,KAAK,EAAE,CAAC;YACR,IAAI,EAAE,OAAO;YACb,KAAK,EAAE,eAAe;YACtB,IAAI,EAAE,qCAAqC;YAC3C,EAAE;YACF,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE;YAC7D,IAAI,EAAE;gBACJ,QAAQ,EAAE,KAAK;gBACf,SAAS,EAAE,KAAK;gBAChB,MAAM,EAAE,iBAAiB;gBACzB,UAAU,EAAE,QAAQ,CAAC,cAAc;aACpC;SACF;KACF,CAAC;IACF,IAAI,CAAC,MAAM,CAAC,IAAI,GAAG;QACjB,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,KAAK;QAChB,MAAM,EAAE,iBAAiB;QACzB,UAAU,EAAE,QAAQ,CAAC,SAAS;KAC/B,CAAC;IACF,IAAI,CAAC,kBAAkB,GAAG;QACxB;YACE,EAAE,EAAE,QAAQ;YACZ,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC;YACd,UAAU,EAAE,CAAC;YACb,WAAW,EAAE,CAAC;YACd,GAAG,EAAE,QAAQ,CAAC,aAAa;YAC3B,aAAa,EAAE,EAAE;YACjB,YAAY,EAAE,EAAE;YAChB,MAAM,EAAE,gBAAgB;YACxB,kBAAkB,EAAE,CAAC;YACrB,cAAc,EAAE,aAAa;SAC9B;KACF,CAAC;IACF,EAAE,CAAC,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;IAC3E,MAAM,MAAM,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,CAAC,CAAC;IACzE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,yDAAyD,CAAC,CAAC,CAAC;IACtF,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,oCAAoC,CAAC,CAAC,CAAC;IACjE,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC,CAAC;AACrD,CAAC;AAED,OAAO,CAAC,GAAG,CACT,IAAI,CAAC,SAAS,CAAC;IACb,EAAE,EAAE,IAAI;IACR,kBAAkB,EAAE,iBAAiB,CAAC,MAAM;IAC5C,YAAY,EAAE,WAAW,CAAC,MAAM;IAChC,YAAY,EAAE,WAAW,CAAC,MAAM;CACjC,CAAC,CACH,CAAC"}
|
package/dist/scripts/smoke.js
CHANGED
|
@@ -22,6 +22,7 @@ import { SWEEP_MIN_IDLE_MS } from "../src/core/session-store.js";
|
|
|
22
22
|
import { parsePeerStatus } from "../src/core/status.js";
|
|
23
23
|
import { PEERS } from "../src/core/types.js";
|
|
24
24
|
import { getCallerCandidatesFromClientInfo, lockCallerPeerSelection, pruneCompletedJobs, SessionIdSchema, setHostTokensRecord, verifyCallerIdentity, } from "../src/mcp/server.js";
|
|
25
|
+
import { classifyProviderError } from "../src/peers/errors.js";
|
|
25
26
|
import { selectFromCandidates } from "../src/peers/model-selection.js";
|
|
26
27
|
import { StubAdapter } from "../src/peers/stub.js";
|
|
27
28
|
import { redact } from "../src/security/redact.js";
|
|
@@ -488,13 +489,13 @@ assert.equal(checkConvergence(["codex", "claude"], "READY", [fakeReady("codex"),
|
|
|
488
489
|
// taxonomy + the skip-gated quorum floor + the zero-skip non-regression
|
|
489
490
|
// invariant.
|
|
490
491
|
{
|
|
491
|
-
const fakeFailure = (peer, failureClass) => ({
|
|
492
|
+
const fakeFailure = (peer, failureClass, retryable = true) => ({
|
|
492
493
|
peer,
|
|
493
494
|
provider: "stub",
|
|
494
495
|
model: "stub",
|
|
495
496
|
failure_class: failureClass,
|
|
496
497
|
message: `stub ${failureClass}`,
|
|
497
|
-
retryable
|
|
498
|
+
retryable,
|
|
498
499
|
attempts: 3,
|
|
499
500
|
latency_ms: 0,
|
|
500
501
|
});
|
|
@@ -509,8 +510,12 @@ assert.equal(checkConvergence(["codex", "claude"], "READY", [fakeReady("codex"),
|
|
|
509
510
|
"fallback_exhausted",
|
|
510
511
|
]) {
|
|
511
512
|
assert.equal(isSkippableFailure(fakeFailure("grok", fc)), true, `v3.7.3 / skip-peer: ${fc} must be skippable (infra unavailability)`);
|
|
512
|
-
assert.equal(SKIPPABLE_FAILURE_CLASSES.has(fc), true, `v3.7.3 / skip-peer: ${fc} must be in SKIPPABLE_FAILURE_CLASSES`);
|
|
513
513
|
}
|
|
514
|
+
assert.equal(SKIPPABLE_FAILURE_CLASSES.has("provider_error"), true, "v3.7.3 / skip-peer: provider_error remains a known skip candidate when retryable");
|
|
515
|
+
assert.equal(isSkippableFailure(fakeFailure("grok", "provider_error", false)), false, "v4.3.1 / skip-peer: non-retryable provider_error (e.g. provider 400 payload/schema rejection) must block instead of being skipped");
|
|
516
|
+
const anthropicOverloaded = classifyProviderError("claude", "anthropic", "claude-opus-4-8", new Error('{"type":"error","error":{"details":null,"type":"overloaded_error","message":"Overloaded"},"request_id":"req_fixture"}'), 1, Date.now());
|
|
517
|
+
assert.equal(anthropicOverloaded.failure_class, "provider_error");
|
|
518
|
+
assert.equal(anthropicOverloaded.retryable, true, "v4.3.1 / provider-errors: Anthropic overloaded_error without HTTP status text must still be retryable");
|
|
514
519
|
for (const fc of [
|
|
515
520
|
"schema",
|
|
516
521
|
"unparseable_after_recovery",
|
|
@@ -1248,6 +1253,193 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1248
1253
|
assert.ok(notResurfacedReport.includes("not_resurfaced means the ask was not repeated; it is not proof that evidence was satisfied."), "v4.2.5 / not_resurfaced: session report must state the not_resurfaced semantics");
|
|
1249
1254
|
console.log("[smoke] terminal_cost_evidence_audit_test: PASS");
|
|
1250
1255
|
}
|
|
1256
|
+
// v4.3.0 / P1: unanimous READY with unresolved evidence must not look like a
|
|
1257
|
+
// plain unanimous_ready close-out. `not_resurfaced` is inference-only: it may
|
|
1258
|
+
// allow convergence, but the final metadata/report must keep that disposition
|
|
1259
|
+
// visible for operators.
|
|
1260
|
+
{
|
|
1261
|
+
const { sessionReportMarkdown } = await import("../src/core/reports.js");
|
|
1262
|
+
const unresolvedEvents = [];
|
|
1263
|
+
const unresolvedConfig = {
|
|
1264
|
+
...loadConfig(),
|
|
1265
|
+
data_dir: smokeTmpDir("unresolved-evidence-finalize"),
|
|
1266
|
+
budget: {
|
|
1267
|
+
...loadConfig().budget,
|
|
1268
|
+
max_session_cost_usd: 10000,
|
|
1269
|
+
preflight_max_round_cost_usd: 10000,
|
|
1270
|
+
until_stopped_max_cost_usd: 10000,
|
|
1271
|
+
},
|
|
1272
|
+
evidence_judge_autowire: {
|
|
1273
|
+
...loadConfig().evidence_judge_autowire,
|
|
1274
|
+
mode: "off",
|
|
1275
|
+
active: false,
|
|
1276
|
+
},
|
|
1277
|
+
};
|
|
1278
|
+
const unresolvedOrch = new CrossReviewOrchestrator(unresolvedConfig, (event) => unresolvedEvents.push(event.type));
|
|
1279
|
+
const unresolvedR1 = await unresolvedOrch.askPeers({
|
|
1280
|
+
task: "P1 unresolved evidence finalization guard fixture.",
|
|
1281
|
+
draft: "FORCE_NEEDS_EVIDENCE",
|
|
1282
|
+
caller: "operator",
|
|
1283
|
+
peers: ["claude"],
|
|
1284
|
+
});
|
|
1285
|
+
const unresolvedR2 = await unresolvedOrch.askPeers({
|
|
1286
|
+
session_id: unresolvedR1.session.session_id,
|
|
1287
|
+
task: "P1 unresolved evidence finalization guard fixture.",
|
|
1288
|
+
draft: "Clean revised draft, no test marker present.",
|
|
1289
|
+
caller: "operator",
|
|
1290
|
+
peers: ["claude"],
|
|
1291
|
+
});
|
|
1292
|
+
assert.equal(unresolvedR2.converged, true);
|
|
1293
|
+
assert.equal(unresolvedR2.session.outcome, "converged");
|
|
1294
|
+
assert.equal(unresolvedR2.session.outcome_reason, "unanimous_ready_with_unresolved_evidence", "v4.3.0 / P1: convergence with not_resurfaced evidence must not finalize as plain unanimous_ready");
|
|
1295
|
+
assert.ok(unresolvedEvents.includes("session.evidence_checklist_unresolved_on_finalize"), "v4.3.0 / P1: unresolved evidence close-out must emit an audit event");
|
|
1296
|
+
const unresolvedReport = sessionReportMarkdown(unresolvedOrch.store.read(unresolvedR2.session.session_id), unresolvedOrch.store.readEvents(unresolvedR2.session.session_id));
|
|
1297
|
+
assert.ok(unresolvedReport.includes("## Unresolved Evidence Disposition"), "v4.3.0 / P1: session_report must include unresolved-evidence disposition table");
|
|
1298
|
+
assert.ok(unresolvedReport.includes("not_resurfaced"), "v4.3.0 / P1: session_report must name not_resurfaced unresolved items");
|
|
1299
|
+
console.log("[smoke] unresolved_evidence_finalization_guard_test: PASS");
|
|
1300
|
+
}
|
|
1301
|
+
// v4.3.0 / P3: peer reliability telemetry is read-only. The report is purely
// observational: it must not change peer selection or mutate sessions.
//
// The fixture below hand-writes one round into a fresh session's meta file
// (one NEEDS_EVIDENCE peer, one READY peer carrying a parser warning, one
// rejected peer), records a single not_resurfaced evidence ask plus one
// fabrication audit event, and then checks the per-peer counters that
// peerReliabilityReport() derives from that persisted state.
{
    const sessionStoreModule = await import("../src/core/session-store.js");
    const reliabilityStore = new sessionStoreModule.SessionStore({
        ...config,
        data_dir: smokeTmpDir("peer-reliability"),
    });
    const { session_id: reliabilitySessionId } = await reliabilityStore.init("peer reliability report fixture", "operator", []);
    // One shared timestamp keeps every fixture record consistent.
    const fixtureTimestamp = new Date().toISOString();

    // Peer that asked for more evidence; expected to count as needs_evidence.
    const claudeNeedsEvidence = {
        peer: "claude",
        provider: "anthropic",
        model: "claude-opus-4-8",
        status: "NEEDS_EVIDENCE",
        structured: {
            status: "NEEDS_EVIDENCE",
            summary: "needs log",
            confidence: "verified",
            evidence_sources: ["src/core/session-store.ts:1"],
            caller_requests: ["attach raw npm test output"],
            follow_ups: [],
        },
        text: "{}",
        raw: { fixture: true },
        decision_quality: "clean",
        parser_warnings: [],
        attempts: 1,
        latency_ms: 50,
        usage: { input_tokens: 10, output_tokens: 5, total_tokens: 15 },
        cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 1 },
    };

    // Peer that voted READY but carries exactly one parser warning.
    const grokReadyWithWarning = {
        peer: "grok",
        provider: "xai",
        model: "grok-4.3",
        status: "READY",
        structured: {
            status: "READY",
            summary: "ready",
            confidence: "verified",
            evidence_sources: ["server_info: version 4.2.5"],
            caller_requests: [],
            follow_ups: [],
        },
        text: "{}",
        raw: { fixture: true },
        decision_quality: "format_warning",
        parser_warnings: ["verified_without_concrete_evidence_sources"],
        attempts: 1,
        latency_ms: 100,
        usage: { input_tokens: 20, output_tokens: 10, total_tokens: 30 },
        cost: { currency: "USD", estimated: false, source: "configured-rate", total_cost: 2 },
    };

    // Peer whose call was rejected with a non-retryable provider error.
    const perplexityRejected = {
        peer: "perplexity",
        provider: "perplexity",
        model: "sonar-reasoning-pro",
        failure_class: "provider_error",
        message: "fixture provider error",
        retryable: false,
        attempts: 1,
        latency_ms: 0,
    };

    const fixtureConvergence = {
        converged: false,
        reason: "fixture",
        ready_peers: ["grok"],
        not_ready_peers: [],
        needs_evidence_peers: ["claude"],
        rejected_peers: ["perplexity"],
        skipped_peers: [],
        decision_quality: {
            codex: "clean",
            claude: "clean",
            gemini: "clean",
            deepseek: "clean",
            grok: "format_warning",
            perplexity: "failed",
        },
        blocking_details: ["claude:NEEDS_EVIDENCE", "perplexity:provider_error"],
    };

    const fixtureRound = {
        round: 1,
        started_at: fixtureTimestamp,
        completed_at: fixtureTimestamp,
        caller_status: "READY",
        prompt_file: "agent-runs/round-1-prompt.md",
        peers: [claudeNeedsEvidence, grokReadyWithWarning],
        rejected: [perplexityRejected],
        convergence: fixtureConvergence,
    };

    // Evidence ask from claude that was never resurfaced by the peer.
    const unresurfacedAsk = {
        id: "rel-1",
        peer: "claude",
        first_round: 1,
        last_round: 1,
        round_count: 1,
        ask: "attach raw npm test output",
        first_seen_at: fixtureTimestamp,
        last_seen_at: fixtureTimestamp,
        status: "not_resurfaced",
        addressed_at_round: 2,
        address_method: "resurfacing",
    };

    // Overwrite the persisted meta directly so the report reads the fixture
    // from disk instead of from state produced by a real review round.
    const reliabilityMeta = reliabilityStore.read(reliabilitySessionId);
    reliabilityMeta.rounds = [fixtureRound];
    reliabilityMeta.evidence_checklist = [unresurfacedAsk];
    fs.writeFileSync(reliabilityStore.metaPath(reliabilitySessionId), JSON.stringify(reliabilityMeta));

    await reliabilityStore.appendEvent({
        ts: fixtureTimestamp,
        type: "session.lead_meta_audit_fabrication_detected",
        session_id: reliabilitySessionId,
        message: "fixture fabrication event",
        data: { peer: "grok" },
    });

    const reliability = reliabilityStore.peerReliabilityReport();
    assert.equal(reliability.scope, "all");
    const { claude: claudeStats, grok: grokStats, perplexity: perplexityStats } = reliability.by_peer;
    assert.equal(claudeStats?.needs_evidence, 1);
    assert.equal(claudeStats?.not_resurfaced_asks, 1);
    assert.equal(grokStats?.ready, 1);
    assert.equal(grokStats?.parser_warnings_total, 1);
    assert.equal(grokStats?.fabrication_events, 1);
    assert.equal(perplexityStats?.provider_errors, 1);
    console.log("[smoke] peer_reliability_report_test: PASS");
}
|
|
1430
|
+
// v4.3.0 / P2: offline declarative eval harness. Pins the existence of a
// fixture runner that makes no provider calls, so regressions found in real
// sessions can be replayed without growing the ad hoc smoke body indefinitely.
//
// Both files are read relative to the current working directory.
{
    const manifest = JSON.parse(fs.readFileSync("package.json", "utf8"));
    const evalScript = manifest.scripts?.["eval:fixtures"];
    assert.equal(evalScript, "tsx scripts/eval-fixtures.ts", "v4.3.0 / P2: package.json must expose the offline fixture eval runner");

    const harnessSource = fs.readFileSync("scripts/eval-fixtures.ts", "utf8");

    // Each declarative case table must be referenced by name in the harness.
    const requiredCaseTables = [/truthfulnessCases/, /parserCases/, /reportCases/];
    const hasAllCaseTables = requiredCaseTables.every((pattern) => pattern.test(harnessSource));
    assert.ok(hasAllCaseTables, "v4.3.0 / P2: eval-fixtures must use declarative truthfulness/parser/report case tables");

    // Any of these identifiers would mean the harness reaches a provider-review entry point.
    const providerEntryPoints = /askPeers\(|runUntilUnanimous\(|session_start_round/;
    const touchesProviders = providerEntryPoints.test(harnessSource);
    assert.ok(!touchesProviders, "v4.3.0 / P2: eval-fixtures must stay offline and avoid provider-review entry points");

    console.log("[smoke] offline_fixture_eval_contract_test: PASS");
}
|
|
1251
1443
|
// v2.22.0 (B.P3): session.budget_warning event emit + idempotency. The
|
|
1252
1444
|
// orchestrator emits a one-shot warning when cumulative cost crosses
|
|
1253
1445
|
// 75% of cost_ceiling_usd; the budget_warning_emitted flag persists
|