@lcv-ideas-software/cross-review 4.2.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +12 -9
- package/dist/scripts/provider-refresh-smoke.d.ts +1 -0
- package/dist/scripts/provider-refresh-smoke.js +49 -0
- package/dist/scripts/provider-refresh-smoke.js.map +1 -0
- package/dist/scripts/smoke.js +123 -14
- package/dist/scripts/smoke.js.map +1 -1
- package/dist/src/core/config.d.ts +2 -2
- package/dist/src/core/config.js +13 -12
- package/dist/src/core/config.js.map +1 -1
- package/dist/src/core/orchestrator.d.ts +24 -0
- package/dist/src/core/orchestrator.js +200 -1
- package/dist/src/core/orchestrator.js.map +1 -1
- package/dist/src/core/status.js +13 -0
- package/dist/src/core/status.js.map +1 -1
- package/dist/src/core/types.d.ts +2 -1
- package/dist/src/core/types.js +3 -3
- package/dist/src/core/types.js.map +1 -1
- package/dist/src/peers/errors.js +3 -3
- package/dist/src/peers/errors.js.map +1 -1
- package/dist/src/peers/grok.js +5 -5
- package/dist/src/peers/grok.js.map +1 -1
- package/dist/src/peers/model-selection.js +5 -7
- package/dist/src/peers/model-selection.js.map +1 -1
- package/dist/src/peers/perplexity.js +3 -3
- package/dist/src/peers/perplexity.js.map +1 -1
- package/docs/api-keys.md +2 -2
- package/docs/apresentacao-cross-review.md +769 -0
- package/docs/apresentacao.md +571 -0
- package/docs/architecture.md +2 -0
- package/docs/caching.md +9 -8
- package/docs/costs.md +11 -0
- package/docs/model-selection.md +19 -14
- package/package.json +6 -3
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,33 @@ standard `v00.00.00`; npm package versions remain SemVer.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v04.02.02] — 2026-06-02
|
|
11
|
+
|
|
12
|
+
**Patch — provider-doc refresh and Perplexity probe repair.** This release
|
|
13
|
+
updates the maintained provider pins and rate-card guidance after a
|
|
14
|
+
cross-review audit of the current v4.2.1 session corpus.
|
|
15
|
+
|
|
16
|
+
### Fixed
|
|
17
|
+
|
|
18
|
+
- Raised the Perplexity `sonar-reasoning-pro` health probe to `max_tokens=16`,
|
|
19
|
+
matching the provider's current minimum and preventing false unavailable
|
|
20
|
+
capability snapshots while still keeping `disable_search=true`.
|
|
21
|
+
- Added `provider-refresh-smoke` coverage for the Perplexity probe minimum and
|
|
22
|
+
for the current Claude/Grok canonical model pins.
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
|
|
26
|
+
- Promoted the Anthropic canonical/default model from `claude-opus-4-7` to
|
|
27
|
+
`claude-opus-4-8`.
|
|
28
|
+
- Promoted the Grok canonical/default model from the alias `grok-4-latest` to
|
|
29
|
+
the concrete `grok-4.3` pin while keeping alias behavior documented.
|
|
30
|
+
- Refreshed provider rate-card documentation for GPT-5.5, Claude Opus 4.8,
|
|
31
|
+
Gemini 2.5 Pro, DeepSeek V4 Pro, Grok 4.3, and Perplexity Sonar Reasoning
|
|
32
|
+
Pro.
|
|
33
|
+
- Updated the active local runtime config at
|
|
34
|
+
`C:\Users\leona\.cross-review\data\config.json` with current cached-input,
|
|
35
|
+
extended-tier, and DeepSeek base rates.
|
|
36
|
+
|
|
10
37
|
## [v04.02.01] — 2026-05-21
|
|
11
38
|
|
|
12
39
|
**Patch — publish the workspace hard-gate cleanup as a package release.** The
|
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# cross-review
|
|
6
6
|
|
|
7
7
|
> MCP server orchestrating API-first cross-review between Claude, ChatGPT Codex,
|
|
8
|
-
> Gemini, DeepSeek, and
|
|
8
|
+
> Gemini, DeepSeek, Grok, and Perplexity with unanimous convergence gates.
|
|
9
9
|
|
|
10
10
|
[](#status)
|
|
11
11
|
[](https://github.com/LCV-Ideas-Software/cross-review/releases)
|
|
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
|
|
|
24
24
|
npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Status.** Stable. Current release: **v04.02.01** (npm package `4.2.1`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
27
|
+
**Status.** Stable. Current release: **v04.02.02** (npm package `4.2.2`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
28
28
|
|
|
29
29
|
> **Project renamed 2026-05-15.** This project was previously published as
|
|
30
30
|
> [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
|
|
@@ -38,6 +38,7 @@ The version history at a glance:
|
|
|
38
38
|
|
|
39
39
|
| Release | Scope |
|
|
40
40
|
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
41
|
+
| **`v04.02.02`** | Patch — provider-doc refresh, Perplexity probe repair, current model pins, and rate-card guidance. |
|
|
41
42
|
| **`v04.02.01`** | Patch — publish the workspace hard-gate cleanup as a package release. |
|
|
42
43
|
| **`v04.02.00`** | Minor — bounded MCP session listing and cancellation semantics cleanup. |
|
|
43
44
|
| **`v04.01.01`** | Patch — release the hard-gate cleanup as a published package. |
|
|
@@ -119,8 +120,8 @@ The version history at a glance:
|
|
|
119
120
|
|
|
120
121
|
`cross-review` is the stable API-first implementation of the cross-review
|
|
121
122
|
pattern. It orchestrates provider API clients (OpenAI/Codex, Anthropic/Claude,
|
|
122
|
-
Google Gemini, DeepSeek,
|
|
123
|
-
surface.
|
|
123
|
+
Google Gemini, DeepSeek, xAI/Grok, and Perplexity Sonar) and provides an
|
|
124
|
+
MCP-compatible server surface.
|
|
124
125
|
|
|
125
126
|
Runtime calls are real provider calls by default. Stubs exist only for smoke
|
|
126
127
|
tests and CI when `CROSS_REVIEW_STUB=1`.
|
|
@@ -130,6 +131,7 @@ tests and CI when `CROSS_REVIEW_STUB=1`.
|
|
|
130
131
|
- Google Gen AI client library for Gemini.
|
|
131
132
|
- OpenAI-compatible DeepSeek API through the OpenAI client library.
|
|
132
133
|
- OpenAI-compatible xAI Grok API through the OpenAI client library.
|
|
134
|
+
- OpenAI-compatible Perplexity Sonar API through the OpenAI client library.
|
|
133
135
|
|
|
134
136
|
## Quick Start
|
|
135
137
|
|
|
@@ -171,11 +173,12 @@ variables. Example overrides (PowerShell):
|
|
|
171
173
|
[Environment]::SetEnvironmentVariable("CROSS_REVIEW_GROK_REASONING_EFFORT", "xhigh", "User")
|
|
172
174
|
```
|
|
173
175
|
|
|
174
|
-
For Grok, `GROK_API_KEY` is canonical.
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
`reasoning
|
|
178
|
-
|
|
176
|
+
For Grok, `GROK_API_KEY` is canonical. The default pin is `grok-4.3`, which
|
|
177
|
+
accepts explicit `reasoning.effort` through `high`; the adapter clamps the
|
|
178
|
+
shared effort scale before sending it. `grok-4-latest`, `grok-4.20`, and
|
|
179
|
+
`grok-4.20-reasoning` use xAI automatic reasoning in this runtime.
|
|
180
|
+
`grok-4.20-multi-agent` remains available as an explicit override for the
|
|
181
|
+
multi-agent variant.
|
|
179
182
|
|
|
180
183
|
Financial and budget controls are required for paid provider calls. Configure
|
|
181
184
|
these environment variables before running real sessions (example):
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { loadConfig } from "../src/core/config.js";
|
|
6
|
+
import { selectFromCandidates } from "../src/peers/model-selection.js";
|
|
7
|
+
import { PerplexityAdapter } from "../src/peers/perplexity.js";
|
|
8
|
+
process.env.CROSS_REVIEW_STUB = "1";
|
|
9
|
+
process.env.CROSS_REVIEW_STUB_CONFIRMED = "1";
|
|
10
|
+
process.env.PERPLEXITY_API_KEY = "test-perplexity-key";
|
|
11
|
+
process.env.CROSS_REVIEW_DATA_DIR = fs.mkdtempSync(path.join(os.tmpdir(), "cross-review-provider-refresh-"));
|
|
12
|
+
const config = loadConfig();
|
|
13
|
+
{
|
|
14
|
+
const adapter = new PerplexityAdapter(config);
|
|
15
|
+
let capturedPayload;
|
|
16
|
+
adapter.client = async () => ({
|
|
17
|
+
chat: {
|
|
18
|
+
completions: {
|
|
19
|
+
create: async (payload) => {
|
|
20
|
+
capturedPayload = payload;
|
|
21
|
+
},
|
|
22
|
+
},
|
|
23
|
+
},
|
|
24
|
+
});
|
|
25
|
+
const probe = await adapter.probe();
|
|
26
|
+
assert.equal(probe.available, true);
|
|
27
|
+
assert.equal(capturedPayload?.disable_search, true);
|
|
28
|
+
assert.ok(typeof capturedPayload?.max_tokens === "number" && capturedPayload.max_tokens >= 16, "Perplexity probe must request at least 16 max_tokens for sonar-reasoning-pro.");
|
|
29
|
+
}
|
|
30
|
+
{
|
|
31
|
+
const claude = selectFromCandidates("claude", [{ id: "claude-opus-4-8", source: "api" }], "claude-opus-4-8");
|
|
32
|
+
assert.equal(claude.selected, "claude-opus-4-8");
|
|
33
|
+
assert.equal(claude.confidence, "verified");
|
|
34
|
+
}
|
|
35
|
+
{
|
|
36
|
+
const grok = selectFromCandidates("grok", [{ id: "grok-4.3", source: "api" }], "grok-4.3");
|
|
37
|
+
assert.equal(grok.selected, "grok-4.3");
|
|
38
|
+
assert.equal(grok.confidence, "verified");
|
|
39
|
+
}
|
|
40
|
+
{
|
|
41
|
+
const configSource = fs.readFileSync("src/core/config.ts", "utf8");
|
|
42
|
+
const modelSelectionSource = fs.readFileSync("src/peers/model-selection.ts", "utf8");
|
|
43
|
+
assert.ok(configSource.includes('claude: envValue("CROSS_REVIEW_ANTHROPIC_MODEL") || "claude-opus-4-8"'));
|
|
44
|
+
assert.ok(configSource.includes('grok: envValue("CROSS_REVIEW_GROK_MODEL") || "grok-4.3"'));
|
|
45
|
+
assert.ok(modelSelectionSource.includes('claude: ["claude-opus-4-8"]'));
|
|
46
|
+
assert.ok(modelSelectionSource.includes('grok: ["grok-4.3"]'));
|
|
47
|
+
}
|
|
48
|
+
console.log("[provider-refresh-smoke] PASS");
|
|
49
|
+
//# sourceMappingURL=provider-refresh-smoke.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"provider-refresh-smoke.js","sourceRoot":"","sources":["../../scripts/provider-refresh-smoke.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,oBAAoB,EAAE,MAAM,iCAAiC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAE/D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,GAAG,CAAC;AACpC,OAAO,CAAC,GAAG,CAAC,2BAA2B,GAAG,GAAG,CAAC;AAC9C,OAAO,CAAC,GAAG,CAAC,kBAAkB,GAAG,qBAAqB,CAAC;AACvD,OAAO,CAAC,GAAG,CAAC,qBAAqB,GAAG,EAAE,CAAC,WAAW,CAChD,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,gCAAgC,CAAC,CACzD,CAAC;AAEF,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;AAE5B,CAAC;IACC,MAAM,OAAO,GAAG,IAAI,iBAAiB,CAAC,MAAM,CAAC,CAAC;IAC9C,IAAI,eAA8E,CAAC;IAEjF,OASD,CAAC,MAAM,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;QACtB,IAAI,EAAE;YACJ,WAAW,EAAE;gBACX,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;oBACxB,eAAe,GAAG,OAAO,CAAC;gBAC5B,CAAC;aACF;SACF;KACF,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACpC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;IACpC,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,cAAc,EAAE,IAAI,CAAC,CAAC;IACpD,MAAM,CAAC,EAAE,CACP,OAAO,eAAe,EAAE,UAAU,KAAK,QAAQ,IAAI,eAAe,CAAC,UAAU,IAAI,EAAE,EACnF,+EAA+E,CAChF,CAAC;AACJ,CAAC;AAED,CAAC;IACC,MAAM,MAAM,GAAG,oBAAoB,CACjC,QAAQ,EACR,CAAC,EAAE,EAAE,EAAE,iBAAiB,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAC1C,iBAAiB,CAClB,CAAC;IACF,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,iBAAiB,CAAC,CAAC;IACjD,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;AAC9C,CAAC;AAED,CAAC;IACC,MAAM,IAAI,GAAG,oBAAoB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,EAAE,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAC3F,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IACxC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;AAC5C,CAAC;AAED,CAAC;IACC,MAAM,YAAY,GAAG,EAAE,CAAC,YAAY,CAAC,oBAAoB,EAAE,MAAM,CAAC,CAAC;IACnE,MAAM,oBAAoB,GAAG,EAAE,CAAC,YAAY,CAAC,8BAA8B,EAAE,MAAM,CAAC,CAAC;IACrF,MAAM,CAAC,EAAE,CACP,YAAY,CAAC,QAAQ,CAAC,uEAAuE,C
AAC,CAC/F,CAAC;IACF,MAAM,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,yDAAyD,CAAC,CAAC,CAAC;IAC5F,MAAM,CAAC,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;IACxE,MAAM,CAAC,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC;AACjE,CAAC;AAED,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC"}
|
package/dist/scripts/smoke.js
CHANGED
|
@@ -220,16 +220,16 @@ for (const deprecatedOrWeakModel of [
|
|
|
220
220
|
// "must remain" list is therefore exactly the 6 lone canonical pins.
|
|
221
221
|
for (const canonicalPin of [
|
|
222
222
|
"gpt-5.5",
|
|
223
|
-
"claude-opus-4-7",
|
|
223
|
+
"claude-opus-4-8",
|
|
224
224
|
"gemini-2.5-pro",
|
|
225
225
|
"deepseek-v4-pro",
|
|
226
|
-
"grok-4-latest",
|
|
226
|
+
"grok-4.3",
|
|
227
227
|
"sonar-reasoning-pro",
|
|
228
228
|
]) {
|
|
229
229
|
assert.ok(modelSelectionSource.includes(`"${canonicalPin}"`), `${canonicalPin} must remain the lone canonical PRIORITY pin`);
|
|
230
230
|
}
|
|
231
|
-
const noWeakDowngrade = selectFromCandidates("claude", [{ id: "claude-haiku-4-5-20251001", source: "api" }], "claude-opus-4-7");
|
|
232
|
-
assert.equal(noWeakDowngrade.selected, "claude-opus-4-7");
|
|
231
|
+
const noWeakDowngrade = selectFromCandidates("claude", [{ id: "claude-haiku-4-5-20251001", source: "api" }], "claude-opus-4-8");
|
|
232
|
+
assert.equal(noWeakDowngrade.selected, "claude-opus-4-8");
|
|
233
233
|
assert.equal(noWeakDowngrade.confidence, "unknown");
|
|
234
234
|
assert.match(noWeakDowngrade.reason, /silently downgrading/);
|
|
235
235
|
const pemMarker = (side, label) => ["-----", side, " ", label, "-----"].join("");
|
|
@@ -1347,6 +1347,35 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1347
1347
|
assert.ok(/evidence_sources/.test(instruction), "statusInstruction must direct detail to evidence_sources (v2.5.0)");
|
|
1348
1348
|
console.log("[smoke] session_contract_directives_test: PASS");
|
|
1349
1349
|
}
|
|
1350
|
+
// v4.2.2 — verified_requires_evidence_sources_test. Peer JSON may still
|
|
1351
|
+
// declare READY, but a `confidence:"verified"` verdict without concrete
|
|
1352
|
+
// evidence_sources must not be classified as a clean machine decision.
|
|
1353
|
+
{
|
|
1354
|
+
const statusModule = await import("../src/core/status.js");
|
|
1355
|
+
const parseStatusForTruth = statusModule.parsePeerStatus;
|
|
1356
|
+
const statusInstruction = statusModule.statusInstruction;
|
|
1357
|
+
const ungrounded = parseStatusForTruth(JSON.stringify({
|
|
1358
|
+
status: "READY",
|
|
1359
|
+
summary: "Looks correct.",
|
|
1360
|
+
confidence: "verified",
|
|
1361
|
+
evidence_sources: [],
|
|
1362
|
+
caller_requests: [],
|
|
1363
|
+
follow_ups: [],
|
|
1364
|
+
}));
|
|
1365
|
+
assert.ok(ungrounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.2 / truthfulness_guardrails: confidence=verified with empty evidence_sources must emit verified_without_evidence_sources");
|
|
1366
|
+
assert.equal(ungrounded.structured?.confidence, "verified", "v4.2.2 / truthfulness_guardrails: parser warning must not silently rewrite peer confidence");
|
|
1367
|
+
const grounded = parseStatusForTruth(JSON.stringify({
|
|
1368
|
+
status: "READY",
|
|
1369
|
+
summary: "Runtime claim matches the raw source.",
|
|
1370
|
+
confidence: "verified",
|
|
1371
|
+
evidence_sources: ['server_info: {"version":"4.2.1","release_date":"2026-05-21"}'],
|
|
1372
|
+
caller_requests: [],
|
|
1373
|
+
follow_ups: [],
|
|
1374
|
+
}));
|
|
1375
|
+
assert.ok(!grounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.2 / truthfulness_guardrails: concrete evidence_sources must satisfy verified confidence");
|
|
1376
|
+
assert.ok(/confidence.*verified[\s\S]+evidence_sources/i.test(statusInstruction()), "v4.2.2 / truthfulness_guardrails: statusInstruction must tie verified confidence to concrete evidence_sources");
|
|
1377
|
+
console.log("[smoke] verified_requires_evidence_sources_test: PASS");
|
|
1378
|
+
}
|
|
1350
1379
|
// v2.5.0: CROSS_REVIEW_DEFAULT_MAX_ROUNDS env override is honored.
|
|
1351
1380
|
{
|
|
1352
1381
|
const { loadConfig: reload } = await import("../src/core/config.js");
|
|
@@ -3644,14 +3673,12 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
3644
3673
|
// v3.0.0: PEERS now has 6 entries (perplexity added).
|
|
3645
3674
|
assert.equal(PEERS.length, 6, "PEERS must have 6 entries (codex/claude/gemini/deepseek/grok/perplexity)");
|
|
3646
3675
|
const cfg = loadConfig();
|
|
3647
|
-
//
|
|
3648
|
-
//
|
|
3649
|
-
//
|
|
3650
|
-
//
|
|
3651
|
-
//
|
|
3652
|
-
|
|
3653
|
-
// clampEffortForModel tests below continue to pin that capability.
|
|
3654
|
-
assert.equal(cfg.models.grok, "grok-4-latest", "default grok model must be grok-4-latest (v3.7.2, operator directive)");
|
|
3676
|
+
// v4.2.2 provider-doc refresh: default grok model is the concrete
|
|
3677
|
+
// `grok-4.3` pin. `grok-4-latest` remains a valid xAI alias and
|
|
3678
|
+
// `grok-4.20-multi-agent` remains a valid env-override for explicit
|
|
3679
|
+
// multi-agent reasoning behavior; the adapter tests below continue to
|
|
3680
|
+
// pin those capabilities.
|
|
3681
|
+
assert.equal(cfg.models.grok, "grok-4.3", "default grok model must be grok-4.3 (v4.2.2 provider-doc refresh)");
|
|
3655
3682
|
assert.ok("grok" in cfg.fallback_models, "fallback_models must have grok entry");
|
|
3656
3683
|
assert.equal(cfg.peer_enabled.grok, true, "grok must be enabled by default");
|
|
3657
3684
|
assert.ok(cfg.cost_rates.grok, "grok cost rates must be configured (env-set in smoke setup)");
|
|
@@ -5210,6 +5237,19 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5210
5237
|
});
|
|
5211
5238
|
assert.ok(netNewAssertionWithDraft.fabricated === true &&
|
|
5212
5239
|
netNewAssertionWithDraft.suspicious_assertion_count >= 2, `v3.7.4 / fabrication_lock: operational assertions NET-NEW vs {provenance ∪ priorDraft} — invented by the relator even though a prior draft exists — MUST still trip fabricated=true (got count=${netNewAssertionWithDraft.suspicious_assertion_count}, fabricated=${netNewAssertionWithDraft.fabricated})`);
|
|
5240
|
+
const inventedWorkflowDispatch = detectFabricatedEvidence("Refazendo agora. Workflow launched in background. Task ID: wllbll9am. Run ID: wf_e7c69578-e23.", {
|
|
5241
|
+
provenanceCorpus: "",
|
|
5242
|
+
priorDraftCorpus: "The user challenged the report and did not authorize a redo.",
|
|
5243
|
+
narrativeCorpus: "Analyze why Claude lied about the prior v4.2.0 audit.",
|
|
5244
|
+
});
|
|
5245
|
+
assert.ok(inventedWorkflowDispatch.fabricated === true &&
|
|
5246
|
+
inventedWorkflowDispatch.suspicious_assertion_count >= 2, `v4.2.2 / truthfulness_guardrails: invented workflow dispatch claims MUST trip fabricated=true (got count=${inventedWorkflowDispatch.suspicious_assertion_count}, fabricated=${inventedWorkflowDispatch.fabricated})`);
|
|
5247
|
+
const genericConfirmation = detectFabricatedEvidence("The reviewer confirmed the model-selection rationale is clear.", {
|
|
5248
|
+
provenanceCorpus: "",
|
|
5249
|
+
priorDraftCorpus: "",
|
|
5250
|
+
narrativeCorpus: "",
|
|
5251
|
+
});
|
|
5252
|
+
assert.equal(genericConfirmation.fabricated, false, "v4.2.2 / truthfulness_guardrails: generic 'confirmed' prose without a dispatch/authorization claim must not trip fabrication detection");
|
|
5213
5253
|
// Source-level: threshold constants pinned at the documented values.
|
|
5214
5254
|
assert.ok(/FABRICATED_NET_NEW_HEX_THRESHOLD\s*=\s*3/.test(orchSrc), "v2.24.0 / fabrication_lock: net-new hex threshold pinned at 3");
|
|
5215
5255
|
assert.ok(/FABRICATED_SUSPICIOUS_ASSERTION_THRESHOLD\s*=\s*2/.test(orchSrc), "v2.24.0 / fabrication_lock: suspicious assertion threshold pinned at 2");
|
|
@@ -5675,6 +5715,75 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5675
5715
|
assert.ok(/boolEnv\("CROSS_REVIEW_EVIDENCE_PREFLIGHT", true\)/.test(configSrcPf), "v3.5.0 / evidence_preflight: CROSS_REVIEW_EVIDENCE_PREFLIGHT env var must default ON");
|
|
5676
5716
|
console.log("[smoke] evidence_preflight_test: PASS");
|
|
5677
5717
|
}
|
|
5718
|
+
// v4.2.2 — truthfulness_preflight_test. Pins the guard added after the
|
|
5719
|
+
// Claude Code Opus 4.8 incident where a report asserted
|
|
5720
|
+
// "v4.2.0 current production" despite live server_info showing
|
|
5721
|
+
// v4.2.1. The old evidence preflight only checked completed-work
|
|
5722
|
+
// claims (tests/diff/build) and did not reject current-runtime
|
|
5723
|
+
// contradictions or unsupported historical timing narratives.
|
|
5724
|
+
{
|
|
5725
|
+
const { truthfulnessPreflight } = await import("../src/core/orchestrator.js");
|
|
5726
|
+
const runtimeFacts = {
|
|
5727
|
+
runtime_version: "4.2.1",
|
|
5728
|
+
release_date: "2026-05-21",
|
|
5729
|
+
model_pins: {
|
|
5730
|
+
claude: "claude-opus-4-8",
|
|
5731
|
+
grok: "grok-4.3",
|
|
5732
|
+
},
|
|
5733
|
+
};
|
|
5734
|
+
const contradictedByRuntime = truthfulnessPreflight({
|
|
5735
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5736
|
+
initialDraft: 'Live server_info: {"version":"4.2.1","release_date":"2026-05-21"}\nAudit report for cross-review v4.2.0 current production, released 2026-05-17.',
|
|
5737
|
+
runtimeFacts,
|
|
5738
|
+
attachmentsPresent: false,
|
|
5739
|
+
});
|
|
5740
|
+
assert.equal(contradictedByRuntime.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime version claim contradicting runtime facts must trip even when server_info text is present");
|
|
5741
|
+
assert.ok(contradictedByRuntime.contradictions.some((item) => item.includes("4.2.0")), "v4.2.2 / truthfulness_preflight: mismatch diagnostics must include the contradicted version token");
|
|
5742
|
+
const backedByRuntime = truthfulnessPreflight({
|
|
5743
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5744
|
+
initialDraft: 'Live server_info: {"version":"4.2.1","release_date":"2026-05-21"}\nAudit report for cross-review v4.2.1 current production, released 2026-05-21.',
|
|
5745
|
+
runtimeFacts,
|
|
5746
|
+
attachmentsPresent: false,
|
|
5747
|
+
});
|
|
5748
|
+
assert.equal(backedByRuntime.pass, true, "v4.2.2 / truthfulness_preflight: current-runtime claim matching runtime facts must pass");
|
|
5749
|
+
const unsupportedCurrentState = truthfulnessPreflight({
|
|
5750
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5751
|
+
initialDraft: "Audit report for cross-review v4.2.1 current production.",
|
|
5752
|
+
runtimeFacts: {},
|
|
5753
|
+
attachmentsPresent: false,
|
|
5754
|
+
});
|
|
5755
|
+
assert.equal(unsupportedCurrentState.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime claim without runtime facts or source evidence must trip");
|
|
5756
|
+
const historicalChangelog = truthfulnessPreflight({
|
|
5757
|
+
task: "Review this changelog text.",
|
|
5758
|
+
initialDraft: "v4.2.0 was released on 2026-05-17. v4.2.1 was released on 2026-05-21.",
|
|
5759
|
+
runtimeFacts,
|
|
5760
|
+
attachmentsPresent: false,
|
|
5761
|
+
});
|
|
5762
|
+
assert.equal(historicalChangelog.pass, true, "v4.2.2 / truthfulness_preflight: historical version text without current/timing claims must not trip");
|
|
5763
|
+
const fabricatedTiming = truthfulnessPreflight({
|
|
5764
|
+
task: "Explain why the report said v4.2.0.",
|
|
5765
|
+
initialDraft: "When the workflow began, cross-review was running v4.2.0. It was bumped to v4.2.1 between R1 and R3.",
|
|
5766
|
+
runtimeFacts,
|
|
5767
|
+
attachmentsPresent: false,
|
|
5768
|
+
});
|
|
5769
|
+
assert.equal(fabricatedTiming.pass, false, "v4.2.2 / truthfulness_preflight: historical runtime timing narrative without snapshot evidence must trip");
|
|
5770
|
+
const withStructuredEvidence = truthfulnessPreflight({
|
|
5771
|
+
task: "Explain why the report said v4.2.0.",
|
|
5772
|
+
initialDraft: "When the workflow began, cross-review was running v4.2.0. It was bumped to v4.2.1 between R1 and R3.",
|
|
5773
|
+
runtimeFacts,
|
|
5774
|
+
structuredEvidence: "Historical runtime snapshot from events.ndjson: workflow_start server_info version=4.2.0; later reload server_info version=4.2.1.",
|
|
5775
|
+
attachmentsPresent: false,
|
|
5776
|
+
});
|
|
5777
|
+
assert.equal(withStructuredEvidence.pass, true, "v4.2.2 / truthfulness_preflight: structured evidence can satisfy historical timing claims");
|
|
5778
|
+
const orchSrcTruth = fs.readFileSync(new URL("../src/core/orchestrator.ts", import.meta.url), "utf8");
|
|
5779
|
+
const configSrcTruth = fs.readFileSync(new URL("../src/core/config.ts", import.meta.url), "utf8");
|
|
5780
|
+
assert.ok(/export function truthfulnessPreflight\b/.test(orchSrcTruth), "v4.2.2 / truthfulness_preflight: truthfulnessPreflight must be exported");
|
|
5781
|
+
assert.ok(/truthfulness_preflight_enabled/.test(orchSrcTruth) &&
|
|
5782
|
+
/askPeers[\s\S]+truthfulnessPreflight/.test(orchSrcTruth) &&
|
|
5783
|
+
/runUntilUnanimous[\s\S]+truthfulnessPreflight/.test(orchSrcTruth), "v4.2.2 / truthfulness_preflight: both askPeers and runUntilUnanimous must gate on config.truthfulness_preflight_enabled");
|
|
5784
|
+
assert.ok(/boolEnv\("CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT", true\)/.test(configSrcTruth), "v4.2.2 / truthfulness_preflight: CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT env var must default ON");
|
|
5785
|
+
console.log("[smoke] truthfulness_preflight_test: PASS");
|
|
5786
|
+
}
|
|
5678
5787
|
// v3.5.0 (CRV2-1 + CRV2-6) — budget + max_rounds traceability.
|
|
5679
5788
|
//
|
|
5680
5789
|
// setSessionTraceability persists requested-vs-effective max_rounds and
|
|
@@ -6025,10 +6134,10 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
6025
6134
|
}
|
|
6026
6135
|
for (const [peer, pin] of [
|
|
6027
6136
|
["codex", "gpt-5.5"],
|
|
6028
|
-
["claude", "claude-opus-4-7"],
|
|
6137
|
+
["claude", "claude-opus-4-8"],
|
|
6029
6138
|
["gemini", "gemini-2.5-pro"],
|
|
6030
6139
|
["deepseek", "deepseek-v4-pro"],
|
|
6031
|
-
["grok", "grok-4-latest"],
|
|
6140
|
+
["grok", "grok-4.3"],
|
|
6032
6141
|
["perplexity", "sonar-reasoning-pro"],
|
|
6033
6142
|
]) {
|
|
6034
6143
|
assert.ok(new RegExp(`${peer}: \\["${pin}"\\]`).test(a3ModelSrc), `v3.7.2 / AUDIT-3: ${peer} PRIORITY must be the lone canonical pin ["${pin}"] (no fallback)`);
|