@lcv-ideas-software/cross-review 4.2.1 → 4.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +42 -0
- package/README.md +13 -9
- package/dist/scripts/provider-refresh-smoke.d.ts +1 -0
- package/dist/scripts/provider-refresh-smoke.js +56 -0
- package/dist/scripts/provider-refresh-smoke.js.map +1 -0
- package/dist/scripts/smoke.js +127 -17
- package/dist/scripts/smoke.js.map +1 -1
- package/dist/src/core/config.d.ts +2 -2
- package/dist/src/core/config.js +14 -13
- package/dist/src/core/config.js.map +1 -1
- package/dist/src/core/orchestrator.d.ts +24 -0
- package/dist/src/core/orchestrator.js +200 -1
- package/dist/src/core/orchestrator.js.map +1 -1
- package/dist/src/core/status.js +13 -0
- package/dist/src/core/status.js.map +1 -1
- package/dist/src/core/types.d.ts +2 -1
- package/dist/src/core/types.js +3 -3
- package/dist/src/core/types.js.map +1 -1
- package/dist/src/peers/errors.js +3 -3
- package/dist/src/peers/errors.js.map +1 -1
- package/dist/src/peers/grok.js +5 -5
- package/dist/src/peers/grok.js.map +1 -1
- package/dist/src/peers/model-selection.js +6 -8
- package/dist/src/peers/model-selection.js.map +1 -1
- package/dist/src/peers/perplexity.js +3 -3
- package/dist/src/peers/perplexity.js.map +1 -1
- package/docs/api-keys.md +3 -3
- package/docs/apresentacao-cross-review.md +770 -0
- package/docs/apresentacao.md +572 -0
- package/docs/architecture.md +2 -0
- package/docs/caching.md +9 -8
- package/docs/costs.md +11 -0
- package/docs/model-selection.md +37 -29
- package/package.json +6 -3
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,48 @@ standard `v00.00.00`; npm package versions remain SemVer.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v04.02.03] — 2026-06-03
|
|
11
|
+
|
|
12
|
+
**Patch — Gemini replacement pin and rate-card refresh.** This release follows
|
|
13
|
+
Google's deprecation schedule for Gemini 2.5 Pro by making Gemini 3.1 Pro Preview
|
|
14
|
+
the active canonical Gemini pin.
|
|
15
|
+
|
|
16
|
+
### Changed
|
|
17
|
+
|
|
18
|
+
- Promoted the Google/Gemini canonical default from `gemini-2.5-pro` to
|
|
19
|
+
`gemini-3.1-pro-preview` after Google's deprecation schedule listed the
|
|
20
|
+
former for shutdown on 2026-10-16.
|
|
21
|
+
- Updated the active local Gemini rate card from Gemini 2.5 Pro pricing to
|
|
22
|
+
Gemini 3.1 Pro Preview pricing, including the >200K extended tier and
|
|
23
|
+
cached-input rates.
|
|
24
|
+
|
|
25
|
+
## [v04.02.02] — 2026-06-02
|
|
26
|
+
|
|
27
|
+
**Patch — provider-doc refresh and Perplexity probe repair.** This release
|
|
28
|
+
updates the maintained provider pins and rate-card guidance after a
|
|
29
|
+
cross-review audit of the current v4.2.1 session corpus.
|
|
30
|
+
|
|
31
|
+
### Fixed
|
|
32
|
+
|
|
33
|
+
- Raised the Perplexity `sonar-reasoning-pro` health probe to `max_tokens=16`,
|
|
34
|
+
matching the provider's current minimum and preventing false unavailable
|
|
35
|
+
capability snapshots while still keeping `disable_search=true`.
|
|
36
|
+
- Added `provider-refresh-smoke` coverage for the Perplexity probe minimum and
|
|
37
|
+
for the current Claude/Grok canonical model pins.
|
|
38
|
+
|
|
39
|
+
### Changed
|
|
40
|
+
|
|
41
|
+
- Promoted the Anthropic canonical/default model from `claude-opus-4-7` to
|
|
42
|
+
`claude-opus-4-8`.
|
|
43
|
+
- Promoted the Grok canonical/default model from the alias `grok-4-latest` to
|
|
44
|
+
the concrete `grok-4.3` pin while keeping alias behavior documented.
|
|
45
|
+
- Refreshed provider rate-card documentation for GPT-5.5, Claude Opus 4.8,
|
|
46
|
+
Gemini 2.5 Pro, DeepSeek V4 Pro, Grok 4.3, and Perplexity Sonar Reasoning
|
|
47
|
+
Pro.
|
|
48
|
+
- Updated the active local runtime config at
|
|
49
|
+
`C:\Users\leona\.cross-review\data\config.json` with current cached-input,
|
|
50
|
+
extended-tier, and DeepSeek base rates.
|
|
51
|
+
|
|
10
52
|
## [v04.02.01] — 2026-05-21
|
|
11
53
|
|
|
12
54
|
**Patch — publish the workspace hard-gate cleanup as a package release.** The
|
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# cross-review
|
|
6
6
|
|
|
7
7
|
> MCP server orchestrating API-first cross-review between Claude, ChatGPT Codex,
|
|
8
|
-
> Gemini, DeepSeek, and
|
|
8
|
+
> Gemini, DeepSeek, Grok, and Perplexity with unanimous convergence gates.
|
|
9
9
|
|
|
10
10
|
[](#status)
|
|
11
11
|
[](https://github.com/LCV-Ideas-Software/cross-review/releases)
|
|
@@ -24,7 +24,7 @@ npm install -g @lcv-ideas-software/cross-review
|
|
|
24
24
|
npm install -g @lcv-ideas-software/cross-review --registry=https://npm.pkg.github.com
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
**Status.** Stable. Current release: **v04.02.
|
|
27
|
+
**Status.** Stable. Current release: **v04.02.03** (npm package `4.2.3`). See [CHANGELOG.md](./CHANGELOG.md) for the full release history.
|
|
28
28
|
|
|
29
29
|
> **Project renamed 2026-05-15.** This project was previously published as
|
|
30
30
|
> [`@lcv-ideas-software/cross-review-v2`](https://www.npmjs.com/package/@lcv-ideas-software/cross-review-v2)
|
|
@@ -38,6 +38,8 @@ The version history at a glance:
|
|
|
38
38
|
|
|
39
39
|
| Release | Scope |
|
|
40
40
|
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
|
41
|
+
| **`v04.02.03`** | Patch — promote the Gemini canonical default to `gemini-3.1-pro-preview` and refresh the active local Gemini rate card. |
|
|
42
|
+
| **`v04.02.02`** | Patch — provider-doc refresh, Perplexity probe repair, current model pins, and rate-card guidance. |
|
|
41
43
|
| **`v04.02.01`** | Patch — publish the workspace hard-gate cleanup as a package release. |
|
|
42
44
|
| **`v04.02.00`** | Minor — bounded MCP session listing and cancellation semantics cleanup. |
|
|
43
45
|
| **`v04.01.01`** | Patch — release the hard-gate cleanup as a published package. |
|
|
@@ -119,8 +121,8 @@ The version history at a glance:
|
|
|
119
121
|
|
|
120
122
|
`cross-review` is the stable API-first implementation of the cross-review
|
|
121
123
|
pattern. It orchestrates provider API clients (OpenAI/Codex, Anthropic/Claude,
|
|
122
|
-
Google Gemini, DeepSeek,
|
|
123
|
-
surface.
|
|
124
|
+
Google Gemini, DeepSeek, xAI/Grok, and Perplexity Sonar) and provides an
|
|
125
|
+
MCP-compatible server surface.
|
|
124
126
|
|
|
125
127
|
Runtime calls are real provider calls by default. Stubs exist only for smoke
|
|
126
128
|
tests and CI when `CROSS_REVIEW_STUB=1`.
|
|
@@ -130,6 +132,7 @@ tests and CI when `CROSS_REVIEW_STUB=1`.
|
|
|
130
132
|
- Google Gen AI client library for Gemini.
|
|
131
133
|
- OpenAI-compatible DeepSeek API through the OpenAI client library.
|
|
132
134
|
- OpenAI-compatible xAI Grok API through the OpenAI client library.
|
|
135
|
+
- OpenAI-compatible Perplexity Sonar API through the OpenAI client library.
|
|
133
136
|
|
|
134
137
|
## Quick Start
|
|
135
138
|
|
|
@@ -171,11 +174,12 @@ variables. Example overrides (PowerShell):
|
|
|
171
174
|
[Environment]::SetEnvironmentVariable("CROSS_REVIEW_GROK_REASONING_EFFORT", "xhigh", "User")
|
|
172
175
|
```
|
|
173
176
|
|
|
174
|
-
For Grok, `GROK_API_KEY` is canonical.
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
`reasoning
|
|
178
|
-
|
|
177
|
+
For Grok, `GROK_API_KEY` is canonical. The default pin is `grok-4.3`, which
|
|
178
|
+
accepts explicit `reasoning.effort` through `high`; the adapter clamps the
|
|
179
|
+
shared effort scale before sending it. `grok-4-latest`, `grok-4.20`, and
|
|
180
|
+
`grok-4.20-reasoning` use xAI automatic reasoning in this runtime.
|
|
181
|
+
`grok-4.20-multi-agent` remains available as an explicit override for the
|
|
182
|
+
multi-agent variant.
|
|
179
183
|
|
|
180
184
|
Financial and budget controls are required for paid provider calls. Configure
|
|
181
185
|
these environment variables before running real sessions (example):
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { loadConfig } from "../src/core/config.js";
|
|
6
|
+
import { selectFromCandidates } from "../src/peers/model-selection.js";
|
|
7
|
+
import { PerplexityAdapter } from "../src/peers/perplexity.js";
|
|
8
|
+
process.env.CROSS_REVIEW_STUB = "1";
|
|
9
|
+
process.env.CROSS_REVIEW_STUB_CONFIRMED = "1";
|
|
10
|
+
process.env.PERPLEXITY_API_KEY = "test-perplexity-key";
|
|
11
|
+
process.env.CROSS_REVIEW_DATA_DIR = fs.mkdtempSync(path.join(os.tmpdir(), "cross-review-provider-refresh-"));
|
|
12
|
+
const config = loadConfig();
|
|
13
|
+
{
|
|
14
|
+
const adapter = new PerplexityAdapter(config);
|
|
15
|
+
let capturedPayload;
|
|
16
|
+
adapter.client = async () => ({
|
|
17
|
+
chat: {
|
|
18
|
+
completions: {
|
|
19
|
+
create: async (payload) => {
|
|
20
|
+
capturedPayload = payload;
|
|
21
|
+
},
|
|
22
|
+
},
|
|
23
|
+
},
|
|
24
|
+
});
|
|
25
|
+
const probe = await adapter.probe();
|
|
26
|
+
assert.equal(probe.available, true);
|
|
27
|
+
assert.equal(capturedPayload?.disable_search, true);
|
|
28
|
+
assert.ok(typeof capturedPayload?.max_tokens === "number" && capturedPayload.max_tokens >= 16, "Perplexity probe must request at least 16 max_tokens for sonar-reasoning-pro.");
|
|
29
|
+
}
|
|
30
|
+
{
|
|
31
|
+
const claude = selectFromCandidates("claude", [{ id: "claude-opus-4-8", source: "api" }], "claude-opus-4-8");
|
|
32
|
+
assert.equal(claude.selected, "claude-opus-4-8");
|
|
33
|
+
assert.equal(claude.confidence, "verified");
|
|
34
|
+
}
|
|
35
|
+
{
|
|
36
|
+
const gemini = selectFromCandidates("gemini", [{ id: "gemini-3.1-pro-preview", source: "api" }], "gemini-3.1-pro-preview");
|
|
37
|
+
assert.equal(gemini.selected, "gemini-3.1-pro-preview");
|
|
38
|
+
assert.equal(gemini.confidence, "verified");
|
|
39
|
+
}
|
|
40
|
+
{
|
|
41
|
+
const grok = selectFromCandidates("grok", [{ id: "grok-4.3", source: "api" }], "grok-4.3");
|
|
42
|
+
assert.equal(grok.selected, "grok-4.3");
|
|
43
|
+
assert.equal(grok.confidence, "verified");
|
|
44
|
+
}
|
|
45
|
+
{
|
|
46
|
+
const configSource = fs.readFileSync("src/core/config.ts", "utf8");
|
|
47
|
+
const modelSelectionSource = fs.readFileSync("src/peers/model-selection.ts", "utf8");
|
|
48
|
+
assert.ok(configSource.includes('claude: envValue("CROSS_REVIEW_ANTHROPIC_MODEL") || "claude-opus-4-8"'));
|
|
49
|
+
assert.ok(configSource.includes('gemini: envValue("CROSS_REVIEW_GEMINI_MODEL") || "gemini-3.1-pro-preview"'));
|
|
50
|
+
assert.ok(configSource.includes('grok: envValue("CROSS_REVIEW_GROK_MODEL") || "grok-4.3"'));
|
|
51
|
+
assert.ok(modelSelectionSource.includes('claude: ["claude-opus-4-8"]'));
|
|
52
|
+
assert.ok(modelSelectionSource.includes('gemini: ["gemini-3.1-pro-preview"]'));
|
|
53
|
+
assert.ok(modelSelectionSource.includes('grok: ["grok-4.3"]'));
|
|
54
|
+
}
|
|
55
|
+
console.log("[provider-refresh-smoke] PASS");
|
|
56
|
+
//# sourceMappingURL=provider-refresh-smoke.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"provider-refresh-smoke.js","sourceRoot":"","sources":["../../scripts/provider-refresh-smoke.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,oBAAoB,CAAC;AACxC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,oBAAoB,EAAE,MAAM,iCAAiC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAE/D,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,GAAG,CAAC;AACpC,OAAO,CAAC,GAAG,CAAC,2BAA2B,GAAG,GAAG,CAAC;AAC9C,OAAO,CAAC,GAAG,CAAC,kBAAkB,GAAG,qBAAqB,CAAC;AACvD,OAAO,CAAC,GAAG,CAAC,qBAAqB,GAAG,EAAE,CAAC,WAAW,CAChD,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,gCAAgC,CAAC,CACzD,CAAC;AAEF,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;AAE5B,CAAC;IACC,MAAM,OAAO,GAAG,IAAI,iBAAiB,CAAC,MAAM,CAAC,CAAC;IAC9C,IAAI,eAA8E,CAAC;IAEjF,OASD,CAAC,MAAM,GAAG,KAAK,IAAI,EAAE,CAAC,CAAC;QACtB,IAAI,EAAE;YACJ,WAAW,EAAE;gBACX,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;oBACxB,eAAe,GAAG,OAAO,CAAC;gBAC5B,CAAC;aACF;SACF;KACF,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACpC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;IACpC,MAAM,CAAC,KAAK,CAAC,eAAe,EAAE,cAAc,EAAE,IAAI,CAAC,CAAC;IACpD,MAAM,CAAC,EAAE,CACP,OAAO,eAAe,EAAE,UAAU,KAAK,QAAQ,IAAI,eAAe,CAAC,UAAU,IAAI,EAAE,EACnF,+EAA+E,CAChF,CAAC;AACJ,CAAC;AAED,CAAC;IACC,MAAM,MAAM,GAAG,oBAAoB,CACjC,QAAQ,EACR,CAAC,EAAE,EAAE,EAAE,iBAAiB,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAC1C,iBAAiB,CAClB,CAAC;IACF,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,iBAAiB,CAAC,CAAC;IACjD,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;AAC9C,CAAC;AAED,CAAC;IACC,MAAM,MAAM,GAAG,oBAAoB,CACjC,QAAQ,EACR,CAAC,EAAE,EAAE,EAAE,wBAAwB,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EACjD,wBAAwB,CACzB,CAAC;IACF,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,wBAAwB,CAAC,CAAC;IACxD,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;AAC9C,CAAC;AAED,CAAC;IACC,MAAM,IAAI,GAAG,oBAAoB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,EAAE,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAC3F,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IACxC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;AAC5C,CAAC;AAED,CAAC;IACC,MAAM,YAAY,GAAG,EAAE,CAAC,YAAY,CAAC,oBAAoB,EAAE,MAAM,CAAC,CAAC;IACnE,MAAM,oBAAoB,GAAG,EAAE,CAAC,YAAY,CAAC,8BAA8B,EAAE,MAAM,CAAC,CAAC;IACrF,MAAM,CAAC,EAAE,CACP,YAAY,CAAC,QAAQ,CAAC,uEAAuE,CAAC,CAC/F,CAAC;IACF,MAAM,CAAC,EAAE,CACP,YAAY,CAAC,QAAQ,CACnB,2EAA2E,CAC5E,CACF,CAAC;IACF,MAAM,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,yDAAyD,CAAC,CAAC,CAAC;IAC5F,MAAM,CAAC,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC;IACxE,MAAM,CAAC,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,oCAAoC,CAAC,CAAC,CAAC;IAC/E,MAAM,CAAC,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC;AACjE,CAAC;AAED,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC"}
|
package/dist/scripts/smoke.js
CHANGED
|
@@ -209,6 +209,7 @@ for (const { file, field } of adapterExpectations) {
|
|
|
209
209
|
const modelSelectionSource = fs.readFileSync("src/peers/model-selection.ts", "utf8");
|
|
210
210
|
for (const deprecatedOrWeakModel of [
|
|
211
211
|
"claude-haiku-4-5",
|
|
212
|
+
"gemini-2.5-pro",
|
|
212
213
|
"gemini-3-pro-preview",
|
|
213
214
|
"deepseek-reasoner",
|
|
214
215
|
"deepseek-chat",
|
|
@@ -220,16 +221,16 @@ for (const deprecatedOrWeakModel of [
|
|
|
220
221
|
// "must remain" list is therefore exactly the 6 lone canonical pins.
|
|
221
222
|
for (const canonicalPin of [
|
|
222
223
|
"gpt-5.5",
|
|
223
|
-
"claude-opus-4-
|
|
224
|
-
"gemini-
|
|
224
|
+
"claude-opus-4-8",
|
|
225
|
+
"gemini-3.1-pro-preview",
|
|
225
226
|
"deepseek-v4-pro",
|
|
226
|
-
"grok-4
|
|
227
|
+
"grok-4.3",
|
|
227
228
|
"sonar-reasoning-pro",
|
|
228
229
|
]) {
|
|
229
230
|
assert.ok(modelSelectionSource.includes(`"${canonicalPin}"`), `${canonicalPin} must remain the lone canonical PRIORITY pin`);
|
|
230
231
|
}
|
|
231
|
-
const noWeakDowngrade = selectFromCandidates("claude", [{ id: "claude-haiku-4-5-20251001", source: "api" }], "claude-opus-4-
|
|
232
|
-
assert.equal(noWeakDowngrade.selected, "claude-opus-4-
|
|
232
|
+
const noWeakDowngrade = selectFromCandidates("claude", [{ id: "claude-haiku-4-5-20251001", source: "api" }], "claude-opus-4-8");
|
|
233
|
+
assert.equal(noWeakDowngrade.selected, "claude-opus-4-8");
|
|
233
234
|
assert.equal(noWeakDowngrade.confidence, "unknown");
|
|
234
235
|
assert.match(noWeakDowngrade.reason, /silently downgrading/);
|
|
235
236
|
const pemMarker = (side, label) => ["-----", side, " ", label, "-----"].join("");
|
|
@@ -1347,6 +1348,35 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
1347
1348
|
assert.ok(/evidence_sources/.test(instruction), "statusInstruction must direct detail to evidence_sources (v2.5.0)");
|
|
1348
1349
|
console.log("[smoke] session_contract_directives_test: PASS");
|
|
1349
1350
|
}
|
|
1351
|
+
// v4.2.2 — verified_requires_evidence_sources_test. Peer JSON may still
|
|
1352
|
+
// declare READY, but a `confidence:"verified"` verdict without concrete
|
|
1353
|
+
// evidence_sources must not be classified as a clean machine decision.
|
|
1354
|
+
{
|
|
1355
|
+
const statusModule = await import("../src/core/status.js");
|
|
1356
|
+
const parseStatusForTruth = statusModule.parsePeerStatus;
|
|
1357
|
+
const statusInstruction = statusModule.statusInstruction;
|
|
1358
|
+
const ungrounded = parseStatusForTruth(JSON.stringify({
|
|
1359
|
+
status: "READY",
|
|
1360
|
+
summary: "Looks correct.",
|
|
1361
|
+
confidence: "verified",
|
|
1362
|
+
evidence_sources: [],
|
|
1363
|
+
caller_requests: [],
|
|
1364
|
+
follow_ups: [],
|
|
1365
|
+
}));
|
|
1366
|
+
assert.ok(ungrounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.2 / truthfulness_guardrails: confidence=verified with empty evidence_sources must emit verified_without_evidence_sources");
|
|
1367
|
+
assert.equal(ungrounded.structured?.confidence, "verified", "v4.2.2 / truthfulness_guardrails: parser warning must not silently rewrite peer confidence");
|
|
1368
|
+
const grounded = parseStatusForTruth(JSON.stringify({
|
|
1369
|
+
status: "READY",
|
|
1370
|
+
summary: "Runtime claim matches the raw source.",
|
|
1371
|
+
confidence: "verified",
|
|
1372
|
+
evidence_sources: ['server_info: {"version":"4.2.1","release_date":"2026-05-21"}'],
|
|
1373
|
+
caller_requests: [],
|
|
1374
|
+
follow_ups: [],
|
|
1375
|
+
}));
|
|
1376
|
+
assert.ok(!grounded.parser_warnings.includes("verified_without_evidence_sources"), "v4.2.2 / truthfulness_guardrails: concrete evidence_sources must satisfy verified confidence");
|
|
1377
|
+
assert.ok(/confidence.*verified[\s\S]+evidence_sources/i.test(statusInstruction()), "v4.2.2 / truthfulness_guardrails: statusInstruction must tie verified confidence to concrete evidence_sources");
|
|
1378
|
+
console.log("[smoke] verified_requires_evidence_sources_test: PASS");
|
|
1379
|
+
}
|
|
1350
1380
|
// v2.5.0: CROSS_REVIEW_DEFAULT_MAX_ROUNDS env override is honored.
|
|
1351
1381
|
{
|
|
1352
1382
|
const { loadConfig: reload } = await import("../src/core/config.js");
|
|
@@ -3644,14 +3674,12 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
3644
3674
|
// v3.0.0: PEERS now has 6 entries (perplexity added).
|
|
3645
3675
|
assert.equal(PEERS.length, 6, "PEERS must have 6 entries (codex/claude/gemini/deepseek/grok/perplexity)");
|
|
3646
3676
|
const cfg = loadConfig();
|
|
3647
|
-
//
|
|
3648
|
-
//
|
|
3649
|
-
//
|
|
3650
|
-
//
|
|
3651
|
-
//
|
|
3652
|
-
|
|
3653
|
-
// clampEffortForModel tests below continue to pin that capability.
|
|
3654
|
-
assert.equal(cfg.models.grok, "grok-4-latest", "default grok model must be grok-4-latest (v3.7.2, operator directive)");
|
|
3677
|
+
// v4.2.2 provider-doc refresh: default grok model is the concrete
|
|
3678
|
+
// `grok-4.3` pin. `grok-4-latest` remains a valid xAI alias and
|
|
3679
|
+
// `grok-4.20-multi-agent` remains a valid env-override for explicit
|
|
3680
|
+
// multi-agent reasoning behavior; the adapter tests below continue to
|
|
3681
|
+
// pin those capabilities.
|
|
3682
|
+
assert.equal(cfg.models.grok, "grok-4.3", "default grok model must be grok-4.3 (v4.2.2 provider-doc refresh)");
|
|
3655
3683
|
assert.ok("grok" in cfg.fallback_models, "fallback_models must have grok entry");
|
|
3656
3684
|
assert.equal(cfg.peer_enabled.grok, true, "grok must be enabled by default");
|
|
3657
3685
|
assert.ok(cfg.cost_rates.grok, "grok cost rates must be configured (env-set in smoke setup)");
|
|
@@ -5210,6 +5238,19 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5210
5238
|
});
|
|
5211
5239
|
assert.ok(netNewAssertionWithDraft.fabricated === true &&
|
|
5212
5240
|
netNewAssertionWithDraft.suspicious_assertion_count >= 2, `v3.7.4 / fabrication_lock: operational assertions NET-NEW vs {provenance ∪ priorDraft} — invented by the relator even though a prior draft exists — MUST still trip fabricated=true (got count=${netNewAssertionWithDraft.suspicious_assertion_count}, fabricated=${netNewAssertionWithDraft.fabricated})`);
|
|
5241
|
+
const inventedWorkflowDispatch = detectFabricatedEvidence("Refazendo agora. Workflow launched in background. Task ID: wllbll9am. Run ID: wf_e7c69578-e23.", {
|
|
5242
|
+
provenanceCorpus: "",
|
|
5243
|
+
priorDraftCorpus: "The user challenged the report and did not authorize a redo.",
|
|
5244
|
+
narrativeCorpus: "Analyze why Claude lied about the prior v4.2.0 audit.",
|
|
5245
|
+
});
|
|
5246
|
+
assert.ok(inventedWorkflowDispatch.fabricated === true &&
|
|
5247
|
+
inventedWorkflowDispatch.suspicious_assertion_count >= 2, `v4.2.2 / truthfulness_guardrails: invented workflow dispatch claims MUST trip fabricated=true (got count=${inventedWorkflowDispatch.suspicious_assertion_count}, fabricated=${inventedWorkflowDispatch.fabricated})`);
|
|
5248
|
+
const genericConfirmation = detectFabricatedEvidence("The reviewer confirmed the model-selection rationale is clear.", {
|
|
5249
|
+
provenanceCorpus: "",
|
|
5250
|
+
priorDraftCorpus: "",
|
|
5251
|
+
narrativeCorpus: "",
|
|
5252
|
+
});
|
|
5253
|
+
assert.equal(genericConfirmation.fabricated, false, "v4.2.2 / truthfulness_guardrails: generic 'confirmed' prose without a dispatch/authorization claim must not trip fabrication detection");
|
|
5213
5254
|
// Source-level: threshold constants pinned at the documented values.
|
|
5214
5255
|
assert.ok(/FABRICATED_NET_NEW_HEX_THRESHOLD\s*=\s*3/.test(orchSrc), "v2.24.0 / fabrication_lock: net-new hex threshold pinned at 3");
|
|
5215
5256
|
assert.ok(/FABRICATED_SUSPICIOUS_ASSERTION_THRESHOLD\s*=\s*2/.test(orchSrc), "v2.24.0 / fabrication_lock: suspicious assertion threshold pinned at 2");
|
|
@@ -5675,6 +5716,75 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
5675
5716
|
assert.ok(/boolEnv\("CROSS_REVIEW_EVIDENCE_PREFLIGHT", true\)/.test(configSrcPf), "v3.5.0 / evidence_preflight: CROSS_REVIEW_EVIDENCE_PREFLIGHT env var must default ON");
|
|
5676
5717
|
console.log("[smoke] evidence_preflight_test: PASS");
|
|
5677
5718
|
}
|
|
5719
|
+
// v4.2.2 — truthfulness_preflight_test. Pins the guard added after the
|
|
5720
|
+
// Claude Code Opus 4.8 incident where a report asserted
|
|
5721
|
+
// "v4.2.0 current production" despite live server_info showing
|
|
5722
|
+
// v4.2.1. The old evidence preflight only checked completed-work
|
|
5723
|
+
// claims (tests/diff/build) and did not reject current-runtime
|
|
5724
|
+
// contradictions or unsupported historical timing narratives.
|
|
5725
|
+
{
|
|
5726
|
+
const { truthfulnessPreflight } = await import("../src/core/orchestrator.js");
|
|
5727
|
+
const runtimeFacts = {
|
|
5728
|
+
runtime_version: "4.2.1",
|
|
5729
|
+
release_date: "2026-05-21",
|
|
5730
|
+
model_pins: {
|
|
5731
|
+
claude: "claude-opus-4-8",
|
|
5732
|
+
grok: "grok-4.3",
|
|
5733
|
+
},
|
|
5734
|
+
};
|
|
5735
|
+
const contradictedByRuntime = truthfulnessPreflight({
|
|
5736
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5737
|
+
initialDraft: 'Live server_info: {"version":"4.2.1","release_date":"2026-05-21"}\nAudit report for cross-review v4.2.0 current production, released 2026-05-17.',
|
|
5738
|
+
runtimeFacts,
|
|
5739
|
+
attachmentsPresent: false,
|
|
5740
|
+
});
|
|
5741
|
+
assert.equal(contradictedByRuntime.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime version claim contradicting runtime facts must trip even when server_info text is present");
|
|
5742
|
+
assert.ok(contradictedByRuntime.contradictions.some((item) => item.includes("4.2.0")), "v4.2.2 / truthfulness_preflight: mismatch diagnostics must include the contradicted version token");
|
|
5743
|
+
const backedByRuntime = truthfulnessPreflight({
|
|
5744
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5745
|
+
initialDraft: 'Live server_info: {"version":"4.2.1","release_date":"2026-05-21"}\nAudit report for cross-review v4.2.1 current production, released 2026-05-21.',
|
|
5746
|
+
runtimeFacts,
|
|
5747
|
+
attachmentsPresent: false,
|
|
5748
|
+
});
|
|
5749
|
+
assert.equal(backedByRuntime.pass, true, "v4.2.2 / truthfulness_preflight: current-runtime claim matching runtime facts must pass");
|
|
5750
|
+
const unsupportedCurrentState = truthfulnessPreflight({
|
|
5751
|
+
task: "Audit all sessions generated with the current cross-review version.",
|
|
5752
|
+
initialDraft: "Audit report for cross-review v4.2.1 current production.",
|
|
5753
|
+
runtimeFacts: {},
|
|
5754
|
+
attachmentsPresent: false,
|
|
5755
|
+
});
|
|
5756
|
+
assert.equal(unsupportedCurrentState.pass, false, "v4.2.2 / truthfulness_preflight: current-runtime claim without runtime facts or source evidence must trip");
|
|
5757
|
+
const historicalChangelog = truthfulnessPreflight({
|
|
5758
|
+
task: "Review this changelog text.",
|
|
5759
|
+
initialDraft: "v4.2.0 was released on 2026-05-17. v4.2.1 was released on 2026-05-21.",
|
|
5760
|
+
runtimeFacts,
|
|
5761
|
+
attachmentsPresent: false,
|
|
5762
|
+
});
|
|
5763
|
+
assert.equal(historicalChangelog.pass, true, "v4.2.2 / truthfulness_preflight: historical version text without current/timing claims must not trip");
|
|
5764
|
+
const fabricatedTiming = truthfulnessPreflight({
|
|
5765
|
+
task: "Explain why the report said v4.2.0.",
|
|
5766
|
+
initialDraft: "When the workflow began, cross-review was running v4.2.0. It was bumped to v4.2.1 between R1 and R3.",
|
|
5767
|
+
runtimeFacts,
|
|
5768
|
+
attachmentsPresent: false,
|
|
5769
|
+
});
|
|
5770
|
+
assert.equal(fabricatedTiming.pass, false, "v4.2.2 / truthfulness_preflight: historical runtime timing narrative without snapshot evidence must trip");
|
|
5771
|
+
const withStructuredEvidence = truthfulnessPreflight({
|
|
5772
|
+
task: "Explain why the report said v4.2.0.",
|
|
5773
|
+
initialDraft: "When the workflow began, cross-review was running v4.2.0. It was bumped to v4.2.1 between R1 and R3.",
|
|
5774
|
+
runtimeFacts,
|
|
5775
|
+
structuredEvidence: "Historical runtime snapshot from events.ndjson: workflow_start server_info version=4.2.0; later reload server_info version=4.2.1.",
|
|
5776
|
+
attachmentsPresent: false,
|
|
5777
|
+
});
|
|
5778
|
+
assert.equal(withStructuredEvidence.pass, true, "v4.2.2 / truthfulness_preflight: structured evidence can satisfy historical timing claims");
|
|
5779
|
+
const orchSrcTruth = fs.readFileSync(new URL("../src/core/orchestrator.ts", import.meta.url), "utf8");
|
|
5780
|
+
const configSrcTruth = fs.readFileSync(new URL("../src/core/config.ts", import.meta.url), "utf8");
|
|
5781
|
+
assert.ok(/export function truthfulnessPreflight\b/.test(orchSrcTruth), "v4.2.2 / truthfulness_preflight: truthfulnessPreflight must be exported");
|
|
5782
|
+
assert.ok(/truthfulness_preflight_enabled/.test(orchSrcTruth) &&
|
|
5783
|
+
/askPeers[\s\S]+truthfulnessPreflight/.test(orchSrcTruth) &&
|
|
5784
|
+
/runUntilUnanimous[\s\S]+truthfulnessPreflight/.test(orchSrcTruth), "v4.2.2 / truthfulness_preflight: both askPeers and runUntilUnanimous must gate on config.truthfulness_preflight_enabled");
|
|
5785
|
+
assert.ok(/boolEnv\("CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT", true\)/.test(configSrcTruth), "v4.2.2 / truthfulness_preflight: CROSS_REVIEW_TRUTHFULNESS_PREFLIGHT env var must default ON");
|
|
5786
|
+
console.log("[smoke] truthfulness_preflight_test: PASS");
|
|
5787
|
+
}
|
|
5678
5788
|
// v3.5.0 (CRV2-1 + CRV2-6) — budget + max_rounds traceability.
|
|
5679
5789
|
//
|
|
5680
5790
|
// setSessionTraceability persists requested-vs-effective max_rounds and
|
|
@@ -6020,15 +6130,15 @@ assert.equal(Object.hasOwn(metrics.decision_quality, "undefined"), false);
|
|
|
6020
6130
|
// SINGLE canonical pin. Negative pins (off-policy models that must never
|
|
6021
6131
|
// appear) + positive pins (the exact lone-entry shape per peer).
|
|
6022
6132
|
const a3ModelSrc = fs.readFileSync(new URL("../src/peers/model-selection.ts", import.meta.url), "utf8");
|
|
6023
|
-
for (const offPolicyModel of ["deepseek-v4-flash", "gemini-
|
|
6133
|
+
for (const offPolicyModel of ["deepseek-v4-flash", "gemini-2.5-pro"]) {
|
|
6024
6134
|
assert.ok(!a3ModelSrc.includes(`"${offPolicyModel}"`), `v3.7.2 / AUDIT-3: ${offPolicyModel} must not appear in the PRIORITY lists`);
|
|
6025
6135
|
}
|
|
6026
6136
|
for (const [peer, pin] of [
|
|
6027
6137
|
["codex", "gpt-5.5"],
|
|
6028
|
-
["claude", "claude-opus-4-
|
|
6029
|
-
["gemini", "gemini-
|
|
6138
|
+
["claude", "claude-opus-4-8"],
|
|
6139
|
+
["gemini", "gemini-3.1-pro-preview"],
|
|
6030
6140
|
["deepseek", "deepseek-v4-pro"],
|
|
6031
|
-
["grok", "grok-4
|
|
6141
|
+
["grok", "grok-4.3"],
|
|
6032
6142
|
["perplexity", "sonar-reasoning-pro"],
|
|
6033
6143
|
]) {
|
|
6034
6144
|
assert.ok(new RegExp(`${peer}: \\["${pin}"\\]`).test(a3ModelSrc), `v3.7.2 / AUDIT-3: ${peer} PRIORITY must be the lone canonical pin ["${pin}"] (no fallback)`);
|