@bookedsolid/rea 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.husky/commit-msg +130 -0
- package/.husky/pre-push +128 -0
- package/README.md +5 -5
- package/agents/codex-adversarial.md +23 -8
- package/commands/codex-review.md +2 -2
- package/dist/audit/append.d.ts +62 -0
- package/dist/audit/append.js +189 -0
- package/dist/audit/codex-event.d.ts +28 -0
- package/dist/audit/codex-event.js +15 -0
- package/dist/cli/doctor.d.ts +60 -1
- package/dist/cli/doctor.js +459 -20
- package/dist/cli/index.js +35 -5
- package/dist/cli/init.d.ts +13 -0
- package/dist/cli/init.js +278 -67
- package/dist/cli/install/canonical.d.ts +43 -0
- package/dist/cli/install/canonical.js +101 -0
- package/dist/cli/install/claude-md.d.ts +48 -0
- package/dist/cli/install/claude-md.js +93 -0
- package/dist/cli/install/commit-msg.d.ts +30 -0
- package/dist/cli/install/commit-msg.js +102 -0
- package/dist/cli/install/copy.d.ts +169 -0
- package/dist/cli/install/copy.js +455 -0
- package/dist/cli/install/fs-safe.d.ts +91 -0
- package/dist/cli/install/fs-safe.js +347 -0
- package/dist/cli/install/manifest-io.d.ts +12 -0
- package/dist/cli/install/manifest-io.js +44 -0
- package/dist/cli/install/manifest-schema.d.ts +83 -0
- package/dist/cli/install/manifest-schema.js +80 -0
- package/dist/cli/install/reagent.d.ts +59 -0
- package/dist/cli/install/reagent.js +160 -0
- package/dist/cli/install/settings-merge.d.ts +91 -0
- package/dist/cli/install/settings-merge.js +239 -0
- package/dist/cli/install/sha.d.ts +9 -0
- package/dist/cli/install/sha.js +21 -0
- package/dist/cli/serve.d.ts +11 -0
- package/dist/cli/serve.js +72 -6
- package/dist/cli/upgrade.d.ts +67 -0
- package/dist/cli/upgrade.js +509 -0
- package/dist/gateway/downstream-pool.d.ts +39 -0
- package/dist/gateway/downstream-pool.js +93 -0
- package/dist/gateway/downstream.d.ts +80 -0
- package/dist/gateway/downstream.js +196 -0
- package/dist/gateway/middleware/audit-types.d.ts +10 -0
- package/dist/gateway/middleware/audit.js +14 -0
- package/dist/gateway/middleware/injection.d.ts +59 -2
- package/dist/gateway/middleware/injection.js +91 -14
- package/dist/gateway/middleware/kill-switch.d.ts +20 -5
- package/dist/gateway/middleware/kill-switch.js +57 -35
- package/dist/gateway/middleware/redact.d.ts +83 -6
- package/dist/gateway/middleware/redact.js +133 -46
- package/dist/gateway/observability/codex-probe.d.ts +110 -0
- package/dist/gateway/observability/codex-probe.js +234 -0
- package/dist/gateway/observability/codex-telemetry.d.ts +93 -0
- package/dist/gateway/observability/codex-telemetry.js +221 -0
- package/dist/gateway/redact-safe/match-timeout.d.ts +83 -0
- package/dist/gateway/redact-safe/match-timeout.js +179 -0
- package/dist/gateway/reviewers/claude-self.d.ts +99 -0
- package/dist/gateway/reviewers/claude-self.js +316 -0
- package/dist/gateway/reviewers/codex.d.ts +64 -0
- package/dist/gateway/reviewers/codex.js +80 -0
- package/dist/gateway/reviewers/select.d.ts +64 -0
- package/dist/gateway/reviewers/select.js +102 -0
- package/dist/gateway/reviewers/types.d.ts +85 -0
- package/dist/gateway/reviewers/types.js +14 -0
- package/dist/gateway/server.d.ts +51 -0
- package/dist/gateway/server.js +258 -0
- package/dist/gateway/session.d.ts +9 -0
- package/dist/gateway/session.js +17 -0
- package/dist/policy/loader.d.ts +59 -0
- package/dist/policy/loader.js +65 -0
- package/dist/policy/profiles.d.ts +80 -0
- package/dist/policy/profiles.js +94 -0
- package/dist/policy/types.d.ts +38 -0
- package/dist/registry/loader.d.ts +98 -0
- package/dist/registry/loader.js +153 -0
- package/dist/registry/types.d.ts +44 -0
- package/dist/registry/types.js +6 -0
- package/dist/scripts/read-policy-field.d.ts +36 -0
- package/dist/scripts/read-policy-field.js +96 -0
- package/hooks/push-review-gate.sh +627 -17
- package/package.json +13 -2
- package/profiles/bst-internal-no-codex.yaml +40 -0
- package/profiles/bst-internal.yaml +23 -0
- package/profiles/client-engagement.yaml +23 -0
- package/profiles/lit-wc.yaml +17 -0
- package/profiles/minimal.yaml +11 -0
- package/profiles/open-source-no-codex.yaml +33 -0
- package/profiles/open-source.yaml +18 -0
- package/scripts/lint-safe-regex.mjs +78 -0
- package/scripts/postinstall.mjs +131 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import { MessageChannel, Worker, receiveMessageOnPort } from 'node:worker_threads';
|
|
2
|
+
const DEFAULT_TIMEOUT_MS = 100;
|
|
3
|
+
/**
 * Worker source — a single script serves the `test`, `replace`, and
 * `matchAll` ops. The worker receives the request, a SharedArrayBuffer used
 * for synchronization, and a transferred reply port via `workerData`. It
 * compiles the regex inside the worker (so a catastrophic pattern burns
 * worker CPU only), posts the result payload on `replyPort`, and then signals
 * completion by writing `1` into the SAB and calling `Atomics.notify`. The
 * parent blocks on `Atomics.wait(sab, 0, 0, timeoutMs)` and wakes when the
 * worker notifies — OR when the timeout expires, in which case the parent
 * terminates the worker.
 *
 * SECURITY: The parent must NOT rely on the `message` event alone, because
 * `Atomics.wait` blocks the main thread's event loop. The SAB signal is the
 * authoritative wake source. The parent reads the reply AFTER wake by
 * draining its end of the channel with `receiveMessageOnPort`.
 *
 * The pattern is compiled exactly once per request. For `matchAll` the global
 * flag is forced on before compiling; a freshly built RegExp always starts
 * with `lastIndex === 0`, so no explicit reset is needed.
 */
const WORKER_SOURCE = `
const { workerData } = require('node:worker_threads');
const { signalSab, replyPort, req } = workerData;
const view = new Int32Array(signalSab);
try {
  // Force the global flag on for matchAll so the op is meaningful.
  const needsGlobal = req.op === 'matchAll' && !req.flags.includes('g');
  const re = new RegExp(req.source, needsGlobal ? req.flags + 'g' : req.flags);
  let reply;
  if (req.op === 'test') {
    reply = { ok: true, op: 'test', matched: re.test(req.input) };
  } else if (req.op === 'replace') {
    reply = { ok: true, op: 'replace', output: req.input.replace(re, req.replacer) };
  } else if (req.op === 'matchAll') {
    const out = [];
    for (const m of req.input.matchAll(re)) {
      out.push(m[0]);
    }
    reply = { ok: true, op: 'matchAll', matches: out };
  } else {
    reply = { ok: false, error: 'unknown op: ' + req.op };
  }
  replyPort.postMessage(reply);
} catch (err) {
  replyPort.postMessage({ ok: false, error: err && err.message ? err.message : String(err) });
} finally {
  // Signal completion via SAB so the parent's Atomics.wait unblocks. The parent
  // then drains the reply port synchronously via receiveMessageOnPort.
  Atomics.store(view, 0, 1);
  Atomics.notify(view, 0);
}
`;
|
|
53
|
+
/**
 * Synchronous wrapper around the worker. Middleware hot paths call `.test()`
 * and `.replace()` inside tight synchronous loops (see `redactSecrets`), so the
 * public `SafeRegex` surface has to be synchronous to be a drop-in replacement.
 *
 * How it works:
 * 1. Allocate a 4-byte SharedArrayBuffer. The worker and parent both see it.
 * 2. Spawn the worker with `workerData: { signalSab, replyPort, req }`; the
 *    reply port is one end of a MessageChannel, handed over via `transferList`.
 * 3. Parent blocks on `Atomics.wait(view, 0, 0, timeoutMs)` — allowed on the
 *    Node main thread (unlike the browser).
 * 4. Worker computes the result, posts the reply message, then writes `1` to
 *    the SAB and calls `Atomics.notify`. The SAB notify is the authoritative
 *    wake — the message event cannot fire because the event loop is blocked.
 * 5. Parent wakes, drains its end of the channel synchronously via
 *    `receiveMessageOnPort`, then terminates the worker.
 *
 * On timeout the parent `terminate()`s the worker — a hard kill that stops a
 * catastrophic backtracker cold.
 *
 * @param req Request forwarded to the worker as `workerData.req`.
 * @param timeoutMs Timeout handed to `Atomics.wait`, in milliseconds.
 * @returns `{ reply: null, timedOut: true }` on timeout; otherwise
 *   `{ reply, timedOut: false }`, where `reply` is the worker's message or an
 *   `{ ok: false, error }` placeholder when no message was queued.
 * @throws Whatever `new Worker(...)` throws. The channel is closed before the
 *   error is re-thrown so a failed spawn does not leave an open port behind.
 */
function runInWorkerSync(req, timeoutMs) {
    const signalSab = new SharedArrayBuffer(4);
    const view = new Int32Array(signalSab);
    // The worker gets `port1`; the parent keeps `port2` so it can pull the
    // reply synchronously with `receiveMessageOnPort`.
    const { port1: workerSendPort, port2: parentRecvPort } = new MessageChannel();
    let worker;
    try {
        worker = new Worker(WORKER_SOURCE, {
            eval: true,
            workerData: { signalSab, replyPort: workerSendPort, req },
            transferList: [workerSendPort],
        });
    }
    catch (err) {
        // Spawn failed (e.g. the request could not be cloned). Closing our end
        // tears down the whole channel; then surface the original error.
        parentRecvPort.close();
        throw err;
    }
    // Don't pin the process on the worker's existence.
    worker.unref();
    try {
        // Block this thread until the worker signals completion OR the timeout
        // expires. A result other than 'timed-out' ('ok' / 'not-equal') means
        // the worker has already stored its completion flag.
        const waitResult = Atomics.wait(view, 0, 0, timeoutMs);
        if (waitResult === 'timed-out') {
            // Worker is still running — the `finally` below kills it.
            return { reply: null, timedOut: true };
        }
        // The worker posts its message BEFORE notifying, so the queue should
        // hold exactly one reply; `undefined` is guarded defensively.
        const msg = receiveMessageOnPort(parentRecvPort);
        if (msg !== undefined && msg.message !== null) {
            return { reply: msg.message, timedOut: false };
        }
        return { reply: { ok: false, error: 'worker produced no result' }, timedOut: false };
    }
    finally {
        // Release the worker thread and close the port on every exit path.
        void worker.terminate();
        parentRecvPort.close();
    }
}
|
|
114
|
+
/**
 * Wrap a RegExp in a timeout-enforced `SafeRegex`. Only the pattern's
 * `source` and `flags` are forwarded; the regex is recompiled and executed
 * inside a worker so a catastrophic match spends only worker CPU.
 *
 * SECURITY: callers should pass regexes that have ALSO been cleared by
 * `safe-regex` at load time — the timeout is a defense-in-depth backstop, not
 * a replacement for static analysis. See `scripts/lint-safe-regex.mjs` and the
 * load-time check in `src/policy/loader.ts`.
 *
 * @param pattern The regex to guard.
 * @param opts Optional `{ timeoutMs, onTimeout }`. `timeoutMs` defaults to
 *   `DEFAULT_TIMEOUT_MS`; a value that coerces to `NaN` also falls back to the
 *   default, because `Atomics.wait` interprets a NaN timeout as "wait
 *   forever", which would silently disable the backstop. `onTimeout(pattern,
 *   input)` is invoked on every timeout and any error it throws is swallowed.
 * @returns An object exposing `pattern` plus synchronous `test`, `replace`,
 *   and `matchAll` methods. Each result carries a `timedOut` flag. On timeout
 *   or worker error the methods degrade to "no match" / "input unchanged" /
 *   "empty match list" respectively.
 */
export function wrapRegex(pattern, opts) {
    const requestedTimeout = opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    // NaN would turn Atomics.wait into an unbounded wait — never allow that.
    const timeoutMs = Number.isNaN(Number(requestedTimeout)) ? DEFAULT_TIMEOUT_MS : requestedTimeout;
    const onTimeout = opts?.onTimeout;
    const source = pattern.source;
    const flags = pattern.flags;
    const emitTimeout = (input) => {
        if (!onTimeout) {
            return;
        }
        try {
            onTimeout(pattern, input);
        }
        catch {
            // Callback errors MUST NOT break middleware. Swallow silently — the
            // middleware has its own audit path if it cares.
        }
    };
    return {
        pattern,
        test(input) {
            const { reply, timedOut } = runInWorkerSync({ op: 'test', source, flags, input }, timeoutMs);
            if (timedOut) {
                emitTimeout(input);
                return { matched: false, timedOut: true };
            }
            if (reply && reply.ok && reply.op === 'test') {
                return { matched: reply.matched, timedOut: false };
            }
            // Worker errored (compile error, etc.) — treat as no match, no timeout.
            return { matched: false, timedOut: false };
        },
        replace(input, replacer) {
            const { reply, timedOut } = runInWorkerSync({ op: 'replace', source, flags, input, replacer }, timeoutMs);
            if (timedOut) {
                emitTimeout(input);
                return { output: input, timedOut: true };
            }
            if (reply && reply.ok && reply.op === 'replace') {
                return { output: reply.output, timedOut: false };
            }
            // Worker errored — preserve input unchanged (never corrupt payload).
            return { output: input, timedOut: false };
        },
        matchAll(input) {
            const { reply, timedOut } = runInWorkerSync({ op: 'matchAll', source, flags, input }, timeoutMs);
            if (timedOut) {
                emitTimeout(input);
                return { matches: [], timedOut: true };
            }
            if (reply && reply.ok && reply.op === 'matchAll') {
                return { matches: reply.matches, timedOut: false };
            }
            // Worker errored — return empty match set.
            return { matches: [], timedOut: false };
        },
    };
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ClaudeSelfReviewer — the runtime fallback for the adversarial reviewer
|
|
3
|
+
* slot (G11.2).
|
|
4
|
+
*
|
|
5
|
+
* When Codex is unreachable (rate-limited, unauthenticated, CLI missing)
|
|
6
|
+
* and the operator hasn't opted into a first-class no-Codex policy, we
|
|
7
|
+
* still want SOMETHING pushing back on the diff before it lands. A fresh-
|
|
8
|
+
* context Opus call with a review-only system prompt is not a cross-model
|
|
9
|
+
* check — it's the same family reviewing its own output — so we label every
|
|
10
|
+
* result `degraded: true` so the audit trail is honest about it.
|
|
11
|
+
*
|
|
12
|
+
* ## Design notes
|
|
13
|
+
*
|
|
14
|
+
* - One-shot, no conversation history. The SDK call is synchronous from
|
|
15
|
+
* our caller's perspective.
|
|
16
|
+
* - We pin the model id in `version` so older audit entries stay
|
|
17
|
+
* reproducible when we bump the default.
|
|
18
|
+
* - The model is prompted to return STRICT JSON matching `ReviewResult`.
|
|
19
|
+
* If parsing fails we return `verdict: 'error'` rather than guessing —
|
|
20
|
+
* the operator gets a clear signal that the fallback didn't work.
|
|
21
|
+
* - Rate-limit / 5xx errors bubble up as `verdict: 'error'` with the raw
|
|
22
|
+
* message; callers can decide to retry, prompt the human, or abort.
|
|
23
|
+
* - We cap the diff at 200KB and note the truncation in the summary. The
|
|
24
|
+
* inbound token budget for a big Opus call is much larger, but a 200KB
|
|
25
|
+
* diff is already a red flag on its own and we don't want to silently
|
|
26
|
+
* eat massive payloads.
|
|
27
|
+
*/
|
|
28
|
+
import { recordTelemetry } from '../observability/codex-telemetry.js';
|
|
29
|
+
import type { AdversarialReviewer, ReviewRequest, ReviewResult } from './types.js';
|
|
30
|
+
/**
 * Minimal call signature for the single SDK method this reviewer uses.
 * Modeled on `client.messages.create` closely enough for our purposes, so
 * unit tests can pass a fake instead of pulling in the full Anthropic client.
 */
export interface MessagesCreateFn {
    (params: {
        model: string;
        max_tokens: number;
        system: string;
        messages: {
            role: 'user';
            content: string;
        }[];
    }): Promise<{
        content: {
            type: string;
            text?: string;
        }[];
    }>;
}
|
|
51
|
+
/**
 * Constructor options for `ClaudeSelfReviewer`. Every field is optional;
 * production callers rely on the defaults, while tests use these seams to
 * inject fakes without stubbing the module registry.
 */
export interface ClaudeSelfReviewerOptions {
    /** API key for the real client. */
    apiKey?: string;
    /** Model id to use in place of the built-in default. */
    model?: string;
    /** Replacement for the SDK's create call (test seam). */
    create?: MessagesCreateFn;
    /**
     * Repo root for the telemetry write path (`<baseDir>/.rea/metrics.jsonl`).
     * Defaults to `process.cwd()`. Tests point this at a temp dir so the
     * repo's real metrics file is left alone.
     */
    baseDir?: string;
    /**
     * Test seam — replaces the telemetry-record function so unit tests can
     * assert on the exact shape without touching disk. Defaults to the real
     * `recordTelemetry` helper.
     */
    recordTelemetryFn?: typeof recordTelemetry;
}
|
|
72
|
+
/**
 * Declaration of the Claude-based fallback reviewer. Implements the
 * `AdversarialReviewer` contract.
 */
export declare class ClaudeSelfReviewer implements AdversarialReviewer {
    readonly name: "claude-self";
    readonly version: string;
    private readonly apiKey;
    private readonly createFn;
    private readonly baseDir;
    private readonly recordTelemetryFn;
    constructor(opts?: ClaudeSelfReviewerOptions);
    /**
     * Inexpensive availability check. No API request is made — the selector
     * only needs to know whether an attempt is possible. A bogus key is
     * discovered later in `review()` and reported as `verdict: 'error'`.
     */
    isAvailable(): Promise<boolean>;
    /** Run a review for the given request and resolve with its result. */
    review(req: ReviewRequest): Promise<ReviewResult>;
    /**
     * Fire-and-forget telemetry write. The default helper is fail-soft, but
     * an injected `recordTelemetryFn` that throws synchronously could still
     * escape a bare `void fn(...)`. This wrapper contains both synchronous
     * throws and async rejections so a telemetry bug never breaks a review.
     */
    private emitTelemetry;
    /**
     * Lazily resolves the create() closure, so the real client is built only
     * when an API key exists and nothing was injected.
     */
    private getCreateFn;
}
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ClaudeSelfReviewer — the runtime fallback for the adversarial reviewer
|
|
3
|
+
* slot (G11.2).
|
|
4
|
+
*
|
|
5
|
+
* When Codex is unreachable (rate-limited, unauthenticated, CLI missing)
|
|
6
|
+
* and the operator hasn't opted into a first-class no-Codex policy, we
|
|
7
|
+
* still want SOMETHING pushing back on the diff before it lands. A fresh-
|
|
8
|
+
* context Opus call with a review-only system prompt is not a cross-model
|
|
9
|
+
* check — it's the same family reviewing its own output — so we label every
|
|
10
|
+
* result `degraded: true` so the audit trail is honest about it.
|
|
11
|
+
*
|
|
12
|
+
* ## Design notes
|
|
13
|
+
*
|
|
14
|
+
* - One-shot, no conversation history. The SDK call is synchronous from
|
|
15
|
+
* our caller's perspective.
|
|
16
|
+
* - We pin the model id in `version` so older audit entries stay
|
|
17
|
+
* reproducible when we bump the default.
|
|
18
|
+
* - The model is prompted to return STRICT JSON matching `ReviewResult`.
|
|
19
|
+
* If parsing fails we return `verdict: 'error'` rather than guessing —
|
|
20
|
+
* the operator gets a clear signal that the fallback didn't work.
|
|
21
|
+
* - Rate-limit / 5xx errors bubble up as `verdict: 'error'` with the raw
|
|
22
|
+
* message; callers can decide to retry, prompt the human, or abort.
|
|
23
|
+
* - We cap the diff at 200KB and note the truncation in the summary. The
|
|
24
|
+
* inbound token budget for a big Opus call is much larger, but a 200KB
|
|
25
|
+
* diff is already a red flag on its own and we don't want to silently
|
|
26
|
+
* eat massive payloads.
|
|
27
|
+
*/
|
|
28
|
+
import Anthropic, { APIError } from '@anthropic-ai/sdk';
|
|
29
|
+
import { recordTelemetry } from '../observability/codex-telemetry.js';
|
|
30
|
+
/** Pin the model id — audit entries reference this verbatim. */
|
|
31
|
+
const CLAUDE_MODEL_ID = 'claude-opus-4-7';
|
|
32
|
+
/** 200KB cap on the diff before we truncate and flag degraded. */
|
|
33
|
+
const DIFF_TRUNCATE_BYTES = 200 * 1024;
|
|
34
|
+
/** Bounded output so a runaway model can't exhaust our token budget. */
|
|
35
|
+
const MAX_OUTPUT_TOKENS = 4096;
|
|
36
|
+
const SYSTEM_PROMPT = `You are an adversarial code reviewer. A diff will be provided along with
|
|
37
|
+
commit metadata. Identify high-impact security, correctness, edge-case,
|
|
38
|
+
test-gap, api-design, or performance issues. Do not restate what the diff
|
|
39
|
+
does; surface what is wrong or risky.
|
|
40
|
+
|
|
41
|
+
Respond with STRICT JSON matching exactly this schema. Do not include
|
|
42
|
+
markdown fences, commentary, or any text outside the JSON object:
|
|
43
|
+
|
|
44
|
+
{
|
|
45
|
+
"verdict": "pass" | "concerns" | "blocking" | "error",
|
|
46
|
+
"summary": "one sentence",
|
|
47
|
+
"findings": [
|
|
48
|
+
{
|
|
49
|
+
"category": "security" | "correctness" | "edge-case" | "test-gap" | "api-design" | "performance",
|
|
50
|
+
"severity": "high" | "medium" | "low",
|
|
51
|
+
"file": "relative/path",
|
|
52
|
+
"line": 123,
|
|
53
|
+
"issue": "short problem statement",
|
|
54
|
+
"evidence": "optional quote from the diff",
|
|
55
|
+
"suggested_fix": "optional one-line fix hint"
|
|
56
|
+
}
|
|
57
|
+
]
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
Return an empty findings array for a clean pass. Use "blocking" only for
|
|
61
|
+
issues that must be fixed before merge.`;
|
|
62
|
+
/**
 * Assemble the plain-text user message sent to the model: a three-line
 * header (branch, head SHA, diff target), a commit-log section, and a diff
 * section. An empty commit log or diff renders as `(empty)`. When
 * `diffWasTruncated` is set, a note about the 200KB truncation is appended.
 *
 * @param req Review request; reads `branch`, `head_sha`, `target`,
 *   `commit_log`, and `diff`.
 * @param diffWasTruncated Whether the caller shortened the diff.
 * @returns The message text.
 */
function buildUserMessage(req, diffWasTruncated) {
    const header = `Branch: ${req.branch}\nHead SHA: ${req.head_sha}\nDiffed against: ${req.target}\n`;
    const commitSection = `## Commit log\n${req.commit_log || '(empty)'}\n`;
    const diffSection = `## Diff\n${req.diff || '(empty)'}\n`;
    let message = `${header}\n${commitSection}\n${diffSection}`;
    if (diffWasTruncated) {
        message += '\n\nNOTE: The diff was truncated to 200KB. The review is necessarily partial.';
    }
    return message;
}
|
|
79
|
+
/**
|
|
80
|
+
* Safe parse — we don't want a malformed model response to crash the push
|
|
81
|
+
* gate. Any parse failure or shape mismatch folds into an error verdict.
|
|
82
|
+
*/
|
|
83
|
+
function parseModelJson(raw) {
|
|
84
|
+
let parsed;
|
|
85
|
+
try {
|
|
86
|
+
parsed = JSON.parse(raw);
|
|
87
|
+
}
|
|
88
|
+
catch (err) {
|
|
89
|
+
return {
|
|
90
|
+
error: `unparseable JSON from model: ${err instanceof Error ? err.message : String(err)}`,
|
|
91
|
+
};
|
|
92
|
+
}
|
|
93
|
+
if (typeof parsed !== 'object' || parsed === null) {
|
|
94
|
+
return { error: 'model response was not a JSON object' };
|
|
95
|
+
}
|
|
96
|
+
const obj = parsed;
|
|
97
|
+
const verdict = obj['verdict'];
|
|
98
|
+
const summary = obj['summary'];
|
|
99
|
+
const findings = obj['findings'];
|
|
100
|
+
const validVerdicts = ['pass', 'concerns', 'blocking', 'error'];
|
|
101
|
+
if (typeof verdict !== 'string' || !validVerdicts.includes(verdict)) {
|
|
102
|
+
return { error: `invalid verdict: ${String(verdict)}` };
|
|
103
|
+
}
|
|
104
|
+
if (typeof summary !== 'string') {
|
|
105
|
+
return { error: 'missing or non-string summary' };
|
|
106
|
+
}
|
|
107
|
+
if (!Array.isArray(findings)) {
|
|
108
|
+
return { error: 'missing or non-array findings' };
|
|
109
|
+
}
|
|
110
|
+
// Findings get shallow validation — we pass each through a narrow guard
|
|
111
|
+
// and drop any entries that can't be coerced. A noisy model is better
|
|
112
|
+
// handled by discarding junk than by erroring the whole review.
|
|
113
|
+
const cleanFindings = [];
|
|
114
|
+
for (const f of findings) {
|
|
115
|
+
const finding = toReviewFinding(f);
|
|
116
|
+
if (finding !== undefined)
|
|
117
|
+
cleanFindings.push(finding);
|
|
118
|
+
}
|
|
119
|
+
return {
|
|
120
|
+
reviewer_name: 'claude-self',
|
|
121
|
+
reviewer_version: CLAUDE_MODEL_ID,
|
|
122
|
+
verdict: verdict,
|
|
123
|
+
findings: cleanFindings,
|
|
124
|
+
summary,
|
|
125
|
+
// Always true for this reviewer — same-model is structurally degraded.
|
|
126
|
+
// Callers that compose results should keep this value, not overwrite it.
|
|
127
|
+
degraded: true,
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
/**
 * Narrow guard for one entry of the model's `findings` array.
 *
 * Required fields: `category` and `severity` (each from a fixed set of
 * strings) plus string `file` and `issue`. Optional fields `line`,
 * `start_line` (numbers), `evidence`, and `suggested_fix` (strings) are
 * copied only when they have the expected type; any other property is
 * ignored.
 *
 * @param input Untrusted value taken from the parsed model response.
 * @returns The cleaned finding, or `undefined` if a required field is
 *   missing or invalid.
 */
function toReviewFinding(input) {
    if (input === null || typeof input !== 'object') {
        return undefined;
    }
    const allowedCategories = [
        'security',
        'correctness',
        'edge-case',
        'test-gap',
        'api-design',
        'performance',
    ];
    const allowedSeverities = ['high', 'medium', 'low'];
    const category = input['category'];
    const severity = input['severity'];
    const file = input['file'];
    const issue = input['issue'];
    const categoryOk = typeof category === 'string' && allowedCategories.includes(category);
    const severityOk = typeof severity === 'string' && allowedSeverities.includes(severity);
    if (!categoryOk || !severityOk || typeof file !== 'string' || typeof issue !== 'string') {
        return undefined;
    }
    const finding = { category, severity, file, issue };
    // Optional fields, listed with the runtime type each must have to be kept.
    const optionalFields = [
        ['line', 'number'],
        ['start_line', 'number'],
        ['evidence', 'string'],
        ['suggested_fix', 'string'],
    ];
    for (const [key, expectedType] of optionalFields) {
        const value = input[key];
        if (typeof value === expectedType) {
            finding[key] = value;
        }
    }
    return finding;
}
|
|
169
|
+
/**
 * Build the result returned when this reviewer cannot produce a verdict.
 *
 * @param message Detail stored in the `error` field.
 * @param summary First part of the summary text.
 * @param degradedNote Text appended directly after `summary`.
 * @returns A result with `verdict: 'error'`, no findings, and
 *   `degraded: true`.
 */
function errorResult(message, summary, degradedNote) {
    const combinedSummary = ''.concat(summary, degradedNote);
    const result = {
        reviewer_name: 'claude-self',
        reviewer_version: CLAUDE_MODEL_ID,
        verdict: 'error',
        findings: [],
        summary: combinedSummary,
        degraded: true,
        error: message,
    };
    return result;
}
|
|
180
|
+
/**
 * Shorten `text` so that its UTF-8 encoding is at most `maxBytes` long,
 * without ever cutting through the middle of a multi-byte character.
 *
 * @param text String to shorten.
 * @param maxBytes Byte budget for the UTF-8 encoding.
 * @returns `text` unchanged when it already fits, otherwise the longest
 *   prefix that fits in the budget and ends on a character boundary.
 */
function truncateToUtf8Bytes(text, maxBytes) {
    const encoded = Buffer.from(text, 'utf8');
    if (encoded.length <= maxBytes) {
        return text;
    }
    let end = maxBytes;
    // `encoded[end]` is the first byte being dropped. If it is a UTF-8
    // continuation byte (10xxxxxx) the cut lands inside a character, so back
    // up until the dropped byte starts a new character.
    while (end > 0 && (encoded[end] & 0xc0) === 0x80) {
        end -= 1;
    }
    return encoded.toString('utf8', 0, end);
}
/**
 * Fallback adversarial reviewer that asks a Claude model to review the diff.
 * Every result is marked `degraded: true`, and `reviewer_version` reports
 * the model id that was actually used for the call (`this.version`).
 */
export class ClaudeSelfReviewer {
    name = 'claude-self';
    version;
    apiKey;
    createFn;
    baseDir;
    recordTelemetryFn;
    /**
     * @param opts Optional overrides: `model` (defaults to `CLAUDE_MODEL_ID`),
     *   `apiKey` (defaults to the `ANTHROPIC_API_KEY` environment variable),
     *   `create` (injected SDK call), `baseDir` (defaults to `process.cwd()`),
     *   and `recordTelemetryFn` (defaults to `recordTelemetry`).
     */
    constructor(opts = {}) {
        this.version = opts.model ?? CLAUDE_MODEL_ID;
        this.apiKey = opts.apiKey ?? process.env['ANTHROPIC_API_KEY'];
        this.createFn = opts.create;
        this.baseDir = opts.baseDir ?? process.cwd();
        this.recordTelemetryFn = opts.recordTelemetryFn ?? recordTelemetry;
    }
    /**
     * Cheap check. We don't actually ping the API — the selector only needs
     * to know whether we CAN try. If the key is bogus we'll find out in
     * `review()` and surface it as `verdict: 'error'`.
     */
    async isAvailable() {
        // When a test injects `create`, treat the reviewer as available so we
        // don't need to juggle fake env in every test.
        if (this.createFn !== undefined)
            return true;
        return this.apiKey !== undefined && this.apiKey.length > 0;
    }
    /**
     * Review the request's diff with a single model call.
     *
     * The diff is capped at `DIFF_TRUNCATE_BYTES` of UTF-8; when the cap is
     * hit the summary is prefixed with a truncation marker. SDK failures and
     * unparseable model output resolve to `verdict: 'error'` results instead
     * of rejecting. Telemetry is emitted on every path that reaches the SDK
     * call and never blocks or fails the review.
     *
     * @param req Review request; `req.diff` is measured and possibly cut.
     * @returns The parsed review, or an error result.
     */
    async review(req) {
        // Error results must name the model actually in use, which may differ
        // from the default id that `errorResult` stamps.
        const fail = (message, summary) => ({
            ...errorResult(message, summary, ''),
            reviewer_version: this.version,
        });
        const create = this.getCreateFn();
        if (create === undefined) {
            return fail('ANTHROPIC_API_KEY not set', 'claude-self fallback unavailable: no API key');
        }
        // Measure AND cut in UTF-8 bytes. Cutting by string index would count
        // UTF-16 code units, letting multi-byte diffs exceed the cap while
        // still being labeled as truncated.
        const truncated = Buffer.byteLength(req.diff, 'utf8') > DIFF_TRUNCATE_BYTES;
        const effectiveDiff = truncated
            ? truncateToUtf8Bytes(req.diff, DIFF_TRUNCATE_BYTES)
            : req.diff;
        const userMessage = buildUserMessage({ ...req, diff: effectiveDiff }, truncated);
        // G11.5 — measure the SDK call. The telemetry write is best-effort and
        // fire-and-forget; it MUST NOT block or fail the review.
        const startedAt = Date.now();
        let response;
        try {
            response = await create({
                model: this.version,
                max_tokens: MAX_OUTPUT_TOKENS,
                system: SYSTEM_PROMPT,
                messages: [
                    {
                        role: 'user',
                        content: userMessage,
                    },
                ],
            });
        }
        catch (err) {
            // Rate-limits, 5xx, network errors all land here. Surface the raw
            // message so operators can act on it; the caller decides whether
            // to retry or abort.
            const message = err instanceof APIError ? `API ${err.status ?? '?'}: ${err.message}` : err instanceof Error ? err.message : String(err);
            this.emitTelemetry({
                invocation_type: 'adversarial-review',
                input_text: userMessage,
                output_text: '',
                duration_ms: Date.now() - startedAt,
                exit_code: 1,
                stderr: message,
            });
            return fail(message, 'claude-self review failed');
        }
        // Concatenate the text blocks; non-text blocks are ignored.
        const text = response.content
            .filter((c) => c.type === 'text')
            .map((c) => c.text ?? '')
            .join('');
        const parsed = parseModelJson(text);
        if ('error' in parsed) {
            this.emitTelemetry({
                invocation_type: 'adversarial-review',
                input_text: userMessage,
                output_text: text,
                duration_ms: Date.now() - startedAt,
                exit_code: 1,
                stderr: parsed.error,
            });
            return fail(parsed.error, 'claude-self produced unparseable output');
        }
        if (truncated) {
            parsed.summary = `[diff truncated to 200KB] ${parsed.summary}`;
        }
        // Record the model that actually served the call so audit entries
        // stay accurate when `opts.model` overrides the default.
        parsed.reviewer_version = this.version;
        // Defense in depth — parseModelJson should always set degraded=true
        // for this reviewer, but this reviewer is the canonical authority on
        // that flag so re-pin it.
        parsed.degraded = true;
        this.emitTelemetry({
            invocation_type: 'adversarial-review',
            input_text: userMessage,
            output_text: text,
            duration_ms: Date.now() - startedAt,
            exit_code: 0,
        });
        return parsed;
    }
    /**
     * Fire-and-forget telemetry write. The helper itself is fail-soft, but a
     * misbehaving injected `recordTelemetryFn` (synchronous throw) could
     * still escape a bare `void fn(...)`. This wrapper catches both sync and
     * async rejections so a telemetry bug never breaks a review.
     */
    emitTelemetry(input) {
        try {
            void Promise.resolve(this.recordTelemetryFn(this.baseDir, input)).catch(() => {
                /* swallowed — telemetry is observational, never fatal */
            });
        }
        catch {
            /* same rationale — a sync throw from the injected fn is contained */
        }
    }
    /**
     * Resolve the create() closure lazily so we only build the real client
     * when there's an API key and nothing was injected.
     *
     * @returns The injected function, a wrapper around the real client, or
     *   `undefined` when neither an injected function nor a key is present.
     */
    getCreateFn() {
        if (this.createFn !== undefined)
            return this.createFn;
        if (this.apiKey === undefined || this.apiKey.length === 0)
            return undefined;
        const client = new Anthropic({ apiKey: this.apiKey });
        return async (params) => {
            const res = await client.messages.create(params);
            // Reduce SDK blocks to the thin `{ type, text? }` shape we consume.
            return {
                content: res.content.map((block) => block.type === 'text' ? { type: 'text', text: block.text } : { type: block.type }),
            };
        };
    }
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Codex adversarial reviewer adapter (G11.2).
|
|
3
|
+
*
|
|
4
|
+
* ## Why this class throws from `review()`
|
|
5
|
+
*
|
|
6
|
+
* The actual Codex review path is the `codex-adversarial` agent shipped under
|
|
7
|
+
* `.claude/agents/`, invoked from Claude Code via the `/codex-review` slash
|
|
8
|
+
* command (which eventually reaches the Codex plugin's
|
|
9
|
+
* `/codex:adversarial-review`). None of that is importable from TS — the
|
|
10
|
+
* agent runtime is the harness, not a library.
|
|
11
|
+
*
|
|
12
|
+
* `CodexReviewer` exists so:
|
|
13
|
+
*
|
|
14
|
+
* 1. `selectReviewer()` can return a typed reviewer handle with a stable
|
|
15
|
+
* `name`/`version` that the audit log and CLI can surface.
|
|
16
|
+
* 2. `isAvailable()` can cheaply probe the CLI without invoking a review.
|
|
17
|
+
* 3. G11.3 (startup probe) and G11.4 (no-Codex policy) have something to
|
|
18
|
+
* type-check against now, so the broader flow can land without waiting
|
|
19
|
+
* for an in-process Codex SDK that may never ship.
|
|
20
|
+
*
|
|
21
|
+
* If we ever get a native Codex TS client, `review()` becomes real and this
|
|
22
|
+
* comment block goes away. Until then: treat a Codex selection as
|
|
23
|
+
* "dispatch to the agent", not "await reviewer.review(...)".
|
|
24
|
+
*/
|
|
25
|
+
import type { AdversarialReviewer, ReviewRequest, ReviewResult } from './types.js';
|
|
26
|
+
/**
 * Signature of the exec function `CodexReviewer` accepts through its
 * constructor. It is a narrow test seam: unit tests pass a fake so the real
 * CLI is never invoked, while production callers use the default.
 */
export type ExecFileFn = (
    file: string,
    args: readonly string[],
    options: { timeout: number },
) => Promise<{ stdout: string; stderr: string }>;
|
|
37
|
+
/**
 * Declaration of the Codex reviewer handle. Implements the
 * `AdversarialReviewer` contract.
 */
export declare class CodexReviewer implements AdversarialReviewer {
    readonly name: "codex";
    private readonly exec;
    private cachedVersion;
    constructor(opts?: {
        exec?: ExecFileFn;
    });
    /**
     * Populated lazily via `codex --version`. Construction does not block on
     * the probe: the selector calls `isAvailable()` before committing to
     * Codex, so a fresh value is in place by the time anything reads this.
     */
    get version(): string;
    /** Availability probe; resolves to whether Codex can be used. */
    isAvailable(): Promise<boolean>;
    /**
     * Not invokable from TS — see the file header. The selector contract is
     * "a CodexReviewer handle means dispatch to the codex-adversarial agent".
     * A caller that ignores this and awaits the method gets a loud throw
     * rather than a silently bad `ReviewResult`.
     *
     * TODO(0.3.0): once Codex ships a native TS client, this path will run
     * the review for real. At that point, instrument it with
     * `recordTelemetry` the same way `ClaudeSelfReviewer.review()` does
     * today (G11.5). The throwing placeholder is deliberately NOT
     * instrumented — there is nothing to measure.
     */
    review(_req: ReviewRequest): Promise<ReviewResult>;
}
|