@roodriigoooo/pi-scrutiny 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -0
- package/extensions/scrutiny/analysis.ts +335 -0
- package/extensions/scrutiny/config.ts +407 -0
- package/extensions/scrutiny/engine.ts +513 -0
- package/extensions/scrutiny/history.ts +566 -0
- package/extensions/scrutiny/packet.ts +188 -0
- package/extensions/scrutiny/palette.ts +413 -0
- package/extensions/scrutiny/preview.ts +261 -0
- package/extensions/scrutiny/registry.ts +48 -0
- package/extensions/scrutiny/runner.ts +128 -0
- package/extensions/scrutiny/scout.ts +314 -0
- package/extensions/scrutiny/summary.ts +270 -0
- package/extensions/scrutiny/types.ts +184 -0
- package/extensions/scrutiny/ui.ts +299 -0
- package/extensions/scrutiny/util.ts +123 -0
- package/extensions/scrutiny.ts +333 -0
- package/package.json +48 -0
|
@@ -0,0 +1,513 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { buildDeterministicAnalysis, detectMush, formatFailureBrief, formatScrutinyBrief, formatVerifyBrief } from "./analysis.js";
|
|
4
|
+
import { SURFACE_DEFAULTS, readScrutinyConfig, resolveJudge, resolvePanel, resolveTools } from "./config.js";
|
|
5
|
+
import { buildTaskPacket, judgePrompt, panelPrompt, panelRoles } from "./packet.js";
|
|
6
|
+
import { runModelTask } from "./runner.js";
|
|
7
|
+
import { recordRunEnd, recordRunProgress, recordRunStart } from "./registry.js";
|
|
8
|
+
import { writeRunResult } from "./summary.js";
|
|
9
|
+
import type { PanelMode, ScrutinyAnalysis, ScrutinyParams, ScrutinyRunProgress, ScrutinyRunResult, ScrutinySurface, PanelResponse, VerifyCheck, VerifyReport } from "./types.js";
|
|
10
|
+
import { createRunId, formatDuration, formatTokens, scrutinyDataDir, parseAnalysisJson, safeMkdir, truncate } from "./util.js";
|
|
11
|
+
|
|
12
|
+
// Shape of the command runner injected through RunScrutinyInput.exec: takes a command, its
// arguments and optional timeout/abort settings, and resolves with whatever output, exit code
// and `killed` flag were captured (every field is optional).
type ExecLike = (command: string, args: string[], options?: { timeout?: number; signal?: AbortSignal }) => Promise<{ stdout?: string; stderr?: string; code?: number; killed?: boolean }>;
|
|
13
|
+
|
|
14
|
+
/** Message of the Error that runScrutiny throws when `confirmPacket` resolves to an empty value. */
export const SCRUTINY_PACKET_PREVIEW_CANCELLED = "SCRUTINY_PACKET_PREVIEW_CANCELLED";
|
|
15
|
+
|
|
16
|
+
/** Everything runScrutiny needs from its caller. */
type RunScrutinyInput = {
  /** Request parameters (prompt, optional surface, judge mode, verify flag, ...). */
  params: ScrutinyParams;
  /** Working directory: used for config lookup, the run data directory and model tasks. */
  cwd: string;
  /** Command runner used for the task packet and the verify checks. */
  exec: ExecLike;
  /** Optional abort signal forwarded to packet building, model tasks and verify commands. */
  signal?: AbortSignal;
  /** Optional listener that receives every progress snapshot emitted during the run. */
  onProgress?: (progress: ScrutinyRunProgress) => void;
  /** Forwarded to readScrutinyConfig. */
  projectTrusted?: boolean;
  /**
   * Optional preview hook called with the built packet before any model runs.
   * A non-empty string replaces the packet; an empty value cancels the run.
   */
  confirmPacket?: (input: {
    runId: string;
    surface: ScrutinySurface;
    packet: string;
    panelCount: number;
    judgeRan: boolean;
    verifyRan: boolean;
  }) => Promise<string | null>;
};
|
|
25
|
+
|
|
26
|
+
// Passed to formatScrutinyBrief as `llmPanelExcerptChars`.
const PANEL_EXCERPT_CHARS = 2_400;
|
|
27
|
+
// Default interval, in milliseconds, between heartbeat ticks in withProgressHeartbeat.
const PROGRESS_HEARTBEAT_MS = 1_000;
|
|
28
|
+
|
|
29
|
+
// Id of the run that currently holds the module-level run lock; undefined when the lock is free.
let activeRunId: string | undefined;
|
|
30
|
+
|
|
31
|
+
/**
 * Tries to claim the module-level run lock for `runId`.
 *
 * @returns true when the lock was free or already held by `runId` (the lock is then held by
 *          `runId`); false when a different run holds it, in which case nothing changes.
 */
function acquireRunLock(runId: string): boolean {
  const heldByAnotherRun = Boolean(activeRunId) && activeRunId !== runId;
  if (!heldByAnotherRun) {
    activeRunId = runId;
  }
  return !heldByAnotherRun;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Frees the module-level run lock, but only when `runId` is the current holder.
 * Calls made on behalf of any other run are ignored.
 */
function releaseRunLock(runId: string): void {
  if (activeRunId !== runId) return;
  activeRunId = undefined;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Runs one scrutiny pass and returns the persisted result together with a text brief.
 *
 * Flow: resolve surface, panel, judge and verify policy; take the module-level run lock; then
 * either run the verify checks only (surface "verify") or build the task packet, query each
 * panel model in sequence, optionally run the trade-off explainer and the verify checks, and
 * write the run result.
 *
 * The lock is released in a `finally`, so an exception from packet building, a model task or a
 * file write cannot leave the lock held and block every later run.
 *
 * @param input Run parameters, working directory, command runner and optional hooks.
 * @returns The run result and the brief text for the caller.
 * @throws Error whose message is SCRUTINY_PACKET_PREVIEW_CANCELLED when `confirmPacket`
 *         resolves to an empty value; other errors from collaborators propagate unchanged.
 */
export async function runScrutiny(input: RunScrutinyInput): Promise<{ result: ScrutinyRunResult; brief: string }> {
  const startedAt = Date.now();
  const runId = createRunId();
  const config = readScrutinyConfig({ cwd: input.cwd, projectTrusted: input.projectTrusted });
  const surface: ScrutinySurface = resolveSurface(input.params);
  const panelMode = SURFACE_DEFAULTS[surface].panelMode;
  const panelMembers = surface === "verify" ? [] : resolvePanel(input.params, config);
  const tools = resolveTools(input.params, config);
  const judgeModel = resolveJudge(input.params, config, panelMembers);
  const runJudgeByPolicy = shouldRunJudge(surface, input.params.judgeMode);
  const runVerifyByPolicy = shouldRunVerify(surface, input.params.verify);
  // The explainer only runs when policy allows it AND a judge model could be resolved.
  const runJudge = runJudgeByPolicy && Boolean(judgeModel);
  const runDir = path.join(scrutinyDataDir(input.cwd), runId);
  const packetPath = path.join(runDir, "packet.md");

  if (!acquireRunLock(runId)) {
    // Another run holds the lock: record the refusal and leave that run's lock alone.
    const result = emptyError({ runId, surface, startedAt, error: "A scrutiny run is already in progress. Wait for it to finish before starting another.", failure_reason: "unexpected_error" });
    safeMkdir(runDir);
    await writeRunResult({ cwd: input.cwd, runDir, result, prompt: input.params.prompt });
    return { result, brief: result.error ?? "Scrutiny failed." };
  }

  try {
    if (process.env.PI_SCRUTINY_DEPTH) {
      const result = emptyError({ runId, surface, startedAt, error: "nested scrutiny invocation blocked", failure_reason: "recursion_capped" });
      safeMkdir(runDir);
      await writeRunResult({ cwd: input.cwd, runDir, result, prompt: input.params.prompt });
      return { result, brief: "Scrutiny blocked: nested invocation." };
    }

    safeMkdir(runDir);

    if (surface === "verify") {
      // `return await` keeps the lock held until the verify-only run has really finished.
      return await runVerifyOnly({ runId, surface, cwd: input.cwd, exec: input.exec, config, runDir, startedAt, signal: input.signal, onProgress: input.onProgress, params: input.params });
    }

    if (panelMembers.length === 0) {
      const result = emptyError({ runId, surface, startedAt, error: "No panel models configured. Set PI_SCRUTINY_PANEL or pass panel.", failure_reason: "missing_panel" });
      await writeRunResult({ cwd: input.cwd, runDir, result, prompt: input.params.prompt });
      return { result, brief: result.error ?? "Scrutiny failed." };
    }

    recordRunStart({ runId, surface, status: "running", startedAt, runDir });

    let packet = await buildTaskPacket({ params: input.params, surface, cwd: input.cwd, config, exec: input.exec, signal: input.signal });
    if (input.confirmPacket) {
      const confirmedPacket = await input.confirmPacket({ runId, surface, packet, panelCount: panelMembers.length, judgeRan: runJudge, verifyRan: runVerifyByPolicy });
      if (!confirmedPacket) {
        // Cancelled preview: drop the run directory (best effort) and signal the caller.
        await fs.rm(runDir, { recursive: true, force: true }).catch(() => undefined);
        throw new Error(SCRUTINY_PACKET_PREVIEW_CANCELLED);
      }
      packet = confirmedPacket;
    }
    await fs.writeFile(packetPath, packet, { encoding: "utf8", mode: 0o600 });

    const panel = panelRoles(panelMembers, surface);
    let progress: ScrutinyRunProgress = {
      runId,
      surface,
      panel_mode: panelMode,
      packetPath,
      panel: panel.map((item) => ({ model: item.model, role: item.role, thinking: item.thinking, status: "pending" })),
      judge: runJudge && judgeModel ? { model: judgeModel, role: "trade-off explainer", status: "pending" } : undefined,
      startedAt,
      updatedAt: Date.now(),
      status: "running",
      // Uses runJudge (not just the policy) so the message never promises an explainer that cannot run.
      message: replicatedBudgetLine(packet, panel.length, runJudge),
    };
    emit(input, progress);

    // Panel models are queried one after another; each gets its own heartbeat while it runs.
    const responses: PanelResponse[] = [];
    for (const [index, item] of panel.entries()) {
      progress = updatePanel(progress, index, { status: "running", startedAt: Date.now() });
      progress.message = `${item.role} · ${index + 1}/${panel.length}`;
      emit(input, progress);
      const response = await withProgressHeartbeat(
        () => runModelTask({
          model: item.model,
          role: item.role,
          prompt: panelPrompt({ packet, role: item.role, surface, panelMode }),
          cwd: input.cwd,
          tools,
          timeoutMs: config.panelTimeoutMs,
          outputCharLimit: config.maxPanelOutputChars,
          thinkingLevel: item.thinking,
          signal: input.signal,
        }),
        () => emit(input, progress),
      );
      responses.push(response);
      progress = updatePanel(progress, index, { status: response.status === "ok" ? "ready" : "failed", endedAt: Date.now() });
      progress.message = panelProgressLine(responsesSoFar(progress));
      emit(input, progress);
    }

    await fs.writeFile(path.join(runDir, "responses.json"), JSON.stringify(responses, null, 2), { encoding: "utf8", mode: 0o600 });
    const okResponses = responses.filter((response) => response.status === "ok" && response.content.trim());
    const failedModels = responses.filter((response) => response.status === "error").map((response) => ({ model: response.model, error: response.error ?? "unknown error" }));

    // Shared exit for "no usable panel output": persists the error result and reports it.
    const failPanelRun = async (reason: string): Promise<{ result: ScrutinyRunResult; brief: string }> => {
      const endedAt = Date.now();
      const result: ScrutinyRunResult = {
        runId,
        surface,
        panel_mode: panelMode,
        status: "error",
        failure_reason: "all_panels_failed",
        error: reason,
        packetPath,
        packet,
        responses,
        failed_models: failedModels,
        startedAt,
        endedAt,
        durationMs: endedAt - startedAt,
      };
      await writeRunResult({ cwd: input.cwd, runDir, result, prompt: input.params.prompt });
      progress = { ...progress, status: "error", updatedAt: endedAt, message: reason };
      emit(input, progress);
      recordRunEnd(runId, { status: "error", endedAt, error: reason });
      return { result, brief: formatFailureBrief({ surface, runId, runDir, responses, failedModels, reason }) };
    };

    if (okResponses.length === 0) {
      return await failPanelRun("all panel models failed");
    }

    const mush = detectMush(okResponses);
    if (mush) {
      return await failPanelRun(`panel outputs unusable: ${mush}`);
    }

    let judge: PanelResponse | undefined;
    let judgeUsable = false;
    let analysis: ScrutinyAnalysis | undefined = buildDeterministicAnalysis(responses, panelMode);

    if (runJudge && judgeModel) {
      progress = { ...progress, judge: { model: judgeModel, role: "trade-off explainer", status: "running", startedAt: Date.now() }, updatedAt: Date.now(), message: "trade-off explainer comparing panel evidence" };
      emit(input, progress);
      judge = await withProgressHeartbeat(
        () => runModelTask({
          model: judgeModel,
          role: "trade-off explainer",
          prompt: judgePrompt({ packet, panelMode, responses: okResponses.map((response) => ({ model: response.model, role: response.role, content: response.content })) }),
          cwd: input.cwd,
          tools,
          timeoutMs: config.judgeTimeoutMs,
          outputCharLimit: config.maxJudgeOutputChars,
          thinkingLevel: "off",
          signal: input.signal,
        }),
        () => emit(input, progress),
      );
      const judgeAnalysis = judge.status === "ok" ? parseAnalysisJson(judge.content) : undefined;
      if (judgeAnalysis) analysis = mergeAnalysis(analysis, judgeAnalysis, panelMode);
      judgeUsable = Boolean(judgeAnalysis);
      progress = { ...progress, judge: { model: judgeModel, role: "trade-off explainer", status: judgeUsable ? "ready" : "failed", endedAt: Date.now() }, updatedAt: Date.now(), message: judgeUsable ? "trade-off explainer ready" : "trade-off explainer failed; deterministic evidence map kept" };
      emit(input, progress);
    }

    let verify: VerifyReport | undefined;
    if (runVerifyByPolicy) {
      progress = { ...progress, message: "running objective verify checks", updatedAt: Date.now() };
      emit(input, progress);
      verify = await withProgressHeartbeat(
        () => runVerify({
          cwd: input.cwd,
          exec: input.exec,
          config,
          signal: input.signal,
          onCheckProgress: (event) => {
            progress = { ...progress, message: verifyProgressMessage(event), updatedAt: Date.now() };
            emit(input, progress);
          },
        }),
        () => emit(input, progress),
      );
      await fs.writeFile(path.join(runDir, "verify.json"), JSON.stringify(verify, null, 2), { encoding: "utf8", mode: 0o600 });
      progress = { ...progress, message: `verify: ${verify.passed} pass · ${verify.failed} fail · ${verify.skipped} skipped`, updatedAt: Date.now() };
      emit(input, progress);
    }

    const endedAt = Date.now();
    const result: ScrutinyRunResult = {
      runId,
      surface,
      panel_mode: panelMode,
      status: "ok",
      // A judge that ran but produced nothing usable (error status or unparsable output) is
      // reported as failed, matching the "failed" status already shown in progress.
      failure_reason: judge && !judgeUsable ? "judge_failed" : undefined,
      packetPath,
      packet,
      responses,
      failed_models: failedModels,
      judge,
      analysis,
      verify,
      startedAt,
      endedAt,
      durationMs: endedAt - startedAt,
    };
    await writeRunResult({ cwd: input.cwd, runDir, result, prompt: input.params.prompt });
    progress = { ...progress, status: "ok", updatedAt: endedAt, message: `done in ${formatDuration(result.durationMs)}` };
    emit(input, progress);
    recordRunEnd(runId, { status: "ok", endedAt });

    const brief = formatScrutinyBrief({
      surface,
      panelMode,
      analysis,
      responses,
      failedModels,
      judgeRan: runJudge,
      verify,
      llmPanelExcerptChars: PANEL_EXCERPT_CHARS,
      budgetLine: budgetLine(packet, responses, runJudge),
    });
    return { result, brief };
  } finally {
    // Single release point for every exit, including thrown errors.
    releaseRunLock(runId);
  }
}
|
|
281
|
+
|
|
282
|
+
/**
 * Handles the "verify" surface: runs the configured verify checks without any panel or judge,
 * writes `verify.json` and the run result into the run directory, and reports progress.
 *
 * @param input Identifiers, directories, command runner, config and optional hooks of the run.
 * @returns The run result (status "ok", empty packet and responses) and the verify brief.
 */
async function runVerifyOnly(input: {
  runId: string;
  surface: ScrutinySurface;
  cwd: string;
  exec: ExecLike;
  config: import("./types.js").ScrutinyConfig;
  runDir: string;
  startedAt: number;
  signal?: AbortSignal;
  onProgress?: (progress: ScrutinyRunProgress) => void;
  params: ScrutinyParams;
}): Promise<{ result: ScrutinyRunResult; brief: string }> {
  recordRunStart({ runId: input.runId, surface: input.surface, status: "running", startedAt: input.startedAt, runDir: input.runDir });

  let progress: ScrutinyRunProgress = {
    runId: input.runId,
    surface: input.surface,
    panel: [],
    startedAt: input.startedAt,
    updatedAt: Date.now(),
    status: "running",
    message: "running objective verify checks",
  };
  // Publishes whatever the latest progress snapshot is at call time.
  const publish = (): void => emit({ onProgress: input.onProgress }, progress);
  publish();

  const verify = await withProgressHeartbeat(
    () =>
      runVerify({
        cwd: input.cwd,
        exec: input.exec,
        config: input.config,
        signal: input.signal,
        onCheckProgress: (event) => {
          progress = { ...progress, message: verifyProgressMessage(event), updatedAt: Date.now() };
          publish();
        },
      }),
    publish,
  );

  const verifyJsonPath = path.join(input.runDir, "verify.json");
  await fs.writeFile(verifyJsonPath, JSON.stringify(verify, null, 2), { encoding: "utf8", mode: 0o600 });

  const endedAt = Date.now();
  const result: ScrutinyRunResult = {
    runId: input.runId,
    surface: input.surface,
    status: "ok",
    packet: "",
    responses: [],
    failed_models: [],
    verify,
    startedAt: input.startedAt,
    endedAt,
    durationMs: endedAt - input.startedAt,
  };
  await writeRunResult({ cwd: input.cwd, runDir: input.runDir, result, prompt: input.params.prompt });

  const summary = `verify: ${verify.passed} pass · ${verify.failed} fail · ${verify.skipped} skipped`;
  progress = { ...progress, status: "ok", updatedAt: endedAt, message: summary };
  publish();
  recordRunEnd(input.runId, { status: "ok", endedAt });

  return { result, brief: formatVerifyBrief({ verify, budgetLine: verifyBudgetLine(verify) }) };
}
|
|
340
|
+
|
|
341
|
+
/** Lifecycle states reported for a single verify check. */
type VerifyProgressStatus = "running" | "pass" | "fail" | "error";

/** Progress notification that runVerify hands to `onCheckProgress` for one check. */
type VerifyProgressEvent = {
  /** Name of the check, taken from its spec. */
  name: string;
  /** Zero-based position of the check in the configured list. */
  index: number;
  /** Number of configured checks. */
  total: number;
  /** "running" before the command starts, afterwards its outcome. */
  status: VerifyProgressStatus;
  /** Elapsed time of the check in milliseconds; only set on outcome events. */
  durationMs?: number;
};
|
|
348
|
+
|
|
349
|
+
/**
 * Runs every configured verify check in order and summarises the outcome.
 *
 * A check passes only when its process exits by itself with code 0. A process that was killed
 * (for example by the timeout or the abort signal) is recorded as "error" even when no exit
 * code was reported; previously the missing code defaulted to 0 and counted as a pass.
 * After the checks, `git diff --stat` is attempted and attached when it produces output.
 *
 * @param input.cwd Working directory of the run (not read here; exec decides where to run).
 * @param input.exec Command runner used for each check and for the diff stat.
 * @param input.config Supplies `verifyChecks` and the default `verifyTimeoutMs`.
 * @param input.signal Optional abort signal forwarded to every command.
 * @param input.onCheckProgress Optional listener called before and after each check.
 * @returns The per-check records, optional diff stat, pass/fail/skipped counts and duration.
 */
async function runVerify(input: { cwd: string; exec: ExecLike; config: import("./types.js").ScrutinyConfig; signal?: AbortSignal; onCheckProgress?: (event: VerifyProgressEvent) => void }): Promise<VerifyReport> {
  const startedAt = Date.now();
  const checks: VerifyCheck[] = [];
  const total = input.config.verifyChecks.length;

  for (const [index, spec] of input.config.verifyChecks.entries()) {
    const args = spec.args ?? [];
    const command = `${spec.command} ${args.join(" ")}`.trim();
    const checkStart = Date.now();
    input.onCheckProgress?.({ name: spec.name, index, total, status: "running" });
    try {
      const result = await input.exec(spec.command, args, { timeout: spec.timeoutMs ?? input.config.verifyTimeoutMs, signal: input.signal });
      const durationMs = Date.now() - checkStart;
      const output = `${result.stdout ?? ""}${result.stderr ? `\n${result.stderr}` : ""}`.trim();

      if (result.killed) {
        // The process never finished on its own, so its exit code (if any) proves nothing.
        const note = "process was killed before it exited (timeout or abort)";
        checks.push({ name: spec.name, command, status: "error", output: truncate(output ? `${note}\n${output}` : note, 4_000), durationMs });
        input.onCheckProgress?.({ name: spec.name, index, total, status: "error", durationMs });
        continue;
      }

      const code = result.code ?? 0;
      const status = code === 0 ? "pass" : "fail";
      checks.push({ name: spec.name, command, status, exitCode: code, output: truncate(output, 4_000), durationMs });
      input.onCheckProgress?.({ name: spec.name, index, total, status, durationMs });
    } catch (error) {
      // exec itself failed (e.g. the command could not be started): record it, keep going.
      const durationMs = Date.now() - checkStart;
      checks.push({ name: spec.name, command, status: "error", output: error instanceof Error ? error.message : String(error), durationMs });
      input.onCheckProgress?.({ name: spec.name, index, total, status: "error", durationMs });
    }
  }

  let diffStat: string | undefined;
  try {
    const stat = await input.exec("git", ["diff", "--stat"], { timeout: 5_000, signal: input.signal });
    if (stat.code === 0 && stat.stdout?.trim()) diffStat = truncate(stat.stdout.trim(), 2_000);
  } catch {
    // The diff stat is optional; a failure here must not fail the verify report.
  }

  const passed = checks.filter((c) => c.status === "pass").length;
  const failed = checks.filter((c) => c.status === "fail" || c.status === "error").length;
  const skipped = checks.filter((c) => c.status === "skipped").length;
  return { checks, diffStat, passed, failed, skipped, durationMs: Date.now() - startedAt };
}
|
|
387
|
+
|
|
388
|
+
/**
 * Awaits `work()` while calling `tick` every `intervalMs` milliseconds.
 *
 * Exceptions thrown by `tick` are swallowed so progress reporting can never break the work.
 * The timer is always cleared once the work settles or throws.
 *
 * @returns Whatever `work()` resolves to; rejections of `work()` propagate unchanged.
 */
async function withProgressHeartbeat<T>(work: () => Promise<T>, tick: () => void, intervalMs = PROGRESS_HEARTBEAT_MS): Promise<T> {
  const guardedTick = (): void => {
    try {
      tick();
    } catch {
      // UI progress must never affect the underlying run.
    }
  };
  const heartbeat = setInterval(guardedTick, intervalMs);
  try {
    const outcome = await work();
    return outcome;
  } finally {
    clearInterval(heartbeat);
  }
}
|
|
402
|
+
|
|
403
|
+
/**
 * Renders one verify progress event as a single status line, e.g. "verify 2/5: lint running".
 * Outcome events append the elapsed time when the event carries one.
 */
function verifyProgressMessage(event: VerifyProgressEvent): string {
  const prefix = `verify ${event.index + 1}/${event.total}: ${event.name}`;
  if (event.status === "running") {
    return `${prefix} running`;
  }
  const elapsed = event.durationMs === undefined ? "" : ` in ${formatDuration(event.durationMs)}`;
  return `${prefix} ${event.status}${elapsed}`;
}
|
|
408
|
+
|
|
409
|
+
/** Returns the explicitly requested surface when one is set, otherwise infers it from the prompt. */
function resolveSurface(params: ScrutinyParams): ScrutinySurface {
  return params.surface ? params.surface : inferSurface(params.prompt);
}
|
|
413
|
+
|
|
414
|
+
/**
 * Keyword patterns tried in order; the first match decides the surface.
 *
 * Word stems (`idempoten`, `acceptance criter`, `backward.?compat`, `migrat`) are followed by
 * `\w*` because the whole alternation is wrapped in `\b...\b`: without it the trailing `\b`
 * would reject "idempotent", "criteria", "compatibility" and "migration".
 */
const SURFACE_PATTERNS: ReadonlyArray<readonly [RegExp, ScrutinySurface]> = [
  [/\b(verify|typecheck|lint|run tests|test suite|does it pass|check the build|ci)\b/, "verify"],
  [/\b(risk|review the patch|review this change|concurrency|race|reactive|idempoten\w*|circuit.?breaker|security review)\b/, "risks"],
  [/\b(root cause|why does|debug|intermittent|flaky|bug in|what is causing)\b/, "hypotheses"],
  [/\b(acceptance criter\w*|edge case|backward.?compat\w*|migrat\w*|spec for|definition of done)\b/, "criteria"],
  [/\b(repo map|where is|call path|callers of|symbols|trace|how does .* work|navigate the code)\b/, "repo-map"],
];

/**
 * Guesses the scrutiny surface from free-form prompt text.
 *
 * The prompt is lower-cased and matched against the keyword patterns in priority order:
 * verify, risks, hypotheses, criteria, repo-map.
 *
 * @param prompt The user's request text.
 * @returns The surface of the first matching pattern, or "consult" when nothing matches.
 */
export function inferSurface(prompt: string): ScrutinySurface {
  const text = prompt.toLowerCase();
  for (const [pattern, surface] of SURFACE_PATTERNS) {
    if (pattern.test(text)) return surface;
  }
  return "consult";
}
|
|
423
|
+
|
|
424
|
+
/**
 * Decides by policy whether the trade-off explainer should run.
 *
 * Never for the "verify" surface. Otherwise the explicit `judgeMode` (or the surface default
 * when none is given) decides: "off" means no, "on" means yes, and any other mode runs the
 * explainer only for the "consult" surface.
 */
function shouldRunJudge(surface: ScrutinySurface, judgeMode: ScrutinyParams["judgeMode"]): boolean {
  if (surface === "verify") return false;
  switch (judgeMode ?? SURFACE_DEFAULTS[surface].judgeMode) {
    case "off":
      return false;
    case "on":
      return true;
    default:
      return surface === "consult";
  }
}
|
|
431
|
+
|
|
432
|
+
/**
 * Decides by policy whether the verify checks should run: always for the "verify" surface,
 * otherwise the explicit `verifyParam` when given, else the surface default.
 */
function shouldRunVerify(surface: ScrutinySurface, verifyParam: ScrutinyParams["verify"]): boolean {
  if (surface === "verify") return true;
  return verifyParam === undefined ? SURFACE_DEFAULTS[surface].verify : verifyParam;
}
|
|
437
|
+
|
|
438
|
+
/**
 * Overlays the judge's analysis on the deterministic one, field by field.
 *
 * - consensus, unique_insights, blind_spots, confidence: judge value unless null/undefined.
 * - risks, coverage: judge value only when it is a non-empty list.
 * - contradictions, disagreement_signal: only taken from the judge when the panel mode is not
 *   "roles"; in "roles" mode the deterministic contradictions are kept and the signal is false.
 */
function mergeAnalysis(deterministic: ScrutinyAnalysis, judge: ScrutinyAnalysis, panelMode: PanelMode | undefined): ScrutinyAnalysis {
  const canDisagree = panelMode !== "roles";
  const judgeHasContradictions = Boolean(judge.contradictions?.length);
  const judgeHasRisks = Boolean(judge.risks?.length);
  const judgeHasCoverage = Boolean(judge.coverage?.length);

  const merged: ScrutinyAnalysis = {
    consensus: judge.consensus ?? deterministic.consensus,
    contradictions: canDisagree && judgeHasContradictions ? judge.contradictions : deterministic.contradictions,
    unique_insights: judge.unique_insights ?? deterministic.unique_insights,
    risks: judgeHasRisks ? judge.risks : deterministic.risks,
    coverage: judgeHasCoverage ? judge.coverage : deterministic.coverage,
    blind_spots: judge.blind_spots ?? deterministic.blind_spots,
    confidence: judge.confidence ?? deterministic.confidence,
    disagreement_signal: canDisagree ? (judge.disagreement_signal ?? deterministic.disagreement_signal) : false,
  };
  return merged;
}
|
|
451
|
+
|
|
452
|
+
/**
 * Publishes a progress snapshot: a copy stamped with the current time is recorded in the run
 * registry and then passed to the optional `onProgress` listener. The given object is not mutated.
 */
function emit(input: { onProgress?: (progress: ScrutinyRunProgress) => void }, progress: ScrutinyRunProgress): void {
  const stamped = { ...progress, updatedAt: Date.now() };
  recordRunProgress(stamped);
  if (input.onProgress != null) {
    input.onProgress(stamped);
  }
}
|
|
457
|
+
|
|
458
|
+
/**
 * Returns a copy of `progress` in which the panel entry at `index` has `patch` merged in and
 * `updatedAt` is refreshed. Other entries are reused as-is; an index outside the panel leaves
 * every entry unchanged.
 */
function updatePanel(progress: ScrutinyRunProgress, index: number, patch: Partial<ScrutinyRunProgress["panel"][number]>): ScrutinyRunProgress {
  const updatedAt = Date.now();
  const panel = progress.panel.map((entry, position) => {
    if (position !== index) return entry;
    return { ...entry, ...patch };
  });
  return { ...progress, updatedAt, panel };
}
|
|
465
|
+
|
|
466
|
+
/** Counts the panel entries with status "ready" and "failed" and reports the panel size. */
function responsesSoFar(progress: ScrutinyRunProgress): { ready: number; failed: number; total: number } {
  let ready = 0;
  let failed = 0;
  for (const entry of progress.panel) {
    if (entry.status === "ready") {
      ready += 1;
    } else if (entry.status === "failed") {
      failed += 1;
    }
  }
  return { ready, failed, total: progress.panel.length };
}
|
|
473
|
+
|
|
474
|
+
/**
 * Formats panel counts as "R/T ready", followed by ", F failed" and ", A running" when those
 * counts are non-zero. The running count is whatever is neither ready nor failed.
 */
function panelProgressLine(progress: { ready: number; failed: number; total: number }): string {
  const parts = [`${progress.ready}/${progress.total} ready`];
  if (progress.failed) {
    parts.push(`${progress.failed} failed`);
  }
  const active = progress.total - progress.ready - progress.failed;
  if (active) {
    parts.push(`${active} running`);
  }
  return parts.join(", ");
}
|
|
478
|
+
|
|
479
|
+
/**
 * Builds the up-front budget estimate line. The packet size is approximated as one token per
 * four characters (rounded up) and multiplied by the number of panelists; the suffix states
 * whether the trade-off explainer is part of the run.
 */
function replicatedBudgetLine(packet: string, panelCount: number, judgeRan: boolean): string {
  const packetTokens = Math.ceil(packet.length / 4);
  const replicatedTokens = packetTokens * panelCount;
  const judgeNote = judgeRan ? "; trade-off explainer also reads panel outputs" : "; trade-off explainer skipped";
  const estimate = `packet ~${formatTokens(packetTokens)} tokens × ${panelCount} panelists = ~${formatTokens(replicatedTokens)} replicated input tokens`;
  return `budget: ${estimate}${judgeNote}`;
}
|
|
484
|
+
|
|
485
|
+
/**
 * Builds the final two-line budget summary: the replicated-input estimate, then the usage
 * actually reported by the panel responses (summed input tokens, output tokens and cost), or a
 * note that usage is unavailable when all three sums are zero.
 */
function budgetLine(packet: string, responses: PanelResponse[], judgeRan: boolean): string {
  const estimate = replicatedBudgetLine(packet, responses.length, judgeRan);

  let inputTokens = 0;
  let outputTokens = 0;
  let totalCost = 0;
  for (const response of responses) {
    inputTokens += response.usage.input;
    outputTokens += response.usage.output;
    totalCost += response.usage.cost;
  }

  let actual = "actual panel usage unavailable";
  if (inputTokens || outputTokens || totalCost) {
    const costPart = totalCost ? ` $${totalCost.toFixed(4)}` : "";
    actual = `actual panel usage: ↑${formatTokens(inputTokens)} ↓${formatTokens(outputTokens)}${costPart}`;
  }
  return `${estimate}\n${actual}`;
}
|
|
493
|
+
|
|
494
|
+
/** Budget line for a verify-only run: number of checks, no panel, no judge, total duration. */
function verifyBudgetLine(verify: VerifyReport): string {
  const segments = [`${verify.checks.length} objective checks`, "no panel", "no judge", formatDuration(verify.durationMs)];
  return `budget: ${segments.join(" · ")}`;
}
|
|
497
|
+
|
|
498
|
+
/**
 * Builds an error result for a run that stopped before producing a packet or any responses.
 * The end time is taken now and the duration is measured from `startedAt`.
 */
function emptyError(input: { runId: string; surface: ScrutinySurface; startedAt: number; error: string; failure_reason: ScrutinyRunResult["failure_reason"] }): ScrutinyRunResult {
  const { runId, surface, startedAt, error, failure_reason } = input;
  const endedAt = Date.now();
  return {
    runId,
    surface,
    status: "error",
    failure_reason,
    error,
    packet: "",
    responses: [],
    failed_models: [],
    startedAt,
    endedAt,
    durationMs: endedAt - startedAt,
  };
}
|