imprint-mcp 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +132 -28
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +111 -4
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +65 -27
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +14 -2
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/credential-extract.ts +174 -25
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/emit.ts +85 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/sensitive-keys.ts +141 -7
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +17 -0
- package/src/imprint/teach.ts +582 -147
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
|
@@ -0,0 +1,996 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headless-claude MCP audit harness — the acceptance gate for a site's
|
|
3
|
+
* generated tools.
|
|
4
|
+
*
|
|
5
|
+
* `runAudit` discovers every tool a site exposes via `imprint mcp-server`,
|
|
6
|
+
* spawns a headless `claude` session pointed at that real MCP server, and asks
|
|
7
|
+
* it to exercise each tool and classify every invocation. The model returns a
|
|
8
|
+
* structured report, but it never reports a score: imprint recomputes the score
|
|
9
|
+
* deterministically from the model's per-invocation verdicts
|
|
10
|
+
* (`computeAuditScore`) so the gate can't be talked up by a generous auditor.
|
|
11
|
+
*
|
|
12
|
+
* The harness is fully site-agnostic — the auditor derives every parameter from
|
|
13
|
+
* each tool's schema + description. There is no per-site special-casing here.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { type ChildProcess, spawn } from 'node:child_process';
|
|
17
|
+
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
|
18
|
+
import { dirname, join as pathJoin } from 'node:path';
|
|
19
|
+
import { z } from 'zod';
|
|
20
|
+
import { preferredAgentModel } from './llm.ts';
|
|
21
|
+
import { createLog } from './log.ts';
|
|
22
|
+
import { imprintHomeDir } from './paths.ts';
|
|
23
|
+
import { discoverTools } from './tool-loader.ts';
|
|
24
|
+
import { llmSpanAttributes, setSpanAttributes, totalPromptTokens, traced } from './tracing.ts';
|
|
25
|
+
|
|
26
|
+
/** Logger scoped to the audit harness. */
const log = createLog('audit');

/** Package root: two directory levels above the directory holding this module. */
const REPO_ROOT = pathJoin(import.meta.dir, '../..');
/** Entry point of the imprint CLI, spawned as the audited site's MCP server. */
const CLI_PATH = pathJoin(REPO_ROOT, 'src', 'cli.ts');
/** Directory holding the prompt files, including the auditor's system prompt. */
const PROMPTS_DIR = pathJoin(REPO_ROOT, 'prompts');
|
|
31
|
+
|
|
32
|
+
/**
 * Upper bound (wall clock, in milliseconds) on one audit session when the
 * caller does not pass `timeoutMs`.
 *
 * It is a ceiling, not a target. A fast site (e.g. marriott's plain-fetch
 * tools) completes its whole differential parameter sweep in ~2 min and exits
 * early. Only slow sites ever hit the ceiling: tools that replay via cdp (a
 * real Chrome per call, ~60-90s each) AND advertise many parameters. The
 * previous 20 min limit killed southwest mid-sweep (62KB search payloads
 * across ~14 params) even though both tools were live. 45 min lets such sweeps
 * finish while still bounding a session that is genuinely hung.
 */
const DEFAULT_AUDIT_TIMEOUT_MS = 45 * 60 * 1_000;
|
|
41
|
+
|
|
42
|
+
/**
 * A single tool call as reported by the auditor.
 *
 * - `params`  — the arguments of the call; defaults to `{}` when omitted.
 * - `ok`      — boolean flag reported by the auditor for the call.
 * - `verdict` — the auditor's classification of the call. Only `correct` and
 *   `tool_broken` are graded; `infra` and `bad_params` are tallied but kept
 *   out of the score.
 * - `reason`  — free-text justification; defaults to the empty string.
 */
const InvocationSchema = z.object({
  params: z.record(z.string(), z.unknown()).default({}),
  ok: z.boolean(),
  verdict: z.enum(['correct', 'tool_broken', 'infra', 'bad_params']),
  reason: z.string().default(''),
});
|
|
49
|
+
|
|
50
|
+
/**
 * Differential verdict for one advertised parameter.
 *
 * The auditor runs the tool at a baseline, then again with ONLY this parameter
 * set to a value that should alter the result, and compares the two:
 *
 * - `works`      — the result changed the way the description promises.
 * - `no_op`      — the result did not change, so the parameter is inert.
 * - `broken`     — the result changed wrongly (corrupted / emptied / nonsense).
 * - `untestable` — no distinct valid value could be built, or probing is unsafe
 *                  because the tool is state-changing or bot-defended.
 *
 * Grading: `works` counts as correct; `no_op` and `broken` count as defects
 * ("no-op is not a free pass"); `untestable` is reported but never scored.
 */
const ParameterAuditSchema = z.object({
  name: z.string(),
  verdict: z.enum(['works', 'no_op', 'broken', 'untestable']),
  reason: z.string().default(''),
});
|
|
65
|
+
|
|
66
|
+
/**
 * Everything the auditor reports about one tool: its name, the calls it made
 * against it, and the per-parameter differential verdicts. Both lists default
 * to empty when the auditor omits them.
 */
const ToolAuditSchema = z.object({
  name: z.string(),
  invocations: z.array(InvocationSchema).default([]),
  parameters: z.array(ParameterAuditSchema).default([]),
});
|
|
71
|
+
|
|
72
|
+
/**
 * The one JSON object the auditor hands back. It deliberately carries no
 * score: the model supplies verdicts only, and `computeAuditScore` derives the
 * score from them. Parsing `{}` yields an empty report (`tools: []`,
 * `notes: ''`).
 */
export const AuditReportSchema = z.object({
  tools: z.array(ToolAuditSchema).default([]),
  notes: z.string().default(''),
});

/** Parsed (defaults applied) form of `AuditReportSchema`. */
export type AuditReport = z.infer<typeof AuditReportSchema>;
|
|
80
|
+
|
|
81
|
+
/** Deterministic grade derived from an `AuditReport`. */
interface AuditScore {
  /** `100 * correct / graded`, or 0 when nothing was gradeable. */
  score: number;
  /** Graded-correct count: `correct` invocations plus `works` parameters. */
  correct: number;
  /** Graded-defect count: `tool_broken` invocations plus `no_op`/`broken` parameters. */
  broken: number;
  /** Invocations classified `infra`; excluded from the score. */
  infra: number;
  /** Invocations classified `bad_params`; excluded from the score. */
  badParams: number;
  /** `correct + broken`. */
  graded: number;
  /**
   * Per-parameter differential tallies. The first three are already folded
   * into `correct` / `broken` / `graded`; they are repeated here for the
   * report. `paramsUntestable` is surfaced only and never scored.
   */
  paramsWorking: number;
  paramsNoOp: number;
  paramsBroken: number;
  paramsUntestable: number;
  /**
   * Final outcome. `computeAuditScore` never produces `timeout`; `runAudit`
   * sets it when the deadline guard killed the session, because a cut-off run
   * is never a trustworthy pass even if its partial verdicts would score one.
   */
  verdict: 'pass' | 'fail' | 'inconclusive' | 'timeout';
}
|
|
99
|
+
|
|
100
|
+
/**
 * Pure, deterministic scoring over the verdicts in an audit report.
 *
 * Tallying
 * - Invocation verdicts: `correct` and `tool_broken` are graded; `infra`
 *   (anti-bot / rate-limit / network / timeout) and `bad_params` (the
 *   auditor's own mistake) are counted but excluded, so a blocked or misused
 *   tool is not booked as a code bug.
 * - Parameter verdicts fold into the same totals: `works` → correct,
 *   `no_op` / `broken` → broken ("no-op is not a free pass"), `untestable` →
 *   counted but not graded.
 * - `graded = correct + broken`; `score = 100 * correct / graded`, or 0 when
 *   nothing was gradeable.
 *
 * Verdict
 * - `graded === 0` → `inconclusive` (re-run / the site blocked us; not a code
 *   failure).
 * - Otherwise `pass` needs BOTH `score >= minScore` AND
 *   `graded >= max(2, gradeableTools)`; anything else is `fail`.
 * - `gradeableTools` is the number of tools with at least one graded item
 *   (a graded invocation or a graded parameter verdict). Scaling the floor to
 *   gradeable tools rather than all tools means a tool the auditor can never
 *   exercise (e.g. one that needs an opaque token it cannot synthesize) does
 *   not raise the bar and sink an otherwise-perfect run.
 * - The floor is one graded item per gradeable tool, not two: the auditor
 *   often spends a call per tool on `bad_params` / `infra`, so requiring two
 *   clean reads would false-fail good runs. Real defects fail on score, not on
 *   this count.
 *
 * This function never returns the `timeout` verdict.
 *
 * @param report   Parsed auditor report.
 * @param minScore Minimum score (0-100 scale) required for `pass`.
 * @returns Tallies, score and verdict.
 */
export function computeAuditScore(report: AuditReport, minScore: number): AuditScore {
  const invocationTally = { correct: 0, tool_broken: 0, infra: 0, bad_params: 0 };
  const paramTally = { works: 0, no_op: 0, broken: 0, untestable: 0 };
  let gradeableTools = 0;

  for (const { invocations, parameters } of report.tools) {
    let gradedForTool = 0;
    for (const { verdict } of invocations) {
      invocationTally[verdict]++;
      if (verdict === 'correct' || verdict === 'tool_broken') gradedForTool++;
    }
    for (const { verdict } of parameters) {
      paramTally[verdict]++;
      if (verdict !== 'untestable') gradedForTool++;
    }
    if (gradedForTool > 0) gradeableTools++;
  }

  const correct = invocationTally.correct + paramTally.works;
  const broken = invocationTally.tool_broken + paramTally.no_op + paramTally.broken;
  const graded = correct + broken;
  const score = graded === 0 ? 0 : (100 * correct) / graded;
  const signalFloor = Math.max(2, gradeableTools);

  const verdict: AuditScore['verdict'] =
    graded === 0
      ? 'inconclusive'
      : score >= minScore && graded >= signalFloor
        ? 'pass'
        : 'fail';

  return {
    score,
    correct,
    broken,
    infra: invocationTally.infra,
    badParams: invocationTally.bad_params,
    graded,
    paramsWorking: paramTally.works,
    paramsNoOp: paramTally.no_op,
    paramsBroken: paramTally.broken,
    paramsUntestable: paramTally.untestable,
    verdict,
  };
}
|
|
204
|
+
|
|
205
|
+
/**
 * Names of the reported tools that have no graded invocation, i.e. no
 * invocation whose verdict is `correct` or `tool_broken` (every call was
 * `infra` / `bad_params`, or none was recorded). These are listed in the
 * report so a tool that could not be exercised stays visible.
 *
 * Only invocation verdicts are consulted; per-parameter verdicts are not.
 *
 * @param report Parsed auditor report.
 * @returns Tool names in report order.
 */
export function ungradeableToolNames(report: AuditReport): string[] {
  const names: string[] = [];
  for (const tool of report.tools) {
    const hasGradedCall = tool.invocations.some(
      ({ verdict }) => verdict === 'correct' || verdict === 'tool_broken',
    );
    if (!hasGradedCall) names.push(tool.name);
  }
  return names;
}
|
|
215
|
+
|
|
216
|
+
/**
 * Collects every parameter the auditor marked `untestable` (for example an
 * opaque enum with no constructible value, or a state-changing / bot-defended
 * tool), so an unverifiable parameter is visible instead of passing silently.
 *
 * @param report Parsed auditor report.
 * @returns One entry per untestable parameter, in report order, carrying the
 *          owning tool's name, the parameter name and the auditor's reason.
 */
export function untestableParams(
  report: AuditReport,
): Array<{ tool: string; name: string; reason: string }> {
  return report.tools.flatMap((tool) =>
    tool.parameters
      .filter((param) => param.verdict === 'untestable')
      .map((param) => ({ tool: tool.name, name: param.name, reason: param.reason })),
  );
}
|
|
232
|
+
|
|
233
|
+
/** Inputs accepted by `runAudit`. */
interface RunAuditOptions {
  /** Site whose generated tools are discovered and audited. */
  site: string;
  /** Minimum score required for a `pass` verdict. */
  minScore: number;
  /** Where the JSON report is written; the transcript is saved in the same directory. */
  outPath: string;
  /** Model override; when omitted the preferred `claude-cli` agent model is used. */
  model?: string;
  /** Session deadline in milliseconds; defaults to `DEFAULT_AUDIT_TIMEOUT_MS`. */
  timeoutMs?: number;
  /** When truthy, print the persisted result as JSON instead of the summary. */
  json?: boolean;
}
|
|
241
|
+
|
|
242
|
+
/**
 * Audit every generated tool of a site and return the deterministic grade.
 *
 * Steps, all inside one `audit.session` trace span:
 * 1. Discover the site's tools; throw if there are none or if the auditor
 *    system prompt file is missing.
 * 2. Collect hints for the auditor: parameters flagged `verified === false`
 *    and producer→consumer token contracts (`sourcedFrom`).
 * 3. Drive the audit session and score its report with `computeAuditScore`.
 * 4. Adjust the verdict: a timed-out session is always `timeout`; a `pass`
 *    with zero `correct` invocations is downgraded to `inconclusive`.
 * 5. Best-effort persist the transcript and the JSON report (write failures
 *    are logged, not thrown), record span attributes, and print either the
 *    JSON or the human summary.
 *
 * @param opts See `RunAuditOptions`.
 * @returns The score with the adjusted verdict.
 * @throws Error when the site has no generated tool or the system prompt file
 *         does not exist.
 */
export async function runAudit(opts: RunAuditOptions): Promise<AuditScore> {
  return await traced(
    'audit.session',
    'AGENT',
    {
      'imprint.site': opts.site,
      'imprint.audit.min_score': opts.minScore,
    },
    async (span) => {
      const { site, minScore, outPath } = opts;

      const tools = await discoverTools(imprintHomeDir(), site, '[imprint audit]');
      const toolCount = tools.length;
      if (toolCount === 0) {
        throw new Error(
          `No generated tool found for site "${site}" — run \`imprint teach ${site}\` first, then audit it.`,
        );
      }

      const model = opts.model ?? preferredAgentModel('claude-cli');
      const timeoutMs = opts.timeoutMs ?? DEFAULT_AUDIT_TIMEOUT_MS;
      const systemPromptPath = pathJoin(PROMPTS_DIR, 'audit-agent.md');
      if (!existsSync(systemPromptPath)) {
        throw new Error(
          `Audit system prompt not found at ${systemPromptPath}\n→ this is an Imprint installation problem; please file an issue at https://github.com/ashaychangwani/imprint/issues with the steps you ran.`,
        );
      }

      const toolNames = tools.map(({ workflow }) => workflow.toolName);
      log(`auditing ${toolCount} tool(s) for site "${site}": ${toolNames.join(', ')}`);

      // Parameters that shipped live-unverified at compile time (Fix D). The
      // compile-time differential could not confirm their effect, so they are
      // the likeliest to be broken and the auditor is told to probe them first.
      const unverifiedParams: Array<{ tool: string; params: string[] }> = [];
      for (const { workflow } of tools) {
        const flagged = (workflow.parameters ?? []).flatMap((p) =>
          p.verified === false ? [p.name] : [],
        );
        if (flagged.length > 0) unverifiedParams.push({ tool: workflow.toolName, params: flagged });
      }

      // Producer→consumer token contracts (sourcedFrom). The auditor must chain
      // (call the producer, read the named field, feed the consumer) instead of
      // inventing an opaque token; otherwise a correct chained tool false-fails.
      const tokenDeps: TokenDep[] = [];
      for (const { workflow } of tools) {
        for (const p of workflow.parameters ?? []) {
          const source = p.sourcedFrom;
          if (!source) continue;
          tokenDeps.push({
            tool: workflow.toolName,
            param: p.name,
            sourceTool: source.tool,
            sourceField: source.field,
          });
        }
      }

      const drive = await driveAudit({
        site,
        model,
        timeoutMs,
        systemPromptPath,
        toolNames,
        unverifiedParams,
        tokenDeps,
      });

      const rawScore = computeAuditScore(drive.report, minScore);

      // Cross-reference compile-time live verification with the audit grade.
      // The downgrade exists to flag "flying blind" runs: ones where the gate
      // holds no positive evidence that the framework works for this site.
      // History of the rule:
      //   v1: downgrade if any tool was liveVerified=false AND ungradeable.
      //       Too strict: it downgraded perfectly-scoring runs when a single
      //       chained tool was unreachable from the auditor's connected set.
      //   v2: downgrade only if a flying-blind tool had infra invocations.
      //       Still blamed transient page state on defects.
      //   v3 (current): downgrade only when the audit produced ZERO `correct`
      //       invocations across ALL tools. One correct invocation is positive
      //       evidence that framework + runtime work for at least that tool,
      //       and the overall score (correct/(correct+broken)) is the honest
      //       signal. Tools that could not be exercised still show up through
      //       `ungradeableTools` / `unverifiedAndUngradeable` without spoiling
      //       a verdict the score honestly earned.
      const ungradeableNames = ungradeableToolNames(drive.report);
      const untestableParamList = untestableParams(drive.report);
      const ungradeableSet = new Set(ungradeableNames);
      const unverifiedAndUngradeable = tools
        .filter(
          ({ workflow }) =>
            workflow.liveVerified === false && ungradeableSet.has(workflow.toolName),
        )
        .map(({ workflow }) => workflow.toolName);
      const sawCorrectInvocation = drive.report.tools.some((tool) =>
        tool.invocations.some((call) => call.verdict === 'correct'),
      );

      // A timeout outranks the inconclusive downgrade.
      const verdict: AuditScore['verdict'] = drive.timedOut
        ? 'timeout'
        : rawScore.verdict === 'pass' && !sawCorrectInvocation
          ? 'inconclusive'
          : rawScore.verdict;
      const score: AuditScore = { ...rawScore, verdict };

      // Keep the auditor transcript beside the report so a stuck or killed run
      // can be inspected afterwards.
      let transcriptPath: string | undefined;
      if (drive.transcript) {
        transcriptPath = pathJoin(dirname(outPath), '.audit-transcript.txt');
        try {
          mkdirSync(dirname(transcriptPath), { recursive: true });
          writeFileSync(transcriptPath, `${drive.transcript}\n`, 'utf8');
        } catch (err) {
          log(`failed to persist audit transcript to ${transcriptPath}: ${errMsg(err)}`);
          transcriptPath = undefined;
        }
      }

      // TOTAL prompt tokens (uncached + cache) feed the cost calculation; the
      // cache split goes to llmSpanAttributes separately. This is always a
      // number here (drive.inputTokens is non-null), so a zero is suppressed by
      // the `|| undefined` where it is passed on below.
      const totalInputTokens = totalPromptTokens(
        drive.inputTokens,
        drive.cacheReadInputTokens,
        drive.cacheCreationInputTokens,
      );
      const costAttribute =
        drive.totalCostUsd != null ? { 'imprint.audit.cost_usd': drive.totalCostUsd } : {};
      setSpanAttributes(span, {
        'imprint.audit.score': score.score,
        'imprint.audit.correct': score.correct,
        'imprint.audit.broken': score.broken,
        'imprint.audit.infra': score.infra,
        'imprint.audit.bad_params': score.badParams,
        'imprint.audit.graded': score.graded,
        'imprint.audit.params_working': score.paramsWorking,
        'imprint.audit.params_no_op': score.paramsNoOp,
        'imprint.audit.params_broken': score.paramsBroken,
        'imprint.audit.params_untestable': score.paramsUntestable,
        'imprint.audit.tool_count': toolCount,
        'imprint.audit.verdict': score.verdict,
        'imprint.audit.unverified_and_ungradeable_count': unverifiedAndUngradeable.length,
        'imprint.audit.timed_out': drive.timedOut,
        'imprint.audit.turns': drive.turns,
        ...costAttribute,
        ...llmSpanAttributes({
          provider: 'claude-cli',
          model,
          // `|| undefined`: with no captured usage (e.g. spawn failure → 0
          // tokens) omit the value rather than emit a bogus $0 cost.
          inputTokens: totalInputTokens || undefined,
          outputTokens: drive.outputTokens || undefined,
          cacheReadTokens: drive.cacheReadInputTokens || undefined,
          cacheWriteTokens: drive.cacheCreationInputTokens || undefined,
        }),
      });

      // Full result: the deterministic score plus the raw model report.
      const persisted = {
        ...score,
        report: drive.report,
        site,
        toolCount,
        ungradeableTools: ungradeableNames,
        /** Advertised parameters the auditor could not differentially test. */
        untestableParams: untestableParamList,
        /** Tools that shipped without live verification at compile time AND
         * could not be graded at audit time — zero live signal anywhere. */
        unverifiedAndUngradeable,
        minScore,
        timedOut: drive.timedOut,
        turns: drive.turns,
        costUsd: drive.totalCostUsd,
        inputTokens: drive.inputTokens,
        outputTokens: drive.outputTokens,
        cacheReadInputTokens: drive.cacheReadInputTokens,
        cacheCreationInputTokens: drive.cacheCreationInputTokens,
        transcriptPath,
      };
      try {
        mkdirSync(dirname(outPath), { recursive: true });
        writeFileSync(outPath, `${JSON.stringify(persisted, null, 2)}\n`, 'utf8');
      } catch (err) {
        log(`failed to persist audit report to ${outPath}: ${errMsg(err)}`);
      }

      if (opts.json) {
        console.log(JSON.stringify(persisted, null, 2));
      } else {
        printSummary(opts, score, toolCount, {
          timedOut: drive.timedOut,
          timeoutMs,
          transcriptPath,
          costUsd: drive.totalCostUsd,
          unverifiedAndUngradeable,
          report: drive.report,
        });
      }

      return score;
    },
  );
}
|
|
445
|
+
|
|
446
|
+
/**
 * One producer→consumer token contract, taken from a parameter's
 * `sourcedFrom` entry in `workflow.json`: the consumer parameter's value is
 * minted by a field in a sibling producer tool's output.
 */
interface TokenDep {
  /** Consumer tool that takes the token. */
  tool: string;
  /** Consumer parameter that receives the token. */
  param: string;
  /** Producer tool that must be called first. */
  sourceTool: string;
  /** Output field of the producer that holds the value. */
  sourceField: string;
}
|
|
454
|
+
|
|
455
|
+
/**
 * Renders the auditor instruction for parameters whose value is minted by
 * another tool: call the producer first, read its field, pass that value to
 * the consumer, and never fabricate one. Pure, so it can be unit-tested
 * without spawning an audit session.
 *
 * @param tokenDeps Producer→consumer contracts to describe.
 * @returns `''` when there are no contracts; otherwise a block that starts
 *          with two newlines, lists one bullet per contract, and ends with the
 *          `bad_params` fallback rule.
 */
export function buildTokenDepNote(tokenDeps: TokenDep[]): string {
  if (tokenDeps.length === 0) return '';

  const preamble =
    'Some parameters are opaque tokens/ids minted by ANOTHER tool — you cannot fabricate them. For each below, call the producer tool first, read the named output field from its result, and pass that exact value to the consumer (reuse it across calls; no need to re-fetch each time):';
  const fallback =
    'If you cannot obtain such a value because the producer is blocked, classify the consumer call `bad_params`, never `tool_broken`.';
  const bullets = tokenDeps.map(
    ({ tool, param, sourceTool, sourceField }) =>
      `- ${tool}(${param}) ← first call ${sourceTool}, then pass its \`${sourceField}\` output value`,
  );

  return `\n\n${[preamble, ...bullets, fallback].join('\n')}`;
}
|
|
468
|
+
|
|
469
|
+
/** Inputs accepted by `driveAudit`. */
interface DriveAuditOptions {
  /** Site under audit; also used to name the inline MCP server. */
  site: string;
  /** Model the audit session runs on. */
  model: string;
  /** Deadline for the session, in milliseconds. */
  timeoutMs: number;
  /** Path of the auditor's system prompt file. */
  systemPromptPath: string;
  /** Names of the tools to audit. */
  toolNames: string[];
  /** Per tool, the parameters that shipped live-unverified at compile time. */
  unverifiedParams: Array<{ tool: string; params: string[] }>;
  /** Producer→consumer token contracts (`param.sourcedFrom`) so the auditor chains calls. */
  tokenDeps: TokenDep[];
}
|
|
480
|
+
|
|
481
|
+
/** Outcome of one driven audit session. */
interface DriveAuditResult {
  /** The parsed auditor report, or an empty report when none could be parsed. */
  report: AuditReport;
  /** `false` when no report parsed and an empty one was substituted. */
  reportRecovered: boolean;
  /** Whether the session hit its deadline. */
  timedOut: boolean;
  /** Turn count for the session. */
  turns: number;
  /** Full assistant transcript, kept for diagnosis; empty if the session never spoke. */
  transcript: string;
  /** Token usage counters for the session. */
  inputTokens: number;
  outputTokens: number;
  cacheReadInputTokens: number;
  cacheCreationInputTokens: number;
  /** Authoritative cost from the claude CLI's `result` event, or `null` when not reported. */
  totalCostUsd: number | null;
}
|
|
496
|
+
|
|
497
|
+
/**
 * Builds the placeholder result used when a session produced no data (spawn
 * failure or an empty run): a schema-default empty report, `reportRecovered`
 * and `timedOut` both false, every counter at zero, an empty transcript and no
 * cost.
 */
function emptyDriveAuditResult(): DriveAuditResult {
  const blankReport = AuditReportSchema.parse({});
  const result: DriveAuditResult = {
    report: blankReport,
    reportRecovered: false,
    timedOut: false,
    turns: 0,
    transcript: '',
    inputTokens: 0,
    outputTokens: 0,
    cacheReadInputTokens: 0,
    cacheCreationInputTokens: 0,
    totalCostUsd: null,
  };
  return result;
}
|
|
512
|
+
|
|
513
|
+
/**
 * Run one headless `claude` audit session against the site's real MCP server
 * and hand back whatever report could be recovered from it.
 *
 * The real `mcp-server` has no write/submit tool, so the report has to travel
 * back inside the model's text; `extractReport` pulls it out of the session
 * text. When nothing validates, an empty report (parsed from `{}`) is
 * substituted and `reportRecovered` is false, so the gate degrades instead of
 * crashing.
 *
 * @param opts Site, model, deadline, system-prompt path, tool allow-list and
 *   the prompt hints (unverified params, token dependencies).
 * @returns The report plus session diagnostics (turns, transcript, usage, cost).
 */
async function driveAudit(opts: DriveAuditOptions): Promise<DriveAuditResult> {
  // The inline server must not share a name with the persistent
  // `imprint-<site>` server that `imprint teach` registers with Claude Code:
  // on a name clash claude marks the inline one "disabled" (even under
  // --strict-mcp-config) and the auditor ends up with zero tools. The
  // `imprint-audit-` prefix keeps it unique.
  const serverName = `imprint-audit-${opts.site}`;
  const mcpConfig = {
    mcpServers: {
      [serverName]: {
        command: process.execPath,
        args: ['run', CLI_PATH, 'mcp-server', opts.site],
        // Inter-call pacing for audit tool calls, so repeated probing of
        // bot-defended reads stays steady enough not to trip a per-IP
        // anti-bot defense. Only the audit sets this variable.
        env: { IMPRINT_AUDIT_PACING_MS: '5000' },
      },
    },
  };

  // One `--allowedTools <fully-qualified-name>` pair per audited tool.
  const allowedToolArgs: string[] = opts.toolNames.flatMap((name) => [
    '--allowedTools',
    `mcp__${serverName}__${name}`,
  ]);

  let unverifiedNote = '';
  if (opts.unverifiedParams.length > 0) {
    unverifiedNote = `\n\nThese parameters shipped WITHOUT a passing compile-time verification, so they are the HIGHEST priority for your per-parameter differential pass: ${opts.unverifiedParams
      .map((u) => `${u.tool}(${u.params.join(', ')})`)
      .join(
        '; ',
      )}. Give each one a \`parameters\` verdict (works / no_op / broken / untestable) like any other — do not let an unverified parameter pass without a differential test. (Per the ONE-invocation rule, a state-changing or bot-defended tool is the exception: mark its parameters \`untestable\` rather than probing.)`;
  }

  const initialPrompt = `Audit every MCP tool connected to you for the site "${opts.site}".

There are ${opts.toolNames.length} connected tool(s). For each one: read its description and input schema, invoke it with a realistic parameter set, judge the result, and classify each invocation as correct | tool_broken | infra | bad_params per your system prompt. You MAY add one or two edge-case invocations ONLY for tools that are cheap reads not behind an anti-bot/rate defense.

ANTI-BOT / STATE-CHANGING TOOLS — ONE invocation only. If a tool drives a state-changing call (a search/booking .act-style POST) or its origin is bot-defended (the first call is slow/tarpitted, or returns 403/429/challenge/anti-bot), do EXACTLY ONE realistic invocation for that tool and move on — do NOT add edge cases. Repeated state-changing calls trip the site's per-IP rate defense, which then tarpits EVERY later call across all tools and ruins the whole audit. One clean read per such tool is enough to grade it; extra probes only convert a passing audit into a tarpitted one.

IMPORTANT: Call tools strictly sequentially — issue exactly one tool call, wait for its result, then issue the next. Never issue tool calls in parallel or batch them in one turn. Many target sites share an anti-bot defense across endpoints, so a parallel burst trips a site-wide rate-limit (HTTP 429) that then poisons every later call. If a call returns a 429 / rate-limit / anti-bot result, classify it \`infra\` and pause before the next call.${unverifiedNote}${buildTokenDepNote(opts.tokenDeps)}

When you are done, end your final message with exactly one fenced \`\`\`json block containing the full report and nothing after it.`;

  const args = [
    '--print',
    '--output-format',
    'stream-json',
    '--verbose',
    '--strict-mcp-config',
    '--mcp-config',
    JSON.stringify(mcpConfig),
    '--system-prompt-file',
    opts.systemPromptPath,
    // An empty built-in tool set leaves claude with only the site's MCP tools.
    '--tools',
    '',
    ...allowedToolArgs,
    '--max-turns',
    '200',
    '--permission-mode',
    'bypassPermissions',
    '--no-session-persistence',
    '--disable-slash-commands',
    '--effort',
    'high',
    '--model',
    opts.model,
    initialPrompt,
  ];

  log(`spawning claude (model=${opts.model}, mcp-server=${serverName})`);

  let child: ChildProcess;
  try {
    child = spawn('claude', args, {
      cwd: REPO_ROOT,
      // Tool-call timeout defaults to 5 minutes here. Each audit tool call is
      // a single workflow execution that may walk the backend ladder
      // (fetch → fetch-bootstrap → stealth-fetch → playbook), so the CLI's
      // shorter default is too tight, while a much longer value would let a
      // few hanging calls eat the audit's overall deadline. A value the user
      // already exported wins over these defaults.
      env: {
        ...process.env,
        MCP_TOOL_TIMEOUT: process.env.MCP_TOOL_TIMEOUT ?? '300000',
        MCP_TIMEOUT: process.env.MCP_TIMEOUT ?? '60000',
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
  } catch (err) {
    log(`failed to spawn claude: ${errMsg(err)}`);
    return emptyDriveAuditResult();
  }

  const session = await collectAssistantText(child, opts.timeoutMs);
  const recovered = extractReport(session.text);
  if (!recovered) {
    const why = session.timedOut
      ? 'audit hit the deadline before producing a report — treating as timeout'
      : 'no valid audit report recovered from the auditor — treating as inconclusive';
    log(why);
  }

  const {
    timedOut,
    turns,
    transcript,
    inputTokens,
    outputTokens,
    cacheReadInputTokens,
    cacheCreationInputTokens,
    totalCostUsd,
  } = session;
  return {
    report: recovered ?? AuditReportSchema.parse({}),
    reportRecovered: recovered !== undefined,
    timedOut,
    turns,
    transcript,
    inputTokens,
    outputTokens,
    cacheReadInputTokens,
    cacheCreationInputTokens,
    totalCostUsd,
  };
}
|
|
645
|
+
|
|
646
|
+
/**
 * What `collectAssistantText` recovers from one audit session: the text the
 * report is extracted from, the full transcript for diagnosis, token/cost
 * usage, and whether the deadline guard had to kill the child.
 */
interface AuditSessionResult {
  /**
   * Report-extraction source: the terminal result event's text, or the
   * concatenated assistant text when the run ended without one.
   */
  text: string;
  /** Full assistant text across every turn, persisted for diagnosis. */
  transcript: string;
  /** True when the deadline guard terminated the child. */
  timedOut: boolean;
  /** Number of assistant events seen on the stream. */
  turns: number;

  // Token usage gathered from the stream.
  inputTokens: number;
  outputTokens: number;
  cacheReadInputTokens: number;
  cacheCreationInputTokens: number;

  /** Cost from the terminal result event; null when it was never reported. */
  totalCostUsd: number | null;
}
|
|
663
|
+
|
|
664
|
+
/**
 * Drain the child's stream-json output, accumulating assistant text and
 * token/cost usage, and resolve once the child has exited and its output has
 * been drained.
 *
 * Enforces the wall-clock deadline by terminating the child (SIGTERM, then
 * SIGKILL if it is still running 5 s later) and reports `timedOut`, so a
 * cut-off run is a loud, distinct outcome rather than a silent empty
 * (→ inconclusive) report.
 *
 * Logs one line per assistant text / tool_use / tool_result event, prefixed
 * with the elapsed time, so an operator tailing the audit log can follow the
 * session live.
 *
 * @param child The spawned claude process; stdout carries newline-delimited JSON events.
 * @param timeoutMs Wall-clock deadline in milliseconds.
 * @returns Extraction text, transcript, turn count, usage totals and cost.
 */
async function collectAssistantText(
  child: ChildProcess,
  timeoutMs: number,
): Promise<AuditSessionResult> {
  const chunks: string[] = [];
  let resultText = '';
  let stdoutBuf = '';
  let killed = false;
  let turns = 0;
  // Accumulated per event so a killed run still reports partial usage; the
  // terminal `result` event (when present) overwrites these with its
  // cumulative totals.
  let inputTokens = 0;
  let outputTokens = 0;
  let cacheReadInputTokens = 0;
  let cacheCreationInputTokens = 0;
  let totalCostUsd: number | null = null;
  const t0 = Date.now();
  const elapsedStr = (): string => {
    const s = Math.floor((Date.now() - t0) / 1000);
    return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
  };

  // `child.killed` only says a signal was *sent*, so it is already true right
  // after SIGTERM and cannot gate the SIGKILL escalation. Whether the process
  // is actually gone is what exitCode/signalCode report.
  const hasExited = (): boolean => child.exitCode !== null || child.signalCode !== null;

  let escalationTimer: ReturnType<typeof setTimeout> | undefined;
  const timer = setTimeout(() => {
    killed = true;
    log(`audit exceeded ${formatDeadline(timeoutMs)} deadline, terminating claude`);
    try {
      child.kill('SIGTERM');
    } catch {
      // already gone
    }
    escalationTimer = setTimeout(() => {
      if (hasExited()) return;
      try {
        child.kill('SIGKILL');
      } catch {
        // already gone
      }
    }, 5000);
  }, timeoutMs);

  /** Parse one stream-json line and fold it into the running session state. */
  const handleLine = (rawLine: string): void => {
    const line = rawLine.trim();
    if (!line) return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      return;
    }
    // Valid JSON that is not an object (e.g. `null`, a number) is not an event.
    if (typeof parsed !== 'object' || parsed === null) return;
    const evt = parsed as StreamJsonEvent;

    // Token accounting from any event that carries usage (event-level or on
    // the nested assistant message).
    const eu = evt.usage;
    const mu = evt.message?.usage;
    inputTokens += (eu?.input_tokens ?? 0) + (mu?.input_tokens ?? 0);
    outputTokens += (eu?.output_tokens ?? 0) + (mu?.output_tokens ?? 0);
    cacheReadInputTokens +=
      (eu?.cache_read_input_tokens ?? 0) + (mu?.cache_read_input_tokens ?? 0);
    cacheCreationInputTokens +=
      (eu?.cache_creation_input_tokens ?? 0) + (mu?.cache_creation_input_tokens ?? 0);

    const content = evt.message?.content;
    if (evt.type === 'assistant' && Array.isArray(content)) {
      turns++;
      for (const block of content) {
        if (!block) continue;
        if (block.type === 'text' && typeof block.text === 'string') {
          chunks.push(block.text);
          const preview = block.text.replace(/\s+/g, ' ').slice(0, 120);
          log(`[${elapsedStr()}] assistant: ${preview}`);
        } else if (block.type === 'tool_use' && typeof block.name === 'string') {
          const inputPreview = block.input ? JSON.stringify(block.input).slice(0, 120) : '';
          log(
            `[${elapsedStr()}] tool_use: ${block.name}${inputPreview ? ` ${inputPreview}` : ''}`,
          );
        }
      }
    } else if (evt.type === 'user' && Array.isArray(content)) {
      for (const block of content) {
        if (!block) continue;
        if (block.type === 'tool_result') {
          // Preview only the first content part (or the raw string form).
          const raw = Array.isArray(block.content)
            ? (block.content[0]?.text ?? '')
            : typeof block.content === 'string'
              ? block.content
              : '';
          const preview = String(raw).replace(/\s+/g, ' ').slice(0, 140);
          const errMark = block.is_error ? ' (error)' : '';
          log(`[${elapsedStr()}] tool_result${errMark}: ${preview}`);
        }
      }
    } else if (evt.type === 'result') {
      // The terminal result event carries the final assistant message
      // verbatim plus the cumulative usage + cost.
      if (typeof evt.result === 'string') {
        resultText = evt.result;
        log(`[${elapsedStr()}] result event received (${evt.result.length} chars)`);
      }
      if (evt.usage) {
        inputTokens = evt.usage.input_tokens ?? inputTokens;
        outputTokens = evt.usage.output_tokens ?? outputTokens;
        cacheReadInputTokens = evt.usage.cache_read_input_tokens ?? cacheReadInputTokens;
        cacheCreationInputTokens =
          evt.usage.cache_creation_input_tokens ?? cacheCreationInputTokens;
      }
      if (typeof evt.total_cost_usd === 'number') totalCostUsd = evt.total_cost_usd;
    }
  };

  // Let the stream decode UTF-8 so a multi-byte character split across two
  // chunks is not turned into replacement characters.
  child.stdout?.setEncoding('utf8');
  child.stdout?.on('data', (chunk: string | Buffer) => {
    stdoutBuf += typeof chunk === 'string' ? chunk : chunk.toString('utf8');
    const lines = stdoutBuf.split('\n');
    // The last piece is an incomplete line (or '') — keep it for the next chunk.
    stdoutBuf = lines.pop() ?? '';
    for (const line of lines) handleLine(line);
  });

  child.stderr?.setEncoding('utf8');
  child.stderr?.on('data', (chunk: string | Buffer) => {
    const text = typeof chunk === 'string' ? chunk : chunk.toString('utf8');
    log(`[claude stderr] ${text.trim()}`);
  });

  await new Promise<void>((resolve) => {
    let settled = false;
    let drainTimer: ReturnType<typeof setTimeout> | undefined;
    const finish = (): void => {
      if (settled) return;
      settled = true;
      if (drainTimer !== undefined) clearTimeout(drainTimer);
      resolve();
    };
    // 'close' fires once the stdio streams have ended, i.e. after the last
    // buffered event (typically the `result` line) has been delivered.
    child.once('close', finish);
    // 'exit' can fire while stdout still holds data; allow a short grace
    // period for 'close', but never wait indefinitely (a descendant process
    // may keep a pipe open).
    child.once('exit', () => {
      drainTimer = setTimeout(finish, 2000);
    });
    child.once('error', (err) => {
      log(`claude process error: ${errMsg(err)}`);
      finish();
    });
  });
  clearTimeout(timer);
  if (escalationTimer !== undefined) clearTimeout(escalationTimer);

  // A final event written without a trailing newline is still sitting in the buffer.
  if (stdoutBuf) {
    const tail = stdoutBuf;
    stdoutBuf = '';
    handleLine(tail);
  }
  if (killed) log('audit session was terminated by the deadline guard');

  return {
    // Prefer the terminal result event (the complete final message); fall back to
    // the concatenated streamed assistant text if the result event was absent.
    text: resultText || chunks.join('\n'),
    transcript: chunks.join('\n\n'),
    timedOut: killed,
    turns,
    inputTokens,
    outputTokens,
    cacheReadInputTokens,
    cacheCreationInputTokens,
    totalCostUsd,
  };
}
|
|
814
|
+
|
|
815
|
+
/**
 * Token counters as they appear on stream-json events. Every field is
 * optional; consumers treat a missing counter as zero.
 */
interface StreamUsage {
  /** Input tokens. */
  input_tokens?: number;
  /** Output tokens. */
  output_tokens?: number;
  /** Cache-read input tokens. */
  cache_read_input_tokens?: number;
  /** Cache-creation input tokens. */
  cache_creation_input_tokens?: number;
}
|
|
821
|
+
|
|
822
|
+
/**
 * Loose shape of one stream-json event line, covering only the fields the
 * audit collector reads. Everything except `type` is optional.
 */
interface StreamJsonEvent {
  /** Event discriminator; the collector handles 'assistant', 'user' and 'result'. */
  type: string;
  /** Nested message carried by assistant/user events. */
  message?: {
    /** Content blocks: text, tool_use (name/input) or tool_result (content/is_error). */
    content?: {
      type?: string;
      text?: string;
      name?: string;
      input?: unknown;
      tool_use_id?: string;
      content?: unknown;
      is_error?: boolean;
    }[];
    /** Usage attached to the nested message. */
    usage?: StreamUsage;
  };
  /** Final cumulative usage + cost ride on the terminal `result` event. */
  usage?: StreamUsage;
  total_cost_usd?: number;
  /** Final assistant message text, present on the `result` event. */
  result?: string;
}
|
|
841
|
+
|
|
842
|
+
/**
 * Pull the structured audit report out of the auditor's text.
 *
 * Candidates come from `jsonCandidates` in best-first order: the LAST fenced
 * ```json block first (the system prompt requires the report to be the final
 * thing in the message), then balanced top-level {…} objects, last one first.
 * The first candidate that both parses as JSON and validates against
 * `AuditReportSchema` wins.
 *
 * @param text Auditor output to search; an empty string yields `undefined`.
 * @returns The validated report, or `undefined` when no candidate parses and validates.
 */
export function extractReport(text: string): AuditReport | undefined {
  if (!text) return undefined;

  /** Parse + validate one candidate; any failure means "not this one". */
  const validate = (candidate: string): AuditReport | undefined => {
    try {
      const outcome = AuditReportSchema.safeParse(JSON.parse(candidate));
      return outcome.success ? outcome.data : undefined;
    } catch {
      // Not valid JSON — the caller moves on to the next candidate.
      return undefined;
    }
  };

  for (const candidate of jsonCandidates(text)) {
    const report = validate(candidate);
    if (report !== undefined) return report;
  }
  return undefined;
}
|
|
861
|
+
|
|
862
|
+
/**
 * List JSON candidate strings, best first: the trimmed bodies of every
 * ```json fenced block (last block first), followed by every balanced
 * top-level {…} object (last object first). Fences with an empty body are
 * skipped.
 */
function jsonCandidates(text: string): string[] {
  const fencedBodies = Array.from(text.matchAll(/```json\s*([\s\S]*?)```/gi), (m) => m[1])
    .filter((body): body is string => Boolean(body))
    .map((body) => body.trim());
  return [...fencedBodies.reverse(), ...balancedObjects(text).reverse()];
}
|
|
874
|
+
|
|
875
|
+
/**
 * Collect every balanced top-level {…} substring of `text`, in order of
 * appearance. A single left-to-right scan tracks brace depth; braces that
 * appear inside double-quoted strings (with backslash escapes honoured) are
 * ignored, and a stray `}` at depth zero is skipped. An object that never
 * closes produces no entry.
 */
function balancedObjects(text: string): string[] {
  const found: string[] = [];
  let openAt = -1;
  let nesting = 0;
  let quoted = false;
  let skipNext = false;

  for (let pos = 0; pos < text.length; pos += 1) {
    const c = text.charAt(pos);

    if (quoted) {
      if (skipNext) {
        // Character after a backslash: consumed verbatim.
        skipNext = false;
      } else if (c === '\\') {
        skipNext = true;
      } else if (c === '"') {
        quoted = false;
      }
      continue;
    }

    switch (c) {
      case '"':
        quoted = true;
        break;
      case '{':
        if (nesting === 0) openAt = pos;
        nesting += 1;
        break;
      case '}':
        if (nesting === 0) break; // unmatched closer — ignore
        nesting -= 1;
        if (nesting === 0 && openAt >= 0) {
          found.push(text.slice(openAt, pos + 1));
          openAt = -1;
        }
        break;
      default:
        break;
    }
  }
  return found;
}
|
|
908
|
+
|
|
909
|
+
/**
 * Print the human-readable audit summary to stdout, one `[imprint]` line at a
 * time: verdict, score, graded-unit breakdown, per-parameter findings, cost,
 * tools with no live signal, a verdict-specific hint, and the output paths.
 *
 * @param opts Run options; `site`, `minScore` and `outPath` are read here.
 * @param score Aggregated score and counters for the run.
 * @param toolCount Number of tools that were audited.
 * @param extra Timeout info, optional transcript path and cost, the list of
 *   unverified-and-ungraded tools, and the full report.
 */
function printSummary(
  opts: RunAuditOptions,
  score: AuditScore,
  toolCount: number,
  extra: {
    timedOut: boolean;
    timeoutMs: number;
    transcriptPath?: string;
    costUsd?: number | null;
    unverifiedAndUngradeable: string[];
    report: AuditReport;
  },
): void {
  let pct = 'n/a';
  if (score.graded !== 0) pct = `${score.score.toFixed(1)}%`;

  console.log(`[imprint] audit "${opts.site}" — ${score.verdict.toUpperCase()}`);
  console.log(
    `[imprint] score ${pct} (${score.correct} correct / ${score.broken} broken; threshold ${opts.minScore}%)`,
  );

  // `score.correct` / `score.broken` mix invocation and parameter verdicts, so
  // subtract the parameter units to report actual tool calls on their own.
  const paramsTested = score.paramsWorking + score.paramsNoOp + score.paramsBroken;
  const invGraded = score.graded - paramsTested;
  const invTotal = invGraded + score.infra + score.badParams;
  console.log(
    `[imprint] graded ${score.graded} unit(s) = ${invGraded}/${invTotal} invocation(s) + ${paramsTested} parameter(s) across ${toolCount} tool(s) — excluded: ${score.infra} infra, ${score.badParams} bad_params, ${score.paramsUntestable} untestable param(s)`,
  );

  const anyParamVerdicts = paramsTested + score.paramsUntestable > 0;
  if (anyParamVerdicts) {
    console.log(
      `[imprint] parameters: ${score.paramsWorking}/${paramsTested} working — ${score.paramsNoOp} no-op, ${score.paramsBroken} broken, ${score.paramsUntestable} untestable`,
    );
    // A no-op or untested parameter is not a free pass: list every parameter
    // that did not cleanly work, together with the auditor's stated reason.
    for (const tool of extra.report.tools) {
      const flagged = tool.parameters.filter((p) => p.verdict !== 'works');
      if (flagged.length === 0) continue;

      const working = tool.parameters.length - flagged.length;
      // Untestable params are left out of the denominator, like the line above.
      const tested = tool.parameters.filter((p) => p.verdict !== 'untestable').length;
      console.log(`[imprint] ${tool.name} (${working}/${tested} working):`);

      for (const p of flagged) {
        let mark = '✗';
        if (p.verdict === 'untestable') mark = '⚪';
        console.log(
          `[imprint] ${mark} ${p.name} — ${p.verdict}: ${p.reason || '(no reason)'}`,
        );
      }
    }
  }

  const cost = extra.costUsd;
  if (cost != null) {
    console.log(`[imprint] cost ≈ $${cost.toFixed(2)}`);
  }

  const blind = extra.unverifiedAndUngradeable;
  if (blind.length > 0) {
    console.log(
      `[imprint] ${blind.length} tool(s) flying blind (no live verification at compile, no graded calls at audit): ${blind.join(', ')}`,
    );
  }

  switch (score.verdict) {
    case 'timeout':
      console.log(
        `[imprint] audit was killed at the ${formatDeadline(extra.timeoutMs)} deadline before finishing — partial results only. Re-run with a longer --timeout, or inspect the transcript to see where it stalled.`,
      );
      break;
    case 'inconclusive':
      console.log(
        blind.length > 0
          ? '[imprint] verdict downgraded to inconclusive because at least one tool has zero live signal anywhere.'
          : '[imprint] no gradeable invocations (likely anti-bot / network) — re-run; this is not a code failure.',
      );
      break;
    default:
      break;
  }

  if (extra.transcriptPath) {
    console.log(`[imprint] transcript → ${extra.transcriptPath}`);
  }
  console.log(`[imprint] report → ${opts.outPath}`);
}
|
|
985
|
+
|
|
986
|
+
/** Turn any thrown value into a message: `Error#message` for errors, `String(value)` otherwise. */
function errMsg(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
989
|
+
|
|
990
|
+
/**
 * Render a deadline for log messages, e.g. "20-minute" or "25-second".
 * Timeouts below one minute are shown in (rounded) seconds so they never
 * read as "0-minute"; everything else is shown in rounded minutes.
 */
function formatDeadline(timeoutMs: number): string {
  if (timeoutMs < 60_000) {
    const seconds = Math.round(timeoutMs / 1000);
    return `${seconds}-second`;
  }
  const minutes = Math.round(timeoutMs / 60_000);
  return `${minutes}-minute`;
}
|