imprint-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/README.md +165 -201
  2. package/examples/discoverandgo/README.md +1 -1
  3. package/examples/echo/README.md +1 -1
  4. package/examples/google-flights/README.md +28 -0
  5. package/examples/google-flights/_shared/batchexecute.ts +63 -0
  6. package/examples/google-flights/_shared/flights_request.ts +95 -0
  7. package/examples/google-flights/_shared/package.json +9 -0
  8. package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
  9. package/examples/google-flights/get_flight_booking_details/package.json +9 -0
  10. package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
  11. package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
  12. package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
  13. package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
  14. package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
  15. package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
  16. package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
  17. package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
  18. package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
  19. package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
  20. package/examples/google-flights/lookup_airport/index.ts +101 -0
  21. package/examples/google-flights/lookup_airport/package.json +9 -0
  22. package/examples/google-flights/lookup_airport/parser.ts +66 -0
  23. package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
  24. package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
  25. package/examples/google-flights/lookup_airport/workflow.json +57 -0
  26. package/examples/google-flights/search_flights/index.ts +219 -0
  27. package/examples/google-flights/search_flights/package.json +9 -0
  28. package/examples/google-flights/search_flights/parser.ts +169 -0
  29. package/examples/google-flights/search_flights/playbook.yaml +184 -0
  30. package/examples/google-flights/search_flights/request-transform.ts +119 -0
  31. package/examples/google-flights/search_flights/workflow.json +143 -0
  32. package/examples/google-hotels/README.md +29 -0
  33. package/examples/google-hotels/_shared/batchexecute.ts +73 -0
  34. package/examples/google-hotels/_shared/freq.ts +158 -0
  35. package/examples/google-hotels/_shared/package.json +9 -0
  36. package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
  37. package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
  38. package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
  39. package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
  40. package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
  41. package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
  42. package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
  43. package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
  44. package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
  45. package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
  46. package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
  47. package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
  48. package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
  49. package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
  50. package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
  51. package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
  52. package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
  53. package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
  54. package/examples/google-hotels/search_hotels/index.ts +207 -0
  55. package/examples/google-hotels/search_hotels/package.json +9 -0
  56. package/examples/google-hotels/search_hotels/parser.ts +260 -0
  57. package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
  58. package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
  59. package/examples/google-hotels/search_hotels/workflow.json +127 -0
  60. package/package.json +3 -2
  61. package/prompts/audit-agent.md +71 -0
  62. package/prompts/build-planning.md +74 -0
  63. package/prompts/compile-agent.md +132 -28
  64. package/prompts/prereq-builder.md +64 -0
  65. package/prompts/prereq-planner.md +34 -0
  66. package/prompts/tool-planning.md +39 -0
  67. package/src/cli.ts +111 -4
  68. package/src/imprint/agent.ts +5 -0
  69. package/src/imprint/audit.ts +996 -0
  70. package/src/imprint/backend-ladder.ts +1214 -184
  71. package/src/imprint/build-plan.ts +1051 -0
  72. package/src/imprint/cdp-browser-fetch.ts +589 -0
  73. package/src/imprint/cdp-jar-cache.ts +320 -0
  74. package/src/imprint/chromium.ts +135 -0
  75. package/src/imprint/claude-cli-compile.ts +125 -25
  76. package/src/imprint/codex-cli-compile.ts +26 -23
  77. package/src/imprint/compile-agent-types.ts +38 -0
  78. package/src/imprint/compile-agent.ts +65 -27
  79. package/src/imprint/compile-tools.ts +1656 -64
  80. package/src/imprint/compile.ts +14 -2
  81. package/src/imprint/concurrency.ts +87 -0
  82. package/src/imprint/credential-extract.ts +174 -25
  83. package/src/imprint/cron.ts +1 -0
  84. package/src/imprint/doctor.ts +39 -0
  85. package/src/imprint/emit.ts +85 -0
  86. package/src/imprint/freeform-redact.ts +5 -4
  87. package/src/imprint/integrations.ts +2 -2
  88. package/src/imprint/llm.ts +56 -8
  89. package/src/imprint/mcp-compile-server.ts +43 -10
  90. package/src/imprint/mcp-maintenance.ts +9 -101
  91. package/src/imprint/mcp-server.ts +73 -7
  92. package/src/imprint/multi-progress.ts +7 -2
  93. package/src/imprint/param-grounding.ts +367 -0
  94. package/src/imprint/paths.ts +29 -0
  95. package/src/imprint/playbook-runner.ts +101 -40
  96. package/src/imprint/prereq-builder.ts +651 -0
  97. package/src/imprint/probe-backends.ts +6 -3
  98. package/src/imprint/record.ts +10 -1
  99. package/src/imprint/redact.ts +30 -2
  100. package/src/imprint/replay-capture.ts +19 -18
  101. package/src/imprint/runtime.ts +19 -10
  102. package/src/imprint/sensitive-keys.ts +141 -7
  103. package/src/imprint/session-diff.ts +79 -2
  104. package/src/imprint/session-merge.ts +9 -5
  105. package/src/imprint/stealth-chromium.ts +81 -0
  106. package/src/imprint/stealth-fetch.ts +309 -29
  107. package/src/imprint/stealth-token-cache.ts +88 -0
  108. package/src/imprint/teach-plan.ts +251 -0
  109. package/src/imprint/teach-state.ts +17 -0
  110. package/src/imprint/teach.ts +582 -147
  111. package/src/imprint/tool-candidates.ts +72 -14
  112. package/src/imprint/tool-plan.ts +313 -0
  113. package/src/imprint/tracing.ts +135 -6
  114. package/src/imprint/types.ts +61 -3
  115. package/examples/google-flights/search_google_flights/index.ts +0 -101
  116. package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
  117. package/examples/google-flights/search_google_flights/parser.ts +0 -189
  118. package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
  119. package/examples/google-flights/search_google_flights/workflow.json +0 -48
  120. package/examples/google-hotels/search_google_hotels/index.ts +0 -194
  121. package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
  122. package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
  123. package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
  124. package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
  125. package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
  126. package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
  127. package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
  128. package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
  129. package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
@@ -0,0 +1,996 @@
1
+ /**
2
+ * Headless-claude MCP audit harness — the acceptance gate for a site's
3
+ * generated tools.
4
+ *
5
+ * `runAudit` discovers every tool a site exposes via `imprint mcp-server`,
6
+ * spawns a headless `claude` session pointed at that real MCP server, and asks
7
+ * it to exercise each tool and classify every invocation. The model returns a
8
+ * structured report, but it never reports a score: imprint recomputes the score
9
+ * deterministically from the model's per-invocation verdicts
10
+ * (`computeAuditScore`) so the gate can't be talked up by a generous auditor.
11
+ *
12
+ * The harness is fully site-agnostic — the auditor derives every parameter from
13
+ * each tool's schema + description. There is no per-site special-casing here.
14
+ */
15
+
16
+ import { type ChildProcess, spawn } from 'node:child_process';
17
+ import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
18
+ import { dirname, join as pathJoin } from 'node:path';
19
+ import { z } from 'zod';
20
+ import { preferredAgentModel } from './llm.ts';
21
+ import { createLog } from './log.ts';
22
+ import { imprintHomeDir } from './paths.ts';
23
+ import { discoverTools } from './tool-loader.ts';
24
+ import { llmSpanAttributes, setSpanAttributes, totalPromptTokens, traced } from './tracing.ts';
25
+
26
// Module-level logger created with the 'audit' label.
const log = createLog('audit');

// Locations resolved relative to this source file: two directories above
// `import.meta.dir` is treated as the package root. `CLI_PATH` is the CLI entry
// point the audit's inline MCP server config launches; `PROMPTS_DIR` is where
// the auditor system prompt (`audit-agent.md`) is looked up.
const REPO_ROOT = pathJoin(import.meta.dir, '..', '..');
const CLI_PATH = pathJoin(REPO_ROOT, 'src', 'cli.ts');
const PROMPTS_DIR = pathJoin(REPO_ROOT, 'prompts');

/** Default wall-clock cap for an audit session. This is a CAP, not a fixed
 * duration: a fast site (e.g. marriott's plain-fetch tools) finishes its full
 * differential param sweep in ~2 min and exits early. The cap only bites on
 * slow sites — those whose tools replay via cdp (a real Chrome per call,
 * ~60-90s each) AND expose many parameters, so the per-param sweep needs far
 * more than the old 20 min (southwest, with 62KB search payloads across ~14
 * params, was killed mid-sweep at 20 min despite both tools being live). 45 min
 * lets those complete while still bounding a genuinely hung session.
 * Used by `runAudit` whenever the caller does not supply `timeoutMs`. */
const DEFAULT_AUDIT_TIMEOUT_MS = 45 * 60_000;
41
+
42
/** One invocation the auditor performed against a tool. */
const InvocationSchema = z.object({
  // Arguments the auditor passed to the tool; defaults to `{}` when omitted.
  params: z.record(z.unknown()).default({}),
  // Auditor-reported boolean. The scoring code visible in this file reads
  // `verdict` only, never this flag.
  ok: z.boolean(),
  // Classification consumed by `computeAuditScore`: `correct` and `tool_broken`
  // are graded; `infra` and `bad_params` are counted but excluded from the score.
  verdict: z.enum(['correct', 'tool_broken', 'infra', 'bad_params']),
  // Free-text justification for the verdict; defaults to the empty string.
  reason: z.string().default(''),
});
49
+
50
/** Per-parameter differential verdict. The auditor calls the tool once at a
 * baseline, then once with ONLY this parameter changed to a value expected to
 * alter the result, and compares:
 * - `works` — the result changed as the description promises.
 * - `no_op` — the result was unchanged → the parameter is inert.
 * - `broken` — the result changed wrongly (corrupted/emptied/nonsense).
 * - `untestable` — no distinct valid value could be constructed, or the tool
 * is state-changing / bot-defended so probing is unsafe.
 * `works` grades correct; `no_op`/`broken` grade as defects ("no-op is not a
 * free pass"); `untestable` is surfaced but not scored. */
const ParameterAuditSchema = z.object({
  // Name of the parameter that was probed.
  name: z.string(),
  // Differential outcome; see the list above for how each value is graded.
  verdict: z.enum(['works', 'no_op', 'broken', 'untestable']),
  // Free-text justification; defaults to the empty string. For `untestable`
  // entries this text is copied into the report by `untestableParams`.
  reason: z.string().default(''),
});
65
+
66
/** Everything the auditor reports for a single tool: the tool's name plus the
 * invocation verdicts and per-parameter verdicts recorded against it. Both
 * lists default to empty, so a tool entry carrying only a name still parses. */
const ToolAuditSchema = z.object({
  name: z.string(),
  invocations: z.array(InvocationSchema).default([]),
  parameters: z.array(ParameterAuditSchema).default([]),
});
71
+
72
/** The single JSON object the auditor returns. Scoring is NOT taken from the
 * model; only the per-invocation verdicts feed `computeAuditScore`.
 * Every field has a default, so `AuditReportSchema.parse({})` yields an empty
 * report (`tools: []`, `notes: ''`). */
export const AuditReportSchema = z.object({
  tools: z.array(ToolAuditSchema).default([]),
  notes: z.string().default(''),
});

/** Parsed (post-default) shape of an auditor report. */
export type AuditReport = z.infer<typeof AuditReportSchema>;
80
+
81
/** Deterministic scoring result produced by `computeAuditScore` (with the
 * final `verdict` possibly overridden by `runAudit`). */
interface AuditScore {
  /** `100 * correct / graded`, or 0 when nothing was gradeable. */
  score: number;
  /** `correct` invocations plus `works` parameter verdicts. */
  correct: number;
  /** `tool_broken` invocations plus `no_op` and `broken` parameter verdicts. */
  broken: number;
  /** Invocations classified `infra`; counted but not graded. */
  infra: number;
  /** Invocations classified `bad_params`; counted but not graded. */
  badParams: number;
  /** `correct + broken` — the denominator of `score`. */
  graded: number;
  /** Per-parameter differential tallies (folded into correct/broken/graded
   * above; broken out here for the report). `untestable` is surfaced only. */
  paramsWorking: number;
  paramsNoOp: number;
  paramsBroken: number;
  paramsUntestable: number;
  /** `timeout` is set by `runAudit` (not `computeAuditScore`) when the session
   * was killed by the deadline guard — a cut-off run is never a trustworthy
   * pass, even if the partial verdicts would have scored one. */
  verdict: 'pass' | 'fail' | 'inconclusive' | 'timeout';
}
99
+
100
/**
 * Deterministically score an auditor report. Pure: the result depends only on
 * the verdict labels in `report` and on `minScore`.
 *
 * Tallying
 * - Invocation verdicts: `correct` and `tool_broken` are graded; `infra`
 *   (anti-bot / rate-limit / network / timeout) and `bad_params` (the auditor's
 *   own mistake) are counted but never graded, so a blocked or misused tool is
 *   not treated as a code bug.
 * - Parameter verdicts fold into the same totals: `works` → correct,
 *   `no_op` / `broken` → broken ("no-op is not a free pass"); `untestable` is
 *   counted for the report but not graded.
 * - `graded = correct + broken`; `score = 100 * correct / graded`, or 0 when
 *   nothing was gradeable.
 *
 * Verdict
 * - `graded === 0` → `inconclusive` (re-run / the site blocked us, not a code
 *   failure).
 * - Otherwise `pass` needs BOTH `score >= minScore` AND
 *   `graded >= max(2, gradeableTools)`, where `gradeableTools` is the number of
 *   tools with at least one graded verdict. The floor scales with gradeable
 *   tools rather than all tools, so a tool the auditor can never exercise
 *   (e.g. one needing an opaque token it cannot synthesize) does not raise the
 *   bar and sink an otherwise-perfect run. It asks for one graded result per
 *   gradeable tool, not two, because the auditor often spends a call per tool
 *   on `bad_params` / `infra`; real defects still fail on score.
 * - Anything else → `fail`.
 *
 * @param report   Parsed auditor report.
 * @param minScore Minimum score (0-100) required for a `pass`.
 * @returns Tallies, score and verdict. `timeout` is never produced here.
 */
export function computeAuditScore(report: AuditReport, minScore: number): AuditScore {
  // Number of entries carrying exactly the given verdict label; any label that
  // is never asked for is simply ignored.
  const tally = (entries: ReadonlyArray<{ verdict: string }>, label: string): number => {
    let hits = 0;
    for (const entry of entries) {
      if (entry.verdict === label) hits += 1;
    }
    return hits;
  };

  let correct = 0;
  let broken = 0;
  let infra = 0;
  let badParams = 0;
  let paramsWorking = 0;
  let paramsNoOp = 0;
  let paramsBroken = 0;
  let paramsUntestable = 0;
  let gradeableTools = 0;

  for (const { invocations, parameters } of report.tools) {
    const callsCorrect = tally(invocations, 'correct');
    const callsBroken = tally(invocations, 'tool_broken');
    const working = tally(parameters, 'works');
    const inert = tally(parameters, 'no_op');
    const faulty = tally(parameters, 'broken');

    infra += tally(invocations, 'infra');
    badParams += tally(invocations, 'bad_params');
    paramsWorking += working;
    paramsNoOp += inert;
    paramsBroken += faulty;
    paramsUntestable += tally(parameters, 'untestable');

    // Parameter verdicts share the accumulators used for invocations.
    const toolCorrect = callsCorrect + working;
    const toolBroken = callsBroken + inert + faulty;
    correct += toolCorrect;
    broken += toolBroken;
    if (toolCorrect + toolBroken > 0) gradeableTools += 1;
  }

  const graded = correct + broken;
  const score = graded === 0 ? 0 : (100 * correct) / graded;
  const minGraded = Math.max(2, gradeableTools);
  const verdict: AuditScore['verdict'] =
    graded === 0
      ? 'inconclusive'
      : score >= minScore && graded >= minGraded
        ? 'pass'
        : 'fail';

  return {
    score,
    correct,
    broken,
    infra,
    badParams,
    graded,
    paramsWorking,
    paramsNoOp,
    paramsBroken,
    paramsUntestable,
    verdict,
  };
}
204
+
205
/**
 * Names of the tools the auditor could never grade, in report order.
 *
 * A tool is gradeable under the same rule `computeAuditScore` applies when it
 * counts gradeable tools: it has at least one `correct` / `tool_broken`
 * invocation, or at least one scored parameter verdict (`works`, `no_op`,
 * `broken`). Everything else — only `infra` / `bad_params` invocations, only
 * `untestable` parameters, or nothing at all — is ungradeable.
 *
 * Sharing that rule keeps this list the exact complement of the tools that
 * contributed to the score, so a tool that was scored is never also reported
 * as ungradeable. Surfaced in the report so an un-exercisable tool is visible
 * rather than silently excluded from the score.
 *
 * @param report Parsed auditor report.
 * @returns Tool names with no graded verdict of either kind.
 */
export function ungradeableToolNames(report: AuditReport): string[] {
  return report.tools
    .filter((tool) => {
      const hasGradedInvocation = tool.invocations.some(
        (inv) => inv.verdict === 'correct' || inv.verdict === 'tool_broken',
      );
      // Scored parameter verdicts count as live signal too.
      const hasGradedParameter = tool.parameters.some(
        (param) =>
          param.verdict === 'works' || param.verdict === 'no_op' || param.verdict === 'broken',
      );
      return !hasGradedInvocation && !hasGradedParameter;
    })
    .map((tool) => tool.name);
}
215
+
216
/**
 * List every advertised parameter the auditor marked `untestable` (for example
 * an opaque enum with no constructible value, or a parameter of a
 * state-changing / bot-defended tool), so an unverifiable parameter is visible
 * in the report instead of silently passing.
 *
 * @param report Parsed auditor report.
 * @returns One `{ tool, name, reason }` entry per untestable parameter, in
 *   report order (tools first, then parameters within each tool).
 */
export function untestableParams(
  report: AuditReport,
): Array<{ tool: string; name: string; reason: string }> {
  return report.tools.flatMap((tool) =>
    tool.parameters
      .filter((param) => param.verdict === 'untestable')
      .map((param) => ({ tool: tool.name, name: param.name, reason: param.reason })),
  );
}
232
+
233
/** Inputs to `runAudit`. */
interface RunAuditOptions {
  /** Site whose generated tools are discovered and audited. */
  site: string;
  /** Minimum score (0-100) `computeAuditScore` requires for a `pass`. */
  minScore: number;
  /** File the JSON report is written to; the auditor transcript is written to
   * `.audit-transcript.txt` in the same directory. */
  outPath: string;
  /** Model for the auditor session; defaults to `preferredAgentModel('claude-cli')`. */
  model?: string;
  /** Wall-clock cap for the session; defaults to `DEFAULT_AUDIT_TIMEOUT_MS`. */
  timeoutMs?: number;
  /** When truthy, print the persisted result as JSON instead of the summary. */
  json?: boolean;
}
241
+
242
/**
 * Audit every generated tool of one site and return the deterministic score.
 *
 * Runs inside an `audit.session` trace span and, in order:
 *  1. discovers the site's tools under `imprintHomeDir()` and checks that the
 *     auditor system prompt file exists;
 *  2. collects the parameters flagged `verified === false` and the
 *     `sourcedFrom` producer→consumer token contracts, and hands both to
 *     `driveAudit` together with the tool names;
 *  3. scores the returned report with `computeAuditScore`, then overrides the
 *     verdict to `timeout` when the session timed out, or downgrades a `pass`
 *     to `inconclusive` when no invocation at all was graded `correct`;
 *  4. best-effort persists the transcript (`.audit-transcript.txt` next to
 *     `opts.outPath`) and the JSON report (`opts.outPath`) — a write failure is
 *     logged, not thrown;
 *  5. records score/usage attributes on the span and prints either the JSON
 *     result (`opts.json`) or a human-readable summary.
 *
 * @param opts See `RunAuditOptions`.
 * @returns The score with its final (possibly overridden) verdict.
 * @throws Error when the site has no generated tool, or when the audit system
 *   prompt file is missing.
 */
export async function runAudit(opts: RunAuditOptions): Promise<AuditScore> {
  return await traced(
    'audit.session',
    'AGENT',
    {
      'imprint.site': opts.site,
      'imprint.audit.min_score': opts.minScore,
    },
    async (span) => {
      const assetRoot = imprintHomeDir();
      const tools = await discoverTools(assetRoot, opts.site, '[imprint audit]');
      const toolCount = tools.length;
      if (toolCount === 0) {
        throw new Error(
          `No generated tool found for site "${opts.site}" — run \`imprint teach ${opts.site}\` first, then audit it.`,
        );
      }

      const model = opts.model ?? preferredAgentModel('claude-cli');
      const timeoutMs = opts.timeoutMs ?? DEFAULT_AUDIT_TIMEOUT_MS;
      const systemPromptPath = pathJoin(PROMPTS_DIR, 'audit-agent.md');
      if (!existsSync(systemPromptPath)) {
        throw new Error(
          `Audit system prompt not found at ${systemPromptPath}\n→ this is an Imprint installation problem; please file an issue at https://github.com/ashaychangwani/imprint/issues with the steps you ran.`,
        );
      }

      const toolNames = tools.map((t) => t.workflow.toolName);
      log(`auditing ${toolCount} tool(s) for site "${opts.site}": ${toolNames.join(', ')}`);

      // Parameters that shipped live-unverified at compile time (Fix D). Tell the
      // auditor to probe them especially — these are the most likely to be broken
      // (the compile-time differential could not confirm their effect).
      const unverifiedParams: Array<{ tool: string; params: string[] }> = [];
      for (const t of tools) {
        const params = (t.workflow.parameters ?? [])
          .filter((p) => p.verified === false)
          .map((p) => p.name);
        if (params.length > 0) unverifiedParams.push({ tool: t.workflow.toolName, params });
      }

      // Producer→consumer token contracts (sourcedFrom). Tell the auditor to chain
      // (call the producer, read the named field, feed the consumer) rather than
      // fabricate an opaque token — otherwise a correct chained tool false-fails.
      const tokenDeps: TokenDep[] = [];
      for (const t of tools) {
        for (const p of t.workflow.parameters ?? []) {
          if (p.sourcedFrom) {
            tokenDeps.push({
              tool: t.workflow.toolName,
              param: p.name,
              sourceTool: p.sourcedFrom.tool,
              sourceField: p.sourcedFrom.field,
            });
          }
        }
      }

      const drive = await driveAudit({
        site: opts.site,
        model,
        timeoutMs,
        systemPromptPath,
        toolNames,
        unverifiedParams,
        tokenDeps,
      });

      // The score comes from the verdict labels only; the model never supplies it.
      const rawScore = computeAuditScore(drive.report, opts.minScore);

      // Cross-reference compile-time live verification with the audit grade.
      // The downgrade rule's purpose is to surface "flying blind" runs —
      // ones where the gate has no positive evidence the framework works
      // for the audited site. Iterations of this rule:
      // v1: downgrade if any tool was liveVerified=false AND ungradeable
      // → too strict (downgraded perfectly-scoring runs when one
      // chained tool was unreachable from auditor's connected set).
      // v2: downgrade only if a flying-blind tool had infra invocations
      // → still over-attributed transient page-state to defects.
      // v3 (current): downgrade only when the audit produced ZERO
      // `correct` invocations across ALL tools. If even one
      // invocation graded correctly, that's positive evidence the
      // framework + runtime work for at least that tool — the
      // overall score (correct/(correct+broken)) is the honest
      // signal. Tools that couldn't be exercised still surface via
      // `ungradeableTools` / `unverifiedAndUngradeable` for visibility
      // without spoiling a verdict the score honestly earned.
      const ungradeableNames = ungradeableToolNames(drive.report);
      const untestableParamList = untestableParams(drive.report);
      const unverifiedAndUngradeable = tools
        .filter((t) => t.workflow.liveVerified === false)
        .map((t) => t.workflow.toolName)
        .filter((name) => ungradeableNames.includes(name));
      const anyCorrectAcrossAudit = drive.report.tools.some((t) =>
        t.invocations.some((i) => i.verdict === 'correct'),
      );
      let verdict = rawScore.verdict;
      // Timeout takes precedence over inconclusive downgrade.
      if (drive.timedOut) {
        verdict = 'timeout';
      } else if (rawScore.verdict === 'pass' && !anyCorrectAcrossAudit) {
        verdict = 'inconclusive';
      }
      const score: AuditScore = { ...rawScore, verdict };

      // Persist the auditor transcript next to the report so a stuck/killed run
      // can be inspected after the fact.
      let transcriptPath: string | undefined;
      if (drive.transcript) {
        transcriptPath = pathJoin(dirname(opts.outPath), '.audit-transcript.txt');
        try {
          mkdirSync(dirname(transcriptPath), { recursive: true });
          writeFileSync(transcriptPath, `${drive.transcript}\n`, 'utf8');
        } catch (err) {
          log(`failed to persist audit transcript to ${transcriptPath}: ${errMsg(err)}`);
          // Do not advertise a transcript path that was never written.
          transcriptPath = undefined;
        }
      }

      // TOTAL prompt (uncached + cache) for the cost calc; the cache split is
      // passed to llmSpanAttributes separately. Always a number here
      // (drive.inputTokens is non-null), so the cost-suppression happens via the
      // `|| undefined` at the call site below.
      const totalInputTokens = totalPromptTokens(
        drive.inputTokens,
        drive.cacheReadInputTokens,
        drive.cacheCreationInputTokens,
      );
      setSpanAttributes(span, {
        'imprint.audit.score': score.score,
        'imprint.audit.correct': score.correct,
        'imprint.audit.broken': score.broken,
        'imprint.audit.infra': score.infra,
        'imprint.audit.bad_params': score.badParams,
        'imprint.audit.graded': score.graded,
        'imprint.audit.params_working': score.paramsWorking,
        'imprint.audit.params_no_op': score.paramsNoOp,
        'imprint.audit.params_broken': score.paramsBroken,
        'imprint.audit.params_untestable': score.paramsUntestable,
        'imprint.audit.tool_count': toolCount,
        'imprint.audit.verdict': score.verdict,
        'imprint.audit.unverified_and_ungradeable_count': unverifiedAndUngradeable.length,
        'imprint.audit.timed_out': drive.timedOut,
        'imprint.audit.turns': drive.turns,
        ...(drive.totalCostUsd != null ? { 'imprint.audit.cost_usd': drive.totalCostUsd } : {}),
        ...llmSpanAttributes({
          provider: 'claude-cli',
          model,
          // `|| undefined`: when no usage was captured (e.g. spawn failure → 0
          // tokens), suppress a bogus $0 cost instead of emitting it.
          inputTokens: totalInputTokens || undefined,
          outputTokens: drive.outputTokens || undefined,
          cacheReadTokens: drive.cacheReadInputTokens || undefined,
          cacheWriteTokens: drive.cacheCreationInputTokens || undefined,
        }),
      });

      // Persist the full result (deterministic score + the raw model report).
      const persisted = {
        ...score,
        report: drive.report,
        site: opts.site,
        toolCount,
        ungradeableTools: ungradeableNames,
        /** Advertised parameters the auditor could not differentially test. */
        untestableParams: untestableParamList,
        /** Tools that shipped without live verification at compile time AND
         * could not be graded at audit time — zero live signal anywhere. */
        unverifiedAndUngradeable,
        minScore: opts.minScore,
        timedOut: drive.timedOut,
        turns: drive.turns,
        costUsd: drive.totalCostUsd,
        inputTokens: drive.inputTokens,
        outputTokens: drive.outputTokens,
        cacheReadInputTokens: drive.cacheReadInputTokens,
        cacheCreationInputTokens: drive.cacheCreationInputTokens,
        transcriptPath,
      };
      // Best-effort write: a failure is logged and the score is still returned.
      try {
        mkdirSync(dirname(opts.outPath), { recursive: true });
        writeFileSync(opts.outPath, `${JSON.stringify(persisted, null, 2)}\n`, 'utf8');
      } catch (err) {
        log(`failed to persist audit report to ${opts.outPath}: ${errMsg(err)}`);
      }

      if (opts.json) {
        console.log(JSON.stringify(persisted, null, 2));
      } else {
        printSummary(opts, score, toolCount, {
          timedOut: drive.timedOut,
          timeoutMs,
          transcriptPath,
          costUsd: drive.totalCostUsd,
          unverifiedAndUngradeable,
          report: drive.report,
        });
      }

      return score;
    },
  );
}
445
+
446
/** A consumer param whose value is minted by a sibling producer tool's output
 * field (from `workflow.json` `param.sourcedFrom`). */
interface TokenDep {
  /** Consumer tool name (`workflow.toolName`). */
  tool: string;
  /** Name of the consumer parameter that must receive the produced value. */
  param: string;
  /** Producer tool to call first (`sourcedFrom.tool`). */
  sourceTool: string;
  /** Output field of the producer that carries the value (`sourcedFrom.field`). */
  sourceField: string;
}
454
+
455
/**
 * Render the prompt section that tells the auditor how to obtain
 * producer-sourced token parameters: call the producer first, read the named
 * output field, pass that value to the consumer, and never fabricate one.
 * Pure, so it can be unit-tested without spawning an audit session.
 *
 * @param tokenDeps Producer→consumer contracts; may be empty.
 * @returns `''` when there are no contracts. Otherwise a block that starts with
 *   two newlines (so it can be appended directly to a prompt) and holds one
 *   bullet line per contract followed by the fallback classification rule.
 */
export function buildTokenDepNote(tokenDeps: TokenDep[]): string {
  if (tokenDeps.length === 0) {
    return '';
  }

  const bullets: string[] = [];
  for (const { tool, param, sourceTool, sourceField } of tokenDeps) {
    bullets.push(
      `- ${tool}(${param}) ← first call ${sourceTool}, then pass its \`${sourceField}\` output value`,
    );
  }

  const preamble =
    'Some parameters are opaque tokens/ids minted by ANOTHER tool — you cannot fabricate them. For each below, call the producer tool first, read the named output field from its result, and pass that exact value to the consumer (reuse it across calls; no need to re-fetch each time):';
  const fallback =
    'If you cannot obtain such a value because the producer is blocked, classify the consumer call `bad_params`, never `tool_broken`.';

  return `\n\n${preamble}\n${bullets.join('\n')}\n${fallback}`;
}
468
+
469
/** Inputs to `driveAudit`, assembled by `runAudit`. */
interface DriveAuditOptions {
  /** Site under audit; used for the inline MCP server's name and its
   * `mcp-server <site>` argument, and quoted in the auditor prompt. */
  site: string;
  /** Model identifier chosen by `runAudit` for the auditor session. */
  model: string;
  /** Wall-clock cap for the session, in milliseconds. */
  timeoutMs: number;
  /** Path to the auditor system prompt file (`runAudit` checks that it exists). */
  systemPromptPath: string;
  /** Names of the site's tools; each becomes an `--allowedTools` entry and the
   * count is quoted in the auditor prompt. */
  toolNames: string[];
  /** Per-tool params that shipped live-unverified at compile time. */
  unverifiedParams: Array<{ tool: string; params: string[] }>;
  /** Producer→consumer token contracts (param.sourcedFrom) so the auditor chains. */
  tokenDeps: TokenDep[];
}
480
+
481
/** Outcome of one auditor session as returned by `driveAudit`. */
interface DriveAuditResult {
  /** Parsed auditor report; an empty report when none could be recovered. */
  report: AuditReport;
  /** False when no report parsed (empty report substituted). */
  reportRecovered: boolean;
  /** True when the session was cut off by the deadline; `runAudit` then forces
   * the verdict to `timeout`. */
  timedOut: boolean;
  /** Turn count for the session, recorded on the trace span and in the report.
   * NOTE(review): presumably the number of assistant turns — confirm in `driveAudit`. */
  turns: number;
  /** Full assistant transcript for diagnosis (empty if the session never spoke). */
  transcript: string;
  /** Token usage counters; all 0 when no usage was captured. `runAudit` combines
   * `inputTokens` with the two cache counters via `totalPromptTokens`. */
  inputTokens: number;
  outputTokens: number;
  cacheReadInputTokens: number;
  cacheCreationInputTokens: number;
  /** Authoritative cost from the claude CLI's `result` event, when reported. */
  totalCostUsd: number | null;
}
496
+
497
/**
 * Build the placeholder `DriveAuditResult` used when a session produced no
 * data (spawn failure or an empty run): an empty report, zeroed usage
 * counters, an empty transcript, and no cost.
 *
 * @returns A fresh result object on every call.
 */
function emptyDriveAuditResult(): DriveAuditResult {
  // Parsing `{}` lets the schema defaults supply the empty report.
  const blankReport = AuditReportSchema.parse({});
  const noUsage = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadInputTokens: 0,
    cacheCreationInputTokens: 0,
  };
  const result: DriveAuditResult = {
    report: blankReport,
    reportRecovered: false,
    timedOut: false,
    turns: 0,
    transcript: '',
    ...noUsage,
    totalCostUsd: null,
  };
  return result;
}
512
+
513
+ /**
514
+ * Spawn a headless `claude` session against the site's real MCP server, drive
515
+ * it to completion, and recover the structured report from the final assistant
516
+ * message. The real `mcp-server` has no write/submit tool, so the report must
517
+ * ride back in the model's text — we extract the last fenced ```json block (or
518
+ * the last balanced top-level object) and validate it. Any unrecoverable report
519
+ * degrades to an empty (→ inconclusive) report rather than crashing the gate.
520
+ */
521
async function driveAudit(opts: DriveAuditOptions): Promise<DriveAuditResult> {
  // Distinct from the persistent `imprint-<site>` server that `imprint teach`
  // registers with Claude Code: a same-named inline server collides and claude
  // marks ours "disabled" (even under --strict-mcp-config), leaving the auditor
  // with zero tools. The `imprint-audit-` prefix keeps the inline server unique.
  const serverName = `imprint-audit-${opts.site}`;
  // The inline MCP server is launched with the same executable that is running
  // this process.
  const bunPath = process.execPath;
  // Inline MCP configuration, serialized and handed to claude via --mcp-config.
  const mcpConfig = {
    mcpServers: {
      [serverName]: {
        command: bunPath,
        args: ['run', CLI_PATH, 'mcp-server', opts.site],
        // Pace every audit tool call: the auditor now differentially probes
        // bot-defended idempotent reads (search/calendar) instead of bailing
        // after one call, so a deliberate inter-call delay keeps the probing
        // steady enough that the per-IP anti-bot defense isn't tripped. Only
        // the audit sets this; production mcp-server runs unpaced.
        // NOTE(review): the prompt below still tells the auditor to make ONE
        // invocation for bot-defended tools, which appears to contradict the
        // "differentially probes" wording here — confirm which one is current.
        env: { IMPRINT_AUDIT_PACING_MS: '5000' },
      },
    },
  };

  // One `--allowedTools mcp__<server>__<tool>` pair per tool under audit.
  const allowedToolArgs: string[] = [];
  for (const name of opts.toolNames) {
    allowedToolArgs.push('--allowedTools', `mcp__${serverName}__${name}`);
  }

  // Optional prompt paragraph listing parameters that shipped unverified; empty
  // when there are none so the prompt is unchanged in the common case.
  const unverifiedNote =
    opts.unverifiedParams.length > 0
      ? `\n\nThese parameters shipped WITHOUT a passing compile-time verification, so they are the HIGHEST priority for your per-parameter differential pass: ${opts.unverifiedParams
          .map((u) => `${u.tool}(${u.params.join(', ')})`)
          .join(
            '; ',
          )}. Give each one a \`parameters\` verdict (works / no_op / broken / untestable) like any other — do not let an unverified parameter pass without a differential test. (Per the ONE-invocation rule, a state-changing or bot-defended tool is the exception: mark its parameters \`untestable\` rather than probing.)`
      : '';

  // The user-turn prompt. It ends by requiring a single fenced json block, which
  // is what `extractReport` looks for first when recovering the report.
  const initialPrompt = `Audit every MCP tool connected to you for the site "${opts.site}".

There are ${opts.toolNames.length} connected tool(s). For each one: read its description and input schema, invoke it with a realistic parameter set, judge the result, and classify each invocation as correct | tool_broken | infra | bad_params per your system prompt. You MAY add one or two edge-case invocations ONLY for tools that are cheap reads not behind an anti-bot/rate defense.

ANTI-BOT / STATE-CHANGING TOOLS — ONE invocation only. If a tool drives a state-changing call (a search/booking .act-style POST) or its origin is bot-defended (the first call is slow/tarpitted, or returns 403/429/challenge/anti-bot), do EXACTLY ONE realistic invocation for that tool and move on — do NOT add edge cases. Repeated state-changing calls trip the site's per-IP rate defense, which then tarpits EVERY later call across all tools and ruins the whole audit. One clean read per such tool is enough to grade it; extra probes only convert a passing audit into a tarpitted one.

IMPORTANT: Call tools strictly sequentially — issue exactly one tool call, wait for its result, then issue the next. Never issue tool calls in parallel or batch them in one turn. Many target sites share an anti-bot defense across endpoints, so a parallel burst trips a site-wide rate-limit (HTTP 429) that then poisons every later call. If a call returns a 429 / rate-limit / anti-bot result, classify it \`infra\` and pause before the next call.${unverifiedNote}${buildTokenDepNote(opts.tokenDeps)}

When you are done, end your final message with exactly one fenced \`\`\`json block containing the full report and nothing after it.`;

  // claude CLI argv. The prompt is the final positional argument.
  const args = [
    '--print',
    '--output-format',
    'stream-json',
    '--verbose',
    '--strict-mcp-config',
    '--mcp-config',
    JSON.stringify(mcpConfig),
    '--system-prompt-file',
    opts.systemPromptPath,
    // Disable the built-in tool set so claude only uses the site's MCP tools.
    '--tools',
    '',
    ...allowedToolArgs,
    '--max-turns',
    '200',
    '--permission-mode',
    'bypassPermissions',
    '--no-session-persistence',
    '--disable-slash-commands',
    '--effort',
    'high',
    '--model',
    opts.model,
    initialPrompt,
  ];

  log(`spawning claude (model=${opts.model}, mcp-server=${serverName})`);

  let child: ChildProcess;
  try {
    child = spawn('claude', args, {
      cwd: REPO_ROOT,
      // Claude CLI's default MCP_TOOL_TIMEOUT is 60s. The audit-time MCP
      // server's tool calls walk the backend ladder for each invocation —
      // fetch (30s) → fetch-bootstrap (30s) → stealth-fetch (30s) →
      // playbook (5–30s), worst case ~2 min. Bump to 5 min (covers
      // realistic worst case with margin) but NOT to 30 min like the
      // compile side: the compile MCP needs that long because `done` runs
      // bun-test verification inline, but the audit MCP doesn't — each
      // audit tool call is just a single workflow execution. A longer
      // timeout here would burn the audit's overall 30-min deadline
      // on a handful of hanging calls (compiled tools that hang on bad
      // inputs) before the auditor finishes grading. Honor user-set env.
      env: {
        ...process.env,
        MCP_TOOL_TIMEOUT: process.env.MCP_TOOL_TIMEOUT ?? '300000',
        MCP_TIMEOUT: process.env.MCP_TIMEOUT ?? '60000',
      },
      // stdin is closed; stdout (stream-json events) and stderr are piped so
      // collectAssistantText can read them.
      stdio: ['ignore', 'pipe', 'pipe'],
    });
  } catch (err) {
    // Only synchronous spawn failures land here; a failure reported through the
    // child's 'error' event is handled inside collectAssistantText.
    log(`failed to spawn claude: ${errMsg(err)}`);
    return emptyDriveAuditResult();
  }

  const session = await collectAssistantText(child, opts.timeoutMs);
  const report = extractReport(session.text);
  if (!report) {
    log(
      session.timedOut
        ? 'audit hit the deadline before producing a report — treating as timeout'
        : 'no valid audit report recovered from the auditor — treating as inconclusive',
    );
  }
  return {
    // An unrecoverable report falls back to whatever the schema produces for an
    // empty object; `reportRecovered` tells the caller which case this is.
    report: report ?? AuditReportSchema.parse({}),
    reportRecovered: report !== undefined,
    timedOut: session.timedOut,
    turns: session.turns,
    transcript: session.transcript,
    inputTokens: session.inputTokens,
    outputTokens: session.outputTokens,
    cacheReadInputTokens: session.cacheReadInputTokens,
    cacheCreationInputTokens: session.cacheCreationInputTokens,
    totalCostUsd: session.totalCostUsd,
  };
}
645
+
646
/** Everything recovered from one audit session: the text to extract the report
 * from, a full transcript for diagnosis, token/cost usage, and whether the
 * deadline guard had to kill the child. */
interface AuditSessionResult {
  /** Report-extraction source: the terminal result event, or the concatenated
   * assistant text if the run was cut off before producing one. */
  text: string;
  /** Full assistant reasoning across every turn, persisted for diagnosis
   * (every streamed assistant text block, separated by blank lines). */
  transcript: string;
  /** True when the wall-clock deadline fired and the child was signalled. */
  timedOut: boolean;
  /** Number of `assistant` stream events observed (one increment per event). */
  turns: number;
  /** Input tokens: summed per event, replaced by the terminal `result` totals
   * when that event carries usage. The three counters below follow the same rule. */
  inputTokens: number;
  /** Output tokens. */
  outputTokens: number;
  /** Input tokens served from the prompt cache. */
  cacheReadInputTokens: number;
  /** Input tokens written to the prompt cache. */
  cacheCreationInputTokens: number;
  /** Cost reported by the terminal `result` event; null when none was seen. */
  totalCostUsd: number | null;
}
663
+
664
/** Drain the stream-json events, accumulating assistant text + token/cost usage,
 * and resolve when the child has exited and its stdout has been drained.
 * Enforces the wall-clock timeout by killing the child; reports `timedOut` so a
 * cut-off run is a loud, distinct outcome rather than a silent empty
 * (→ inconclusive) report.
 * Emits a one-line-per-event progress log to stderr so operators can `tail -f`
 * the audit log file and see live what the auditor is doing — without this
 * the audit is a 30-minute black box.
 *
 * @param child     The spawned claude process (stdout/stderr piped).
 * @param timeoutMs Wall-clock budget; on expiry the child gets SIGTERM, then
 *                  SIGKILL if it is still alive a few seconds later.
 * @returns The recovered text, transcript, usage counters and timeout flag.
 */
async function collectAssistantText(
  child: ChildProcess,
  timeoutMs: number,
): Promise<AuditSessionResult> {
  /** Grace period between SIGTERM and SIGKILL. */
  const KILL_ESCALATION_MS = 5000;
  /** How long to wait for stdio to close after 'exit' before giving up. */
  const STDIO_DRAIN_GRACE_MS = 2000;

  const chunks: string[] = [];
  let resultText = '';
  let stdoutBuf = '';
  let killed = false;
  let turns = 0;
  // Accumulated per-event so a killed run still reports partial usage; the
  // terminal `result` event (when present) overwrites with the authoritative
  // cumulative totals. Mirrors the compile path (claude-cli-compile.ts).
  let inputTokens = 0;
  let outputTokens = 0;
  let cacheReadInputTokens = 0;
  let cacheCreationInputTokens = 0;
  let totalCostUsd: number | null = null;
  const t0 = Date.now();
  const elapsedStr = (): string => {
    const s = Math.floor((Date.now() - t0) / 1000);
    return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
  };

  // `child.killed` only says a signal was *sent*, so it is already true right
  // after SIGTERM. Whether the process is actually gone is told by exitCode /
  // signalCode, which stay null while it is still running.
  const hasExited = (): boolean => child.exitCode !== null || child.signalCode !== null;

  let escalation: ReturnType<typeof setTimeout> | undefined;
  const timer = setTimeout(() => {
    killed = true;
    log(`audit exceeded ${formatDeadline(timeoutMs)} deadline, terminating claude`);
    try {
      child.kill('SIGTERM');
    } catch {
      // already gone
    }
    escalation = setTimeout(() => {
      if (hasExited()) return;
      try {
        child.kill('SIGKILL');
      } catch {
        // already gone
      }
    }, KILL_ESCALATION_MS);
  }, timeoutMs);

  /** Parse one stream-json line and fold it into the running session state. */
  const handleLine = (rawLine: string): void => {
    const line = rawLine.trim();
    if (!line) return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      return;
    }
    // Valid JSON that is not an object (`null`, a number, …) is not an event;
    // dereferencing it below would throw inside the stream handler.
    if (typeof parsed !== 'object' || parsed === null) return;
    const evt = parsed as StreamJsonEvent;

    // Token accounting from any event that carries usage (event-level or on
    // the nested assistant message).
    const eu = evt.usage;
    const mu = evt.message?.usage;
    inputTokens += (eu?.input_tokens ?? 0) + (mu?.input_tokens ?? 0);
    outputTokens += (eu?.output_tokens ?? 0) + (mu?.output_tokens ?? 0);
    cacheReadInputTokens +=
      (eu?.cache_read_input_tokens ?? 0) + (mu?.cache_read_input_tokens ?? 0);
    cacheCreationInputTokens +=
      (eu?.cache_creation_input_tokens ?? 0) + (mu?.cache_creation_input_tokens ?? 0);

    // Live progress signal: one log line per tool_use / tool_result /
    // text-snippet event with [elapsed]. Lets `tail -f` show what the
    // auditor is doing in real time instead of waiting 30-60 min for
    // the final report.
    if (evt.type === 'assistant' && Array.isArray(evt.message?.content)) {
      turns++;
      for (const block of evt.message.content) {
        if (!block) continue;
        if (block.type === 'text' && typeof block.text === 'string') {
          chunks.push(block.text);
          const preview = block.text.replace(/\s+/g, ' ').slice(0, 120);
          log(`[${elapsedStr()}] assistant: ${preview}`);
        } else if (block.type === 'tool_use' && typeof block.name === 'string') {
          const inputPreview = block.input ? JSON.stringify(block.input).slice(0, 120) : '';
          log(
            `[${elapsedStr()}] tool_use: ${block.name}${inputPreview ? ` ${inputPreview}` : ''}`,
          );
        }
      }
    } else if (evt.type === 'user' && Array.isArray(evt.message?.content)) {
      for (const block of evt.message.content) {
        if (!block) continue;
        if (block.type === 'tool_result') {
          const raw = Array.isArray(block.content)
            ? (block.content[0]?.text ?? '')
            : typeof block.content === 'string'
              ? block.content
              : '';
          const preview = String(raw).replace(/\s+/g, ' ').slice(0, 140);
          const errMark = block.is_error ? ' (error)' : '';
          log(`[${elapsedStr()}] tool_result${errMark}: ${preview}`);
        }
      }
    } else if (evt.type === 'result') {
      // The terminal result event carries the final assistant message verbatim
      // plus the authoritative cumulative usage + cost.
      if (typeof evt.result === 'string') {
        resultText = evt.result;
        log(`[${elapsedStr()}] result event received (${evt.result.length} chars)`);
      }
      if (evt.usage) {
        inputTokens = evt.usage.input_tokens ?? inputTokens;
        outputTokens = evt.usage.output_tokens ?? outputTokens;
        cacheReadInputTokens = evt.usage.cache_read_input_tokens ?? cacheReadInputTokens;
        cacheCreationInputTokens =
          evt.usage.cache_creation_input_tokens ?? cacheCreationInputTokens;
      }
      if (typeof evt.total_cost_usd === 'number') totalCostUsd = evt.total_cost_usd;
    }
  };

  // Decode at the stream level so a multi-byte UTF-8 character split across two
  // chunks is reassembled instead of being turned into replacement characters.
  child.stdout?.setEncoding('utf8');
  child.stdout?.on('data', (chunk: string | Buffer) => {
    stdoutBuf += typeof chunk === 'string' ? chunk : chunk.toString('utf8');
    let nl = stdoutBuf.indexOf('\n');
    while (nl >= 0) {
      const line = stdoutBuf.slice(0, nl);
      stdoutBuf = stdoutBuf.slice(nl + 1);
      handleLine(line);
      nl = stdoutBuf.indexOf('\n');
    }
  });

  child.stderr?.setEncoding('utf8');
  child.stderr?.on('data', (chunk: string | Buffer) => {
    const text = typeof chunk === 'string' ? chunk : chunk.toString('utf8');
    log(`[claude stderr] ${text.trim()}`);
  });

  await new Promise<void>((resolve) => {
    let drainGrace: ReturnType<typeof setTimeout> | undefined;
    const finish = (): void => {
      if (drainGrace) clearTimeout(drainGrace);
      resolve();
    };
    // 'close' fires only after the stdio streams have ended, so the terminal
    // `result` event cannot still be sitting unread in the pipe.
    child.once('close', finish);
    // Safety net: if something else keeps the pipes open after the process has
    // exited, don't wait for 'close' forever.
    child.once('exit', () => {
      drainGrace = setTimeout(finish, STDIO_DRAIN_GRACE_MS);
    });
    child.once('error', (err) => {
      log(`claude process error: ${errMsg(err)}`);
      finish();
    });
  });
  clearTimeout(timer);
  // Keep the SIGKILL escalation armed only if the process is somehow still alive.
  if (escalation && hasExited()) clearTimeout(escalation);

  // A final event that was not newline-terminated is still a valid event.
  if (stdoutBuf.trim()) handleLine(stdoutBuf);
  stdoutBuf = '';

  if (killed) log('audit session was terminated by the deadline guard');

  return {
    // Prefer the terminal result event (the complete final message); fall back to
    // the concatenated streamed assistant text if the result event was absent.
    text: resultText || chunks.join('\n'),
    transcript: chunks.join('\n\n'),
    timedOut: killed,
    turns,
    inputTokens,
    outputTokens,
    cacheReadInputTokens,
    cacheCreationInputTokens,
    totalCostUsd,
  };
}
814
+
815
/** Token counters as they appear on stream-json events. Every field is optional;
 * readers treat a missing field as zero (or keep their running total). */
interface StreamUsage {
  /** Prompt tokens. */
  input_tokens?: number;
  /** Generated tokens. */
  output_tokens?: number;
  /** Prompt tokens read from the cache. */
  cache_read_input_tokens?: number;
  /** Prompt tokens written to the cache. */
  cache_creation_input_tokens?: number;
}
821
+
822
/** One parsed line of claude's `--output-format stream-json` output, limited to
 * the fields this file reads. */
interface StreamJsonEvent {
  /** Event discriminator; this file handles 'assistant', 'user' and 'result'. */
  type: string;
  /** Message payload carried by 'assistant' and 'user' events. */
  message?: {
    /** Content blocks; `type` is 'text', 'tool_use' or 'tool_result' for the
     * blocks this file inspects, and the other fields depend on that type. */
    content?: Array<{
      type?: string;
      /** Assistant prose ('text' blocks). */
      text?: string;
      /** Tool name ('tool_use' blocks). */
      name?: string;
      /** Tool arguments ('tool_use' blocks). */
      input?: unknown;
      tool_use_id?: string;
      /** Tool output ('tool_result' blocks): read either as a string or as an
       * array whose first element has a `text` field. */
      content?: unknown;
      /** Set on a 'tool_result' block whose call failed. */
      is_error?: boolean;
    }>;
    /** Usage attached to the nested message. */
    usage?: StreamUsage;
  };
  /** Final cumulative usage + cost ride on the terminal `result` event. */
  usage?: StreamUsage;
  total_cost_usd?: number;
  /** Final assistant message text, present on the terminal `result` event. */
  result?: string;
}
841
+
842
/**
 * Pull the structured audit report out of the auditor's free-form text.
 *
 * Candidates come from `jsonCandidates` in best-first order (the LAST fenced
 * ```json block first, because the prompt requires the report to close the
 * message, then the last balanced top-level {…} object). The first candidate
 * that both parses as JSON and satisfies `AuditReportSchema` wins.
 *
 * @param text The auditor's final message (or concatenated assistant text).
 * @returns The validated report, or undefined when no candidate parses and
 *          validates (including when `text` is empty).
 */
export function extractReport(text: string): AuditReport | undefined {
  if (!text) return undefined;

  // Parse + validate a single candidate; any throw means "not this one".
  const attempt = (candidate: string): AuditReport | undefined => {
    try {
      const validation = AuditReportSchema.safeParse(JSON.parse(candidate));
      return validation.success ? validation.data : undefined;
    } catch {
      // try the next candidate
      return undefined;
    }
  };

  for (const candidate of jsonCandidates(text)) {
    const report = attempt(candidate);
    if (report !== undefined) return report;
  }
  return undefined;
}
861
+
862
/** Collect JSON candidate strings in best-first order: the bodies of every
 * ```json fenced block (last block first), followed by every balanced
 * top-level {…} object found anywhere in the text (last object first). */
function jsonCandidates(text: string): string[] {
  const fencedBodies: string[] = [];
  for (const [, body] of text.matchAll(/```json\s*([\s\S]*?)```/gi)) {
    // An empty capture means the fence held nothing but whitespace.
    if (body) fencedBodies.push(body.trim());
  }
  const looseObjects = balancedObjects(text);
  return [...fencedBodies.reverse(), ...looseObjects.reverse()];
}
874
+
875
/** Extract every balanced top-level {…} substring (brace-depth scan, ignoring
 * braces inside strings). Good enough to recover an un-fenced final object.
 *
 * The first scan treats every `"` as a string delimiter, including quotes in
 * the prose around the objects. A single unpaired quote in that prose (for
 * example an inch mark) would otherwise desynchronise the scan and hide every
 * object after it. That situation is detectable: the scan ends inside a string
 * that was opened outside any object. Only then is the text rescanned with
 * prose quotes ignored. An object truncated mid-string still yields nothing.
 *
 * @param text Arbitrary text that may embed JSON objects.
 * @returns The balanced top-level object substrings, in order of appearance.
 */
function balancedObjects(text: string): string[] {
  const strict = scanBalancedObjects(text, true);
  if (!strict.danglingProseQuote) return strict.objects;
  return scanBalancedObjects(text, false).objects;
}

/** One brace-depth pass over `text`. When `quotesInProse` is false, a `"` seen
 * outside any object is ignored rather than opening a string. Reports whether
 * the pass ended inside a string that had been opened at depth 0. */
function scanBalancedObjects(
  text: string,
  quotesInProse: boolean,
): { objects: string[]; danglingProseQuote: boolean } {
  const objects: string[] = [];
  let depth = 0;
  let start = -1;
  let inString = false;
  let escaped = false;
  // Brace depth at which the current string was opened.
  let stringDepth = 0;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      if (depth > 0 || quotesInProse) {
        inString = true;
        stringDepth = depth;
      }
    } else if (ch === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (ch === '}') {
      // A stray closing brace at depth 0 is ignored.
      if (depth > 0) {
        depth--;
        if (depth === 0 && start >= 0) {
          objects.push(text.slice(start, i + 1));
          start = -1;
        }
      }
    }
  }
  return { objects, danglingProseQuote: inString && stringDepth === 0 };
}
908
+
909
/**
 * Print the human-readable audit summary to stdout, one `[imprint]`-prefixed
 * line at a time: verdict, score, graded-unit breakdown, per-parameter detail
 * for every parameter that did not cleanly work, cost, tools with no live
 * signal, a verdict-specific hint, and finally the transcript/report paths.
 *
 * @param opts      Run options; `site`, `minScore` and `outPath` are printed.
 * @param score     Aggregated score and counters.
 * @param toolCount Number of tools the audit covered.
 * @param extra     Session details (deadline, transcript path, cost, the list
 *                  of unverified-and-ungradeable tools, and the full report).
 */
function printSummary(
  opts: RunAuditOptions,
  score: AuditScore,
  toolCount: number,
  extra: {
    timedOut: boolean;
    timeoutMs: number;
    transcriptPath?: string;
    costUsd?: number | null;
    unverifiedAndUngradeable: string[];
    report: AuditReport;
  },
): void {
  // `score.correct`/`score.broken` now blend invocation and parameter verdicts;
  // split them back out so the "graded" line counts only actual tool calls.
  const testedParams = score.paramsWorking + score.paramsNoOp + score.paramsBroken;
  const gradedCalls = score.graded - testedParams;
  const totalCalls = gradedCalls + score.infra + score.badParams;
  const scoreLabel = score.graded === 0 ? 'n/a' : `${score.score.toFixed(1)}%`;
  const blindTools = extra.unverifiedAndUngradeable;

  console.log(`[imprint] audit "${opts.site}" — ${score.verdict.toUpperCase()}`);
  console.log(
    `[imprint] score ${scoreLabel} (${score.correct} correct / ${score.broken} broken; threshold ${opts.minScore}%)`,
  );
  console.log(
    `[imprint] graded ${score.graded} unit(s) = ${gradedCalls}/${totalCalls} invocation(s) + ${testedParams} parameter(s) across ${toolCount} tool(s) — excluded: ${score.infra} infra, ${score.badParams} bad_params, ${score.paramsUntestable} untestable param(s)`,
  );

  if (testedParams + score.paramsUntestable > 0) {
    console.log(
      `[imprint] parameters: ${score.paramsWorking}/${testedParams} working — ${score.paramsNoOp} no-op, ${score.paramsBroken} broken, ${score.paramsUntestable} untestable`,
    );
    // Per the "no-op/untested isn't a free pass" rule: list every parameter that
    // did not cleanly work, with the auditor's evidence, so the operator sees
    // exactly which advertised parameters don't function.
    for (const tool of extra.report.tools) {
      const notWorking = tool.parameters.filter((param) => param.verdict !== 'works');
      if (notWorking.length === 0) continue;
      const workingCount = tool.parameters.length - notWorking.length;
      // Denominator excludes untestable params, matching the top-level line.
      const testedCount = tool.parameters.reduce(
        (count, param) => (param.verdict === 'untestable' ? count : count + 1),
        0,
      );
      console.log(`[imprint] ${tool.name} (${workingCount}/${testedCount} working):`);
      for (const param of notWorking) {
        const mark = param.verdict === 'untestable' ? '⚪' : '✗';
        console.log(
          `[imprint] ${mark} ${param.name} — ${param.verdict}: ${param.reason || '(no reason)'}`,
        );
      }
    }
  }

  if (extra.costUsd != null) {
    console.log(`[imprint] cost ≈ $${extra.costUsd.toFixed(2)}`);
  }

  if (blindTools.length > 0) {
    console.log(
      `[imprint] ${blindTools.length} tool(s) flying blind (no live verification at compile, no graded calls at audit): ${blindTools.join(', ')}`,
    );
  }

  switch (score.verdict) {
    case 'timeout':
      console.log(
        `[imprint] audit was killed at the ${formatDeadline(extra.timeoutMs)} deadline before finishing — partial results only. Re-run with a longer --timeout, or inspect the transcript to see where it stalled.`,
      );
      break;
    case 'inconclusive':
      // Two different causes share this verdict; say which one applies.
      console.log(
        blindTools.length > 0
          ? '[imprint] verdict downgraded to inconclusive because at least one tool has zero live signal anywhere.'
          : '[imprint] no gradeable invocations (likely anti-bot / network) — re-run; this is not a code failure.',
      );
      break;
    default:
      break;
  }

  if (extra.transcriptPath) {
    console.log(`[imprint] transcript → ${extra.transcriptPath}`);
  }
  console.log(`[imprint] report → ${opts.outPath}`);
}
985
+
986
/**
 * Turn any thrown value into a log-friendly message.
 *
 * @param err The caught value (anything can be thrown in JavaScript).
 * @returns `err.message` for Error instances and for plain objects carrying a
 *          string `message`; otherwise the value's string form. An object whose
 *          string form is the generic "[object Object]" is rendered as JSON
 *          instead, when it can be serialised.
 */
function errMsg(err: unknown): string {
  if (err instanceof Error) return err.message;
  if (typeof err === 'object' && err !== null) {
    // Error-like objects that are not Error instances.
    if ('message' in err && typeof err.message === 'string') return err.message;
    const text = String(err);
    if (text !== '[object Object]') return text;
    try {
      const json = JSON.stringify(err);
      if (typeof json === 'string') return json;
    } catch {
      // circular or otherwise unserialisable — fall back to the generic form
    }
    return text;
  }
  return String(err);
}
989
+
990
/** Human-readable deadline, e.g. "20-minute" or "25-second" (sub-minute timeouts
 * shouldn't round to "0-minute"). Values are shown to at most one decimal place
 * so a 90-second deadline reads "1.5-minute" rather than being rounded up to
 * "2-minute", and a sub-second one reads e.g. "0.5-second" rather than
 * "0-second". Whole numbers print without a decimal.
 *
 * @param timeoutMs Deadline in milliseconds.
 * @returns The deadline as "<n>-second" below one minute, else "<n>-minute".
 */
function formatDeadline(timeoutMs: number): string {
  // toFixed(1) rounds to one decimal; Number() then drops a trailing ".0".
  const oneDecimal = (value: number): string => String(Number(value.toFixed(1)));
  return timeoutMs < 60_000
    ? `${oneDecimal(timeoutMs / 1000)}-second`
    : `${oneDecimal(timeoutMs / 60_000)}-minute`;
}
996
+ }