imprint-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. package/README.md +165 -201
  2. package/examples/discoverandgo/README.md +1 -1
  3. package/examples/echo/README.md +1 -1
  4. package/examples/google-flights/README.md +28 -0
  5. package/examples/google-flights/_shared/batchexecute.ts +63 -0
  6. package/examples/google-flights/_shared/flights_request.ts +95 -0
  7. package/examples/google-flights/_shared/package.json +9 -0
  8. package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
  9. package/examples/google-flights/get_flight_booking_details/package.json +9 -0
  10. package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
  11. package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
  12. package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
  13. package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
  14. package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
  15. package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
  16. package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
  17. package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
  18. package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
  19. package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
  20. package/examples/google-flights/lookup_airport/index.ts +101 -0
  21. package/examples/google-flights/lookup_airport/package.json +9 -0
  22. package/examples/google-flights/lookup_airport/parser.ts +66 -0
  23. package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
  24. package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
  25. package/examples/google-flights/lookup_airport/workflow.json +57 -0
  26. package/examples/google-flights/search_flights/index.ts +219 -0
  27. package/examples/google-flights/search_flights/package.json +9 -0
  28. package/examples/google-flights/search_flights/parser.ts +169 -0
  29. package/examples/google-flights/search_flights/playbook.yaml +184 -0
  30. package/examples/google-flights/search_flights/request-transform.ts +119 -0
  31. package/examples/google-flights/search_flights/workflow.json +143 -0
  32. package/examples/google-hotels/README.md +29 -0
  33. package/examples/google-hotels/_shared/batchexecute.ts +73 -0
  34. package/examples/google-hotels/_shared/freq.ts +158 -0
  35. package/examples/google-hotels/_shared/package.json +9 -0
  36. package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
  37. package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
  38. package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
  39. package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
  40. package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
  41. package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
  42. package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
  43. package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
  44. package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
  45. package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
  46. package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
  47. package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
  48. package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
  49. package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
  50. package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
  51. package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
  52. package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
  53. package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
  54. package/examples/google-hotels/search_hotels/index.ts +207 -0
  55. package/examples/google-hotels/search_hotels/package.json +9 -0
  56. package/examples/google-hotels/search_hotels/parser.ts +260 -0
  57. package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
  58. package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
  59. package/examples/google-hotels/search_hotels/workflow.json +127 -0
  60. package/package.json +3 -2
  61. package/prompts/audit-agent.md +71 -0
  62. package/prompts/build-planning.md +74 -0
  63. package/prompts/compile-agent.md +132 -28
  64. package/prompts/prereq-builder.md +64 -0
  65. package/prompts/prereq-planner.md +34 -0
  66. package/prompts/tool-planning.md +39 -0
  67. package/src/cli.ts +111 -4
  68. package/src/imprint/agent.ts +5 -0
  69. package/src/imprint/audit.ts +996 -0
  70. package/src/imprint/backend-ladder.ts +1214 -184
  71. package/src/imprint/build-plan.ts +1051 -0
  72. package/src/imprint/cdp-browser-fetch.ts +589 -0
  73. package/src/imprint/cdp-jar-cache.ts +320 -0
  74. package/src/imprint/chromium.ts +135 -0
  75. package/src/imprint/claude-cli-compile.ts +125 -25
  76. package/src/imprint/codex-cli-compile.ts +26 -23
  77. package/src/imprint/compile-agent-types.ts +38 -0
  78. package/src/imprint/compile-agent.ts +65 -27
  79. package/src/imprint/compile-tools.ts +1656 -64
  80. package/src/imprint/compile.ts +14 -2
  81. package/src/imprint/concurrency.ts +87 -0
  82. package/src/imprint/credential-extract.ts +174 -25
  83. package/src/imprint/cron.ts +1 -0
  84. package/src/imprint/doctor.ts +39 -0
  85. package/src/imprint/emit.ts +85 -0
  86. package/src/imprint/freeform-redact.ts +5 -4
  87. package/src/imprint/integrations.ts +2 -2
  88. package/src/imprint/llm.ts +56 -8
  89. package/src/imprint/mcp-compile-server.ts +43 -10
  90. package/src/imprint/mcp-maintenance.ts +9 -101
  91. package/src/imprint/mcp-server.ts +73 -7
  92. package/src/imprint/multi-progress.ts +7 -2
  93. package/src/imprint/param-grounding.ts +367 -0
  94. package/src/imprint/paths.ts +29 -0
  95. package/src/imprint/playbook-runner.ts +101 -40
  96. package/src/imprint/prereq-builder.ts +651 -0
  97. package/src/imprint/probe-backends.ts +6 -3
  98. package/src/imprint/record.ts +10 -1
  99. package/src/imprint/redact.ts +30 -2
  100. package/src/imprint/replay-capture.ts +19 -18
  101. package/src/imprint/runtime.ts +19 -10
  102. package/src/imprint/sensitive-keys.ts +141 -7
  103. package/src/imprint/session-diff.ts +79 -2
  104. package/src/imprint/session-merge.ts +9 -5
  105. package/src/imprint/stealth-chromium.ts +81 -0
  106. package/src/imprint/stealth-fetch.ts +309 -29
  107. package/src/imprint/stealth-token-cache.ts +88 -0
  108. package/src/imprint/teach-plan.ts +251 -0
  109. package/src/imprint/teach-state.ts +17 -0
  110. package/src/imprint/teach.ts +582 -147
  111. package/src/imprint/tool-candidates.ts +72 -14
  112. package/src/imprint/tool-plan.ts +313 -0
  113. package/src/imprint/tracing.ts +135 -6
  114. package/src/imprint/types.ts +61 -3
  115. package/examples/google-flights/search_google_flights/index.ts +0 -101
  116. package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
  117. package/examples/google-flights/search_google_flights/parser.ts +0 -189
  118. package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
  119. package/examples/google-flights/search_google_flights/workflow.json +0 -48
  120. package/examples/google-hotels/search_google_hotels/index.ts +0 -194
  121. package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
  122. package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
  123. package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
  124. package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
  125. package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
  126. package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
  127. package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
  128. package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
  129. package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
@@ -8,15 +8,34 @@
8
8
 
9
9
  import { spawn } from 'node:child_process';
10
10
  import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
11
- import { dirname, join as pathJoin, relative as pathRelative } from 'node:path';
11
+ import { basename, dirname, join as pathJoin, relative as pathRelative } from 'node:path';
12
12
  import type { AgentTool } from './agent.ts';
13
13
  import { inferAppApiHosts } from './app-api-hosts.ts';
14
+ import {
15
+ type AssignedSharedModule,
16
+ type SharedModuleManifestEntry,
17
+ planSliceForTool,
18
+ readBuildPlanFile,
19
+ resolveAssignedModules,
20
+ } from './build-plan.ts';
14
21
  import { splitSetCookieHeader } from './cookie-jar.ts';
15
22
  import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
23
+ import {
24
+ endpointsForSeqs,
25
+ groundEvent,
26
+ groundingForEvents,
27
+ inputProvenance,
28
+ } from './param-grounding.ts';
16
29
  import { compactRequestContexts, requestContextDigest } from './request-context.ts';
17
30
  import type { ClassifiedValue } from './session-diff.ts';
18
31
  import type { SharedCompileContext, ToolCandidate } from './tool-candidates.ts';
19
- import { type CapturedRequest, type Session, WorkflowSchema } from './types.ts';
32
+ import {
33
+ type BootstrapCapture,
34
+ type CapturedRequest,
35
+ type RequestCapture,
36
+ type Session,
37
+ WorkflowSchema,
38
+ } from './types.ts';
20
39
 
21
40
  const REPO_ROOT = pathJoin(import.meta.dir, '..', '..');
22
41
 
@@ -36,9 +55,10 @@ export function buildCompileTools(
36
55
  const credEnv = context.teachCredentials
37
56
  ? { IMPRINT_TEACH_CREDENTIALS: JSON.stringify(context.teachCredentials) }
38
57
  : undefined;
39
- return [
58
+ const tools = [
40
59
  buildReadSessionSummaryTool(session, context),
41
60
  buildReadRequestTool(session),
61
+ buildDiffRequestForEventTool(session, context),
42
62
  buildReadResponseBodyTool(session),
43
63
  buildSearchResponseBodyTool(session),
44
64
  buildWriteFileTool(toolDir),
@@ -46,6 +66,16 @@ export function buildCompileTools(
46
66
  buildRunBashTool(toolDir, credEnv),
47
67
  buildRunTestsTool(toolDir, sessionPath, credEnv),
48
68
  ];
69
+ if (context.buildPlanPath && context.candidate?.toolName) {
70
+ tools.push(
71
+ buildReadBuildPlanTool(
72
+ context.buildPlanPath,
73
+ context.candidate.toolName,
74
+ context.sharedModules,
75
+ ),
76
+ );
77
+ }
78
+ return tools;
49
79
  }
50
80
 
51
81
  interface CompileToolContext {
@@ -53,6 +83,76 @@ interface CompileToolContext {
53
83
  sharedContext?: SharedCompileContext;
54
84
  classifications?: ClassifiedValue[];
55
85
  teachCredentials?: { site: string; values: Record<string, string> };
86
+ /** Absolute path to the multi-tool build plan sidecar (.build-plan.json). When
87
+ * set, a read_build_plan tool is exposed and the verifier asserts the tool
88
+ * imports the shared modules the plan assigned it. */
89
+ buildPlanPath?: string;
90
+ /** Shared-module build manifest (verified flags) for this site. */
91
+ sharedModules?: SharedModuleManifestEntry[];
92
+ }
93
+
94
+ // ─── Tool: read_build_plan ───────────────────────────────────────────────────
95
+
96
+ function buildReadBuildPlanTool(
97
+ buildPlanPath: string,
98
+ toolName: string,
99
+ manifest?: SharedModuleManifestEntry[],
100
+ ): AgentTool {
101
+ return {
102
+ name: 'read_build_plan',
103
+ description:
104
+ "Read this tool's slice of the shared build plan: shared modules to import (instead of re-implementing), parser guidance, the parameter checklist, the auth recipe to replicate inline, and the opaque-token contract (fields this tool must EMIT for siblings, and params it CONSUMES from siblings).",
105
+ input_schema: { type: 'object', properties: {}, required: [] },
106
+ handler: async () => {
107
+ const plan = readBuildPlanFile(buildPlanPath);
108
+ if (!plan) return { result: 'No build plan available for this run.' };
109
+ const slice = planSliceForTool(plan, toolName);
110
+ if (!slice) return { result: `No build-plan slice for tool "${toolName}".` };
111
+ const assigned = resolveAssignedModules(plan, toolName, manifest).filter((m) => m.verified);
112
+ const emitsTokens = slice.tool.emitsTokens ?? [];
113
+ const tokenParams = slice.tool.tokenParams ?? [];
114
+ const tokenNotes: string[] = [];
115
+ if (emitsTokens.length > 0) {
116
+ tokenNotes.push(
117
+ `PRODUCER CONTRACT: your parser MUST emit ${emitsTokens
118
+ .map((e) => `\`${e.field}\``)
119
+ .join(
120
+ ', ',
121
+ )} in each result item, in the exact shape described (the FULL value a sibling consumer needs — never a bare fragment). Sibling tools mint their input from these fields; the verifier fails this tool if a declared field is missing from the parser output.`,
122
+ );
123
+ }
124
+ for (const tp of tokenParams) {
125
+ tokenNotes.push(
126
+ `CONSUMER CONTRACT: param \`${tp.param}\` is an opaque token minted by the \`${tp.sourceTool}\` tool's \`${tp.sourceField}\` output. Write a CHAINED \`param:${tp.param}\` integration test that calls \`runWorkflowWithLadder\` on \`../${tp.sourceTool}/workflow.json\`, reads \`${tp.sourceField}\` from its result, and passes THAT fresh value (not the recorded constant) into this tool — then asserts the response is non-empty. On producer bot/infra error, rethrow so the suite waives.`,
127
+ );
128
+ }
129
+ return {
130
+ result: JSON.stringify(
131
+ {
132
+ toolName,
133
+ sharedModulesToImport: assigned.map((m) => ({
134
+ importPath: m.importPath,
135
+ kind: m.kind,
136
+ purpose: m.purpose,
137
+ exportSignatures: m.exportSignatures,
138
+ })),
139
+ parserGuidance: slice.tool.parserGuidance,
140
+ paramChecklist: slice.tool.paramChecklist,
141
+ authRecipe: slice.tool.authRecipe,
142
+ emitsTokens,
143
+ tokenParams,
144
+ note:
145
+ assigned.length > 0
146
+ ? 'Import the listed shared modules via their importPath (request-transform → set workflow.json "requestTransformModule"; parser-helper/types → import from parser.ts) instead of re-implementing their logic. The verifier fails this tool if an assigned module is not imported.'
147
+ : 'No shared modules assigned — build this tool self-contained.',
148
+ tokenContract: tokenNotes.length > 0 ? tokenNotes : undefined,
149
+ },
150
+ null,
151
+ 2,
152
+ ),
153
+ };
154
+ },
155
+ };
56
156
  }
57
157
 
58
158
  // ─── Tool: read_session_summary ──────────────────────────────────────────────
@@ -61,7 +161,7 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
61
161
  return {
62
162
  name: 'read_session_summary',
63
163
  description:
64
- 'Get a high-level summary of the session including narration, selected candidate scope, load-bearing requests with inline data, and capture hints.',
164
+ 'Get a high-level summary of the session including narration, selected candidate scope, load-bearing requests with inline data, capture hints, and parameter-grounding hints (for each recorded UI toggle, the exact request positions that changed — use these to ground each likelyParam instead of eyeballing one request).',
65
165
  input_schema: {
66
166
  type: 'object',
67
167
  properties: {},
@@ -78,6 +178,48 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
78
178
  ...(context.sharedContext?.loginRequestSeqs ?? []),
79
179
  ]);
80
180
  const preserveSeqs = new Set([...selectedRequestSeqs, ...dependencySeqs]);
181
+
182
+ // Event-correlated differential grounding hints: for each UI event the
183
+ // candidate detector flagged, diff the request it triggered against the
184
+ // prior equivalent request and report what changed. This is where a
185
+ // filter/sort/option param's encoding actually lives — the agent maps
186
+ // each diff to its likelyParam instead of eyeballing one request and
187
+ // giving up (which previously shipped groundable params verified:false).
188
+ const paramGroundingHints =
189
+ (context.candidate?.eventSeqs?.length ?? 0)
190
+ ? groundingForEvents(
191
+ session,
192
+ context.candidate?.eventSeqs ?? [],
193
+ endpointsForSeqs(session, [...preserveSeqs]),
194
+ ).map((g) => ({
195
+ event: g.label || `event seq ${g.eventSeq}`,
196
+ eventSeq: g.eventSeq,
197
+ changedRequestSeq: g.triggeredSeq,
198
+ vsRequestSeq: g.priorSeq,
199
+ changes: g.changes.map((c) => `${c.path}: ${c.before} -> ${c.after}`),
200
+ }))
201
+ : [];
202
+
203
+ // Input-value provenance: positions in a load-bearing request whose value
204
+ // is an opaque id minted by an earlier response (not the user's text). The
205
+ // agent must CHAIN+CAPTURE these, not freeze them or substitute raw param
206
+ // text — substituting raw text where a resolved id belongs makes the
207
+ // backend ignore the input and fall back to a default scope.
208
+ // Scan the candidate's full seq set (capped), not just the representative
209
+ // one: the representative may be a first text-only request whose response
210
+ // mints the id, with the id only appearing in a later sibling request.
211
+ const provenanceSeqs = [...new Set([...selectedRequestSeqs, ...allCandidateSeqs])]
212
+ .sort((a, b) => a - b)
213
+ .slice(0, 30);
214
+ const inputProvenanceHints = inputProvenance(session, provenanceSeqs).map((p) => ({
215
+ path: p.path,
216
+ example: p.valueSample,
217
+ inRequestSeq: p.requestSeq,
218
+ mintedByResponseSeq: p.sourceSeq,
219
+ mintedByEndpoint: p.sourceEndpoint,
220
+ selfChain: p.selfChain,
221
+ }));
222
+
81
223
  const summaryRequests = identifySummaryRequests(session, preserveSeqs);
82
224
  const loadBearingRequests = compactRequestContexts(
83
225
  summaryRequests.map((r) => ({
@@ -124,6 +266,8 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
124
266
  requestCount: session.requests.length,
125
267
  stateHints,
126
268
  captureHints: captureHints.length > 0 ? captureHints : undefined,
269
+ paramGroundingHints: paramGroundingHints.length > 0 ? paramGroundingHints : undefined,
270
+ inputProvenanceHints: inputProvenanceHints.length > 0 ? inputProvenanceHints : undefined,
127
271
  loadBearingRequests,
128
272
  };
129
273
 
@@ -673,6 +817,56 @@ function buildReadRequestTool(session: Session): AgentTool {
673
817
  };
674
818
  }
675
819
 
820
+ // ─── Tool: diff_request_for_event ────────────────────────────────────────────
821
+
822
+ function buildDiffRequestForEventTool(session: Session, context: CompileToolContext): AgentTool {
823
+ return {
824
+ name: 'diff_request_for_event',
825
+ description:
826
+ "For a recorded UI event seq (a filter/sort/option toggle from selectedCandidate.eventSeqs), return the request it triggered diffed against the prior equivalent request. The changed positions are exactly where that interaction's parameter is encoded — use this to ground a param's encoding when paramGroundingHints does not already cover it. Returns the changed JSON paths (path: before -> after).",
827
+ input_schema: {
828
+ type: 'object',
829
+ properties: {
830
+ eventSeq: {
831
+ type: 'number',
832
+ description: 'Event sequence number (from selectedCandidate.eventSeqs)',
833
+ },
834
+ },
835
+ required: ['eventSeq'],
836
+ },
837
+ handler: async (input: unknown) => {
838
+ const { eventSeq } = input as { eventSeq: number };
839
+ const reqSeqs = [
840
+ ...((context.candidate?.representativeSeqs?.length ?? 0) > 0
841
+ ? (context.candidate?.representativeSeqs ?? [])
842
+ : (context.candidate?.requestSeqs ?? [])),
843
+ ...(context.candidate?.dependencySeqs ?? []),
844
+ ];
845
+ const endpoints = endpointsForSeqs(session, reqSeqs);
846
+ const g = groundEvent(session, eventSeq, endpoints.size > 0 ? endpoints : undefined);
847
+ if (!g.triggeredSeq) {
848
+ return {
849
+ result: `Event ${eventSeq} triggered no comparable request within the window — it may be a client-side-only interaction (no server param), or its request was telemetry. If a filter/sort visibly changed results with no new request, it is applied client-side and cannot be reproduced via request replay.`,
850
+ };
851
+ }
852
+ return {
853
+ result: JSON.stringify(
854
+ {
855
+ event: g.label,
856
+ eventSeq: g.eventSeq,
857
+ changedRequestSeq: g.triggeredSeq,
858
+ vsRequestSeq: g.priorSeq,
859
+ endpoint: g.endpoint,
860
+ changes: g.changes.map((c) => `${c.path}: ${c.before} -> ${c.after}`),
861
+ },
862
+ null,
863
+ 2,
864
+ ),
865
+ };
866
+ },
867
+ };
868
+ }
869
+
676
870
  // ─── Tool: read_response_body ────────────────────────────────────────────────
677
871
 
678
872
  function buildReadResponseBodyTool(session: Session): AgentTool {
@@ -927,7 +1121,7 @@ function buildRunBashTool(toolDir: string, credEnv?: Record<string, string>): Ag
927
1121
  required: ['command'],
928
1122
  },
929
1123
  handler: async (input: unknown) => {
930
- const { command, timeoutSec = 60 } = input as { command: string; timeoutSec?: number };
1124
+ const { command, timeoutSec = 120 } = input as { command: string; timeoutSec?: number };
931
1125
 
932
1126
  if (command.match(/rm\s+-rf\s+\//) || command.includes('sudo')) {
933
1127
  return {
@@ -943,16 +1137,19 @@ function buildRunBashTool(toolDir: string, credEnv?: Record<string, string>): Ag
943
1137
  };
944
1138
  }
945
1139
 
946
- async function runCommand(
1140
+ export async function runCommand(
947
1141
  command: string,
948
1142
  cwd: string,
949
1143
  timeoutMs: number,
950
1144
  extraEnv?: Record<string, string>,
951
1145
  ): Promise<{ result: string; isError?: boolean }> {
952
1146
  return new Promise((resolve) => {
1147
+ // `detached: true` makes the child its own process-group leader so a timeout
1148
+ // can SIGKILL the WHOLE tree (sh → bun → Chrome), not just `sh`.
953
1149
  const proc = spawn('sh', ['-c', command], {
954
1150
  cwd,
955
1151
  env: extraEnv ? { ...process.env, ...extraEnv } : process.env,
1152
+ detached: true,
956
1153
  });
957
1154
 
958
1155
  let stdout = '';
@@ -971,12 +1168,42 @@ async function runCommand(
971
1168
 
972
1169
  const timeout = setTimeout(() => {
973
1170
  timedOut = true;
974
- proc.kill();
1171
+ // Kill the whole process GROUP, not just `sh`. A hung `bun run probe.ts`
1172
+ // spawns bun + Chrome children that survive a bare proc.kill() (SIGTERM to
1173
+ // sh only); they keep the stdout pipe open so 'close' never fires, hanging
1174
+ // this call until the outer MCP tool timeout (30m) — exactly what ate a
1175
+ // tool's compile budget. SIGKILL the group so the timeout reaps bun + any
1176
+ // leaked browser and 'close' fires promptly.
1177
+ try {
1178
+ if (proc.pid) process.kill(-proc.pid, 'SIGKILL');
1179
+ else proc.kill('SIGKILL');
1180
+ } catch {
1181
+ proc.kill('SIGKILL');
1182
+ }
975
1183
  }, timeoutMs);
976
1184
 
977
1185
  proc.on('close', (exitCode) => {
978
1186
  clearTimeout(timeout);
979
1187
 
1188
+ // Reap the whole process GROUP on EVERY exit, not just on timeout. The
1189
+ // compile verifier runs `bun test`, whose runner calls process.exit() the
1190
+ // instant the suite passes — and bun does NOT run process 'exit' /
1191
+ // 'beforeExit' handlers (only afterAll), so the compile cdp pool's
1192
+ // idle-close timer never fires and its launchChromium child is orphaned
1193
+ // (reparented to PID 1), accumulating across a multi-tool/multi-site teach
1194
+ // until the box OOMs. That child is still in THIS process group, though:
1195
+ // the group's id (= proc.pid) outlives the dead `sh` leader, so SIGKILLing
1196
+ // the group here reaps the orphaned Chrome regardless of how `bun test`
1197
+ // chose to exit. Harmless when the group is already empty (ESRCH). Skipped
1198
+ // on timeout (the group was already SIGKILLed above).
1199
+ if (!timedOut && proc.pid) {
1200
+ try {
1201
+ process.kill(-proc.pid, 'SIGKILL');
1202
+ } catch {
1203
+ // group already empty — nothing left to reap
1204
+ }
1205
+ }
1206
+
980
1207
  if (stdout.length > TRUNCATE_LIMIT) {
981
1208
  stdout = `${stdout.slice(0, TRUNCATE_LIMIT)}\n[…truncated…]`;
982
1209
  }
@@ -997,19 +1224,25 @@ async function runCommand(
997
1224
  });
998
1225
  }
999
1226
 
1000
- async function runGeneratedArtifactTypecheck(
1001
- exampleDir: string,
1227
+ /** Typecheck a set of generated `.ts` artifacts in `dir` against the repo's
1228
+ * tsconfig (so `imprint/*` and bun globals resolve). Used by both the compile
1229
+ * verifier (parser.ts / request-transform.ts) and the prereq-module verifier
1230
+ * (`_shared/*.ts`). `*.test.ts` are excluded — they pull in bun:test globals
1231
+ * the strict config rejects. Exported for prereq-builder.ts. */
1232
+ export async function typecheckArtifacts(
1233
+ dir: string,
1234
+ includes: string[],
1002
1235
  ): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> {
1003
- const configPath = pathJoin(exampleDir, '.imprint-typecheck.tsconfig.json');
1236
+ const configPath = pathJoin(dir, '.imprint-typecheck.tsconfig.json');
1004
1237
  const rootTsconfig = pathJoin(REPO_ROOT, 'tsconfig.json');
1005
- const extendsPath = normalizeTsconfigPath(pathRelative(exampleDir, rootTsconfig));
1238
+ const extendsPath = normalizeTsconfigPath(pathRelative(dir, rootTsconfig));
1006
1239
 
1007
1240
  writeFileSync(
1008
1241
  configPath,
1009
1242
  JSON.stringify(
1010
1243
  {
1011
1244
  extends: extendsPath,
1012
- include: ['parser.ts', 'request-transform.ts'],
1245
+ include: includes,
1013
1246
  exclude: ['*.test.ts'],
1014
1247
  },
1015
1248
  null,
@@ -1021,7 +1254,7 @@ async function runGeneratedArtifactTypecheck(
1021
1254
  try {
1022
1255
  const result = await runCommand(
1023
1256
  'bunx tsc --noEmit -p .imprint-typecheck.tsconfig.json',
1024
- exampleDir,
1257
+ dir,
1025
1258
  120000,
1026
1259
  );
1027
1260
  return JSON.parse(result.result) as {
@@ -1103,8 +1336,1068 @@ function buildRunTestsTool(
1103
1336
  };
1104
1337
  }
1105
1338
 
1339
+ // ─── Test-quality helpers (shared with prereq-builder verification) ─────────
1340
+
1341
/**
 * Regexes for assertions that compare a literal with itself (or with another
 * literal) and therefore prove nothing about the code under test. Verifiers
 * reject any test source matching one of these so the minimum-expect gate
 * cannot be satisfied with filler such as `expect(true).toBe(true)`.
 */
const TRIVIAL_ASSERTION_PATTERNS: RegExp[] = [
  // boolean literal compared with the same boolean literal
  /expect\s*\(\s*true\s*\)\.toBe\s*\(\s*true\s*\)/,
  /expect\s*\(\s*false\s*\)\.toBe\s*\(\s*false\s*\)/,
  // numeric literal compared with the same numeric literal
  /expect\s*\(\s*1\s*\)\.toBe\s*\(\s*1\s*\)/,
  /expect\s*\(\s*0\s*\)\.toBe\s*\(\s*0\s*\)/,
  // null / undefined literal checked for being null / undefined
  /expect\s*\(\s*null\s*\)\.toBeNull/,
  /expect\s*\(\s*undefined\s*\)\.toBeUndefined/,
  // string literal compared with a string literal (double- then single-quoted)
  /expect\s*\(\s*"[^"]*"\s*\)\.toBe\s*\(\s*"[^"]*"\s*\)/,
  /expect\s*\(\s*'[^']*'\s*\)\.toBe\s*\(\s*'[^']*'\s*\)/,
];
1353
+
1354
/**
 * Count the textual occurrences of `expect(` (whitespace allowed before the
 * parenthesis) in a test source string.
 *
 * @param src Test file source text.
 * @returns Number of matches; 0 when there are none.
 */
export function countExpectCalls(src: string): number {
  const hits = src.match(/expect\s*\(/g);
  return hits === null ? 0 : hits.length;
}
1357
+
1358
/**
 * Report whether a test source contains at least one assertion matching
 * `TRIVIAL_ASSERTION_PATTERNS`.
 *
 * @param src Test file source text.
 * @returns `true` as soon as any pattern matches, otherwise `false`.
 */
export function hasTrivialAssertion(src: string): boolean {
  for (const candidate of TRIVIAL_ASSERTION_PATTERNS) {
    if (candidate.test(src)) return true;
  }
  return false;
}
1361
+
1362
/**
 * Check that a tool actually uses every *verified* shared module the build
 * plan assigned to it.
 *
 * - A `request-transform` module is satisfied when workflow.json's
 *   `requestTransformModule` string contains its import path, or when
 *   parser.ts / request-transform.ts mention that path.
 * - Any other kind is satisfied only when parser.ts / request-transform.ts
 *   mention its import path.
 *
 * The check is a plain substring test on the file contents.
 *
 * @param toolDir Directory holding the tool's parser.ts / request-transform.ts.
 * @param workflowPath Path of the tool's workflow.json.
 * @param assigned Shared modules the plan assigned; unverified ones are ignored.
 * @returns One failure message per unsatisfied module. Empty when nothing is
 *   assigned or when workflow.json cannot be read/parsed (that problem is
 *   reported by a different check).
 */
function assertSharedModuleImports(
  toolDir: string,
  workflowPath: string,
  assigned: AssignedSharedModule[],
): string[] {
  const required = assigned.filter((mod) => mod.verified);
  if (required.length === 0) return [];

  let parsedWorkflow: { requestTransformModule?: unknown } = {};
  try {
    parsedWorkflow = JSON.parse(readFileSync(workflowPath, 'utf8'));
  } catch {
    return []; // workflow parse already flagged elsewhere
  }
  const declaredTransform = parsedWorkflow.requestTransformModule;
  const transformModule = typeof declaredTransform === 'string' ? declaredTransform : '';

  // Concatenate whichever artifacts exist; each one is prefixed with a newline.
  const artifactText = ['parser.ts', 'request-transform.ts']
    .map((file) => pathJoin(toolDir, file))
    .filter((artifactPath) => existsSync(artifactPath))
    .map((artifactPath) => `\n${readFileSync(artifactPath, 'utf8')}`)
    .join('');

  const problems: string[] = [];
  for (const mod of required) {
    const importedByArtifact = artifactText.includes(mod.importPath);
    if (mod.kind === 'request-transform') {
      if (transformModule.includes(mod.importPath) || importedByArtifact) continue;
      problems.push(
        `the build plan assigns shared module ${mod.path} (request-transform) to this tool, but workflow.json does not set "requestTransformModule": "${mod.importPath}" and no artifact imports it. Reuse it instead of re-implementing the logic — see read_build_plan.`,
      );
      continue;
    }
    if (importedByArtifact) continue;
    problems.push(
      `the build plan assigns shared module ${mod.path} (${mod.kind}) to this tool, but no artifact imports "${mod.importPath}". Import it from parser.ts (or request-transform.ts) instead of re-implementing it — see read_build_plan.`,
    );
  }
  return problems;
}
1406
+
1106
1407
  // ─── External Verification ──────────────────────────────────────────────────
1107
1408
 
1409
/**
 * Heuristically decide whether the output of a failed integration test shows
 * an anti-automation / bot-defense block rather than a workflow defect.
 *
 * Three tiers are checked in order:
 *  1. Strong signatures (challenge / interstitial wording and vendor markers)
 *     are sufficient on their own.
 *  2. The log line `_abck status after interaction: ~-1~` is sufficient on its
 *     own. Per the original author's notes it marks an Akamai sensor cookie
 *     that stayed unvalidated after the simulated interaction; only this
 *     post-interaction line counts, so that the ordinary "cached jar not
 *     validated … re-mint" log does not trigger a waiver.
 *  3. Weaker wording (captcha, blocked, forbidden, rate limit, …) counts only
 *     together with a blocking status (403/429/503) or a 301–308 status
 *     accompanied by challenge-like wording.
 *
 * @param output Combined text output of the test run.
 * @returns `true` when the output looks like a bot-defense block.
 */
export function isBotDefenseFailure(output: string): boolean {
  // Tier 1: unambiguous challenge/interstitial signatures.
  const unambiguous =
    /unusual traffic|recaptcha|hcaptcha|h-captcha|are you (a )?(human|robot)|verify (you are|you'?re) (a )?human|px-captcha|datadome|perimeterx|cf[-_]chl|attention required|just a moment\s*(\.\.\.|…)?|enable javascript and cookies to continue/i;
  if (unambiguous.test(output)) return true;

  // Tier 2: sensor cookie still flagged after the interaction attempt.
  const sensorStillFlagged = /_abck status after interaction:\s*~-1~/i;
  if (sensorStillFlagged.test(output)) return true;

  // Tier 3: ambiguous wording needs corroboration so ordinary error text does
  // not get a free pass.
  const ambiguous =
    /captcha|challenge|access denied|forbidden|blocked|\bbot\b|rate.?limit|too many requests/i;
  if (!ambiguous.test(output)) return false;

  const hasBlockingStatus = /\b(403|429|503)\b/.test(output);
  if (hasBlockingStatus) return true;

  const hasRedirectStatus = /\b(30[1-8])\b/.test(output);
  return hasRedirectStatus && /captcha|challenge|verify|robot|denied|blocked|unusual/i.test(output);
}
1455
+
1456
/**
 * Decode the five predefined XML entities (`&lt;`, `&gt;`, `&quot;`, `&apos;`,
 * `&amp;`). `&amp;` is decoded last so that an escaped ampersand followed by
 * entity-like text (e.g. `&amp;lt;`) is not decoded twice.
 */
function unescapeXml(s: string): string {
  return s
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, '&');
}

/**
 * Parse a JUnit XML report (from `bun test --reporter=junit`) into the sets of
 * passed and failed test *names*. The default bun reporter does not print
 * per-test names in non-TTY mode, so the JUnit report is the reliable way to
 * know which individual tests actually ran green.
 *
 * Classification of each `<testcase>`:
 *  - contains a `<failure>` or `<error>` child → failed;
 *  - contains a `<skipped>` child (skipped / todo tests) → neither set, because
 *    a test that never executed must not be treated as having run green;
 *  - otherwise (self-closed or empty body) → passed.
 *
 * Test cases without a non-empty `name` attribute are ignored.
 *
 * @param xml Raw report text; an empty string yields two empty sets.
 * @returns The decoded names of passed and failed test cases.
 */
export function parseJUnitResults(xml: string): { passed: Set<string>; failed: Set<string> } {
  const passed = new Set<string>();
  const failed = new Set<string>();
  if (!xml) return { passed, failed };
  const re = /<testcase\b([^>]*?)(\/>|>([\s\S]*?)<\/testcase>)/g;
  for (const m of xml.matchAll(re)) {
    const attrs = m[1] ?? '';
    const nameMatch = attrs.match(/\bname="([^"]*)"/);
    if (!nameMatch?.[1]) continue;
    const name = unescapeXml(nameMatch[1]);
    // A self-closed element has no children to inspect.
    const body = m[2] === '/>' ? '' : (m[3] ?? '');
    if (/<(failure|error)\b/.test(body)) {
      failed.add(name);
    } else if (/<skipped\b/.test(body)) {
      // Skipped / todo: did not run, so it is neither a pass nor a failure.
      continue;
    } else {
      passed.add(name);
    }
  }
  return { passed, failed };
}
1490
+
1491
/** Result of one `bun test <file>` run: the process output plus the per-test
 * names recovered from its JUnit report. */
interface BunTestRun {
  /** Standard output reported for the run. */
  stdout: string;
  /** Standard error reported for the run. */
  stderr: string;
  /** Exit code reported for the run. */
  exitCode: number;
  /** Set when the wall-clock timeout killed the run instead of it exiting on
   * its own. The classifier uses this to treat a truncated, paced anti-bot
   * suite as an infra problem rather than a bot block, so that a fetch-403 in
   * the partial output is not mistaken for one. */
  timedOut: boolean;
  /** Names of test cases the JUnit report recorded as passed. */
  passed: Set<string>;
  /** Names of test cases the JUnit report recorded as failed. */
  failed: Set<string>;
}
1503
+
1504
/** Verification outcome for one exposed parameter. `verified` is true only
 * when a `param:<name>` integration test ran green against live data. */
interface ParamVerification {
  /** Parameter name. */
  name: string;
  /** Whether the parameter was behaviorally verified at compile time. */
  verified: boolean;
  /** Why the parameter is unverified; undefined when `verified` is true.
   *  - `waived-bot` / `waived-infra`: the live suite was waived (anti-bot or
   *    infra), so the parameter's effect could not be confirmed at compile
   *    time; it is exercised at runtime via the stealth-fetch / playbook
   *    ladder.
   *  - `annotated`: the agent marked it `// exposed-but-not-verified`.
   *  - `waived-chain`: the parameter is a producer-sourced token, but the
   *    producer tool could not be run at compile time (anti-bot / not
   *    compiled), so the chain could not be verified. */
  reason?: 'waived-bot' | 'waived-infra' | 'annotated' | 'waived-chain';
  /** For a producer-sourced token parameter: the sibling tool and output field
   * its value comes from. It is written to workflow.json as
   * `param.sourcedFrom` so the MCP description can say where to mint the token
   * and the audit harness can chain producer→consumer instead of fabricating a
   * value. */
  sourcedFrom?: { tool: string; field: string };
}
1524
+
1525
/** A parameter known to be an opaque token/id minted by a sibling tool.
 *
 * `sourceTool` / `sourceField` are filled in when the build plan declared the
 * contract. A mechanically detected source (the recorded value shows up in a
 * sibling tool's response) may carry only the parameter name. In both cases
 * the parameter requires a chained `param:<name>` test that mints a fresh
 * value from the producer. */
interface TokenSource {
  /** Name of the token parameter. */
  param: string;
  /** Producing sibling tool, when known. */
  sourceTool?: string;
  /** Output field of the producing tool that carries the value, when known. */
  sourceField?: string;
}
1535
+
1536
/**
 * Execute `bun test <file>` once with the JUnit reporter and return both the
 * raw process output (needed for bot-defense / infra detection and for error
 * messages) and the per-test pass/fail names parsed from the report.
 *
 * The report is written to a transient `.imprint-junit-<file>.xml` inside
 * `toolDir`; any stale copy is removed before the run and the fresh one is
 * removed after it has been read. All report file handling is best-effort: a
 * missing or unreadable report simply yields empty `passed` / `failed` sets.
 *
 * @param testPath Test file handed to `bun test`.
 * @param toolDir Working directory for the command and home of the report.
 * @param timeoutMs Timeout forwarded to `runCommand`.
 * @param env Extra environment forwarded to `runCommand`.
 */
async function runBunTestWithResults(
  testPath: string,
  toolDir: string,
  timeoutMs: number,
  env: Record<string, string> = {},
): Promise<BunTestRun> {
  const junitPath = pathJoin(toolDir, `.imprint-junit-${basename(testPath)}.xml`);

  const discardReport = (): void => {
    try {
      if (existsSync(junitPath)) unlinkSync(junitPath);
    } catch {
      // best-effort
    }
  };

  discardReport();
  const commandResult = await runCommand(
    `bun test ${testPath} --reporter=junit --reporter-outfile=${junitPath}`,
    toolDir,
    timeoutMs,
    env,
  );
  const processOutput = JSON.parse(commandResult.result) as {
    stdout: string;
    stderr: string;
    exitCode: number;
    timedOut?: boolean;
  };

  let reportXml = '';
  try {
    if (existsSync(junitPath)) reportXml = readFileSync(junitPath, 'utf8');
  } catch {
    // missing/partial report → empty sets, handled by callers
  }
  discardReport();

  const names = parseJUnitResults(reportXml);
  return {
    stdout: processOutput.stdout,
    stderr: processOutput.stderr,
    exitCode: processOutput.exitCode,
    timedOut: processOutput.timedOut ?? false,
    passed: names.passed,
    failed: names.failed,
  };
}
1586
+
1587
/** One `test(...)` / `it(...)` block cut out of a test file. */
interface TestBlock {
  /** The string literal passed as the test's title. */
  title: string;
  /** Source text from the start of this test up to the start of the next one
   * (or the end of the file). */
  body: string;
}
1591
+
1592
/**
 * Cut a test file into its `test(...)` / `it(...)` blocks.
 *
 * A block begins where a `test(` / `it(` call with a quoted title begins and
 * runs up to the start of the next such call (or the end of the source). This
 * is a textual approximation, good enough to check whether a named
 * per-parameter test's body actually calls the workflow.
 *
 * @param src Test file source text.
 * @returns Blocks in source order; empty when no test call is found.
 */
export function extractTestBlocks(src: string): TestBlock[] {
  const opener = /\b(?:test|it)\s*\(\s*(['"`])((?:\\.|(?!\1).)*)\1/g;
  const openings = Array.from(src.matchAll(opener), (hit) => ({
    offset: hit.index ?? 0,
    title: hit[2] ?? '',
  }));
  return openings.map(({ offset, title }, position) => {
    const following = openings[position + 1];
    const stop = following ? following.offset : src.length;
    return { title, body: src.slice(offset, stop) };
  });
}
1610
+
1611
/**
 * Heuristic: does a recorded value look like an opaque token/id rather than
 * free text, a city name or a date? Used to gate mechanical producer-source
 * detection.
 *
 * A value qualifies when it is at least 12 characters long, contains no
 * whitespace, is not a `YYYY-MM-DD` date, and additionally either is at least
 * 16 characters long, contains a digit, or contains one of `:` `|` `_` `-`.
 */
function looksOpaque(v: string): boolean {
  const disqualified =
    v.length < 12 || // too short to be a token
    /\s/.test(v) || // multi-word / free text
    /^\d{4}-\d{2}-\d{2}$/.test(v); // dates
  if (disqualified) return false;
  return v.length >= 16 || /\d/.test(v) || /[:|_-]/.test(v);
}
1619
+
1620
/**
 * Mechanical producer-source detector, a secondary signal next to the build
 * plan's declared `tokenParams`.
 *
 * A parameter counts as producer-sourced when its recorded value, or an
 * opaque-looking `|` / `:`-separated segment of it, appears verbatim in the
 * body of a sibling tool's recorded response. Only recorded values that
 * themselves look opaque are considered. The first sibling response containing
 * any candidate is reported as the producer.
 *
 * The result is advisory: it never marks a parameter verified, it only forces
 * the chained-test requirement so an undeclared cross-tool token cannot ship
 * with a test that merely replays the recorded value.
 *
 * @returns One entry per detected parameter, carrying the producing tool's
 *   name when the matching sibling response had one.
 */
export function detectTokenSources(opts: {
  likelyParams: Array<{ name: string }>;
  recordedParamValues: Map<string, string>;
  siblingResponses: Array<{ toolName?: string; body: string }>;
}): TokenSource[] {
  const detected: TokenSource[] = [];
  for (const { name } of opts.likelyParams) {
    const recorded = opts.recordedParamValues.get(name);
    if (!recorded || !looksOpaque(recorded)) continue;

    // The full value is always a candidate; composite segments only when they
    // look opaque on their own.
    const candidates = [recorded];
    for (const segment of recorded.split(/[|:]/)) {
      if (looksOpaque(segment)) candidates.push(segment);
    }

    const producer = opts.siblingResponses.find((response) =>
      candidates.some((candidate) => response.body.includes(candidate)),
    );
    if (producer) detected.push({ param: name, sourceTool: producer.toolName });
  }
  return detected;
}
1644
+
1645
/** Matches a reference to a SIBLING tool's workflow file
 * (`../<producer>/workflow.json`, where `<producer>` consists of letters,
 * digits and underscores). Used to tell whether a test block mints a fresh
 * value through a producer tool rather than only calling this tool's own
 * workflow. */
const SIBLING_WORKFLOW_RE = /\.\.\/[A-Za-z0-9_]+\/workflow\.json/;
1648
+
1649
/**
 * Build the `sourcedFrom` stamp for a token parameter.
 *
 * @returns `{ tool, field }` when both the producer tool and the producer
 *   field are known (non-empty); `undefined` otherwise.
 */
function sourcedFromOf(ts: {
  sourceTool?: string;
  sourceField?: string;
}): { tool: string; field: string } | undefined {
  const { sourceTool: tool, sourceField: field } = ts;
  if (!tool || !field) return undefined;
  return { tool, field };
}
1659
+
1660
/** Classification of one live integration run. */
interface IntegrationVerdict {
  /** Governs PARAM coverage: a waived suite lets per-param tests waive
   * (non-blocking), `failed` blocks, and `passed` grades parameters strictly. */
  outcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed';
  /** Governs `liveVerified` and is INDEPENDENT of `outcome`. True when a
   * backend returned real data during this run (so the workflow is
   * live-verifiable) even if the per-param suite was truncated or blocked.
   * Keeping the two apart prevents a tool whose stealth/cdp baseline succeeded
   * from shipping `liveVerified:false` only because the param suite hit the
   * verifier timeout. */
  baselineLiveVerified: boolean;
  /** First infra-style error found in the output, or a placeholder. */
  firstError: string;
  /** Backends whose probe line reported an error. */
  exhaustedBackends: string[];
  /** Non-null when a declared `${state.X}` capture returned null at runtime.
   * That is a workflow-correctness error, not infra; the caller builds the
   * actionable message. */
  captureFailName: string | null;
  /** Whether `captureFailName` was among the capture names already known to
   * have failed. */
  captureFailFromKnown: boolean;
}
1677
+
1678
/**
 * Pure classifier for a live integration run. It produces two independent
 * answers: the suite `outcome`, and whether the BASELINE was live-verified.
 *
 * They are separate because an anti-bot suite can have its baseline return
 * real data while its per-param tests time out or get blocked. Tying
 * `liveVerified` to the whole suite passing made such tools ship
 * `liveVerified:false`, and let a lone `fetch`-rung 403 in truncated output
 * look like a total bot block.
 *
 * `baselineLiveVerified` is true when the output contains the ladder's
 * `parallel probe: winner=` log or when any passed test name does not start
 * with `param:`. A clean exit (code 0) always reports it as true.
 *
 * `exhaustedBackends` lists only backends whose probe line carries an error
 * code, not every backend that was attempted.
 *
 * Decision order for a non-zero exit:
 *  1. `referencedStateBroken` → `failed`.
 *  2. A `Required capture "<name>" (<source>) did not produce a value` message
 *     → `failed` with the capture name. This runs before the bot-defense check
 *     so a capture failure is never waived.
 *  3. `timedOut` → `waived-infra` (a truncated suite is never a bot block).
 *  4. Bot-defense signature in the output → `waived-bot`.
 *  5. An infra error code together with a ladder-exhaustion marker →
 *     `waived-infra`.
 *  6. Anything else → `failed`.
 */
export function classifyIntegrationOutcome(input: {
  exitCode: number;
  timedOut: boolean;
  combined: string;
  passedTests: ReadonlySet<string>;
  referencedStateBroken: boolean;
  failedCaptureNames: ReadonlySet<string>;
}): IntegrationVerdict {
  const log = input.combined;

  const baselineLiveVerified =
    /parallel probe: winner=/.test(log) ||
    [...input.passedTests].some((title) => !title.startsWith('param:'));

  // Collect each backend once, in order of first appearance.
  const erroredBackends = new Set<string>();
  for (const hit of log.matchAll(
    /^\s*([a-z-]+): (?:NETWORK|FORBIDDEN|RATE_LIMITED|BAD_RESPONSE|STATE_MISSING|AUTH_EXPIRED|UNKNOWN)\b/gm,
  )) {
    erroredBackends.add(hit[1] as string);
  }
  const exhaustedBackends = [...erroredBackends];

  const firstErrorHit = log.match(/\b(NETWORK|FORBIDDEN|RATE_LIMITED)\b[^\n]{0,200}/);
  const firstError = firstErrorHit?.[0]?.trim() ?? 'unknown';

  // Every verdict shares the same defaults; branches override what differs.
  const verdict = (
    outcome: IntegrationVerdict['outcome'],
    overrides: Partial<IntegrationVerdict> = {},
  ): IntegrationVerdict => ({
    baselineLiveVerified,
    firstError,
    exhaustedBackends,
    outcome,
    captureFailName: null,
    captureFailFromKnown: false,
    ...overrides,
  });

  if (input.exitCode === 0) {
    return verdict('passed', {
      baselineLiveVerified: true,
      firstError: '',
      exhaustedBackends: [],
    });
  }

  if (input.referencedStateBroken) return verdict('failed');

  // A missing declared capture is a workflow-correctness error, not infra;
  // waiving it would silently ship a broken workflow. The pattern follows the
  // runtime's message text, and the check precedes the bot-defense branch so a
  // capture failure that also logs an `_abck` line is still `failed`.
  const captureFailure = log.match(
    /Required capture\s+"([^"]+)"\s*\([^)]*\)\s*did not produce a value/i,
  );
  if (captureFailure) {
    const captureName = captureFailure[1] ?? '';
    return verdict('failed', {
      captureFailName: captureName,
      captureFailFromKnown: input.failedCaptureNames.has(captureName),
    });
  }

  // A verifier timeout truncated a paced suite: infra, never a bot block.
  if (input.timedOut) {
    return verdict('waived-infra', {
      firstError: firstError === 'unknown' ? 'verifier timeout (suite truncated)' : firstError,
    });
  }

  if (isBotDefenseFailure(log)) return verdict('waived-bot');

  // Every ladder rung exhausted with an infra error.
  const sawInfraCode = /\bRATE_LIMITED\b|\bFORBIDDEN\b|\bNETWORK\b/.test(log);
  const sawExhaustion = /non-escalatable|giving up|ladder exhausted|all backends failed/.test(log);
  if (sawInfraCode && sawExhaustion) return verdict('waived-infra');

  return verdict('failed');
}
1783
+
1784
/**
 * Pure per-parameter coverage classifier (Fix C/D + chained-token verification).
 * Decides, for each exposed parameter, whether it was behaviorally verified — a
 * `param:<name>` integration test that actually ran green (in `passedTests`) AND
 * calls the workflow — and otherwise why it is unverified. Never drops a param
 * (keep+mark policy):
 *   - covered-live → `{ verified: true }`
 *   - suite waived by anti-bot/infra and not covered → `{ verified: false, reason: 'waived-*' }`
 *   - annotated `// exposed-but-not-verified` and not covered → `{ verified: false, reason: 'annotated' }`
 *   - else (suite ran, no test, no annotation) → `uncovered` (blocking)
 *   - passed but the test never calls runWorkflowWithLadder → `tautological` (blocking)
 *
 * A **producer-sourced token param** (in `tokenSources`) is held to a stricter
 * bar: its `param:<name>` test must mint a FRESH value by calling the producer's
 * sibling workflow (`../<tool>/workflow.json`), not reuse the recorded constant.
 *   - chained pass → `{ verified: true, sourcedFrom }`
 *   - passed but not chained (the recorded-value tautology) → `unchained` (blocking)
 *   - suite waived (producer anti-bot) → `{ verified: false, reason: 'waived-chain' }`
 *   - else → `unchained` (blocking)
 *
 * The `param:<name>` token is matched as a whole name: it must not be followed
 * directly by a letter, digit or underscore. Without that boundary a passing
 * test titled `param:date_return` would also mark a parameter named `date` as
 * verified, and the wrong test block could be inspected for the
 * tautology / chain checks.
 *
 * @param opts.likelyParams Exposed parameters to grade.
 * @param opts.integrationSrc Source text of the integration test file.
 * @param opts.passedTests Names of tests that passed in the live run.
 * @param opts.integrationOutcome Suite-level outcome of the live run.
 * @param opts.tokenSources Parameters known to be producer-sourced tokens.
 * @returns The per-parameter records plus the names that block verification
 *   (`uncovered`, `tautological`, `unchained`).
 */
export function classifyParamCoverage(opts: {
  likelyParams: Array<{ name: string }>;
  integrationSrc: string;
  passedTests: Set<string>;
  integrationOutcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed' | 'absent';
  tokenSources?: TokenSource[];
}): {
  paramVerification: ParamVerification[];
  uncovered: string[];
  tautological: string[];
  unchained: string[];
} {
  const paramVerification: ParamVerification[] = [];
  const uncovered: string[] = [];
  const tautological: string[] = [];
  const unchained: string[] = [];
  const tokenByName = new Map((opts.tokenSources ?? []).map((t) => [t.param, t]));
  const blocks = extractTestBlocks(opts.integrationSrc);
  const passedNames = [...opts.passedTests];
  const waived =
    opts.integrationOutcome === 'waived-bot' || opts.integrationOutcome === 'waived-infra';
  for (const lp of opts.likelyParams) {
    // Escape the name once; it is reused for the token and annotation patterns.
    const escapedName = lp.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Whole-name match: `param:id` must not match `param:id_token`.
    const tokenRe = new RegExp(`param:${escapedName}(?![A-Za-z0-9_])`);
    const passedLive = passedNames.some((n) => tokenRe.test(n));
    const block = blocks.find((b) => tokenRe.test(b.title));

    // Producer-sourced token param: requires a chained test that mints a fresh
    // value from the producer's sibling workflow.
    const ts = tokenByName.get(lp.name);
    if (ts) {
      const sourcedFrom = sourcedFromOf(ts);
      if (passedLive) {
        const chained =
          !!block &&
          /runWorkflowWithLadder\s*\(/.test(block.body) &&
          SIBLING_WORKFLOW_RE.test(block.body);
        if (chained) {
          paramVerification.push({ name: lp.name, verified: true, sourcedFrom });
        } else {
          unchained.push(lp.name);
        }
      } else if (waived) {
        paramVerification.push({
          name: lp.name,
          verified: false,
          reason: 'waived-chain',
          sourcedFrom,
        });
      } else {
        unchained.push(lp.name);
      }
      continue;
    }

    if (passedLive) {
      // Anti-tautology: a passing per-param test must actually exercise the live
      // workflow, not assert a constant.
      if (block && !/runWorkflowWithLadder\s*\(/.test(block.body)) {
        tautological.push(lp.name);
      } else {
        paramVerification.push({ name: lp.name, verified: true });
      }
      continue;
    }

    if (waived) {
      paramVerification.push({
        name: lp.name,
        verified: false,
        reason: opts.integrationOutcome as 'waived-bot' | 'waived-infra',
      });
      continue;
    }

    const annotationRe = new RegExp(
      `//\\s*exposed-but-not-verified[^\\n]*\\b${escapedName}\\b`,
    );
    if (annotationRe.test(opts.integrationSrc)) {
      paramVerification.push({ name: lp.name, verified: false, reason: 'annotated' });
      continue;
    }
    uncovered.push(lp.name);
  }
  return { paramVerification, uncovered, tautological, unchained };
}
1889
+
1890
/**
 * Fix D: after a successful verification, write each exposed parameter's
 * `verified` / `verifyNote` (and, when present, `sourcedFrom`) into the tool's
 * workflow.json, so the audit harness and operators can see which parameters
 * were not behaviorally verified at compile time. Nothing is dropped
 * (keep+mark policy).
 *
 * Best-effort throughout: a missing or unparsable workflow.json returns no
 * warnings, and a failed write is ignored, because the tool has already passed
 * verification and this is metadata only.
 *
 * @param toolDir Directory containing workflow.json.
 * @param paramVerification Per-parameter outcomes to persist.
 * @returns A single consolidated warning line naming the unverified
 *   parameters, or an empty array when there are none (or nothing was done).
 */
export function applyParamVerification(
  toolDir: string,
  paramVerification: ParamVerification[],
): string[] {
  if (paramVerification.length === 0) return [];
  const workflowPath = pathJoin(toolDir, 'workflow.json');
  if (!existsSync(workflowPath)) return [];

  let workflow: {
    parameters?: Array<{
      name: string;
      verified?: boolean;
      verifyNote?: string;
      sourcedFrom?: { tool: string; field: string };
    }>;
  };
  try {
    workflow = JSON.parse(readFileSync(workflowPath, 'utf8'));
  } catch {
    return [];
  }

  const outcomeByName = new Map(paramVerification.map((entry) => [entry.name, entry]));
  for (const param of workflow.parameters ?? []) {
    const outcome = outcomeByName.get(param.name);
    if (!outcome) continue;
    if (!outcome.verified) {
      param.verified = false;
      param.verifyNote = outcome.reason;
    } else {
      param.verified = true;
      // An undefined value is omitted when the workflow is serialized.
      param.verifyNote = undefined;
    }
    // Record the producer-source contract so the MCP description can say where
    // to mint the token and the audit can chain producer→consumer.
    if (outcome.sourcedFrom) param.sourcedFrom = outcome.sourcedFrom;
  }

  try {
    writeFileSync(workflowPath, `${JSON.stringify(workflow, null, 2)}\n`, 'utf8');
  } catch {
    // best-effort — the tool is already verified; this is only metadata.
  }

  const pending = paramVerification.filter((entry) => !entry.verified);
  if (pending.length === 0) return [];
  const details = pending
    .map((entry) => `${entry.name}: ${entry.reason ?? 'unverified'}`)
    .join(', ');
  return [
    `${pending.length} parameter(s) live-unverified at compile time (${details}) — exercised at runtime via the stealth-fetch / playbook ladder.`,
  ];
}
1947
+
1948
/**
 * Record on workflow.json whether the tool was live-verified.
 *
 * When a waiver is supplied (the integration test could not produce live data
 * because of an anti-bot block or infra exhaustion) the tool still ships, but
 * the workflow is stamped `liveVerified: false` together with the structured
 * waiver, so the audit gate and the teach summary can flag it. Without a
 * waiver the workflow is stamped `liveVerified: true` and any previous waiver
 * is cleared.
 *
 * Best-effort: a missing or unparsable workflow.json is left untouched and a
 * failed write is ignored.
 *
 * @param toolDir Directory containing workflow.json.
 * @param liveVerification Waiver details, or `undefined` when live-verified.
 */
export function applyLiveVerification(
  toolDir: string,
  liveVerification:
    | { kind: 'waived-bot' | 'waived-infra'; firstError: string; exhaustedBackends: string[] }
    | undefined,
): void {
  const workflowPath = pathJoin(toolDir, 'workflow.json');
  if (!existsSync(workflowPath)) return;

  let workflow: Record<string, unknown>;
  try {
    workflow = JSON.parse(readFileSync(workflowPath, 'utf8'));
  } catch {
    return;
  }

  workflow.liveVerified = !liveVerification;
  // An undefined value is omitted on serialization, which clears a stale waiver.
  workflow.liveVerifiedWaiver = liveVerification || undefined;

  try {
    writeFileSync(workflowPath, `${JSON.stringify(workflow, null, 2)}\n`, 'utf8');
  } catch {
    // best-effort — non-fatal
  }
}
1984
+
1985
/**
 * Reduce a workflow URL to its origin and path so it can be compared with a
 * recorded request URL. Every `${...}` placeholder is replaced with the fixed
 * token `X` before parsing, and the query string is not part of the result.
 *
 * @returns `{ origin, path }`, or `null` when the URL cannot be parsed even
 *   after substitution (for example when a placeholder sits in the scheme or
 *   host); callers then fall back to substring matching.
 */
function normalizeUrlForMatch(rawUrl: string): { origin: string; path: string } | null {
  const withoutPlaceholders = rawUrl.replace(/\$\{[^}]+\}/g, 'X');
  let parsed: URL;
  try {
    parsed = new URL(withoutPlaceholders);
  } catch {
    return null;
  }
  const { origin, pathname } = parsed;
  return { origin, path: pathname };
}
2000
+
2001
/**
 * Collect the recorded requests that have the same HTTP method
 * (case-insensitive) and the same origin + path as a workflow request. Used by
 * the capture cross-reference and hardcoded-body checks.
 *
 * @param session Recording whose `requests` are searched.
 * @param method HTTP method of the workflow request.
 * @param url Workflow request URL; may contain `${...}` placeholders.
 * @param restrictToSeqs When given, only requests whose `seq` is in this set
 *   are considered.
 * @returns Matching recorded requests in recording order; empty when the
 *   workflow URL cannot be normalized.
 */
function findRecordedMatches(
  session: Session,
  method: string,
  url: string,
  restrictToSeqs?: Set<number>,
): CapturedRequest[] {
  const target = normalizeUrlForMatch(url);
  if (!target) return [];
  const wantedMethod = method.toUpperCase();

  const matches: CapturedRequest[] = [];
  for (const recorded of session.requests) {
    if (restrictToSeqs && !restrictToSeqs.has(recorded.seq)) continue;
    if (recorded.method.toUpperCase() !== wantedMethod) continue;
    const location = normalizeUrlForMatch(recorded.url);
    if (!location) continue;
    if (location.origin === target.origin && location.path === target.path) {
      matches.push(recorded);
    }
  }
  return matches;
}
2020
+
2021
/**
 * Look up a header by name, ignoring case. Recorded header names keep whatever
 * casing they were captured with, so a direct property access is not reliable.
 *
 * @returns The value of the first header whose name matches, or `undefined`
 *   when no header matches.
 */
function headerValue(headers: Record<string, string>, name: string): string | undefined {
  const wanted = name.toLowerCase();
  const entry = Object.entries(headers).find(([key]) => key.toLowerCase() === wanted);
  return entry?.[1];
}
2031
+
2032
/**
 * Report whether the `Set-Cookie` header in `headers` defines a cookie named
 * `cookieName`.
 *
 * Several cookies may be folded into one header value, separated by newlines
 * or by commas. The split only breaks on a comma that is followed (after
 * optional whitespace) by a letter or underscore. Within each piece, the text
 * before the first `=` is treated as the cookie name.
 *
 * @param headers Response header map (looked up case-insensitively).
 * @param cookieName Exact cookie name to look for.
 * @returns `true` when any piece's name equals `cookieName`; `false` when the
 *   header is absent/empty or no piece matches.
 */
function setCookieDefines(headers: Record<string, string>, cookieName: string): boolean {
  const joined = headerValue(headers, 'set-cookie');
  if (!joined) return false;
  return joined.split(/\n|,(?=\s*[A-Za-z_])/).some((entry) => {
    const eqAt = entry.indexOf('=');
    return eqAt >= 0 && entry.slice(0, eqAt).trim() === cookieName;
  });
}
2047
+
2048
/**
 * Fix A — cross-reference each declared capture that is not `required: false`
 * against the recording, so a capture whose declared source does not carry the
 * value in the recorded response is reported before it can fail at runtime.
 * General — not specific to any one capture source or site.
 *
 * For the bootstrap (treated as a `GET` of `workflow.bootstrap.url`) and for
 * each workflow request, the first matching recorded request is located —
 * first within `candidateRequestSeqs`, then (because a dependency such as the
 * bootstrap page may fall outside that set) across the whole recording. When
 * no recorded request matches, nothing can be proven and the captures are
 * skipped without a failure.
 *
 * @param workflow Parsed workflow definition.
 * @param session Recording to check against.
 * @param candidateRequestSeqs Optional seqs to prefer when locating the
 *   recorded request.
 * @returns Failure messages (bootstrap captures first, then per-request in
 *   workflow order) and the names of the captures that failed.
 */
function crossReferenceCaptures(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
  candidateRequestSeqs?: number[],
): { failures: string[]; failedCaptureNames: Set<string> } {
  const failures: string[] = [];
  const failedCaptureNames = new Set<string>();
  const restrictSet = candidateRequestSeqs ? new Set(candidateRequestSeqs) : undefined;

  // Validate one group of captures that all read from the same recorded
  // request. The recorded request depends only on (method, url), so it is
  // resolved once per group instead of once (or twice) per capture.
  const checkGroup = (
    captures: ReadonlyArray<BootstrapCapture | RequestCapture>,
    method: string,
    url: string,
    context: string,
  ): void => {
    const required = captures.filter((cap) => cap.required !== false);
    if (required.length === 0) return;

    // Prefer a match inside the candidate seqs; fall back to the unrestricted
    // recording so dependencies outside the candidate set can still be checked.
    const recorded =
      findRecordedMatches(session, method, url, restrictSet)[0] ??
      findRecordedMatches(session, method, url)[0];
    // Out of scope; do not fail — we can't prove anything.
    if (!recorded) return;

    for (const cap of required) {
      const fail = validateCaptureAgainstRecording(cap, recorded, context);
      if (fail) {
        failures.push(fail);
        failedCaptureNames.add(cap.name);
      }
    }
  };

  // Bootstrap captures
  if (workflow.bootstrap?.captures) {
    checkGroup(workflow.bootstrap.captures, 'GET', workflow.bootstrap.url, 'bootstrap GET');
  }

  // Per-request captures
  for (const [i, req] of workflow.requests.entries()) {
    if (!req.captures) continue;
    checkGroup(req.captures, req.method, req.url, `request[${i}] ${req.method} ${req.url}`);
  }

  return { failures, failedCaptureNames };
}
2104
+
2105
/**
 * Fix 2 — cross-reference every capture that a request actually DEPENDS ON
 * (referenced as `${state.X}` in a request url, body or header value) against
 * the recording, regardless of the capture's `required` flag.
 *
 * Fix A only looks at captures that are not `required: false`, and only at the
 * response of the capture's own URL. That misses the shape where a
 * `required: false` html_regex capture (csrf / csp-nonce) is scraped from a
 * bootstrap page that is not itself in the recording while a request still
 * hard-references `${state.csrf_token}`. The failure produced here tells the
 * agent to fix the pattern (or the source).
 *
 * Scope: only `html_regex` / `text_regex` captures are checked, by testing the
 * pattern against recorded HTML document bodies. Bodies from the bootstrap
 * URL's origin are preferred; when there are none, every recorded HTML body is
 * used. Referenced names with no declared capture, captures with other
 * sources, and the case of no recorded HTML at all are skipped (an invalid
 * regex is still reported). General — not specific to any site or token.
 *
 * @param workflow Parsed workflow definition.
 * @param session Recording whose HTML bodies are tested.
 * @returns Failure messages and the names of the captures that failed.
 */
export function crossReferenceReferencedStateCaptures(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
): { failures: string[]; failedCaptureNames: Set<string> } {
  const failures: string[] = [];
  const failedCaptureNames = new Set<string>();

  // 1) Every ${state.X} name that appears in a request url, body or header value.
  const stateRefRe = /\$\{state\.([A-Za-z0-9_]+)\}/g;
  const referenced = new Set<string>();
  for (const req of workflow.requests) {
    const templatedTexts = [req.url, req.body, ...Object.values(req.headers ?? {})];
    for (const text of templatedTexts) {
      if (!text) continue;
      for (const hit of text.matchAll(stateRefRe)) {
        const refName = hit[1];
        if (refName) referenced.add(refName);
      }
    }
  }
  if (referenced.size === 0) return { failures, failedCaptureNames };

  // 2) Captures by name: bootstrap first, then per-request, so a later
  //    declaration with the same name replaces an earlier one.
  const capByName = new Map<string, BootstrapCapture | RequestCapture>();
  for (const cap of workflow.bootstrap?.captures ?? []) capByName.set(cap.name, cap);
  for (const cap of workflow.requests.flatMap((req) => req.captures ?? [])) {
    capByName.set(cap.name, cap);
  }

  // 3) Recorded HTML document bodies. Same-origin-as-bootstrap documents are
  //    preferred; the bootstrap page itself may be absent from the recording,
  //    so fall back to every HTML document when none share its origin.
  let targetOrigin: string | undefined;
  try {
    if (workflow.bootstrap?.url) targetOrigin = new URL(workflow.bootstrap.url).origin;
  } catch {
    /* unparseable bootstrap URL: leave the origin filter disabled */
  }
  const isHtmlDoc = (r: CapturedRequest): boolean => {
    const mime = r.response?.mimeType ?? '';
    const looksLikeHtml = mime.includes('text/html') || r.resourceType === 'Document';
    if (!looksLikeHtml) return false;
    const body = r.response?.body;
    return typeof body === 'string' && body.length > 0;
  };
  const sameOrigin = (r: CapturedRequest): boolean => {
    if (!targetOrigin) return true;
    try {
      return new URL(r.url).origin === targetOrigin;
    } catch {
      return false;
    }
  };
  const htmlDocs = session.requests.filter(isHtmlDoc);
  const preferredDocs = htmlDocs.filter(sameOrigin);
  const htmlBodies = (preferredDocs.length > 0 ? preferredDocs : htmlDocs).map(
    (r) => r.response?.body ?? '',
  );

  // 4) Each referenced name backed by an html_regex/text_regex capture must
  //    match at least one of those bodies.
  for (const name of referenced) {
    const cap = capByName.get(name);
    // No declared capture: the value may be seeded elsewhere — not statically known.
    if (!cap) continue;
    if (cap.source !== 'html_regex' && cap.source !== 'text_regex') continue;
    if (failedCaptureNames.has(name)) continue;

    let re: RegExp;
    try {
      re = new RegExp(cap.pattern);
    } catch (err) {
      failures.push(
        `capture "${name}" (referenced via \${state.${name}} in a request) has an invalid regex /${cap.pattern}/: ${err instanceof Error ? err.message : String(err)}.`,
      );
      failedCaptureNames.add(name);
      continue;
    }

    // Nothing recorded to test the pattern against.
    if (htmlBodies.length === 0) continue;
    if (htmlBodies.some((body) => re.test(body))) continue;

    failures.push(
      `capture "${name}" (source "${cap.source}") is referenced via \${state.${name}} in a request, but its pattern /${cap.pattern}/ does not match ANY recorded HTML page body for this site. At runtime \${state.${name}} resolves to nothing → the request fails with STATE_MISSING. Fix the pattern to match the token as it actually appears in the recorded page (inspect the recorded HTML), or change the capture source. (required:${cap.required === false ? 'false' : 'true'} does not exempt this — the request hard-references the value.)`,
    );
    failedCaptureNames.add(name);
  }

  return { failures, failedCaptureNames };
}
2211
+
2212
/**
 * Check a single capture against the recorded request it is expected to read
 * from.
 *
 * - `response_header`: the recorded response must carry a non-empty header
 *   named `cap.header` (case-insensitive).
 * - `cookie`: the recorded `Set-Cookie` must define `cap.cookie`.
 * - `html_regex` / `text_regex`: `cap.pattern` must compile and match the
 *   recorded response body.
 * - every other source (`json`, DOM and storage sources, …) is not validated
 *   here and always passes.
 *
 * @param cap Capture declaration to validate.
 * @param recorded Recorded request whose response is inspected.
 * @param context Human-readable location used in the failure message.
 * @returns A failure message, or `null` when the capture passes or is not
 *   statically checkable.
 */
function validateCaptureAgainstRecording(
  cap: BootstrapCapture | RequestCapture,
  recorded: CapturedRequest,
  context: string,
): string | null {
  const recordedHeaders = recorded.response?.headers ?? {};
  const failure = (suggestion: string) =>
    `capture "${cap.name}" on ${context}: declared source "${cap.source}" did not produce a value in the recording (seq=${recorded.seq}). ${suggestion}`;

  if (cap.source === 'response_header') {
    const value = headerValue(recordedHeaders, cap.header);
    if (value && value.length > 0) return null;
    return failure(
      `The recorded response has no "${cap.header}" header. Inspect the recorded response headers for a header that actually carries this value, or switch to source: 'html_regex' / 'cookie' / 'dom_*' if the value lives elsewhere.`,
    );
  }

  if (cap.source === 'cookie') {
    if (setCookieDefines(recordedHeaders, cap.cookie)) return null;
    return failure(
      `The recorded response Set-Cookie does not define cookie "${cap.cookie}". Check the recorded response headers and pick the correct cookie name, or switch source if the value isn't in a cookie.`,
    );
  }

  if (cap.source === 'html_regex' || cap.source === 'text_regex') {
    let matched: boolean;
    try {
      matched = new RegExp(cap.pattern).test(recorded.response?.body ?? '');
    } catch (err) {
      return failure(
        `Pattern is not a valid regex: ${err instanceof Error ? err.message : String(err)}.`,
      );
    }
    if (matched) return null;
    return failure(
      `Pattern /${cap.pattern}/ does not match the recorded response body. The token may live in a different location — check response headers (use source: 'response_header'), Set-Cookie (use source: 'cookie'), or revise the pattern.`,
    );
  }

  // 'json' uses a path expression (static validation is fragile), and the
  // remaining sources (dom_attribute, dom_text, local_storage, session_storage)
  // are not statically verifiable from a HAR-style recording. Nothing to check.
  return null;
}
2263
+
2264
/**
 * Fix B — detect request body fields hardcoded to a value taken from the
 * recording when the recording shows the field varies across multiple recorded
 * invocations of the same endpoint (i.e. it behaves like user input). The
 * resulting failure tells the agent to expose the field as `${param.X}` or to
 * use a requestTransformModule. General — not specific to any one site.
 *
 * For each workflow request with a body, the matching recorded requests are
 * parsed into flat field maps. A field is an offender when it has two or more
 * distinct recorded values and the workflow body carries one of those recorded
 * values as a literal with no `${param|state|credential|response.…}`
 * placeholder in it. A literal that differs from every recorded value is
 * treated as a deliberate default and is not flagged.
 *
 * @param workflow Parsed workflow definition.
 * @param session Recording providing the variation pool.
 * @param candidateRequestSeqs Seqs of the tool's own recorded requests.
 * @param dependencyRequestSeqs Additional seqs that widen the variation pool.
 *   When both lists are empty/absent the whole recording is searched.
 * @returns One failure message per offending workflow request; empty when the
 *   workflow declares a `requestTransformModule`.
 */
function detectHardcodedSessionValues(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
  candidateRequestSeqs?: number[],
  dependencyRequestSeqs?: number[],
): string[] {
  // Skip the whole check when the workflow uses a requestTransformModule: any
  // literal in workflow.json's body may be overridden at runtime by the
  // transform, and second-guessing it statically is the wrong layer.
  if (workflow.requestTransformModule) return [];

  // Upper bound on how many distinct recorded values are quoted per field in
  // the failure message. Display only — detection uses the full set.
  const MAX_DISPLAYED_VALUES = 4;

  const failures: string[] = [];
  const allowedSeqs = new Set<number>([
    ...(candidateRequestSeqs ?? []),
    ...(dependencyRequestSeqs ?? []),
  ]);
  const restrictSet = allowedSeqs.size > 0 ? allowedSeqs : undefined;

  for (const [i, req] of workflow.requests.entries()) {
    if (!req.body || req.body.length === 0) continue;

    const matches = findRecordedMatches(session, req.method, req.url, restrictSet);
    if (matches.length < 2) continue;
    const firstMatch = matches[0];
    if (!firstMatch) continue;

    // Pick the body parser from the recorded Content-Type (the workflow may
    // have stripped headers). The workflow-side fallback is looked up
    // case-insensitively and tolerates a missing header map.
    const recordedCt =
      headerValue(firstMatch.headers, 'content-type') ??
      headerValue(req.headers ?? {}, 'content-type') ??
      '';

    const parsed = matches
      .map((m) => parseBodyForFieldExtraction(m.body ?? '', recordedCt))
      .filter((p): p is Record<string, string> => p !== null);
    if (parsed.length < 2) continue;

    // Collect distinct values per field
    const valuesByField = new Map<string, Set<string>>();
    for (const map of parsed) {
      for (const [k, v] of Object.entries(map)) {
        let bucket = valuesByField.get(k);
        if (!bucket) {
          bucket = new Set();
          valuesByField.set(k, bucket);
        }
        bucket.add(v);
      }
    }

    // Keep only the fields that actually vary across invocations.
    const varying = [...valuesByField].filter(([, distinct]) => distinct.size >= 2);
    if (varying.length === 0) continue;

    const workflowParsed = parseBodyForFieldExtraction(req.body, recordedCt);
    if (!workflowParsed) continue;

    const offenders: Array<{ field: string; literal: string; distinctValues: string[] }> = [];
    for (const [field, distinct] of varying) {
      const wfValue = workflowParsed[field];
      if (wfValue === undefined) continue;
      // If the workflow value contains ANY placeholder, it's templated → OK.
      if (/\$\{(param|state|credential|response)\.[A-Za-z0-9_[\]]+\}/.test(wfValue)) continue;
      // The workflow value is a literal. Flag it only when it equals one of
      // the recorded values (checked against the FULL distinct set, so a value
      // beyond the display cap is still caught); a literal that matches no
      // recorded value is assumed to be a sensible default.
      if (distinct.has(wfValue)) {
        offenders.push({
          field,
          literal: wfValue,
          distinctValues: [...distinct].slice(0, MAX_DISPLAYED_VALUES),
        });
      }
    }

    if (offenders.length > 0) {
      const lines = offenders.map(
        (o) =>
          ` ${o.field}=${JSON.stringify(o.literal)} — recorded values across seqs: [${o.distinctValues
            .map((v) => JSON.stringify(v))
            .join(', ')}]`,
      );
      failures.push(
        `request[${i}] ${req.method} ${req.url} body has ${offenders.length} field(s) frozen to one recorded user's session — the recording proves these are user input:\n${lines.join('\n')}\nReplace each with \${param.NAME} and add the parameter to workflow.parameters, OR move body construction into a requestTransformModule.`,
      );
    }
  }

  return failures;
}
2359
+
2360
/**
 * Parse a request body into a flat field→value map for variation analysis.
 *
 * - Form-urlencoded (by Content-Type, or — when no Content-Type is given — a
 *   body containing both `=` and `&`): each `key=value` pair is decoded (`+`
 *   becomes a space, percent-escapes are decoded). Pairs without `=` are
 *   ignored; a repeated key keeps its last value.
 * - JSON (by Content-Type, or — when no Content-Type is given — a body whose
 *   trimmed text starts with `{`): only top-level string, number and boolean
 *   members are kept, stringified.
 *
 * A malformed percent-escape (e.g. `rate=100%`) does not throw: that component
 * is kept undecoded, so one odd value cannot abort the whole check.
 *
 * @param body Raw request body text.
 * @param contentType Content-Type header value, or `''` when unknown.
 * @returns The flat field map, or `null` for shapes the check can't reason
 *   about (non-object JSON, unparseable JSON, any other content type).
 */
function parseBodyForFieldExtraction(
  body: string,
  contentType: string,
): Record<string, string> | null {
  const ct = contentType.toLowerCase();

  // decodeURIComponent throws URIError on malformed escapes; fall back to the
  // raw (plus-normalized) text rather than crashing the caller.
  const decodeComponent = (raw: string): string => {
    const spaced = raw.replace(/\+/g, ' ');
    try {
      return decodeURIComponent(spaced);
    } catch {
      return spaced;
    }
  };

  if (
    ct.includes('application/x-www-form-urlencoded') ||
    (!ct && body.includes('=') && body.includes('&'))
  ) {
    const out: Record<string, string> = {};
    for (const pair of body.split('&')) {
      const eq = pair.indexOf('=');
      if (eq < 0) continue;
      out[decodeComponent(pair.slice(0, eq))] = decodeComponent(pair.slice(eq + 1));
    }
    return out;
  }

  if (ct.includes('application/json') || (ct === '' && body.trim().startsWith('{'))) {
    try {
      const parsed = JSON.parse(body);
      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
        const out: Record<string, string> = {};
        for (const [k, v] of Object.entries(parsed)) {
          if (typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean') {
            out[k] = String(v);
          }
        }
        return out;
      }
    } catch {
      // Body is not valid JSON — fall through to the "can't reason about" result.
    }
  }

  return null;
}
2400
+
1108
2401
  export async function externalVerification(
1109
2402
  toolDir: string,
1110
2403
  session: Session,
@@ -1113,10 +2406,52 @@ export async function externalVerification(
1113
2406
  expectedToolName?: string;
1114
2407
  likelyParams?: Array<{ name: string; type?: string; description?: string }>;
1115
2408
  candidateRequestSeqs?: number[];
2409
+ /** Shared modules the build plan assigned to this tool. The verifier asserts
2410
+ * each verified module is actually imported (no silent re-implementation). */
2411
+ assignedSharedModules?: AssignedSharedModule[];
2412
+ /** Producer→consumer token contracts the build plan declared for this tool:
2413
+ * each `param` is minted by `sourceTool`'s `sourceField` output. Such params
2414
+ * require a chained `param:<name>` test (mint a fresh value from the producer)
2415
+ * and are stamped with `sourcedFrom` on success. */
2416
+ tokenParams?: Array<{ param: string; sourceTool: string; sourceField: string }>;
2417
+ /** Fields the build plan requires THIS tool's parser to emit for sibling
2418
+ * consumers (producer side). The verifier fails the tool if a declared field
2419
+ * is not emitted, so the producer/consumer field name can't silently diverge
2420
+ * (e.g. the plan says `hotel_id` but the parser emits `propertyToken`). */
2421
+ emittedTokens?: Array<{ field: string; shape: string }>;
2422
+ /** Build-plan-declared dependency seqs (e.g. bootstrap GET seq, producer
2423
+ * search seq) used by the hardcoded-body check to widen its variation
2424
+ * pool beyond the tool's own load-bearing seqs. */
2425
+ dependencyRequestSeqs?: number[];
1116
2426
  } = {},
1117
- ): Promise<{ failures: string[]; warnings: string[] }> {
2427
+ ): Promise<{
2428
+ failures: string[];
2429
+ warnings: string[];
2430
+ paramVerification: ParamVerification[];
2431
+ /** Set when the integration test was waived rather than passing live — the
2432
+ * caller should stamp this onto workflow.json so audit/teach can surface
2433
+ * the unverified state instead of silently treating the tool as live. */
2434
+ liveVerification?: {
2435
+ kind: 'waived-bot' | 'waived-infra';
2436
+ firstError: string;
2437
+ exhaustedBackends: string[];
2438
+ };
2439
+ }> {
1118
2440
  const failures: string[] = [];
1119
2441
  const warnings: string[] = [];
2442
+ const paramVerification: ParamVerification[] = [];
2443
+ let liveVerification:
2444
+ | { kind: 'waived-bot' | 'waived-infra'; firstError: string; exhaustedBackends: string[] }
2445
+ | undefined;
2446
+ // Captures Fix A flagged as having a wrong source. Surfaced into the
2447
+ // waiver classification (Fix C) so a STATE_MISSING traced to one of these
2448
+ // captures cannot silently become `waived-infra`.
2449
+ let failedCaptureNames = new Set<string>();
2450
+ // Fix 3 — when a request-referenced ${state.X} capture provably can't resolve
2451
+ // (Fix 2 below), the live integration call is GUARANTEED to STATE_MISSING, so
2452
+ // firing it is pure waste that also burns the per-IP anti-bot rate budget.
2453
+ // Skip the live test in that case and make the agent fix the capture first.
2454
+ let referencedStateBroken = false;
1120
2455
 
1121
2456
  const workflowPath = pathJoin(toolDir, 'workflow.json');
1122
2457
  const parserPath = pathJoin(toolDir, 'parser.ts');
@@ -1141,6 +2476,38 @@ export async function externalVerification(
1141
2476
  );
1142
2477
  }
1143
2478
 
2479
+ // Fix A — cross-reference every required capture against the recording.
2480
+ // A capture that declares `response_header` but reads from a recorded
2481
+ // response with no such header (or `html_regex` whose pattern doesn't
2482
+ // match the recorded body, etc.) will silently return null at runtime;
2483
+ // we reject it at compile so the agent picks a source that works.
2484
+ const crossRef = crossReferenceCaptures(workflow, session, opts.candidateRequestSeqs);
2485
+ failures.push(...crossRef.failures);
2486
+ failedCaptureNames = crossRef.failedCaptureNames;
2487
+
2488
+ // Fix 2 — cross-reference captures that a request DEPENDS ON via
2489
+ // `${state.X}` (e.g. an anti-bot csrf/csp-nonce html_regex capture whose
2490
+ // bootstrap page isn't in the recording) against every recorded HTML body,
2491
+ // regardless of `required`. Catches the silent STATE_MISSING that ships a
2492
+ // .act tool which can never resolve its csrf header at runtime.
2493
+ const stateRef = crossReferenceReferencedStateCaptures(workflow, session);
2494
+ failures.push(...stateRef.failures);
2495
+ for (const n of stateRef.failedCaptureNames) failedCaptureNames.add(n);
2496
+ if (stateRef.failedCaptureNames.size > 0) referencedStateBroken = true;
2497
+
2498
+ // Fix B — flag request body fields hardcoded to one recorded user's
2499
+ // session when the recording proves those fields are user input
2500
+ // (varying values across multiple recorded invocations of the same
2501
+ // endpoint). Skipped when the tool uses a requestTransformModule.
2502
+ failures.push(
2503
+ ...detectHardcodedSessionValues(
2504
+ workflow,
2505
+ session,
2506
+ opts.candidateRequestSeqs,
2507
+ opts.dependencyRequestSeqs,
2508
+ ),
2509
+ );
2510
+
1144
2511
  if (opts.likelyParams && opts.likelyParams.length > 0) {
1145
2512
  // Build the set of query param keys from the original recorded URLs
1146
2513
  // so we can distinguish real API params from invented ones.
@@ -1237,6 +2604,17 @@ export async function externalVerification(
1237
2604
  }
1238
2605
  }
1239
2606
 
2607
+ // Shared-module reuse: when the build plan assigned this tool a verified
2608
+ // shared module, the tool's artifacts MUST import it rather than duplicating
2609
+ // the logic. This is the anti-duplication gate for multi-tool teach runs.
2610
+ if (
2611
+ opts.assignedSharedModules &&
2612
+ opts.assignedSharedModules.length > 0 &&
2613
+ existsSync(workflowPath)
2614
+ ) {
2615
+ failures.push(...assertSharedModuleImports(toolDir, workflowPath, opts.assignedSharedModules));
2616
+ }
2617
+
1240
2618
  if (!existsSync(parserPath)) {
1241
2619
  failures.push('parser.ts was not written');
1242
2620
  } else {
@@ -1256,85 +2634,299 @@ export async function externalVerification(
1256
2634
  failures.push('parser.test.ts was not written');
1257
2635
  } else {
1258
2636
  const src = readFileSync(parserTestPath, 'utf8');
1259
- const expectMatches = src.match(/expect\s*\(/g) ?? [];
1260
- if (expectMatches.length < 3) {
1261
- failures.push(`parser.test.ts has only ${expectMatches.length} expect() calls; need ≥3`);
2637
+ const expectCount = countExpectCalls(src);
2638
+ if (expectCount < 3) {
2639
+ failures.push(`parser.test.ts has only ${expectCount} expect() calls; need ≥3`);
1262
2640
  }
1263
-
1264
- const trivialPatterns = [
1265
- /expect\s*\(\s*true\s*\)\.toBe\s*\(\s*true\s*\)/,
1266
- /expect\s*\(\s*false\s*\)\.toBe\s*\(\s*false\s*\)/,
1267
- /expect\s*\(\s*1\s*\)\.toBe\s*\(\s*1\s*\)/,
1268
- /expect\s*\(\s*0\s*\)\.toBe\s*\(\s*0\s*\)/,
1269
- /expect\s*\(\s*null\s*\)\.toBeNull/,
1270
- /expect\s*\(\s*undefined\s*\)\.toBeUndefined/,
1271
- /expect\s*\(\s*"[^"]*"\s*\)\.toBe\s*\(\s*"[^"]*"\s*\)/,
1272
- /expect\s*\(\s*'[^']*'\s*\)\.toBe\s*\(\s*'[^']*'\s*\)/,
1273
- ];
1274
- for (const pattern of trivialPatterns) {
1275
- if (pattern.test(src)) {
1276
- failures.push(
1277
- 'parser.test.ts contains trivial tautological assertions like expect(true).toBe(true) — tests must reference real values',
1278
- );
1279
- break;
1280
- }
2641
+ if (hasTrivialAssertion(src)) {
2642
+ failures.push(
2643
+ 'parser.test.ts contains trivial tautological assertions like expect(true).toBe(true) — tests must reference real values',
2644
+ );
2645
+ }
2646
+ // Fix E: the zero/empty-result contract. The recording has no no-match
2647
+ // response, so the only way to verify empty-handling is a synthetic case.
2648
+ if (!src.includes('synthetic:empty-result')) {
2649
+ failures.push(
2650
+ 'parser.test.ts is missing the required `synthetic:empty-result` test — add a test titled `synthetic:empty-result …` that feeds extract() a no-match / empty-items response and asserts it returns a clean empty collection (length 0), never a single all-null placeholder record. See prompts/compile-agent.md.',
2651
+ );
1281
2652
  }
1282
2653
  }
1283
2654
 
1284
2655
  if (existsSync(parserTestPath)) {
1285
- const result = await runCommand(`bun test ${parserTestPath}`, toolDir, 120000, {
2656
+ const run = await runBunTestWithResults(parserTestPath, toolDir, 120000, {
1286
2657
  [SESSION_PATH_ENV]: sessionPath,
1287
2658
  });
1288
- const output = JSON.parse(result.result) as {
1289
- stdout: string;
1290
- stderr: string;
1291
- exitCode: number;
1292
- };
1293
- if (output.exitCode !== 0) {
2659
+ if (run.exitCode !== 0) {
1294
2660
  failures.push(
1295
- `bun test parser.test.ts exited ${output.exitCode}\nstdout:\n${output.stdout}\nstderr:\n${output.stderr}`,
2661
+ `bun test parser.test.ts exited ${run.exitCode}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
2662
+ );
2663
+ }
2664
+ // The synthetic empty-result test must actually RUN GREEN, not merely be
2665
+ // present in source — a failed/absent synthetic test leaves empty-handling
2666
+ // unverified (R1: phantom all-null record on a zero-result input).
2667
+ const ranAnyTest = run.passed.size + run.failed.size > 0;
2668
+ const syntheticPassed = [...run.passed].some((n) => n.includes('synthetic:empty-result'));
2669
+ if (ranAnyTest && !syntheticPassed) {
2670
+ failures.push(
2671
+ 'the `synthetic:empty-result` parser test did not pass — extract() must return a clean empty collection for a no-match/empty response (not a phantom record). Fix the parser or the test.',
1296
2672
  );
1297
2673
  }
1298
2674
  }
1299
2675
 
2676
+ // Run the live integration suite and classify the outcome. The per-param
2677
+ // coverage check below trusts the test *runner* (which named tests actually
2678
+ // ran green) rather than a static source scan, so a suite that was waived by
2679
+ // anti-bot can no longer be counted as "covered".
1300
2680
  const integrationTestPath = pathJoin(toolDir, 'integration.test.ts');
2681
+ let integrationOutcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed' | 'absent' = 'absent';
2682
+ let integrationPassedTests = new Set<string>();
1301
2683
  if (!existsSync(integrationTestPath)) {
1302
2684
  failures.push(
1303
2685
  'integration.test.ts was not written — the tool must include a live API test that calls the workflow and verifies it returns real data',
1304
2686
  );
2687
+ } else if (referencedStateBroken) {
2688
+ // A request hard-references a ${state.X} whose html_regex capture provably
2689
+ // does not match the recorded page (Fix 2 already pushed the actionable
2690
+ // failure). The live call WOULD STATE_MISSING — running it can't pass and
2691
+ // would only spend a live anti-bot .act and deepen the per-IP rate flag.
2692
+ // Skip it; the agent must fix the capture, then the next cycle verifies live.
2693
+ integrationOutcome = 'failed';
2694
+ warnings.push(
2695
+ 'skipped the live integration test: a request references a ${state.X} capture (e.g. csrf/csp-nonce) whose pattern does not match the recorded page, so the live call is guaranteed to fail with STATE_MISSING. Fix the capture pattern/source (see the failure above) — the next verification cycle will run the live test once it can succeed. This avoids burning a doomed anti-bot .act call.',
2696
+ );
1305
2697
  } else {
1306
- let integrationPassed = false;
1307
- let lastOutput = { stdout: '', stderr: '', exitCode: 1 };
2698
+ // Scale the verifier's live-test timeout to the suite size: the baseline plus
2699
+ // one live `runWorkflowWithLadder` per param, each gated by the ~25s compile
2700
+ // pacing and a possible cdp cold start. A flat 60s truncated paced anti-bot
2701
+ // suites mid-run, and the partial output then misclassified as a bot block.
2702
+ // Cap it so a genuinely wedged suite can't run away.
2703
+ const paramCount = opts.likelyParams?.length ?? 0;
2704
+ const pacingMs = Number(process.env.IMPRINT_COMPILE_ACT_SPACING_MS ?? 25_000) || 0;
2705
+ const verifierTimeoutMs = Math.min(120_000 + paramCount * (pacingMs + 20_000), 10 * 60_000);
2706
+ let run: BunTestRun = {
2707
+ stdout: '',
2708
+ stderr: '',
2709
+ exitCode: 1,
2710
+ timedOut: false,
2711
+ passed: new Set(),
2712
+ failed: new Set(),
2713
+ };
1308
2714
  for (let attempt = 0; attempt < 3; attempt++) {
1309
- const result = await runCommand(`bun test ${integrationTestPath}`, toolDir, 60000);
1310
- lastOutput = JSON.parse(result.result) as {
1311
- stdout: string;
1312
- stderr: string;
1313
- exitCode: number;
2715
+ run = await runBunTestWithResults(integrationTestPath, toolDir, verifierTimeoutMs);
2716
+ if (run.exitCode === 0) break;
2717
+ // A timeout, bot-defense, or ladder-exhaustion failure will NOT clear on a
2718
+ // retry — re-running only fires more state-changing calls and deepens the
2719
+ // per-IP rate flag. One attempt is enough to classify it; stop early.
2720
+ if (run.timedOut) break;
2721
+ const out = `${run.stdout}\n${run.stderr}`;
2722
+ const ladderExhausted =
2723
+ /\bRATE_LIMITED\b|\bFORBIDDEN\b|\bNETWORK\b/.test(out) &&
2724
+ /non-escalatable|giving up|ladder exhausted|all backends failed/.test(out);
2725
+ if (isBotDefenseFailure(out) || ladderExhausted) break;
2726
+ }
2727
+ integrationPassedTests = run.passed;
2728
+
2729
+ const verdict = classifyIntegrationOutcome({
2730
+ exitCode: run.exitCode,
2731
+ timedOut: run.timedOut,
2732
+ combined: `${run.stdout}\n${run.stderr}`,
2733
+ passedTests: run.passed,
2734
+ referencedStateBroken: false, // the broken-capture case is handled above
2735
+ failedCaptureNames,
2736
+ });
2737
+ integrationOutcome = verdict.outcome;
2738
+
2739
+ if (verdict.outcome === 'passed') {
2740
+ // exitCode 0 — nothing to surface.
2741
+ } else if (verdict.captureFailName !== null) {
2742
+ const capName = verdict.captureFailName;
2743
+ // If the failing capture is a `response_header` on a REPLAYED workflow
2744
+ // request, the cause is almost always the replay asymmetry: programmatic
2745
+ // fetch reliably receives the response BODY and Set-Cookie, but anti-bot
2746
+ // edges withhold browser-only response headers from non-browser requests.
2747
+ let sourceHint = '';
2748
+ try {
2749
+ const wf = JSON.parse(readFileSync(workflowPath, 'utf8')) as {
2750
+ requests?: Array<{ captures?: Array<{ name: string; source: string }> }>;
2751
+ bootstrap?: { captures?: Array<{ name: string; source: string }> };
2752
+ };
2753
+ const reqCap = (wf.requests ?? [])
2754
+ .flatMap((r) => r.captures ?? [])
2755
+ .find((c) => c.name === capName);
2756
+ if (reqCap?.source === 'response_header') {
2757
+ sourceHint = ` The capture uses source: 'response_header' on a replayed request. Programmatic replay does NOT receive browser-only response headers that anti-bot edges withhold — but it DOES receive the response body and Set-Cookie. If this token also appears in the HTML body, switch to source: 'text_regex' (read it from the body); if it is set as a cookie, switch to source: 'cookie'. Reserve 'response_header' for a workflow.bootstrap capture (a real Chrome navigation), not a replayed request.`;
2758
+ }
2759
+ } catch {
2760
+ // best-effort hint only
2761
+ }
2762
+ failures.push(
2763
+ `integration test failed because a declared capture did not produce a value at runtime: capture "${capName}" returned null${
2764
+ verdict.captureFailFromKnown
2765
+ ? ' (matches a capture flagged by the compile-time cross-reference check)'
2766
+ : ''
2767
+ }. This is a workflow-correctness error, not infra — fix the capture source/path in workflow.json so it actually reads from the recorded location.${sourceHint}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
2768
+ );
2769
+ } else if (verdict.outcome === 'waived-bot' || verdict.outcome === 'waived-infra') {
2770
+ // `liveVerified` is driven by whether the BASELINE produced real data, NOT by
2771
+ // whether every param test passed. Only stamp liveVerified=false when the
2772
+ // baseline ALSO failed — if a backend returned real data this run the tool IS
2773
+ // live-verified; only its per-parameter tests waive (non-blocking).
2774
+ liveVerification = verdict.baselineLiveVerified
2775
+ ? undefined
2776
+ : {
2777
+ kind: verdict.outcome,
2778
+ firstError: verdict.firstError,
2779
+ exhaustedBackends: verdict.exhaustedBackends,
2780
+ };
2781
+ const liveNote = verdict.baselineLiveVerified
2782
+ ? 'The baseline returned real data this run, so liveVerified stays TRUE — only the per-parameter tests are waived.'
2783
+ : 'Stamping liveVerified=false on workflow.json — the runtime falls through to the cdp-replay / playbook rung. Audit and teach surface this tool as unverified.';
2784
+ warnings.push(
2785
+ verdict.outcome === 'waived-bot'
2786
+ ? `integration test hit a likely bot-detection / anti-automation challenge. ${liveNote}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`
2787
+ : `integration test hit an infrastructure error (${verdict.firstError}); rungs exhausted: ${verdict.exhaustedBackends.join(', ') || 'unknown'}. ${liveNote}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
2788
+ );
2789
+ } else {
2790
+ failures.push(
2791
+ `bun test integration.test.ts exited ${run.exitCode} — the workflow failed to produce live data (tried 3 times).\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
2792
+ );
2793
+ }
2794
+ }
2795
+
2796
+ // Per-parameter coverage (Fix C/D). Each exposed parameter must have a
2797
+ // `param:<name>` integration test that actually RAN GREEN against live data —
2798
+ // a static source scan is not enough, because a waived suite never exercised
2799
+ // the param (R2: a filter wired to a field the server ignores looks "covered"
2800
+ // by source but does nothing). Per the keep+mark policy we never drop a param;
2801
+ // each is recorded in `paramVerification` as verified or not (with a reason),
2802
+ // and only a genuinely-uncovered param on a suite that DID run blocks compile.
2803
+ if (
2804
+ !referencedStateBroken &&
2805
+ existsSync(integrationTestPath) &&
2806
+ opts.likelyParams &&
2807
+ opts.likelyParams.length > 0
2808
+ ) {
2809
+ const integrationSrc = readFileSync(integrationTestPath, 'utf8');
2810
+
2811
+ // Producer-sourced token params: union of build-plan-declared contracts and
2812
+ // mechanical detection (the recorded value appears in a SIBLING tool's
2813
+ // recorded response). Declared entries win — they carry the producer tool +
2814
+ // field used for stamping `sourcedFrom` and the MCP description.
2815
+ const recordedParamValues = new Map<string, string>();
2816
+ try {
2817
+ const wf = JSON.parse(readFileSync(workflowPath, 'utf8')) as {
2818
+ parameters?: Array<{ name: string; default?: unknown }>;
1314
2819
  };
1315
- if (lastOutput.exitCode === 0) {
1316
- integrationPassed = true;
1317
- break;
2820
+ for (const p of wf.parameters ?? []) {
2821
+ if (typeof p.default === 'string') recordedParamValues.set(p.name, p.default);
1318
2822
  }
2823
+ } catch {
2824
+ // best-effort — defaults are only a detection hint
2825
+ }
2826
+ const candidateSet = new Set(opts.candidateRequestSeqs ?? []);
2827
+ const siblingResponses = session.requests
2828
+ .filter((r) => !candidateSet.has(r.seq) && r.response?.body)
2829
+ .map((r) => ({ body: r.response?.body ?? '' }));
2830
+ const detected = detectTokenSources({
2831
+ likelyParams: opts.likelyParams,
2832
+ recordedParamValues,
2833
+ siblingResponses,
2834
+ });
2835
+ const tokenByName = new Map<string, TokenSource>();
2836
+ for (const d of detected) tokenByName.set(d.param, d);
2837
+ for (const d of opts.tokenParams ?? []) {
2838
+ tokenByName.set(d.param, {
2839
+ param: d.param,
2840
+ sourceTool: d.sourceTool,
2841
+ sourceField: d.sourceField,
2842
+ });
1319
2843
  }
1320
- if (!integrationPassed) {
1321
- const combined = `${lastOutput.stdout}\n${lastOutput.stderr}`;
1322
- const botSignatures = /PerimeterX|DataDome|Akamai|captcha|challenge|blocked|rate.?limit/i;
1323
- const hasStatusBlock = /\b(403|429)\b/.test(combined);
1324
- if (hasStatusBlock && botSignatures.test(combined)) {
2844
+
2845
+ // Missing-producer guard: if a declared producer did not compile, the chain
2846
+ // cannot be exercised — waive (verified:false, keep+mark) rather than block
2847
+ // the consumer on something out of its control.
2848
+ const tokenSources: TokenSource[] = [];
2849
+ const waivedChain: ParamVerification[] = [];
2850
+ for (const ts of tokenByName.values()) {
2851
+ if (ts.sourceTool && !existsSync(pathJoin(toolDir, '..', ts.sourceTool, 'workflow.json'))) {
2852
+ waivedChain.push({
2853
+ name: ts.param,
2854
+ verified: false,
2855
+ reason: 'waived-chain',
2856
+ sourcedFrom: sourcedFromOf(ts),
2857
+ });
1325
2858
  warnings.push(
1326
- `integration test failed with likely bot-detection or rate-limiting (tried 3 times) — treating as non-blocking since parser verification passed.\nstdout:\n${lastOutput.stdout}\nstderr:\n${lastOutput.stderr}`,
2859
+ `producer tool "${ts.sourceTool}" for token param "${ts.param}" is unavailable (did not compile) — the producer→consumer chain is left unverified (waived-chain).`,
1327
2860
  );
1328
2861
  } else {
1329
- failures.push(
1330
- `bun test integration.test.ts exited ${lastOutput.exitCode} — the workflow failed to produce live data (tried 3 times).\nstdout:\n${lastOutput.stdout}\nstderr:\n${lastOutput.stderr}`,
1331
- );
2862
+ tokenSources.push(ts);
1332
2863
  }
1333
2864
  }
2865
+
2866
+ const waivedNames = new Set(waivedChain.map((w) => w.name));
2867
+ const coverage = classifyParamCoverage({
2868
+ likelyParams: opts.likelyParams.filter((lp) => !waivedNames.has(lp.name)),
2869
+ integrationSrc,
2870
+ passedTests: integrationPassedTests,
2871
+ integrationOutcome,
2872
+ tokenSources,
2873
+ });
2874
+ paramVerification.push(...coverage.paramVerification, ...waivedChain);
2875
+ if (coverage.tautological.length > 0) {
2876
+ failures.push(
2877
+ `${coverage.tautological.length} parameter(s) have a passing \`param:<name>\` test that never calls runWorkflowWithLadder, so it does not exercise the live workflow: ${coverage.tautological.join(', ')}. Each per-parameter test must call the workflow with the override value and assert the response is constrained by it.`,
2878
+ );
2879
+ }
2880
+ if (coverage.uncovered.length > 0) {
2881
+ failures.push(
2882
+ `${coverage.uncovered.length} parameter(s) have no passing \`param:<name>\` integration test and no \`// exposed-but-not-verified\` annotation: ${coverage.uncovered.join(', ')}. Add a test titled \`param:<name> …\` that overrides the value, calls runWorkflowWithLadder, and asserts the response is constrained — or annotate the parameter as explicitly unverified. See prompts/compile-agent.md "Per-parameter coverage tests".`,
2883
+ );
2884
+ }
2885
+ if (coverage.unchained.length > 0) {
2886
+ const details = coverage.unchained
2887
+ .map((name) => {
2888
+ const ts = tokenSources.find((t) => t.param === name);
2889
+ return ts?.sourceTool && ts.sourceField
2890
+ ? `\`${name}\` (mint from \`../${ts.sourceTool}/workflow.json\` → read field \`${ts.sourceField}\`)`
2891
+ : `\`${name}\``;
2892
+ })
2893
+ .join(', ');
2894
+ failures.push(
2895
+ `${coverage.unchained.length} producer-sourced token param(s) lack a CHAINED \`param:<name>\` test that mints a FRESH value from the producer tool: ${details}. Each test must call runWorkflowWithLadder on the named producer's \`workflow.json\`, read the named field from its result, and pass THAT value (not the recorded constant) into this tool — then assert the response is non-empty. If the producer only emits a bare fragment, fix the PRODUCER to emit the full value this tool consumes. See prompts/compile-agent.md "Producer-sourced token parameters".`,
2896
+ );
2897
+ }
2898
+ }
2899
+
2900
+ // Producer-side token contract: the build plan requires this tool to emit
2901
+ // certain fields for sibling consumers. Fail if the parser doesn't reference a
2902
+ // declared field by name — otherwise the producer/consumer field name silently
2903
+ // diverges (plan says `hotel_id`, parser emits `propertyToken`) and the
2904
+ // consumer's chained test can never extract it.
2905
+ if ((opts.emittedTokens?.length ?? 0) > 0 && existsSync(parserPath)) {
2906
+ const parserSrc = readFileSync(parserPath, 'utf8');
2907
+ const missing = (opts.emittedTokens ?? [])
2908
+ .map((e) => e.field)
2909
+ .filter(
2910
+ (field) =>
2911
+ !new RegExp(`\\b${field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`).test(parserSrc),
2912
+ );
2913
+ if (missing.length > 0) {
2914
+ failures.push(
2915
+ `the build plan requires this tool's parser to emit ${missing
2916
+ .map((f) => `\`${f}\``)
2917
+ .join(', ')} so sibling consumer tools can use ${
2918
+ missing.length === 1 ? 'it' : 'them'
2919
+ } as an input token, but parser.ts does not emit ${
2920
+ missing.length === 1 ? 'that field' : 'those fields'
2921
+ }. Emit ${
2922
+ missing.length === 1 ? 'it' : 'each'
2923
+ } in every result item under the EXACT field name (the full value a consumer needs, never a bare fragment) — see read_build_plan "emitsTokens".`,
2924
+ );
2925
+ }
1334
2926
  }
1335
2927
 
1336
2928
  if (existsSync(parserPath) || existsSync(parserTestPath)) {
1337
- const output = await runGeneratedArtifactTypecheck(toolDir);
2929
+ const output = await typecheckArtifacts(toolDir, ['parser.ts', 'request-transform.ts']);
1338
2930
  if (output.exitCode !== 0 || output.timedOut) {
1339
2931
  failures.push(
1340
2932
  `generated TypeScript artifacts failed typecheck (bunx tsc --noEmit -p .imprint-typecheck.tsconfig.json) exited ${output.exitCode}${output.timedOut ? ' after timing out' : ''}\nstdout:\n${output.stdout}\nstderr:\n${output.stderr}`,
@@ -1385,5 +2977,5 @@ export async function externalVerification(
1385
2977
  }
1386
2978
  }
1387
2979
 
1388
- return { failures, warnings };
2980
+ return { failures, warnings, paramVerification, liveVerification };
1389
2981
  }