imprint-mcp 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +132 -28
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +111 -4
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +65 -27
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +14 -2
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/credential-extract.ts +174 -25
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/emit.ts +85 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/sensitive-keys.ts +141 -7
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +17 -0
- package/src/imprint/teach.ts +582 -147
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
|
@@ -8,15 +8,34 @@
|
|
|
8
8
|
|
|
9
9
|
import { spawn } from 'node:child_process';
|
|
10
10
|
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
|
|
11
|
-
import { dirname, join as pathJoin, relative as pathRelative } from 'node:path';
|
|
11
|
+
import { basename, dirname, join as pathJoin, relative as pathRelative } from 'node:path';
|
|
12
12
|
import type { AgentTool } from './agent.ts';
|
|
13
13
|
import { inferAppApiHosts } from './app-api-hosts.ts';
|
|
14
|
+
import {
|
|
15
|
+
type AssignedSharedModule,
|
|
16
|
+
type SharedModuleManifestEntry,
|
|
17
|
+
planSliceForTool,
|
|
18
|
+
readBuildPlanFile,
|
|
19
|
+
resolveAssignedModules,
|
|
20
|
+
} from './build-plan.ts';
|
|
14
21
|
import { splitSetCookieHeader } from './cookie-jar.ts';
|
|
15
22
|
import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
|
|
23
|
+
import {
|
|
24
|
+
endpointsForSeqs,
|
|
25
|
+
groundEvent,
|
|
26
|
+
groundingForEvents,
|
|
27
|
+
inputProvenance,
|
|
28
|
+
} from './param-grounding.ts';
|
|
16
29
|
import { compactRequestContexts, requestContextDigest } from './request-context.ts';
|
|
17
30
|
import type { ClassifiedValue } from './session-diff.ts';
|
|
18
31
|
import type { SharedCompileContext, ToolCandidate } from './tool-candidates.ts';
|
|
19
|
-
import {
|
|
32
|
+
import {
|
|
33
|
+
type BootstrapCapture,
|
|
34
|
+
type CapturedRequest,
|
|
35
|
+
type RequestCapture,
|
|
36
|
+
type Session,
|
|
37
|
+
WorkflowSchema,
|
|
38
|
+
} from './types.ts';
|
|
20
39
|
|
|
21
40
|
const REPO_ROOT = pathJoin(import.meta.dir, '..', '..');
|
|
22
41
|
|
|
@@ -36,9 +55,10 @@ export function buildCompileTools(
|
|
|
36
55
|
const credEnv = context.teachCredentials
|
|
37
56
|
? { IMPRINT_TEACH_CREDENTIALS: JSON.stringify(context.teachCredentials) }
|
|
38
57
|
: undefined;
|
|
39
|
-
|
|
58
|
+
const tools = [
|
|
40
59
|
buildReadSessionSummaryTool(session, context),
|
|
41
60
|
buildReadRequestTool(session),
|
|
61
|
+
buildDiffRequestForEventTool(session, context),
|
|
42
62
|
buildReadResponseBodyTool(session),
|
|
43
63
|
buildSearchResponseBodyTool(session),
|
|
44
64
|
buildWriteFileTool(toolDir),
|
|
@@ -46,6 +66,16 @@ export function buildCompileTools(
|
|
|
46
66
|
buildRunBashTool(toolDir, credEnv),
|
|
47
67
|
buildRunTestsTool(toolDir, sessionPath, credEnv),
|
|
48
68
|
];
|
|
69
|
+
if (context.buildPlanPath && context.candidate?.toolName) {
|
|
70
|
+
tools.push(
|
|
71
|
+
buildReadBuildPlanTool(
|
|
72
|
+
context.buildPlanPath,
|
|
73
|
+
context.candidate.toolName,
|
|
74
|
+
context.sharedModules,
|
|
75
|
+
),
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
return tools;
|
|
49
79
|
}
|
|
50
80
|
|
|
51
81
|
interface CompileToolContext {
|
|
@@ -53,6 +83,76 @@ interface CompileToolContext {
|
|
|
53
83
|
sharedContext?: SharedCompileContext;
|
|
54
84
|
classifications?: ClassifiedValue[];
|
|
55
85
|
teachCredentials?: { site: string; values: Record<string, string> };
|
|
86
|
+
/** Absolute path to the multi-tool build plan sidecar (.build-plan.json). When
|
|
87
|
+
* set, a read_build_plan tool is exposed and the verifier asserts the tool
|
|
88
|
+
* imports the shared modules the plan assigned it. */
|
|
89
|
+
buildPlanPath?: string;
|
|
90
|
+
/** Shared-module build manifest (verified flags) for this site. */
|
|
91
|
+
sharedModules?: SharedModuleManifestEntry[];
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// ─── Tool: read_build_plan ───────────────────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
function buildReadBuildPlanTool(
|
|
97
|
+
buildPlanPath: string,
|
|
98
|
+
toolName: string,
|
|
99
|
+
manifest?: SharedModuleManifestEntry[],
|
|
100
|
+
): AgentTool {
|
|
101
|
+
return {
|
|
102
|
+
name: 'read_build_plan',
|
|
103
|
+
description:
|
|
104
|
+
"Read this tool's slice of the shared build plan: shared modules to import (instead of re-implementing), parser guidance, the parameter checklist, the auth recipe to replicate inline, and the opaque-token contract (fields this tool must EMIT for siblings, and params it CONSUMES from siblings).",
|
|
105
|
+
input_schema: { type: 'object', properties: {}, required: [] },
|
|
106
|
+
handler: async () => {
|
|
107
|
+
const plan = readBuildPlanFile(buildPlanPath);
|
|
108
|
+
if (!plan) return { result: 'No build plan available for this run.' };
|
|
109
|
+
const slice = planSliceForTool(plan, toolName);
|
|
110
|
+
if (!slice) return { result: `No build-plan slice for tool "${toolName}".` };
|
|
111
|
+
const assigned = resolveAssignedModules(plan, toolName, manifest).filter((m) => m.verified);
|
|
112
|
+
const emitsTokens = slice.tool.emitsTokens ?? [];
|
|
113
|
+
const tokenParams = slice.tool.tokenParams ?? [];
|
|
114
|
+
const tokenNotes: string[] = [];
|
|
115
|
+
if (emitsTokens.length > 0) {
|
|
116
|
+
tokenNotes.push(
|
|
117
|
+
`PRODUCER CONTRACT: your parser MUST emit ${emitsTokens
|
|
118
|
+
.map((e) => `\`${e.field}\``)
|
|
119
|
+
.join(
|
|
120
|
+
', ',
|
|
121
|
+
)} in each result item, in the exact shape described (the FULL value a sibling consumer needs — never a bare fragment). Sibling tools mint their input from these fields; the verifier fails this tool if a declared field is missing from the parser output.`,
|
|
122
|
+
);
|
|
123
|
+
}
|
|
124
|
+
for (const tp of tokenParams) {
|
|
125
|
+
tokenNotes.push(
|
|
126
|
+
`CONSUMER CONTRACT: param \`${tp.param}\` is an opaque token minted by the \`${tp.sourceTool}\` tool's \`${tp.sourceField}\` output. Write a CHAINED \`param:${tp.param}\` integration test that calls \`runWorkflowWithLadder\` on \`../${tp.sourceTool}/workflow.json\`, reads \`${tp.sourceField}\` from its result, and passes THAT fresh value (not the recorded constant) into this tool — then asserts the response is non-empty. On producer bot/infra error, rethrow so the suite waives.`,
|
|
127
|
+
);
|
|
128
|
+
}
|
|
129
|
+
return {
|
|
130
|
+
result: JSON.stringify(
|
|
131
|
+
{
|
|
132
|
+
toolName,
|
|
133
|
+
sharedModulesToImport: assigned.map((m) => ({
|
|
134
|
+
importPath: m.importPath,
|
|
135
|
+
kind: m.kind,
|
|
136
|
+
purpose: m.purpose,
|
|
137
|
+
exportSignatures: m.exportSignatures,
|
|
138
|
+
})),
|
|
139
|
+
parserGuidance: slice.tool.parserGuidance,
|
|
140
|
+
paramChecklist: slice.tool.paramChecklist,
|
|
141
|
+
authRecipe: slice.tool.authRecipe,
|
|
142
|
+
emitsTokens,
|
|
143
|
+
tokenParams,
|
|
144
|
+
note:
|
|
145
|
+
assigned.length > 0
|
|
146
|
+
? 'Import the listed shared modules via their importPath (request-transform → set workflow.json "requestTransformModule"; parser-helper/types → import from parser.ts) instead of re-implementing their logic. The verifier fails this tool if an assigned module is not imported.'
|
|
147
|
+
: 'No shared modules assigned — build this tool self-contained.',
|
|
148
|
+
tokenContract: tokenNotes.length > 0 ? tokenNotes : undefined,
|
|
149
|
+
},
|
|
150
|
+
null,
|
|
151
|
+
2,
|
|
152
|
+
),
|
|
153
|
+
};
|
|
154
|
+
},
|
|
155
|
+
};
|
|
56
156
|
}
|
|
57
157
|
|
|
58
158
|
// ─── Tool: read_session_summary ──────────────────────────────────────────────
|
|
@@ -61,7 +161,7 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
|
|
|
61
161
|
return {
|
|
62
162
|
name: 'read_session_summary',
|
|
63
163
|
description:
|
|
64
|
-
'Get a high-level summary of the session including narration, selected candidate scope, load-bearing requests with inline data, and
|
|
164
|
+
'Get a high-level summary of the session including narration, selected candidate scope, load-bearing requests with inline data, capture hints, and parameter-grounding hints (for each recorded UI toggle, the exact request positions that changed — use these to ground each likelyParam instead of eyeballing one request).',
|
|
65
165
|
input_schema: {
|
|
66
166
|
type: 'object',
|
|
67
167
|
properties: {},
|
|
@@ -78,6 +178,48 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
|
|
|
78
178
|
...(context.sharedContext?.loginRequestSeqs ?? []),
|
|
79
179
|
]);
|
|
80
180
|
const preserveSeqs = new Set([...selectedRequestSeqs, ...dependencySeqs]);
|
|
181
|
+
|
|
182
|
+
// Event-correlated differential grounding hints: for each UI event the
|
|
183
|
+
// candidate detector flagged, diff the request it triggered against the
|
|
184
|
+
// prior equivalent request and report what changed. This is where a
|
|
185
|
+
// filter/sort/option param's encoding actually lives — the agent maps
|
|
186
|
+
// each diff to its likelyParam instead of eyeballing one request and
|
|
187
|
+
// giving up (which previously shipped groundable params verified:false).
|
|
188
|
+
const paramGroundingHints =
|
|
189
|
+
(context.candidate?.eventSeqs?.length ?? 0)
|
|
190
|
+
? groundingForEvents(
|
|
191
|
+
session,
|
|
192
|
+
context.candidate?.eventSeqs ?? [],
|
|
193
|
+
endpointsForSeqs(session, [...preserveSeqs]),
|
|
194
|
+
).map((g) => ({
|
|
195
|
+
event: g.label || `event seq ${g.eventSeq}`,
|
|
196
|
+
eventSeq: g.eventSeq,
|
|
197
|
+
changedRequestSeq: g.triggeredSeq,
|
|
198
|
+
vsRequestSeq: g.priorSeq,
|
|
199
|
+
changes: g.changes.map((c) => `${c.path}: ${c.before} -> ${c.after}`),
|
|
200
|
+
}))
|
|
201
|
+
: [];
|
|
202
|
+
|
|
203
|
+
// Input-value provenance: positions in a load-bearing request whose value
|
|
204
|
+
// is an opaque id minted by an earlier response (not the user's text). The
|
|
205
|
+
// agent must CHAIN+CAPTURE these, not freeze them or substitute raw param
|
|
206
|
+
// text — substituting raw text where a resolved id belongs makes the
|
|
207
|
+
// backend ignore the input and fall back to a default scope.
|
|
208
|
+
// Scan the candidate's full seq set (capped), not just the representative
|
|
209
|
+
// one: the representative may be a first text-only request whose response
|
|
210
|
+
// mints the id, with the id only appearing in a later sibling request.
|
|
211
|
+
const provenanceSeqs = [...new Set([...selectedRequestSeqs, ...allCandidateSeqs])]
|
|
212
|
+
.sort((a, b) => a - b)
|
|
213
|
+
.slice(0, 30);
|
|
214
|
+
const inputProvenanceHints = inputProvenance(session, provenanceSeqs).map((p) => ({
|
|
215
|
+
path: p.path,
|
|
216
|
+
example: p.valueSample,
|
|
217
|
+
inRequestSeq: p.requestSeq,
|
|
218
|
+
mintedByResponseSeq: p.sourceSeq,
|
|
219
|
+
mintedByEndpoint: p.sourceEndpoint,
|
|
220
|
+
selfChain: p.selfChain,
|
|
221
|
+
}));
|
|
222
|
+
|
|
81
223
|
const summaryRequests = identifySummaryRequests(session, preserveSeqs);
|
|
82
224
|
const loadBearingRequests = compactRequestContexts(
|
|
83
225
|
summaryRequests.map((r) => ({
|
|
@@ -124,6 +266,8 @@ function buildReadSessionSummaryTool(session: Session, context: CompileToolConte
|
|
|
124
266
|
requestCount: session.requests.length,
|
|
125
267
|
stateHints,
|
|
126
268
|
captureHints: captureHints.length > 0 ? captureHints : undefined,
|
|
269
|
+
paramGroundingHints: paramGroundingHints.length > 0 ? paramGroundingHints : undefined,
|
|
270
|
+
inputProvenanceHints: inputProvenanceHints.length > 0 ? inputProvenanceHints : undefined,
|
|
127
271
|
loadBearingRequests,
|
|
128
272
|
};
|
|
129
273
|
|
|
@@ -673,6 +817,56 @@ function buildReadRequestTool(session: Session): AgentTool {
|
|
|
673
817
|
};
|
|
674
818
|
}
|
|
675
819
|
|
|
820
|
+
// ─── Tool: diff_request_for_event ────────────────────────────────────────────
|
|
821
|
+
|
|
822
|
+
function buildDiffRequestForEventTool(session: Session, context: CompileToolContext): AgentTool {
|
|
823
|
+
return {
|
|
824
|
+
name: 'diff_request_for_event',
|
|
825
|
+
description:
|
|
826
|
+
"For a recorded UI event seq (a filter/sort/option toggle from selectedCandidate.eventSeqs), return the request it triggered diffed against the prior equivalent request. The changed positions are exactly where that interaction's parameter is encoded — use this to ground a param's encoding when paramGroundingHints does not already cover it. Returns the changed JSON paths (path: before -> after).",
|
|
827
|
+
input_schema: {
|
|
828
|
+
type: 'object',
|
|
829
|
+
properties: {
|
|
830
|
+
eventSeq: {
|
|
831
|
+
type: 'number',
|
|
832
|
+
description: 'Event sequence number (from selectedCandidate.eventSeqs)',
|
|
833
|
+
},
|
|
834
|
+
},
|
|
835
|
+
required: ['eventSeq'],
|
|
836
|
+
},
|
|
837
|
+
handler: async (input: unknown) => {
|
|
838
|
+
const { eventSeq } = input as { eventSeq: number };
|
|
839
|
+
const reqSeqs = [
|
|
840
|
+
...((context.candidate?.representativeSeqs?.length ?? 0) > 0
|
|
841
|
+
? (context.candidate?.representativeSeqs ?? [])
|
|
842
|
+
: (context.candidate?.requestSeqs ?? [])),
|
|
843
|
+
...(context.candidate?.dependencySeqs ?? []),
|
|
844
|
+
];
|
|
845
|
+
const endpoints = endpointsForSeqs(session, reqSeqs);
|
|
846
|
+
const g = groundEvent(session, eventSeq, endpoints.size > 0 ? endpoints : undefined);
|
|
847
|
+
if (!g.triggeredSeq) {
|
|
848
|
+
return {
|
|
849
|
+
result: `Event ${eventSeq} triggered no comparable request within the window — it may be a client-side-only interaction (no server param), or its request was telemetry. If a filter/sort visibly changed results with no new request, it is applied client-side and cannot be reproduced via request replay.`,
|
|
850
|
+
};
|
|
851
|
+
}
|
|
852
|
+
return {
|
|
853
|
+
result: JSON.stringify(
|
|
854
|
+
{
|
|
855
|
+
event: g.label,
|
|
856
|
+
eventSeq: g.eventSeq,
|
|
857
|
+
changedRequestSeq: g.triggeredSeq,
|
|
858
|
+
vsRequestSeq: g.priorSeq,
|
|
859
|
+
endpoint: g.endpoint,
|
|
860
|
+
changes: g.changes.map((c) => `${c.path}: ${c.before} -> ${c.after}`),
|
|
861
|
+
},
|
|
862
|
+
null,
|
|
863
|
+
2,
|
|
864
|
+
),
|
|
865
|
+
};
|
|
866
|
+
},
|
|
867
|
+
};
|
|
868
|
+
}
|
|
869
|
+
|
|
676
870
|
// ─── Tool: read_response_body ────────────────────────────────────────────────
|
|
677
871
|
|
|
678
872
|
function buildReadResponseBodyTool(session: Session): AgentTool {
|
|
@@ -927,7 +1121,7 @@ function buildRunBashTool(toolDir: string, credEnv?: Record<string, string>): Ag
|
|
|
927
1121
|
required: ['command'],
|
|
928
1122
|
},
|
|
929
1123
|
handler: async (input: unknown) => {
|
|
930
|
-
const { command, timeoutSec =
|
|
1124
|
+
const { command, timeoutSec = 120 } = input as { command: string; timeoutSec?: number };
|
|
931
1125
|
|
|
932
1126
|
if (command.match(/rm\s+-rf\s+\//) || command.includes('sudo')) {
|
|
933
1127
|
return {
|
|
@@ -943,16 +1137,19 @@ function buildRunBashTool(toolDir: string, credEnv?: Record<string, string>): Ag
|
|
|
943
1137
|
};
|
|
944
1138
|
}
|
|
945
1139
|
|
|
946
|
-
async function runCommand(
|
|
1140
|
+
export async function runCommand(
|
|
947
1141
|
command: string,
|
|
948
1142
|
cwd: string,
|
|
949
1143
|
timeoutMs: number,
|
|
950
1144
|
extraEnv?: Record<string, string>,
|
|
951
1145
|
): Promise<{ result: string; isError?: boolean }> {
|
|
952
1146
|
return new Promise((resolve) => {
|
|
1147
|
+
// `detached: true` makes the child its own process-group leader so a timeout
|
|
1148
|
+
// can SIGKILL the WHOLE tree (sh → bun → Chrome), not just `sh`.
|
|
953
1149
|
const proc = spawn('sh', ['-c', command], {
|
|
954
1150
|
cwd,
|
|
955
1151
|
env: extraEnv ? { ...process.env, ...extraEnv } : process.env,
|
|
1152
|
+
detached: true,
|
|
956
1153
|
});
|
|
957
1154
|
|
|
958
1155
|
let stdout = '';
|
|
@@ -971,12 +1168,42 @@ async function runCommand(
|
|
|
971
1168
|
|
|
972
1169
|
const timeout = setTimeout(() => {
|
|
973
1170
|
timedOut = true;
|
|
974
|
-
|
|
1171
|
+
// Kill the whole process GROUP, not just `sh`. A hung `bun run probe.ts`
|
|
1172
|
+
// spawns bun + Chrome children that survive a bare proc.kill() (SIGTERM to
|
|
1173
|
+
// sh only); they keep the stdout pipe open so 'close' never fires, hanging
|
|
1174
|
+
// this call until the outer MCP tool timeout (30m) — exactly what ate a
|
|
1175
|
+
// tool's compile budget. SIGKILL the group so the timeout reaps bun + any
|
|
1176
|
+
// leaked browser and 'close' fires promptly.
|
|
1177
|
+
try {
|
|
1178
|
+
if (proc.pid) process.kill(-proc.pid, 'SIGKILL');
|
|
1179
|
+
else proc.kill('SIGKILL');
|
|
1180
|
+
} catch {
|
|
1181
|
+
proc.kill('SIGKILL');
|
|
1182
|
+
}
|
|
975
1183
|
}, timeoutMs);
|
|
976
1184
|
|
|
977
1185
|
proc.on('close', (exitCode) => {
|
|
978
1186
|
clearTimeout(timeout);
|
|
979
1187
|
|
|
1188
|
+
// Reap the whole process GROUP on EVERY exit, not just on timeout. The
|
|
1189
|
+
// compile verifier runs `bun test`, whose runner calls process.exit() the
|
|
1190
|
+
// instant the suite passes — and bun does NOT run process 'exit' /
|
|
1191
|
+
// 'beforeExit' handlers (only afterAll), so the compile cdp pool's
|
|
1192
|
+
// idle-close timer never fires and its launchChromium child is orphaned
|
|
1193
|
+
// (reparented to PID 1), accumulating across a multi-tool/multi-site teach
|
|
1194
|
+
// until the box OOMs. That child is still in THIS process group, though:
|
|
1195
|
+
// the group's id (= proc.pid) outlives the dead `sh` leader, so SIGKILLing
|
|
1196
|
+
// the group here reaps the orphaned Chrome regardless of how `bun test`
|
|
1197
|
+
// chose to exit. Harmless when the group is already empty (ESRCH). Skipped
|
|
1198
|
+
// on timeout (the group was already SIGKILLed above).
|
|
1199
|
+
if (!timedOut && proc.pid) {
|
|
1200
|
+
try {
|
|
1201
|
+
process.kill(-proc.pid, 'SIGKILL');
|
|
1202
|
+
} catch {
|
|
1203
|
+
// group already empty — nothing left to reap
|
|
1204
|
+
}
|
|
1205
|
+
}
|
|
1206
|
+
|
|
980
1207
|
if (stdout.length > TRUNCATE_LIMIT) {
|
|
981
1208
|
stdout = `${stdout.slice(0, TRUNCATE_LIMIT)}\n[…truncated…]`;
|
|
982
1209
|
}
|
|
@@ -997,19 +1224,25 @@ async function runCommand(
|
|
|
997
1224
|
});
|
|
998
1225
|
}
|
|
999
1226
|
|
|
1000
|
-
|
|
1001
|
-
|
|
1227
|
+
/** Typecheck a set of generated `.ts` artifacts in `dir` against the repo's
|
|
1228
|
+
* tsconfig (so `imprint/*` and bun globals resolve). Used by both the compile
|
|
1229
|
+
* verifier (parser.ts / request-transform.ts) and the prereq-module verifier
|
|
1230
|
+
* (`_shared/*.ts`). `*.test.ts` are excluded — they pull in bun:test globals
|
|
1231
|
+
* the strict config rejects. Exported for prereq-builder.ts. */
|
|
1232
|
+
export async function typecheckArtifacts(
|
|
1233
|
+
dir: string,
|
|
1234
|
+
includes: string[],
|
|
1002
1235
|
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> {
|
|
1003
|
-
const configPath = pathJoin(
|
|
1236
|
+
const configPath = pathJoin(dir, '.imprint-typecheck.tsconfig.json');
|
|
1004
1237
|
const rootTsconfig = pathJoin(REPO_ROOT, 'tsconfig.json');
|
|
1005
|
-
const extendsPath = normalizeTsconfigPath(pathRelative(
|
|
1238
|
+
const extendsPath = normalizeTsconfigPath(pathRelative(dir, rootTsconfig));
|
|
1006
1239
|
|
|
1007
1240
|
writeFileSync(
|
|
1008
1241
|
configPath,
|
|
1009
1242
|
JSON.stringify(
|
|
1010
1243
|
{
|
|
1011
1244
|
extends: extendsPath,
|
|
1012
|
-
include:
|
|
1245
|
+
include: includes,
|
|
1013
1246
|
exclude: ['*.test.ts'],
|
|
1014
1247
|
},
|
|
1015
1248
|
null,
|
|
@@ -1021,7 +1254,7 @@ async function runGeneratedArtifactTypecheck(
|
|
|
1021
1254
|
try {
|
|
1022
1255
|
const result = await runCommand(
|
|
1023
1256
|
'bunx tsc --noEmit -p .imprint-typecheck.tsconfig.json',
|
|
1024
|
-
|
|
1257
|
+
dir,
|
|
1025
1258
|
120000,
|
|
1026
1259
|
);
|
|
1027
1260
|
return JSON.parse(result.result) as {
|
|
@@ -1103,8 +1336,1068 @@ function buildRunTestsTool(
|
|
|
1103
1336
|
};
|
|
1104
1337
|
}
|
|
1105
1338
|
|
|
1339
|
+
// ─── Test-quality helpers (shared with prereq-builder verification) ─────────
|
|
1340
|
+
|
|
1341
|
+
/** Tautological assertions that prove nothing — rejected by every verifier so
|
|
1342
|
+
* an agent can't game the ≥3-expect gate with `expect(true).toBe(true)`. */
|
|
1343
|
+
const TRIVIAL_ASSERTION_PATTERNS: RegExp[] = [
|
|
1344
|
+
/expect\s*\(\s*true\s*\)\.toBe\s*\(\s*true\s*\)/,
|
|
1345
|
+
/expect\s*\(\s*false\s*\)\.toBe\s*\(\s*false\s*\)/,
|
|
1346
|
+
/expect\s*\(\s*1\s*\)\.toBe\s*\(\s*1\s*\)/,
|
|
1347
|
+
/expect\s*\(\s*0\s*\)\.toBe\s*\(\s*0\s*\)/,
|
|
1348
|
+
/expect\s*\(\s*null\s*\)\.toBeNull/,
|
|
1349
|
+
/expect\s*\(\s*undefined\s*\)\.toBeUndefined/,
|
|
1350
|
+
/expect\s*\(\s*"[^"]*"\s*\)\.toBe\s*\(\s*"[^"]*"\s*\)/,
|
|
1351
|
+
/expect\s*\(\s*'[^']*'\s*\)\.toBe\s*\(\s*'[^']*'\s*\)/,
|
|
1352
|
+
];
|
|
1353
|
+
|
|
1354
|
+
export function countExpectCalls(src: string): number {
|
|
1355
|
+
return (src.match(/expect\s*\(/g) ?? []).length;
|
|
1356
|
+
}
|
|
1357
|
+
|
|
1358
|
+
export function hasTrivialAssertion(src: string): boolean {
|
|
1359
|
+
return TRIVIAL_ASSERTION_PATTERNS.some((pattern) => pattern.test(src));
|
|
1360
|
+
}
|
|
1361
|
+
|
|
1362
|
+
/** Assert the tool imports each verified shared module the plan assigned it.
|
|
1363
|
+
* request-transform → workflow.json.requestTransformModule must point at it;
|
|
1364
|
+
* parser-helper/types → parser.ts (or request-transform.ts) must import it. */
|
|
1365
|
+
function assertSharedModuleImports(
|
|
1366
|
+
toolDir: string,
|
|
1367
|
+
workflowPath: string,
|
|
1368
|
+
assigned: AssignedSharedModule[],
|
|
1369
|
+
): string[] {
|
|
1370
|
+
const failures: string[] = [];
|
|
1371
|
+
const verified = assigned.filter((m) => m.verified);
|
|
1372
|
+
if (verified.length === 0) return failures;
|
|
1373
|
+
|
|
1374
|
+
let workflowRaw: { requestTransformModule?: unknown } = {};
|
|
1375
|
+
try {
|
|
1376
|
+
workflowRaw = JSON.parse(readFileSync(workflowPath, 'utf8'));
|
|
1377
|
+
} catch {
|
|
1378
|
+
return failures; // workflow parse already flagged elsewhere
|
|
1379
|
+
}
|
|
1380
|
+
const requestTransformModule =
|
|
1381
|
+
typeof workflowRaw.requestTransformModule === 'string'
|
|
1382
|
+
? workflowRaw.requestTransformModule
|
|
1383
|
+
: '';
|
|
1384
|
+
|
|
1385
|
+
let sourceBlob = '';
|
|
1386
|
+
for (const f of ['parser.ts', 'request-transform.ts']) {
|
|
1387
|
+
const p = pathJoin(toolDir, f);
|
|
1388
|
+
if (existsSync(p)) sourceBlob += `\n${readFileSync(p, 'utf8')}`;
|
|
1389
|
+
}
|
|
1390
|
+
|
|
1391
|
+
for (const m of verified) {
|
|
1392
|
+
if (m.kind === 'request-transform') {
|
|
1393
|
+
if (!requestTransformModule.includes(m.importPath) && !sourceBlob.includes(m.importPath)) {
|
|
1394
|
+
failures.push(
|
|
1395
|
+
`the build plan assigns shared module ${m.path} (request-transform) to this tool, but workflow.json does not set "requestTransformModule": "${m.importPath}" and no artifact imports it. Reuse it instead of re-implementing the logic — see read_build_plan.`,
|
|
1396
|
+
);
|
|
1397
|
+
}
|
|
1398
|
+
} else if (!sourceBlob.includes(m.importPath)) {
|
|
1399
|
+
failures.push(
|
|
1400
|
+
`the build plan assigns shared module ${m.path} (${m.kind}) to this tool, but no artifact imports "${m.importPath}". Import it from parser.ts (or request-transform.ts) instead of re-implementing it — see read_build_plan.`,
|
|
1401
|
+
);
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
return failures;
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1106
1407
|
// ─── External Verification ──────────────────────────────────────────────────
|
|
1107
1408
|
|
|
1409
|
+
/**
 * Classify the output of a failed integration test: was it stopped by
 * anti-automation / bot defense rather than by a genuine workflow defect?
 *
 * Compile-time integration tests only reach the fetch + fetch-bootstrap rungs,
 * while many sites gate their APIs behind challenges (CAPTCHA interstitials,
 * redirect-to-challenge pages, rate-based blocks) that only the runtime
 * ladder's stealth-fetch + playbook rungs get past. When the parser is already
 * verified against the recorded response, such a block is meant to be a
 * non-blocking warning instead of a hard failure.
 *
 * The matching is vendor-agnostic: it covers the common defense families
 * (Cloudflare, Akamai, DataDome, PerimeterX, hCaptcha/reCAPTCHA, generic
 * "unusual traffic" interstitials), blocking HTTP statuses (403/429/503) and
 * 30x redirects to a challenge/verify/captcha location.
 *
 * @param output Combined textual output of the failed test run.
 * @returns true when the output carries a bot-defense signature.
 */
export function isBotDefenseFailure(output: string): boolean {
  // Tier 1 — unambiguous challenge/interstitial signatures. These suffice on
  // their own, whatever the HTTP status, since no legitimate API success emits
  // them.
  const strongSignature =
    /unusual traffic|recaptcha|hcaptcha|h-captcha|are you (a )?(human|robot)|verify (you are|you'?re) (a )?human|px-captcha|datadome|perimeterx|cf[-_]chl|attention required|just a moment\s*(\.\.\.|…)?|enable javascript and cookies to continue/i;
  if (strongSignature.test(output)) {
    return true;
  }

  // Tier 2 — Akamai Bot Manager. `_abck` is the sensor cookie; a value ending
  // in `~-1~` marks the session as UNVALIDATED (bot-flagged), `~0~` as
  // validated. The cdp bootstrap logs `_abck status after interaction: ~-1~`
  // after it ran the human-like interaction and STILL failed to validate the
  // sensor. Akamai then serves a 200 "soft block" with empty/placeholder data
  // rather than a 403, so the live run yields no data although every backend
  // reports OK. Only the post-interaction confirmation is matched, so the
  // ordinary "cached jar not validated … — re-mint" log (which precedes a
  // retry that often succeeds) does not trigger the waiver.
  const akamaiUnvalidated = /_abck status after interaction:\s*~-1~/i;
  if (akamaiUnvalidated.test(output)) {
    return true;
  }

  // Tier 3 — weaker vocabulary only counts when corroborated by a blocking
  // status or a redirect to a challenge page, so that ordinary error text does
  // not get a free pass.
  const weakSignature =
    /captcha|challenge|access denied|forbidden|blocked|\bbot\b|rate.?limit|too many requests/i;
  if (!weakSignature.test(output)) {
    return false;
  }
  if (/\b(403|429|503)\b/.test(output)) {
    return true;
  }
  const sawRedirectStatus = /\b(30[1-8])\b/.test(output);
  return sawRedirectStatus && /captcha|challenge|verify|robot|denied|blocked|unusual/i.test(output);
}
|
|
1455
|
+
|
|
1456
|
+
/**
 * Decode XML character data: the five predefined entities (`&lt;`, `&gt;`,
 * `&quot;`, `&apos;`, `&amp;`) and numeric character references (`&#39;`,
 * `&#x27;`).
 *
 * Decoding happens in a single pass, so already-decoded output is never
 * re-scanned: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 * Unrecognised or out-of-range references are left untouched.
 */
function unescapeXml(s: string): string {
  const named: Record<string, string> = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&' };
  return s.replace(/&(lt|gt|quot|apos|amp|#\d+|#x[0-9a-fA-F]+);/g, (whole: string, ref: string) => {
    if (!ref.startsWith('#')) return named[ref] ?? whole;
    const code = ref[1] === 'x' ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF; keep the raw text instead.
    return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });
}

/**
 * Parse a JUnit XML report (from `bun test --reporter=junit`) into the sets of
 * passed and failed test *names*. The default bun reporter does not print
 * per-test names in non-TTY mode, so the JUnit report is the reliable way to
 * know which individual tests actually ran green.
 *
 * Classification of each `<testcase>`:
 *  - a `<failure>` / `<error>` child → failed;
 *  - a `<skipped>` child → neither set (a skipped test did not run, so it must
 *    not count as having run green);
 *  - otherwise (self-closed or no such child) → passed.
 *
 * Test cases without a non-empty `name` attribute are ignored.
 *
 * @param xml Raw report text; an empty string yields two empty sets.
 * @returns The decoded names of the passed and the failed test cases.
 */
export function parseJUnitResults(xml: string): { passed: Set<string>; failed: Set<string> } {
  const passed = new Set<string>();
  const failed = new Set<string>();
  if (!xml) return { passed, failed };
  // Group 1: attribute text; group 2: `/>` or the full body; group 3: child markup.
  const testcaseRe = /<testcase\b([^>]*?)(\/>|>([\s\S]*?)<\/testcase>)/g;
  for (const m of xml.matchAll(testcaseRe)) {
    const attrs = m[1] ?? '';
    // `\b` keeps `classname="…"` from being mistaken for `name="…"`.
    const nameMatch = attrs.match(/\bname="([^"]*)"/);
    if (!nameMatch?.[1]) continue;
    const name = unescapeXml(nameMatch[1]);
    const children = m[2] === '/>' ? '' : (m[3] ?? '');
    if (/<(failure|error)\b/.test(children)) {
      failed.add(name);
    } else if (!/<skipped\b/.test(children)) {
      passed.add(name);
    }
  }
  return { passed, failed };
}
|
|
1490
|
+
|
|
1491
|
+
/** Everything recovered from one `bun test <file>` invocation. */
interface BunTestRun {
  /** Captured standard output of the run. */
  stdout: string;
  /** Captured standard error of the run. */
  stderr: string;
  /** Exit code reported for the run. */
  exitCode: number;
  /**
   * Set when the wall-clock timeout killed the run instead of it exiting
   * cleanly. The classifier uses this to treat a truncated, paced anti-bot
   * suite as infra and never as a bot block — a fetch-403 in the partial
   * output must not be read as a block.
   */
  timedOut: boolean;
  /** Names of the tests the JUnit report lists as passed. */
  passed: Set<string>;
  /** Names of the tests the JUnit report lists as failed. */
  failed: Set<string>;
}
|
|
1503
|
+
|
|
1504
|
+
/**
 * Verification outcome for a single exposed parameter.
 *
 * `verified` is true only if a `param:<name>` integration test actually ran
 * green against live data.
 */
interface ParamVerification {
  /** Parameter name as declared by the workflow. */
  name: string;
  /** Whether the parameter was behaviorally verified at compile time. */
  verified: boolean;
  /**
   * Explains why an exposed parameter is unverified; left undefined when
   * `verified` is true.
   *  - `waived-bot` / `waived-infra`: the live suite was waived (anti-bot or
   *    infra), so the parameter's effect could not be confirmed at compile
   *    time; it is exercised at runtime via the stealth-fetch / playbook ladder.
   *  - `annotated`: the agent marked it `// exposed-but-not-verified`.
   *  - `waived-chain`: the parameter is a producer-sourced token, but the
   *    producer tool could not be run at compile time (anti-bot / not
   *    compiled), so the chain could not be verified.
   */
  reason?: 'waived-bot' | 'waived-infra' | 'annotated' | 'waived-chain';
  /**
   * Only for producer-sourced token parameters: the sibling tool and output
   * field that supply the value. It is stamped into workflow.json
   * (`param.sourcedFrom`) so the MCP description can tell the orchestrating
   * LLM where to mint the token, and so the audit harness chains
   * producer→consumer instead of fabricating one.
   */
  sourcedFrom?: { tool: string; field: string };
}
|
|
1524
|
+
|
|
1525
|
+
/**
 * Describes a parameter the gate knows to be an opaque token/id minted by a
 * sibling tool.
 *
 * When the build plan declared the contract, `sourceTool` / `sourceField` are
 * known. A mechanically detected source (the recorded value shows up in a
 * sibling tool's response) may carry nothing but the parameter name. In both
 * cases the parameter REQUIRES a chained `param:<name>` test that mints a
 * fresh value from the producer.
 */
interface TokenSource {
  /** Name of the token parameter. */
  param: string;
  /** Producing sibling tool, when known. */
  sourceTool?: string;
  /** Output field of the producer that carries the token, when known. */
  sourceField?: string;
}
|
|
1535
|
+
|
|
1536
|
+
/**
 * Execute one `bun test <file>` run and collect two things from it: the raw
 * output (needed for bot-defense / infra detection and for surfacing errors)
 * and the per-test pass/fail names, read from a JUnit report that bun writes
 * to a transient file inside the tool directory.
 *
 * @param testPath  Test file handed to `bun test`.
 * @param toolDir   Working directory of the run; the transient report lives here.
 * @param timeoutMs Timeout forwarded to `runCommand`.
 * @param env       Extra environment forwarded to `runCommand`.
 * @returns Output streams, exit code, timeout flag and the parsed name sets
 *          (both empty when no report could be read).
 */
async function runBunTestWithResults(
  testPath: string,
  toolDir: string,
  timeoutMs: number,
  env: Record<string, string> = {},
): Promise<BunTestRun> {
  const junitPath = pathJoin(toolDir, `.imprint-junit-${basename(testPath)}.xml`);

  // Best-effort removal of the transient report; errors are deliberately ignored.
  const discardReport = (): void => {
    try {
      if (existsSync(junitPath)) unlinkSync(junitPath);
    } catch {
      // best-effort
    }
  };

  // Remove any report that may remain from a previous run before starting.
  discardReport();

  const command = `bun test ${testPath} --reporter=junit --reporter-outfile=${junitPath}`;
  const result = await runCommand(command, toolDir, timeoutMs, env);
  const output = JSON.parse(result.result) as {
    stdout: string;
    stderr: string;
    exitCode: number;
    timedOut?: boolean;
  };

  let xml = '';
  try {
    if (existsSync(junitPath)) xml = readFileSync(junitPath, 'utf8');
  } catch {
    // missing/partial report → empty sets, handled by callers
  }
  discardReport();

  const names = parseJUnitResults(xml);
  return {
    stdout: output.stdout,
    stderr: output.stderr,
    exitCode: output.exitCode,
    timedOut: output.timedOut ?? false,
    passed: names.passed,
    failed: names.failed,
  };
}
|
|
1586
|
+
|
|
1587
|
+
/** One `test(...)` / `it(...)` block as cut out of a test file's source. */
interface TestBlock {
  /** Title text as written between the quotes of the call. */
  title: string;
  /** Source text from this test's start up to the start of the next test. */
  body: string;
}
|
|
1591
|
+
|
|
1592
|
+
/**
 * Cut a test file into its `test(...)` / `it(...)` blocks.
 *
 * Each block carries the quoted title and the source text running from that
 * call's start to the start of the next one (or to the end of the file for
 * the last block). This is a textual approximation, good enough to check
 * whether a named per-parameter test's body actually calls the workflow.
 *
 * @param src Source text of the test file.
 * @returns Blocks in source order; empty when no test call is found.
 */
export function extractTestBlocks(src: string): TestBlock[] {
  // Group 1: opening quote character; group 2: title up to the matching quote.
  const callPattern = /\b(?:test|it)\s*\(\s*(['"`])((?:\\.|(?!\1).)*)\1/g;
  const openings = Array.from(src.matchAll(callPattern), (hit) => ({
    index: hit.index ?? 0,
    title: hit[2] ?? '',
  }));
  return openings.map((opening, position) => {
    const following = openings[position + 1];
    const stop = following ? following.index : src.length;
    return { title: opening.title, body: src.slice(opening.index, stop) };
  });
}
|
|
1610
|
+
|
|
1611
|
+
/**
 * Heuristic: does a recorded value look like an opaque token/id rather than
 * free text, a city name or a date? Gates the mechanical producer-source
 * detection.
 *
 * A value qualifies when it is at least 12 characters long, contains no
 * whitespace, is not a `YYYY-MM-DD` date, and additionally either contains one
 * of `:`, `|`, `_`, `-`, contains a digit, or is at least 16 characters long.
 */
function looksOpaque(v: string): boolean {
  const tooShort = v.length < 12;
  const isFreeText = /\s/.test(v); // multi-word / free text
  const isDate = /^\d{4}-\d{2}-\d{2}$/.test(v); // dates
  if (tooShort || isFreeText || isDate) {
    return false;
  }
  const hasSeparator = /[:|_-]/.test(v);
  const hasDigit = /\d/.test(v);
  return hasSeparator || hasDigit || v.length >= 16;
}
|
|
1619
|
+
|
|
1620
|
+
/**
 * Mechanical producer-source detector — a secondary signal next to the build
 * plan's declared `tokenParams`.
 *
 * A parameter counts as producer-sourced when its recorded value, or an
 * opaque-looking `|` / `:`-separated segment of it, occurs verbatim in the
 * recorded response of a SIBLING tool. The result carries the parameter name
 * and the producing tool's name when the sibling response had one.
 *
 * The detector is advisory: it never marks a parameter verified. It only
 * forces the chained-test requirement, so that an undeclared cross-tool token
 * cannot ship with a tautological recorded-value test.
 *
 * @param opts.likelyParams        Exposed parameters to inspect.
 * @param opts.recordedParamValues Recorded value per parameter name.
 * @param opts.siblingResponses    Recorded response bodies of sibling tools.
 * @returns One entry per parameter whose value was found in a sibling response.
 */
export function detectTokenSources(opts: {
  likelyParams: Array<{ name: string }>;
  recordedParamValues: Map<string, string>;
  siblingResponses: Array<{ toolName?: string; body: string }>;
}): TokenSource[] {
  return opts.likelyParams.flatMap((candidate): TokenSource[] => {
    const recorded = opts.recordedParamValues.get(candidate.name);
    if (!recorded || !looksOpaque(recorded)) return [];
    // Search for the whole value first, then for each opaque-looking segment.
    const segments = recorded.split(/[|:]/).filter((part) => looksOpaque(part));
    const needles = [recorded, ...segments];
    const producer = opts.siblingResponses.find((response) =>
      needles.some((needle) => response.body.includes(needle)),
    );
    return producer ? [{ param: candidate.name, sourceTool: producer.toolName }] : [];
  });
}
|
|
1644
|
+
|
|
1645
|
+
/**
 * Matches a reference to a SIBLING tool's workflow file
 * (`../<producer>/workflow.json`). Used to tell whether a test block mints a
 * fresh value by calling the producer's workflow rather than only this tool's
 * own workflow.
 */
const SIBLING_WORKFLOW_RE = /\.\.\/[A-Za-z0-9_]+\/workflow\.json/;
|
|
1648
|
+
|
|
1649
|
+
/**
 * Build the `sourcedFrom` stamp for a token parameter.
 *
 * @returns `{ tool, field }` when both the producer tool and its field are
 *          known (non-empty); undefined otherwise.
 */
function sourcedFromOf(ts: {
  sourceTool?: string;
  sourceField?: string;
}): { tool: string; field: string } | undefined {
  const { sourceTool, sourceField } = ts;
  if (!sourceTool || !sourceField) {
    return undefined;
  }
  return { tool: sourceTool, field: sourceField };
}
|
|
1659
|
+
|
|
1660
|
+
/** Result of classifying one live integration run. */
interface IntegrationVerdict {
  /**
   * Governs PARAM coverage: a waived suite allows per-param tests to waive
   * (non-blocking), `failed` blocks, and `passed` grades params strictly.
   */
  outcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed';
  /**
   * Governs `liveVerified` and is INDEPENDENT of `outcome`. True when some
   * backend returned real data in this run (the workflow IS live-verifiable),
   * even if the per-param suite was truncated or blocked. The two are
   * decoupled so that a tool whose stealth/cdp baseline succeeded does not
   * ship `liveVerified:false` only because the param suite hit the verifier
   * timeout.
   */
  baselineLiveVerified: boolean;
  /** First error snippet found in the output (`'unknown'` when none matched). */
  firstError: string;
  /** Backends whose probe digest line reported an error. */
  exhaustedBackends: string[];
  /**
   * Name of a declared `${state.X}` capture that returned null at runtime, or
   * null when there was none. Such a failure is a workflow-correctness error,
   * not infra; the caller crafts the actionable message.
   */
  captureFailName: string | null;
  /** Whether `captureFailName` is among the capture names already known to have failed. */
  captureFailFromKnown: boolean;
}
|
|
1677
|
+
|
|
1678
|
+
/**
 * Pure classifier for the live integration run. It yields the suite `outcome`
 * and, as a separate answer, whether the BASELINE was live-verified.
 *
 * The two answers are separate because an anti-bot suite can have its baseline
 * return real data (liveVerified) while its per-param tests time out or get
 * blocked (params waive, non-blocking). Coupling `liveVerified` to the WHOLE
 * suite passing made a tool with a successful stealth/cdp baseline ship
 * `liveVerified:false` merely because the param suite was truncated by the
 * verifier timeout, and let a lone `fetch`-rung 403 in the partial output read
 * as a total bot block.
 *
 * `baselineLiveVerified` means a backend returned real data this run. It is
 * detected through the ladder's `parallel probe: winner=<backend>` log (logged
 * ONLY on an ok result — robust when JUnit is absent because a timeout killed
 * the suite) OR through a non-`param:` baseline test passing in JUnit (robust
 * when a memoized call skipped the probe log).
 *
 * `exhaustedBackends` contains only the backends whose probe digest line
 * reported an ERROR — not every backend that was merely "trying…".
 *
 * Decision order for a non-zero exit: broken referenced state → failed
 * capture → verifier timeout → bot defense → exhausted ladder → plain failure.
 *
 * @param input.exitCode             Exit code of the test run; 0 means passed.
 * @param input.timedOut             Whether the run was cut off by the timeout.
 * @param input.combined             Combined textual output of the run.
 * @param input.passedTests          Names of tests reported as passed.
 * @param input.referencedStateBroken Forces `failed` when true.
 * @param input.failedCaptureNames   Capture names already known to be failing.
 */
export function classifyIntegrationOutcome(input: {
  exitCode: number;
  timedOut: boolean;
  combined: string;
  passedTests: ReadonlySet<string>;
  referencedStateBroken: boolean;
  failedCaptureNames: ReadonlySet<string>;
}): IntegrationVerdict {
  const log = input.combined;

  const probeReportedWinner = /parallel probe: winner=/.test(log);
  const baselineLiveVerified =
    probeReportedWinner || [...input.passedTests].some((testName) => !testName.startsWith('param:'));

  // Collect, in first-seen order and without duplicates, each backend whose
  // digest line starts with an error code.
  const erroredBackends = new Set<string>();
  const digestLines = log.matchAll(
    /^\s*([a-z-]+): (?:NETWORK|FORBIDDEN|RATE_LIMITED|BAD_RESPONSE|STATE_MISSING|AUTH_EXPIRED|UNKNOWN)\b/gm,
  );
  for (const line of digestLines) {
    erroredBackends.add(line[1] as string);
  }
  const exhaustedBackends = [...erroredBackends];

  const firstError =
    log.match(/\b(NETWORK|FORBIDDEN|RATE_LIMITED)\b[^\n]{0,200}/)?.[0]?.trim() ?? 'unknown';

  // Assemble a verdict from the shared facts; `overrides` replaces individual fields.
  const verdict = (
    outcome: IntegrationVerdict['outcome'],
    overrides: Partial<IntegrationVerdict> = {},
  ): IntegrationVerdict => ({
    baselineLiveVerified,
    firstError,
    exhaustedBackends,
    outcome,
    captureFailName: null,
    captureFailFromKnown: false,
    ...overrides,
  });

  if (input.exitCode === 0) {
    // A clean exit is live-verified by definition and carries no error details.
    return verdict('passed', { baselineLiveVerified: true, firstError: '', exhaustedBackends: [] });
  }

  if (input.referencedStateBroken) {
    return verdict('failed');
  }

  // Fix C — a STATE_MISSING traced to a declared capture is a
  // workflow-correctness error, not infra; waiving it would silently ship a
  // broken workflow. The pattern follows the EXACT runtime message
  // (runtime.ts: `Required capture "<name>" (<source>) did not produce a
  // value.`), whose error-code prefix is separated by an em-dash rather than a
  // colon. This check runs BEFORE the bot-defense check so a capture failure
  // that also has an `_abck` line in its log is still `failed`, not waived.
  const captureFailure = log.match(
    /Required capture\s+"([^"]+)"\s*\([^)]*\)\s*did not produce a value/i,
  );
  if (captureFailure) {
    const captureName = captureFailure[1] ?? '';
    return verdict('failed', {
      captureFailName: captureName,
      captureFailFromKnown: input.failedCaptureNames.has(captureName),
    });
  }

  // A verifier TIMEOUT truncated a paced suite — that is infra, NEVER a bot
  // block (the partial output's fetch-403 must not masquerade as one).
  if (input.timedOut) {
    return verdict('waived-infra', {
      firstError: firstError === 'unknown' ? 'verifier timeout (suite truncated)' : firstError,
    });
  }

  if (isBotDefenseFailure(log)) {
    return verdict('waived-bot');
  }

  // Every ladder rung exhausted with an infra error: an infra error code plus
  // one of the exhaustion markers — the runWorkflowWithLadder probe summary
  // (`all backends failed`), the runWithLadder memo-path summary
  // (`ladder exhausted`), or stealth's `giving up` / non-escalatable markers.
  const sawInfraErrorCode = /\bRATE_LIMITED\b|\bFORBIDDEN\b|\bNETWORK\b/.test(log);
  const ladderExhausted =
    sawInfraErrorCode &&
    /non-escalatable|giving up|ladder exhausted|all backends failed/.test(log);
  return verdict(ladderExhausted ? 'waived-infra' : 'failed');
}
|
|
1783
|
+
|
|
1784
|
+
/**
 * Pure per-parameter coverage classifier (Fix C/D + chained-token verification).
 * Decides, for each exposed parameter, whether it was behaviorally verified — a
 * `param:<name>` integration test that actually ran green (in `passedTests`) AND
 * calls the workflow — and otherwise why it is unverified. Never drops a param
 * (keep+mark policy):
 *  - covered-live → `{ verified: true }`
 *  - suite waived by anti-bot/infra and not covered → `{ verified: false, reason: 'waived-*' }`
 *  - annotated `// exposed-but-not-verified` and not covered → `{ verified: false, reason: 'annotated' }`
 *  - else (suite ran, no test, no annotation) → `uncovered` (blocking)
 *  - passed but the test never calls runWorkflowWithLadder → `tautological` (blocking)
 *
 * A **producer-sourced token param** (in `tokenSources`) is held to a stricter
 * bar: its `param:<name>` test must mint a FRESH value by calling the producer's
 * sibling workflow (`../<tool>/workflow.json`), not reuse the recorded constant.
 *  - chained pass → `{ verified: true, sourcedFrom }`
 *  - passed but not chained (the recorded-value tautology) → `unchained` (blocking)
 *  - suite waived (producer anti-bot) → `{ verified: false, reason: 'waived-chain' }`
 *  - else → `unchained` (blocking)
 *
 * `param:<name>` is matched as a whole token: the name must not be followed by
 * another word character. A plain substring match would let a passing
 * `param:date_range` test vouch for a parameter called `date`, and could pick
 * the wrong test block for the anti-tautology check.
 *
 * @param opts.likelyParams       Exposed parameters to grade.
 * @param opts.integrationSrc     Source text of the integration test file.
 * @param opts.passedTests        Names of the tests that ran green.
 * @param opts.integrationOutcome Outcome of the suite as a whole.
 * @param opts.tokenSources       Parameters known to be producer-sourced tokens.
 * @returns Per-param verification records plus the names of the blocking
 *          `uncovered`, `tautological` and `unchained` parameters.
 */
export function classifyParamCoverage(opts: {
  likelyParams: Array<{ name: string }>;
  integrationSrc: string;
  passedTests: Set<string>;
  integrationOutcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed' | 'absent';
  tokenSources?: TokenSource[];
}): {
  paramVerification: ParamVerification[];
  uncovered: string[];
  tautological: string[];
  unchained: string[];
} {
  const paramVerification: ParamVerification[] = [];
  const uncovered: string[] = [];
  const tautological: string[] = [];
  const unchained: string[] = [];
  const tokenByName = new Map((opts.tokenSources ?? []).map((t) => [t.param, t]));
  const blocks = extractTestBlocks(opts.integrationSrc);
  const passedNames = [...opts.passedTests];
  const waived =
    opts.integrationOutcome === 'waived-bot' || opts.integrationOutcome === 'waived-infra';
  const callsWorkflow = (body: string): boolean => /runWorkflowWithLadder\s*\(/.test(body);

  for (const lp of opts.likelyParams) {
    // Escape regex metacharacters so the name is always matched literally.
    const escapedName = lp.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Whole-token match: `param:date` must not match `param:date_range`.
    // The regex is non-global, so repeated `.test` calls are stateless.
    const tokenRe = new RegExp(`param:${escapedName}(?!\\w)`);
    const passedLive = passedNames.some((n) => tokenRe.test(n));
    const block = blocks.find((b) => tokenRe.test(b.title));

    // Producer-sourced token param: requires a chained test that mints a fresh
    // value from the producer's sibling workflow.
    const ts = tokenByName.get(lp.name);
    if (ts) {
      const sourcedFrom = sourcedFromOf(ts);
      if (passedLive) {
        const chained =
          !!block && callsWorkflow(block.body) && SIBLING_WORKFLOW_RE.test(block.body);
        if (chained) {
          paramVerification.push({ name: lp.name, verified: true, sourcedFrom });
        } else {
          unchained.push(lp.name);
        }
      } else if (waived) {
        paramVerification.push({
          name: lp.name,
          verified: false,
          reason: 'waived-chain',
          sourcedFrom,
        });
      } else {
        unchained.push(lp.name);
      }
      continue;
    }

    if (passedLive) {
      // Anti-tautology: a passing per-param test must actually exercise the live
      // workflow, not assert a constant. When the block cannot be located in
      // the source there is nothing to inspect and the pass is accepted.
      if (block && !callsWorkflow(block.body)) {
        tautological.push(lp.name);
      } else {
        paramVerification.push({ name: lp.name, verified: true });
      }
      continue;
    }

    if (waived) {
      paramVerification.push({
        name: lp.name,
        verified: false,
        reason: opts.integrationOutcome as 'waived-bot' | 'waived-infra',
      });
      continue;
    }

    // An `// exposed-but-not-verified … <name>` comment on one line opts the
    // parameter out of the blocking `uncovered` list.
    const annotationRe = new RegExp(
      `//\\s*exposed-but-not-verified[^\\n]*\\b${escapedName}\\b`,
    );
    if (annotationRe.test(opts.integrationSrc)) {
      paramVerification.push({ name: lp.name, verified: false, reason: 'annotated' });
      continue;
    }
    uncovered.push(lp.name);
  }
  return { paramVerification, uncovered, tautological, unchained };
}
|
|
1889
|
+
|
|
1890
|
+
/**
 * Fix D: after a successful verification, write each exposed parameter's
 * `verified` / `verifyNote` into workflow.json, so that the audit harness and
 * operators can see which parameters were not behaviorally verified at compile
 * time (keep+mark policy — nothing is dropped). A known producer source is
 * stamped as `sourcedFrom` as well.
 *
 * Best-effort: a missing or unreadable workflow.json yields no warnings, and a
 * write failure never blocks a tool that already passed verification.
 *
 * @param toolDir           Directory that contains workflow.json.
 * @param paramVerification Per-parameter outcomes to persist.
 * @returns A single consolidated warning line naming the unverified
 *          parameters, or an empty array when there is nothing to report.
 */
export function applyParamVerification(
  toolDir: string,
  paramVerification: ParamVerification[],
): string[] {
  if (paramVerification.length === 0) return [];

  const workflowPath = pathJoin(toolDir, 'workflow.json');
  if (!existsSync(workflowPath)) return [];

  let workflow: {
    parameters?: Array<{
      name: string;
      verified?: boolean;
      verifyNote?: string;
      sourcedFrom?: { tool: string; field: string };
    }>;
  };
  try {
    workflow = JSON.parse(readFileSync(workflowPath, 'utf8'));
  } catch {
    return [];
  }

  const outcomeFor = new Map(paramVerification.map((entry) => [entry.name, entry]));
  for (const param of workflow.parameters ?? []) {
    const outcome = outcomeFor.get(param.name);
    if (!outcome) continue;
    const isVerified = Boolean(outcome.verified);
    param.verified = isVerified;
    // An undefined note is dropped when the file is serialized.
    param.verifyNote = isVerified ? undefined : outcome.reason;
    // Stamp the producer-source contract so the MCP description (mcp-server.ts)
    // tells the orchestrating LLM where to mint the token and `imprint audit`
    // chains producer→consumer instead of fabricating it.
    if (outcome.sourcedFrom) param.sourcedFrom = outcome.sourcedFrom;
  }

  try {
    writeFileSync(workflowPath, `${JSON.stringify(workflow, null, 2)}\n`, 'utf8');
  } catch {
    // best-effort — the tool is already verified; this is only metadata.
  }

  const unverified = paramVerification.filter((entry) => !entry.verified);
  if (unverified.length === 0) return [];
  const details = unverified.map((p) => `${p.name}: ${p.reason ?? 'unverified'}`).join(', ');
  return [
    `${unverified.length} parameter(s) live-unverified at compile time (${details}) — exercised at runtime via the stealth-fetch / playbook ladder.`,
  ];
}
|
|
1947
|
+
|
|
1948
|
+
/**
 * Record the integration-test waiver outcome in workflow.json.
 *
 * When a tool's integration test could not produce live data (anti-bot block
 * or every-rung NETWORK exhaustion) the tool ships anyway, but the workflow
 * gets `liveVerified: false` plus the structured waiver reason, so the audit
 * gate and the teach summary can flag it rather than silently treating it as
 * verified. Without a waiver the workflow gets `liveVerified: true` and any
 * previously stored waiver is removed.
 *
 * Best-effort: a missing or unreadable workflow.json is left alone, and a
 * write failure never blocks a tool that already passed parser + schema
 * verification.
 *
 * @param toolDir          Directory that contains workflow.json.
 * @param liveVerification The waiver to record, or undefined when the tool was
 *                         live-verified.
 */
export function applyLiveVerification(
  toolDir: string,
  liveVerification:
    | { kind: 'waived-bot' | 'waived-infra'; firstError: string; exhaustedBackends: string[] }
    | undefined,
): void {
  const workflowPath = pathJoin(toolDir, 'workflow.json');
  if (!existsSync(workflowPath)) return;

  let workflow: Record<string, unknown>;
  try {
    workflow = JSON.parse(readFileSync(workflowPath, 'utf8'));
  } catch {
    return;
  }

  const isWaived = Boolean(liveVerification);
  workflow.liveVerified = !isWaived;
  // An undefined waiver is dropped when the file is serialized.
  workflow.liveVerifiedWaiver = isWaived ? liveVerification : undefined;

  try {
    writeFileSync(workflowPath, `${JSON.stringify(workflow, null, 2)}\n`, 'utf8');
  } catch {
    // best-effort — non-fatal
  }
}
|
|
1984
|
+
|
|
1985
|
+
/**
 * Reduce a workflow URL to its origin and path, so it can be compared with a
 * recorded request URL.
 *
 * Every `${...}` placeholder is replaced by the stable token `X` before
 * parsing; only the parsed origin and pathname are returned, so the query
 * string plays no part in the comparison.
 *
 * @returns `{ origin, path }`, or null when the URL is still unparseable after
 *          the substitution (for example a placeholder in the scheme or host);
 *          the caller then falls back to substring matching.
 */
function normalizeUrlForMatch(rawUrl: string): { origin: string; path: string } | null {
  const withoutPlaceholders = rawUrl.replace(/\$\{[^}]+\}/g, 'X');
  let parsed: URL;
  try {
    parsed = new URL(withoutPlaceholders);
  } catch {
    return null;
  }
  const { origin, pathname } = parsed;
  return { origin, path: pathname };
}
|
|
2000
|
+
|
|
2001
|
+
/**
 * Collect the recorded requests that match a workflow request by method
 * (case-insensitive) and by normalized origin + path. Used by the
 * capture-cross-reference and hardcoded-body checks.
 *
 * @param session        Recording whose `requests` are searched.
 * @param method         HTTP method of the workflow request.
 * @param url            Workflow URL; may contain `${...}` placeholders.
 * @param restrictToSeqs When given, only requests whose `seq` is in the set
 *                       are considered.
 * @returns Matching requests in recording order; empty when `url` cannot be
 *          normalized.
 */
function findRecordedMatches(
  session: Session,
  method: string,
  url: string,
  restrictToSeqs?: Set<number>,
): CapturedRequest[] {
  const target = normalizeUrlForMatch(url);
  if (!target) return [];

  const wantedMethod = method.toUpperCase();
  const isMatch = (request: CapturedRequest): boolean => {
    const inScope = !restrictToSeqs || restrictToSeqs.has(request.seq);
    if (!inScope || request.method.toUpperCase() !== wantedMethod) return false;
    const recorded = normalizeUrlForMatch(request.url);
    return !!recorded && recorded.origin === target.origin && recorded.path === target.path;
  };
  return session.requests.filter(isMatch);
}
|
|
2020
|
+
|
|
2021
|
+
/**
 * Look up a header by name, ignoring case, in a `Record<string, string>`.
 * Captured records keep header names exactly as they were recorded — Chrome's
 * DevTools protocol does not normalize them — hence the case-insensitive scan.
 *
 * @returns The value of the first header whose name matches, or undefined.
 */
function headerValue(headers: Record<string, string>, name: string): string | undefined {
  const wanted = name.toLowerCase();
  const entry = Object.entries(headers).find(([key]) => key.toLowerCase() === wanted);
  return entry?.[1];
}
|
|
2031
|
+
|
|
2032
|
+
/**
 * Check whether the Set-Cookie header in `headers` defines a cookie called
 * `cookieName`. Set-Cookie can occur several times and the captured shape is
 * best-effort, so the single header value is split into candidate cookies
 * before the names are compared.
 *
 * @returns true when any candidate's name (the trimmed text before its first
 *          `=`) equals `cookieName`; false when there is no Set-Cookie header.
 */
function setCookieDefines(headers: Record<string, string>, cookieName: string): boolean {
  const raw = headerValue(headers, 'set-cookie');
  if (!raw) return false;
  // Multiple cookies may be joined with newlines or commas; split conservatively:
  // a comma only separates cookies when a letter or underscore follows it.
  return raw.split(/\n|,(?=\s*[A-Za-z_])/).some((candidate) => {
    const eq = candidate.indexOf('=');
    return eq >= 0 && candidate.slice(0, eq).trim() === cookieName;
  });
}
|
|
2047
|
+
|
|
2048
|
+
/** Fix A — check every capture that is not explicitly `required: false`
 * against the recorded request it reads from.
 *
 * For each such capture (bootstrap captures first, then per-request captures in
 * workflow order) the matching recorded request is located by method + URL,
 * preferring the seqs in `candidateRequestSeqs` and falling back to the whole
 * recording. Captures with no recorded counterpart are skipped, since nothing
 * can be proven about them. General — not specific to any capture source or site.
 *
 * @param workflow             parsed workflow definition
 * @param session              recorded session
 * @param candidateRequestSeqs optional seqs to prefer when locating recordings
 * @returns the failure messages plus the names of the captures that failed */
function crossReferenceCaptures(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
  candidateRequestSeqs?: number[],
): { failures: string[]; failedCaptureNames: Set<string> } {
  const failures: string[] = [];
  const failedCaptureNames = new Set<string>();
  const restrictSet = candidateRequestSeqs ? new Set(candidateRequestSeqs) : undefined;

  // Shared per-capture check used by both the bootstrap and the request pass.
  const audit = (
    cap: BootstrapCapture | RequestCapture,
    method: string,
    url: string,
    context: string,
  ): void => {
    if (cap.required === false) return;
    // The endpoint might not be in candidateRequestSeqs (it can be a
    // dependency), so retry without the restriction before giving up.
    const scoped = findRecordedMatches(session, method, url, restrictSet)[0];
    const recorded = scoped ?? findRecordedMatches(session, method, url)[0];
    // Out of scope; do not fail — we can't prove anything.
    if (!recorded) return;
    const problem = validateCaptureAgainstRecording(cap, recorded, context);
    if (!problem) return;
    failures.push(problem);
    failedCaptureNames.add(cap.name);
  };

  // Bootstrap captures: the bootstrap page is always fetched with GET.
  const bootstrap = workflow.bootstrap;
  if (bootstrap?.captures) {
    for (const cap of bootstrap.captures) {
      audit(cap, 'GET', bootstrap.url, 'bootstrap GET');
    }
  }

  // Per-request captures.
  workflow.requests.forEach((req, i) => {
    if (!req.captures) return;
    for (const cap of req.captures) {
      audit(cap, req.method, req.url, `request[${i}] ${req.method} ${req.url}`);
    }
  });

  return { failures, failedCaptureNames };
}
|
|
2104
|
+
|
|
2105
|
+
/** Fix 2 — verify the regex captures that requests actually depend on.
 *
 * Every `${state.X}` placeholder found in a request's url, body or header
 * values is collected. When `X` is produced by an `html_regex` / `text_regex`
 * capture (bootstrap or per-request), the capture's pattern must match at
 * least one recorded HTML document body; the `required` flag is ignored
 * because the request hard-references the value.
 *
 * HTML bodies are taken from recorded requests on the bootstrap URL's origin
 * when any exist, otherwise from every recorded HTML document (the bootstrap
 * page itself may be missing from the recording). With no recorded HTML at
 * all, only pattern validity is checked. Referenced names with no declared
 * capture, or with a capture of another source, are skipped.
 * General — not specific to any site or token.
 *
 * @param workflow parsed workflow definition
 * @param session  recorded session
 * @returns the failure messages plus the names of the captures that failed */
export function crossReferenceReferencedStateCaptures(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
): { failures: string[]; failedCaptureNames: Set<string> } {
  const failures: string[] = [];
  const failedCaptureNames = new Set<string>();

  // 1) Every state name referenced from a request url / body / header value.
  const referenced = new Set<string>();
  const collectRefs = (text: string | undefined): void => {
    if (!text) return;
    for (const hit of text.matchAll(/\$\{state\.([A-Za-z0-9_]+)\}/g)) {
      if (hit[1]) referenced.add(hit[1]);
    }
  };
  for (const req of workflow.requests) {
    collectRefs(req.url);
    collectRefs(req.body);
    Object.values(req.headers ?? {}).forEach((value) => collectRefs(value));
  }
  if (referenced.size === 0) return { failures, failedCaptureNames };

  // 2) Captures by name; a later declaration of the same name wins
  //    (bootstrap captures first, then per-request captures in order).
  const capByName = new Map<string, BootstrapCapture | RequestCapture>();
  const declared = [
    ...(workflow.bootstrap?.captures ?? []),
    ...workflow.requests.flatMap((req) => req.captures ?? []),
  ];
  for (const cap of declared) capByName.set(cap.name, cap);

  // 3) Recorded HTML bodies: bootstrap-origin documents if there are any,
  //    otherwise every HTML document in the recording.
  let targetOrigin: string | undefined;
  if (workflow.bootstrap?.url) {
    try {
      targetOrigin = new URL(workflow.bootstrap.url).origin;
    } catch {
      /* unparseable bootstrap URL — no origin filter */
    }
  }
  const isHtmlDoc = (r: CapturedRequest): boolean => {
    const mime = r.response?.mimeType ?? '';
    const looksHtml = mime.includes('text/html') || r.resourceType === 'Document';
    if (!looksHtml) return false;
    const body = r.response?.body;
    return typeof body === 'string' && body.length > 0;
  };
  const sameOrigin = (r: CapturedRequest): boolean => {
    if (!targetOrigin) return true;
    try {
      return new URL(r.url).origin === targetOrigin;
    } catch {
      return false;
    }
  };
  const htmlDocs = session.requests.filter(isHtmlDoc);
  const originDocs = htmlDocs.filter(sameOrigin);
  const htmlBodies = (originDocs.length > 0 ? originDocs : htmlDocs).map(
    (r) => r.response?.body ?? '',
  );

  // 4) Each referenced regex capture must compile and match some HTML body.
  for (const name of referenced) {
    const cap = capByName.get(name);
    if (!cap) continue; // may be seeded by the fetch-bootstrap jar — not statically known
    if (cap.source !== 'html_regex' && cap.source !== 'text_regex') continue;
    if (failedCaptureNames.has(name)) continue;

    let re: RegExp;
    try {
      re = new RegExp(cap.pattern);
    } catch (err) {
      failures.push(
        `capture "${name}" (referenced via \${state.${name}} in a request) has an invalid regex /${cap.pattern}/: ${err instanceof Error ? err.message : String(err)}.`,
      );
      failedCaptureNames.add(name);
      continue;
    }

    // No recorded HTML to check against — nothing can be proven either way.
    if (htmlBodies.length === 0) continue;
    if (htmlBodies.some((body) => re.test(body))) continue;

    failures.push(
      `capture "${name}" (source "${cap.source}") is referenced via \${state.${name}} in a request, but its pattern /${cap.pattern}/ does not match ANY recorded HTML page body for this site. At runtime \${state.${name}} resolves to nothing → the request fails with STATE_MISSING. Fix the pattern to match the token as it actually appears in the recorded page (inspect the recorded HTML), or change the capture source. (required:${cap.required === false ? 'false' : 'true'} does not exempt this — the request hard-references the value.)`,
    );
    failedCaptureNames.add(name);
  }

  return { failures, failedCaptureNames };
}
|
|
2211
|
+
|
|
2212
|
+
/** Check a single capture against the recorded request it is declared to read.
 *
 * Only sources that can be judged from the recorded response are checked:
 * `response_header` (header present and non-empty), `cookie` (Set-Cookie
 * defines the cookie) and `html_regex` / `text_regex` (pattern compiles and
 * matches the response body). Every other source is accepted unchecked.
 *
 * @param cap      capture definition to validate
 * @param recorded recorded request whose response is inspected
 * @param context  human-readable location used in the failure message
 * @returns a failure message, or null when the capture passes or cannot be
 *          validated statically */
function validateCaptureAgainstRecording(
  cap: BootstrapCapture | RequestCapture,
  recorded: CapturedRequest,
  context: string,
): string | null {
  const respHeaders = recorded.response?.headers ?? {};
  const respBody = recorded.response?.body ?? '';
  // Common message prefix; each branch appends its own remediation advice.
  const fix = (suggestion: string) =>
    `capture "${cap.name}" on ${context}: declared source "${cap.source}" did not produce a value in the recording (seq=${recorded.seq}). ${suggestion}`;

  if (cap.source === 'response_header') {
    const found = headerValue(respHeaders, cap.header);
    if (found !== undefined && found !== '') return null;
    return fix(
      `The recorded response has no "${cap.header}" header. Inspect the recorded response headers for a header that actually carries this value, or switch to source: 'html_regex' / 'cookie' / 'dom_*' if the value lives elsewhere.`,
    );
  }

  if (cap.source === 'cookie') {
    if (setCookieDefines(respHeaders, cap.cookie)) return null;
    return fix(
      `The recorded response Set-Cookie does not define cookie "${cap.cookie}". Check the recorded response headers and pick the correct cookie name, or switch source if the value isn't in a cookie.`,
    );
  }

  if (cap.source === 'html_regex' || cap.source === 'text_regex') {
    try {
      if (new RegExp(cap.pattern).test(respBody)) return null;
    } catch (err) {
      return fix(
        `Pattern is not a valid regex: ${err instanceof Error ? err.message : String(err)}.`,
      );
    }
    return fix(
      `Pattern /${cap.pattern}/ does not match the recorded response body. The token may live in a different location — check response headers (use source: 'response_header'), Set-Cookie (use source: 'cookie'), or revise the pattern.`,
    );
  }

  // 'json' captures use a path expression; static validation is fragile, so
  // the integration test is left to surface failures. The remaining sources
  // (dom_attribute, dom_text, local_storage, session_storage) are not
  // statically verifiable from a HAR-style recording either.
  return null;
}
|
|
2263
|
+
|
|
2264
|
+
/** Fix B — detect request body fields hardcoded to a recorded value when the
 * recording proves the field is user input (its value varies across multiple
 * recorded invocations of the same endpoint). The verifier rejects done() so
 * the agent must expose the field as `${param.X}` (or use a
 * requestTransformModule). General — not specific to any one site.
 *
 * A field is reported when all of these hold:
 *  - at least two recorded invocations of the endpoint have parseable bodies,
 *  - the field takes two or more distinct values across them,
 *  - the workflow body carries the field as a literal (no `${param|state|
 *    credential|response.…}` placeholder), and
 *  - that literal equals ANY of the recorded values.
 *
 * @param workflow              parsed workflow definition
 * @param session               recorded session
 * @param candidateRequestSeqs  seqs of the tool's own load-bearing requests
 * @param dependencyRequestSeqs extra seqs that widen the variation pool
 * @returns one failure message per offending request (empty when clean, or
 *          when the workflow uses a requestTransformModule) */
function detectHardcodedSessionValues(
  workflow: ReturnType<typeof WorkflowSchema.parse>,
  session: Session,
  candidateRequestSeqs?: number[],
  dependencyRequestSeqs?: number[],
): string[] {
  // Skip the whole check when the workflow uses a requestTransformModule:
  // that module is the agent's declared escape hatch for programmatic body
  // construction (e.g. _uid generators, position-dependent encoding), and
  // any literal we see in workflow.json's body field may be overridden at
  // runtime by the transform. Trying to second-guess transform behavior
  // statically is the wrong layer.
  if (workflow.requestTransformModule) return [];

  const failures: string[] = [];
  const allowedSeqs = new Set<number>([
    ...(candidateRequestSeqs ?? []),
    ...(dependencyRequestSeqs ?? []),
  ]);
  const restrictSet = allowedSeqs.size > 0 ? allowedSeqs : undefined;
  // Any template placeholder inside a value marks it as templated.
  const placeholderRe = /\$\{(param|state|credential|response)\.[A-Za-z0-9_[\]]+\}/;
  // Cap on how many distinct recorded values are echoed in a message. This is
  // a display limit only; detection always uses the full set of values.
  const maxShownValues = 4;

  for (const [i, req] of workflow.requests.entries()) {
    if (!req.body || req.body.length === 0) continue;

    const matches = findRecordedMatches(session, req.method, req.url, restrictSet);
    const firstMatch = matches[0];
    // Variation can only be proven with two or more recorded invocations.
    if (matches.length < 2 || !firstMatch) continue;

    // Determine body parser based on the recorded Content-Type (workflow may
    // have stripped headers). Both lookups are case-insensitive, and a
    // workflow request without headers is tolerated.
    const recordedCt =
      headerValue(firstMatch.headers, 'content-type') ??
      headerValue(req.headers ?? {}, 'content-type') ??
      '';

    const parsed = matches
      .map((m) => parseBodyForFieldExtraction(m.body ?? '', recordedCt))
      .filter((p): p is Record<string, string> => p !== null);
    if (parsed.length < 2) continue;

    // Collect distinct values per field (insertion order = recording order).
    const valuesByField = new Map<string, Set<string>>();
    for (const fields of parsed) {
      for (const [k, v] of Object.entries(fields)) {
        let seen = valuesByField.get(k);
        if (!seen) {
          seen = new Set<string>();
          valuesByField.set(k, seen);
        }
        seen.add(v);
      }
    }

    const varying = [...valuesByField].filter(([, seen]) => seen.size >= 2);
    if (varying.length === 0) continue;

    const workflowParsed = parseBodyForFieldExtraction(req.body, recordedCt);
    if (!workflowParsed) continue;

    const offenders: Array<{ field: string; literal: string; distinctValues: string[] }> = [];
    for (const [field, seen] of varying) {
      const wfValue = workflowParsed[field];
      if (wfValue === undefined) continue;
      // If the workflow value contains ANY placeholder, it's templated → OK.
      if (placeholderRe.test(wfValue)) continue;
      // The workflow value is a literal. It is a frozen-session-value bug only
      // when it equals a recorded value; a literal that differs from every
      // recorded value is treated as a deliberate default and left alone.
      if (!seen.has(wfValue)) continue;
      const shown = [...seen].slice(0, maxShownValues);
      // Keep the offending literal visible even when it lies past the cap.
      if (!shown.includes(wfValue)) shown.push(wfValue);
      offenders.push({ field, literal: wfValue, distinctValues: shown });
    }

    if (offenders.length > 0) {
      const lines = offenders.map(
        (o) =>
          ` ${o.field}=${JSON.stringify(o.literal)} — recorded values across seqs: [${o.distinctValues
            .map((v) => JSON.stringify(v))
            .join(', ')}]`,
      );
      failures.push(
        `request[${i}] ${req.method} ${req.url} body has ${offenders.length} field(s) frozen to one recorded user's session — the recording proves these are user input:\n${lines.join('\n')}\nReplace each with \${param.NAME} and add the parameter to workflow.parameters, OR move body construction into a requestTransformModule.`,
      );
    }
  }

  return failures;
}
|
|
2359
|
+
|
|
2360
|
+
/** Parse a request body into a flat field→value map for variation analysis.
 *
 * Supported shapes:
 *  - form-urlencoded: `+` becomes a space and percent escapes are decoded. A
 *    component with a malformed escape (e.g. `100%`) is kept undecoded instead
 *    of throwing, so one odd recorded body cannot abort the whole check.
 *  - JSON with a top-level object: only string / number / boolean members are
 *    kept, stringified; nested objects, arrays and null are dropped.
 *
 * When `contentType` is empty the shape is sniffed: a body that starts with
 * `{` and parses as a JSON object is treated as JSON (even if its values
 * contain `=` and `&`); otherwise a body containing both `=` and `&` is
 * treated as form-urlencoded.
 *
 * @param body        raw request body text
 * @param contentType Content-Type header value, or '' when unknown
 * @returns the field map, or null for shapes the check can't reason about */
function parseBodyForFieldExtraction(
  body: string,
  contentType: string,
): Record<string, string> | null {
  const ct = contentType.toLowerCase();

  // Decode one form component; fall back to the undecoded text when the
  // percent-encoding is malformed (decodeURIComponent throws URIError).
  const decodeComponent = (raw: string): string => {
    const spaced = raw.replace(/\+/g, ' ');
    try {
      return decodeURIComponent(spaced);
    } catch (err) {
      if (err instanceof URIError) return spaced;
      throw err;
    }
  };

  // key=value pairs separated by `&`; pairs without `=` are ignored and a
  // repeated key keeps its last value.
  const parseForm = (): Record<string, string> => {
    const out: Record<string, string> = {};
    for (const pair of body.split('&')) {
      const eq = pair.indexOf('=');
      if (eq < 0) continue;
      out[decodeComponent(pair.slice(0, eq))] = decodeComponent(pair.slice(eq + 1));
    }
    return out;
  };

  // Top-level JSON object → scalar members only; anything else → null.
  const parseJsonObject = (): Record<string, string> | null => {
    let parsed: unknown;
    try {
      parsed = JSON.parse(body);
    } catch {
      return null; // not parseable
    }
    if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return null;
    const out: Record<string, string> = {};
    for (const [k, v] of Object.entries(parsed)) {
      if (typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean') {
        out[k] = String(v);
      }
    }
    return out;
  };

  if (ct.includes('application/x-www-form-urlencoded')) return parseForm();
  if (ct.includes('application/json')) return parseJsonObject();
  if (ct !== '') return null;

  // Unknown content type: try JSON first so `=` / `&` inside JSON string
  // values are not mistaken for form separators.
  if (body.trim().startsWith('{')) {
    const fromJson = parseJsonObject();
    if (fromJson) return fromJson;
  }
  return body.includes('=') && body.includes('&') ? parseForm() : null;
}
|
|
2400
|
+
|
|
1108
2401
|
export async function externalVerification(
|
|
1109
2402
|
toolDir: string,
|
|
1110
2403
|
session: Session,
|
|
@@ -1113,10 +2406,52 @@ export async function externalVerification(
|
|
|
1113
2406
|
expectedToolName?: string;
|
|
1114
2407
|
likelyParams?: Array<{ name: string; type?: string; description?: string }>;
|
|
1115
2408
|
candidateRequestSeqs?: number[];
|
|
2409
|
+
/** Shared modules the build plan assigned to this tool. The verifier asserts
|
|
2410
|
+
* each verified module is actually imported (no silent re-implementation). */
|
|
2411
|
+
assignedSharedModules?: AssignedSharedModule[];
|
|
2412
|
+
/** Producer→consumer token contracts the build plan declared for this tool:
|
|
2413
|
+
* each `param` is minted by `sourceTool`'s `sourceField` output. Such params
|
|
2414
|
+
* require a chained `param:<name>` test (mint a fresh value from the producer)
|
|
2415
|
+
* and are stamped with `sourcedFrom` on success. */
|
|
2416
|
+
tokenParams?: Array<{ param: string; sourceTool: string; sourceField: string }>;
|
|
2417
|
+
/** Fields the build plan requires THIS tool's parser to emit for sibling
|
|
2418
|
+
* consumers (producer side). The verifier fails the tool if a declared field
|
|
2419
|
+
* is not emitted, so the producer/consumer field name can't silently diverge
|
|
2420
|
+
* (e.g. the plan says `hotel_id` but the parser emits `propertyToken`). */
|
|
2421
|
+
emittedTokens?: Array<{ field: string; shape: string }>;
|
|
2422
|
+
/** Build-plan-declared dependency seqs (e.g. bootstrap GET seq, producer
|
|
2423
|
+
* search seq) used by the hardcoded-body check to widen its variation
|
|
2424
|
+
* pool beyond the tool's own load-bearing seqs. */
|
|
2425
|
+
dependencyRequestSeqs?: number[];
|
|
1116
2426
|
} = {},
|
|
1117
|
-
): Promise<{
|
|
2427
|
+
): Promise<{
|
|
2428
|
+
failures: string[];
|
|
2429
|
+
warnings: string[];
|
|
2430
|
+
paramVerification: ParamVerification[];
|
|
2431
|
+
/** Set when the integration test was waived rather than passing live — the
|
|
2432
|
+
* caller should stamp this onto workflow.json so audit/teach can surface
|
|
2433
|
+
* the unverified state instead of silently treating the tool as live. */
|
|
2434
|
+
liveVerification?: {
|
|
2435
|
+
kind: 'waived-bot' | 'waived-infra';
|
|
2436
|
+
firstError: string;
|
|
2437
|
+
exhaustedBackends: string[];
|
|
2438
|
+
};
|
|
2439
|
+
}> {
|
|
1118
2440
|
const failures: string[] = [];
|
|
1119
2441
|
const warnings: string[] = [];
|
|
2442
|
+
const paramVerification: ParamVerification[] = [];
|
|
2443
|
+
let liveVerification:
|
|
2444
|
+
| { kind: 'waived-bot' | 'waived-infra'; firstError: string; exhaustedBackends: string[] }
|
|
2445
|
+
| undefined;
|
|
2446
|
+
// Captures Fix A flagged as having a wrong source. Surfaced into the
|
|
2447
|
+
// waiver classification (Fix C) so a STATE_MISSING traced to one of these
|
|
2448
|
+
// captures cannot silently become `waived-infra`.
|
|
2449
|
+
let failedCaptureNames = new Set<string>();
|
|
2450
|
+
// Fix 3 — when a request-referenced ${state.X} capture provably can't resolve
|
|
2451
|
+
// (Fix 2 below), the live integration call is GUARANTEED to STATE_MISSING, so
|
|
2452
|
+
// firing it is pure waste that also burns the per-IP anti-bot rate budget.
|
|
2453
|
+
// Skip the live test in that case and make the agent fix the capture first.
|
|
2454
|
+
let referencedStateBroken = false;
|
|
1120
2455
|
|
|
1121
2456
|
const workflowPath = pathJoin(toolDir, 'workflow.json');
|
|
1122
2457
|
const parserPath = pathJoin(toolDir, 'parser.ts');
|
|
@@ -1141,6 +2476,38 @@ export async function externalVerification(
|
|
|
1141
2476
|
);
|
|
1142
2477
|
}
|
|
1143
2478
|
|
|
2479
|
+
// Fix A — cross-reference every required capture against the recording.
|
|
2480
|
+
// A capture that declares `response_header` but reads from a recorded
|
|
2481
|
+
// response with no such header (or `html_regex` whose pattern doesn't
|
|
2482
|
+
// match the recorded body, etc.) will silently return null at runtime;
|
|
2483
|
+
// we reject it at compile so the agent picks a source that works.
|
|
2484
|
+
const crossRef = crossReferenceCaptures(workflow, session, opts.candidateRequestSeqs);
|
|
2485
|
+
failures.push(...crossRef.failures);
|
|
2486
|
+
failedCaptureNames = crossRef.failedCaptureNames;
|
|
2487
|
+
|
|
2488
|
+
// Fix 2 — cross-reference captures that a request DEPENDS ON via
|
|
2489
|
+
// `${state.X}` (e.g. an anti-bot csrf/csp-nonce html_regex capture whose
|
|
2490
|
+
// bootstrap page isn't in the recording) against every recorded HTML body,
|
|
2491
|
+
// regardless of `required`. Catches the silent STATE_MISSING that ships a
|
|
2492
|
+
// .act tool which can never resolve its csrf header at runtime.
|
|
2493
|
+
const stateRef = crossReferenceReferencedStateCaptures(workflow, session);
|
|
2494
|
+
failures.push(...stateRef.failures);
|
|
2495
|
+
for (const n of stateRef.failedCaptureNames) failedCaptureNames.add(n);
|
|
2496
|
+
if (stateRef.failedCaptureNames.size > 0) referencedStateBroken = true;
|
|
2497
|
+
|
|
2498
|
+
// Fix B — flag request body fields hardcoded to one recorded user's
|
|
2499
|
+
// session when the recording proves those fields are user input
|
|
2500
|
+
// (varying values across multiple recorded invocations of the same
|
|
2501
|
+
// endpoint). Skipped when the tool uses a requestTransformModule.
|
|
2502
|
+
failures.push(
|
|
2503
|
+
...detectHardcodedSessionValues(
|
|
2504
|
+
workflow,
|
|
2505
|
+
session,
|
|
2506
|
+
opts.candidateRequestSeqs,
|
|
2507
|
+
opts.dependencyRequestSeqs,
|
|
2508
|
+
),
|
|
2509
|
+
);
|
|
2510
|
+
|
|
1144
2511
|
if (opts.likelyParams && opts.likelyParams.length > 0) {
|
|
1145
2512
|
// Build the set of query param keys from the original recorded URLs
|
|
1146
2513
|
// so we can distinguish real API params from invented ones.
|
|
@@ -1237,6 +2604,17 @@ export async function externalVerification(
|
|
|
1237
2604
|
}
|
|
1238
2605
|
}
|
|
1239
2606
|
|
|
2607
|
+
// Shared-module reuse: when the build plan assigned this tool a verified
|
|
2608
|
+
// shared module, the tool's artifacts MUST import it rather than duplicating
|
|
2609
|
+
// the logic. This is the anti-duplication gate for multi-tool teach runs.
|
|
2610
|
+
if (
|
|
2611
|
+
opts.assignedSharedModules &&
|
|
2612
|
+
opts.assignedSharedModules.length > 0 &&
|
|
2613
|
+
existsSync(workflowPath)
|
|
2614
|
+
) {
|
|
2615
|
+
failures.push(...assertSharedModuleImports(toolDir, workflowPath, opts.assignedSharedModules));
|
|
2616
|
+
}
|
|
2617
|
+
|
|
1240
2618
|
if (!existsSync(parserPath)) {
|
|
1241
2619
|
failures.push('parser.ts was not written');
|
|
1242
2620
|
} else {
|
|
@@ -1256,85 +2634,299 @@ export async function externalVerification(
|
|
|
1256
2634
|
failures.push('parser.test.ts was not written');
|
|
1257
2635
|
} else {
|
|
1258
2636
|
const src = readFileSync(parserTestPath, 'utf8');
|
|
1259
|
-
const
|
|
1260
|
-
if (
|
|
1261
|
-
failures.push(`parser.test.ts has only ${
|
|
2637
|
+
const expectCount = countExpectCalls(src);
|
|
2638
|
+
if (expectCount < 3) {
|
|
2639
|
+
failures.push(`parser.test.ts has only ${expectCount} expect() calls; need ≥3`);
|
|
1262
2640
|
}
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
for (const pattern of trivialPatterns) {
|
|
1275
|
-
if (pattern.test(src)) {
|
|
1276
|
-
failures.push(
|
|
1277
|
-
'parser.test.ts contains trivial tautological assertions like expect(true).toBe(true) — tests must reference real values',
|
|
1278
|
-
);
|
|
1279
|
-
break;
|
|
1280
|
-
}
|
|
2641
|
+
if (hasTrivialAssertion(src)) {
|
|
2642
|
+
failures.push(
|
|
2643
|
+
'parser.test.ts contains trivial tautological assertions like expect(true).toBe(true) — tests must reference real values',
|
|
2644
|
+
);
|
|
2645
|
+
}
|
|
2646
|
+
// Fix E: the zero/empty-result contract. The recording has no no-match
|
|
2647
|
+
// response, so the only way to verify empty-handling is a synthetic case.
|
|
2648
|
+
if (!src.includes('synthetic:empty-result')) {
|
|
2649
|
+
failures.push(
|
|
2650
|
+
'parser.test.ts is missing the required `synthetic:empty-result` test — add a test titled `synthetic:empty-result …` that feeds extract() a no-match / empty-items response and asserts it returns a clean empty collection (length 0), never a single all-null placeholder record. See prompts/compile-agent.md.',
|
|
2651
|
+
);
|
|
1281
2652
|
}
|
|
1282
2653
|
}
|
|
1283
2654
|
|
|
1284
2655
|
if (existsSync(parserTestPath)) {
|
|
1285
|
-
const
|
|
2656
|
+
const run = await runBunTestWithResults(parserTestPath, toolDir, 120000, {
|
|
1286
2657
|
[SESSION_PATH_ENV]: sessionPath,
|
|
1287
2658
|
});
|
|
1288
|
-
|
|
1289
|
-
stdout: string;
|
|
1290
|
-
stderr: string;
|
|
1291
|
-
exitCode: number;
|
|
1292
|
-
};
|
|
1293
|
-
if (output.exitCode !== 0) {
|
|
2659
|
+
if (run.exitCode !== 0) {
|
|
1294
2660
|
failures.push(
|
|
1295
|
-
`bun test parser.test.ts exited ${
|
|
2661
|
+
`bun test parser.test.ts exited ${run.exitCode}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
|
|
2662
|
+
);
|
|
2663
|
+
}
|
|
2664
|
+
// The synthetic empty-result test must actually RUN GREEN, not merely be
|
|
2665
|
+
// present in source — a failed/absent synthetic test leaves empty-handling
|
|
2666
|
+
// unverified (R1: phantom all-null record on a zero-result input).
|
|
2667
|
+
const ranAnyTest = run.passed.size + run.failed.size > 0;
|
|
2668
|
+
const syntheticPassed = [...run.passed].some((n) => n.includes('synthetic:empty-result'));
|
|
2669
|
+
if (ranAnyTest && !syntheticPassed) {
|
|
2670
|
+
failures.push(
|
|
2671
|
+
'the `synthetic:empty-result` parser test did not pass — extract() must return a clean empty collection for a no-match/empty response (not a phantom record). Fix the parser or the test.',
|
|
1296
2672
|
);
|
|
1297
2673
|
}
|
|
1298
2674
|
}
|
|
1299
2675
|
|
|
2676
|
+
// Run the live integration suite and classify the outcome. The per-param
|
|
2677
|
+
// coverage check below trusts the test *runner* (which named tests actually
|
|
2678
|
+
// ran green) rather than a static source scan, so a suite that was waived by
|
|
2679
|
+
// anti-bot can no longer be counted as "covered".
|
|
1300
2680
|
const integrationTestPath = pathJoin(toolDir, 'integration.test.ts');
|
|
2681
|
+
let integrationOutcome: 'passed' | 'waived-bot' | 'waived-infra' | 'failed' | 'absent' = 'absent';
|
|
2682
|
+
let integrationPassedTests = new Set<string>();
|
|
1301
2683
|
if (!existsSync(integrationTestPath)) {
|
|
1302
2684
|
failures.push(
|
|
1303
2685
|
'integration.test.ts was not written — the tool must include a live API test that calls the workflow and verifies it returns real data',
|
|
1304
2686
|
);
|
|
2687
|
+
} else if (referencedStateBroken) {
|
|
2688
|
+
// A request hard-references a ${state.X} whose html_regex capture provably
|
|
2689
|
+
// does not match the recorded page (Fix 2 already pushed the actionable
|
|
2690
|
+
// failure). The live call WOULD STATE_MISSING — running it can't pass and
|
|
2691
|
+
// would only spend a live anti-bot .act and deepen the per-IP rate flag.
|
|
2692
|
+
// Skip it; the agent must fix the capture, then the next cycle verifies live.
|
|
2693
|
+
integrationOutcome = 'failed';
|
|
2694
|
+
warnings.push(
|
|
2695
|
+
'skipped the live integration test: a request references a ${state.X} capture (e.g. csrf/csp-nonce) whose pattern does not match the recorded page, so the live call is guaranteed to fail with STATE_MISSING. Fix the capture pattern/source (see the failure above) — the next verification cycle will run the live test once it can succeed. This avoids burning a doomed anti-bot .act call.',
|
|
2696
|
+
);
|
|
1305
2697
|
} else {
|
|
1306
|
-
|
|
1307
|
-
|
|
2698
|
+
// Scale the verifier's live-test timeout to the suite size: the baseline plus
|
|
2699
|
+
// one live `runWorkflowWithLadder` per param, each gated by the ~25s compile
|
|
2700
|
+
// pacing and a possible cdp cold start. A flat 60s truncated paced anti-bot
|
|
2701
|
+
// suites mid-run, and the partial output then misclassified as a bot block.
|
|
2702
|
+
// Cap it so a genuinely wedged suite can't run away.
|
|
2703
|
+
const paramCount = opts.likelyParams?.length ?? 0;
|
|
2704
|
+
const pacingMs = Number(process.env.IMPRINT_COMPILE_ACT_SPACING_MS ?? 25_000) || 0;
|
|
2705
|
+
const verifierTimeoutMs = Math.min(120_000 + paramCount * (pacingMs + 20_000), 10 * 60_000);
|
|
2706
|
+
let run: BunTestRun = {
|
|
2707
|
+
stdout: '',
|
|
2708
|
+
stderr: '',
|
|
2709
|
+
exitCode: 1,
|
|
2710
|
+
timedOut: false,
|
|
2711
|
+
passed: new Set(),
|
|
2712
|
+
failed: new Set(),
|
|
2713
|
+
};
|
|
1308
2714
|
for (let attempt = 0; attempt < 3; attempt++) {
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
2715
|
+
run = await runBunTestWithResults(integrationTestPath, toolDir, verifierTimeoutMs);
|
|
2716
|
+
if (run.exitCode === 0) break;
|
|
2717
|
+
// A timeout, bot-defense, or ladder-exhaustion failure will NOT clear on a
|
|
2718
|
+
// retry — re-running only fires more state-changing calls and deepens the
|
|
2719
|
+
// per-IP rate flag. One attempt is enough to classify it; stop early.
|
|
2720
|
+
if (run.timedOut) break;
|
|
2721
|
+
const out = `${run.stdout}\n${run.stderr}`;
|
|
2722
|
+
const ladderExhausted =
|
|
2723
|
+
/\bRATE_LIMITED\b|\bFORBIDDEN\b|\bNETWORK\b/.test(out) &&
|
|
2724
|
+
/non-escalatable|giving up|ladder exhausted|all backends failed/.test(out);
|
|
2725
|
+
if (isBotDefenseFailure(out) || ladderExhausted) break;
|
|
2726
|
+
}
|
|
2727
|
+
integrationPassedTests = run.passed;
|
|
2728
|
+
|
|
2729
|
+
const verdict = classifyIntegrationOutcome({
|
|
2730
|
+
exitCode: run.exitCode,
|
|
2731
|
+
timedOut: run.timedOut,
|
|
2732
|
+
combined: `${run.stdout}\n${run.stderr}`,
|
|
2733
|
+
passedTests: run.passed,
|
|
2734
|
+
referencedStateBroken: false, // the broken-capture case is handled above
|
|
2735
|
+
failedCaptureNames,
|
|
2736
|
+
});
|
|
2737
|
+
integrationOutcome = verdict.outcome;
|
|
2738
|
+
|
|
2739
|
+
if (verdict.outcome === 'passed') {
|
|
2740
|
+
// exitCode 0 — nothing to surface.
|
|
2741
|
+
} else if (verdict.captureFailName !== null) {
|
|
2742
|
+
const capName = verdict.captureFailName;
|
|
2743
|
+
// If the failing capture is a `response_header` on a REPLAYED workflow
|
|
2744
|
+
// request, the cause is almost always the replay asymmetry: programmatic
|
|
2745
|
+
// fetch reliably receives the response BODY and Set-Cookie, but anti-bot
|
|
2746
|
+
// edges withhold browser-only response headers from non-browser requests.
|
|
2747
|
+
let sourceHint = '';
|
|
2748
|
+
try {
|
|
2749
|
+
const wf = JSON.parse(readFileSync(workflowPath, 'utf8')) as {
|
|
2750
|
+
requests?: Array<{ captures?: Array<{ name: string; source: string }> }>;
|
|
2751
|
+
bootstrap?: { captures?: Array<{ name: string; source: string }> };
|
|
2752
|
+
};
|
|
2753
|
+
const reqCap = (wf.requests ?? [])
|
|
2754
|
+
.flatMap((r) => r.captures ?? [])
|
|
2755
|
+
.find((c) => c.name === capName);
|
|
2756
|
+
if (reqCap?.source === 'response_header') {
|
|
2757
|
+
sourceHint = ` The capture uses source: 'response_header' on a replayed request. Programmatic replay does NOT receive browser-only response headers that anti-bot edges withhold — but it DOES receive the response body and Set-Cookie. If this token also appears in the HTML body, switch to source: 'text_regex' (read it from the body); if it is set as a cookie, switch to source: 'cookie'. Reserve 'response_header' for a workflow.bootstrap capture (a real Chrome navigation), not a replayed request.`;
|
|
2758
|
+
}
|
|
2759
|
+
} catch {
|
|
2760
|
+
// best-effort hint only
|
|
2761
|
+
}
|
|
2762
|
+
failures.push(
|
|
2763
|
+
`integration test failed because a declared capture did not produce a value at runtime: capture "${capName}" returned null${
|
|
2764
|
+
verdict.captureFailFromKnown
|
|
2765
|
+
? ' (matches a capture flagged by the compile-time cross-reference check)'
|
|
2766
|
+
: ''
|
|
2767
|
+
}. This is a workflow-correctness error, not infra — fix the capture source/path in workflow.json so it actually reads from the recorded location.${sourceHint}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
|
|
2768
|
+
);
|
|
2769
|
+
} else if (verdict.outcome === 'waived-bot' || verdict.outcome === 'waived-infra') {
|
|
2770
|
+
// `liveVerified` is driven by whether the BASELINE produced real data, NOT by
|
|
2771
|
+
// whether every param test passed. Only stamp liveVerified=false when the
|
|
2772
|
+
// baseline ALSO failed — if a backend returned real data this run the tool IS
|
|
2773
|
+
// live-verified; only its per-parameter tests waive (non-blocking).
|
|
2774
|
+
liveVerification = verdict.baselineLiveVerified
|
|
2775
|
+
? undefined
|
|
2776
|
+
: {
|
|
2777
|
+
kind: verdict.outcome,
|
|
2778
|
+
firstError: verdict.firstError,
|
|
2779
|
+
exhaustedBackends: verdict.exhaustedBackends,
|
|
2780
|
+
};
|
|
2781
|
+
const liveNote = verdict.baselineLiveVerified
|
|
2782
|
+
? 'The baseline returned real data this run, so liveVerified stays TRUE — only the per-parameter tests are waived.'
|
|
2783
|
+
: 'Stamping liveVerified=false on workflow.json — the runtime falls through to the cdp-replay / playbook rung. Audit and teach surface this tool as unverified.';
|
|
2784
|
+
warnings.push(
|
|
2785
|
+
verdict.outcome === 'waived-bot'
|
|
2786
|
+
? `integration test hit a likely bot-detection / anti-automation challenge. ${liveNote}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`
|
|
2787
|
+
: `integration test hit an infrastructure error (${verdict.firstError}); rungs exhausted: ${verdict.exhaustedBackends.join(', ') || 'unknown'}. ${liveNote}\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
|
|
2788
|
+
);
|
|
2789
|
+
} else {
|
|
2790
|
+
failures.push(
|
|
2791
|
+
`bun test integration.test.ts exited ${run.exitCode} — the workflow failed to produce live data (tried 3 times).\nstdout:\n${run.stdout}\nstderr:\n${run.stderr}`,
|
|
2792
|
+
);
|
|
2793
|
+
}
|
|
2794
|
+
}
|
|
2795
|
+
|
|
2796
|
+
// Per-parameter coverage (Fix C/D). Each exposed parameter must have a
|
|
2797
|
+
// `param:<name>` integration test that actually RAN GREEN against live data —
|
|
2798
|
+
// a static source scan is not enough, because a waived suite never exercised
|
|
2799
|
+
// the param (R2: a filter wired to a field the server ignores looks "covered"
|
|
2800
|
+
// by source but does nothing). Per the keep+mark policy we never drop a param;
|
|
2801
|
+
// each is recorded in `paramVerification` as verified or not (with a reason),
|
|
2802
|
+
// and only a genuinely-uncovered param on a suite that DID run blocks compile.
|
|
2803
|
+
if (
|
|
2804
|
+
!referencedStateBroken &&
|
|
2805
|
+
existsSync(integrationTestPath) &&
|
|
2806
|
+
opts.likelyParams &&
|
|
2807
|
+
opts.likelyParams.length > 0
|
|
2808
|
+
) {
|
|
2809
|
+
const integrationSrc = readFileSync(integrationTestPath, 'utf8');
|
|
2810
|
+
|
|
2811
|
+
// Producer-sourced token params: union of build-plan-declared contracts and
|
|
2812
|
+
// mechanical detection (the recorded value appears in a SIBLING tool's
|
|
2813
|
+
// recorded response). Declared entries win — they carry the producer tool +
|
|
2814
|
+
// field used for stamping `sourcedFrom` and the MCP description.
|
|
2815
|
+
const recordedParamValues = new Map<string, string>();
|
|
2816
|
+
try {
|
|
2817
|
+
const wf = JSON.parse(readFileSync(workflowPath, 'utf8')) as {
|
|
2818
|
+
parameters?: Array<{ name: string; default?: unknown }>;
|
|
1314
2819
|
};
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
break;
|
|
2820
|
+
for (const p of wf.parameters ?? []) {
|
|
2821
|
+
if (typeof p.default === 'string') recordedParamValues.set(p.name, p.default);
|
|
1318
2822
|
}
|
|
2823
|
+
} catch {
|
|
2824
|
+
// best-effort — defaults are only a detection hint
|
|
2825
|
+
}
|
|
2826
|
+
const candidateSet = new Set(opts.candidateRequestSeqs ?? []);
|
|
2827
|
+
const siblingResponses = session.requests
|
|
2828
|
+
.filter((r) => !candidateSet.has(r.seq) && r.response?.body)
|
|
2829
|
+
.map((r) => ({ body: r.response?.body ?? '' }));
|
|
2830
|
+
const detected = detectTokenSources({
|
|
2831
|
+
likelyParams: opts.likelyParams,
|
|
2832
|
+
recordedParamValues,
|
|
2833
|
+
siblingResponses,
|
|
2834
|
+
});
|
|
2835
|
+
const tokenByName = new Map<string, TokenSource>();
|
|
2836
|
+
for (const d of detected) tokenByName.set(d.param, d);
|
|
2837
|
+
for (const d of opts.tokenParams ?? []) {
|
|
2838
|
+
tokenByName.set(d.param, {
|
|
2839
|
+
param: d.param,
|
|
2840
|
+
sourceTool: d.sourceTool,
|
|
2841
|
+
sourceField: d.sourceField,
|
|
2842
|
+
});
|
|
1319
2843
|
}
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
2844
|
+
|
|
2845
|
+
// Missing-producer guard: if a declared producer did not compile, the chain
|
|
2846
|
+
// cannot be exercised — waive (verified:false, keep+mark) rather than block
|
|
2847
|
+
// the consumer on something out of its control.
|
|
2848
|
+
const tokenSources: TokenSource[] = [];
|
|
2849
|
+
const waivedChain: ParamVerification[] = [];
|
|
2850
|
+
for (const ts of tokenByName.values()) {
|
|
2851
|
+
if (ts.sourceTool && !existsSync(pathJoin(toolDir, '..', ts.sourceTool, 'workflow.json'))) {
|
|
2852
|
+
waivedChain.push({
|
|
2853
|
+
name: ts.param,
|
|
2854
|
+
verified: false,
|
|
2855
|
+
reason: 'waived-chain',
|
|
2856
|
+
sourcedFrom: sourcedFromOf(ts),
|
|
2857
|
+
});
|
|
1325
2858
|
warnings.push(
|
|
1326
|
-
`
|
|
2859
|
+
`producer tool "${ts.sourceTool}" for token param "${ts.param}" is unavailable (did not compile) — the producer→consumer chain is left unverified (waived-chain).`,
|
|
1327
2860
|
);
|
|
1328
2861
|
} else {
|
|
1329
|
-
|
|
1330
|
-
`bun test integration.test.ts exited ${lastOutput.exitCode} — the workflow failed to produce live data (tried 3 times).\nstdout:\n${lastOutput.stdout}\nstderr:\n${lastOutput.stderr}`,
|
|
1331
|
-
);
|
|
2862
|
+
tokenSources.push(ts);
|
|
1332
2863
|
}
|
|
1333
2864
|
}
|
|
2865
|
+
|
|
2866
|
+
const waivedNames = new Set(waivedChain.map((w) => w.name));
|
|
2867
|
+
const coverage = classifyParamCoverage({
|
|
2868
|
+
likelyParams: opts.likelyParams.filter((lp) => !waivedNames.has(lp.name)),
|
|
2869
|
+
integrationSrc,
|
|
2870
|
+
passedTests: integrationPassedTests,
|
|
2871
|
+
integrationOutcome,
|
|
2872
|
+
tokenSources,
|
|
2873
|
+
});
|
|
2874
|
+
paramVerification.push(...coverage.paramVerification, ...waivedChain);
|
|
2875
|
+
if (coverage.tautological.length > 0) {
|
|
2876
|
+
failures.push(
|
|
2877
|
+
`${coverage.tautological.length} parameter(s) have a passing \`param:<name>\` test that never calls runWorkflowWithLadder, so it does not exercise the live workflow: ${coverage.tautological.join(', ')}. Each per-parameter test must call the workflow with the override value and assert the response is constrained by it.`,
|
|
2878
|
+
);
|
|
2879
|
+
}
|
|
2880
|
+
if (coverage.uncovered.length > 0) {
|
|
2881
|
+
failures.push(
|
|
2882
|
+
`${coverage.uncovered.length} parameter(s) have no passing \`param:<name>\` integration test and no \`// exposed-but-not-verified\` annotation: ${coverage.uncovered.join(', ')}. Add a test titled \`param:<name> …\` that overrides the value, calls runWorkflowWithLadder, and asserts the response is constrained — or annotate the parameter as explicitly unverified. See prompts/compile-agent.md "Per-parameter coverage tests".`,
|
|
2883
|
+
);
|
|
2884
|
+
}
|
|
2885
|
+
if (coverage.unchained.length > 0) {
|
|
2886
|
+
const details = coverage.unchained
|
|
2887
|
+
.map((name) => {
|
|
2888
|
+
const ts = tokenSources.find((t) => t.param === name);
|
|
2889
|
+
return ts?.sourceTool && ts.sourceField
|
|
2890
|
+
? `\`${name}\` (mint from \`../${ts.sourceTool}/workflow.json\` → read field \`${ts.sourceField}\`)`
|
|
2891
|
+
: `\`${name}\``;
|
|
2892
|
+
})
|
|
2893
|
+
.join(', ');
|
|
2894
|
+
failures.push(
|
|
2895
|
+
`${coverage.unchained.length} producer-sourced token param(s) lack a CHAINED \`param:<name>\` test that mints a FRESH value from the producer tool: ${details}. Each test must call runWorkflowWithLadder on the named producer's \`workflow.json\`, read the named field from its result, and pass THAT value (not the recorded constant) into this tool — then assert the response is non-empty. If the producer only emits a bare fragment, fix the PRODUCER to emit the full value this tool consumes. See prompts/compile-agent.md "Producer-sourced token parameters".`,
|
|
2896
|
+
);
|
|
2897
|
+
}
|
|
2898
|
+
}
|
|
2899
|
+
|
|
2900
|
+
// Producer-side token contract: the build plan requires this tool to emit
|
|
2901
|
+
// certain fields for sibling consumers. Fail if the parser doesn't reference a
|
|
2902
|
+
// declared field by name — otherwise the producer/consumer field name silently
|
|
2903
|
+
// diverges (plan says `hotel_id`, parser emits `propertyToken`) and the
|
|
2904
|
+
// consumer's chained test can never extract it.
|
|
2905
|
+
if ((opts.emittedTokens?.length ?? 0) > 0 && existsSync(parserPath)) {
|
|
2906
|
+
const parserSrc = readFileSync(parserPath, 'utf8');
|
|
2907
|
+
const missing = (opts.emittedTokens ?? [])
|
|
2908
|
+
.map((e) => e.field)
|
|
2909
|
+
.filter(
|
|
2910
|
+
(field) =>
|
|
2911
|
+
!new RegExp(`\\b${field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`).test(parserSrc),
|
|
2912
|
+
);
|
|
2913
|
+
if (missing.length > 0) {
|
|
2914
|
+
failures.push(
|
|
2915
|
+
`the build plan requires this tool's parser to emit ${missing
|
|
2916
|
+
.map((f) => `\`${f}\``)
|
|
2917
|
+
.join(', ')} so sibling consumer tools can use ${
|
|
2918
|
+
missing.length === 1 ? 'it' : 'them'
|
|
2919
|
+
} as an input token, but parser.ts does not emit ${
|
|
2920
|
+
missing.length === 1 ? 'that field' : 'those fields'
|
|
2921
|
+
}. Emit ${
|
|
2922
|
+
missing.length === 1 ? 'it' : 'each'
|
|
2923
|
+
} in every result item under the EXACT field name (the full value a consumer needs, never a bare fragment) — see read_build_plan "emitsTokens".`,
|
|
2924
|
+
);
|
|
2925
|
+
}
|
|
1334
2926
|
}
|
|
1335
2927
|
|
|
1336
2928
|
if (existsSync(parserPath) || existsSync(parserTestPath)) {
|
|
1337
|
-
const output = await
|
|
2929
|
+
const output = await typecheckArtifacts(toolDir, ['parser.ts', 'request-transform.ts']);
|
|
1338
2930
|
if (output.exitCode !== 0 || output.timedOut) {
|
|
1339
2931
|
failures.push(
|
|
1340
2932
|
`generated TypeScript artifacts failed typecheck (bunx tsc --noEmit -p .imprint-typecheck.tsconfig.json) exited ${output.exitCode}${output.timedOut ? ' after timing out' : ''}\nstdout:\n${output.stdout}\nstderr:\n${output.stderr}`,
|
|
@@ -1385,5 +2977,5 @@ export async function externalVerification(
|
|
|
1385
2977
|
}
|
|
1386
2978
|
}
|
|
1387
2979
|
|
|
1388
|
-
return { failures, warnings };
|
|
2980
|
+
return { failures, warnings, paramVerification, liveVerification };
|
|
1389
2981
|
}
|