imprint-mcp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +131 -27
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +109 -2
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +63 -25
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +13 -1
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +10 -0
- package/src/imprint/teach.ts +456 -142
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
|
@@ -1,28 +1,63 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Walk a list of backends in order, escalating on FORBIDDEN
|
|
3
|
-
* STATE_MISSING; other errors return immediately.
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
2
|
+
* Walk a list of backends in order, escalating on FORBIDDEN, NETWORK (tarpit),
|
|
3
|
+
* BAD_RESPONSE, and satisfiable STATE_MISSING; other errors return immediately.
|
|
4
|
+
*
|
|
5
|
+
* Rung tiers:
|
|
6
|
+
* - `fetch` — plain HTTP API replay.
|
|
7
|
+
* - `fetch-bootstrap` — the API ANTI-BOT path: a one-time cdp-browser mint of a
|
|
8
|
+
* validated Akamai session jar (real Chrome used ONLY to bootstrap, then
|
|
9
|
+
* closed), then PLAIN-fetch replay of every request with that jar. The jar is
|
|
10
|
+
* cached (~90 min) so one bootstrap serves many searches. Auto mode always
|
|
11
|
+
* splices this right after `fetch`; it only RUNS when `fetch` escalates, so a
|
|
12
|
+
* healthy plain-API site never pays for it.
|
|
13
|
+
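* - `cdp-replay` — runs the API requests inside a live, trusted Chrome. Auto mode
*   splices it right after `fetch-bootstrap` (and front-loads it for multi-step
*   state-changing anti-bot workflows).
* - `stealth-fetch` — Playwright stealth bootstrap + native fetch (token tier).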
|
|
14
|
+
* - `playbook` — DOM-walk LAST RESORT (needs a compiled playbook.yaml).
|
|
7
15
|
*/
|
|
8
16
|
|
|
9
|
-
import { existsSync } from 'node:fs';
|
|
10
|
-
import { resolve as pathResolve } from 'node:path';
|
|
17
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
18
|
+
import { dirname, resolve as pathResolve } from 'node:path';
|
|
11
19
|
import type { Page } from 'playwright';
|
|
20
|
+
import {
|
|
21
|
+
type CdpBrowserFetch,
|
|
22
|
+
type CdpBrowserFetchOptions,
|
|
23
|
+
type MintedJar,
|
|
24
|
+
createCdpBrowserFetch,
|
|
25
|
+
} from './cdp-browser-fetch.ts';
|
|
26
|
+
import {
|
|
27
|
+
clearJar,
|
|
28
|
+
loadJar,
|
|
29
|
+
newestRecording,
|
|
30
|
+
saveJar,
|
|
31
|
+
seedJarFromRecording,
|
|
32
|
+
} from './cdp-jar-cache.ts';
|
|
33
|
+
import { proxyUrl } from './chromium.ts';
|
|
12
34
|
import { RuntimeCookieJar } from './cookie-jar.ts';
|
|
13
35
|
import { createLog } from './log.ts';
|
|
14
36
|
import { runPlaybook } from './playbook-runner.ts';
|
|
15
|
-
import {
|
|
16
|
-
|
|
37
|
+
import {
|
|
38
|
+
type CredentialStore,
|
|
39
|
+
executeWorkflow,
|
|
40
|
+
loadCredentialStore,
|
|
41
|
+
substituteString,
|
|
42
|
+
} from './runtime.ts';
|
|
43
|
+
import {
|
|
44
|
+
type BootstrapArgs,
|
|
45
|
+
type StealthFetch,
|
|
46
|
+
type TokenCache,
|
|
47
|
+
bootstrapStealthToken,
|
|
48
|
+
createStealthFetch,
|
|
49
|
+
} from './stealth-fetch.ts';
|
|
50
|
+
import { clearCachedToken, loadCachedToken, saveCachedToken } from './stealth-token-cache.ts';
|
|
17
51
|
import type { ResolvedTool } from './tool-loader.ts';
|
|
18
|
-
import
|
|
19
|
-
BootstrapCapture,
|
|
20
|
-
ConcreteBackend,
|
|
21
|
-
ReplayBackend,
|
|
22
|
-
StateCapability,
|
|
23
|
-
StateMissingItem,
|
|
24
|
-
ToolResult,
|
|
25
|
-
Workflow,
|
|
52
|
+
import {
|
|
53
|
+
type BootstrapCapture,
|
|
54
|
+
type ConcreteBackend,
|
|
55
|
+
type ReplayBackend,
|
|
56
|
+
type StateCapability,
|
|
57
|
+
type StateMissingItem,
|
|
58
|
+
type ToolResult,
|
|
59
|
+
type Workflow,
|
|
60
|
+
WorkflowSchema,
|
|
26
61
|
} from './types.ts';
|
|
27
62
|
|
|
28
63
|
interface LadderResult {
|
|
@@ -41,6 +76,131 @@ const log = createLog('backend');
|
|
|
41
76
|
|
|
42
77
|
const DEFAULT_LADDER: ConcreteBackend[] = ['fetch', 'stealth-fetch', 'playbook'];
|
|
43
78
|
|
|
79
|
+
/**
 * In-process record of the backend that last succeeded for a site on the
 * compile/test path (`runWorkflowWithLadder`). It lets the param-coverage suite
 * skip rungs that are known to fail once one rung has worked. The memo lives
 * only in memory: it is never persisted and never read by production replay.
 */
const compileWinningBackend = new Map<string, ConcreteBackend>();

/** Test-isolation hook: forget every memoized compile-path winner. */
export function __resetCompileWinningBackendForTest(): void {
  compileWinningBackend.clear();
}

/**
 * Probe-timeout override (ms) installed by tests.
 * NOTE(review): `null` presumably means "use the built-in timeout" — confirm
 * against the reader of this variable, which is outside this section.
 */
let probeTimeoutMsForTest: number | null = null;

/** Test hook: store `ms` (or `null`) as the probe-timeout override. */
export function __setProbeTimeoutMsForTest(ms: number | null): void {
  probeTimeoutMsForTest = ms;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Preference order for the compile parallel-probe winner; a LOWER number wins.
 *
 * `fetch` comes first (cheapest, no browser). Between the browser-backed rungs,
 * `cdp-replay` outranks `stealth-fetch`: its cold start is a one-time cost (the
 * pool keeps Chrome warm for later calls) and it is the more anti-bot-robust
 * path, so it should not lose the probe merely because stealth's first call
 * clocked faster.
 */
const BACKEND_PROBE_RANK: Record<string, number> = {
  fetch: 0,
  'cdp-replay': 1,
  'stealth-fetch': 2,
};

/**
 * Choose the parallel-probe winner among the backends that returned real data.
 *
 * Candidates are ordered by `BACKEND_PROBE_RANK` (fetch < cdp-replay <
 * stealth-fetch); backends missing from the table rank after all listed ones.
 * First-call duration is consulted only to break a rank tie. The input array is
 * not mutated. Pure, and exported for unit testing.
 *
 * @param winners Probe results that succeeded.
 * @returns The preferred entry, or `undefined` when `winners` is empty.
 */
export function pickProbeWinner<T extends { backend: ConcreteBackend; durationMs: number }>(
  winners: T[],
): T | undefined {
  // Unlisted backends share the fallback rank 9, i.e. after every ranked one.
  const rankOf = (candidate: T): number => BACKEND_PROBE_RANK[candidate.backend] ?? 9;

  const ordered = winners.slice();
  ordered.sort((left, right) => {
    const leftRank = rankOf(left);
    const rightRank = rankOf(right);
    if (leftRank !== rightRank) return leftRank - rightRank;
    return left.durationMs - right.durationMs;
  });
  return ordered[0];
}
|
|
119
|
+
|
|
120
|
+
/**
 * Process-global CDP pool for the compile/test path (`runWorkflowWithLadder`).
 *
 * cdp-replay parks its live Chrome here on success so that later calls in the
 * same `bun test` process reuse it instead of paying a cold start again — the
 * same mechanism as the runtime pool in mcp-server.ts. Concurrent compile lanes
 * are separate `bun test` processes, so the pool is never shared across lanes,
 * and production replay never consults it.
 */
const compileCdpPool = new Map<string, CdpBrowserFetch>();

/** Pending idle-close timer per pooled site. */
const compileCdpIdleTimers = new Map<string, ReturnType<typeof setTimeout>>();

/** How long a pooled browser may sit unused before it is closed. */
const COMPILE_CDP_IDLE_MS = 15_000;

/** Cancel every pending idle-close; called when a new call is about to reuse the pool. */
function clearCompileCdpIdle(): void {
  compileCdpIdleTimers.forEach((handle) => clearTimeout(handle));
  compileCdpIdleTimers.clear();
}

/**
 * (Re)arm one idle-close timer per pooled browser.
 *
 * If nothing re-arms the timers within COMPILE_CDP_IDLE_MS, each browser is
 * evicted from the pool and closed, so the event loop drains and the host
 * process can exit. The timers are deliberately NOT unref'd: closing the
 * browser is what lets the process exit, so the teardown has to fire.
 */
function armCompileCdpIdleClose(): void {
  clearCompileCdpIdle();
  compileCdpPool.forEach((browser, siteKey) => {
    const closeWhenIdle = (): void => {
      compileCdpPool.delete(siteKey);
      compileCdpIdleTimers.delete(siteKey);
      // Closing releases the websocket + Chrome child handles (mirrors
      // mcp-server's idle close). Best-effort: a failed close is ignored.
      void browser.close().catch(() => {});
    };
    compileCdpIdleTimers.set(siteKey, setTimeout(closeWhenIdle, COMPILE_CDP_IDLE_MS));
  });
}

/** Test isolation: cancel idle timers, then close (best-effort) and drop every pooled browser. */
export function __resetCompileCdpPoolForTest(): void {
  clearCompileCdpIdle();
  compileCdpPool.forEach((browser) => {
    void browser.close().catch(() => {});
  });
  compileCdpPool.clear();
}
|
|
163
|
+
|
|
164
|
+
/** Freshness window for the file-backed compile-time stealth token. Matches
|
|
165
|
+
* stealth-fetch's in-process `maxTokenAgeSeconds` default so a reused token is
|
|
166
|
+
* not immediately considered stale by `createStealthFetch`. */
|
|
167
|
+
const STEALTH_TOKEN_MAX_AGE_SECONDS = 600;
|
|
168
|
+
|
|
169
|
+
/**
 * Minimum spacing (ms) between LIVE requests to one origin on the compile/test
 * path, chosen to stay under the transient anti-bot rate-flag. The
 * param-coverage suite fires one search per parameter; without pacing that
 * burst can flag the IP and tarpit every later request.
 *
 * Read from `IMPRINT_COMPILE_ACT_SPACING_MS` on every call so tests can set it
 * to `0`; defaults to 25 000. A non-finite or non-positive value disables
 * pacing (returns 0). Process-scoped; production replay is untouched.
 */
function compileActSpacingMs(): number {
  const v = Number(process.env.IMPRINT_COMPILE_ACT_SPACING_MS ?? 25_000);
  return Number.isFinite(v) && v > 0 ? v : 0;
}

/**
 * Per-origin timestamp (epoch ms) of the most recently RESERVED request slot.
 * May lie in the future while a caller is still sleeping towards its slot.
 */
const compileLastRequestAt = new Map<string, number>();

/** Resolve after `ms` milliseconds. */
function sleepMs(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

/**
 * Await the per-origin minimum spacing before a compile-path live request.
 *
 * The first call for an origin never waits. Every later call is assigned the
 * next free slot, `max(now, previousSlot + spacing)`, and sleeps until then.
 *
 * The slot is written to the map synchronously, BEFORE the await. Recording the
 * time only after sleeping let concurrent callers all observe the same previous
 * timestamp, sleep the same duration and then fire together — defeating the
 * pacing exactly when requests are issued in parallel.
 *
 * @param origin Key identifying the target origin.
 */
async function paceCompileRequest(origin: string): Promise<void> {
  const spacing = compileActSpacingMs();
  if (spacing <= 0) return;

  const now = Date.now();
  const previousSlot = compileLastRequestAt.get(origin);
  const slot = previousSlot === undefined ? now : Math.max(now, previousSlot + spacing);
  // Reserve before yielding so the next caller queues behind this one.
  compileLastRequestAt.set(origin, slot);

  const waitMs = slot - now;
  if (waitMs > 0) {
    log(
      `compile pacing: waiting ${Math.round(waitMs / 1000)}s before next live request to ${origin}`,
    );
    await sleepMs(waitMs);
  }
}

/** Test isolation: forget every origin's reserved slot. */
export function __resetCompilePacingForTest(): void {
  compileLastRequestAt.clear();
}
|
|
203
|
+
|
|
44
204
|
/** Expand a replayBackend choice into a concrete ladder. 'auto' prefers
|
|
45
205
|
* the probed order (if any), else the default. Explicit choice → single rung. */
|
|
46
206
|
export function resolveLadder(
|
|
@@ -62,12 +222,46 @@ export async function runWithLadder(
|
|
|
62
222
|
params: Record<string, string | number | boolean>,
|
|
63
223
|
assetRoot: string,
|
|
64
224
|
stealthCache: Map<string, StealthFetch>,
|
|
225
|
+
options?: {
|
|
226
|
+
skipBootstrapSplice?: boolean;
|
|
227
|
+
/** Per-site CDP browser pool so cdp-replay reuses a live Chrome across
|
|
228
|
+
* calls (~2-5s) instead of launching a fresh one each time (~33s). */
|
|
229
|
+
cdpPool?: Map<string, CdpBrowserFetch>;
|
|
230
|
+
/** Per-session memo of the backend that last served each tool. Once set, the
|
|
231
|
+
* next call starts at that backend instead of re-walking the doomed early
|
|
232
|
+
* rungs — the runtime analog of the compile path's `compileWinningBackend`.
|
|
233
|
+
* The mcp-server owns one map and ties its lifetime to `cdpPool` (a memoized
|
|
234
|
+
* cdp-replay is only fast while its Chrome is pooled). */
|
|
235
|
+
winnerCache?: Map<string, ConcreteBackend>;
|
|
236
|
+
},
|
|
65
237
|
): Promise<LadderResult> {
|
|
66
238
|
if (ladder.length === 0) {
|
|
67
239
|
throw new Error('runWithLadder: empty ladder');
|
|
68
240
|
}
|
|
69
241
|
|
|
70
|
-
const
|
|
242
|
+
const baseLadder = options?.skipBootstrapSplice
|
|
243
|
+
? ladder
|
|
244
|
+
: effectiveAutoLadder(ladder, tool.workflow);
|
|
245
|
+
|
|
246
|
+
// Runtime winner memo. Once a backend has served this tool in THIS session,
|
|
247
|
+
// start there next time instead of re-walking the doomed early rungs (southwest
|
|
248
|
+
// re-paid an ~80s fetch-bootstrap before cdp-replay on every call). The memo
|
|
249
|
+
// reorders the POST-splice ladder — cdp-replay only exists after
|
|
250
|
+
// effectiveAutoLadder splices it in, so reordering the raw `ladder` could never
|
|
251
|
+
// memoize it. Wrap-around keeps every other rung as fallback, so a now-stale
|
|
252
|
+
// winner still escalates correctly.
|
|
253
|
+
const memoKey = `${tool.site}:${tool.workflow.toolName}`;
|
|
254
|
+
let effectiveLadder = baseLadder;
|
|
255
|
+
const memoWinner = options?.winnerCache?.get(memoKey);
|
|
256
|
+
if (memoWinner) {
|
|
257
|
+
const idx = baseLadder.indexOf(memoWinner);
|
|
258
|
+
if (idx > 0) {
|
|
259
|
+
effectiveLadder = [...baseLadder.slice(idx), ...baseLadder.slice(0, idx)];
|
|
260
|
+
log(
|
|
261
|
+
`runtime memo: ${memoKey} → start at ${memoWinner}; ladder: ${effectiveLadder.join(' → ')}`,
|
|
262
|
+
);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
71
265
|
const attempts: LadderResult['attempts'] = [];
|
|
72
266
|
let lastResult: ToolResult | null = null;
|
|
73
267
|
let skipUntilBackend: ConcreteBackend | null = null;
|
|
@@ -76,6 +270,9 @@ export async function runWithLadder(
|
|
|
76
270
|
if (skipUntilBackend && backend !== skipUntilBackend) continue;
|
|
77
271
|
if (skipUntilBackend === backend) skipUntilBackend = null;
|
|
78
272
|
|
|
273
|
+
// The playbook rung is the DOM-walk LAST RESORT (needs a playbook.yaml). The
|
|
274
|
+
// anti-bot API path is the fetch-bootstrap rung above (cdp-browser jar mint
|
|
275
|
+
// then PLAIN-fetch replay) — NOT this rung. Skip when no playbook.yaml.
|
|
79
276
|
if (backend === 'playbook' && !existsSync(playbookPath(assetRoot, tool.site, tool.dir))) {
|
|
80
277
|
attempts.push({
|
|
81
278
|
backend,
|
|
@@ -83,7 +280,7 @@ export async function runWithLadder(
|
|
|
83
280
|
detail: 'no playbook.yaml',
|
|
84
281
|
durationMs: 0,
|
|
85
282
|
});
|
|
86
|
-
log(`${backend}: skipped (
|
|
283
|
+
log(`${backend}: skipped (no playbook.yaml)`);
|
|
87
284
|
continue;
|
|
88
285
|
}
|
|
89
286
|
|
|
@@ -92,24 +289,50 @@ export async function runWithLadder(
|
|
|
92
289
|
let result: ToolResult;
|
|
93
290
|
try {
|
|
94
291
|
switch (backend) {
|
|
95
|
-
case 'fetch':
|
|
96
|
-
|
|
292
|
+
case 'fetch': {
|
|
293
|
+
// Egress the plain `fetch` rung through IMPRINT_PROXY when set, so even
|
|
294
|
+
// the first rung (and GET-only tools) use the residential proxy IP.
|
|
295
|
+
const proxyFetch = makeProxyFetch();
|
|
296
|
+
result = await tool.toolFn(params, proxyFetch ? { fetchImpl: proxyFetch } : undefined);
|
|
97
297
|
break;
|
|
298
|
+
}
|
|
98
299
|
case 'fetch-bootstrap':
|
|
99
300
|
result = await runFetchBootstrap(tool, params);
|
|
100
301
|
break;
|
|
302
|
+
case 'cdp-replay':
|
|
303
|
+
result = await runCdpReplay(tool, params, options?.cdpPool);
|
|
304
|
+
break;
|
|
101
305
|
case 'stealth-fetch': {
|
|
102
306
|
const sf = ensureStealthFetch(tool, stealthCache);
|
|
103
|
-
|
|
307
|
+
// When the workflow declares a bootstrap block, mint its declared
|
|
308
|
+
// session-token state (CSRF cookies etc.) from the SAME stealth
|
|
309
|
+
// session that provides the transport cookies. Without this, a
|
|
310
|
+
// workflow escalating here from fetch-bootstrap loses the
|
|
311
|
+
// ${state.X} its requests need — the gap that made bootstrap-block
|
|
312
|
+
// tools on anti-bot sites unverifiable.
|
|
313
|
+
const initialState = tool.workflow.bootstrap
|
|
314
|
+
? await stealthBootstrapState(sf, tool.workflow.bootstrap)
|
|
315
|
+
: undefined;
|
|
316
|
+
result = await tool.toolFn(params, { fetchImpl: sf.fetchImpl, initialState });
|
|
104
317
|
break;
|
|
105
318
|
}
|
|
106
|
-
case 'playbook':
|
|
319
|
+
case 'playbook': {
|
|
320
|
+
// DOM-walk last resort (the anti-bot API path is fetch-bootstrap, above).
|
|
321
|
+
// Apply workflow.json's declared parameter defaults — runPlaybook
|
|
322
|
+
// validates and throws on absent values regardless of declared defaults.
|
|
323
|
+
const paramsWithDefaults: typeof params = { ...params };
|
|
324
|
+
for (const p of tool.workflow.parameters) {
|
|
325
|
+
if (!(p.name in paramsWithDefaults) && p.default !== undefined) {
|
|
326
|
+
paramsWithDefaults[p.name] = p.default;
|
|
327
|
+
}
|
|
328
|
+
}
|
|
107
329
|
result = await runPlaybook({
|
|
108
330
|
playbook: playbookPath(assetRoot, tool.site, tool.dir),
|
|
109
|
-
params,
|
|
331
|
+
params: paramsWithDefaults,
|
|
110
332
|
site: tool.site,
|
|
111
333
|
});
|
|
112
334
|
break;
|
|
335
|
+
}
|
|
113
336
|
}
|
|
114
337
|
} catch (err) {
|
|
115
338
|
const msg = err instanceof Error ? err.message : String(err);
|
|
@@ -121,6 +344,7 @@ export async function runWithLadder(
|
|
|
121
344
|
if (result.ok) {
|
|
122
345
|
attempts.push({ backend, outcome: 'ok', detail: `succeeded in ${durationMs}ms`, durationMs });
|
|
123
346
|
log(`${backend}: OK in ${durationMs}ms`);
|
|
347
|
+
options?.winnerCache?.set(memoKey, backend);
|
|
124
348
|
return { result, usedBackend: backend, attempts };
|
|
125
349
|
}
|
|
126
350
|
|
|
@@ -150,8 +374,48 @@ export async function runWithLadder(
|
|
|
150
374
|
}
|
|
151
375
|
}
|
|
152
376
|
|
|
153
|
-
//
|
|
154
|
-
//
|
|
377
|
+
// NETWORK escalates: a long timeout is usually anti-bot tarpitting
|
|
378
|
+
// (Akamai/Cloudflare/PerimeterX hang the connection rather than 403),
|
|
379
|
+
// and a different transport (stealth-fetch's minted token cookies, or
|
|
380
|
+
// playbook's full stealth browser) can fix it. Real DNS/connectivity
|
|
381
|
+
// failures die in milliseconds at every rung, so the cost ceiling is
|
|
382
|
+
// bounded by the per-rung timeout × ladder length.
|
|
383
|
+
if (result.error === 'NETWORK') {
|
|
384
|
+
attempts.push({
|
|
385
|
+
backend,
|
|
386
|
+
outcome: 'escalate',
|
|
387
|
+
detail: `${result.error}: ${result.message.slice(0, 120)}`,
|
|
388
|
+
durationMs,
|
|
389
|
+
});
|
|
390
|
+
log(`${backend}: NETWORK in ${durationMs}ms — escalating to next rung`);
|
|
391
|
+
continue;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
// BAD_RESPONSE (e.g. HTTP 400) is backend-specific on anti-bot sites, so it
|
|
395
|
+
// escalates rather than stopping. A cdp-replay in-page POST can be rejected
|
|
396
|
+
// because it lacks the live Akamai sensor headers the endpoint demands, while
|
|
397
|
+
// stealth-fetch — which MINTS those sensor headers during its bootstrap —
|
|
398
|
+
// returns 200 for the byte-identical request. Validated on southwest's
|
|
399
|
+
// low-fare-calendar (cdp-replay 400, stealth-fetch 200). Stopping at the first
|
|
400
|
+
// 400 stranded the working rung; escalate so a higher-trust backend gets a
|
|
401
|
+
// shot, and the winner memo then locks onto whatever passed. A genuinely
|
|
402
|
+
// malformed request 400s at every rung and the last 400 is still returned
|
|
403
|
+
// below — cost is bounded by the ladder length.
|
|
404
|
+
if (result.error === 'BAD_RESPONSE') {
|
|
405
|
+
attempts.push({
|
|
406
|
+
backend,
|
|
407
|
+
outcome: 'escalate',
|
|
408
|
+
detail: `${result.error}: ${result.message.slice(0, 120)}`,
|
|
409
|
+
durationMs,
|
|
410
|
+
});
|
|
411
|
+
log(
|
|
412
|
+
`${backend}: BAD_RESPONSE in ${durationMs}ms — escalating (a higher-trust rung may pass)`,
|
|
413
|
+
);
|
|
414
|
+
continue;
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
// AUTH_EXPIRED needs a re-login; RATE_LIMITED needs backoff. Neither
|
|
418
|
+
// is fixed by switching transport.
|
|
155
419
|
attempts.push({
|
|
156
420
|
backend,
|
|
157
421
|
outcome: 'failed',
|
|
@@ -174,33 +438,96 @@ export async function runWithLadder(
|
|
|
174
438
|
attempts,
|
|
175
439
|
};
|
|
176
440
|
}
|
|
441
|
+
const lastBackend = effectiveLadder[effectiveLadder.length - 1] ?? 'fetch';
|
|
442
|
+
// Be accurate about ladder size: the parallel probe calls this with SINGLE-rung
|
|
443
|
+
// ladders, so "every backend escalated" was misleading (it described one rung,
|
|
444
|
+
// e.g. fetch-only, as if the whole ladder gave up — and fooled the integration
|
|
445
|
+
// classifier). Only say "all rungs" when there really was more than one.
|
|
177
446
|
log(
|
|
178
|
-
|
|
447
|
+
effectiveLadder.length === 1
|
|
448
|
+
? `${lastBackend}: exhausted (no fallback rung in this ladder); returning its error`
|
|
449
|
+
: `ladder exhausted: all ${effectiveLadder.length} rungs escalated (${effectiveLadder.join(' → ')}); returning last error from ${lastBackend}`,
|
|
179
450
|
);
|
|
180
451
|
return {
|
|
181
452
|
result: lastResult,
|
|
182
|
-
usedBackend:
|
|
453
|
+
usedBackend: lastBackend,
|
|
183
454
|
attempts,
|
|
184
455
|
};
|
|
185
456
|
}
|
|
186
457
|
|
|
187
|
-
function effectiveAutoLadder(
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
if (
|
|
458
|
+
/**
 * Expand a multi-rung ladder with the anti-bot rungs that auto mode adds.
 *
 * A ladder of zero or one rung is returned unchanged (same array). Otherwise a
 * copy is built and, in this order:
 *  1. `fetch-bootstrap` is inserted right after `fetch`; when `fetch` is absent
 *     and `cdp-replay` is not already present, it goes before `stealth-fetch`.
 *  2. `cdp-replay` is inserted right after `fetch-bootstrap`.
 *  3. when `prefersCdpReplayFirst(workflow)` holds, `cdp-replay` moves to the front.
 * Rungs already present are never duplicated.
 *
 * @param ladder   Ordered backends to try.
 * @param workflow Workflow whose shape decides whether cdp-replay is front-loaded.
 * @returns The effective ladder; the input array is not mutated.
 */
export function effectiveAutoLadder(
  ladder: ConcreteBackend[],
  workflow: Workflow,
): ConcreteBackend[] {
  if (ladder.length <= 1) return ladder;

  const rungs = [...ladder];

  // fetch-bootstrap is the plain-fetch API anti-bot path (one cdp-browser jar
  // mint, then plain-fetch replay). It only RUNS when an earlier rung
  // escalates, so it is always spliced rather than gated on the workflow.
  if (!rungs.includes('fetch-bootstrap')) {
    const fetchAt = rungs.indexOf('fetch');
    if (fetchAt !== -1) {
      rungs.splice(fetchAt + 1, 0, 'fetch-bootstrap');
    } else if (!rungs.includes('cdp-replay')) {
      // `fetch` was probed out and cdp-replay is not explicit: give the
      // jar-based path a shot ahead of stealth-fetch. With an explicit
      // cdp-replay, fetch-bootstrap is not re-added in front of it.
      const stealthAt = rungs.indexOf('stealth-fetch');
      if (stealthAt !== -1) rungs.splice(stealthAt, 0, 'fetch-bootstrap');
    }
  }

  // cdp-replay runs the API requests in a live Chrome. It is expensive, so it
  // sits behind fetch-bootstrap and only runs when that rung escalates too.
  if (!rungs.includes('cdp-replay')) {
    const bootstrapAt = rungs.indexOf('fetch-bootstrap');
    if (bootstrapAt !== -1) rungs.splice(bootstrapAt + 1, 0, 'cdp-replay');
  }

  // Multi-step state-changing anti-bot workflows: front-load cdp-replay so the
  // plain-fetch rungs do not spend the per-IP rate budget before it runs.
  if (prefersCdpReplayFirst(workflow)) {
    const cdpAt = rungs.indexOf('cdp-replay');
    if (cdpAt > 0) {
      rungs.splice(cdpAt, 1);
      rungs.unshift('cdp-replay');
    }
  }

  return rungs;
}
|
|
196
509
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
510
|
+
/**
 * Decide whether the live-browser `cdp-replay` rung should run FIRST.
 *
 * True only for a multi-step, state-changing, anti-bot workflow, i.e. when BOTH hold:
 *  - at least two requests are mutating (`effect === 'unsafe'`, or the method is
 *    POST / PUT / PATCH / DELETE — compared case-insensitively, missing method = GET);
 *  - there is an anti-bot signal: a `workflow.bootstrap` block, or any request whose
 *    url, body, or a header value references a captured `${state.X}` token.
 *
 * Rationale: plain-fetch replay can't sustain a sequence of protected POSTs (each
 * self-invalidates `_abck`), and its doomed attempts pre-burn the per-IP `.act`
 * budget before cdp-replay gets a turn. A plain multi-POST REST API (no bootstrap,
 * no `${state.X}`) is deliberately NOT matched, so it keeps the cheap fetch-first order.
 *
 * @param workflow - the compiled workflow to classify.
 * @returns `true` when cdp-replay should be front-loaded in the backend ladder.
 */
export function prefersCdpReplayFirst(workflow: Workflow): boolean {
  const mutatingVerbs = ['POST', 'PUT', 'PATCH', 'DELETE'];

  // Count state-changing requests; fewer than two means "not multi-step".
  let mutatingCount = 0;
  for (const request of workflow.requests) {
    const verb = (request.method ?? 'GET').toUpperCase();
    if (request.effect === 'unsafe' || mutatingVerbs.includes(verb)) {
      mutatingCount += 1;
    }
  }
  if (mutatingCount < 2) return false;

  // Any `${state.` reference in a request's url, body, or header values.
  const refersToState = (text: string): boolean => /\$\{state\./.test(text);
  const hasStateRefs = workflow.requests.some((request) => {
    if (refersToState(request.url ?? '')) return true;
    if (refersToState(request.body ?? '')) return true;
    return Object.values(request.headers ?? {}).some((headerValue) => refersToState(headerValue));
  });

  return Boolean(workflow.bootstrap) || hasStateRefs;
}
|
|
205
532
|
|
|
206
533
|
function nextStateMissingBackend(
|
|
@@ -226,7 +553,12 @@ function capabilitySatisfiedBy(backend: ConcreteBackend, capability: StateCapabi
|
|
|
226
553
|
if (backend === 'fetch-bootstrap') {
|
|
227
554
|
return capability === 'browser_bootstrap' || capability === 'stealth_bootstrap';
|
|
228
555
|
}
|
|
229
|
-
if (backend === '
|
|
556
|
+
if (backend === 'cdp-replay') {
|
|
557
|
+
return capability === 'browser_bootstrap' || capability === 'stealth_bootstrap';
|
|
558
|
+
}
|
|
559
|
+
if (backend === 'stealth-fetch') {
|
|
560
|
+
return capability === 'browser_bootstrap' || capability === 'stealth_bootstrap';
|
|
561
|
+
}
|
|
230
562
|
if (backend === 'playbook') {
|
|
231
563
|
return (
|
|
232
564
|
capability === 'ordinary_http' ||
|
|
@@ -237,26 +569,154 @@ function capabilitySatisfiedBy(backend: ConcreteBackend, capability: StateCapabi
|
|
|
237
569
|
return false;
|
|
238
570
|
}
|
|
239
571
|
|
|
572
|
+
/** NOTE: this paragraph describes `getOrMintCdpJar`, which is declared further
 * down (after the two test seams), not the variable directly below.
 * Get a validated Akamai jar for this site: reuse the cached one (<=90 min,
 * _abck~0~) or mint a fresh one via cdp-browser (ONE real-Chrome launch — the
 * only mechanism that earns Akamai's trust; Playwright tarpits and never
 * validates _abck). The browser is closed before returning; the jar replays
 * via plain fetch. Returns null if Chrome can't launch (caller escalates). */
|
|
577
|
+
/**
 * Test seam for the cdp-browser jar mint. While this holds a function,
 * `getOrMintCdpJar` delegates to it instead of launching real Chrome.
 * Production code leaves it `null`, which selects the real cdp-browser path.
 */
let cdpJarMinterForTest:
  | ((baseUrl: string, bootstrapUrl: string | undefined) => Promise<MintedJar | null>)
  | null = null;

/**
 * Install (or, with `null`, remove) the jar-mint stub used by unit tests.
 *
 * @param fn - replacement minter receiving `(baseUrl, bootstrapUrl)`, or `null`
 *   to restore the real implementation.
 */
export function __setCdpJarMinterForTest(fn: typeof cdpJarMinterForTest): void {
  cdpJarMinterForTest = fn;
}
|
|
587
|
+
|
|
588
|
+
/**
 * Test seam for the cdp-browser factory used by the cdp-replay rung. While this
 * holds a function it is called in place of `createCdpBrowserFetch`, so unit
 * tests never launch real Chrome. Production code leaves it `null`.
 */
let cdpBrowserFetchFactoryForTest: ((opts: CdpBrowserFetchOptions) => CdpBrowserFetch) | null =
  null;

/**
 * Install (or, with `null`, remove) the cdp-browser factory stub used by unit tests.
 *
 * @param fn - replacement factory receiving the same options object as the real
 *   factory, or `null` to restore the real implementation.
 */
export function __setCdpBrowserFetchFactoryForTest(
  fn: typeof cdpBrowserFetchFactoryForTest,
): void {
  cdpBrowserFetchFactoryForTest = fn;
}
|
|
597
|
+
|
|
598
|
+
/**
 * Obtain a session jar for a site, preferring (in order) the cached jar, a jar
 * seeded from the newest recording, and finally a fresh cdp-browser mint.
 *
 * Flow:
 *  1. If the test seam `cdpJarMinterForTest` is installed, delegate to it.
 *  2. Unless `forceFresh`, try the cache: a cached jar older than the newest
 *     recording is discarded, and an empty cache is re-seeded from that
 *     recording. A usable cached jar is logged and returned as-is.
 *  3. Otherwise launch cdp-browser, mint a jar, persist it with `saveJar`, and
 *     return it. A jar whose `_abck` flag is not `'0'` is still returned, with a
 *     warning logged, because replay may be rejected.
 *
 * The browser is always closed before this function settles; the jar outlives it.
 *
 * @param baseUrl - page the browser navigates when no bootstrap URL is given.
 * @param bootstrapUrl - workflow bootstrap page, if the workflow declares one.
 * @param siteDir - directory holding the site's cached jar and recordings.
 * @param forceFresh - skip the cache/recording and mint a new jar (used on retry).
 * @returns the jar, or `null` when minting failed (the caller escalates).
 */
async function getOrMintCdpJar(
  baseUrl: string,
  bootstrapUrl: string | undefined,
  siteDir: string,
  forceFresh: boolean,
): Promise<MintedJar | null> {
  if (cdpJarMinterForTest) return cdpJarMinterForTest(baseUrl, bootstrapUrl);
  if (!forceFresh) {
    let cached = loadJar(siteDir);
    // A recording NEWER than the cached jar supersedes it — e.g. the user
    // re-recorded on a new IP, so the cached (old-IP) jar would tarpit. Drop the
    // stale cache and re-seed from the fresh recording below.
    const rec = newestRecording(siteDir);
    if (cached && rec && rec.mtimeMs > cached.bootstrapEpoch) cached = null;
    // No (usable) cached jar? Prefer seeding from the user's most recent
    // RECORDING — a real-browser session whose `_abck` is HIGH-TRUST (sustains
    // many sequential .act), strictly better than a synthetic cdp-browser mint
    // (low-trust → tarpitted even on a fresh IP). "The recording IS the
    // executable." Reuse the `rec` stat above so we don't re-glob.
    if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) cached = loadJar(siteDir);
    if (cached) {
      const provenance =
        cached.source === 'recording'
          ? 'recording-seeded'
          : cached.source === 'mint'
            ? 'cdp-minted'
            : // pre-`source` cache: html-emptiness was the old (now-unreliable) tell
              cached.html
              ? 'cdp-minted'
              : 'recording-seeded';
      log(
        `reusing ${provenance} jar (age ${Math.round((Date.now() - cached.bootstrapEpoch) / 1000)}s, _abck~${cached.abckFlag}~, html=${cached.html.length}b)`,
      );
      return cached;
    }
  }
  let cf: CdpBrowserFetch | undefined;
  try {
    cf = createCdpBrowserFetch({ baseUrl, bootstrapUrl });
    const jar = await cf.mintJar();
    if (jar.abckFlag !== '0') {
      log(`cdp jar minted with _abck~${jar.abckFlag}~ (not validated) — replay may be rejected`);
    }
    saveJar(siteDir, jar);
    return jar;
  } catch (err) {
    log(`cdp jar mint failed: ${err instanceof Error ? err.message : String(err)}`);
    return null;
  } finally {
    // Browser dead; the jar outlives it. Closing is best-effort: an exception
    // escaping `finally` would REPLACE the value returned above (a good jar, or
    // the `null` that tells the caller to escalate) with a rejection.
    try {
      await cf?.close();
    } catch (closeErr) {
      log(
        `cdp jar browser close failed: ${closeErr instanceof Error ? closeErr.message : String(closeErr)}`,
      );
    }
  }
}
|
|
650
|
+
|
|
651
|
+
/**
 * Replay transport for the bootstrap-then-fetch path: a PLAIN fetch wrapper
 * that presents the jar's exact user-agent (per the surrounding design notes,
 * Akamai drops the jar on a UA mismatch). Cookies are attached elsewhere (by
 * executeWorkflow's RuntimeCookieJar from the bootstrapped credentials), so
 * this wrapper only forces the UA header.
 *
 * When `proxyUrl()` yields a proxy it is forwarded as a `proxy` init option so
 * the replay egresses from the same IP as the (proxied) browser that minted the
 * jar. `proxy` is not a standard `RequestInit` field (hence the cast); it is
 * presumably honoured by the runtime's fetch — confirm for the target runtime.
 *
 * Header precedence: `init.headers` when provided; otherwise, if `input` is a
 * `Request`, that Request's own headers are inherited. Without that fallback
 * the headers object built here would be passed as `init.headers` and silently
 * replace everything the Request carried.
 *
 * @param ua - user-agent string recorded with the jar; empty string leaves the
 *   caller's user-agent untouched.
 * @returns a fetch-compatible function.
 */
function makeJarUaFetch(ua: string): typeof fetch {
  // Resolved once at construction, not per request.
  const proxy = proxyUrl();
  return (async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
    const inherited = init?.headers ?? (input instanceof Request ? input.headers : undefined);
    const headers = new Headers(inherited);
    if (ua) headers.set('user-agent', ua);
    return globalThis.fetch(
      input as Parameters<typeof fetch>[0],
      {
        ...init,
        headers,
        ...(proxy ? { proxy } : {}),
      } as RequestInit,
    );
  }) as typeof fetch;
}
|
|
672
|
+
|
|
673
|
+
/**
 * Build the transport for the plain `fetch` rung so that even the first
 * (no-jar) rung egresses through the configured proxy — keeping the egress IP
 * uniform across rungs and letting GET-only tools (e.g. location lookups)
 * succeed from the residential proxy.
 *
 * @returns a fetch-compatible wrapper that adds a `proxy` init option to every
 *   call, or `undefined` when `proxyUrl()` reports no proxy (the caller is then
 *   presumably expected to fall back to its default fetch).
 */
function makeProxyFetch(): typeof fetch | undefined {
  const proxy = proxyUrl();
  if (proxy) {
    const proxied = async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
      // `proxy` is not a standard RequestInit field, hence the cast.
      const options = {
        ...init,
        proxy,
      } as RequestInit;
      return globalThis.fetch(input as Parameters<typeof fetch>[0], options);
    };
    return proxied as typeof fetch;
  }
  return undefined;
}
|
|
689
|
+
|
|
690
|
+
/**
 * Classify a replay failure as "the JAR is bad" (clear it and re-mint) rather
 * than a transient IP rate-flag (NETWORK / RATE_LIMITED), where a fresh jar
 * won't help and the ladder should back off instead.
 *
 * @param result - outcome of a workflow replay.
 * @returns `true` only for a failed result whose error is `FORBIDDEN` or
 *   `AUTH_EXPIRED`.
 */
function jarLikelyStale(result: ToolResult): boolean {
  if (result.ok) return false;
  return result.error === 'FORBIDDEN' || result.error === 'AUTH_EXPIRED';
}
|
|
695
|
+
|
|
696
|
+
/**
|
|
697
|
+
* fetch-bootstrap rung — the API anti-bot path. Mint a validated session jar via
|
|
698
|
+
* cdp-browser (real Chrome, used ONLY to bootstrap), CLOSE the browser, then
|
|
699
|
+
* replay every workflow request via PLAIN fetch with that jar. Works with or
|
|
700
|
+
* without a workflow.bootstrap block: cookie/html_regex bootstrap captures are
|
|
701
|
+
* satisfied from the minted jar + page HTML, and a workflow that captures its
|
|
702
|
+
* tokens inline (e.g. csrf via a request text_regex) just needs the jar's
|
|
703
|
+
* anti-bot cookies. Self-heals: a stale jar (403/AUTH) is cleared and re-minted
|
|
704
|
+
* once; an IP rate-flag (NETWORK) is returned for the ladder to handle (a fresh
|
|
705
|
+
* jar can't beat a transient rate tarpit).
|
|
706
|
+
*/
|
|
240
707
|
async function runFetchBootstrap(
|
|
241
708
|
tool: ResolvedTool,
|
|
242
709
|
params: Record<string, string | number | boolean>,
|
|
243
710
|
): Promise<ToolResult> {
|
|
244
|
-
|
|
711
|
+
let baseUrl: string;
|
|
712
|
+
try {
|
|
713
|
+
baseUrl = pickBaseUrl(tool);
|
|
714
|
+
} catch {
|
|
245
715
|
return {
|
|
246
716
|
ok: false,
|
|
247
717
|
error: 'STATE_MISSING',
|
|
248
|
-
message: 'fetch-bootstrap
|
|
249
|
-
|
|
250
|
-
{
|
|
251
|
-
name: 'workflow.bootstrap',
|
|
252
|
-
source: 'workflow',
|
|
253
|
-
capability: 'browser_bootstrap',
|
|
254
|
-
required: true,
|
|
255
|
-
failure: 'producer_unavailable',
|
|
256
|
-
message: 'workflow.bootstrap is missing',
|
|
257
|
-
},
|
|
258
|
-
],
|
|
259
|
-
remediation: 'Regenerate or edit workflow.json with bootstrap metadata.',
|
|
718
|
+
message: 'fetch-bootstrap needs at least one request URL to bootstrap from.',
|
|
719
|
+
remediation: 'Regenerate workflow.json — it has no requests.',
|
|
260
720
|
};
|
|
261
721
|
}
|
|
262
722
|
|
|
@@ -266,88 +726,178 @@ async function runFetchBootstrap(
|
|
|
266
726
|
values: {},
|
|
267
727
|
storage: [],
|
|
268
728
|
};
|
|
269
|
-
const bootstrapUrl =
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
const
|
|
276
|
-
if (
|
|
277
|
-
|
|
278
|
-
|
|
729
|
+
const bootstrapUrl = tool.workflow.bootstrap
|
|
730
|
+
? substituteString(tool.workflow.bootstrap.url, params, credentials, [])
|
|
731
|
+
: undefined;
|
|
732
|
+
const siteDir = pathResolve(tool.dir, '..');
|
|
733
|
+
|
|
734
|
+
for (let attempt = 0; attempt < 2; attempt++) {
|
|
735
|
+
const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0);
|
|
736
|
+
if (!jar) {
|
|
737
|
+
// Couldn't even launch the bootstrap browser → let the ladder escalate.
|
|
738
|
+
const stateMissing = bootstrapFailureStateMissingResult(
|
|
739
|
+
tool.workflow,
|
|
740
|
+
'fetch-bootstrap could not launch the bootstrap browser to mint a session jar.',
|
|
741
|
+
);
|
|
742
|
+
if (stateMissing) return stateMissing;
|
|
743
|
+
return {
|
|
744
|
+
ok: false,
|
|
745
|
+
error: 'NETWORK',
|
|
746
|
+
message: 'fetch-bootstrap could not mint a session jar (browser launch failed).',
|
|
747
|
+
};
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
// Fast-fail an UNVALIDATED jar. A cdp-minted jar without `_abck~0~`/`bm_sv`
|
|
751
|
+
// (validated:false) is rejected by Akamai on plain-fetch replay, and a second
|
|
752
|
+
// mint just produces another unvalidated jar — so don't pay two doomed
|
|
753
|
+
// ~40s mint+replay cycles (the ~80s that made southwest's every call slow).
|
|
754
|
+
// Escalate straight to cdp-replay, which fetches INSIDE the live page (the
|
|
755
|
+
// bmak sensor re-validates `_abck` between calls) and is the only path that
|
|
756
|
+
// works once the recording is too old to seed a high-trust jar. A
|
|
757
|
+
// recording-seeded or cached jar is validated:true by construction, so the
|
|
758
|
+
// cheap plain-fetch path is untouched; `=== false` (not falsy) leaves jars
|
|
759
|
+
// without the field — older caches / test stubs — on the original path.
|
|
760
|
+
if (jar.validated === false) {
|
|
761
|
+
log(
|
|
762
|
+
'fetch-bootstrap: minted jar unvalidated (no _abck~0~/bm_sv) — plain-fetch replay doomed; escalating to cdp-replay',
|
|
763
|
+
);
|
|
764
|
+
return {
|
|
765
|
+
ok: false,
|
|
766
|
+
error: 'FORBIDDEN',
|
|
767
|
+
message: 'fetch-bootstrap: cdp-minted jar did not validate; cdp-replay (in-page) required.',
|
|
768
|
+
};
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
// Build credentials carrying the minted jar's cookies (executeWorkflow's
|
|
772
|
+
// RuntimeCookieJar scopes them per-request); fetchImpl only forces the UA.
|
|
773
|
+
const bootstrappedCredentials: CredentialStore = {
|
|
774
|
+
...credentials,
|
|
775
|
+
cookies: [
|
|
776
|
+
...credentials.cookies,
|
|
777
|
+
...jar.cookies.map((c) => ({
|
|
279
778
|
name: c.name,
|
|
280
779
|
value: c.value,
|
|
281
|
-
domain: c.
|
|
282
|
-
url: c.hostOnly ? cookieUrlFor(c, bootstrapUrl) : undefined,
|
|
780
|
+
domain: c.domain,
|
|
283
781
|
path: c.path,
|
|
284
782
|
expires: c.expires,
|
|
285
783
|
httpOnly: c.httpOnly,
|
|
286
784
|
secure: c.secure,
|
|
287
|
-
sameSite:
|
|
785
|
+
sameSite: c.sameSite,
|
|
786
|
+
hostOnly: !c.domain.startsWith('.'),
|
|
288
787
|
})),
|
|
289
|
-
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
}, credentials.storage ?? []);
|
|
308
|
-
}
|
|
309
|
-
const page = await context.newPage();
|
|
310
|
-
await page.route('**/*', async (route) => {
|
|
311
|
-
const type = route.request().resourceType();
|
|
312
|
-
if (['image', 'media', 'font'].includes(type)) return route.abort();
|
|
313
|
-
return route.continue();
|
|
314
|
-
});
|
|
315
|
-
await page.goto(bootstrapUrl, {
|
|
316
|
-
waitUntil: tool.workflow.bootstrap.waitUntil ?? 'domcontentloaded',
|
|
317
|
-
timeout: tool.workflow.bootstrap.timeoutMs ?? 30_000,
|
|
788
|
+
],
|
|
789
|
+
};
|
|
790
|
+
|
|
791
|
+
// Satisfy any declared bootstrap captures from the minted jar (cookie) +
|
|
792
|
+
// page HTML (html_regex). response_header/dom captures aren't available from
|
|
793
|
+
// a closed browser — required ones of those fail loud below.
|
|
794
|
+
const captureResult = jarBootstrapCaptureState(
|
|
795
|
+
tool.workflow.bootstrap,
|
|
796
|
+
jar,
|
|
797
|
+
bootstrappedCredentials,
|
|
798
|
+
bootstrapUrl ?? baseUrl,
|
|
799
|
+
);
|
|
800
|
+
if (!captureResult.ok) return captureResult.result;
|
|
801
|
+
|
|
802
|
+
const result = await tool.toolFn(params, {
|
|
803
|
+
credentials: bootstrappedCredentials,
|
|
804
|
+
initialState: captureResult.state,
|
|
805
|
+
fetchImpl: makeJarUaFetch(jar.ua),
|
|
318
806
|
});
|
|
319
|
-
if (tool.workflow.bootstrap.waitMs) await page.waitForTimeout(tool.workflow.bootstrap.waitMs);
|
|
320
807
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
808
|
+
if (result.ok) return result;
|
|
809
|
+
if (attempt === 0 && jarLikelyStale(result)) {
|
|
810
|
+
log('fetch-bootstrap replay was rejected (403/auth) — clearing jar and re-minting once');
|
|
811
|
+
clearJar(siteDir);
|
|
812
|
+
continue;
|
|
813
|
+
}
|
|
814
|
+
return result;
|
|
815
|
+
}
|
|
816
|
+
|
|
817
|
+
return {
|
|
818
|
+
ok: false,
|
|
819
|
+
error: 'NETWORK',
|
|
820
|
+
message: 'fetch-bootstrap exhausted its bootstrap retries.',
|
|
821
|
+
};
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
/**
|
|
825
|
+
* cdp-replay rung — run the workflow's requests INSIDE a live trusted Chrome
|
|
826
|
+
* page (cdp-browser-fetch's in-page `fetchImpl`) instead of replaying a harvested
|
|
827
|
+
* jar via plain fetch. The decisive difference: a same-origin protected POST
|
|
828
|
+
* executes in the real page, so when its `_abck` self-invalidates the page's
|
|
829
|
+
* Akamai bmak sensor auto-re-validates it before the next call. This is the only
|
|
830
|
+
* path that SUSTAINS a SEQUENCE of sensitive `.act` POSTs (a multi-step
|
|
831
|
+
* search→agency→details flow); plain-fetch replay (fetch-bootstrap) dies after
|
|
832
|
+
* ~1-2 because it cannot re-post sensor data. Expensive (a real Chrome launch
|
|
833
|
+
* held open for the whole workflow), so it sits after fetch-bootstrap in the
|
|
834
|
+
* ladder — single-.act tools never reach it.
|
|
835
|
+
*
|
|
836
|
+
* Bootstrap state (csrf / csp-nonce) is resolved exactly as fetch-bootstrap does
|
|
837
|
+
* (via jarBootstrapCaptureState over the live page HTML + cookies harvested by
|
|
838
|
+
* mintJar) — only the transport differs.
|
|
839
|
+
*/
|
|
840
|
+
async function runCdpReplay(
|
|
841
|
+
tool: ResolvedTool,
|
|
842
|
+
params: Record<string, string | number | boolean>,
|
|
843
|
+
cdpPool?: Map<string, CdpBrowserFetch>,
|
|
844
|
+
): Promise<ToolResult> {
|
|
845
|
+
let baseUrl: string;
|
|
846
|
+
try {
|
|
847
|
+
baseUrl = pickBaseUrl(tool);
|
|
848
|
+
} catch {
|
|
849
|
+
return {
|
|
850
|
+
ok: false,
|
|
851
|
+
error: 'STATE_MISSING',
|
|
852
|
+
message: 'cdp-replay needs at least one request URL to bootstrap from.',
|
|
853
|
+
remediation: 'Regenerate workflow.json — it has no requests.',
|
|
854
|
+
};
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
const credentials = (await loadCredentialStore(tool.site)) ?? {
|
|
858
|
+
site: tool.site,
|
|
859
|
+
cookies: [],
|
|
860
|
+
values: {},
|
|
861
|
+
storage: [],
|
|
862
|
+
};
|
|
863
|
+
const bootstrapUrl = tool.workflow.bootstrap
|
|
864
|
+
? substituteString(tool.workflow.bootstrap.url, params, credentials, [])
|
|
865
|
+
: undefined;
|
|
866
|
+
|
|
867
|
+
const siteDir = pathResolve(tool.dir, '..');
|
|
868
|
+
const poolKey = tool.site;
|
|
869
|
+
const pooled = cdpPool?.get(poolKey);
|
|
870
|
+
const ownsSession = !pooled;
|
|
871
|
+
|
|
872
|
+
let cf: CdpBrowserFetch;
|
|
873
|
+
if (pooled) {
|
|
874
|
+
log('cdp-replay: reusing pooled Chrome session');
|
|
875
|
+
cf = pooled;
|
|
876
|
+
} else {
|
|
877
|
+
let seedCookies: MintedJar['cookies'] | undefined;
|
|
878
|
+
try {
|
|
879
|
+
const rec = newestRecording(siteDir);
|
|
880
|
+
let cached = loadJar(siteDir);
|
|
881
|
+
if (cached && rec && rec.mtimeMs > cached.bootstrapEpoch) cached = null;
|
|
882
|
+
if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) cached = loadJar(siteDir);
|
|
883
|
+
if (cached?.cookies.length) seedCookies = cached.cookies;
|
|
884
|
+
} catch {
|
|
885
|
+
// best-effort
|
|
343
886
|
}
|
|
887
|
+
cf = (cdpBrowserFetchFactoryForTest ?? createCdpBrowserFetch)({
|
|
888
|
+
baseUrl,
|
|
889
|
+
bootstrapUrl,
|
|
890
|
+
seedCookies,
|
|
891
|
+
});
|
|
892
|
+
}
|
|
344
893
|
|
|
345
|
-
|
|
894
|
+
try {
|
|
895
|
+
const jar = await cf.mintJar();
|
|
346
896
|
const bootstrappedCredentials: CredentialStore = {
|
|
347
897
|
...credentials,
|
|
348
898
|
cookies: [
|
|
349
899
|
...credentials.cookies,
|
|
350
|
-
...cookies.map((c) => ({
|
|
900
|
+
...jar.cookies.map((c) => ({
|
|
351
901
|
name: c.name,
|
|
352
902
|
value: c.value,
|
|
353
903
|
domain: c.domain,
|
|
@@ -360,47 +910,121 @@ async function runFetchBootstrap(
|
|
|
360
910
|
})),
|
|
361
911
|
],
|
|
362
912
|
};
|
|
363
|
-
const
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
913
|
+
const captureResult = jarBootstrapCaptureState(
|
|
914
|
+
tool.workflow.bootstrap,
|
|
915
|
+
jar,
|
|
916
|
+
bootstrappedCredentials,
|
|
917
|
+
bootstrapUrl ?? baseUrl,
|
|
918
|
+
);
|
|
919
|
+
if (!captureResult.ok) {
|
|
920
|
+
if (ownsSession) await cf.close();
|
|
921
|
+
return captureResult.result;
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
const result = await tool.toolFn(params, {
|
|
925
|
+
credentials: bootstrappedCredentials,
|
|
926
|
+
initialState: captureResult.state,
|
|
927
|
+
fetchImpl: cf.fetchImpl,
|
|
928
|
+
});
|
|
929
|
+
|
|
930
|
+
if (result.ok) {
|
|
931
|
+
if (cdpPool && ownsSession) cdpPool.set(poolKey, cf);
|
|
932
|
+
try {
|
|
933
|
+
const postJar = await cf.mintJar();
|
|
934
|
+
saveJar(siteDir, postJar);
|
|
935
|
+
} catch {
|
|
936
|
+
// best-effort
|
|
937
|
+
}
|
|
938
|
+
} else {
|
|
939
|
+
if (ownsSession) {
|
|
940
|
+
await cf.close();
|
|
941
|
+
} else if (cdpPool) {
|
|
942
|
+
cdpPool.delete(poolKey);
|
|
943
|
+
log('cdp-replay: evicted degraded session from pool');
|
|
944
|
+
await cf.close();
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
|
|
948
|
+
return result;
|
|
949
|
+
} catch (err) {
|
|
950
|
+
// Session is dead — evict from pool so the next call creates a fresh one.
|
|
951
|
+
if (cdpPool) {
|
|
952
|
+
cdpPool.delete(poolKey);
|
|
953
|
+
log('cdp-replay: evicted dead session from pool');
|
|
954
|
+
}
|
|
955
|
+
if (ownsSession) await cf.close();
|
|
956
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
957
|
+
return { ok: false, error: 'NETWORK', message: `cdp-replay failed: ${msg}` };
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
/**
 * Build the initial `${state.X}` map for a workflow from a minted jar.
 *
 * Each declared bootstrap capture is resolved by source:
 *  - `cookie`     — looked up in a RuntimeCookieJar built from `credentials.cookies`,
 *                   scoped to the capture's own url (or `bootstrapUrl`) and constraints;
 *  - `html_regex` — matched against `jar.html`; the configured group (default 1) is
 *                   used, falling back to the whole match. An invalid pattern is
 *                   treated the same as "no match";
 *  - anything else (response_header / dom_*) cannot be resolved from a jar.
 *
 * A capture that cannot be satisfied aborts with a failure result unless it is
 * explicitly optional (`required: false`), in which case it is skipped.
 *
 * @param bootstrap - the workflow's bootstrap block (may be absent).
 * @param jar - minted jar supplying the bootstrap page HTML.
 * @param credentials - credential store whose cookies back the cookie lookups.
 * @param bootstrapUrl - URL used to scope cookie lookups when a capture has none.
 * @returns `{ ok: true, state }` on success, otherwise `{ ok: false, result }`
 *   carrying the result produced by `bootstrapCaptureMissingResult`.
 */
function jarBootstrapCaptureState(
  bootstrap: ResolvedTool['workflow']['bootstrap'],
  jar: MintedJar,
  credentials: CredentialStore,
  bootstrapUrl: string,
): { ok: true; state: Record<string, unknown> } | { ok: false; result: ToolResult } {
  const state: Record<string, unknown> = {};
  const captures = bootstrap?.captures ?? [];
  // Nothing declared: succeed before touching the cookie jar at all.
  if (captures.length === 0) return { ok: true, state };

  const cookieJar = new RuntimeCookieJar(credentials.cookies);

  for (const capture of captures) {
    const optional = capture.required === false;

    if (capture.source === 'cookie') {
      const lookup = cookieJar.lookup(capture.cookie, capture.url ?? bootstrapUrl, {
        url: capture.url,
        domain: capture.domain,
        path: capture.path,
        sameSite: capture.sameSite,
        allowHttpOnlyProjection: capture.allowHttpOnlyProjection,
      });
      if (lookup.ok) {
        state[capture.name] = lookup.cookie.value;
        continue;
      }
      if (optional) continue;
      const ambiguous = lookup.reason === 'ambiguous';
      return {
        ok: false,
        result: bootstrapCaptureMissingResult(
          capture,
          ambiguous
            ? `Bootstrap cookie capture "${capture.name}" is ambiguous; add url/domain/path constraints.`
            : `Bootstrap cookie capture "${capture.name}" did not find cookie "${capture.cookie}".`,
          ambiguous ? 'ambiguous_cookie' : 'producer_ran_value_absent',
        ),
      };
    }

    if (capture.source === 'html_regex') {
      let value: string | undefined;
      try {
        const found = new RegExp(capture.pattern).exec(jar.html);
        value = found?.[capture.group ?? 1] ?? found?.[0];
      } catch {
        // Invalid pattern — handled below exactly like a non-match.
        value = undefined;
      }
      if (value) {
        state[capture.name] = value;
        continue;
      }
      if (optional) continue;
      return {
        ok: false,
        result: bootstrapCaptureMissingResult(
          capture,
          `Required bootstrap capture "${capture.name}" (html_regex) did not match the bootstrap page.`,
          'producer_ran_value_absent',
        ),
      };
    }

    // response_header / dom_* can't be resolved from a closed browser jar.
    if (optional) continue;
    return {
      ok: false,
      result: bootstrapCaptureMissingResult(
        capture,
        `Bootstrap capture "${capture.name}" (${capture.source}) is not supported by the fetch-bootstrap jar path; use cookie or html_regex.`,
        'producer_ran_value_absent',
      ),
    };
  }

  return { ok: true, state };
}
|
|
405
1029
|
|
|
406
1030
|
function bootstrapFailureStateMissingResult(
|
|
@@ -463,12 +1087,33 @@ function remediationForBootstrapCapabilities(capabilities: StateCapability[]): s
|
|
|
463
1087
|
: 'Run through fetch-bootstrap, or update workflow.bootstrap so Imprint can mint browser state before API replay.';
|
|
464
1088
|
}
|
|
465
1089
|
|
|
466
|
-
|
|
1090
|
+
// Exported for tests so the per-source logic (regex, DOM, storage, header)
|
|
1091
|
+
// can be unit-asserted without launching real Chromium. Internal callers
|
|
1092
|
+
// use it the same way; the export is just a visibility relaxation.
|
|
1093
|
+
export async function evaluateBootstrapCapture(
|
|
467
1094
|
capture: BootstrapCapture,
|
|
468
1095
|
page: Page,
|
|
469
1096
|
html: string,
|
|
1097
|
+
responseHeaders: Record<string, string>,
|
|
470
1098
|
): Promise<unknown> {
|
|
471
1099
|
switch (capture.source) {
|
|
1100
|
+
case 'response_header': {
|
|
1101
|
+
const raw = responseHeaders[capture.header.toLowerCase()];
|
|
1102
|
+
if (raw === undefined) return undefined;
|
|
1103
|
+
// Playwright's `allHeaders()` joins multi-valued headers with ", ".
|
|
1104
|
+
// Most uses (CSRF, single-valued anti-replay tokens) want the whole
|
|
1105
|
+
// string; mode 'first'/'last' splits when the value actually carries
|
|
1106
|
+
// a comma-list. Keep the default conservative: return raw.
|
|
1107
|
+
if (capture.mode === 'first' || capture.mode === 'last') {
|
|
1108
|
+
const parts = raw
|
|
1109
|
+
.split(',')
|
|
1110
|
+
.map((p) => p.trim())
|
|
1111
|
+
.filter(Boolean);
|
|
1112
|
+
if (parts.length === 0) return undefined;
|
|
1113
|
+
return capture.mode === 'first' ? parts[0] : parts[parts.length - 1];
|
|
1114
|
+
}
|
|
1115
|
+
return raw;
|
|
1116
|
+
}
|
|
472
1117
|
case 'html_regex': {
|
|
473
1118
|
const match = html.match(new RegExp(capture.pattern));
|
|
474
1119
|
return match?.[capture.group ?? 1];
|
|
@@ -514,55 +1159,440 @@ async function evaluateBootstrapCapture(
|
|
|
514
1159
|
}
|
|
515
1160
|
}
|
|
516
1161
|
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
1162
|
+
// NOTE: the next sentence describes `ensureStealthFetch` (declared after
// `stealthBootstrapState`): per-site stealth fetcher; bootstrap pays its ~12s
// once per process.
|
|
1163
|
+
/**
 * Mint `${state.X}` values from the stealth bootstrap session for a workflow
 * that declares a bootstrap block.
 *
 * Only `cookie`, `html_regex`, and `response_header` captures are handled, read
 * from the cookies / HTML / response headers returned by
 * `sf.ensureBootstrapped()`. Other sources (`dom_*`, storage) need a live page
 * and are ignored here. The bootstrap is not triggered at all when no supported
 * capture is declared.
 *
 * Captures that cannot be resolved are simply left unset — this function never
 * fails on a missing value:
 *  - cookie: first cookie whose name matches, by name only;
 *  - html_regex: the configured group (default 1); an invalid pattern is skipped;
 *  - response_header: header looked up by lower-cased name; empty values are skipped.
 *
 * @param sf - stealth fetcher owning the bootstrap session.
 * @param bootstrap - the workflow's bootstrap block.
 * @returns the resolved state map (possibly empty).
 */
async function stealthBootstrapState(
  sf: StealthFetch,
  bootstrap: NonNullable<ResolvedTool['workflow']['bootstrap']>,
): Promise<Record<string, unknown>> {
  const state: Record<string, unknown> = {};
  const supported = (bootstrap.captures ?? []).filter(
    (c) => c.source === 'cookie' || c.source === 'html_regex' || c.source === 'response_header',
  );
  // Avoid paying for a bootstrap when nothing here can consume it.
  if (supported.length === 0) return state;

  const tokens = await sf.ensureBootstrapped();
  for (const cap of supported) {
    switch (cap.source) {
      case 'cookie': {
        const hit = tokens.cookies.find((c) => c.name === cap.cookie);
        if (hit) state[cap.name] = hit.value;
        break;
      }
      case 'html_regex': {
        const html = tokens.bootstrapHtml ?? '';
        try {
          const captured = html.match(new RegExp(cap.pattern))?.[cap.group ?? 1];
          if (captured !== undefined) state[cap.name] = captured;
        } catch {
          // invalid regex — leave unset; substitution will surface STATE_MISSING
        }
        break;
      }
      case 'response_header': {
        const headerValue = tokens.bootstrapResponseHeaders?.[cap.header.toLowerCase()];
        if (headerValue !== undefined && headerValue !== '') state[cap.name] = headerValue;
        break;
      }
      default:
        break;
    }
  }
  return state;
}
|
|
538
1202
|
|
|
539
|
-
/** Per-site stealth fetcher; bootstrap pays its ~12s once per process. */
|
|
540
1203
|
/**
 * Return the stealth fetcher for `tool.site`, creating and caching it on first
 * use so each site's stealth bootstrap is set up at most once per cache.
 *
 * @param tool - resolved tool; supplies the site key, base URL, and optional
 *   workflow bootstrap URL.
 * @param cache - per-site fetcher cache, keyed by `tool.site`; mutated on a miss.
 * @returns the cached or newly created fetcher.
 */
function ensureStealthFetch(tool: ResolvedTool, cache: Map<string, StealthFetch>): StealthFetch {
  const existing = cache.get(tool.site);
  if (existing) return existing;

  const baseUrl = pickBaseUrl(tool);
  // When the workflow declares a bootstrap page, navigate IT during the
  // stealth bootstrap so the session-token cookies it sets (CSRF etc.) are
  // minted in the same session as the anti-bot cookies. Otherwise the
  // stealth rung can't satisfy a `${state.X}` the workflow bootstrap was
  // supposed to provide, and escalation from fetch-bootstrap dead-ends.
  const bootstrapUrl = tool.workflow.bootstrap?.url;

  const created = createStealthFetch({ baseUrl, bootstrapUrl });
  cache.set(tool.site, created);
  return created;
}
|
|
547
1218
|
|
|
548
|
-
/**
|
|
549
|
-
*
|
|
550
|
-
*
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
1219
|
+
/**
 * Pick the URL to navigate when bootstrapping an anti-bot session.
 *
 * Akamai binds sensor tokens to the origin+path the browser navigated to, so
 * the target should be an HTML page — not a JSON API endpoint.
 *
 * Resolution order:
 *  1. The first request that carries a parseable http(s) `Referer` header —
 *     the Referer is the page the user was on when the API call fired, so its
 *     origin + pathname (query and fragment dropped) is the bootstrap target.
 *     Malformed or non-http(s) referers (e.g. `about:blank`, whose origin
 *     serializes as the string "null") are skipped.
 *  2. Otherwise the bare origin (homepage) of the first request. API paths
 *     (/api/...) aren't navigable HTML pages, and the anti-bot sensor only
 *     fires on a real page load, so the homepage is the safest fallback.
 *
 * @param tool Resolved tool whose workflow requests are inspected.
 * @returns An absolute http(s) URL (origin, optionally with a path).
 * @throws Error when the workflow has no requests, or when no referer is
 *         usable and the first request URL is not an absolute http(s) URL.
 */
export function pickBaseUrl(tool: ResolvedTool): string {
  const requests = tool.workflow.requests;
  const first = requests[0];
  if (!first) {
    throw new Error(
      `Workflow ${tool.workflow.toolName} has no requests — stealth-fetch needs at least one request URL.\n→ re-record the session; recording probably stopped before any XHR fired.`,
    );
  }

  // Parse `raw` and accept it only when it is an http(s) URL. Other schemes
  // parse fine but have an opaque origin ("null"), which would yield a junk
  // base URL such as "nullblank" instead of a navigable page.
  const parseHttpUrl = (raw: string): URL | undefined => {
    try {
      const parsed = new URL(raw);
      return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : undefined;
    } catch {
      return undefined; // malformed / relative URL
    }
  };

  for (const req of requests) {
    const referer = req.headers?.Referer ?? req.headers?.referer;
    if (!referer) continue;
    const page = parseHttpUrl(referer);
    if (page) return `${page.origin}${page.pathname}`;
    // unusable referer — keep looking at later requests
  }

  // Fallback: the homepage of the first request's origin.
  const target = parseHttpUrl(first.url);
  if (!target) {
    throw new Error(
      `Could not parse bootstrap URL: ${first.url}\n→ check workflow.json — the first request URL must be absolute (https://...).`,
    );
  }
  return target.origin;
}
|
|
564
1273
|
|
|
565
1274
|
/**
 * Locate a tool's `playbook.yaml`.
 *
 * @param assetRoot Root directory holding per-site asset folders.
 * @param site      Site folder name under `assetRoot`.
 * @param toolDir   Optional tool directory; when provided (and non-empty) the
 *                  playbook is resolved inside it and `assetRoot`/`site` are ignored.
 * @returns Absolute path to the playbook file.
 */
function playbookPath(assetRoot: string, site: string, toolDir?: string): string {
  const parentSegments = toolDir ? [toolDir] : [assetRoot, site];
  return pathResolve(...parentSegments, 'playbook.yaml');
}
|
|
1278
|
+
|
|
1279
|
+
/**
 * Compile-time integration-test convenience: dispatch a request through
 * `runWithLadder` using only a `workflow.json` path. Avoids requiring an
 * emitted `index.ts` (which doesn't exist when integration.test.ts runs
 * during compile, before `imprint emit`).
 *
 * **Ladder is `['fetch', 'fetch-bootstrap', 'cdp-replay', 'stealth-fetch']`** —
 * the playbook rung is excluded because `playbook.yaml` is compiled in
 * a separate later step (`imprint compile-playbook`), so at integration-
 * test time there is no playbook to fall back to. Even if a stale
 * playbook from a prior compile exists on disk, exercising it here would
 * conflate two independent verification surfaces.
 *
 * Flow:
 *  - No memoized winner for `<site>::<toolName>`: probe `fetch`, `cdp-replay`
 *    and `stealth-fetch` in parallel under a deadline, memoize and return the
 *    winner chosen by `pickProbeWinner`, or a NETWORK failure summarizing
 *    every backend's outcome.
 *  - Memoized winner: run the full ladder rotated to start at that winner;
 *    refresh the memo on success, clear it on failure.
 *
 * Credentials are loaded by `executeWorkflow` from the credential store
 * for the workflow's `site` by default; pass `credentials` explicitly to
 * override (e.g., when a test wants to assert behavior under a known
 * credential state).
 *
 * The test "passes" as long as ANY backend in the ladder returns ok.
 * Tools whose fetch path will be blocked at runtime are still verified
 * end-to-end via a later rung.
 *
 * @throws Error when `opts.workflowPath` does not exist; also propagates
 *         JSON / schema parse errors for a malformed workflow file.
 */
export async function runWorkflowWithLadder(opts: {
  workflowPath: string;
  params: Record<string, string | number | boolean>;
  /** Optional credential override; otherwise loaded from the credential
   *  store by executeWorkflow. */
  credentials?: CredentialStore;
}): Promise<LadderResult> {
  if (!existsSync(opts.workflowPath)) {
    throw new Error(`runWorkflowWithLadder: workflow.json not found at ${opts.workflowPath}`);
  }
  const workflow = WorkflowSchema.parse(JSON.parse(readFileSync(opts.workflowPath, 'utf8')));
  const toolDir = dirname(opts.workflowPath);
  // assetRoot only matters for playbook-rung path resolution, which this
  // ladder skips. Use a conventional value for completeness.
  const assetRoot = pathResolve(toolDir, '..', '..');

  const tool: ResolvedTool = {
    site: workflow.site ?? '',
    dir: toolDir,
    workflow,
    toolFn: async (params, fnOpts) => {
      // Thread ALL execution opts the rungs pass — fetchImpl (stealth), and
      // crucially initialState + credentials minted by fetch-bootstrap's
      // Chrome navigation. The production generated tool fn (tool-loader path)
      // forwards these to executeWorkflow; this test/probe-path toolFn must do
      // the same, otherwise a bootstrap-block tool's csrf/session state is
      // silently dropped here and the integration test fails a workflow that
      // actually works in production — a false waiver.
      const o = fnOpts as
        | {
            fetchImpl?: typeof fetch;
            initialState?: Record<string, unknown>;
            credentials?: CredentialStore;
          }
        | undefined;
      return executeWorkflow({
        workflow,
        params: params as Record<string, string | number | boolean>,
        credentials: o?.credentials ?? opts.credentials,
        workflowPath: opts.workflowPath,
        fetchImpl: o?.fetchImpl,
        initialState: o?.initialState,
      });
    },
  };

  const ladder: ConcreteBackend[] = ['fetch', 'fetch-bootstrap', 'cdp-replay', 'stealth-fetch'];

  const memoKey = `${tool.site}::${workflow.toolName}`;
  const memoWinner = compileWinningBackend.get(memoKey);

  // Share one stealth token across this site's compile-time test processes.
  const stealthCache = new Map<string, StealthFetch>();
  try {
    const siteDir = pathResolve(toolDir, '..');
    const baseUrl = pickBaseUrl(tool);
    let fileCacheConsumed = false;
    // Bootstrap wrapper: serve the on-disk token at most once; any later
    // bootstrap request clears it and mints (then persists) a fresh token.
    const cachingBootstrap = async (args: BootstrapArgs): Promise<TokenCache> => {
      if (!fileCacheConsumed) {
        const cached = loadCachedToken(siteDir, STEALTH_TOKEN_MAX_AGE_SECONDS);
        if (cached) {
          fileCacheConsumed = true;
          log(`reusing cached stealth token for ${tool.site || siteDir}`);
          return cached;
        }
      }
      clearCachedToken(siteDir);
      const token = await bootstrapStealthToken(args);
      saveCachedToken(siteDir, token);
      fileCacheConsumed = true;
      return token;
    };
    stealthCache.set(
      tool.site,
      createStealthFetch(
        { baseUrl, bootstrapUrl: tool.workflow.bootstrap?.url },
        { bootstrap: cachingBootstrap },
      ),
    );
  } catch {
    // No usable base URL → leave the cache empty; runWithLadder/ensureStealthFetch
    // will lazily bootstrap (same behavior as before this optimization).
  }

  // Reuse the process-global compile CDP pool so cdp-replay stays warm (~2-5s)
  // across this `bun test` process's calls; cancel any pending idle-close now
  // that we're about to use it again. The pool is torn down by an idle timer
  // (armed in `finally`) shortly after the LAST call — see compileCdpPool.
  const cdpPool = compileCdpPool;
  clearCompileCdpIdle();

  try {
    try {
      await paceCompileRequest(new URL(pickBaseUrl(tool)).origin);
    } catch {
      // no parseable base URL → nothing to pace
    }

    // ── First call: parallel probe (45s deadline) ───────────────────────────
    // Race non-overlapping backends so a tarpitted rung doesn't block a
    // faster one. fetch-bootstrap is excluded: it launches Chrome to the
    // same origin as cdp-replay, and two simultaneous Chromes trip Akamai's
    // concurrent-session detection. cdp-replay is strictly better when both
    // need Chrome; if fetch wins, fetch-bootstrap is unnecessary anyway.
    //
    // Uses Promise.allSettled (NOT Promise.any) deliberately: a fast OK from
    // a lower rung (e.g. fetch returning a cached/stale 200) may not be the
    // best result — we need all backends to settle so we can pick the
    // fastest *correct* one. The tradeoff is wall-clock: the probe blocks
    // until the slowest backend resolves (or hits the deadline). cdp-replay
    // is slow on its first cold start (~33s) but subsequent calls reuse the
    // CDP pool and complete in ~2-5s — so the first probe pays the cost but
    // all later calls benefit from having discovered the right rung.
    //
    // The compile agent's integration tests MUST use a timeout >= 60s (the
    // compile-agent.md prompt recommends this) so the test process survives
    // the full probe duration. A 30s test timeout kills the probe before
    // cdp-replay can finish its cold start.
    //
    // Each bun-test subprocess is a fresh process (memo empty), so the
    // compile agent's iteration loop re-probes after every workflow change —
    // no premature lock-in.
    if (!memoWinner) {
      const PROBE_TIMEOUT_MS = probeTimeoutMsForTest ?? 45_000;
      const probeBackends: ConcreteBackend[] = ['fetch', 'cdp-replay', 'stealth-fetch'];

      const settled = await Promise.allSettled(
        probeBackends.map(async (b) => {
          const t0 = Date.now();
          // Keep a handle to the real backend run (the race's non-timeout arm) so a
          // backend that LOSES the deadline race — still launching Chrome in the
          // background — gets settled and its pooled browser drained, not leaked,
          // once the probe returns.
          const inner = runWithLadder([b], tool, opts.params, assetRoot, stealthCache, {
            skipBootstrapSplice: true,
            cdpPool,
          });
          // A backend that finishes AFTER the probe returned (it lost the race but
          // is still cold-starting Chrome) pools its browser late — arm the idle
          // close so it's torn down rather than left lingering.
          //
          // `.finally()` returns a NEW promise that rejects whenever `inner`
          // rejects. The rejection itself is observed through the race below
          // (and reported via allSettled), so swallow it on this side branch —
          // otherwise a rejecting backend raises an unhandled rejection.
          void inner
            .finally(() => armCompileCdpIdleClose())
            .catch(() => {
              // already surfaced through Promise.race / Promise.allSettled
            });
          const r = await Promise.race([
            inner,
            sleepMs(PROBE_TIMEOUT_MS).then(
              () =>
                ({
                  result: { ok: false, error: 'NETWORK', message: 'probe deadline exceeded' },
                  usedBackend: b,
                  attempts: [],
                }) as LadderResult,
            ),
          ]);
          return { backend: b, result: r, durationMs: Date.now() - t0 };
        }),
      );

      // One human-readable line per probed backend, used in logs and in the
      // aggregate failure message.
      const digest = settled.map((s, i) => {
        const b = probeBackends[i];
        if (s.status === 'rejected')
          return `${b}: ${s.reason instanceof Error ? s.reason.message : String(s.reason)}`.slice(
            0,
            120,
          );
        const { result: lr, durationMs } = s.value;
        return lr.result.ok
          ? `${b}: OK in ${durationMs}ms`
          : `${b}: ${lr.result.error} — ${lr.result.message.slice(0, 200)} (${durationMs}ms)`;
      });

      type ProbeEntry = { backend: ConcreteBackend; result: LadderResult; durationMs: number };
      const winners = settled
        .filter(
          (s): s is PromiseFulfilledResult<ProbeEntry> =>
            s.status === 'fulfilled' && s.value.result.result.ok,
        )
        .map((s) => s.value);

      const best = pickProbeWinner(winners);
      if (best) {
        compileWinningBackend.set(memoKey, best.backend);
        log(
          `parallel probe: winner=${best.backend} (${best.durationMs}ms)\n ${digest.join('\n ')}`,
        );
        return best.result;
      }

      log(`parallel probe: all backends failed\n ${digest.join('\n ')}`);
      return {
        result: {
          ok: false as const,
          error: 'NETWORK' as const,
          message: `All backends failed during parallel probe: ${digest.join('; ')}`,
        },
        usedBackend: ladder[ladder.length - 1] ?? 'fetch',
        attempts: [],
      };
    }

    // ── Memo hit: start at the memoized winner, keep all later rungs ─────
    // Previous logic sliced earlier rungs away (`ladder.slice(idx)`), which
    // dropped cdp-replay as a fallback when stealth-fetch (the last rung)
    // was the winner. Now: reorder the ladder to start at the winner and
    // wrap around so every rung remains reachable. The winner is tried first
    // (the optimization), but if it fails the remaining rungs catch it.
    const idx = ladder.indexOf(memoWinner);
    const memoLadder = idx > 0 ? [...ladder.slice(idx), ...ladder.slice(0, idx)] : ladder;
    log(
      `compile memo: ${memoKey} previously succeeded via ${memoWinner}; ladder: ${memoLadder.join(' → ')}`,
    );
    const result = await runWithLadder(memoLadder, tool, opts.params, assetRoot, stealthCache, {
      skipBootstrapSplice: true,
      cdpPool,
    });
    if (result.result.ok) {
      compileWinningBackend.set(memoKey, result.usedBackend);
    } else {
      compileWinningBackend.delete(memoKey);
    }
    return result;
  } finally {
    // Keep the pool warm for the next call in this process; arm an idle-close so
    // it's torn down shortly after the LAST call — that lets a raw `bun probe.ts`
    // exit cleanly (no 30-min hang) and never leaks a browser.
    armCompileCdpIdleClose();
  }
}
|
|
1528
|
+
|
|
1529
|
+
/** A single outgoing HTTP request in its final, ready-to-send form. */
export interface RenderedRequest {
  /** HTTP method of the request. */
  method: string;
  /** Request URL after all substitutions and any request transform have been applied. */
  url: string;
  /** Headers sent with the request; name casing is whatever the runtime produced. */
  headers: Record<string, string>;
  /** Request payload, or `null` when the request carries no body. */
  body: string | null;
}
|
|
1538
|
+
|
|
1539
|
+
/**
 * Render a workflow's outgoing requests OFFLINE — no network, no browser. Runs
 * the real `executeWorkflow` (so `${param}`/`${state}` substitution, captures,
 * and any `requestTransformModule` all execute) but with a `fetchImpl` that
 * returns the matching RECORDED response for each request and CAPTURES the final
 * outgoing request before returning it.
 *
 * Purpose: verify a parameter actually reaches its field by diffing renders
 * across param overrides — WITHOUT firing a live `.act` per parameter (the burst
 * that flags anti-bot IPs and made costco's tools fail compile). The live suite
 * then needs only ONE baseline call to prove the workflow produces real data; the
 * per-parameter "does X reach field F" check becomes a deterministic offline diff.
 *
 * `recordedResponseFor(method, url)` supplies the recorded response so captures
 * (csrf via text_regex, etc.) resolve and the transform builds the real body;
 * return undefined to fall back to a `200` with body `{}`.
 *
 * Captured header names are lowercase because they are normalized through a
 * `Headers` object. When the stub is called with a `Request` object, its
 * method, headers and body are used wherever `init` does not override them.
 *
 * @returns The captured requests in call order plus the workflow's result.
 */
export async function renderWorkflowRequests(opts: {
  workflow: Workflow;
  params: Record<string, string | number | boolean>;
  workflowPath?: string;
  credentials?: CredentialStore;
  recordedResponseFor?: (
    method: string,
    url: string,
  ) => { status: number; body: string; headers?: Record<string, string> } | undefined;
}): Promise<{ requests: RenderedRequest[]; result: ToolResult }> {
  // Statuses for which the Response constructor rejects any non-null body
  // (including an empty string) with a TypeError.
  const nullBodyStatuses = new Set([204, 205, 304]);

  const captured: RenderedRequest[] = [];
  const fetchImpl: typeof fetch = (async (
    input: string | URL | Request,
    init?: RequestInit,
  ): Promise<Response> => {
    const request = typeof input === 'string' || input instanceof URL ? undefined : input;
    const url =
      typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url;
    // `init` wins over the Request's own values, mirroring fetch semantics.
    const method = (init?.method ?? request?.method ?? 'GET').toUpperCase();

    const headers: Record<string, string> = {};
    const headerSource = init?.headers ?? request?.headers;
    if (headerSource) {
      new Headers(headerSource).forEach((value, key) => {
        headers[key] = value;
      });
    }

    let body: string | null;
    if (typeof init?.body === 'string') {
      body = init.body;
    } else if (init?.body) {
      body = String(init.body);
    } else if (request && request.body !== null) {
      // Clone so the caller's Request body stays readable.
      body = await request.clone().text();
    } else {
      body = null;
    }

    captured.push({ method, url, headers, body });

    const rec = opts.recordedResponseFor?.(method, url);
    const status = rec?.status ?? 200;
    // A recorded 204/205/304 must be replayed with a null body, otherwise
    // constructing the Response throws and aborts the whole offline render.
    const responseBody = nullBodyStatuses.has(status) ? null : (rec?.body ?? '{}');
    return new Response(responseBody, {
      status,
      headers: new Headers(rec?.headers ?? {}),
    });
  }) as typeof fetch;

  const result = await executeWorkflow({
    workflow: opts.workflow,
    params: opts.params,
    credentials: opts.credentials,
    workflowPath: opts.workflowPath,
    fetchImpl,
  });
  return { requests: captured, result };
}
|