imprint-mcp 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +132 -28
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +111 -4
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +65 -27
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +14 -2
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/credential-extract.ts +174 -25
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/emit.ts +85 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/sensitive-keys.ts +141 -7
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +17 -0
- package/src/imprint/teach.ts +582 -147
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
|
@@ -10,13 +10,20 @@
|
|
|
10
10
|
* ~12s bootstrap one-time, ~1s per API call after.
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import type { Browser } from 'playwright';
|
|
13
|
+
import type { Browser, BrowserContext, Page } from 'playwright';
|
|
14
14
|
import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
|
|
15
15
|
import { createLog } from './log.ts';
|
|
16
16
|
|
|
17
17
|
export interface StealthFetchOptions {
|
|
18
18
|
/** Homepage URL to load during bootstrap (triggers bot-detection JS). */
|
|
19
19
|
baseUrl: string;
|
|
20
|
+
/** URL to navigate during bootstrap, when it differs from baseUrl. Set this
|
|
21
|
+
* to the workflow's `bootstrap.url` so the same stealth session that mints
|
|
22
|
+
* anti-bot cookies (_abck etc.) ALSO loads the page that sets session
|
|
23
|
+
* tokens (CSRF cookies, nonces) — those tokens and the bot-cookies must
|
|
24
|
+
* come from ONE session or the site rejects the later API POST on a
|
|
25
|
+
* session mismatch. Defaults to baseUrl. */
|
|
26
|
+
bootstrapUrl?: string;
|
|
20
27
|
/** Seconds to wait after page load for sensor initialization. Default 3. */
|
|
21
28
|
sensorWaitSeconds?: number;
|
|
22
29
|
/** Launch headed for debugging. Default false. */
|
|
@@ -42,6 +49,11 @@ export interface FetchInit {
|
|
|
42
49
|
* a 403 retry — callers that need retry-after-bot-bootstrap should
|
|
43
50
|
* pass a string, Blob, ArrayBuffer, FormData, or URLSearchParams. */
|
|
44
51
|
body?: RequestInit['body'];
|
|
52
|
+
/** Abort signal from the caller (e.g. executeWorkflow's per-request timeout
|
|
53
|
+
* AbortController). MUST be forwarded to the underlying fetch — without it a
|
|
54
|
+
* tarpitting anti-bot endpoint hangs far past the caller's timeout (observed
|
|
55
|
+
* ~272s on Akamai) instead of aborting promptly so the ladder can escalate. */
|
|
56
|
+
signal?: AbortSignal;
|
|
45
57
|
}
|
|
46
58
|
|
|
47
59
|
interface FetchResult {
|
|
@@ -55,11 +67,34 @@ export interface TokenCache {
|
|
|
55
67
|
cookies: Array<{ name: string; value: string }>;
|
|
56
68
|
sensorHeaders: Record<string, string>;
|
|
57
69
|
bootstrappedAt: number;
|
|
70
|
+
/** HTML of the bootstrap navigation, so callers can satisfy a workflow's
|
|
71
|
+
* `html_regex` bootstrap captures from the same session. Optional —
|
|
72
|
+
* absent on caches minted before this field existed. */
|
|
73
|
+
bootstrapHtml?: string;
|
|
74
|
+
/** Lower-cased response headers of the bootstrap navigation, so callers can
|
|
75
|
+
* satisfy `response_header` bootstrap captures. Optional. */
|
|
76
|
+
bootstrapResponseHeaders?: Record<string, string>;
|
|
77
|
+
/** The bootstrap browser's actual `navigator.userAgent`, captured live. Reused
|
|
78
|
+
* for the post-bootstrap fetches so the wire UA matches the binary that minted
|
|
79
|
+
* the cookies (and its client hints below). Absent if capture failed or on
|
|
80
|
+
* caches minted before this field existed → caller falls back to DEFAULT_UA. */
|
|
81
|
+
userAgent?: string;
|
|
82
|
+
/** Lower-cased `sec-ch-ua*` client-hint headers derived from the bootstrap
|
|
83
|
+
* browser's `navigator.userAgentData`, so the post-bootstrap fetch can send
|
|
84
|
+
* client hints consistent with `userAgent`. Absent when the browser doesn't
|
|
85
|
+
* expose userAgentData (non-secure context / non-Chromium). */
|
|
86
|
+
clientHints?: Record<string, string>;
|
|
58
87
|
}
|
|
59
88
|
|
|
60
89
|
export interface StealthFetch {
|
|
61
90
|
/** typeof fetch wrapper that auto-bootstraps + adds sensor headers. */
|
|
62
91
|
readonly fetchImpl: typeof fetch;
|
|
92
|
+
/** Force the bootstrap navigation now (if not already done) and return the
|
|
93
|
+
* token cache — including the cookies minted during the navigation. Callers
|
|
94
|
+
* use this to read session-token cookies (CSRF etc.) set by the bootstrap
|
|
95
|
+
* page and feed them into the workflow as `${state.X}`, in the SAME session
|
|
96
|
+
* as the transport cookies. */
|
|
97
|
+
ensureBootstrapped(): Promise<TokenCache>;
|
|
63
98
|
/** Drop cached tokens; next fetch re-bootstraps. */
|
|
64
99
|
invalidate(): void;
|
|
65
100
|
/** Token age in seconds; -1 if not bootstrapped yet. */
|
|
@@ -74,10 +109,16 @@ export interface StealthFetch {
|
|
|
74
109
|
close(): Promise<void>;
|
|
75
110
|
}
|
|
76
111
|
|
|
77
|
-
interface BootstrapArgs {
|
|
112
|
+
export interface BootstrapArgs {
|
|
78
113
|
baseUrl: string;
|
|
114
|
+
/** Page to navigate during bootstrap (for session-token cookies). Defaults
|
|
115
|
+
* to baseUrl when absent. */
|
|
116
|
+
bootstrapUrl?: string;
|
|
79
117
|
probeUrl?: string;
|
|
80
|
-
|
|
118
|
+
/** Force a specific UA on the bootstrap browser. Omit (the default) to let
|
|
119
|
+
* Chrome use its NATIVE UA — which is always self-consistent with the client
|
|
120
|
+
* hints it emits. Only set this when a caller explicitly needs a custom UA. */
|
|
121
|
+
userAgent?: string;
|
|
81
122
|
headed: boolean;
|
|
82
123
|
sensorWaitSeconds: number;
|
|
83
124
|
}
|
|
@@ -92,8 +133,20 @@ interface StealthFetchInternals {
|
|
|
92
133
|
underlyingFetch?: (url: string, init: FetchInit, tokens: TokenCache) => Promise<FetchResult>;
|
|
93
134
|
}
|
|
94
135
|
|
|
136
|
+
/**
|
|
137
|
+
* Last-resort User-Agent, used ONLY when the bootstrap browser couldn't report
|
|
138
|
+
* its own UA and the caller didn't force one. The real path captures the live
|
|
139
|
+
* browser's actual `navigator.userAgent` during bootstrap (see
|
|
140
|
+
* `bootstrapStealthToken`) and reuses THAT for the post-bootstrap fetches, so
|
|
141
|
+
* the UA on the wire always matches the binary's own client hints (sec-ch-ua).
|
|
142
|
+
*
|
|
143
|
+
* A hardcoded UA is dangerous precisely because it drifts: a stale major
|
|
144
|
+
* version (e.g. Chrome/131) paired with the live binary's client hints
|
|
145
|
+
* (Chrome/148) is a contradiction no real browser emits — a textbook anti-bot
|
|
146
|
+
* tell. Keep this roughly current as a floor, but the capture is what ships.
|
|
147
|
+
*/
|
|
95
148
|
const DEFAULT_UA =
|
|
96
|
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
|
149
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36';
|
|
97
150
|
|
|
98
151
|
/** Standard headers the runtime sets — anything outbound NOT in this set
|
|
99
152
|
* was injected by sensor JS and is what we capture for replay. */
|
|
@@ -136,14 +189,18 @@ export function createStealthFetch(
|
|
|
136
189
|
const o = typeof optsOrUrl === 'string' ? { baseUrl: optsOrUrl } : optsOrUrl;
|
|
137
190
|
const opts = {
|
|
138
191
|
baseUrl: o.baseUrl,
|
|
192
|
+
bootstrapUrl: o.bootstrapUrl ?? o.baseUrl,
|
|
139
193
|
sensorWaitSeconds: o.sensorWaitSeconds ?? 3,
|
|
140
194
|
headed: o.headed ?? false,
|
|
141
|
-
|
|
195
|
+
// Undefined unless the caller forces a UA. Letting it stay undefined makes
|
|
196
|
+
// the bootstrap browser use its native UA (self-consistent with its client
|
|
197
|
+
// hints); we then capture that real UA and reuse it for the fetches.
|
|
198
|
+
userAgent: o.userAgent,
|
|
142
199
|
maxRetries: o.maxRetries ?? 1,
|
|
143
200
|
maxTokenAgeSeconds: o.maxTokenAgeSeconds ?? 600,
|
|
144
201
|
maxConsecutiveFailures: o.maxConsecutiveFailures ?? 3,
|
|
145
202
|
};
|
|
146
|
-
const bootstrapFn = internals?.bootstrap ??
|
|
203
|
+
const bootstrapFn = internals?.bootstrap ?? bootstrapStealthToken;
|
|
147
204
|
const underlyingFetchFn = internals?.underlyingFetch ?? defaultUnderlyingFetch;
|
|
148
205
|
|
|
149
206
|
let tokens: TokenCache | null = null;
|
|
@@ -164,6 +221,7 @@ export function createStealthFetch(
|
|
|
164
221
|
log('bootstrapping…');
|
|
165
222
|
tokens = await bootstrapFn({
|
|
166
223
|
baseUrl: opts.baseUrl,
|
|
224
|
+
bootstrapUrl: opts.bootstrapUrl,
|
|
167
225
|
probeUrl,
|
|
168
226
|
userAgent: opts.userAgent,
|
|
169
227
|
headed: opts.headed,
|
|
@@ -183,24 +241,48 @@ export function createStealthFetch(
|
|
|
183
241
|
const t = tokens;
|
|
184
242
|
if (!t) throw new Error('No tokens (bootstrap failed?)');
|
|
185
243
|
const { headers: initHeaders, cookieHeader } = splitCookieHeader(init?.headers ?? {});
|
|
244
|
+
// Defaults that yield to the caller's initHeaders (and the workflow's
|
|
245
|
+
// recorded headers that flow through them). Keys are lowercase to
|
|
246
|
+
// match what the public `fetchImpl` wrapper normalizes everything to
|
|
247
|
+
// (via `new Headers().forEach`) — a mixed-case merge would silently
|
|
248
|
+
// duplicate both `Accept` and `accept` in the final headers and the
|
|
249
|
+
// caller's override would never actually win.
|
|
250
|
+
//
|
|
251
|
+
// Content-Type intentionally depends on whether the request actually
|
|
252
|
+
// has a body — sending Content-Type: application/json on a body-less
|
|
253
|
+
// GET is anti-bot suspicious (real browsers don't do it) and was
|
|
254
|
+
// contributing to Akamai tarpits on HTML bootstrap GETs from this rung.
|
|
255
|
+
const hasBody = init?.body !== undefined && init?.body !== null;
|
|
256
|
+
// UA precedence: an explicit caller override (also used for the bootstrap
|
|
257
|
+
// context) → the UA the bootstrap browser actually reported → the stale
|
|
258
|
+
// fallback. The captured value keeps the fetch UA matching the binary that
|
|
259
|
+
// minted the cookies.
|
|
260
|
+
const ua = opts.userAgent ?? t.userAgent ?? DEFAULT_UA;
|
|
261
|
+
const defaultHeaders: Record<string, string> = {
|
|
262
|
+
'user-agent': ua,
|
|
263
|
+
accept: 'application/json, text/javascript, */*; q=0.01',
|
|
264
|
+
cookie: mergeCookieHeader(
|
|
265
|
+
t.cookies.map((c) => `${c.name}=${c.value}`).join('; '),
|
|
266
|
+
cookieHeader,
|
|
267
|
+
),
|
|
268
|
+
origin: new URL(fullUrl).origin,
|
|
269
|
+
referer: opts.baseUrl,
|
|
270
|
+
...t.sensorHeaders,
|
|
271
|
+
};
|
|
272
|
+
// Send client hints consistent with the UA. Only when we're NOT forcing a
|
|
273
|
+
// custom UA: the captured hints reflect the browser's native UA, so pairing
|
|
274
|
+
// them with an override would reintroduce the UA/hints contradiction we fix.
|
|
275
|
+
if (!opts.userAgent && t.clientHints) {
|
|
276
|
+
for (const [k, v] of Object.entries(t.clientHints)) defaultHeaders[k] = v;
|
|
277
|
+
}
|
|
278
|
+
if (hasBody) defaultHeaders['content-type'] = 'application/json';
|
|
186
279
|
const result = await underlyingFetchFn(
|
|
187
280
|
fullUrl,
|
|
188
281
|
{
|
|
189
282
|
method: init?.method ?? 'GET',
|
|
190
|
-
headers: {
|
|
191
|
-
'User-Agent': opts.userAgent,
|
|
192
|
-
Accept: 'application/json, text/javascript, */*; q=0.01',
|
|
193
|
-
'Content-Type': 'application/json',
|
|
194
|
-
Cookie: mergeCookieHeader(
|
|
195
|
-
t.cookies.map((c) => `${c.name}=${c.value}`).join('; '),
|
|
196
|
-
cookieHeader,
|
|
197
|
-
),
|
|
198
|
-
Origin: new URL(fullUrl).origin,
|
|
199
|
-
Referer: opts.baseUrl,
|
|
200
|
-
...t.sensorHeaders,
|
|
201
|
-
...initHeaders,
|
|
202
|
-
},
|
|
283
|
+
headers: { ...defaultHeaders, ...initHeaders },
|
|
203
284
|
body: init?.body,
|
|
285
|
+
signal: init?.signal,
|
|
204
286
|
},
|
|
205
287
|
t,
|
|
206
288
|
);
|
|
@@ -261,6 +343,9 @@ export function createStealthFetch(
|
|
|
261
343
|
// accepted shape (string, Blob, ArrayBuffer, FormData, URLSearchParams,
|
|
262
344
|
// ReadableStream). Previously we dropped any non-string body silently.
|
|
263
345
|
body: init?.body ?? undefined,
|
|
346
|
+
// Forward the caller's abort signal (per-request timeout) — without it a
|
|
347
|
+
// tarpitting endpoint hangs far past the timeout instead of escalating.
|
|
348
|
+
signal: init?.signal ?? undefined,
|
|
264
349
|
});
|
|
265
350
|
return new Response(result.body, {
|
|
266
351
|
status: result.status,
|
|
@@ -270,6 +355,11 @@ export function createStealthFetch(
|
|
|
270
355
|
|
|
271
356
|
return {
|
|
272
357
|
fetchImpl,
|
|
358
|
+
async ensureBootstrapped(): Promise<TokenCache> {
|
|
359
|
+
await ensureTokens();
|
|
360
|
+
if (!tokens) throw new Error('stealth bootstrap produced no tokens');
|
|
361
|
+
return tokens;
|
|
362
|
+
},
|
|
273
363
|
invalidate(): void {
|
|
274
364
|
tokens = null;
|
|
275
365
|
consecutiveFailures = 0;
|
|
@@ -325,18 +415,93 @@ function mergeCookieHeader(browserCookie: string, runtimeCookie: string | undefi
|
|
|
325
415
|
* `baseUrl`, lets the bot-detection JS run, captures the resulting
|
|
326
416
|
* cookies + sensor-injected headers via a route interceptor on a probe
|
|
327
417
|
* request, closes the browser. Returns a fresh TokenCache.
|
|
418
|
+
*
|
|
419
|
+
* Exported so the compile-time token cache (stealth-token-cache.ts) can mint a
|
|
420
|
+
* token to persist + share across `bun test` processes without re-implementing
|
|
421
|
+
* the Playwright bootstrap.
|
|
328
422
|
*/
|
|
329
|
-
|
|
330
|
-
|
|
423
|
+
/**
 * Reports whether an Akamai `_abck` cookie value carries the "validated" marker.
 *
 * The value is `~`-delimited (`<token>~<status>~…`). A status field of `0`
 * means the sensor validated the session (requests pass); `-1` means it has
 * not validated yet (state-changing POSTs get tarpitted).
 *
 * @param cookieValue Raw `_abck` cookie value, or `undefined` when the cookie is absent.
 * @returns `true` only when the second `~`-separated field is exactly `'0'`;
 *   `false` for a missing or empty value, or any other status.
 */
function abckIsValidated(cookieValue: string | undefined): boolean {
  if (cookieValue === undefined || cookieValue === '') {
    return false;
  }
  const [, status] = cookieValue.split('~');
  return status === '0';
}
|
|
430
|
+
|
|
431
|
+
/**
 * Drive human-like interaction (mouse moves + scroll) and poll until the
 * Akamai `_abck` cookie validates (`~0~`), or until `maxSeconds` elapse.
 *
 * Each round performs one multi-step mouse move, a scroll on every third
 * round, an 800ms pause, and then one read of the context's cookies.
 *
 * @param page Page that receives the synthetic mouse/scroll input.
 * @param context Browser context whose cookies are polled for `_abck`.
 * @param maxSeconds Upper bound on the polling window, in seconds.
 * @returns `true` as soon as a validated `_abck` is observed. `false` when the
 *   window elapses first, or — from the third round on — when a SUCCESSFUL
 *   cookie read shows no `_abck` at all (the site does not use this scheme,
 *   so there is nothing to wait for). The caller proceeds either way.
 */
async function driveSensorValidation(
  page: Page,
  context: BrowserContext,
  maxSeconds: number,
): Promise<boolean> {
  const windowMs = maxSeconds * 1000;
  const start = Date.now();
  let i = 0;
  while (Date.now() - start < windowMs) {
    // Jittered mouse path + occasional scroll — the sensor wants movement, not
    // a single teleport. x stays in [80, 1279] and y in [120, 759].
    try {
      await page.mouse.move(80 + ((i * 137) % 1200), 120 + ((i * 89) % 640), { steps: 4 });
      if (i % 3 === 0) {
        await page.evaluate(
          (y: number) => {
            (globalThis as unknown as { scrollBy: (x: number, y: number) => void }).scrollBy(0, y);
          },
          100 + (i % 5) * 40,
        );
      }
    } catch {
      // page may navigate/close mid-interaction — non-fatal
    }
    await page.waitForTimeout(800);

    let abck: string | undefined;
    // Distinguishes "read worked and the cookie is absent" from "read threw".
    // Only the former is evidence that the site does not use _abck.
    let cookiesRead = false;
    try {
      abck = (await context.cookies()).find((c) => c.name === '_abck')?.value;
      cookiesRead = true;
    } catch {
      // best-effort — a failed read is retried on the next round instead of
      // being mistaken for an absent cookie.
    }
    // Absent _abck → site doesn't use Akamai's scheme; nothing to wait for.
    // The first two rounds are a grace period for the cookie to be set.
    if (cookiesRead && abck === undefined && i >= 2) return false;
    if (abckIsValidated(abck)) return true;
    i++;
  }
  return false;
}
|
|
473
|
+
|
|
474
|
+
export async function bootstrapStealthToken(args: BootstrapArgs): Promise<TokenCache> {
|
|
475
|
+
// Use the same stealth-patched chromium + full Chrome binary that
|
|
476
|
+
// runFetchBootstrap and runPlaybook use. The original implementation
|
|
477
|
+
// imported vanilla `playwright` with no executablePath, which defaults
|
|
478
|
+
// to chrome-headless-shell — a separate stripped-down binary that
|
|
479
|
+
// Akamai / Cloudflare / PerimeterX detect at the binary / TLS layer
|
|
480
|
+
// and RST the HTTP/2 stream immediately (verified empirically against
|
|
481
|
+
// www.costcotravel.com). Using the same binary as `imprint record`
|
|
482
|
+
// (Playwright's bundled "Google Chrome for Testing") makes Akamai
|
|
483
|
+
// accept the navigation and mint clean bot-cookies, just like the
|
|
484
|
+
// recording session did.
|
|
485
|
+
const { getStealthChromium, getStealthExecutablePath, isStealthPluginAvailable } = await import(
|
|
486
|
+
'./stealth-chromium.ts'
|
|
487
|
+
);
|
|
488
|
+
const chromium = await getStealthChromium();
|
|
489
|
+
const stealthActive = await isStealthPluginAvailable();
|
|
331
490
|
let browser: Browser | undefined;
|
|
332
491
|
try {
|
|
333
492
|
browser = await chromium.launch({
|
|
334
493
|
headless: !args.headed,
|
|
494
|
+
executablePath: getStealthExecutablePath(),
|
|
335
495
|
args: ['--disable-blink-features=AutomationControlled', '--no-sandbox'],
|
|
336
496
|
});
|
|
337
497
|
|
|
498
|
+
// Only override the UA when the caller explicitly asked for one. Otherwise
|
|
499
|
+
// let Chrome use its native UA: a forced UA does NOT change the client hints
|
|
500
|
+
// (sec-ch-ua) the browser emits, so pinning a stale UA string while the
|
|
501
|
+
// binary advertises its real version is a contradiction anti-bot services
|
|
502
|
+
// flag. Native UA + native hints are always self-consistent.
|
|
338
503
|
const context = await browser.newContext({
|
|
339
|
-
userAgent: args.userAgent,
|
|
504
|
+
...(args.userAgent ? { userAgent: args.userAgent } : {}),
|
|
340
505
|
viewport: { width: 1440, height: 900 },
|
|
341
506
|
screen: { width: 2560, height: 1440 },
|
|
342
507
|
locale: 'en-US',
|
|
@@ -344,14 +509,117 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
|
|
|
344
509
|
});
|
|
345
510
|
|
|
346
511
|
const page = await context.newPage();
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
512
|
+
// Patch navigator.webdriver ONLY on the vanilla-Playwright fallback. When the
|
|
513
|
+
// stealth plugin is active it already removes the property natively (a real
|
|
514
|
+
// Chrome lacks it); stacking our Object.defineProperty on top leaves a
|
|
515
|
+
// non-native descriptor that is itself a tell. See isStealthPluginAvailable.
|
|
516
|
+
if (!stealthActive) {
|
|
517
|
+
await page.addInitScript(() => {
|
|
518
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
|
519
|
+
});
|
|
520
|
+
}
|
|
350
521
|
|
|
522
|
+
// Navigate the bootstrap page (the workflow's bootstrap.url when set,
|
|
523
|
+
// else baseUrl). Loading the actual session-minting page here means the
|
|
524
|
+
// CSRF/nonce cookies it sets land in the SAME context as the anti-bot
|
|
525
|
+
// cookies — a later API POST that needs both will not be rejected for a
|
|
526
|
+
// session mismatch.
|
|
527
|
+
const navUrl = args.bootstrapUrl ?? args.baseUrl;
|
|
351
528
|
// 'domcontentloaded' (not 'networkidle') because SPAs keep connections
|
|
352
529
|
// alive forever; explicit sensor-wait lets bot-detection JS fire.
|
|
353
|
-
await page.goto(
|
|
354
|
-
|
|
530
|
+
const navResponse = await page.goto(navUrl, {
|
|
531
|
+
waitUntil: 'domcontentloaded',
|
|
532
|
+
timeout: 30000,
|
|
533
|
+
});
|
|
534
|
+
|
|
535
|
+
// Drive human-like interaction (mouse moves + scroll) while polling the
|
|
536
|
+
// Akamai _abck cookie until it VALIDATES. Akamai's sensor JS only flips
|
|
537
|
+
// _abck from "unvalidated" (`token~-1~…`) to "validated" (`token~0~…`)
|
|
538
|
+
// after it observes human behavioral signals; a bare navigate-and-idle
|
|
539
|
+
// bootstrap captures the `~-1~` cookie, and every later API POST that
|
|
540
|
+
// relies on it is silently tarpitted (RST after ~30s). Verified against
|
|
541
|
+
// www.costcotravel.com: a recorded human session shows _abck `~-1~`→`~0~`
|
|
542
|
+
// after the sensor POSTs, and synthetic mouse/scroll reproduces the flip
|
|
543
|
+
// in ~5s. This is the behavioral piece a real browser has and a headless
|
|
544
|
+
// replay lacks. General — any Akamai-protected site uses the same _abck
|
|
545
|
+
// state machine. Falls through after the wait window regardless, so a site
|
|
546
|
+
// that doesn't use _abck (the cookie is absent) is unaffected.
|
|
547
|
+
const abckValidated = await driveSensorValidation(
|
|
548
|
+
page,
|
|
549
|
+
context,
|
|
550
|
+
Math.max(args.sensorWaitSeconds, 20),
|
|
551
|
+
);
|
|
552
|
+
if (!abckValidated) {
|
|
553
|
+
log('_abck did not validate within the interaction window (continuing anyway)');
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Snapshot the bootstrap page HTML + response headers so callers can
|
|
557
|
+
// satisfy the workflow's html_regex / response_header bootstrap captures
|
|
558
|
+
// from this same stealth session (the cookies, HTML, and headers are all
|
|
559
|
+
// one consistent session — required for tokens the later API POST checks).
|
|
560
|
+
let bootstrapHtml: string | undefined;
|
|
561
|
+
try {
|
|
562
|
+
bootstrapHtml = await page.content();
|
|
563
|
+
} catch {
|
|
564
|
+
// best-effort
|
|
565
|
+
}
|
|
566
|
+
const bootstrapResponseHeaders: Record<string, string> = {};
|
|
567
|
+
if (navResponse) {
|
|
568
|
+
try {
|
|
569
|
+
const raw = await navResponse.allHeaders();
|
|
570
|
+
for (const [k, v] of Object.entries(raw)) bootstrapResponseHeaders[k.toLowerCase()] = v;
|
|
571
|
+
} catch {
|
|
572
|
+
// best-effort
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
// Capture the live browser's actual UA + client hints so the post-bootstrap
|
|
577
|
+
// fetches present the SAME identity that minted the cookies. Reading them
|
|
578
|
+
// from the page (rather than hardcoding) guarantees UA ↔ sec-ch-ua agree and
|
|
579
|
+
// never drift as the bundled Chrome updates.
|
|
580
|
+
// Hoisted out of the page.evaluate callback: TS types are erased before
|
|
581
|
+
// Playwright serializes the function, so the callback can reference them
|
|
582
|
+
// without breaking serialization — and keeping them flat avoids formatter
|
|
583
|
+
// churn on a deeply nested inline type.
|
|
584
|
+
type HighEntropy = {
|
|
585
|
+
platform?: string;
|
|
586
|
+
fullVersionList?: Array<{ brand: string; version: string }>;
|
|
587
|
+
};
|
|
588
|
+
type UserAgentData = {
|
|
589
|
+
brands?: Array<{ brand: string; version: string }>;
|
|
590
|
+
mobile?: boolean;
|
|
591
|
+
getHighEntropyValues?: (hints: string[]) => Promise<HighEntropy>;
|
|
592
|
+
};
|
|
593
|
+
let capturedUserAgent: string | undefined;
|
|
594
|
+
let clientHints: Record<string, string> | undefined;
|
|
595
|
+
try {
|
|
596
|
+
const captured = (await page.evaluate(async () => {
|
|
597
|
+
const ua = navigator.userAgent;
|
|
598
|
+
const d = (navigator as unknown as { userAgentData?: UserAgentData }).userAgentData;
|
|
599
|
+
let hints: Record<string, string> | null = null;
|
|
600
|
+
if (d && typeof d.getHighEntropyValues === 'function') {
|
|
601
|
+
try {
|
|
602
|
+
const he = await d.getHighEntropyValues(['fullVersionList', 'platform']);
|
|
603
|
+
const fmt = (list?: Array<{ brand: string; version: string }>) =>
|
|
604
|
+
(list ?? []).map((b) => `"${b.brand}";v="${b.version}"`).join(', ');
|
|
605
|
+
hints = {
|
|
606
|
+
'sec-ch-ua': fmt(d.brands),
|
|
607
|
+
'sec-ch-ua-mobile': d.mobile ? '?1' : '?0',
|
|
608
|
+
'sec-ch-ua-platform': `"${he.platform ?? ''}"`,
|
|
609
|
+
};
|
|
610
|
+
const fv = fmt(he.fullVersionList);
|
|
611
|
+
if (fv) hints['sec-ch-ua-full-version-list'] = fv;
|
|
612
|
+
} catch {
|
|
613
|
+
hints = null;
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
return { ua, hints };
|
|
617
|
+
})) as { ua: string; hints: Record<string, string> | null };
|
|
618
|
+
capturedUserAgent = captured.ua || undefined;
|
|
619
|
+
clientHints = captured.hints ?? undefined;
|
|
620
|
+
} catch {
|
|
621
|
+
// best-effort — fall back to DEFAULT_UA downstream
|
|
622
|
+
}
|
|
355
623
|
|
|
356
624
|
// Probe with known headers; any header we DIDN'T send was injected
|
|
357
625
|
// by the sensor — that's what we capture.
|
|
@@ -400,7 +668,10 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
|
|
|
400
668
|
// suffixes like .co.uk — it would match any cookie whose domain
|
|
401
669
|
// contained "co.uk".
|
|
402
670
|
const allCookies = await context.cookies();
|
|
403
|
-
|
|
671
|
+
// Scope to the navigated page's registrable domain — that's where the
|
|
672
|
+
// session-token cookies live (baseUrl may be an API subdomain that shares
|
|
673
|
+
// the same eTLD+1, so this still captures cross-subdomain cookies).
|
|
674
|
+
const origin = new URL(navUrl);
|
|
404
675
|
const root = registrableDomain(origin.hostname);
|
|
405
676
|
const cookies = allCookies
|
|
406
677
|
.filter((c) => {
|
|
@@ -409,7 +680,15 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
|
|
|
409
680
|
})
|
|
410
681
|
.map((c) => ({ name: c.name, value: c.value }));
|
|
411
682
|
|
|
412
|
-
return {
|
|
683
|
+
return {
|
|
684
|
+
cookies,
|
|
685
|
+
sensorHeaders,
|
|
686
|
+
bootstrappedAt: Date.now(),
|
|
687
|
+
bootstrapHtml,
|
|
688
|
+
bootstrapResponseHeaders,
|
|
689
|
+
userAgent: capturedUserAgent,
|
|
690
|
+
clientHints,
|
|
691
|
+
};
|
|
413
692
|
} finally {
|
|
414
693
|
await browser?.close().catch(() => {});
|
|
415
694
|
}
|
|
@@ -424,6 +703,7 @@ async function defaultUnderlyingFetch(
|
|
|
424
703
|
method: init.method ?? 'GET',
|
|
425
704
|
headers: init.headers,
|
|
426
705
|
body: init.body,
|
|
706
|
+
signal: init.signal,
|
|
427
707
|
});
|
|
428
708
|
const body = await resp.text();
|
|
429
709
|
const headers: Record<string, string> = {};
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File-backed stealth-fetch TokenCache, shared across compile-time `bun test`
|
|
3
|
+
* processes.
|
|
4
|
+
*
|
|
5
|
+
* Each integration / per-parameter test the compile agent writes runs in its own
|
|
6
|
+
* `bun test` process, and `runWorkflowWithLadder` otherwise mints a fresh stealth
|
|
7
|
+
* token (~12s headless Chromium bootstrap, see stealth-fetch.ts) every time. A
|
|
8
|
+
* multi-test gate run therefore fires a burst of bootstraps against one origin in
|
|
9
|
+
* seconds — exactly the pattern Akamai/PerimeterX flag, which forces the
|
|
10
|
+
* integration test to be waived. Persisting one token per site (keyed by the site
|
|
11
|
+
* asset dir) lets sibling processes reuse a single bootstrap, cutting both waivers
|
|
12
|
+
* and compile time.
|
|
13
|
+
*
|
|
14
|
+
* The file holds a live session token. It lives under ~/.imprint/<site>/ (never
|
|
15
|
+
* the repo) and is transient: stale entries are ignored on read, a malformed file
|
|
16
|
+
* is treated as absent, and a token that has gone bad self-heals via the
|
|
17
|
+
* 403 → re-bootstrap path in stealth-fetch.ts. `clearCachedToken` removes it when
|
|
18
|
+
* a site's teach run ends.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from 'node:fs';
|
|
22
|
+
import { join as pathJoin } from 'node:path';
|
|
23
|
+
import { createLog } from './log.ts';
|
|
24
|
+
import type { TokenCache } from './stealth-fetch.ts';
|
|
25
|
+
|
|
26
|
+
// Module-level logger created with the 'stealth-cache' label; it is called
// below with a single message string for stale-token and persist-failure notices.
const log = createLog('stealth-cache');
|
|
27
|
+
|
|
28
|
+
/** Basename of the token file kept inside each site dir. */
const TOKEN_FILE = '.stealth-token.json';

/**
 * Resolve where the token file for a site lives.
 *
 * @param siteDir - Directory that holds the site's assets.
 * @returns `siteDir` joined with the token file basename.
 */
function tokenPath(siteDir: string): string {
  const location = pathJoin(siteDir, TOKEN_FILE);
  return location;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Load the cached token for a site dir.
 *
 * Returns `null` when the file is absent, unreadable, not valid JSON, missing
 * any of the required fields (`cookies`, `sensorHeaders`, `bootstrappedAt`),
 * or at least `maxAgeSeconds` old.
 *
 * The `userAgent` and `clientHints` captured at bootstrap are restored when
 * present and well-formed, so a sibling process that reuses the token can send
 * the same browser identity the token was minted with instead of a default UA.
 *
 * @param siteDir - Directory that holds the site's token file.
 * @param maxAgeSeconds - Tokens this old (in seconds) or older are ignored.
 * @returns The cached token, or `null` if there is nothing usable.
 */
export function loadCachedToken(siteDir: string, maxAgeSeconds: number): TokenCache | null {
  const p = tokenPath(siteDir);
  if (!existsSync(p)) return null;
  try {
    const raw = JSON.parse(readFileSync(p, 'utf8')) as Partial<TokenCache> | null;
    if (
      !raw ||
      !Array.isArray(raw.cookies) ||
      typeof raw.sensorHeaders !== 'object' ||
      raw.sensorHeaders === null ||
      typeof raw.bootstrappedAt !== 'number' ||
      // JSON such as `1e999` parses to Infinity, which would never look stale.
      !Number.isFinite(raw.bootstrappedAt)
    ) {
      return null;
    }
    const ageSeconds = (Date.now() - raw.bootstrappedAt) / 1000;
    if (ageSeconds >= maxAgeSeconds) {
      log(
        `cached token in ${siteDir} is ${Math.round(ageSeconds)}s old (>= ${maxAgeSeconds}s) — ignoring`,
      );
      return null;
    }

    // Optional identity fields: keep them only when they have the expected
    // shape; anything else is treated as "not captured".
    const userAgent =
      typeof raw.userAgent === 'string' && raw.userAgent !== '' ? raw.userAgent : undefined;
    const clientHints =
      typeof raw.clientHints === 'object' &&
      raw.clientHints !== null &&
      !Array.isArray(raw.clientHints)
        ? raw.clientHints
        : undefined;

    // NOTE(review): bootstrapHtml / bootstrapResponseHeaders are written by
    // saveCachedToken but still not restored here — confirm whether sibling
    // processes need them.
    return {
      cookies: raw.cookies,
      sensorHeaders: raw.sensorHeaders,
      bootstrappedAt: raw.bootstrappedAt,
      ...(userAgent !== undefined ? { userAgent } : {}),
      ...(clientHints !== undefined ? { clientHints } : {}),
    };
  } catch {
    // Unreadable or malformed file: treat as absent.
    return null;
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Persist a token for a site dir. Best-effort: failures are logged, never thrown.
 *
 * The token is serialized to a per-process temp file next to the target and
 * then renamed over it, so readers never observe a partially written file.
 * The file is created owner-read/write only because it holds a live session
 * token. If the write or rename fails, the temp file is removed so failed
 * attempts do not leave token copies behind.
 *
 * @param siteDir - Directory that holds the site's token file (created if missing).
 * @param token - Token to serialize as JSON.
 */
export function saveCachedToken(siteDir: string, token: TokenCache): void {
  let tmp: string | undefined;
  try {
    mkdirSync(siteDir, { recursive: true });
    const p = tokenPath(siteDir);
    tmp = `${p}.${process.pid}.tmp`;
    writeFileSync(tmp, `${JSON.stringify(token)}\n`, { encoding: 'utf8', mode: 0o600 });
    renameSync(tmp, p);
  } catch (err) {
    if (tmp !== undefined) {
      try {
        // Don't leave a stray copy of the session token on disk.
        rmSync(tmp, { force: true });
      } catch {
        // best-effort cleanup
      }
    }
    log(
      `failed to persist stealth token to ${siteDir}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Delete the cached token file for a site dir, if any.
 *
 * Best-effort: a missing file is not an error (`force: true`), and any other
 * failure is swallowed. Intended to be called when a site's teach run ends.
 *
 * @param siteDir - Directory that holds the site's token file.
 */
export function clearCachedToken(siteDir: string): void {
  try {
    const target = tokenPath(siteDir);
    rmSync(target, { force: true });
  } catch {
    // best-effort
  }
}
|