imprint-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/README.md +165 -201
  2. package/examples/discoverandgo/README.md +1 -1
  3. package/examples/echo/README.md +1 -1
  4. package/examples/google-flights/README.md +28 -0
  5. package/examples/google-flights/_shared/batchexecute.ts +63 -0
  6. package/examples/google-flights/_shared/flights_request.ts +95 -0
  7. package/examples/google-flights/_shared/package.json +9 -0
  8. package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
  9. package/examples/google-flights/get_flight_booking_details/package.json +9 -0
  10. package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
  11. package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
  12. package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
  13. package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
  14. package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
  15. package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
  16. package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
  17. package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
  18. package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
  19. package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
  20. package/examples/google-flights/lookup_airport/index.ts +101 -0
  21. package/examples/google-flights/lookup_airport/package.json +9 -0
  22. package/examples/google-flights/lookup_airport/parser.ts +66 -0
  23. package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
  24. package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
  25. package/examples/google-flights/lookup_airport/workflow.json +57 -0
  26. package/examples/google-flights/search_flights/index.ts +219 -0
  27. package/examples/google-flights/search_flights/package.json +9 -0
  28. package/examples/google-flights/search_flights/parser.ts +169 -0
  29. package/examples/google-flights/search_flights/playbook.yaml +184 -0
  30. package/examples/google-flights/search_flights/request-transform.ts +119 -0
  31. package/examples/google-flights/search_flights/workflow.json +143 -0
  32. package/examples/google-hotels/README.md +29 -0
  33. package/examples/google-hotels/_shared/batchexecute.ts +73 -0
  34. package/examples/google-hotels/_shared/freq.ts +158 -0
  35. package/examples/google-hotels/_shared/package.json +9 -0
  36. package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
  37. package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
  38. package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
  39. package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
  40. package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
  41. package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
  42. package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
  43. package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
  44. package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
  45. package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
  46. package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
  47. package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
  48. package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
  49. package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
  50. package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
  51. package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
  52. package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
  53. package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
  54. package/examples/google-hotels/search_hotels/index.ts +207 -0
  55. package/examples/google-hotels/search_hotels/package.json +9 -0
  56. package/examples/google-hotels/search_hotels/parser.ts +260 -0
  57. package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
  58. package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
  59. package/examples/google-hotels/search_hotels/workflow.json +127 -0
  60. package/package.json +3 -2
  61. package/prompts/audit-agent.md +71 -0
  62. package/prompts/build-planning.md +74 -0
  63. package/prompts/compile-agent.md +132 -28
  64. package/prompts/prereq-builder.md +64 -0
  65. package/prompts/prereq-planner.md +34 -0
  66. package/prompts/tool-planning.md +39 -0
  67. package/src/cli.ts +111 -4
  68. package/src/imprint/agent.ts +5 -0
  69. package/src/imprint/audit.ts +996 -0
  70. package/src/imprint/backend-ladder.ts +1214 -184
  71. package/src/imprint/build-plan.ts +1051 -0
  72. package/src/imprint/cdp-browser-fetch.ts +589 -0
  73. package/src/imprint/cdp-jar-cache.ts +320 -0
  74. package/src/imprint/chromium.ts +135 -0
  75. package/src/imprint/claude-cli-compile.ts +125 -25
  76. package/src/imprint/codex-cli-compile.ts +26 -23
  77. package/src/imprint/compile-agent-types.ts +38 -0
  78. package/src/imprint/compile-agent.ts +65 -27
  79. package/src/imprint/compile-tools.ts +1656 -64
  80. package/src/imprint/compile.ts +14 -2
  81. package/src/imprint/concurrency.ts +87 -0
  82. package/src/imprint/credential-extract.ts +174 -25
  83. package/src/imprint/cron.ts +1 -0
  84. package/src/imprint/doctor.ts +39 -0
  85. package/src/imprint/emit.ts +85 -0
  86. package/src/imprint/freeform-redact.ts +5 -4
  87. package/src/imprint/integrations.ts +2 -2
  88. package/src/imprint/llm.ts +56 -8
  89. package/src/imprint/mcp-compile-server.ts +43 -10
  90. package/src/imprint/mcp-maintenance.ts +9 -101
  91. package/src/imprint/mcp-server.ts +73 -7
  92. package/src/imprint/multi-progress.ts +7 -2
  93. package/src/imprint/param-grounding.ts +367 -0
  94. package/src/imprint/paths.ts +29 -0
  95. package/src/imprint/playbook-runner.ts +101 -40
  96. package/src/imprint/prereq-builder.ts +651 -0
  97. package/src/imprint/probe-backends.ts +6 -3
  98. package/src/imprint/record.ts +10 -1
  99. package/src/imprint/redact.ts +30 -2
  100. package/src/imprint/replay-capture.ts +19 -18
  101. package/src/imprint/runtime.ts +19 -10
  102. package/src/imprint/sensitive-keys.ts +141 -7
  103. package/src/imprint/session-diff.ts +79 -2
  104. package/src/imprint/session-merge.ts +9 -5
  105. package/src/imprint/stealth-chromium.ts +81 -0
  106. package/src/imprint/stealth-fetch.ts +309 -29
  107. package/src/imprint/stealth-token-cache.ts +88 -0
  108. package/src/imprint/teach-plan.ts +251 -0
  109. package/src/imprint/teach-state.ts +17 -0
  110. package/src/imprint/teach.ts +582 -147
  111. package/src/imprint/tool-candidates.ts +72 -14
  112. package/src/imprint/tool-plan.ts +313 -0
  113. package/src/imprint/tracing.ts +135 -6
  114. package/src/imprint/types.ts +61 -3
  115. package/examples/google-flights/search_google_flights/index.ts +0 -101
  116. package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
  117. package/examples/google-flights/search_google_flights/parser.ts +0 -189
  118. package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
  119. package/examples/google-flights/search_google_flights/workflow.json +0 -48
  120. package/examples/google-hotels/search_google_hotels/index.ts +0 -194
  121. package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
  122. package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
  123. package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
  124. package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
  125. package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
  126. package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
  127. package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
  128. package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
  129. package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
@@ -10,13 +10,20 @@
10
10
  * ~12s bootstrap one-time, ~1s per API call after.
11
11
  */
12
12
 
13
- import type { Browser } from 'playwright';
13
+ import type { Browser, BrowserContext, Page } from 'playwright';
14
14
  import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
15
15
  import { createLog } from './log.ts';
16
16
 
17
17
  export interface StealthFetchOptions {
18
18
  /** Homepage URL to load during bootstrap (triggers bot-detection JS). */
19
19
  baseUrl: string;
20
+ /** URL to navigate during bootstrap, when it differs from baseUrl. Set this
21
+ * to the workflow's `bootstrap.url` so the same stealth session that mints
22
+ * anti-bot cookies (_abck etc.) ALSO loads the page that sets session
23
+ * tokens (CSRF cookies, nonces) — those tokens and the bot-cookies must
24
+ * come from ONE session or the site rejects the later API POST on a
25
+ * session mismatch. Defaults to baseUrl. */
26
+ bootstrapUrl?: string;
20
27
  /** Seconds to wait after page load for sensor initialization. Default 3. */
21
28
  sensorWaitSeconds?: number;
22
29
  /** Launch headed for debugging. Default false. */
@@ -42,6 +49,11 @@ export interface FetchInit {
42
49
  * a 403 retry — callers that need retry-after-bot-bootstrap should
43
50
  * pass a string, Blob, ArrayBuffer, FormData, or URLSearchParams. */
44
51
  body?: RequestInit['body'];
52
+ /** Abort signal from the caller (e.g. executeWorkflow's per-request timeout
53
+ * AbortController). MUST be forwarded to the underlying fetch — without it a
54
+ * tarpitting anti-bot endpoint hangs far past the caller's timeout (observed
55
+ * ~272s on Akamai) instead of aborting promptly so the ladder can escalate. */
56
+ signal?: AbortSignal;
45
57
  }
46
58
 
47
59
  interface FetchResult {
@@ -55,11 +67,34 @@ export interface TokenCache {
55
67
  cookies: Array<{ name: string; value: string }>;
56
68
  sensorHeaders: Record<string, string>;
57
69
  bootstrappedAt: number;
70
+ /** HTML of the bootstrap navigation, so callers can satisfy a workflow's
71
+ * `html_regex` bootstrap captures from the same session. Optional —
72
+ * absent on caches minted before this field existed. */
73
+ bootstrapHtml?: string;
74
+ /** Lower-cased response headers of the bootstrap navigation, so callers can
75
+ * satisfy `response_header` bootstrap captures. Optional. */
76
+ bootstrapResponseHeaders?: Record<string, string>;
77
+ /** The bootstrap browser's actual `navigator.userAgent`, captured live. Reused
78
+ * for the post-bootstrap fetches so the wire UA matches the binary that minted
79
+ * the cookies (and its client hints below). Absent if capture failed or on
80
+ * caches minted before this field existed → caller falls back to DEFAULT_UA. */
81
+ userAgent?: string;
82
+ /** Lower-cased `sec-ch-ua*` client-hint headers derived from the bootstrap
83
+ * browser's `navigator.userAgentData`, so the post-bootstrap fetch can send
84
+ * client hints consistent with `userAgent`. Absent when the browser doesn't
85
+ * expose userAgentData (non-secure context / non-Chromium). */
86
+ clientHints?: Record<string, string>;
58
87
  }
59
88
 
60
89
  export interface StealthFetch {
61
90
  /** typeof fetch wrapper that auto-bootstraps + adds sensor headers. */
62
91
  readonly fetchImpl: typeof fetch;
92
+ /** Force the bootstrap navigation now (if not already done) and return the
93
+ * token cache — including the cookies minted during the navigation. Callers
94
+ * use this to read session-token cookies (CSRF etc.) set by the bootstrap
95
+ * page and feed them into the workflow as `${state.X}`, in the SAME session
96
+ * as the transport cookies. */
97
+ ensureBootstrapped(): Promise<TokenCache>;
63
98
  /** Drop cached tokens; next fetch re-bootstraps. */
64
99
  invalidate(): void;
65
100
  /** Token age in seconds; -1 if not bootstrapped yet. */
@@ -74,10 +109,16 @@ export interface StealthFetch {
74
109
  close(): Promise<void>;
75
110
  }
76
111
 
77
- interface BootstrapArgs {
112
+ export interface BootstrapArgs {
78
113
  baseUrl: string;
114
+ /** Page to navigate during bootstrap (for session-token cookies). Defaults
115
+ * to baseUrl when absent. */
116
+ bootstrapUrl?: string;
79
117
  probeUrl?: string;
80
- userAgent: string;
118
+ /** Force a specific UA on the bootstrap browser. Omit (the default) to let
119
+ * Chrome use its NATIVE UA — which is always self-consistent with the client
120
+ * hints it emits. Only set this when a caller explicitly needs a custom UA. */
121
+ userAgent?: string;
81
122
  headed: boolean;
82
123
  sensorWaitSeconds: number;
83
124
  }
@@ -92,8 +133,20 @@ interface StealthFetchInternals {
92
133
  underlyingFetch?: (url: string, init: FetchInit, tokens: TokenCache) => Promise<FetchResult>;
93
134
  }
94
135
 
136
+ /**
137
+ * Last-resort User-Agent, used ONLY when the bootstrap browser couldn't report
138
+ * its own UA and the caller didn't force one. The real path captures the live
139
+ * browser's actual `navigator.userAgent` during bootstrap (see
140
+ * `bootstrapStealthToken`) and reuses THAT for the post-bootstrap fetches, so
141
+ * the UA on the wire always matches the binary's own client hints (sec-ch-ua).
142
+ *
143
+ * A hardcoded UA is dangerous precisely because it drifts: a stale major
144
+ * version (e.g. Chrome/131) paired with the live binary's client hints
145
+ * (Chrome/148) is a contradiction no real browser emits — a textbook anti-bot
146
+ * tell. Keep this roughly current as a floor, but the capture is what ships.
147
+ */
95
148
  const DEFAULT_UA =
96
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
149
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36';
97
150
 
98
151
  /** Standard headers the runtime sets — anything outbound NOT in this set
99
152
  * was injected by sensor JS and is what we capture for replay. */
@@ -136,14 +189,18 @@ export function createStealthFetch(
136
189
  const o = typeof optsOrUrl === 'string' ? { baseUrl: optsOrUrl } : optsOrUrl;
137
190
  const opts = {
138
191
  baseUrl: o.baseUrl,
192
+ bootstrapUrl: o.bootstrapUrl ?? o.baseUrl,
139
193
  sensorWaitSeconds: o.sensorWaitSeconds ?? 3,
140
194
  headed: o.headed ?? false,
141
- userAgent: o.userAgent ?? DEFAULT_UA,
195
+ // Undefined unless the caller forces a UA. Letting it stay undefined makes
196
+ // the bootstrap browser use its native UA (self-consistent with its client
197
+ // hints); we then capture that real UA and reuse it for the fetches.
198
+ userAgent: o.userAgent,
142
199
  maxRetries: o.maxRetries ?? 1,
143
200
  maxTokenAgeSeconds: o.maxTokenAgeSeconds ?? 600,
144
201
  maxConsecutiveFailures: o.maxConsecutiveFailures ?? 3,
145
202
  };
146
- const bootstrapFn = internals?.bootstrap ?? defaultBootstrap;
203
+ const bootstrapFn = internals?.bootstrap ?? bootstrapStealthToken;
147
204
  const underlyingFetchFn = internals?.underlyingFetch ?? defaultUnderlyingFetch;
148
205
 
149
206
  let tokens: TokenCache | null = null;
@@ -164,6 +221,7 @@ export function createStealthFetch(
164
221
  log('bootstrapping…');
165
222
  tokens = await bootstrapFn({
166
223
  baseUrl: opts.baseUrl,
224
+ bootstrapUrl: opts.bootstrapUrl,
167
225
  probeUrl,
168
226
  userAgent: opts.userAgent,
169
227
  headed: opts.headed,
@@ -183,24 +241,48 @@ export function createStealthFetch(
183
241
  const t = tokens;
184
242
  if (!t) throw new Error('No tokens (bootstrap failed?)');
185
243
  const { headers: initHeaders, cookieHeader } = splitCookieHeader(init?.headers ?? {});
244
+ // Defaults that yield to the caller's initHeaders (and the workflow's
245
+ // recorded headers that flow through them). Keys are lowercase to
246
+ // match what the public `fetchImpl` wrapper normalizes everything to
247
+ // (via `new Headers().forEach`) — a mixed-case merge would silently
248
+ // duplicate both `Accept` and `accept` in the final headers and the
249
+ // caller's override would never actually win.
250
+ //
251
+ // Content-Type intentionally depends on whether the request actually
252
+ // has a body — sending Content-Type: application/json on a body-less
253
+ // GET is anti-bot suspicious (real browsers don't do it) and was
254
+ // contributing to Akamai tarpits on HTML bootstrap GETs from this rung.
255
+ const hasBody = init?.body !== undefined && init?.body !== null;
256
+ // UA precedence: an explicit caller override (also used for the bootstrap
257
+ // context) → the UA the bootstrap browser actually reported → the stale
258
+ // fallback. The captured value keeps the fetch UA matching the binary that
259
+ // minted the cookies.
260
+ const ua = opts.userAgent ?? t.userAgent ?? DEFAULT_UA;
261
+ const defaultHeaders: Record<string, string> = {
262
+ 'user-agent': ua,
263
+ accept: 'application/json, text/javascript, */*; q=0.01',
264
+ cookie: mergeCookieHeader(
265
+ t.cookies.map((c) => `${c.name}=${c.value}`).join('; '),
266
+ cookieHeader,
267
+ ),
268
+ origin: new URL(fullUrl).origin,
269
+ referer: opts.baseUrl,
270
+ ...t.sensorHeaders,
271
+ };
272
+ // Send client hints consistent with the UA. Only when we're NOT forcing a
273
+ // custom UA: the captured hints reflect the browser's native UA, so pairing
274
+ // them with an override would reintroduce the UA/hints contradiction we fix.
275
+ if (!opts.userAgent && t.clientHints) {
276
+ for (const [k, v] of Object.entries(t.clientHints)) defaultHeaders[k] = v;
277
+ }
278
+ if (hasBody) defaultHeaders['content-type'] = 'application/json';
186
279
  const result = await underlyingFetchFn(
187
280
  fullUrl,
188
281
  {
189
282
  method: init?.method ?? 'GET',
190
- headers: {
191
- 'User-Agent': opts.userAgent,
192
- Accept: 'application/json, text/javascript, */*; q=0.01',
193
- 'Content-Type': 'application/json',
194
- Cookie: mergeCookieHeader(
195
- t.cookies.map((c) => `${c.name}=${c.value}`).join('; '),
196
- cookieHeader,
197
- ),
198
- Origin: new URL(fullUrl).origin,
199
- Referer: opts.baseUrl,
200
- ...t.sensorHeaders,
201
- ...initHeaders,
202
- },
283
+ headers: { ...defaultHeaders, ...initHeaders },
203
284
  body: init?.body,
285
+ signal: init?.signal,
204
286
  },
205
287
  t,
206
288
  );
@@ -261,6 +343,9 @@ export function createStealthFetch(
261
343
  // accepted shape (string, Blob, ArrayBuffer, FormData, URLSearchParams,
262
344
  // ReadableStream). Previously we dropped any non-string body silently.
263
345
  body: init?.body ?? undefined,
346
+ // Forward the caller's abort signal (per-request timeout) — without it a
347
+ // tarpitting endpoint hangs far past the timeout instead of escalating.
348
+ signal: init?.signal ?? undefined,
264
349
  });
265
350
  return new Response(result.body, {
266
351
  status: result.status,
@@ -270,6 +355,11 @@ export function createStealthFetch(
270
355
 
271
356
  return {
272
357
  fetchImpl,
358
+ async ensureBootstrapped(): Promise<TokenCache> {
359
+ await ensureTokens();
360
+ if (!tokens) throw new Error('stealth bootstrap produced no tokens');
361
+ return tokens;
362
+ },
273
363
  invalidate(): void {
274
364
  tokens = null;
275
365
  consecutiveFailures = 0;
@@ -325,18 +415,93 @@ function mergeCookieHeader(browserCookie: string, runtimeCookie: string | undefi
325
415
  * `baseUrl`, lets the bot-detection JS run, captures the resulting
326
416
  * cookies + sensor-injected headers via a route interceptor on a probe
327
417
  * request, closes the browser. Returns a fresh TokenCache.
418
+ *
419
+ * Exported so the compile-time token cache (stealth-token-cache.ts) can mint a
420
+ * token to persist + share across `bun test` processes without re-implementing
421
+ * the Playwright bootstrap.
328
422
  */
329
- async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
330
- const { chromium } = await import('playwright');
423
/**
 * Decide whether an Akamai `_abck` cookie value is in its validated state.
 *
 * The value is `~`-delimited (`<token>~<status>~…`). Only the second field is
 * inspected: `0` is treated as sensor-validated; anything else (for example
 * `-1`, or a value with no second field) is treated as not yet validated.
 *
 * @param cookieValue - raw `_abck` cookie value, or `undefined` when absent.
 * @returns `true` only when the status field is exactly `'0'`.
 */
function abckIsValidated(cookieValue: string | undefined): boolean {
  // A missing or empty value yields no fields, so the status lookup below
  // simply comes back undefined.
  const fields = cookieValue ? cookieValue.split('~') : [];
  const status = fields[1];
  return status === '0';
}
430
+
431
/**
 * Generate synthetic pointer movement and scrolling on `page` while polling
 * the cookie jar of `context` for `_abck`, until that cookie reports the
 * validated state or the time budget is used up.
 *
 * Each round: move the mouse along a deterministic, viewport-bounded path,
 * scroll on every third round, wait 800 ms, then read `_abck`.
 *
 * @param page - page that receives the mouse moves and scrolls.
 * @param context - browser context whose cookies are polled.
 * @param maxSeconds - time budget in seconds, checked before each round.
 * @returns `true` as soon as `abckIsValidated` accepts the cookie. `false`
 *   when the budget runs out, or as soon as the cookie is still missing on the
 *   third (or any later) poll — a page that never sets `_abck` is not waited
 *   on for the whole budget.
 */
async function driveSensorValidation(
  page: Page,
  context: BrowserContext,
  maxSeconds: number,
): Promise<boolean> {
  const budgetMs = maxSeconds * 1000;
  const startedAt = Date.now();

  for (let round = 0; Date.now() - startedAt < budgetMs; round++) {
    // Pointer path is a pure function of the round number, so successive
    // rounds land on different in-viewport coordinates instead of teleporting
    // to a single spot.
    const pointerX = 80 + ((round * 137) % 1200);
    const pointerY = 120 + ((round * 89) % 640);
    try {
      await page.mouse.move(pointerX, pointerY, { steps: 4 });
      const scrollThisRound = round % 3 === 0;
      if (scrollThisRound) {
        const scrollDelta = 100 + (round % 5) * 40;
        await page.evaluate((dy: number) => {
          (globalThis as unknown as { scrollBy: (x: number, y: number) => void }).scrollBy(0, dy);
        }, scrollDelta);
      }
    } catch {
      // The page can navigate or close while we interact; that is not fatal.
    }

    await page.waitForTimeout(800);

    let abckValue: string | undefined;
    try {
      const jar = await context.cookies();
      abckValue = jar.find((cookie) => cookie.name === '_abck')?.value;
    } catch {
      // Best-effort read; a failure is treated like a missing cookie.
    }

    // Still no `_abck` by the third poll: stop waiting for a flip.
    if (abckValue === undefined && round >= 2) {
      return false;
    }
    if (abckIsValidated(abckValue)) {
      return true;
    }
  }

  return false;
}
473
+
474
+ export async function bootstrapStealthToken(args: BootstrapArgs): Promise<TokenCache> {
475
+ // Use the same stealth-patched chromium + full Chrome binary that
476
+ // runFetchBootstrap and runPlaybook use. The original implementation
477
+ // imported vanilla `playwright` with no executablePath, which defaults
478
+ // to chrome-headless-shell — a separate stripped-down binary that
479
+ // Akamai / Cloudflare / PerimeterX detect at the binary / TLS layer
480
+ // and RST the HTTP/2 stream immediately (verified empirically against
481
+ // www.costcotravel.com). Using the same binary as `imprint record`
482
+ // (Playwright's bundled "Google Chrome for Testing") makes Akamai
483
+ // accept the navigation and mint clean bot-cookies, just like the
484
+ // recording session did.
485
+ const { getStealthChromium, getStealthExecutablePath, isStealthPluginAvailable } = await import(
486
+ './stealth-chromium.ts'
487
+ );
488
+ const chromium = await getStealthChromium();
489
+ const stealthActive = await isStealthPluginAvailable();
331
490
  let browser: Browser | undefined;
332
491
  try {
333
492
  browser = await chromium.launch({
334
493
  headless: !args.headed,
494
+ executablePath: getStealthExecutablePath(),
335
495
  args: ['--disable-blink-features=AutomationControlled', '--no-sandbox'],
336
496
  });
337
497
 
498
+ // Only override the UA when the caller explicitly asked for one. Otherwise
499
+ // let Chrome use its native UA: a forced UA does NOT change the client hints
500
+ // (sec-ch-ua) the browser emits, so pinning a stale UA string while the
501
+ // binary advertises its real version is a contradiction anti-bot services
502
+ // flag. Native UA + native hints are always self-consistent.
338
503
  const context = await browser.newContext({
339
- userAgent: args.userAgent,
504
+ ...(args.userAgent ? { userAgent: args.userAgent } : {}),
340
505
  viewport: { width: 1440, height: 900 },
341
506
  screen: { width: 2560, height: 1440 },
342
507
  locale: 'en-US',
@@ -344,14 +509,117 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
344
509
  });
345
510
 
346
511
  const page = await context.newPage();
347
- await page.addInitScript(() => {
348
- Object.defineProperty(navigator, 'webdriver', { get: () => false });
349
- });
512
+ // Patch navigator.webdriver ONLY on the vanilla-Playwright fallback. When the
513
+ // stealth plugin is active it already removes the property natively (a real
514
+ // Chrome lacks it); stacking our Object.defineProperty on top leaves a
515
+ // non-native descriptor that is itself a tell. See isStealthPluginAvailable.
516
+ if (!stealthActive) {
517
+ await page.addInitScript(() => {
518
+ Object.defineProperty(navigator, 'webdriver', { get: () => false });
519
+ });
520
+ }
350
521
 
522
+ // Navigate the bootstrap page (the workflow's bootstrap.url when set,
523
+ // else baseUrl). Loading the actual session-minting page here means the
524
+ // CSRF/nonce cookies it sets land in the SAME context as the anti-bot
525
+ // cookies — a later API POST that needs both will not be rejected for a
526
+ // session mismatch.
527
+ const navUrl = args.bootstrapUrl ?? args.baseUrl;
351
528
  // 'domcontentloaded' (not 'networkidle') because SPAs keep connections
352
529
  // alive forever; explicit sensor-wait lets bot-detection JS fire.
353
- await page.goto(args.baseUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
354
- await page.waitForTimeout(args.sensorWaitSeconds * 1000);
530
+ const navResponse = await page.goto(navUrl, {
531
+ waitUntil: 'domcontentloaded',
532
+ timeout: 30000,
533
+ });
534
+
535
+ // Drive human-like interaction (mouse moves + scroll) while polling the
536
+ // Akamai _abck cookie until it VALIDATES. Akamai's sensor JS only flips
537
+ // _abck from "unvalidated" (`token~-1~…`) to "validated" (`token~0~…`)
538
+ // after it observes human behavioral signals; a bare navigate-and-idle
539
+ // bootstrap captures the `~-1~` cookie, and every later API POST that
540
+ // relies on it is silently tarpitted (RST after ~30s). Verified against
541
+ // www.costcotravel.com: a recorded human session shows _abck `~-1~`→`~0~`
542
+ // after the sensor POSTs, and synthetic mouse/scroll reproduces the flip
543
+ // in ~5s. This is the behavioral piece a real browser has and a headless
544
+ // replay lacks. General — any Akamai-protected site uses the same _abck
545
+ // state machine. Falls through after the wait window regardless, so a site
546
+ // that doesn't use _abck (the cookie is absent) is unaffected.
547
+ const abckValidated = await driveSensorValidation(
548
+ page,
549
+ context,
550
+ Math.max(args.sensorWaitSeconds, 20),
551
+ );
552
+ if (!abckValidated) {
553
+ log('_abck did not validate within the interaction window (continuing anyway)');
554
+ }
555
+
556
+ // Snapshot the bootstrap page HTML + response headers so callers can
557
+ // satisfy the workflow's html_regex / response_header bootstrap captures
558
+ // from this same stealth session (the cookies, HTML, and headers are all
559
+ // one consistent session — required for tokens the later API POST checks).
560
+ let bootstrapHtml: string | undefined;
561
+ try {
562
+ bootstrapHtml = await page.content();
563
+ } catch {
564
+ // best-effort
565
+ }
566
+ const bootstrapResponseHeaders: Record<string, string> = {};
567
+ if (navResponse) {
568
+ try {
569
+ const raw = await navResponse.allHeaders();
570
+ for (const [k, v] of Object.entries(raw)) bootstrapResponseHeaders[k.toLowerCase()] = v;
571
+ } catch {
572
+ // best-effort
573
+ }
574
+ }
575
+
576
+ // Capture the live browser's actual UA + client hints so the post-bootstrap
577
+ // fetches present the SAME identity that minted the cookies. Reading them
578
+ // from the page (rather than hardcoding) guarantees UA ↔ sec-ch-ua agree and
579
+ // never drift as the bundled Chrome updates.
580
+ // Hoisted out of the page.evaluate callback: TS types are erased before
581
+ // Playwright serializes the function, so the callback can reference them
582
+ // without breaking serialization — and keeping them flat avoids formatter
583
+ // churn on a deeply nested inline type.
584
+ type HighEntropy = {
585
+ platform?: string;
586
+ fullVersionList?: Array<{ brand: string; version: string }>;
587
+ };
588
+ type UserAgentData = {
589
+ brands?: Array<{ brand: string; version: string }>;
590
+ mobile?: boolean;
591
+ getHighEntropyValues?: (hints: string[]) => Promise<HighEntropy>;
592
+ };
593
+ let capturedUserAgent: string | undefined;
594
+ let clientHints: Record<string, string> | undefined;
595
+ try {
596
+ const captured = (await page.evaluate(async () => {
597
+ const ua = navigator.userAgent;
598
+ const d = (navigator as unknown as { userAgentData?: UserAgentData }).userAgentData;
599
+ let hints: Record<string, string> | null = null;
600
+ if (d && typeof d.getHighEntropyValues === 'function') {
601
+ try {
602
+ const he = await d.getHighEntropyValues(['fullVersionList', 'platform']);
603
+ const fmt = (list?: Array<{ brand: string; version: string }>) =>
604
+ (list ?? []).map((b) => `"${b.brand}";v="${b.version}"`).join(', ');
605
+ hints = {
606
+ 'sec-ch-ua': fmt(d.brands),
607
+ 'sec-ch-ua-mobile': d.mobile ? '?1' : '?0',
608
+ 'sec-ch-ua-platform': `"${he.platform ?? ''}"`,
609
+ };
610
+ const fv = fmt(he.fullVersionList);
611
+ if (fv) hints['sec-ch-ua-full-version-list'] = fv;
612
+ } catch {
613
+ hints = null;
614
+ }
615
+ }
616
+ return { ua, hints };
617
+ })) as { ua: string; hints: Record<string, string> | null };
618
+ capturedUserAgent = captured.ua || undefined;
619
+ clientHints = captured.hints ?? undefined;
620
+ } catch {
621
+ // best-effort — fall back to DEFAULT_UA downstream
622
+ }
355
623
 
356
624
  // Probe with known headers; any header we DIDN'T send was injected
357
625
  // by the sensor — that's what we capture.
@@ -400,7 +668,10 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
400
668
  // suffixes like .co.uk — it would match any cookie whose domain
401
669
  // contained "co.uk".
402
670
  const allCookies = await context.cookies();
403
- const origin = new URL(args.baseUrl);
671
+ // Scope to the navigated page's registrable domain — that's where the
672
+ // session-token cookies live (baseUrl may be an API subdomain that shares
673
+ // the same eTLD+1, so this still captures cross-subdomain cookies).
674
+ const origin = new URL(navUrl);
404
675
  const root = registrableDomain(origin.hostname);
405
676
  const cookies = allCookies
406
677
  .filter((c) => {
@@ -409,7 +680,15 @@ async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
409
680
  })
410
681
  .map((c) => ({ name: c.name, value: c.value }));
411
682
 
412
- return { cookies, sensorHeaders, bootstrappedAt: Date.now() };
683
+ return {
684
+ cookies,
685
+ sensorHeaders,
686
+ bootstrappedAt: Date.now(),
687
+ bootstrapHtml,
688
+ bootstrapResponseHeaders,
689
+ userAgent: capturedUserAgent,
690
+ clientHints,
691
+ };
413
692
  } finally {
414
693
  await browser?.close().catch(() => {});
415
694
  }
@@ -424,6 +703,7 @@ async function defaultUnderlyingFetch(
424
703
  method: init.method ?? 'GET',
425
704
  headers: init.headers,
426
705
  body: init.body,
706
+ signal: init.signal,
427
707
  });
428
708
  const body = await resp.text();
429
709
  const headers: Record<string, string> = {};
@@ -0,0 +1,88 @@
1
+ /**
2
+ * File-backed stealth-fetch TokenCache, shared across compile-time `bun test`
3
+ * processes.
4
+ *
5
+ * Each integration / per-parameter test the compile agent writes runs in its own
6
+ * `bun test` process, and `runWorkflowWithLadder` otherwise mints a fresh stealth
7
+ * token (~12s headless Chromium bootstrap, see stealth-fetch.ts) every time. A
8
+ * multi-test gate run therefore fires a burst of bootstraps against one origin in
9
+ * seconds — exactly the pattern Akamai/PerimeterX flag, which forces the
10
+ * integration test to be waived. Persisting one token per site (keyed by the site
11
+ * asset dir) lets sibling processes reuse a single bootstrap, cutting both waivers
12
+ * and compile time.
13
+ *
14
+ * The file holds a live session token. It lives under ~/.imprint/<site>/ (never
15
+ * the repo) and is transient: stale entries are ignored on read, a malformed file
16
+ * is treated as absent, and a token that has gone bad self-heals via the
17
+ * 403 → re-bootstrap path in stealth-fetch.ts. `clearCachedToken` removes it when
18
+ * a site's teach run ends.
19
+ */
20
+
21
+ import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from 'node:fs';
22
+ import { join as pathJoin } from 'node:path';
23
+ import { createLog } from './log.ts';
24
+ import type { TokenCache } from './stealth-fetch.ts';
25
+
26
+ const log = createLog('stealth-cache');
27
+
28
+ const TOKEN_FILE = '.stealth-token.json';
29
+
30
/** Build the path of the token file (`TOKEN_FILE`) inside the given site directory. */
function tokenPath(siteDir: string): string {
  const location = pathJoin(siteDir, TOKEN_FILE);
  return location;
}
33
+
34
+ /** Load a cached token for a site dir, or null if absent / malformed / stale. */
35
+ export function loadCachedToken(siteDir: string, maxAgeSeconds: number): TokenCache | null {
36
+ const p = tokenPath(siteDir);
37
+ if (!existsSync(p)) return null;
38
+ try {
39
+ const raw = JSON.parse(readFileSync(p, 'utf8')) as Partial<TokenCache>;
40
+ if (
41
+ !raw ||
42
+ !Array.isArray(raw.cookies) ||
43
+ typeof raw.sensorHeaders !== 'object' ||
44
+ raw.sensorHeaders === null ||
45
+ typeof raw.bootstrappedAt !== 'number'
46
+ ) {
47
+ return null;
48
+ }
49
+ const ageSeconds = (Date.now() - raw.bootstrappedAt) / 1000;
50
+ if (ageSeconds >= maxAgeSeconds) {
51
+ log(
52
+ `cached token in ${siteDir} is ${Math.round(ageSeconds)}s old (>= ${maxAgeSeconds}s) — ignoring`,
53
+ );
54
+ return null;
55
+ }
56
+ return {
57
+ cookies: raw.cookies,
58
+ sensorHeaders: raw.sensorHeaders,
59
+ bootstrappedAt: raw.bootstrappedAt,
60
+ };
61
+ } catch {
62
+ return null;
63
+ }
64
+ }
65
+
66
/**
 * Persist a token for a site dir (atomic temp + rename). Best-effort: any
 * failure is logged and swallowed.
 *
 * The token contains live session cookies, so the temp file is created with
 * owner-only permissions (0o600); the rename carries that mode over to the
 * final file. If the write or rename fails, the temp file is removed so
 * failed attempts do not accumulate `*.tmp` files holding session data.
 *
 * @param siteDir - directory to write into; created if missing.
 * @param token - token to serialize as a single JSON line.
 */
export function saveCachedToken(siteDir: string, token: TokenCache): void {
  let tmp: string | undefined;
  try {
    mkdirSync(siteDir, { recursive: true });
    const p = tokenPath(siteDir);
    tmp = `${p}.${process.pid}.tmp`;
    writeFileSync(tmp, `${JSON.stringify(token)}\n`, { encoding: 'utf8', mode: 0o600 });
    renameSync(tmp, p);
  } catch (err) {
    if (tmp !== undefined) {
      try {
        rmSync(tmp, { force: true });
      } catch {
        // best-effort cleanup of the partial temp file
      }
    }
    log(
      `failed to persist stealth token to ${siteDir}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}
80
+
81
/**
 * Delete the cached token file of a site directory, if there is one.
 * Best-effort: a missing file is not an error and any failure is ignored.
 * Intended to be called when a site's teach run ends.
 */
export function clearCachedToken(siteDir: string): void {
  try {
    const target = pathJoin(siteDir, TOKEN_FILE);
    rmSync(target, { force: true });
  } catch {
    // Nothing to do — removal is best-effort.
  }
}