imprint-mcp 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/README.md +4 -4
  2. package/examples/google-flights/README.md +2 -0
  3. package/examples/google-flights/_shared/flights_request.ts +10 -4
  4. package/examples/google-flights/get_flight_booking_details/index.ts +5 -2
  5. package/examples/google-flights/get_flight_booking_details/parser.ts +8 -0
  6. package/examples/google-flights/get_flight_booking_details/workflow.json +5 -2
  7. package/examples/google-flights/get_flight_calendar_prices/index.ts +5 -2
  8. package/examples/google-flights/get_flight_calendar_prices/parser.ts +8 -4
  9. package/examples/google-flights/get_flight_calendar_prices/workflow.json +5 -2
  10. package/examples/google-flights/lookup_airport/index.ts +3 -0
  11. package/examples/google-flights/lookup_airport/parser.ts +8 -1
  12. package/examples/google-flights/lookup_airport/workflow.json +3 -0
  13. package/examples/google-flights/search_flights/index.ts +63 -8
  14. package/examples/google-flights/search_flights/parser.ts +10 -0
  15. package/examples/google-flights/search_flights/request-transform.ts +45 -0
  16. package/examples/google-flights/search_flights/workflow.json +63 -8
  17. package/package.json +1 -1
  18. package/prompts/build-planning.md +1 -1
  19. package/prompts/compile-agent.md +5 -3
  20. package/prompts/prereq-builder.md +2 -1
  21. package/src/imprint/backend-ladder.ts +436 -43
  22. package/src/imprint/cdp-browser-fetch.ts +176 -6
  23. package/src/imprint/cdp-jar-cache.ts +105 -10
  24. package/src/imprint/compile-tools.ts +2 -2
  25. package/src/imprint/mcp-server.ts +152 -65
  26. package/src/imprint/probe-backends.ts +41 -10
  27. package/src/imprint/runtime.ts +24 -12
  28. package/src/imprint/stealth-fetch.ts +71 -0
  29. package/src/imprint/stealth-token-cache.ts +38 -1
  30. package/src/imprint/types.ts +45 -0
@@ -10,6 +10,9 @@
10
10
  * cached (~90 min) so one bootstrap serves many searches. Auto mode always
11
11
  * splices this right after `fetch`; it only RUNS when `fetch` escalates, so a
12
12
  * healthy plain-API site never pays for it.
13
+ * - `cdp-replay` — live Chrome API replay. Reused by MCP/compile sessions
14
+ * when a workflow needs browser-observed request state or sustained protected
15
+ * POSTs.
13
16
  * - `stealth-fetch` — Playwright stealth bootstrap + native fetch (token tier).
14
17
  * - `playbook` — DOM-walk LAST RESORT (needs a compiled playbook.yaml).
15
18
  */
@@ -22,6 +25,7 @@ import {
22
25
  type CdpBrowserFetchOptions,
23
26
  type MintedJar,
24
27
  createCdpBrowserFetch,
28
+ jarHasAkamaiValidationSignals,
25
29
  } from './cdp-browser-fetch.ts';
26
30
  import {
27
31
  clearJar,
@@ -213,6 +217,30 @@ function withWorkflowDefaults(
213
217
  return paramsWithDefaults;
214
218
  }
215
219
 
220
/**
 * Build the parameter map handed to a backend rung: workflow defaults first,
 * then any extra scalar parameters produced by the workflow's optional
 * request-transform module.
 *
 * The module named by `workflow.requestTransformModule` is resolved relative to
 * `tool.dir` and dynamically imported. When it exports a `prepareParams`
 * function, that function is awaited with the defaulted params, and every
 * string / number / boolean entry of the plain object it returns is merged in
 * (entries of any other type are ignored).
 *
 * @param tool   Resolved tool whose workflow and directory are used.
 * @param params Caller-supplied parameters.
 * @returns The defaulted parameters, plus any transform-provided scalars.
 *
 * Transform failures stay non-fatal — the defaulted params are still returned,
 * and executeWorkflow surfaces any still-missing placeholders with its normal
 * STATE_MISSING diagnostics — but the failure is logged so a broken transform
 * module does not fail silently.
 */
async function withWorkflowPreparedParams(
  tool: ResolvedTool,
  params: Record<string, string | number | boolean>,
): Promise<Record<string, string | number | boolean>> {
  const preparedParams = withWorkflowDefaults(tool.workflow, params);
  const modulePath = tool.workflow.requestTransformModule;
  if (!modulePath) return preparedParams;
  try {
    const mod = await import(pathResolve(tool.dir, modulePath));
    if (typeof mod.prepareParams !== 'function') return preparedParams;
    const extra = await mod.prepareParams(preparedParams);
    // Only a plain key/value object is meaningful here; an array would merge
    // its indices ("0", "1", …) as parameter names.
    if (!extra || typeof extra !== 'object' || Array.isArray(extra)) return preparedParams;
    for (const [key, value] of Object.entries(extra)) {
      if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
        preparedParams[key] = value;
      }
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    log(
      `request transform "${modulePath}" failed (${reason}) — continuing with workflow defaults`,
    );
  }
  return preparedParams;
}
243
+
216
244
  /** Await the per-origin min spacing before a compile-path live request. The
217
245
  * first call to an origin never waits (last=0); subsequent ones within the
218
246
  * window are delayed so the suite paces itself under the rate-flag. */
@@ -335,7 +363,7 @@ export async function runWithLadder(
335
363
  result = await runCdpReplay(tool, params, options?.cdpPool);
336
364
  break;
337
365
  case 'stealth-fetch': {
338
- const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
366
+ const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
339
367
  const sf = await ensureStealthFetch(tool, stealthCache, paramsWithDefaults);
340
368
  // When the workflow declares a bootstrap block, mint its declared
341
369
  // session-token state (CSRF cookies etc.) from the SAME stealth
@@ -343,17 +371,21 @@ export async function runWithLadder(
343
371
  // workflow escalating here from fetch-bootstrap loses the
344
372
  // ${state.X} its requests need — the gap that made bootstrap-block
345
373
  // tools on anti-bot sites unverifiable.
374
+ const tokens = tool.workflow.bootstrap ? await sf.ensureBootstrapped() : undefined;
346
375
  const initialState = tool.workflow.bootstrap
347
- ? await stealthBootstrapState(sf, tool.workflow.bootstrap)
376
+ ? await stealthBootstrapState(sf, tool.workflow.bootstrap, tokens)
348
377
  : undefined;
349
- result = await tool.toolFn(paramsWithDefaults, { fetchImpl: sf.fetchImpl, initialState });
378
+ result = await tool.toolFn(paramsWithDefaults, {
379
+ fetchImpl: tokens ? makeObservedResponseFetch(tokens, sf.fetchImpl) : sf.fetchImpl,
380
+ initialState,
381
+ });
350
382
  break;
351
383
  }
352
384
  case 'playbook': {
353
385
  // DOM-walk last resort (the anti-bot API path is fetch-bootstrap, above).
354
386
  // Apply workflow.json's declared parameter defaults — runPlaybook
355
387
  // validates and throws on absent values regardless of declared defaults.
356
- const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
388
+ const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
357
389
  result = await runPlaybook({
358
390
  playbook: playbookPath(assetRoot, tool.site, tool.dir),
359
391
  params: paramsWithDefaults,
@@ -522,11 +554,9 @@ export function effectiveAutoLadder(
522
554
  const fbIdx = next.indexOf('fetch-bootstrap');
523
555
  if (fbIdx !== -1) next.splice(fbIdx + 1, 0, 'cdp-replay');
524
556
  }
525
- // For a MULTI-step state-changing anti-bot workflow, plain-fetch rungs are not
526
- // just doomed their tarpitted .act attempts BURN the per-IP rate budget
527
- // before cdp-replay even runs, which can flag the IP and make cdp-replay tarpit
528
- // too. Front-load cdp-replay for these so the live browser handles every
529
- // protected POST from a clean slate.
557
+ // For workflows that need live-browser request state, front-load cdp-replay so
558
+ // MCP sessions reuse the same Chrome instead of paying the one-shot
559
+ // fetch-bootstrap browser mint before every distinct bootstrap URL.
530
560
  if (prefersCdpReplayFirst(workflow)) {
531
561
  const i = next.indexOf('cdp-replay');
532
562
  if (i > 0) {
@@ -537,15 +567,20 @@ export function effectiveAutoLadder(
537
567
  return next;
538
568
  }
539
569
 
540
- /** A multi-step, state-changing, anti-bot workflow: ≥2 mutating requests AND an
541
- * anti-bot signal (a bootstrap block, or requests that depend on captured
542
- * `${state.X}` tokens). Plain-fetch replay can't sustain its sequence of
543
- * protected POSTs (each self-invalidates `_abck`); only the live-browser
544
- * cdp-replay rung can and it should run FIRST so the doomed fetch /
545
- * fetch-bootstrap attempts don't pre-burn the per-IP .act budget. A plain
546
- * multi-POST REST API (no bootstrap, no `${state.X}`) is NOT matched, so it
547
- * keeps the cheap fetch-first order. */
570
+ /** Prefer CDP first when the workflow needs live-browser request state.
571
+ *
572
+ * Two generic cases qualify:
573
+ * - bootstrap captures read fields from browser-observed requests
574
+ * (`request_header` / `request_url_regex` / `request_body_regex`). A one-shot fetch-bootstrap can
575
+ * also observe them, but it closes Chrome after minting; CDP can reuse the
576
+ * same browser across MCP calls and retarget route/date-specific bootstraps.
577
+ * - multi-step state-changing anti-bot flows (≥2 mutating requests plus a
578
+ * bootstrap/state signal). Plain-fetch replay can't sustain those protected
579
+ * POST sequences and can burn the per-IP budget before CDP runs.
580
+ */
548
581
  export function prefersCdpReplayFirst(workflow: Workflow): boolean {
582
+ if (workflow.bootstrap?.captures?.some(isObservedRequestBootstrapCapture)) return true;
583
+
549
584
  const mutating = workflow.requests.filter((r) => {
550
585
  const m = (r.method ?? 'GET').toUpperCase();
551
586
  return r.effect === 'unsafe' || m === 'POST' || m === 'PUT' || m === 'PATCH' || m === 'DELETE';
@@ -560,6 +595,14 @@ export function prefersCdpReplayFirst(workflow: Workflow): boolean {
560
595
  return Boolean(workflow.bootstrap) || hasStateRefs;
561
596
  }
562
597
 
598
/** True when a bootstrap capture reads its value from a browser-observed
 * request: a request header, the request URL, or the request body. */
function isObservedRequestBootstrapCapture(capture: BootstrapCapture): boolean {
  switch (capture.source) {
    case 'request_header':
    case 'request_url_regex':
    case 'request_body_regex':
      return true;
    default:
      return false;
  }
}
605
+
563
606
  function nextStateMissingBackend(
564
607
  ladder: ConcreteBackend[],
565
608
  backend: ConcreteBackend,
@@ -630,10 +673,11 @@ async function getOrMintCdpJar(
630
673
  bootstrapUrl: string | undefined,
631
674
  siteDir: string,
632
675
  forceFresh: boolean,
676
+ workflow?: Workflow,
633
677
  ): Promise<MintedJar | null> {
634
678
  if (cdpJarMinterForTest) return cdpJarMinterForTest(baseUrl, bootstrapUrl);
635
679
  if (!forceFresh) {
636
- let cached = loadJar(siteDir);
680
+ let cached = loadJar(siteDir, bootstrapUrl);
637
681
  // A recording NEWER than the cached jar supersedes it — e.g. the user
638
682
  // re-recorded on a new IP, so the cached (old-IP) jar would tarpit. Drop the
639
683
  // stale cache and re-seed from the fresh recording below.
@@ -644,7 +688,18 @@ async function getOrMintCdpJar(
644
688
  // many sequential .act), strictly better than a synthetic cdp-browser mint
645
689
  // (low-trust → tarpitted even on a fresh IP). "The recording IS the
646
690
  // executable." Reuse the `rec` stat above so we don't re-glob.
647
- if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) cached = loadJar(siteDir);
691
+ if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) {
692
+ cached = loadJar(siteDir, bootstrapUrl);
693
+ }
694
+ if (cached && workflow?.bootstrap) {
695
+ const missing = missingObservedRequestCaptureNames(workflow.bootstrap, cached);
696
+ if (missing.length > 0) {
697
+ log(
698
+ `cached jar is missing required browser-observed capture(s): ${missing.join(', ')} — re-mint`,
699
+ );
700
+ cached = null;
701
+ }
702
+ }
648
703
  if (cached) {
649
704
  const provenance =
650
705
  cached.source === 'recording'
@@ -663,10 +718,12 @@ async function getOrMintCdpJar(
663
718
  }
664
719
  let cf: CdpBrowserFetch | undefined;
665
720
  try {
666
- cf = createCdpBrowserFetch({ baseUrl, bootstrapUrl });
667
- const jar = await cf.mintJar();
668
- if (jar.abckFlag !== '0') {
721
+ cf = (cdpBrowserFetchFactoryForTest ?? createCdpBrowserFetch)({ baseUrl, bootstrapUrl });
722
+ const jar = await mintJarWithBootstrapWait(cf, workflow);
723
+ if (jar.abckFlag !== '0' && jarHasAkamaiValidationSignals(jar.cookies)) {
669
724
  log(`cdp jar minted with _abck~${jar.abckFlag}~ (not validated) — replay may be rejected`);
725
+ } else if (!jarHasAkamaiValidationSignals(jar.cookies)) {
726
+ log(`cdp jar minted generic bootstrap state (html=${jar.html.length}b)`);
670
727
  }
671
728
  saveJar(siteDir, jar);
672
729
  return jar;
@@ -678,6 +735,43 @@ async function getOrMintCdpJar(
678
735
  }
679
736
  }
680
737
 
738
/**
 * Mint a jar and, when the workflow's bootstrap declares required
 * browser-observed request captures, keep re-minting until every one of them
 * resolves or the bootstrap timeout elapses.
 *
 * The timeout is `bootstrap.timeoutMs` when it is a positive number, otherwise
 * 30 seconds. Between attempts the function sleeps at most 500 ms (and at least
 * 1 ms). On timeout the most recently minted jar is returned anyway, after
 * logging the captures that are still missing.
 */
async function mintJarWithBootstrapWait(
  cf: CdpBrowserFetch,
  workflow: Workflow | undefined,
): Promise<MintedJar> {
  const firstJar = await cf.mintJar();
  const bootstrap = workflow?.bootstrap;
  if (!bootstrap) return firstJar;
  if (requiredObservedRequestCaptures(bootstrap).length === 0) return firstJar;

  const hasCustomTimeout = typeof bootstrap.timeoutMs === 'number' && bootstrap.timeoutMs > 0;
  const timeoutMs = hasCustomTimeout ? bootstrap.timeoutMs : 30_000;
  const deadline = Date.now() + timeoutMs;

  let jar = firstJar;
  let announced = false;
  while (Date.now() < deadline) {
    const pending = missingObservedRequestCaptureNames(bootstrap, jar);
    if (pending.length === 0) return jar;
    if (!announced) {
      // Announce the wait once, not on every poll.
      announced = true;
      log(
        `waiting up to ${timeoutMs}ms for browser-observed bootstrap request capture(s): ${pending.join(', ')}`,
      );
    }
    const remaining = deadline - Date.now();
    await sleepMs(Math.min(500, Math.max(1, remaining)));
    jar = await cf.mintJar();
  }

  // The final mint may have landed after the deadline; check it once more
  // before reporting a timeout.
  const stillMissing = missingObservedRequestCaptureNames(bootstrap, jar);
  if (stillMissing.length > 0) {
    log(
      `timed out waiting for browser-observed bootstrap request capture(s): ${stillMissing.join(', ')}`,
    );
  }
  return jar;
}
774
+
681
775
  /** Replay transport for the bootstrap-then-fetch path: PLAIN fetch that presents
682
776
  * the jar's exact UA (Akamai drops the jar on a UA mismatch). Cookies are
683
777
  * attached by executeWorkflow's RuntimeCookieJar from bootstrappedCredentials,
@@ -717,6 +811,81 @@ function makeProxyFetch(): typeof fetch | undefined {
717
811
  )) as typeof fetch;
718
812
  }
719
813
 
814
/** Response recorded for an observed request. */
type ObservedResponseSnapshot = {
  status: number;
  headers: Record<string, string>;
  /** Response body text; absent when no body was recorded. */
  body?: string;
};

/** One request recorded during bootstrap, optionally with its response. */
type ObservedRequestRecord = {
  method: string;
  url: string;
  /** Request body text; may be absent even for requests that carried one. */
  body?: string;
  /** Origin tag of the entry: `'browser'` or `'replay'`. */
  source?: 'browser' | 'replay';
  response?: ObservedResponseSnapshot;
};

/** Anything that exposes a list of observed requests (with responses). */
type ObservedResponseSource = {
  observedRequests?: ObservedRequestRecord[];
};
827
+
828
/**
 * Wrap `fallbackFetch` so that a request which exactly matches one observed
 * during bootstrap is answered from the recorded response instead of going
 * over the network again.
 *
 * @param source        Holder of the observed requests/responses.
 * @param fallbackFetch Transport used when no observed response applies.
 * @returns A `fetch`-compatible function.
 *
 * Differences from a naive replay:
 * - The method is taken from `init.method`, then from a `Request` input, and
 *   only then defaults to GET, so a `Request`-object POST is never answered
 *   with an observed GET response.
 * - The `Response` constructor rejects statuses outside 200–599 and rejects a
 *   non-null body for null-body statuses (204/205/304). Null-body statuses are
 *   replayed with a null body; any other construction failure (bad status,
 *   invalid header) is logged and the request falls through to `fallbackFetch`.
 */
function makeObservedResponseFetch(
  source: ObservedResponseSource,
  fallbackFetch: typeof fetch,
): typeof fetch {
  return (async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
    const url =
      typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url;
    const inputMethod =
      typeof input !== 'string' && !(input instanceof URL) ? input.method : undefined;
    const method = (init?.method ?? inputMethod ?? 'GET').toUpperCase();
    const body = observedRequestBody(init?.body);
    const observed = findObservedResponse(source, method, url, body);
    if (observed) {
      const isNullBodyStatus =
        observed.status === 204 || observed.status === 205 || observed.status === 304;
      try {
        const replayed = new Response(isNullBodyStatus ? null : (observed.body ?? ''), {
          status: observed.status,
          headers: new Headers(observed.headers),
        });
        log(`using bootstrap-observed response for ${method} ${redactUrlForLog(url)}`);
        return replayed;
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        log(
          `bootstrap-observed response for ${method} ${redactUrlForLog(url)} is not replayable (${reason}) — using live fetch`,
        );
      }
    }
    return fallbackFetch(input, init);
  }) as typeof fetch;
}
848
+
849
/**
 * Find the most recent browser-observed response that matches a request.
 *
 * Entries are scanned newest-first. An entry qualifies when it has a response
 * with a recorded body, is not tagged `'replay'`, and its method
 * (case-insensitive) and URL (exact string) equal the ones given.
 *
 * Body matching: when the observed entry recorded a request body it must equal
 * `body` exactly. When the entry has no recorded body (CDP can omit postData
 * even though the response body is available), method + URL alone decide;
 * Google-style batchexecute URLs carry session/request ids, so this still
 * avoids serving a response from a different bootstrap request.
 *
 * @returns The recorded response, or `undefined` when nothing matches.
 */
function findObservedResponse(
  source: ObservedResponseSource,
  method: string,
  url: string,
  body: string | undefined,
): { status: number; headers: Record<string, string>; body?: string } | undefined {
  const entries = source.observedRequests ?? [];
  let index = entries.length;
  while (index-- > 0) {
    const entry = entries[index];
    if (!entry) continue;
    const recorded = entry.response;
    if (!recorded || recorded.body === undefined) continue;

    const sameRequestLine =
      entry.source !== 'replay' && entry.method.toUpperCase() === method && entry.url === url;
    if (!sameRequestLine) continue;

    const observedBodyKnown = entry.body !== undefined;
    if (observedBodyKnown && entry.body !== body) continue;

    return recorded;
  }
  return undefined;
}
872
+
873
/** Normalise a fetch request body to text for comparison with an observed
 * request body. Strings pass through and `URLSearchParams` are serialised;
 * every other body kind (including none) yields `undefined`. */
function observedRequestBody(body: RequestInit['body'] | undefined): string | undefined {
  if (typeof body === 'string') return body;
  return body instanceof URLSearchParams ? body.toString() : undefined;
}
879
+
880
/**
 * Reduce a URL to a form safe for log lines by dropping the query string and
 * fragment, which may carry session ids or tokens.
 *
 * Parsable absolute URLs are rendered as origin + pathname. Anything the URL
 * parser rejects (e.g. a relative path) is cut at the first `?` or `#` and
 * truncated to 80 characters, so the fallback no longer leaks the query.
 */
function redactUrlForLog(url: string): string {
  try {
    const u = new URL(url);
    return `${u.origin}${u.pathname}`;
  } catch {
    const cut = url.search(/[?#]/);
    const withoutQuery = cut === -1 ? url : url.slice(0, cut);
    return withoutQuery.slice(0, 80);
  }
}
888
+
720
889
  /** A replay error that means the JAR is bad (clear it + re-mint), as opposed to a
721
890
  * transient IP rate-flag (NETWORK/RATE_LIMITED — a fresh jar won't help; back off). */
722
891
  function jarLikelyStale(result: ToolResult): boolean {
@@ -756,14 +925,14 @@ async function runFetchBootstrap(
756
925
  values: {},
757
926
  storage: [],
758
927
  };
759
- const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
928
+ const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
760
929
  const bootstrapUrl = tool.workflow.bootstrap
761
930
  ? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
762
931
  : undefined;
763
932
  const siteDir = pathResolve(tool.dir, '..');
764
933
 
765
934
  for (let attempt = 0; attempt < 2; attempt++) {
766
- const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0);
935
+ const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0, tool.workflow);
767
936
  if (!jar) {
768
937
  // Couldn't even launch the bootstrap browser → let the ladder escalate.
769
938
  const stateMissing = bootstrapFailureStateMissingResult(
@@ -788,7 +957,7 @@ async function runFetchBootstrap(
788
957
  // recording-seeded or cached jar is validated:true by construction, so the
789
958
  // cheap plain-fetch path is untouched; `=== false` (not falsy) leaves jars
790
959
  // without the field — older caches / test stubs — on the original path.
791
- if (jar.validated === false) {
960
+ if (jar.validated === false && jarHasAkamaiValidationSignals(jar.cookies)) {
792
961
  log(
793
962
  'fetch-bootstrap: minted jar unvalidated (no _abck~0~/bm_sv) — plain-fetch replay doomed; escalating to cdp-replay',
794
963
  );
@@ -833,7 +1002,7 @@ async function runFetchBootstrap(
833
1002
  const result = await tool.toolFn(paramsWithDefaults, {
834
1003
  credentials: bootstrappedCredentials,
835
1004
  initialState: captureResult.state,
836
- fetchImpl: makeJarUaFetch(jar.ua),
1005
+ fetchImpl: makeObservedResponseFetch(jar, makeJarUaFetch(jar.ua)),
837
1006
  });
838
1007
 
839
1008
  if (result.ok) return result;
@@ -891,7 +1060,7 @@ async function runCdpReplay(
891
1060
  values: {},
892
1061
  storage: [],
893
1062
  };
894
- const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
1063
+ const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
895
1064
  const bootstrapUrl = tool.workflow.bootstrap
896
1065
  ? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
897
1066
  : undefined;
@@ -899,6 +1068,12 @@ async function runCdpReplay(
899
1068
  const siteDir = pathResolve(tool.dir, '..');
900
1069
  const poolKey = tool.site;
901
1070
  const pooled = cdpPool?.get(poolKey);
1071
+ if (pooled && bootstrapUrl && pooled.bootstrapUrl !== bootstrapUrl) {
1072
+ log(
1073
+ `cdp-replay: reusing pooled Chrome session for new bootstrap (${pooled.bootstrapUrl} → ${bootstrapUrl})`,
1074
+ );
1075
+ pooled.setBootstrapUrl(bootstrapUrl);
1076
+ }
902
1077
  const ownsSession = !pooled;
903
1078
 
904
1079
  let cf: CdpBrowserFetch;
@@ -924,7 +1099,7 @@ async function runCdpReplay(
924
1099
  }
925
1100
 
926
1101
  try {
927
- const jar = await cf.mintJar();
1102
+ const jar = await mintJarWithBootstrapWait(cf, tool.workflow);
928
1103
  const bootstrappedCredentials: CredentialStore = {
929
1104
  ...credentials,
930
1105
  cookies: [
@@ -956,7 +1131,7 @@ async function runCdpReplay(
956
1131
  const result = await tool.toolFn(paramsWithDefaults, {
957
1132
  credentials: bootstrappedCredentials,
958
1133
  initialState: captureResult.state,
959
- fetchImpl: cf.fetchImpl,
1134
+ fetchImpl: makeObservedResponseFetch(jar, cf.fetchImpl),
960
1135
  });
961
1136
 
962
1137
  if (result.ok) {
@@ -970,11 +1145,13 @@ async function runCdpReplay(
970
1145
  if (!cdpPool && ownsSession) await cf.close();
971
1146
  }
972
1147
  } else {
973
- if (ownsSession) {
974
- await cf.close();
975
- } else if (cdpPool) {
976
- cdpPool.delete(poolKey);
977
- log('cdp-replay: evicted degraded session from pool');
1148
+ // A workflow-level failure (BAD_RESPONSE/STATE_MISSING/FORBIDDEN/etc.) is
1149
+ // not evidence that the Chrome/CDP session is dead. Keep pooled sessions
1150
+ // alive so the next MCP call can reuse/retarget the browser; only the
1151
+ // catch path below evicts sessions after an actual CDP exception.
1152
+ if (cdpPool && ownsSession) {
1153
+ cdpPool.set(poolKey, cf);
1154
+ } else if (!cdpPool && ownsSession) {
978
1155
  await cf.close();
979
1156
  }
980
1157
  }
@@ -1046,6 +1223,45 @@ function jarBootstrapCaptureState(
1046
1223
  ),
1047
1224
  };
1048
1225
  }
1226
+ } else if (capture.source === 'request_header') {
1227
+ const value = captureObservedRequestHeader(jar, capture);
1228
+ if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
1229
+ else if (capture.required !== false) {
1230
+ return {
1231
+ ok: false,
1232
+ result: bootstrapCaptureMissingResult(
1233
+ capture,
1234
+ `Required bootstrap capture "${capture.name}" (request_header ${capture.header}) did not match an observed browser request.`,
1235
+ 'producer_ran_value_absent',
1236
+ ),
1237
+ };
1238
+ }
1239
+ } else if (capture.source === 'request_url_regex') {
1240
+ const value = captureObservedRequestUrlRegex(jar, capture);
1241
+ if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
1242
+ else if (capture.required !== false) {
1243
+ return {
1244
+ ok: false,
1245
+ result: bootstrapCaptureMissingResult(
1246
+ capture,
1247
+ `Required bootstrap capture "${capture.name}" (request_url_regex ${capture.pattern}) did not match an observed browser request.`,
1248
+ 'producer_ran_value_absent',
1249
+ ),
1250
+ };
1251
+ }
1252
+ } else if (capture.source === 'request_body_regex') {
1253
+ const value = captureObservedRequestBodyRegex(jar, capture);
1254
+ if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
1255
+ else if (capture.required !== false) {
1256
+ return {
1257
+ ok: false,
1258
+ result: bootstrapCaptureMissingResult(
1259
+ capture,
1260
+ `Required bootstrap capture "${capture.name}" (request_body_regex ${capture.pattern}) did not match an observed browser request body.`,
1261
+ 'producer_ran_value_absent',
1262
+ ),
1263
+ };
1264
+ }
1049
1265
  } else if (capture.required !== false) {
1050
1266
  // response_header / dom_* can't be resolved from a closed browser jar.
1051
1267
  return {
@@ -1061,6 +1277,134 @@ function jarBootstrapCaptureState(
1061
1277
  return { ok: true, state };
1062
1278
  }
1063
1279
 
1280
/** Resolve a `request_header` bootstrap capture against the jar's observed
 * requests: the value of `capture.header` (name compared case-insensitively)
 * on the request(s) selected by the capture's filters and mode. */
function captureObservedRequestHeader(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_header' }>,
): string | string[] | undefined {
  const observed = jar.observedRequests ?? [];
  const wantedHeader = capture.header;
  return captureObservedRequestValueFromObserved(observed, capture, (req) => {
    return headerValue(req.headers, wantedHeader);
  });
}
1288
+
1289
/**
 * Resolve a `request_url_regex` bootstrap capture against the jar's observed
 * requests: `capture.pattern` is matched against each selected request URL and
 * capture group `capture.group` (default 1) is taken as the value.
 *
 * The pattern is compiled once up front rather than once per observed request.
 * An invalid pattern yields `undefined`, the same result as when every
 * per-request compile failed.
 */
function captureObservedRequestUrlRegex(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_url_regex' }>,
): string | string[] | undefined {
  let valueRe: RegExp;
  try {
    valueRe = new RegExp(capture.pattern);
  } catch {
    return undefined;
  }
  const group = capture.group ?? 1;
  return captureObservedRequestValueFromObserved(
    jar.observedRequests ?? [],
    capture,
    (req) => req.url.match(valueRe)?.[group],
  );
}
1301
+
1302
/**
 * Resolve a `request_body_regex` bootstrap capture against the jar's observed
 * requests: `capture.pattern` is matched against each selected request's body
 * text. The value is capture group `capture.group` (default 1), falling back
 * to the whole match when that group did not participate. Requests without a
 * string body contribute nothing.
 *
 * The pattern is compiled once up front rather than once per observed request.
 * An invalid pattern yields `undefined`, the same result as when every
 * per-request compile failed.
 */
function captureObservedRequestBodyRegex(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_body_regex' }>,
): string | string[] | undefined {
  let valueRe: RegExp;
  try {
    valueRe = new RegExp(capture.pattern);
  } catch {
    return undefined;
  }
  const group = capture.group ?? 1;
  return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) => {
    if (typeof req.body !== 'string') return undefined;
    const match = req.body.match(valueRe);
    return match?.[group] ?? match?.[0];
  });
}
1316
+
1317
/** List the bootstrap captures that are both required (`required` is not
 * explicitly `false`) and sourced from a browser-observed request (header,
 * URL regex or body regex), in declaration order. */
function requiredObservedRequestCaptures(
  bootstrap: NonNullable<Workflow['bootstrap']>,
): Array<
  Extract<
    BootstrapCapture,
    { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
  >
> {
  const selected: Array<
    Extract<
      BootstrapCapture,
      { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
    >
  > = [];
  for (const capture of bootstrap.captures ?? []) {
    if (capture.required === false) continue;
    switch (capture.source) {
      case 'request_header':
      case 'request_url_regex':
      case 'request_body_regex':
        selected.push(capture);
        break;
      default:
        break;
    }
  }
  return selected;
}
1338
+
1339
/** Names of the required browser-observed request captures that the given jar
 * cannot satisfy. A capture counts as missing when it resolves to `undefined`,
 * to an empty string, or to an empty array. */
function missingObservedRequestCaptureNames(
  bootstrap: NonNullable<Workflow['bootstrap']>,
  jar: MintedJar,
): string[] {
  const required = requiredObservedRequestCaptures(bootstrap);

  const resolve = (capture: (typeof required)[number]): string | string[] | undefined => {
    switch (capture.source) {
      case 'request_header':
        return captureObservedRequestHeader(jar, capture);
      case 'request_url_regex':
        return captureObservedRequestUrlRegex(jar, capture);
      default:
        return captureObservedRequestBodyRegex(jar, capture);
    }
  };

  const isAbsent = (value: string | string[] | undefined): boolean =>
    value === undefined || value === '' || (Array.isArray(value) && value.length === 0);

  return required.filter((capture) => isAbsent(resolve(capture))).map((capture) => capture.name);
}
1357
+
1358
/**
 * Shared selection logic for request-sourced bootstrap captures.
 *
 * Walks `observed` in order, ignoring entries tagged `'replay'`, entries whose
 * method differs from `capture.method` (case-insensitive, when set) and entries
 * whose URL does not match `capture.urlPattern` (when set). `pickValue` is
 * called on each remaining entry; `undefined` and empty-string results are
 * discarded.
 *
 * The result depends on `capture.mode`: `'all'` returns every value (or
 * `undefined` when there are none), `'first'` returns the earliest, and any
 * other mode returns the latest. An invalid `urlPattern` yields `undefined`.
 */
function captureObservedRequestValueFromObserved(
  observed: Array<{
    method: string;
    url: string;
    headers: Record<string, string>;
    body?: string;
    source?: 'browser' | 'replay';
  }>,
  capture: Extract<
    BootstrapCapture,
    { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
  >,
  pickValue: (req: {
    method: string;
    url: string;
    headers: Record<string, string>;
    body?: string;
    source?: 'browser' | 'replay';
  }) => string | undefined,
): string | string[] | undefined {
  let urlFilter: RegExp | undefined;
  if (capture.urlPattern) {
    try {
      urlFilter = new RegExp(capture.urlPattern);
    } catch {
      return undefined;
    }
  }
  const wantedMethod = capture.method?.toUpperCase();

  const values = observed.flatMap((req) => {
    if (req.source === 'replay') return [];
    if (wantedMethod && req.method.toUpperCase() !== wantedMethod) return [];
    if (urlFilter && !urlFilter.test(req.url)) return [];
    const picked = pickValue(req);
    return picked !== undefined && picked !== '' ? [picked] : [];
  });

  switch (capture.mode) {
    case 'all':
      return values.length ? values : undefined;
    case 'first':
      return values[0];
    default:
      return values[values.length - 1];
  }
}
1399
+
1400
/** Look up a header by name, ignoring case. Returns the value of the first
 * matching entry in the record's own iteration order, or `undefined` when no
 * entry matches. */
function headerValue(headers: Record<string, string>, header: string): string | undefined {
  const wanted = header.toLowerCase();
  const hit = Object.entries(headers).find(([name]) => name.toLowerCase() === wanted);
  return hit ? hit[1] : undefined;
}
1407
+
1064
1408
  function bootstrapFailureStateMissingResult(
1065
1409
  workflow: Workflow,
1066
1410
  message: string,
@@ -1188,6 +1532,12 @@ export async function evaluateBootstrapCapture(
1188
1532
  },
1189
1533
  { origin: capture.origin, key: capture.key },
1190
1534
  );
1535
+ case 'request_header':
1536
+ return undefined;
1537
+ case 'request_url_regex':
1538
+ return undefined;
1539
+ case 'request_body_regex':
1540
+ return undefined;
1191
1541
  case 'cookie':
1192
1542
  return undefined;
1193
1543
  }
@@ -1195,30 +1545,38 @@ export async function evaluateBootstrapCapture(
1195
1545
 
1196
1546
  /** Per-site stealth fetcher; bootstrap pays its ~12s once per process. */
1197
1547
  /** Mint `${state.X}` values from the stealth bootstrap session for a workflow
1198
- * that declares a bootstrap block. Satisfies `cookie`, `html_regex`, and
1199
- * `response_header` captures from the cookies / HTML / response headers the
1200
- * stealth navigation minted — all one consistent session as the transport
1201
- * cookies, so a token the later API POST checks against the session resolves.
1548
+ * that declares a bootstrap block. Satisfies `cookie`, `html_regex`,
1549
+ * `response_header`, and observed request captures from the cookies / HTML /
1550
+ * headers / observed browser requests the stealth navigation minted — all one
1551
+ * consistent session as the transport cookies, so a token the later API POST
1552
+ * checks against the session resolves.
1202
1553
  * `dom_*` / storage sources need a live page and are left for the
1203
1554
  * fetch-bootstrap rung (the compile prompt steers replay-safe session tokens
1204
1555
  * to cookie/html_regex, which this covers). */
1205
1556
  async function stealthBootstrapState(
1206
1557
  sf: StealthFetch,
1207
1558
  bootstrap: NonNullable<ResolvedTool['workflow']['bootstrap']>,
1559
+ tokens?: TokenCache,
1208
1560
  ): Promise<Record<string, unknown>> {
1209
1561
  const state: Record<string, unknown> = {};
1210
1562
  const captures = bootstrap.captures ?? [];
1211
1563
  const supported = captures.filter(
1212
- (c) => c.source === 'cookie' || c.source === 'html_regex' || c.source === 'response_header',
1564
+ (c) =>
1565
+ c.source === 'cookie' ||
1566
+ c.source === 'html_regex' ||
1567
+ c.source === 'response_header' ||
1568
+ c.source === 'request_header' ||
1569
+ c.source === 'request_url_regex' ||
1570
+ c.source === 'request_body_regex',
1213
1571
  );
1214
1572
  if (supported.length === 0) return state;
1215
- const tokens = await sf.ensureBootstrapped();
1573
+ const bootstrapTokens = tokens ?? (await sf.ensureBootstrapped());
1216
1574
  for (const cap of supported) {
1217
1575
  if (cap.source === 'cookie') {
1218
- const hit = tokens.cookies.find((c) => c.name === cap.cookie);
1576
+ const hit = bootstrapTokens.cookies.find((c) => c.name === cap.cookie);
1219
1577
  if (hit) state[cap.name] = hit.value;
1220
1578
  } else if (cap.source === 'html_regex') {
1221
- const html = tokens.bootstrapHtml ?? '';
1579
+ const html = bootstrapTokens.bootstrapHtml ?? '';
1222
1580
  try {
1223
1581
  const m = html.match(new RegExp(cap.pattern));
1224
1582
  const v = m?.[cap.group ?? 1];
@@ -1227,8 +1585,43 @@ async function stealthBootstrapState(
1227
1585
  // invalid regex — leave unset; substitution will surface STATE_MISSING
1228
1586
  }
1229
1587
  } else if (cap.source === 'response_header') {
1230
- const v = tokens.bootstrapResponseHeaders?.[cap.header.toLowerCase()];
1588
+ const v = bootstrapTokens.bootstrapResponseHeaders?.[cap.header.toLowerCase()];
1231
1589
  if (v !== undefined && v !== '') state[cap.name] = v;
1590
+ } else if (cap.source === 'request_header') {
1591
+ const v = captureObservedRequestValueFromObserved(
1592
+ bootstrapTokens.observedRequests ?? [],
1593
+ cap,
1594
+ (req) => headerValue(req.headers, cap.header),
1595
+ );
1596
+ if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
1597
+ } else if (cap.source === 'request_url_regex') {
1598
+ const v = captureObservedRequestValueFromObserved(
1599
+ bootstrapTokens.observedRequests ?? [],
1600
+ cap,
1601
+ (req) => {
1602
+ try {
1603
+ return req.url.match(new RegExp(cap.pattern))?.[cap.group ?? 1];
1604
+ } catch {
1605
+ return undefined;
1606
+ }
1607
+ },
1608
+ );
1609
+ if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
1610
+ } else if (cap.source === 'request_body_regex') {
1611
+ const v = captureObservedRequestValueFromObserved(
1612
+ bootstrapTokens.observedRequests ?? [],
1613
+ cap,
1614
+ (req) => {
1615
+ if (typeof req.body !== 'string') return undefined;
1616
+ try {
1617
+ const match = req.body.match(new RegExp(cap.pattern));
1618
+ return match?.[cap.group ?? 1] ?? match?.[0];
1619
+ } catch {
1620
+ return undefined;
1621
+ }
1622
+ },
1623
+ );
1624
+ if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
1232
1625
  }
1233
1626
  }
1234
1627
  return state;