imprint-mcp 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/examples/google-flights/README.md +2 -0
- package/examples/google-flights/_shared/flights_request.ts +10 -4
- package/examples/google-flights/get_flight_booking_details/index.ts +5 -2
- package/examples/google-flights/get_flight_booking_details/parser.ts +8 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +5 -2
- package/examples/google-flights/get_flight_calendar_prices/index.ts +5 -2
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +8 -4
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +5 -2
- package/examples/google-flights/lookup_airport/index.ts +3 -0
- package/examples/google-flights/lookup_airport/parser.ts +8 -1
- package/examples/google-flights/lookup_airport/workflow.json +3 -0
- package/examples/google-flights/search_flights/index.ts +63 -8
- package/examples/google-flights/search_flights/parser.ts +10 -0
- package/examples/google-flights/search_flights/request-transform.ts +45 -0
- package/examples/google-flights/search_flights/workflow.json +63 -8
- package/package.json +1 -1
- package/prompts/build-planning.md +1 -1
- package/prompts/compile-agent.md +5 -3
- package/prompts/prereq-builder.md +2 -1
- package/src/imprint/backend-ladder.ts +436 -43
- package/src/imprint/cdp-browser-fetch.ts +176 -6
- package/src/imprint/cdp-jar-cache.ts +105 -10
- package/src/imprint/compile-tools.ts +2 -2
- package/src/imprint/mcp-server.ts +152 -65
- package/src/imprint/probe-backends.ts +41 -10
- package/src/imprint/runtime.ts +24 -12
- package/src/imprint/stealth-fetch.ts +71 -0
- package/src/imprint/stealth-token-cache.ts +38 -1
- package/src/imprint/types.ts +45 -0
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
* cached (~90 min) so one bootstrap serves many searches. Auto mode always
|
|
11
11
|
* splices this right after `fetch`; it only RUNS when `fetch` escalates, so a
|
|
12
12
|
* healthy plain-API site never pays for it.
|
|
13
|
+
* - `cdp-replay` — live Chrome API replay. Reused by MCP/compile sessions
|
|
14
|
+
* when a workflow needs browser-observed request state or sustained protected
|
|
15
|
+
* POSTs.
|
|
13
16
|
* - `stealth-fetch` — Playwright stealth bootstrap + native fetch (token tier).
|
|
14
17
|
* - `playbook` — DOM-walk LAST RESORT (needs a compiled playbook.yaml).
|
|
15
18
|
*/
|
|
@@ -22,6 +25,7 @@ import {
|
|
|
22
25
|
type CdpBrowserFetchOptions,
|
|
23
26
|
type MintedJar,
|
|
24
27
|
createCdpBrowserFetch,
|
|
28
|
+
jarHasAkamaiValidationSignals,
|
|
25
29
|
} from './cdp-browser-fetch.ts';
|
|
26
30
|
import {
|
|
27
31
|
clearJar,
|
|
@@ -213,6 +217,30 @@ function withWorkflowDefaults(
|
|
|
213
217
|
return paramsWithDefaults;
|
|
214
218
|
}
|
|
215
219
|
|
|
220
|
+
/** Resolve the parameters a backend rung should run with.
 *
 * Starts from `withWorkflowDefaults(tool.workflow, params)`. When the workflow
 * declares `requestTransformModule`, that module is imported (resolved against
 * `tool.dir`) and, if it exports a `prepareParams` function, the string /
 * number / boolean entries of its result are merged on top. Non-primitive
 * entries are ignored.
 *
 * Best-effort by design: a missing module, a missing export, or a throwing
 * transform never fails the call — the defaults-only params are returned and
 * executeWorkflow reports any still-missing placeholders. The failure is
 * logged so a broken transform is diagnosable instead of silently ignored.
 */
async function withWorkflowPreparedParams(
  tool: ResolvedTool,
  params: Record<string, string | number | boolean>,
): Promise<Record<string, string | number | boolean>> {
  const preparedParams = withWorkflowDefaults(tool.workflow, params);
  const modulePath = tool.workflow.requestTransformModule;
  if (!modulePath) return preparedParams;
  try {
    const mod = await import(pathResolve(tool.dir, modulePath));
    if (typeof mod.prepareParams !== 'function') return preparedParams;
    const extra = await mod.prepareParams(preparedParams);
    if (!extra || typeof extra !== 'object') return preparedParams;
    for (const [key, value] of Object.entries(extra)) {
      if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
        preparedParams[key] = value;
      }
    }
  } catch (err: unknown) {
    // Non-fatal: request transforms are optional, and executeWorkflow will surface
    // any still-missing placeholders with its normal STATE_MISSING diagnostics.
    const reason = err instanceof Error ? err.message : String(err);
    log(`request transform "${modulePath}" failed (${reason}) — continuing without it`);
  }
  return preparedParams;
}
|
|
243
|
+
|
|
216
244
|
/** Await the per-origin min spacing before a compile-path live request. The
|
|
217
245
|
* first call to an origin never waits (last=0); subsequent ones within the
|
|
218
246
|
* window are delayed so the suite paces itself under the rate-flag. */
|
|
@@ -335,7 +363,7 @@ export async function runWithLadder(
|
|
|
335
363
|
result = await runCdpReplay(tool, params, options?.cdpPool);
|
|
336
364
|
break;
|
|
337
365
|
case 'stealth-fetch': {
|
|
338
|
-
const paramsWithDefaults =
|
|
366
|
+
const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
|
|
339
367
|
const sf = await ensureStealthFetch(tool, stealthCache, paramsWithDefaults);
|
|
340
368
|
// When the workflow declares a bootstrap block, mint its declared
|
|
341
369
|
// session-token state (CSRF cookies etc.) from the SAME stealth
|
|
@@ -343,17 +371,21 @@ export async function runWithLadder(
|
|
|
343
371
|
// workflow escalating here from fetch-bootstrap loses the
|
|
344
372
|
// ${state.X} its requests need — the gap that made bootstrap-block
|
|
345
373
|
// tools on anti-bot sites unverifiable.
|
|
374
|
+
const tokens = tool.workflow.bootstrap ? await sf.ensureBootstrapped() : undefined;
|
|
346
375
|
const initialState = tool.workflow.bootstrap
|
|
347
|
-
? await stealthBootstrapState(sf, tool.workflow.bootstrap)
|
|
376
|
+
? await stealthBootstrapState(sf, tool.workflow.bootstrap, tokens)
|
|
348
377
|
: undefined;
|
|
349
|
-
result = await tool.toolFn(paramsWithDefaults, {
|
|
378
|
+
result = await tool.toolFn(paramsWithDefaults, {
|
|
379
|
+
fetchImpl: tokens ? makeObservedResponseFetch(tokens, sf.fetchImpl) : sf.fetchImpl,
|
|
380
|
+
initialState,
|
|
381
|
+
});
|
|
350
382
|
break;
|
|
351
383
|
}
|
|
352
384
|
case 'playbook': {
|
|
353
385
|
// DOM-walk last resort (the anti-bot API path is fetch-bootstrap, above).
|
|
354
386
|
// Apply workflow.json's declared parameter defaults — runPlaybook
|
|
355
387
|
// validates and throws on absent values regardless of declared defaults.
|
|
356
|
-
const paramsWithDefaults =
|
|
388
|
+
const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
|
|
357
389
|
result = await runPlaybook({
|
|
358
390
|
playbook: playbookPath(assetRoot, tool.site, tool.dir),
|
|
359
391
|
params: paramsWithDefaults,
|
|
@@ -522,11 +554,9 @@ export function effectiveAutoLadder(
|
|
|
522
554
|
const fbIdx = next.indexOf('fetch-bootstrap');
|
|
523
555
|
if (fbIdx !== -1) next.splice(fbIdx + 1, 0, 'cdp-replay');
|
|
524
556
|
}
|
|
525
|
-
// For
|
|
526
|
-
//
|
|
527
|
-
//
|
|
528
|
-
// too. Front-load cdp-replay for these so the live browser handles every
|
|
529
|
-
// protected POST from a clean slate.
|
|
557
|
+
// For workflows that need live-browser request state, front-load cdp-replay so
|
|
558
|
+
// MCP sessions reuse the same Chrome instead of paying the one-shot
|
|
559
|
+
// fetch-bootstrap browser mint before every distinct bootstrap URL.
|
|
530
560
|
if (prefersCdpReplayFirst(workflow)) {
|
|
531
561
|
const i = next.indexOf('cdp-replay');
|
|
532
562
|
if (i > 0) {
|
|
@@ -537,15 +567,20 @@ export function effectiveAutoLadder(
|
|
|
537
567
|
return next;
|
|
538
568
|
}
|
|
539
569
|
|
|
540
|
-
/**
|
|
541
|
-
*
|
|
542
|
-
*
|
|
543
|
-
*
|
|
544
|
-
*
|
|
545
|
-
*
|
|
546
|
-
*
|
|
547
|
-
*
|
|
570
|
+
/** Prefer CDP first when the workflow needs live-browser request state.
|
|
571
|
+
*
|
|
572
|
+
* Two generic cases qualify:
|
|
573
|
+
* - bootstrap captures read fields from browser-observed requests
|
|
574
|
+
* (`request_header` / `request_url_regex` / `request_body_regex`). A one-shot fetch-bootstrap can
|
|
575
|
+
* also observe them, but it closes Chrome after minting; CDP can reuse the
|
|
576
|
+
* same browser across MCP calls and retarget route/date-specific bootstraps.
|
|
577
|
+
* - multi-step state-changing anti-bot flows (≥2 mutating requests plus a
|
|
578
|
+
* bootstrap/state signal). Plain-fetch replay can't sustain those protected
|
|
579
|
+
* POST sequences and can burn the per-IP budget before CDP runs.
|
|
580
|
+
*/
|
|
548
581
|
export function prefersCdpReplayFirst(workflow: Workflow): boolean {
|
|
582
|
+
if (workflow.bootstrap?.captures?.some(isObservedRequestBootstrapCapture)) return true;
|
|
583
|
+
|
|
549
584
|
const mutating = workflow.requests.filter((r) => {
|
|
550
585
|
const m = (r.method ?? 'GET').toUpperCase();
|
|
551
586
|
return r.effect === 'unsafe' || m === 'POST' || m === 'PUT' || m === 'PATCH' || m === 'DELETE';
|
|
@@ -560,6 +595,14 @@ export function prefersCdpReplayFirst(workflow: Workflow): boolean {
|
|
|
560
595
|
return Boolean(workflow.bootstrap) || hasStateRefs;
|
|
561
596
|
}
|
|
562
597
|
|
|
598
|
+
/** True when the capture reads its value from a request the browser was
 * observed making (header, URL regex, or body regex) rather than from a
 * response, cookie, or the DOM. */
function isObservedRequestBootstrapCapture(capture: BootstrapCapture): boolean {
  switch (capture.source) {
    case 'request_header':
    case 'request_url_regex':
    case 'request_body_regex':
      return true;
    default:
      return false;
  }
}
|
|
605
|
+
|
|
563
606
|
function nextStateMissingBackend(
|
|
564
607
|
ladder: ConcreteBackend[],
|
|
565
608
|
backend: ConcreteBackend,
|
|
@@ -630,10 +673,11 @@ async function getOrMintCdpJar(
|
|
|
630
673
|
bootstrapUrl: string | undefined,
|
|
631
674
|
siteDir: string,
|
|
632
675
|
forceFresh: boolean,
|
|
676
|
+
workflow?: Workflow,
|
|
633
677
|
): Promise<MintedJar | null> {
|
|
634
678
|
if (cdpJarMinterForTest) return cdpJarMinterForTest(baseUrl, bootstrapUrl);
|
|
635
679
|
if (!forceFresh) {
|
|
636
|
-
let cached = loadJar(siteDir);
|
|
680
|
+
let cached = loadJar(siteDir, bootstrapUrl);
|
|
637
681
|
// A recording NEWER than the cached jar supersedes it — e.g. the user
|
|
638
682
|
// re-recorded on a new IP, so the cached (old-IP) jar would tarpit. Drop the
|
|
639
683
|
// stale cache and re-seed from the fresh recording below.
|
|
@@ -644,7 +688,18 @@ async function getOrMintCdpJar(
|
|
|
644
688
|
// many sequential .act), strictly better than a synthetic cdp-browser mint
|
|
645
689
|
// (low-trust → tarpitted even on a fresh IP). "The recording IS the
|
|
646
690
|
// executable." Reuse the `rec` stat above so we don't re-glob.
|
|
647
|
-
if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl))
|
|
691
|
+
if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) {
|
|
692
|
+
cached = loadJar(siteDir, bootstrapUrl);
|
|
693
|
+
}
|
|
694
|
+
if (cached && workflow?.bootstrap) {
|
|
695
|
+
const missing = missingObservedRequestCaptureNames(workflow.bootstrap, cached);
|
|
696
|
+
if (missing.length > 0) {
|
|
697
|
+
log(
|
|
698
|
+
`cached jar is missing required browser-observed capture(s): ${missing.join(', ')} — re-mint`,
|
|
699
|
+
);
|
|
700
|
+
cached = null;
|
|
701
|
+
}
|
|
702
|
+
}
|
|
648
703
|
if (cached) {
|
|
649
704
|
const provenance =
|
|
650
705
|
cached.source === 'recording'
|
|
@@ -663,10 +718,12 @@ async function getOrMintCdpJar(
|
|
|
663
718
|
}
|
|
664
719
|
let cf: CdpBrowserFetch | undefined;
|
|
665
720
|
try {
|
|
666
|
-
cf = createCdpBrowserFetch({ baseUrl, bootstrapUrl });
|
|
667
|
-
const jar = await cf
|
|
668
|
-
if (jar.abckFlag !== '0') {
|
|
721
|
+
cf = (cdpBrowserFetchFactoryForTest ?? createCdpBrowserFetch)({ baseUrl, bootstrapUrl });
|
|
722
|
+
const jar = await mintJarWithBootstrapWait(cf, workflow);
|
|
723
|
+
if (jar.abckFlag !== '0' && jarHasAkamaiValidationSignals(jar.cookies)) {
|
|
669
724
|
log(`cdp jar minted with _abck~${jar.abckFlag}~ (not validated) — replay may be rejected`);
|
|
725
|
+
} else if (!jarHasAkamaiValidationSignals(jar.cookies)) {
|
|
726
|
+
log(`cdp jar minted generic bootstrap state (html=${jar.html.length}b)`);
|
|
670
727
|
}
|
|
671
728
|
saveJar(siteDir, jar);
|
|
672
729
|
return jar;
|
|
@@ -678,6 +735,43 @@ async function getOrMintCdpJar(
|
|
|
678
735
|
}
|
|
679
736
|
}
|
|
680
737
|
|
|
738
|
+
/** Mint a jar from `cf`, re-minting until every required browser-observed
 * request capture of the workflow's bootstrap block can be resolved from it.
 *
 * Returns the first mint immediately when the workflow has no bootstrap block
 * or no required observed-request captures. Otherwise polls (at most every
 * 500ms) until `bootstrap.timeoutMs` elapses — 30s when that is unset or not a
 * positive number — and returns the latest jar either way, logging which
 * captures are still missing on timeout.
 */
async function mintJarWithBootstrapWait(
  cf: CdpBrowserFetch,
  workflow: Workflow | undefined,
): Promise<MintedJar> {
  let jar = await cf.mintJar();
  const bootstrap = workflow?.bootstrap;
  if (!bootstrap) return jar;
  if (requiredObservedRequestCaptures(bootstrap).length === 0) return jar;

  const configured = bootstrap.timeoutMs;
  const timeoutMs = typeof configured === 'number' && configured > 0 ? configured : 30_000;
  const deadline = Date.now() + timeoutMs;

  let announced = false;
  for (;;) {
    if (Date.now() >= deadline) break;
    const pending = missingObservedRequestCaptureNames(bootstrap, jar);
    if (pending.length === 0) return jar;
    if (!announced) {
      // Announce the wait only once, not on every poll.
      log(
        `waiting up to ${timeoutMs}ms for browser-observed bootstrap request capture(s): ${pending.join(', ')}`,
      );
      announced = true;
    }
    const remaining = deadline - Date.now();
    await sleepMs(Math.min(500, Math.max(1, remaining)));
    jar = await cf.mintJar();
  }

  // The last mint may have landed after the deadline; check it before giving up.
  const stillMissing = missingObservedRequestCaptureNames(bootstrap, jar);
  if (stillMissing.length > 0) {
    log(
      `timed out waiting for browser-observed bootstrap request capture(s): ${stillMissing.join(', ')}`,
    );
  }
  return jar;
}
|
|
774
|
+
|
|
681
775
|
/** Replay transport for the bootstrap-then-fetch path: PLAIN fetch that presents
|
|
682
776
|
* the jar's exact UA (Akamai drops the jar on a UA mismatch). Cookies are
|
|
683
777
|
* attached by executeWorkflow's RuntimeCookieJar from bootstrappedCredentials,
|
|
@@ -717,6 +811,81 @@ function makeProxyFetch(): typeof fetch | undefined {
|
|
|
717
811
|
)) as typeof fetch;
|
|
718
812
|
}
|
|
719
813
|
|
|
814
|
+
/** Anything that carries the list of requests seen during bootstrap. Only the
 * fields needed to match a replayed request against an observed one, and to
 * rebuild its response, are required here. */
type ObservedResponseSource = {
  observedRequests?: {
    method: string;
    url: string;
    /** Request body as observed; may be absent even for requests that had one. */
    body?: string;
    /** Who issued the request: the browser itself or a replayed workflow call. */
    source?: 'browser' | 'replay';
    /** Response recorded for the request, when one was captured. */
    response?: {
      status: number;
      headers: Record<string, string>;
      body?: string;
    };
  }[];
};
|
|
827
|
+
|
|
828
|
+
/** Wrap `fallbackFetch` so a request the bootstrap browser was already observed
 * making is answered from its recorded response instead of going to the
 * network again.
 *
 * Matching is delegated to `findObservedResponse` (method + URL, plus body
 * when the observed request recorded one). Anything without a usable recorded
 * response is passed through to `fallbackFetch` unchanged.
 *
 * @param source jar / token cache holding `observedRequests`
 * @param fallbackFetch transport used when no observed response applies
 */
function makeObservedResponseFetch(
  source: ObservedResponseSource,
  fallbackFetch: typeof fetch,
): typeof fetch {
  return (async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
    const request = typeof input === 'string' || input instanceof URL ? undefined : input;
    const url =
      typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url;
    // A Request object carries its own method; only default to GET when neither
    // `init` nor the Request specifies one.
    const method = (init?.method ?? request?.method ?? 'GET').toUpperCase();
    const body = observedRequestBody(init?.body);
    const observed = findObservedResponse(source, method, url, body);
    if (observed) {
      const replayed = buildObservedResponse(observed);
      if (replayed) {
        log(`using bootstrap-observed response for ${method} ${redactUrlForLog(url)}`);
        return replayed;
      }
    }
    return fallbackFetch(input, init);
  }) as typeof fetch;
}

/** Build a `Response` from a recorded one, or `undefined` when the recording
 * cannot be represented (the caller then falls back to a live request). */
function buildObservedResponse(observed: {
  status: number;
  headers: Record<string, string>;
  body?: string;
}): Response | undefined {
  const { status } = observed;
  // The Response constructor throws RangeError outside 200–599.
  if (!Number.isInteger(status) || status < 200 || status > 599) return undefined;

  const headers = new Headers();
  for (const [name, raw] of Object.entries(observed.headers ?? {})) {
    // NOTE(review): recorded multi-valued headers appear to be newline-joined —
    // confirm against the recorder. Headers rejects values containing newlines,
    // so split them and skip anything the fetch API still refuses.
    for (const value of String(raw).split('\n')) {
      try {
        headers.append(name, value);
      } catch {
        // Invalid header name/value for the fetch API — drop just this entry.
      }
    }
  }

  // Null-body statuses must not be given a body, not even an empty string.
  const nullBodyStatus = status === 204 || status === 205 || status === 304;
  try {
    return new Response(nullBodyStatus ? null : (observed.body ?? ''), { status, headers });
  } catch {
    return undefined;
  }
}
|
|
848
|
+
|
|
849
|
+
/** Find the most recent browser-observed request that matches `method` and
 * `url` exactly and has a recorded response body; return that response.
 *
 * Entries marked `source: 'replay'` are never used. When the observed request
 * recorded a body it must equal `body`; when it recorded none, method + URL is
 * accepted on its own.
 */
function findObservedResponse(
  source: ObservedResponseSource,
  method: string,
  url: string,
  body: string | undefined,
): { status: number; headers: Record<string, string>; body?: string } | undefined {
  const candidates = source.observedRequests ?? [];
  // Walk newest-to-oldest so the latest matching observation wins.
  let idx = candidates.length;
  while (idx-- > 0) {
    const entry = candidates[idx];
    const response = entry?.response;
    if (!entry || !response || response.body === undefined) continue;

    const sameRequest =
      entry.source !== 'replay' && entry.method.toUpperCase() === method && entry.url === url;
    if (!sameRequest) continue;

    // Some CDP requestWillBeSent events omit postData even though the matching
    // response body is available. An observed body must match exactly; an
    // omitted one falls back to the exact method+URL match above.
    if (entry.body !== undefined && entry.body !== (body ?? undefined)) continue;
    return response;
  }
  return undefined;
}
|
|
872
|
+
|
|
873
|
+
/** Text form of a fetch request body for comparison with an observed request
 * body. Only strings and `URLSearchParams` are converted; every other body
 * kind, and a missing body, yields `undefined`. */
function observedRequestBody(body: RequestInit['body'] | undefined): string | undefined {
  if (body == null) return undefined;
  switch (true) {
    case typeof body === 'string':
      return body as string;
    case body instanceof URLSearchParams:
      return (body as URLSearchParams).toString();
    default:
      return undefined;
  }
}
|
|
879
|
+
|
|
880
|
+
/** Reduce a URL to something safe to log: origin + path only, so query
 * strings, fragments, and embedded credentials never reach the log.
 *
 * Input that does not parse as an absolute URL is still stripped of its query
 * and fragment before being truncated to 80 characters; previously the raw
 * prefix was logged, which could include query-string tokens.
 */
function redactUrlForLog(url: string): string {
  try {
    const u = new URL(url);
    return `${u.origin}${u.pathname}`;
  } catch {
    const cut = url.search(/[?#]/);
    const bare = cut === -1 ? url : url.slice(0, cut);
    return bare.slice(0, 80);
  }
}
|
|
888
|
+
|
|
720
889
|
/** A replay error that means the JAR is bad (clear it + re-mint), as opposed to a
|
|
721
890
|
* transient IP rate-flag (NETWORK/RATE_LIMITED — a fresh jar won't help; back off). */
|
|
722
891
|
function jarLikelyStale(result: ToolResult): boolean {
|
|
@@ -756,14 +925,14 @@ async function runFetchBootstrap(
|
|
|
756
925
|
values: {},
|
|
757
926
|
storage: [],
|
|
758
927
|
};
|
|
759
|
-
const paramsWithDefaults =
|
|
928
|
+
const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
|
|
760
929
|
const bootstrapUrl = tool.workflow.bootstrap
|
|
761
930
|
? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
|
|
762
931
|
: undefined;
|
|
763
932
|
const siteDir = pathResolve(tool.dir, '..');
|
|
764
933
|
|
|
765
934
|
for (let attempt = 0; attempt < 2; attempt++) {
|
|
766
|
-
const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0);
|
|
935
|
+
const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0, tool.workflow);
|
|
767
936
|
if (!jar) {
|
|
768
937
|
// Couldn't even launch the bootstrap browser → let the ladder escalate.
|
|
769
938
|
const stateMissing = bootstrapFailureStateMissingResult(
|
|
@@ -788,7 +957,7 @@ async function runFetchBootstrap(
|
|
|
788
957
|
// recording-seeded or cached jar is validated:true by construction, so the
|
|
789
958
|
// cheap plain-fetch path is untouched; `=== false` (not falsy) leaves jars
|
|
790
959
|
// without the field — older caches / test stubs — on the original path.
|
|
791
|
-
if (jar.validated === false) {
|
|
960
|
+
if (jar.validated === false && jarHasAkamaiValidationSignals(jar.cookies)) {
|
|
792
961
|
log(
|
|
793
962
|
'fetch-bootstrap: minted jar unvalidated (no _abck~0~/bm_sv) — plain-fetch replay doomed; escalating to cdp-replay',
|
|
794
963
|
);
|
|
@@ -833,7 +1002,7 @@ async function runFetchBootstrap(
|
|
|
833
1002
|
const result = await tool.toolFn(paramsWithDefaults, {
|
|
834
1003
|
credentials: bootstrappedCredentials,
|
|
835
1004
|
initialState: captureResult.state,
|
|
836
|
-
fetchImpl: makeJarUaFetch(jar.ua),
|
|
1005
|
+
fetchImpl: makeObservedResponseFetch(jar, makeJarUaFetch(jar.ua)),
|
|
837
1006
|
});
|
|
838
1007
|
|
|
839
1008
|
if (result.ok) return result;
|
|
@@ -891,7 +1060,7 @@ async function runCdpReplay(
|
|
|
891
1060
|
values: {},
|
|
892
1061
|
storage: [],
|
|
893
1062
|
};
|
|
894
|
-
const paramsWithDefaults =
|
|
1063
|
+
const paramsWithDefaults = await withWorkflowPreparedParams(tool, params);
|
|
895
1064
|
const bootstrapUrl = tool.workflow.bootstrap
|
|
896
1065
|
? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
|
|
897
1066
|
: undefined;
|
|
@@ -899,6 +1068,12 @@ async function runCdpReplay(
|
|
|
899
1068
|
const siteDir = pathResolve(tool.dir, '..');
|
|
900
1069
|
const poolKey = tool.site;
|
|
901
1070
|
const pooled = cdpPool?.get(poolKey);
|
|
1071
|
+
if (pooled && bootstrapUrl && pooled.bootstrapUrl !== bootstrapUrl) {
|
|
1072
|
+
log(
|
|
1073
|
+
`cdp-replay: reusing pooled Chrome session for new bootstrap (${pooled.bootstrapUrl} → ${bootstrapUrl})`,
|
|
1074
|
+
);
|
|
1075
|
+
pooled.setBootstrapUrl(bootstrapUrl);
|
|
1076
|
+
}
|
|
902
1077
|
const ownsSession = !pooled;
|
|
903
1078
|
|
|
904
1079
|
let cf: CdpBrowserFetch;
|
|
@@ -924,7 +1099,7 @@ async function runCdpReplay(
|
|
|
924
1099
|
}
|
|
925
1100
|
|
|
926
1101
|
try {
|
|
927
|
-
const jar = await cf.
|
|
1102
|
+
const jar = await mintJarWithBootstrapWait(cf, tool.workflow);
|
|
928
1103
|
const bootstrappedCredentials: CredentialStore = {
|
|
929
1104
|
...credentials,
|
|
930
1105
|
cookies: [
|
|
@@ -956,7 +1131,7 @@ async function runCdpReplay(
|
|
|
956
1131
|
const result = await tool.toolFn(paramsWithDefaults, {
|
|
957
1132
|
credentials: bootstrappedCredentials,
|
|
958
1133
|
initialState: captureResult.state,
|
|
959
|
-
fetchImpl: cf.fetchImpl,
|
|
1134
|
+
fetchImpl: makeObservedResponseFetch(jar, cf.fetchImpl),
|
|
960
1135
|
});
|
|
961
1136
|
|
|
962
1137
|
if (result.ok) {
|
|
@@ -970,11 +1145,13 @@ async function runCdpReplay(
|
|
|
970
1145
|
if (!cdpPool && ownsSession) await cf.close();
|
|
971
1146
|
}
|
|
972
1147
|
} else {
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
1148
|
+
// A workflow-level failure (BAD_RESPONSE/STATE_MISSING/FORBIDDEN/etc.) is
|
|
1149
|
+
// not evidence that the Chrome/CDP session is dead. Keep pooled sessions
|
|
1150
|
+
// alive so the next MCP call can reuse/retarget the browser; only the
|
|
1151
|
+
// catch path below evicts sessions after an actual CDP exception.
|
|
1152
|
+
if (cdpPool && ownsSession) {
|
|
1153
|
+
cdpPool.set(poolKey, cf);
|
|
1154
|
+
} else if (!cdpPool && ownsSession) {
|
|
978
1155
|
await cf.close();
|
|
979
1156
|
}
|
|
980
1157
|
}
|
|
@@ -1046,6 +1223,45 @@ function jarBootstrapCaptureState(
|
|
|
1046
1223
|
),
|
|
1047
1224
|
};
|
|
1048
1225
|
}
|
|
1226
|
+
} else if (capture.source === 'request_header') {
|
|
1227
|
+
const value = captureObservedRequestHeader(jar, capture);
|
|
1228
|
+
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1229
|
+
else if (capture.required !== false) {
|
|
1230
|
+
return {
|
|
1231
|
+
ok: false,
|
|
1232
|
+
result: bootstrapCaptureMissingResult(
|
|
1233
|
+
capture,
|
|
1234
|
+
`Required bootstrap capture "${capture.name}" (request_header ${capture.header}) did not match an observed browser request.`,
|
|
1235
|
+
'producer_ran_value_absent',
|
|
1236
|
+
),
|
|
1237
|
+
};
|
|
1238
|
+
}
|
|
1239
|
+
} else if (capture.source === 'request_url_regex') {
|
|
1240
|
+
const value = captureObservedRequestUrlRegex(jar, capture);
|
|
1241
|
+
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1242
|
+
else if (capture.required !== false) {
|
|
1243
|
+
return {
|
|
1244
|
+
ok: false,
|
|
1245
|
+
result: bootstrapCaptureMissingResult(
|
|
1246
|
+
capture,
|
|
1247
|
+
`Required bootstrap capture "${capture.name}" (request_url_regex ${capture.pattern}) did not match an observed browser request.`,
|
|
1248
|
+
'producer_ran_value_absent',
|
|
1249
|
+
),
|
|
1250
|
+
};
|
|
1251
|
+
}
|
|
1252
|
+
} else if (capture.source === 'request_body_regex') {
|
|
1253
|
+
const value = captureObservedRequestBodyRegex(jar, capture);
|
|
1254
|
+
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1255
|
+
else if (capture.required !== false) {
|
|
1256
|
+
return {
|
|
1257
|
+
ok: false,
|
|
1258
|
+
result: bootstrapCaptureMissingResult(
|
|
1259
|
+
capture,
|
|
1260
|
+
`Required bootstrap capture "${capture.name}" (request_body_regex ${capture.pattern}) did not match an observed browser request body.`,
|
|
1261
|
+
'producer_ran_value_absent',
|
|
1262
|
+
),
|
|
1263
|
+
};
|
|
1264
|
+
}
|
|
1049
1265
|
} else if (capture.required !== false) {
|
|
1050
1266
|
// response_header / dom_* can't be resolved from a closed browser jar.
|
|
1051
1267
|
return {
|
|
@@ -1061,6 +1277,134 @@ function jarBootstrapCaptureState(
|
|
|
1061
1277
|
return { ok: true, state };
|
|
1062
1278
|
}
|
|
1063
1279
|
|
|
1280
|
+
/** Resolve a `request_header` capture from the jar's observed requests: the
 * value of `capture.header` (case-insensitive) on the requests selected by the
 * capture's own filters and mode. A jar with no observed requests is treated
 * as an empty list. */
function captureObservedRequestHeader(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_header' }>,
): string | string[] | undefined {
  const observed = jar.observedRequests ?? [];
  const wantedHeader = capture.header;
  return captureObservedRequestValueFromObserved(observed, capture, (request) => {
    return headerValue(request.headers, wantedHeader);
  });
}
|
|
1288
|
+
|
|
1289
|
+
/** Resolve a `request_url_regex` capture from the jar's observed requests by
 * matching `capture.pattern` against each candidate request URL and taking
 * capture group `capture.group` (default 1).
 *
 * The pattern is compiled once up front rather than once per observed request.
 * An invalid pattern yields `undefined`, exactly as before.
 */
function captureObservedRequestUrlRegex(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_url_regex' }>,
): string | string[] | undefined {
  let pattern: RegExp;
  try {
    pattern = new RegExp(capture.pattern);
  } catch {
    return undefined;
  }
  const group = capture.group ?? 1;
  return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) => {
    // Tolerate malformed entries the same way the former per-request try/catch did.
    if (typeof req.url !== 'string') return undefined;
    return req.url.match(pattern)?.[group];
  });
}
|
|
1301
|
+
|
|
1302
|
+
/** Resolve a `request_body_regex` capture from the jar's observed requests by
 * matching `capture.pattern` against each candidate request body. Takes
 * capture group `capture.group` (default 1), falling back to the whole match
 * when that group did not participate. Requests without a string body are
 * skipped.
 *
 * The pattern is compiled once up front rather than once per observed request.
 * An invalid pattern yields `undefined`, exactly as before.
 */
function captureObservedRequestBodyRegex(
  jar: MintedJar,
  capture: Extract<BootstrapCapture, { source: 'request_body_regex' }>,
): string | string[] | undefined {
  let pattern: RegExp;
  try {
    pattern = new RegExp(capture.pattern);
  } catch {
    return undefined;
  }
  const group = capture.group ?? 1;
  return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) => {
    if (typeof req.body !== 'string') return undefined;
    const match = req.body.match(pattern);
    return match?.[group] ?? match?.[0];
  });
}
|
|
1316
|
+
|
|
1317
|
+
/** The bootstrap captures that are both required (`required` is not
 * explicitly `false`) and sourced from a browser-observed request
 * (`request_header`, `request_url_regex`, `request_body_regex`), in
 * declaration order. */
function requiredObservedRequestCaptures(
  bootstrap: NonNullable<Workflow['bootstrap']>,
): Array<
  Extract<
    BootstrapCapture,
    { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
  >
> {
  const selected: Array<
    Extract<
      BootstrapCapture,
      { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
    >
  > = [];
  for (const capture of bootstrap.captures ?? []) {
    if (capture.required === false) continue;
    switch (capture.source) {
      case 'request_header':
      case 'request_url_regex':
      case 'request_body_regex':
        selected.push(capture);
        break;
      default:
        break;
    }
  }
  return selected;
}
|
|
1338
|
+
|
|
1339
|
+
/** Names of the required observed-request captures that `jar` cannot satisfy
 * yet — those whose resolved value is `undefined`, an empty string, or an
 * empty array. An empty result means the jar has everything required. */
function missingObservedRequestCaptureNames(
  bootstrap: NonNullable<Workflow['bootstrap']>,
  jar: MintedJar,
): string[] {
  return requiredObservedRequestCaptures(bootstrap)
    .filter((capture) => {
      let resolved: string | string[] | undefined;
      switch (capture.source) {
        case 'request_header':
          resolved = captureObservedRequestHeader(jar, capture);
          break;
        case 'request_url_regex':
          resolved = captureObservedRequestUrlRegex(jar, capture);
          break;
        default:
          resolved = captureObservedRequestBodyRegex(jar, capture);
          break;
      }
      if (resolved === undefined || resolved === '') return true;
      return Array.isArray(resolved) && resolved.length === 0;
    })
    .map((capture) => capture.name);
}
|
|
1357
|
+
|
|
1358
|
+
/** Shared selection logic for observed-request captures.
 *
 * Considers only requests not marked `source: 'replay'` that pass the
 * capture's optional `method` (case-insensitive) and `urlPattern` (regex)
 * filters, extracts a value from each with `pickValue`, and discards
 * `undefined` / empty results. `capture.mode` then picks the outcome: `'all'`
 * returns every value (or `undefined` when there are none), `'first'` the
 * earliest, anything else the latest. An invalid `urlPattern` yields
 * `undefined`.
 */
function captureObservedRequestValueFromObserved(
  observed: Array<{
    method: string;
    url: string;
    headers: Record<string, string>;
    body?: string;
    source?: 'browser' | 'replay';
  }>,
  capture: Extract<
    BootstrapCapture,
    { source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
  >,
  pickValue: (req: {
    method: string;
    url: string;
    headers: Record<string, string>;
    body?: string;
    source?: 'browser' | 'replay';
  }) => string | undefined,
): string | string[] | undefined {
  let urlFilter: RegExp | undefined;
  if (capture.urlPattern) {
    try {
      urlFilter = new RegExp(capture.urlPattern);
    } catch {
      return undefined;
    }
  }
  const wantedMethod = capture.method?.toUpperCase();

  const hits = observed
    .filter((req) => req.source !== 'replay')
    .filter((req) => !wantedMethod || req.method.toUpperCase() === wantedMethod)
    .filter((req) => !urlFilter || urlFilter.test(req.url))
    .map((req) => pickValue(req))
    .filter((value): value is string => value !== undefined && value !== '');

  switch (capture.mode) {
    case 'all':
      return hits.length ? hits : undefined;
    case 'first':
      return hits[0];
    default:
      return hits[hits.length - 1];
  }
}
|
|
1399
|
+
|
|
1400
|
+
/** Case-insensitive header lookup: the value of the first entry in `headers`
 * whose name equals `header` ignoring case, or `undefined` when none does. */
function headerValue(headers: Record<string, string>, header: string): string | undefined {
  const wanted = header.toLowerCase();
  const matchingKey = Object.keys(headers).find((name) => name.toLowerCase() === wanted);
  return matchingKey === undefined ? undefined : headers[matchingKey];
}
|
|
1407
|
+
|
|
1064
1408
|
function bootstrapFailureStateMissingResult(
|
|
1065
1409
|
workflow: Workflow,
|
|
1066
1410
|
message: string,
|
|
@@ -1188,6 +1532,12 @@ export async function evaluateBootstrapCapture(
|
|
|
1188
1532
|
},
|
|
1189
1533
|
{ origin: capture.origin, key: capture.key },
|
|
1190
1534
|
);
|
|
1535
|
+
case 'request_header':
|
|
1536
|
+
return undefined;
|
|
1537
|
+
case 'request_url_regex':
|
|
1538
|
+
return undefined;
|
|
1539
|
+
case 'request_body_regex':
|
|
1540
|
+
return undefined;
|
|
1191
1541
|
case 'cookie':
|
|
1192
1542
|
return undefined;
|
|
1193
1543
|
}
|
|
@@ -1195,30 +1545,38 @@ export async function evaluateBootstrapCapture(
|
|
|
1195
1545
|
|
|
1196
1546
|
/** Per-site stealth fetcher; bootstrap pays its ~12s once per process. */
|
|
1197
1547
|
/** Mint `${state.X}` values from the stealth bootstrap session for a workflow
|
|
1198
|
-
* that declares a bootstrap block. Satisfies `cookie`, `html_regex`,
|
|
1199
|
-
* `response_header
|
|
1200
|
-
* stealth navigation minted — all one
|
|
1201
|
-
* cookies, so a token the later API POST
|
|
1548
|
+
* that declares a bootstrap block. Satisfies `cookie`, `html_regex`,
|
|
1549
|
+
* `response_header`, and observed request captures from the cookies / HTML /
|
|
1550
|
+
* headers / observed browser requests the stealth navigation minted — all one
|
|
1551
|
+
* consistent session as the transport cookies, so a token the later API POST
|
|
1552
|
+
* checks against the session resolves.
|
|
1202
1553
|
* `dom_*` / storage sources need a live page and are left for the
|
|
1203
1554
|
* fetch-bootstrap rung (the compile prompt steers replay-safe session tokens
|
|
1204
1555
|
* to cookie/html_regex, which this covers). */
|
|
1205
1556
|
async function stealthBootstrapState(
|
|
1206
1557
|
sf: StealthFetch,
|
|
1207
1558
|
bootstrap: NonNullable<ResolvedTool['workflow']['bootstrap']>,
|
|
1559
|
+
tokens?: TokenCache,
|
|
1208
1560
|
): Promise<Record<string, unknown>> {
|
|
1209
1561
|
const state: Record<string, unknown> = {};
|
|
1210
1562
|
const captures = bootstrap.captures ?? [];
|
|
1211
1563
|
const supported = captures.filter(
|
|
1212
|
-
(c) =>
|
|
1564
|
+
(c) =>
|
|
1565
|
+
c.source === 'cookie' ||
|
|
1566
|
+
c.source === 'html_regex' ||
|
|
1567
|
+
c.source === 'response_header' ||
|
|
1568
|
+
c.source === 'request_header' ||
|
|
1569
|
+
c.source === 'request_url_regex' ||
|
|
1570
|
+
c.source === 'request_body_regex',
|
|
1213
1571
|
);
|
|
1214
1572
|
if (supported.length === 0) return state;
|
|
1215
|
-
const
|
|
1573
|
+
const bootstrapTokens = tokens ?? (await sf.ensureBootstrapped());
|
|
1216
1574
|
for (const cap of supported) {
|
|
1217
1575
|
if (cap.source === 'cookie') {
|
|
1218
|
-
const hit =
|
|
1576
|
+
const hit = bootstrapTokens.cookies.find((c) => c.name === cap.cookie);
|
|
1219
1577
|
if (hit) state[cap.name] = hit.value;
|
|
1220
1578
|
} else if (cap.source === 'html_regex') {
|
|
1221
|
-
const html =
|
|
1579
|
+
const html = bootstrapTokens.bootstrapHtml ?? '';
|
|
1222
1580
|
try {
|
|
1223
1581
|
const m = html.match(new RegExp(cap.pattern));
|
|
1224
1582
|
const v = m?.[cap.group ?? 1];
|
|
@@ -1227,8 +1585,43 @@ async function stealthBootstrapState(
|
|
|
1227
1585
|
// invalid regex — leave unset; substitution will surface STATE_MISSING
|
|
1228
1586
|
}
|
|
1229
1587
|
} else if (cap.source === 'response_header') {
|
|
1230
|
-
const v =
|
|
1588
|
+
const v = bootstrapTokens.bootstrapResponseHeaders?.[cap.header.toLowerCase()];
|
|
1231
1589
|
if (v !== undefined && v !== '') state[cap.name] = v;
|
|
1590
|
+
} else if (cap.source === 'request_header') {
|
|
1591
|
+
const v = captureObservedRequestValueFromObserved(
|
|
1592
|
+
bootstrapTokens.observedRequests ?? [],
|
|
1593
|
+
cap,
|
|
1594
|
+
(req) => headerValue(req.headers, cap.header),
|
|
1595
|
+
);
|
|
1596
|
+
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1597
|
+
} else if (cap.source === 'request_url_regex') {
|
|
1598
|
+
const v = captureObservedRequestValueFromObserved(
|
|
1599
|
+
bootstrapTokens.observedRequests ?? [],
|
|
1600
|
+
cap,
|
|
1601
|
+
(req) => {
|
|
1602
|
+
try {
|
|
1603
|
+
return req.url.match(new RegExp(cap.pattern))?.[cap.group ?? 1];
|
|
1604
|
+
} catch {
|
|
1605
|
+
return undefined;
|
|
1606
|
+
}
|
|
1607
|
+
},
|
|
1608
|
+
);
|
|
1609
|
+
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1610
|
+
} else if (cap.source === 'request_body_regex') {
|
|
1611
|
+
const v = captureObservedRequestValueFromObserved(
|
|
1612
|
+
bootstrapTokens.observedRequests ?? [],
|
|
1613
|
+
cap,
|
|
1614
|
+
(req) => {
|
|
1615
|
+
if (typeof req.body !== 'string') return undefined;
|
|
1616
|
+
try {
|
|
1617
|
+
const match = req.body.match(new RegExp(cap.pattern));
|
|
1618
|
+
return match?.[cap.group ?? 1] ?? match?.[0];
|
|
1619
|
+
} catch {
|
|
1620
|
+
return undefined;
|
|
1621
|
+
}
|
|
1622
|
+
},
|
|
1623
|
+
);
|
|
1624
|
+
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1232
1625
|
}
|
|
1233
1626
|
}
|
|
1234
1627
|
return state;
|