imprint-mcp 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/examples/google-flights/README.md +0 -2
- package/examples/google-flights/_shared/flights_request.ts +4 -10
- package/examples/google-flights/get_flight_booking_details/index.ts +2 -5
- package/examples/google-flights/get_flight_booking_details/parser.ts +0 -8
- package/examples/google-flights/get_flight_booking_details/workflow.json +2 -5
- package/examples/google-flights/get_flight_calendar_prices/index.ts +2 -5
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +4 -8
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +2 -5
- package/examples/google-flights/lookup_airport/index.ts +0 -3
- package/examples/google-flights/lookup_airport/parser.ts +1 -8
- package/examples/google-flights/lookup_airport/workflow.json +0 -3
- package/examples/google-flights/search_flights/index.ts +7 -62
- package/examples/google-flights/search_flights/request-transform.ts +0 -45
- package/examples/google-flights/search_flights/workflow.json +7 -62
- package/package.json +1 -1
- package/prompts/build-planning.md +1 -1
- package/prompts/compile-agent.md +3 -5
- package/prompts/prereq-builder.md +1 -2
- package/src/imprint/backend-ladder.ts +47 -436
- package/src/imprint/cdp-browser-fetch.ts +6 -176
- package/src/imprint/cdp-jar-cache.ts +10 -105
- package/src/imprint/compile-tools.ts +2 -2
- package/src/imprint/mcp-server.ts +65 -152
- package/src/imprint/probe-backends.ts +10 -41
- package/src/imprint/runtime.ts +12 -24
- package/src/imprint/stealth-fetch.ts +0 -71
- package/src/imprint/stealth-token-cache.ts +1 -38
- package/src/imprint/types.ts +0 -45
|
@@ -10,9 +10,6 @@
|
|
|
10
10
|
* cached (~90 min) so one bootstrap serves many searches. Auto mode always
|
|
11
11
|
* splices this right after `fetch`; it only RUNS when `fetch` escalates, so a
|
|
12
12
|
* healthy plain-API site never pays for it.
|
|
13
|
-
* - `cdp-replay` — live Chrome API replay. Reused by MCP/compile sessions
|
|
14
|
-
* when a workflow needs browser-observed request state or sustained protected
|
|
15
|
-
* POSTs.
|
|
16
13
|
* - `stealth-fetch` — Playwright stealth bootstrap + native fetch (token tier).
|
|
17
14
|
* - `playbook` — DOM-walk LAST RESORT (needs a compiled playbook.yaml).
|
|
18
15
|
*/
|
|
@@ -25,7 +22,6 @@ import {
|
|
|
25
22
|
type CdpBrowserFetchOptions,
|
|
26
23
|
type MintedJar,
|
|
27
24
|
createCdpBrowserFetch,
|
|
28
|
-
jarHasAkamaiValidationSignals,
|
|
29
25
|
} from './cdp-browser-fetch.ts';
|
|
30
26
|
import {
|
|
31
27
|
clearJar,
|
|
@@ -167,6 +163,10 @@ export function __resetCompileCdpPoolForTest(): void {
|
|
|
167
163
|
compileCdpPool.clear();
|
|
168
164
|
}
|
|
169
165
|
|
|
166
|
+
function cdpToolResultImpliesDeadSession(result: ToolResult): boolean {
|
|
167
|
+
return !result.ok && result.error === 'NETWORK';
|
|
168
|
+
}
|
|
169
|
+
|
|
170
170
|
/** Freshness window for the file-backed compile-time stealth token. Matches
|
|
171
171
|
* stealth-fetch's in-process `maxTokenAgeSeconds` default so a reused token is
|
|
172
172
|
* not immediately considered stale by `createStealthFetch`. */
|
|
@@ -217,30 +217,6 @@ function withWorkflowDefaults(
|
|
|
217
217
|
return paramsWithDefaults;
|
|
218
218
|
}
|
|
219
219
|
|
|
220
|
-
async function withWorkflowPreparedParams(
|
|
221
|
-
tool: ResolvedTool,
|
|
222
|
-
params: Record<string, string | number | boolean>,
|
|
223
|
-
): Promise<Record<string, string | number | boolean>> {
|
|
224
|
-
const preparedParams = withWorkflowDefaults(tool.workflow, params);
|
|
225
|
-
const modulePath = tool.workflow.requestTransformModule;
|
|
226
|
-
if (!modulePath) return preparedParams;
|
|
227
|
-
try {
|
|
228
|
-
const mod = await import(pathResolve(tool.dir, modulePath));
|
|
229
|
-
if (typeof mod.prepareParams !== 'function') return preparedParams;
|
|
230
|
-
const extra = await mod.prepareParams(preparedParams);
|
|
231
|
-
if (!extra || typeof extra !== 'object') return preparedParams;
|
|
232
|
-
for (const [key, value] of Object.entries(extra)) {
|
|
233
|
-
if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') {
|
|
234
|
-
preparedParams[key] = value;
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
} catch {
|
|
238
|
-
// Non-fatal: request transforms are optional, and executeWorkflow will surface
|
|
239
|
-
// any still-missing placeholders with its normal STATE_MISSING diagnostics.
|
|
240
|
-
}
|
|
241
|
-
return preparedParams;
|
|
242
|
-
}
|
|
243
|
-
|
|
244
220
|
/** Await the per-origin min spacing before a compile-path live request. The
|
|
245
221
|
* first call to an origin never waits (last=0); subsequent ones within the
|
|
246
222
|
* window are delayed so the suite paces itself under the rate-flag. */
|
|
@@ -363,7 +339,7 @@ export async function runWithLadder(
|
|
|
363
339
|
result = await runCdpReplay(tool, params, options?.cdpPool);
|
|
364
340
|
break;
|
|
365
341
|
case 'stealth-fetch': {
|
|
366
|
-
const paramsWithDefaults =
|
|
342
|
+
const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
|
|
367
343
|
const sf = await ensureStealthFetch(tool, stealthCache, paramsWithDefaults);
|
|
368
344
|
// When the workflow declares a bootstrap block, mint its declared
|
|
369
345
|
// session-token state (CSRF cookies etc.) from the SAME stealth
|
|
@@ -371,21 +347,17 @@ export async function runWithLadder(
|
|
|
371
347
|
// workflow escalating here from fetch-bootstrap loses the
|
|
372
348
|
// ${state.X} its requests need — the gap that made bootstrap-block
|
|
373
349
|
// tools on anti-bot sites unverifiable.
|
|
374
|
-
const tokens = tool.workflow.bootstrap ? await sf.ensureBootstrapped() : undefined;
|
|
375
350
|
const initialState = tool.workflow.bootstrap
|
|
376
|
-
? await stealthBootstrapState(sf, tool.workflow.bootstrap
|
|
351
|
+
? await stealthBootstrapState(sf, tool.workflow.bootstrap)
|
|
377
352
|
: undefined;
|
|
378
|
-
result = await tool.toolFn(paramsWithDefaults, {
|
|
379
|
-
fetchImpl: tokens ? makeObservedResponseFetch(tokens, sf.fetchImpl) : sf.fetchImpl,
|
|
380
|
-
initialState,
|
|
381
|
-
});
|
|
353
|
+
result = await tool.toolFn(paramsWithDefaults, { fetchImpl: sf.fetchImpl, initialState });
|
|
382
354
|
break;
|
|
383
355
|
}
|
|
384
356
|
case 'playbook': {
|
|
385
357
|
// DOM-walk last resort (the anti-bot API path is fetch-bootstrap, above).
|
|
386
358
|
// Apply workflow.json's declared parameter defaults — runPlaybook
|
|
387
359
|
// validates and throws on absent values regardless of declared defaults.
|
|
388
|
-
const paramsWithDefaults =
|
|
360
|
+
const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
|
|
389
361
|
result = await runPlaybook({
|
|
390
362
|
playbook: playbookPath(assetRoot, tool.site, tool.dir),
|
|
391
363
|
params: paramsWithDefaults,
|
|
@@ -554,9 +526,11 @@ export function effectiveAutoLadder(
|
|
|
554
526
|
const fbIdx = next.indexOf('fetch-bootstrap');
|
|
555
527
|
if (fbIdx !== -1) next.splice(fbIdx + 1, 0, 'cdp-replay');
|
|
556
528
|
}
|
|
557
|
-
// For
|
|
558
|
-
//
|
|
559
|
-
//
|
|
529
|
+
// For a MULTI-step state-changing anti-bot workflow, plain-fetch rungs are not
|
|
530
|
+
// just doomed — their tarpitted .act attempts BURN the per-IP rate budget
|
|
531
|
+
// before cdp-replay even runs, which can flag the IP and make cdp-replay tarpit
|
|
532
|
+
// too. Front-load cdp-replay for these so the live browser handles every
|
|
533
|
+
// protected POST from a clean slate.
|
|
560
534
|
if (prefersCdpReplayFirst(workflow)) {
|
|
561
535
|
const i = next.indexOf('cdp-replay');
|
|
562
536
|
if (i > 0) {
|
|
@@ -567,20 +541,15 @@ export function effectiveAutoLadder(
|
|
|
567
541
|
return next;
|
|
568
542
|
}
|
|
569
543
|
|
|
570
|
-
/**
|
|
571
|
-
*
|
|
572
|
-
*
|
|
573
|
-
*
|
|
574
|
-
*
|
|
575
|
-
*
|
|
576
|
-
*
|
|
577
|
-
*
|
|
578
|
-
* bootstrap/state signal). Plain-fetch replay can't sustain those protected
|
|
579
|
-
* POST sequences and can burn the per-IP budget before CDP runs.
|
|
580
|
-
*/
|
|
544
|
+
/** A multi-step, state-changing, anti-bot workflow: ≥2 mutating requests AND an
|
|
545
|
+
* anti-bot signal (a bootstrap block, or requests that depend on captured
|
|
546
|
+
* `${state.X}` tokens). Plain-fetch replay can't sustain its sequence of
|
|
547
|
+
* protected POSTs (each self-invalidates `_abck`); only the live-browser
|
|
548
|
+
* cdp-replay rung can — and it should run FIRST so the doomed fetch /
|
|
549
|
+
* fetch-bootstrap attempts don't pre-burn the per-IP .act budget. A plain
|
|
550
|
+
* multi-POST REST API (no bootstrap, no `${state.X}`) is NOT matched, so it
|
|
551
|
+
* keeps the cheap fetch-first order. */
|
|
581
552
|
export function prefersCdpReplayFirst(workflow: Workflow): boolean {
|
|
582
|
-
if (workflow.bootstrap?.captures?.some(isObservedRequestBootstrapCapture)) return true;
|
|
583
|
-
|
|
584
553
|
const mutating = workflow.requests.filter((r) => {
|
|
585
554
|
const m = (r.method ?? 'GET').toUpperCase();
|
|
586
555
|
return r.effect === 'unsafe' || m === 'POST' || m === 'PUT' || m === 'PATCH' || m === 'DELETE';
|
|
@@ -595,14 +564,6 @@ export function prefersCdpReplayFirst(workflow: Workflow): boolean {
|
|
|
595
564
|
return Boolean(workflow.bootstrap) || hasStateRefs;
|
|
596
565
|
}
|
|
597
566
|
|
|
598
|
-
function isObservedRequestBootstrapCapture(capture: BootstrapCapture): boolean {
|
|
599
|
-
return (
|
|
600
|
-
capture.source === 'request_header' ||
|
|
601
|
-
capture.source === 'request_url_regex' ||
|
|
602
|
-
capture.source === 'request_body_regex'
|
|
603
|
-
);
|
|
604
|
-
}
|
|
605
|
-
|
|
606
567
|
function nextStateMissingBackend(
|
|
607
568
|
ladder: ConcreteBackend[],
|
|
608
569
|
backend: ConcreteBackend,
|
|
@@ -673,11 +634,10 @@ async function getOrMintCdpJar(
|
|
|
673
634
|
bootstrapUrl: string | undefined,
|
|
674
635
|
siteDir: string,
|
|
675
636
|
forceFresh: boolean,
|
|
676
|
-
workflow?: Workflow,
|
|
677
637
|
): Promise<MintedJar | null> {
|
|
678
638
|
if (cdpJarMinterForTest) return cdpJarMinterForTest(baseUrl, bootstrapUrl);
|
|
679
639
|
if (!forceFresh) {
|
|
680
|
-
let cached = loadJar(siteDir
|
|
640
|
+
let cached = loadJar(siteDir);
|
|
681
641
|
// A recording NEWER than the cached jar supersedes it — e.g. the user
|
|
682
642
|
// re-recorded on a new IP, so the cached (old-IP) jar would tarpit. Drop the
|
|
683
643
|
// stale cache and re-seed from the fresh recording below.
|
|
@@ -688,18 +648,7 @@ async function getOrMintCdpJar(
|
|
|
688
648
|
// many sequential .act), strictly better than a synthetic cdp-browser mint
|
|
689
649
|
// (low-trust → tarpitted even on a fresh IP). "The recording IS the
|
|
690
650
|
// executable." Reuse the `rec` stat above so we don't re-glob.
|
|
691
|
-
if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl))
|
|
692
|
-
cached = loadJar(siteDir, bootstrapUrl);
|
|
693
|
-
}
|
|
694
|
-
if (cached && workflow?.bootstrap) {
|
|
695
|
-
const missing = missingObservedRequestCaptureNames(workflow.bootstrap, cached);
|
|
696
|
-
if (missing.length > 0) {
|
|
697
|
-
log(
|
|
698
|
-
`cached jar is missing required browser-observed capture(s): ${missing.join(', ')} — re-mint`,
|
|
699
|
-
);
|
|
700
|
-
cached = null;
|
|
701
|
-
}
|
|
702
|
-
}
|
|
651
|
+
if (!cached && seedJarFromRecording(siteDir, rec, bootstrapUrl)) cached = loadJar(siteDir);
|
|
703
652
|
if (cached) {
|
|
704
653
|
const provenance =
|
|
705
654
|
cached.source === 'recording'
|
|
@@ -718,12 +667,10 @@ async function getOrMintCdpJar(
|
|
|
718
667
|
}
|
|
719
668
|
let cf: CdpBrowserFetch | undefined;
|
|
720
669
|
try {
|
|
721
|
-
cf =
|
|
722
|
-
const jar = await
|
|
723
|
-
if (jar.abckFlag !== '0'
|
|
670
|
+
cf = createCdpBrowserFetch({ baseUrl, bootstrapUrl });
|
|
671
|
+
const jar = await cf.mintJar();
|
|
672
|
+
if (jar.abckFlag !== '0') {
|
|
724
673
|
log(`cdp jar minted with _abck~${jar.abckFlag}~ (not validated) — replay may be rejected`);
|
|
725
|
-
} else if (!jarHasAkamaiValidationSignals(jar.cookies)) {
|
|
726
|
-
log(`cdp jar minted generic bootstrap state (html=${jar.html.length}b)`);
|
|
727
674
|
}
|
|
728
675
|
saveJar(siteDir, jar);
|
|
729
676
|
return jar;
|
|
@@ -735,43 +682,6 @@ async function getOrMintCdpJar(
|
|
|
735
682
|
}
|
|
736
683
|
}
|
|
737
684
|
|
|
738
|
-
async function mintJarWithBootstrapWait(
|
|
739
|
-
cf: CdpBrowserFetch,
|
|
740
|
-
workflow: Workflow | undefined,
|
|
741
|
-
): Promise<MintedJar> {
|
|
742
|
-
let jar = await cf.mintJar();
|
|
743
|
-
const bootstrap = workflow?.bootstrap;
|
|
744
|
-
if (!bootstrap || requiredObservedRequestCaptures(bootstrap).length === 0) return jar;
|
|
745
|
-
|
|
746
|
-
const timeoutMs =
|
|
747
|
-
typeof bootstrap.timeoutMs === 'number' && bootstrap.timeoutMs > 0
|
|
748
|
-
? bootstrap.timeoutMs
|
|
749
|
-
: 30_000;
|
|
750
|
-
const deadline = Date.now() + timeoutMs;
|
|
751
|
-
let loggedWait = false;
|
|
752
|
-
|
|
753
|
-
while (Date.now() < deadline) {
|
|
754
|
-
const missing = missingObservedRequestCaptureNames(bootstrap, jar);
|
|
755
|
-
if (missing.length === 0) return jar;
|
|
756
|
-
if (!loggedWait) {
|
|
757
|
-
log(
|
|
758
|
-
`waiting up to ${timeoutMs}ms for browser-observed bootstrap request capture(s): ${missing.join(', ')}`,
|
|
759
|
-
);
|
|
760
|
-
loggedWait = true;
|
|
761
|
-
}
|
|
762
|
-
await sleepMs(Math.min(500, Math.max(1, deadline - Date.now())));
|
|
763
|
-
jar = await cf.mintJar();
|
|
764
|
-
}
|
|
765
|
-
|
|
766
|
-
const missing = missingObservedRequestCaptureNames(bootstrap, jar);
|
|
767
|
-
if (missing.length > 0) {
|
|
768
|
-
log(
|
|
769
|
-
`timed out waiting for browser-observed bootstrap request capture(s): ${missing.join(', ')}`,
|
|
770
|
-
);
|
|
771
|
-
}
|
|
772
|
-
return jar;
|
|
773
|
-
}
|
|
774
|
-
|
|
775
685
|
/** Replay transport for the bootstrap-then-fetch path: PLAIN fetch that presents
|
|
776
686
|
* the jar's exact UA (Akamai drops the jar on a UA mismatch). Cookies are
|
|
777
687
|
* attached by executeWorkflow's RuntimeCookieJar from bootstrappedCredentials,
|
|
@@ -811,81 +721,6 @@ function makeProxyFetch(): typeof fetch | undefined {
|
|
|
811
721
|
)) as typeof fetch;
|
|
812
722
|
}
|
|
813
723
|
|
|
814
|
-
type ObservedResponseSource = {
|
|
815
|
-
observedRequests?: Array<{
|
|
816
|
-
method: string;
|
|
817
|
-
url: string;
|
|
818
|
-
body?: string;
|
|
819
|
-
source?: 'browser' | 'replay';
|
|
820
|
-
response?: {
|
|
821
|
-
status: number;
|
|
822
|
-
headers: Record<string, string>;
|
|
823
|
-
body?: string;
|
|
824
|
-
};
|
|
825
|
-
}>;
|
|
826
|
-
};
|
|
827
|
-
|
|
828
|
-
function makeObservedResponseFetch(
|
|
829
|
-
source: ObservedResponseSource,
|
|
830
|
-
fallbackFetch: typeof fetch,
|
|
831
|
-
): typeof fetch {
|
|
832
|
-
return (async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
|
|
833
|
-
const url =
|
|
834
|
-
typeof input === 'string' ? input : input instanceof URL ? input.toString() : input.url;
|
|
835
|
-
const method = (init?.method ?? 'GET').toUpperCase();
|
|
836
|
-
const body = observedRequestBody(init?.body);
|
|
837
|
-
const observed = findObservedResponse(source, method, url, body);
|
|
838
|
-
if (observed) {
|
|
839
|
-
log(`using bootstrap-observed response for ${method} ${redactUrlForLog(url)}`);
|
|
840
|
-
return new Response(observed.body ?? '', {
|
|
841
|
-
status: observed.status,
|
|
842
|
-
headers: new Headers(observed.headers),
|
|
843
|
-
});
|
|
844
|
-
}
|
|
845
|
-
return fallbackFetch(input, init);
|
|
846
|
-
}) as typeof fetch;
|
|
847
|
-
}
|
|
848
|
-
|
|
849
|
-
function findObservedResponse(
|
|
850
|
-
source: ObservedResponseSource,
|
|
851
|
-
method: string,
|
|
852
|
-
url: string,
|
|
853
|
-
body: string | undefined,
|
|
854
|
-
): { status: number; headers: Record<string, string>; body?: string } | undefined {
|
|
855
|
-
const observed = source.observedRequests ?? [];
|
|
856
|
-
for (let i = observed.length - 1; i >= 0; i--) {
|
|
857
|
-
const req = observed[i];
|
|
858
|
-
if (!req?.response || req.response.body === undefined) continue;
|
|
859
|
-
if (req.source === 'replay') continue;
|
|
860
|
-
if (req.method.toUpperCase() !== method) continue;
|
|
861
|
-
if (req.url !== url) continue;
|
|
862
|
-
// Some CDP requestWillBeSent events omit postData even though the matching
|
|
863
|
-
// response body is available. If the observed body exists, require an exact
|
|
864
|
-
// body match. If CDP omitted it, fall back to exact method+URL; Google-style
|
|
865
|
-
// batchexecute URLs carry session/request ids, so this still avoids serving a
|
|
866
|
-
// response from a different bootstrap request.
|
|
867
|
-
if (req.body !== undefined && req.body !== (body ?? undefined)) continue;
|
|
868
|
-
return req.response;
|
|
869
|
-
}
|
|
870
|
-
return undefined;
|
|
871
|
-
}
|
|
872
|
-
|
|
873
|
-
function observedRequestBody(body: RequestInit['body'] | undefined): string | undefined {
|
|
874
|
-
if (body === undefined || body === null) return undefined;
|
|
875
|
-
if (typeof body === 'string') return body;
|
|
876
|
-
if (body instanceof URLSearchParams) return body.toString();
|
|
877
|
-
return undefined;
|
|
878
|
-
}
|
|
879
|
-
|
|
880
|
-
function redactUrlForLog(url: string): string {
|
|
881
|
-
try {
|
|
882
|
-
const u = new URL(url);
|
|
883
|
-
return `${u.origin}${u.pathname}`;
|
|
884
|
-
} catch {
|
|
885
|
-
return url.slice(0, 80);
|
|
886
|
-
}
|
|
887
|
-
}
|
|
888
|
-
|
|
889
724
|
/** A replay error that means the JAR is bad (clear it + re-mint), as opposed to a
|
|
890
725
|
* transient IP rate-flag (NETWORK/RATE_LIMITED — a fresh jar won't help; back off). */
|
|
891
726
|
function jarLikelyStale(result: ToolResult): boolean {
|
|
@@ -925,14 +760,14 @@ async function runFetchBootstrap(
|
|
|
925
760
|
values: {},
|
|
926
761
|
storage: [],
|
|
927
762
|
};
|
|
928
|
-
const paramsWithDefaults =
|
|
763
|
+
const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
|
|
929
764
|
const bootstrapUrl = tool.workflow.bootstrap
|
|
930
765
|
? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
|
|
931
766
|
: undefined;
|
|
932
767
|
const siteDir = pathResolve(tool.dir, '..');
|
|
933
768
|
|
|
934
769
|
for (let attempt = 0; attempt < 2; attempt++) {
|
|
935
|
-
const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0
|
|
770
|
+
const jar = await getOrMintCdpJar(baseUrl, bootstrapUrl, siteDir, attempt > 0);
|
|
936
771
|
if (!jar) {
|
|
937
772
|
// Couldn't even launch the bootstrap browser → let the ladder escalate.
|
|
938
773
|
const stateMissing = bootstrapFailureStateMissingResult(
|
|
@@ -957,7 +792,7 @@ async function runFetchBootstrap(
|
|
|
957
792
|
// recording-seeded or cached jar is validated:true by construction, so the
|
|
958
793
|
// cheap plain-fetch path is untouched; `=== false` (not falsy) leaves jars
|
|
959
794
|
// without the field — older caches / test stubs — on the original path.
|
|
960
|
-
if (jar.validated === false
|
|
795
|
+
if (jar.validated === false) {
|
|
961
796
|
log(
|
|
962
797
|
'fetch-bootstrap: minted jar unvalidated (no _abck~0~/bm_sv) — plain-fetch replay doomed; escalating to cdp-replay',
|
|
963
798
|
);
|
|
@@ -1002,7 +837,7 @@ async function runFetchBootstrap(
|
|
|
1002
837
|
const result = await tool.toolFn(paramsWithDefaults, {
|
|
1003
838
|
credentials: bootstrappedCredentials,
|
|
1004
839
|
initialState: captureResult.state,
|
|
1005
|
-
fetchImpl:
|
|
840
|
+
fetchImpl: makeJarUaFetch(jar.ua),
|
|
1006
841
|
});
|
|
1007
842
|
|
|
1008
843
|
if (result.ok) return result;
|
|
@@ -1060,7 +895,7 @@ async function runCdpReplay(
|
|
|
1060
895
|
values: {},
|
|
1061
896
|
storage: [],
|
|
1062
897
|
};
|
|
1063
|
-
const paramsWithDefaults =
|
|
898
|
+
const paramsWithDefaults = withWorkflowDefaults(tool.workflow, params);
|
|
1064
899
|
const bootstrapUrl = tool.workflow.bootstrap
|
|
1065
900
|
? substituteString(tool.workflow.bootstrap.url, paramsWithDefaults, credentials, [])
|
|
1066
901
|
: undefined;
|
|
@@ -1068,12 +903,6 @@ async function runCdpReplay(
|
|
|
1068
903
|
const siteDir = pathResolve(tool.dir, '..');
|
|
1069
904
|
const poolKey = tool.site;
|
|
1070
905
|
const pooled = cdpPool?.get(poolKey);
|
|
1071
|
-
if (pooled && bootstrapUrl && pooled.bootstrapUrl !== bootstrapUrl) {
|
|
1072
|
-
log(
|
|
1073
|
-
`cdp-replay: reusing pooled Chrome session for new bootstrap (${pooled.bootstrapUrl} → ${bootstrapUrl})`,
|
|
1074
|
-
);
|
|
1075
|
-
pooled.setBootstrapUrl(bootstrapUrl);
|
|
1076
|
-
}
|
|
1077
906
|
const ownsSession = !pooled;
|
|
1078
907
|
|
|
1079
908
|
let cf: CdpBrowserFetch;
|
|
@@ -1099,7 +928,7 @@ async function runCdpReplay(
|
|
|
1099
928
|
}
|
|
1100
929
|
|
|
1101
930
|
try {
|
|
1102
|
-
const jar = await
|
|
931
|
+
const jar = await cf.mintJar();
|
|
1103
932
|
const bootstrappedCredentials: CredentialStore = {
|
|
1104
933
|
...credentials,
|
|
1105
934
|
cookies: [
|
|
@@ -1131,7 +960,7 @@ async function runCdpReplay(
|
|
|
1131
960
|
const result = await tool.toolFn(paramsWithDefaults, {
|
|
1132
961
|
credentials: bootstrappedCredentials,
|
|
1133
962
|
initialState: captureResult.state,
|
|
1134
|
-
fetchImpl:
|
|
963
|
+
fetchImpl: cf.fetchImpl,
|
|
1135
964
|
});
|
|
1136
965
|
|
|
1137
966
|
if (result.ok) {
|
|
@@ -1145,13 +974,11 @@ async function runCdpReplay(
|
|
|
1145
974
|
if (!cdpPool && ownsSession) await cf.close();
|
|
1146
975
|
}
|
|
1147
976
|
} else {
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
cdpPool.set(poolKey, cf);
|
|
1154
|
-
} else if (!cdpPool && ownsSession) {
|
|
977
|
+
if (ownsSession) {
|
|
978
|
+
await cf.close();
|
|
979
|
+
} else if (cdpPool && cdpToolResultImpliesDeadSession(result)) {
|
|
980
|
+
cdpPool.delete(poolKey);
|
|
981
|
+
log('cdp-replay: evicted degraded session from pool');
|
|
1155
982
|
await cf.close();
|
|
1156
983
|
}
|
|
1157
984
|
}
|
|
@@ -1223,45 +1050,6 @@ function jarBootstrapCaptureState(
|
|
|
1223
1050
|
),
|
|
1224
1051
|
};
|
|
1225
1052
|
}
|
|
1226
|
-
} else if (capture.source === 'request_header') {
|
|
1227
|
-
const value = captureObservedRequestHeader(jar, capture);
|
|
1228
|
-
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1229
|
-
else if (capture.required !== false) {
|
|
1230
|
-
return {
|
|
1231
|
-
ok: false,
|
|
1232
|
-
result: bootstrapCaptureMissingResult(
|
|
1233
|
-
capture,
|
|
1234
|
-
`Required bootstrap capture "${capture.name}" (request_header ${capture.header}) did not match an observed browser request.`,
|
|
1235
|
-
'producer_ran_value_absent',
|
|
1236
|
-
),
|
|
1237
|
-
};
|
|
1238
|
-
}
|
|
1239
|
-
} else if (capture.source === 'request_url_regex') {
|
|
1240
|
-
const value = captureObservedRequestUrlRegex(jar, capture);
|
|
1241
|
-
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1242
|
-
else if (capture.required !== false) {
|
|
1243
|
-
return {
|
|
1244
|
-
ok: false,
|
|
1245
|
-
result: bootstrapCaptureMissingResult(
|
|
1246
|
-
capture,
|
|
1247
|
-
`Required bootstrap capture "${capture.name}" (request_url_regex ${capture.pattern}) did not match an observed browser request.`,
|
|
1248
|
-
'producer_ran_value_absent',
|
|
1249
|
-
),
|
|
1250
|
-
};
|
|
1251
|
-
}
|
|
1252
|
-
} else if (capture.source === 'request_body_regex') {
|
|
1253
|
-
const value = captureObservedRequestBodyRegex(jar, capture);
|
|
1254
|
-
if (value !== undefined && value !== null && value !== '') state[capture.name] = value;
|
|
1255
|
-
else if (capture.required !== false) {
|
|
1256
|
-
return {
|
|
1257
|
-
ok: false,
|
|
1258
|
-
result: bootstrapCaptureMissingResult(
|
|
1259
|
-
capture,
|
|
1260
|
-
`Required bootstrap capture "${capture.name}" (request_body_regex ${capture.pattern}) did not match an observed browser request body.`,
|
|
1261
|
-
'producer_ran_value_absent',
|
|
1262
|
-
),
|
|
1263
|
-
};
|
|
1264
|
-
}
|
|
1265
1053
|
} else if (capture.required !== false) {
|
|
1266
1054
|
// response_header / dom_* can't be resolved from a closed browser jar.
|
|
1267
1055
|
return {
|
|
@@ -1277,134 +1065,6 @@ function jarBootstrapCaptureState(
|
|
|
1277
1065
|
return { ok: true, state };
|
|
1278
1066
|
}
|
|
1279
1067
|
|
|
1280
|
-
function captureObservedRequestHeader(
|
|
1281
|
-
jar: MintedJar,
|
|
1282
|
-
capture: Extract<BootstrapCapture, { source: 'request_header' }>,
|
|
1283
|
-
): string | string[] | undefined {
|
|
1284
|
-
return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) =>
|
|
1285
|
-
headerValue(req.headers, capture.header),
|
|
1286
|
-
);
|
|
1287
|
-
}
|
|
1288
|
-
|
|
1289
|
-
function captureObservedRequestUrlRegex(
|
|
1290
|
-
jar: MintedJar,
|
|
1291
|
-
capture: Extract<BootstrapCapture, { source: 'request_url_regex' }>,
|
|
1292
|
-
): string | string[] | undefined {
|
|
1293
|
-
return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) => {
|
|
1294
|
-
try {
|
|
1295
|
-
return req.url.match(new RegExp(capture.pattern))?.[capture.group ?? 1];
|
|
1296
|
-
} catch {
|
|
1297
|
-
return undefined;
|
|
1298
|
-
}
|
|
1299
|
-
});
|
|
1300
|
-
}
|
|
1301
|
-
|
|
1302
|
-
function captureObservedRequestBodyRegex(
|
|
1303
|
-
jar: MintedJar,
|
|
1304
|
-
capture: Extract<BootstrapCapture, { source: 'request_body_regex' }>,
|
|
1305
|
-
): string | string[] | undefined {
|
|
1306
|
-
return captureObservedRequestValueFromObserved(jar.observedRequests ?? [], capture, (req) => {
|
|
1307
|
-
if (typeof req.body !== 'string') return undefined;
|
|
1308
|
-
try {
|
|
1309
|
-
const match = req.body.match(new RegExp(capture.pattern));
|
|
1310
|
-
return match?.[capture.group ?? 1] ?? match?.[0];
|
|
1311
|
-
} catch {
|
|
1312
|
-
return undefined;
|
|
1313
|
-
}
|
|
1314
|
-
});
|
|
1315
|
-
}
|
|
1316
|
-
|
|
1317
|
-
function requiredObservedRequestCaptures(
|
|
1318
|
-
bootstrap: NonNullable<Workflow['bootstrap']>,
|
|
1319
|
-
): Array<
|
|
1320
|
-
Extract<
|
|
1321
|
-
BootstrapCapture,
|
|
1322
|
-
{ source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
|
|
1323
|
-
>
|
|
1324
|
-
> {
|
|
1325
|
-
return (bootstrap.captures ?? []).filter(
|
|
1326
|
-
(
|
|
1327
|
-
capture,
|
|
1328
|
-
): capture is Extract<
|
|
1329
|
-
BootstrapCapture,
|
|
1330
|
-
{ source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
|
|
1331
|
-
> =>
|
|
1332
|
-
capture.required !== false &&
|
|
1333
|
-
(capture.source === 'request_header' ||
|
|
1334
|
-
capture.source === 'request_url_regex' ||
|
|
1335
|
-
capture.source === 'request_body_regex'),
|
|
1336
|
-
);
|
|
1337
|
-
}
|
|
1338
|
-
|
|
1339
|
-
function missingObservedRequestCaptureNames(
|
|
1340
|
-
bootstrap: NonNullable<Workflow['bootstrap']>,
|
|
1341
|
-
jar: MintedJar,
|
|
1342
|
-
): string[] {
|
|
1343
|
-
const missing: string[] = [];
|
|
1344
|
-
for (const capture of requiredObservedRequestCaptures(bootstrap)) {
|
|
1345
|
-
const value =
|
|
1346
|
-
capture.source === 'request_header'
|
|
1347
|
-
? captureObservedRequestHeader(jar, capture)
|
|
1348
|
-
: capture.source === 'request_url_regex'
|
|
1349
|
-
? captureObservedRequestUrlRegex(jar, capture)
|
|
1350
|
-
: captureObservedRequestBodyRegex(jar, capture);
|
|
1351
|
-
if (value === undefined || value === '' || (Array.isArray(value) && value.length === 0)) {
|
|
1352
|
-
missing.push(capture.name);
|
|
1353
|
-
}
|
|
1354
|
-
}
|
|
1355
|
-
return missing;
|
|
1356
|
-
}
|
|
1357
|
-
|
|
1358
|
-
function captureObservedRequestValueFromObserved(
|
|
1359
|
-
observed: Array<{
|
|
1360
|
-
method: string;
|
|
1361
|
-
url: string;
|
|
1362
|
-
headers: Record<string, string>;
|
|
1363
|
-
body?: string;
|
|
1364
|
-
source?: 'browser' | 'replay';
|
|
1365
|
-
}>,
|
|
1366
|
-
capture: Extract<
|
|
1367
|
-
BootstrapCapture,
|
|
1368
|
-
{ source: 'request_header' | 'request_url_regex' | 'request_body_regex' }
|
|
1369
|
-
>,
|
|
1370
|
-
pickValue: (req: {
|
|
1371
|
-
method: string;
|
|
1372
|
-
url: string;
|
|
1373
|
-
headers: Record<string, string>;
|
|
1374
|
-
body?: string;
|
|
1375
|
-
source?: 'browser' | 'replay';
|
|
1376
|
-
}) => string | undefined,
|
|
1377
|
-
): string | string[] | undefined {
|
|
1378
|
-
let urlRe: RegExp | null = null;
|
|
1379
|
-
if (capture.urlPattern) {
|
|
1380
|
-
try {
|
|
1381
|
-
urlRe = new RegExp(capture.urlPattern);
|
|
1382
|
-
} catch {
|
|
1383
|
-
return undefined;
|
|
1384
|
-
}
|
|
1385
|
-
}
|
|
1386
|
-
const method = capture.method?.toUpperCase();
|
|
1387
|
-
const matches: string[] = [];
|
|
1388
|
-
for (const req of observed) {
|
|
1389
|
-
if (req.source === 'replay') continue;
|
|
1390
|
-
if (method && req.method.toUpperCase() !== method) continue;
|
|
1391
|
-
if (urlRe && !urlRe.test(req.url)) continue;
|
|
1392
|
-
const value = pickValue(req);
|
|
1393
|
-
if (value !== undefined && value !== '') matches.push(value);
|
|
1394
|
-
}
|
|
1395
|
-
if (capture.mode === 'all') return matches.length ? matches : undefined;
|
|
1396
|
-
if (capture.mode === 'first') return matches[0];
|
|
1397
|
-
return matches[matches.length - 1];
|
|
1398
|
-
}
|
|
1399
|
-
|
|
1400
|
-
function headerValue(headers: Record<string, string>, header: string): string | undefined {
|
|
1401
|
-
const headerName = header.toLowerCase();
|
|
1402
|
-
for (const [name, value] of Object.entries(headers)) {
|
|
1403
|
-
if (name.toLowerCase() === headerName) return value;
|
|
1404
|
-
}
|
|
1405
|
-
return undefined;
|
|
1406
|
-
}
|
|
1407
|
-
|
|
1408
1068
|
function bootstrapFailureStateMissingResult(
|
|
1409
1069
|
workflow: Workflow,
|
|
1410
1070
|
message: string,
|
|
@@ -1532,12 +1192,6 @@ export async function evaluateBootstrapCapture(
|
|
|
1532
1192
|
},
|
|
1533
1193
|
{ origin: capture.origin, key: capture.key },
|
|
1534
1194
|
);
|
|
1535
|
-
case 'request_header':
|
|
1536
|
-
return undefined;
|
|
1537
|
-
case 'request_url_regex':
|
|
1538
|
-
return undefined;
|
|
1539
|
-
case 'request_body_regex':
|
|
1540
|
-
return undefined;
|
|
1541
1195
|
case 'cookie':
|
|
1542
1196
|
return undefined;
|
|
1543
1197
|
}
|
|
@@ -1545,38 +1199,30 @@ export async function evaluateBootstrapCapture(
|
|
|
1545
1199
|
|
|
1546
1200
|
/** Per-site stealth fetcher; bootstrap pays its ~12s once per process. */
|
|
1547
1201
|
/** Mint `${state.X}` values from the stealth bootstrap session for a workflow
|
|
1548
|
-
* that declares a bootstrap block. Satisfies `cookie`, `html_regex`,
|
|
1549
|
-
* `response_header
|
|
1550
|
-
*
|
|
1551
|
-
*
|
|
1552
|
-
* checks against the session resolves.
|
|
1202
|
+
* that declares a bootstrap block. Satisfies `cookie`, `html_regex`, and
|
|
1203
|
+
* `response_header` captures from the cookies / HTML / response headers the
|
|
1204
|
+
* stealth navigation minted — all one consistent session as the transport
|
|
1205
|
+
* cookies, so a token the later API POST checks against the session resolves.
|
|
1553
1206
|
* `dom_*` / storage sources need a live page and are left for the
|
|
1554
1207
|
* fetch-bootstrap rung (the compile prompt steers replay-safe session tokens
|
|
1555
1208
|
* to cookie/html_regex, which this covers). */
|
|
1556
1209
|
async function stealthBootstrapState(
|
|
1557
1210
|
sf: StealthFetch,
|
|
1558
1211
|
bootstrap: NonNullable<ResolvedTool['workflow']['bootstrap']>,
|
|
1559
|
-
tokens?: TokenCache,
|
|
1560
1212
|
): Promise<Record<string, unknown>> {
|
|
1561
1213
|
const state: Record<string, unknown> = {};
|
|
1562
1214
|
const captures = bootstrap.captures ?? [];
|
|
1563
1215
|
const supported = captures.filter(
|
|
1564
|
-
(c) =>
|
|
1565
|
-
c.source === 'cookie' ||
|
|
1566
|
-
c.source === 'html_regex' ||
|
|
1567
|
-
c.source === 'response_header' ||
|
|
1568
|
-
c.source === 'request_header' ||
|
|
1569
|
-
c.source === 'request_url_regex' ||
|
|
1570
|
-
c.source === 'request_body_regex',
|
|
1216
|
+
(c) => c.source === 'cookie' || c.source === 'html_regex' || c.source === 'response_header',
|
|
1571
1217
|
);
|
|
1572
1218
|
if (supported.length === 0) return state;
|
|
1573
|
-
const
|
|
1219
|
+
const tokens = await sf.ensureBootstrapped();
|
|
1574
1220
|
for (const cap of supported) {
|
|
1575
1221
|
if (cap.source === 'cookie') {
|
|
1576
|
-
const hit =
|
|
1222
|
+
const hit = tokens.cookies.find((c) => c.name === cap.cookie);
|
|
1577
1223
|
if (hit) state[cap.name] = hit.value;
|
|
1578
1224
|
} else if (cap.source === 'html_regex') {
|
|
1579
|
-
const html =
|
|
1225
|
+
const html = tokens.bootstrapHtml ?? '';
|
|
1580
1226
|
try {
|
|
1581
1227
|
const m = html.match(new RegExp(cap.pattern));
|
|
1582
1228
|
const v = m?.[cap.group ?? 1];
|
|
@@ -1585,43 +1231,8 @@ async function stealthBootstrapState(
|
|
|
1585
1231
|
// invalid regex — leave unset; substitution will surface STATE_MISSING
|
|
1586
1232
|
}
|
|
1587
1233
|
} else if (cap.source === 'response_header') {
|
|
1588
|
-
const v =
|
|
1234
|
+
const v = tokens.bootstrapResponseHeaders?.[cap.header.toLowerCase()];
|
|
1589
1235
|
if (v !== undefined && v !== '') state[cap.name] = v;
|
|
1590
|
-
} else if (cap.source === 'request_header') {
|
|
1591
|
-
const v = captureObservedRequestValueFromObserved(
|
|
1592
|
-
bootstrapTokens.observedRequests ?? [],
|
|
1593
|
-
cap,
|
|
1594
|
-
(req) => headerValue(req.headers, cap.header),
|
|
1595
|
-
);
|
|
1596
|
-
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1597
|
-
} else if (cap.source === 'request_url_regex') {
|
|
1598
|
-
const v = captureObservedRequestValueFromObserved(
|
|
1599
|
-
bootstrapTokens.observedRequests ?? [],
|
|
1600
|
-
cap,
|
|
1601
|
-
(req) => {
|
|
1602
|
-
try {
|
|
1603
|
-
return req.url.match(new RegExp(cap.pattern))?.[cap.group ?? 1];
|
|
1604
|
-
} catch {
|
|
1605
|
-
return undefined;
|
|
1606
|
-
}
|
|
1607
|
-
},
|
|
1608
|
-
);
|
|
1609
|
-
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1610
|
-
} else if (cap.source === 'request_body_regex') {
|
|
1611
|
-
const v = captureObservedRequestValueFromObserved(
|
|
1612
|
-
bootstrapTokens.observedRequests ?? [],
|
|
1613
|
-
cap,
|
|
1614
|
-
(req) => {
|
|
1615
|
-
if (typeof req.body !== 'string') return undefined;
|
|
1616
|
-
try {
|
|
1617
|
-
const match = req.body.match(new RegExp(cap.pattern));
|
|
1618
|
-
return match?.[cap.group ?? 1] ?? match?.[0];
|
|
1619
|
-
} catch {
|
|
1620
|
-
return undefined;
|
|
1621
|
-
}
|
|
1622
|
-
},
|
|
1623
|
-
);
|
|
1624
|
-
if (v !== undefined && v !== null && v !== '') state[cap.name] = v;
|
|
1625
1236
|
}
|
|
1626
1237
|
}
|
|
1627
1238
|
return state;
|