imprint-mcp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +131 -27
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +109 -2
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +63 -25
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +13 -1
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +10 -0
- package/src/imprint/teach.ts +456 -142
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
package/src/imprint/redact.ts
CHANGED
|
@@ -21,6 +21,22 @@ import type { CapturedRequest, Session } from './types.ts';
|
|
|
21
21
|
const USER_INTERACTION_TYPES = new Set(['click', 'input', 'change', 'submit']);
|
|
22
22
|
const MULTI_VALUE_HEADERS = new Set(['cookie', 'set-cookie']);
|
|
23
23
|
|
|
24
|
+
/**
 * Detect a structured RPC envelope (XSSI-guarded or length-prefixed) whose body
 * is NOT top-level JSON but carries doubly-encoded JSON as string payloads —
 * e.g. Google `batchexecute` (`)]}'` guard + `<len>\n[...]` frames). Running the
 * flat-text freeform scanner over such a body injects `[REDACTED]` into bare
 * numeric IDs/coordinates inside the inner JSON and makes it unparseable, so the
 * freeform fallback must skip these. The structure-aware key-based redaction
 * still applies to any clean-JSON bodies; this only gates the flat-text scan.
 *
 * Leading whitespace of any length is ignored: the patterns are anchored with
 * `^\s*` on the full body rather than on a fixed-size prefix, so an envelope
 * preceded by a long run of blank lines/padding is still recognised.
 *
 * @param body Raw body text to classify.
 * @returns `true` when, after optional leading whitespace, the body starts with
 *   the anti-XSSI guard `)]}'` or with a 1–9 digit length line followed by `[`.
 */
export function looksLikeRpcEnvelope(body: string): boolean {
  // Anti-XSSI guard: matches both `)]}'` and `)]}',`.
  if (/^\s*\)\]\}'/.test(body)) return true;
  // Length-prefixed frame, e.g. `219006\n[` (LF or CRLF after the length).
  return /^\s*\d{1,9}\r?\n\[/.test(body);
}
|
|
39
|
+
|
|
24
40
|
/**
|
|
25
41
|
* Detect sensitive headers whose values are page-minted constants — baked
|
|
26
42
|
* into the site's JavaScript, not per-user secrets. The recording starts
|
|
@@ -179,7 +195,12 @@ export function redactJsonBody(
|
|
|
179
195
|
const visited = visit(inner, [...pathSoFar, k]);
|
|
180
196
|
out[k] = JSON.stringify(visited);
|
|
181
197
|
} catch {
|
|
182
|
-
|
|
198
|
+
// Nested string that isn't parseable JSON: scan it as free text,
|
|
199
|
+
// unless it's a structured RPC envelope (flat-scanning corrupts it).
|
|
200
|
+
const r =
|
|
201
|
+
freeform && !looksLikeRpcEnvelope(v)
|
|
202
|
+
? redactFreeformText(v)
|
|
203
|
+
: { redacted: v, redactionsCount: 0 };
|
|
183
204
|
freeformCount += r.redactionsCount;
|
|
184
205
|
out[k] = r.redacted;
|
|
185
206
|
}
|
|
@@ -228,6 +249,9 @@ export function redactBody(
|
|
|
228
249
|
} catch {
|
|
229
250
|
const formR = redactFormBody(body, formPlaceholders, markerContext);
|
|
230
251
|
if (formR.redactionsCount > 0 || formR.placeholdersInjected > 0 || !freeform) return formR;
|
|
252
|
+
// A structured RPC envelope (XSSI/length-prefixed) is not flat text —
|
|
253
|
+
// flat-scanning it would corrupt the doubly-encoded JSON payloads it carries.
|
|
254
|
+
if (looksLikeRpcEnvelope(body)) return formR;
|
|
231
255
|
const freeformR = redactFreeformText(body);
|
|
232
256
|
return {
|
|
233
257
|
redacted: freeformR.redacted,
|
|
@@ -437,7 +461,11 @@ export function redactSession(
|
|
|
437
461
|
response.mimeType,
|
|
438
462
|
undefined,
|
|
439
463
|
undefined,
|
|
440
|
-
|
|
464
|
+
// Responses are key-based only: never value-pattern (freeform) scan a
|
|
465
|
+
// server body. Keeps redaction focused on real secrets (post-login
|
|
466
|
+
// cookies + user-entered PII) and avoids corrupting structured RPC
|
|
467
|
+
// envelopes whose payloads are doubly-encoded JSON.
|
|
468
|
+
false,
|
|
441
469
|
markerContext,
|
|
442
470
|
);
|
|
443
471
|
respBody = respBodyR.redacted;
|
|
@@ -15,6 +15,7 @@ import { join as pathJoin } from 'node:path';
|
|
|
15
15
|
import type { Browser, BrowserContext, Locator, Page } from 'playwright';
|
|
16
16
|
import { createLog } from './log.ts';
|
|
17
17
|
import type { CapturedReplayRequest } from './session-diff.ts';
|
|
18
|
+
import { getStealthChromium, getStealthExecutablePath } from './stealth-chromium.ts';
|
|
18
19
|
import type { CapturedEvent, Session } from './types.ts';
|
|
19
20
|
|
|
20
21
|
const log = createLog('replay-capture');
|
|
@@ -62,25 +63,17 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
|
|
|
62
63
|
|
|
63
64
|
let chromium: typeof import('playwright').chromium;
|
|
64
65
|
try {
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
(stealthMod as { default?: () => unknown }).default ??
|
|
69
|
-
(stealthMod as unknown as () => unknown);
|
|
70
|
-
pwExtra.chromium.use(stealthFactory() as never);
|
|
71
|
-
chromium = pwExtra.chromium as unknown as typeof import('playwright').chromium;
|
|
72
|
-
} catch {
|
|
73
|
-
try {
|
|
74
|
-
const pw = await import('playwright');
|
|
75
|
-
chromium = pw.chromium;
|
|
76
|
-
} catch (innerErr) {
|
|
77
|
-
return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
|
|
78
|
-
}
|
|
66
|
+
chromium = await getStealthChromium();
|
|
67
|
+
} catch (innerErr) {
|
|
68
|
+
return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
|
|
79
69
|
}
|
|
80
70
|
|
|
81
71
|
try {
|
|
82
72
|
replayLog(`launching browser (headed=${!!opts.headed})`);
|
|
83
|
-
browser = await chromium.launch({
|
|
73
|
+
browser = await chromium.launch({
|
|
74
|
+
headless: !opts.headed,
|
|
75
|
+
executablePath: getStealthExecutablePath(),
|
|
76
|
+
});
|
|
84
77
|
} catch (err) {
|
|
85
78
|
replayLog(`browser launch failed: ${errMsg(err)}`);
|
|
86
79
|
return { ok: false, requests: [], error: `Could not launch Chromium: ${errMsg(err)}` };
|
|
@@ -215,11 +208,19 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
|
|
|
215
208
|
opts.onProgress?.(i + 1, replayableEvents.length, captured.length);
|
|
216
209
|
}
|
|
217
210
|
|
|
218
|
-
// Allow final network requests to settle
|
|
211
|
+
// Allow final network requests to settle, but never block forever: on a
|
|
212
|
+
// large recording a single hung response-body read can stall allSettled
|
|
213
|
+
// indefinitely (there is no outer timeout on the replay stage). Cap the
|
|
214
|
+
// wait and proceed with whatever bodies are ready — replay-diff is
|
|
215
|
+
// best-effort, so partial captures are acceptable.
|
|
216
|
+
const SETTLE_TIMEOUT_MS = 15_000;
|
|
219
217
|
replayLog('waiting for networkidle...');
|
|
220
|
-
await page.waitForLoadState('networkidle').catch(() => {});
|
|
218
|
+
await page.waitForLoadState('networkidle', { timeout: SETTLE_TIMEOUT_MS }).catch(() => {});
|
|
221
219
|
await page.waitForTimeout(1000);
|
|
222
|
-
await Promise.
|
|
220
|
+
await Promise.race([
|
|
221
|
+
Promise.allSettled(pendingReads),
|
|
222
|
+
new Promise<void>((resolve) => setTimeout(resolve, SETTLE_TIMEOUT_MS)),
|
|
223
|
+
]);
|
|
223
224
|
captured.sort((a, b) => a.seq - b.seq);
|
|
224
225
|
|
|
225
226
|
replayLog(`replay complete: captured ${captured.length} requests total`);
|
package/src/imprint/runtime.ts
CHANGED
|
@@ -113,14 +113,23 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
113
113
|
(await loadCredentialStore(opts.workflow.site)) ??
|
|
114
114
|
emptyStore(opts.workflow.site);
|
|
115
115
|
|
|
116
|
-
// Validate required parameters are present
|
|
116
|
+
// Validate required parameters are present and merge declared defaults
|
|
117
|
+
// into the working params map. Without the merge, `parameter.default` would
|
|
118
|
+
// be a presence-sentinel only — the substitution layer at
|
|
119
|
+
// `resolvePlaceholder` would still throw STATE_MISSING because it reads
|
|
120
|
+
// from this map directly. The schema declares `default` as a real value
|
|
121
|
+
// (string | number | boolean), so honor it.
|
|
122
|
+
const params: Record<string, string | number | boolean> = { ...opts.params };
|
|
117
123
|
for (const p of opts.workflow.parameters) {
|
|
118
|
-
if (!(p.name in
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
+
if (!(p.name in params)) {
|
|
125
|
+
if (p.default === undefined) {
|
|
126
|
+
return {
|
|
127
|
+
ok: false,
|
|
128
|
+
error: 'UNKNOWN',
|
|
129
|
+
message: `Missing required parameter: ${p.name} (${p.description})`,
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
params[p.name] = p.default;
|
|
124
133
|
}
|
|
125
134
|
}
|
|
126
135
|
|
|
@@ -163,7 +172,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
163
172
|
if (!req) continue;
|
|
164
173
|
|
|
165
174
|
const subbedResult = substituteRequest(req, {
|
|
166
|
-
params
|
|
175
|
+
params,
|
|
167
176
|
credentials: liveCredentials,
|
|
168
177
|
responseSlots,
|
|
169
178
|
state,
|
|
@@ -180,7 +189,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
180
189
|
subbed.method,
|
|
181
190
|
subbed.url,
|
|
182
191
|
responseSlots.map((s) => s.raw),
|
|
183
|
-
|
|
192
|
+
params,
|
|
184
193
|
);
|
|
185
194
|
if (typeof transformResult === 'string') {
|
|
186
195
|
subbed.url = transformResult;
|
|
@@ -312,7 +321,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
312
321
|
};
|
|
313
322
|
}
|
|
314
323
|
finalData = mod.extract(finalData, {
|
|
315
|
-
params
|
|
324
|
+
params,
|
|
316
325
|
responses: responseSlots.map((s) => s.raw),
|
|
317
326
|
});
|
|
318
327
|
} catch (err) {
|
|
@@ -318,6 +318,17 @@ function suggestStateName(location: string): string {
|
|
|
318
318
|
.toLowerCase();
|
|
319
319
|
}
|
|
320
320
|
|
|
321
|
+
/**
 * Whether a value looks like an opaque token/id (vs human text, a city name, a
 * date). Gates provenance-tagging of stable values so an incidental constant
 * (a UI label, the echoed query) isn't treated as a server-provided token.
 * Shared with the build-plan token detector.
 *
 * Rules, applied in order:
 *  1. fewer than 12 characters → not a token;
 *  2. contains any whitespace → free text, not a token;
 *  3. an ISO-8601 calendar date or date-time → not a token;
 *  4. otherwise a token iff it contains a digit or one of `:` `|` `_` `-`.
 *
 * @param v Candidate string value.
 * @returns `true` when `v` passes all of the rules above.
 */
export function looksLikeToken(v: string): boolean {
  if (v.length < 12) return false;
  if (/\s/.test(v)) return false; // multi-word / free text
  // Dates. A bare `YYYY-MM-DD` is only 10 chars and never gets past the length
  // floor, so the guard must also cover the date-time forms that are long
  // enough to reach it (e.g. `2024-05-01T10:00:00Z`); otherwise they would be
  // accepted below because they contain digits and `-`/`:`.
  if (
    /^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:Z|[+-]\d{2}:?\d{2})?)?$/.test(v)
  ) {
    return false;
  }
  return /[:|_-]/.test(v) || /\d/.test(v);
}
|
|
331
|
+
|
|
321
332
|
// ─── Main diff ──────────────────────────────────────────────────────────────
|
|
322
333
|
|
|
323
334
|
export function diffTriagedSessions(
|
|
@@ -327,6 +338,12 @@ export function diffTriagedSessions(
|
|
|
327
338
|
const pairs = alignRequests(original.requests, replay.requests);
|
|
328
339
|
const pairedOrigSeqs = new Set(pairs.map((p) => p.originalSeq));
|
|
329
340
|
const pairedReplaySeqs = new Set(pairs.map((p) => p.replaySeq));
|
|
341
|
+
// `searchPriorResponses` over the replay returns a producer in REPLAY-seq
|
|
342
|
+
// space, but `originalSeq` and every downstream consumer (capture hints,
|
|
343
|
+
// build-plan token detection, the planner) work in ORIGINAL-seq space — so a
|
|
344
|
+
// replay producer must be translated back via the alignment pairs.
|
|
345
|
+
const replayToOriginal = new Map(pairs.map((p) => [p.replaySeq, p.originalSeq]));
|
|
346
|
+
const toOriginalSeq = (replaySeq: number): number => replayToOriginal.get(replaySeq) ?? replaySeq;
|
|
330
347
|
|
|
331
348
|
const classifications: ClassifiedValue[] = [];
|
|
332
349
|
|
|
@@ -347,17 +364,28 @@ export function diffTriagedSessions(
|
|
|
347
364
|
if (v2Value === undefined) continue; // field only in run 1
|
|
348
365
|
|
|
349
366
|
if (v1.value === v2Value) {
|
|
367
|
+
// Stable across runs. Normally a constant — but an OPAQUE stable value
|
|
368
|
+
// that also appears in a PRIOR response is a server-PROVIDED token (e.g.
|
|
369
|
+
// a per-entity id minted by a sibling search tool). The same-flow replay
|
|
370
|
+
// can't expose it by variance (same entity → same token), so recover its
|
|
371
|
+
// provenance from the original responses (already original-seq space).
|
|
372
|
+
// A cross-tool consumer then sources it as a param instead of hardcoding.
|
|
373
|
+
const provider = looksLikeToken(v1.value)
|
|
374
|
+
? searchPriorResponses(v1.value, original.requests, pair.originalSeq)
|
|
375
|
+
: null;
|
|
350
376
|
classifications.push({
|
|
351
377
|
classification: 'constant',
|
|
352
378
|
location: v1.location,
|
|
353
379
|
originalSeq: pair.originalSeq,
|
|
354
380
|
value1: v1.value,
|
|
355
381
|
value2: v2Value,
|
|
382
|
+
...(provider ? { producerSeq: provider.seq, producerPath: provider.path } : {}),
|
|
356
383
|
});
|
|
357
384
|
continue;
|
|
358
385
|
}
|
|
359
386
|
|
|
360
|
-
// Value differs — check if it came from a prior response in run 2
|
|
387
|
+
// Value differs — check if it came from a prior response in run 2,
|
|
388
|
+
// translating the replay producer back to original-seq space.
|
|
361
389
|
const producer = searchPriorResponses(v2Value, replay.requests, pair.replaySeq);
|
|
362
390
|
|
|
363
391
|
if (producer) {
|
|
@@ -368,7 +396,7 @@ export function diffTriagedSessions(
|
|
|
368
396
|
originalSeq: pair.originalSeq,
|
|
369
397
|
value1: v1.value,
|
|
370
398
|
value2: v2Value,
|
|
371
|
-
producerSeq: producer.seq,
|
|
399
|
+
producerSeq: toOriginalSeq(producer.seq),
|
|
372
400
|
producerPath: producer.path,
|
|
373
401
|
suggestedStateName: name || undefined,
|
|
374
402
|
});
|
|
@@ -407,3 +435,52 @@ export function triageByAlignment(
|
|
|
407
435
|
const aligned = alignRequests(run1TriagedRequests, run2AllRequests);
|
|
408
436
|
return aligned.filter((pair) => pair.confidence >= 0.5).map((pair) => pair.replaySeq);
|
|
409
437
|
}
|
|
438
|
+
|
|
439
|
+
/**
 * Severity order — a value seen varying in ANY pass outranks one seen constant.
 * server_derived (traceable to a response) wins over browser_minted.
 */
const CLASSIFICATION_RANK: Record<ValueClassification, number> = {
  constant: 0,
  browser_minted: 1,
  server_derived: 2,
};

/**
 * Merge `ClassifiedValue`s from several diff passes that all share the SAME
 * `original` recording (so `originalSeq` is a stable join key across passes).
 *
 * Each pass diffs the original recording against one other run — the automated
 * browser replay AND every other real recording of the site. Anti-bot edges
 * (Akamai, DataDome, …) often block the automated replay at the page level, so
 * the replay reproduces only a fraction of the recording's requests and their
 * functional values (GraphQL safelisting signatures, persisted-query hashes,
 * app keys) never get classified. Real recordings come from a trusted browser
 * and DO carry those requests, so diffing recordings against each other
 * recovers the missing signal.
 *
 * Merge rule per (originalSeq, location):
 *   - a value that VARIES in any pass is ephemeral — the strongest non-constant
 *     classification wins (server_derived > browser_minted), preserving its
 *     producer provenance;
 *   - a value constant in every pass that observed it is `constant`.
 * A value the replay never observed (because it was blocked) but that is
 * identical across time-separated recordings is therefore kept as `constant`,
 * not silently dropped.
 *
 * On equal rank the entry from the earliest pass is kept. Output order follows
 * the first time each (originalSeq, location) pair was seen.
 *
 * @param passes One array of classifications per diff pass.
 * @returns One `ClassifiedValue` per distinct (originalSeq, location) pair.
 */
export function mergeClassifications(passes: ClassifiedValue[][]): ClassifiedValue[] {
  const byKey = new Map<string, ClassifiedValue>();
  for (const pass of passes) {
    for (const cv of pass) {
      // The two key parts must be delimited: without a separator, seq 1 +
      // location "2:x" and seq 12 + location ":x" would both produce "12:x" and
      // overwrite each other. NUL cannot occur in the stringified number, so
      // the first NUL always marks the boundary unambiguously.
      const key = `${cv.originalSeq}\u0000${cv.location}`;
      const prev = byKey.get(key);
      if (
        !prev ||
        CLASSIFICATION_RANK[cv.classification] > CLASSIFICATION_RANK[prev.classification]
      ) {
        byKey.set(key, cv);
      }
    }
  }
  return [...byKey.values()];
}
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* pipeline consumes unchanged.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
11
11
|
import { join as pathJoin } from 'node:path';
|
|
12
12
|
import { localSessionsDir } from './paths.ts';
|
|
13
13
|
import { friendlySessionTimestamp } from './teach-state.ts';
|
|
@@ -34,10 +34,13 @@ interface SessionInfo {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
/**
 * List the sessions found for `site`.
 *
 * Resolves the site's sessions directory via `localSessionsDir(site)` and
 * delegates the actual scan to `listSessionsInDir`.
 */
export function listSiteSessions(site: string): SessionInfo[] {
  const sessionsDir = localSessionsDir(site);
  return listSessionsInDir(sessionsDir);
}
|
|
39
|
+
|
|
40
|
+
export function listSessionsInDir(dir: string): SessionInfo[] {
|
|
41
|
+
if (!existsSync(dir)) return [];
|
|
39
42
|
|
|
40
|
-
const files = readdirSync(
|
|
43
|
+
const files = readdirSync(dir).filter(
|
|
41
44
|
(f) =>
|
|
42
45
|
f.endsWith('.json') &&
|
|
43
46
|
!f.includes('.redacted') &&
|
|
@@ -47,7 +50,7 @@ export function listSiteSessions(site: string): SessionInfo[] {
|
|
|
47
50
|
|
|
48
51
|
const infos: SessionInfo[] = [];
|
|
49
52
|
for (const filename of files) {
|
|
50
|
-
const absPath = pathJoin(
|
|
53
|
+
const absPath = pathJoin(dir, filename);
|
|
51
54
|
try {
|
|
52
55
|
const raw = JSON.parse(readFileSync(absPath, 'utf8'));
|
|
53
56
|
const session = SessionSchema.parse(raw);
|
|
@@ -190,6 +193,7 @@ export function mergeSessions(sessions: Session[]): Session {
|
|
|
190
193
|
|
|
191
194
|
export function writeCombinedSession(site: string, combined: Session): string {
|
|
192
195
|
const sessDir = localSessionsDir(site);
|
|
196
|
+
mkdirSync(sessDir, { recursive: true });
|
|
193
197
|
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
194
198
|
const filename = `combined-${timestamp}.json`;
|
|
195
199
|
const absPath = pathJoin(sessDir, filename);
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { findChromium } from './chromium.ts';
|
|
2
|
+
|
|
3
|
+
/**
 * Memoized result of the first chromium load. `playwright-extra` exposes one
 * shared `chromium` object, so the stealth plugin must be registered on it only
 * once; caching the load guarantees that no matter how many callers ask.
 */
let stealthChromiumLoad: Promise<typeof import('playwright').chromium> | undefined;

/**
 * Shared loader for Playwright's chromium with the stealth plugin applied.
 *
 * Stealth patches navigator.webdriver, plugin enumeration, WebGL vendor
 * strings, and other headless-Chrome telltales that anti-bot services
 * (Akamai, Cloudflare, PerimeterX) detect. Vanilla headless Playwright
 * gets tarpitted or 403'd by these services; the stealth-patched chromium
 * loads the same pages in seconds.
 *
 * Falls back to vanilla `playwright` if `playwright-extra` /
 * `puppeteer-extra-plugin-stealth` are not installed (preserves the
 * graceful-degrade behavior of the original duplicated loaders in
 * playbook-runner, replay-capture, and backend-ladder).
 *
 * The load runs at most once per process: repeated and concurrent calls share
 * the same promise, so `chromium.use(stealth)` is never invoked a second time
 * on the shared `playwright-extra` instance. A failed load is not cached, so a
 * later call retries.
 *
 * Throws if no Playwright is available at all — callers translate the
 * thrown error into their own result shape.
 *
 * @returns The stealth-patched chromium launcher, or vanilla Playwright's
 *   chromium when the stealth packages cannot be loaded.
 */
export async function getStealthChromium(): Promise<typeof import('playwright').chromium> {
  if (!stealthChromiumLoad) {
    stealthChromiumLoad = loadStealthChromium().catch((err: unknown) => {
      // Don't pin a rejection forever; let the next caller try again.
      stealthChromiumLoad = undefined;
      throw err;
    });
  }
  return stealthChromiumLoad;
}

/** Perform the actual one-time load: stealth-patched chromium, else vanilla Playwright. */
async function loadStealthChromium(): Promise<typeof import('playwright').chromium> {
  try {
    const pwExtra = await import('playwright-extra');
    const stealthMod = await import('puppeteer-extra-plugin-stealth');
    // The plugin factory is the default export under ESM interop, or the module
    // itself when loaded as CommonJS.
    const stealthFactory =
      (stealthMod as { default?: () => unknown }).default ??
      (stealthMod as unknown as () => unknown);
    pwExtra.chromium.use(stealthFactory() as never);
    return pwExtra.chromium as unknown as typeof import('playwright').chromium;
  } catch {
    // Any failure on the stealth path degrades to vanilla Playwright; if that
    // import fails too, the error propagates to the caller.
    const pw = await import('playwright');
    return pw.chromium;
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Reports whether both `playwright-extra` and `puppeteer-extra-plugin-stealth`
 * can be imported — i.e. whether getStealthChromium() will take the stealth
 * path rather than the vanilla-Playwright fallback.
 *
 * Callers use this to decide whether a manual `navigator.webdriver` patch is
 * needed. With the plugin present the patch must be skipped: the plugin removes
 * the property the way a real Chrome does (it simply lacks `webdriver`), while a
 * redundant `Object.defineProperty(navigator,'webdriver',{get:()=>false})`
 * leaves a non-native property descriptor that is ITSELF a fingerprinting tell.
 * The manual patch therefore belongs only on the vanilla fallback, where it is
 * the only protection. Import resolution is cached, so probing here is cheap.
 *
 * @returns `true` when both packages import successfully, `false` otherwise.
 */
export async function isStealthPluginAvailable(): Promise<boolean> {
  let available = true;
  try {
    await import('playwright-extra');
    await import('puppeteer-extra-plugin-stealth');
  } catch {
    available = false;
  }
  return available;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Locate the Chromium binary that `imprint record` uses for the user's
 * recording session — in order of preference: Playwright's bundled "Google
 * Chrome for Testing" (full Chrome build), the system Chrome on macOS, or a
 * Linux distro Chrome/Chromium package.
 *
 * Why this matters: by default Playwright's `chromium.launch({ headless: true })`
 * picks `chrome-headless-shell` — a separate stripped-down binary that
 * Akamai / Cloudflare / PerimeterX class anti-bot services detect at the
 * binary/TLS-fingerprint layer regardless of how thoroughly the JS-level
 * `navigator.webdriver` etc. are patched by the stealth plugin. The
 * recording browser uses the FULL Chrome binary and Akamai trusts it; the
 * replay browser using chrome-headless-shell looks like a bot. Using the
 * SAME binary for both eliminates the binary asymmetry.
 *
 * @returns The path reported by `findChromium()`, or `undefined` when that
 *   lookup throws — callers should then let Playwright fall back to whatever
 *   default it finds.
 */
export function getStealthExecutablePath(): string | undefined {
  let located: string | undefined;
  try {
    located = findChromium();
  } catch {
    // Lookup failure is not fatal; signal "no explicit path" instead.
    located = undefined;
  }
  return located;
}
|