imprint-mcp 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +132 -28
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +111 -4
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +65 -27
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +14 -2
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/credential-extract.ts +174 -25
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/emit.ts +85 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/sensitive-keys.ts +141 -7
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +17 -0
- package/src/imprint/teach.ts +582 -147
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
package/src/imprint/redact.ts
CHANGED
|
@@ -21,6 +21,22 @@ import type { CapturedRequest, Session } from './types.ts';
|
|
|
21
21
|
const USER_INTERACTION_TYPES = new Set(['click', 'input', 'change', 'submit']);
|
|
22
22
|
const MULTI_VALUE_HEADERS = new Set(['cookie', 'set-cookie']);
|
|
23
23
|
|
|
24
|
+
/**
 * Reports whether `body` opens like a structured RPC envelope rather than
 * plain text or top-level JSON.
 *
 * Two openings are recognised, e.g. as used by Google `batchexecute`:
 *   - the anti-XSSI guard `)]}'` (which also covers `)]}',`);
 *   - a length-prefixed frame: 1–9 digits, a newline (LF or CRLF), then `[`.
 *
 * Such bodies carry doubly-encoded JSON as string payloads. The flat-text
 * freeform scanner would inject `[REDACTED]` into bare numeric IDs and
 * coordinates inside that inner JSON and make it unparseable, so callers use
 * this check to skip the freeform fallback. It gates only the flat-text scan;
 * key-based redaction of clean-JSON bodies is unaffected.
 *
 * @param body Raw body text to inspect.
 * @returns `true` when the start of the body matches either envelope opening.
 */
export function looksLikeRpcEnvelope(body: string): boolean {
  // Only the first 64 characters are inspected; leading whitespace inside
  // that window is ignored.
  const prefix = body.slice(0, 64).trimStart();

  const hasXssiGuard = prefix.startsWith(")]}'");
  const hasLengthPrefixedFrame = /^\d{1,9}\r?\n\[/.test(prefix); // e.g. 219006\n[

  return hasXssiGuard || hasLengthPrefixedFrame;
}
|
|
39
|
+
|
|
24
40
|
/**
|
|
25
41
|
* Detect sensitive headers whose values are page-minted constants — baked
|
|
26
42
|
* into the site's JavaScript, not per-user secrets. The recording starts
|
|
@@ -179,7 +195,12 @@ export function redactJsonBody(
|
|
|
179
195
|
const visited = visit(inner, [...pathSoFar, k]);
|
|
180
196
|
out[k] = JSON.stringify(visited);
|
|
181
197
|
} catch {
|
|
182
|
-
|
|
198
|
+
// Nested string that isn't parseable JSON: scan it as free text,
|
|
199
|
+
// unless it's a structured RPC envelope (flat-scanning corrupts it).
|
|
200
|
+
const r =
|
|
201
|
+
freeform && !looksLikeRpcEnvelope(v)
|
|
202
|
+
? redactFreeformText(v)
|
|
203
|
+
: { redacted: v, redactionsCount: 0 };
|
|
183
204
|
freeformCount += r.redactionsCount;
|
|
184
205
|
out[k] = r.redacted;
|
|
185
206
|
}
|
|
@@ -228,6 +249,9 @@ export function redactBody(
|
|
|
228
249
|
} catch {
|
|
229
250
|
const formR = redactFormBody(body, formPlaceholders, markerContext);
|
|
230
251
|
if (formR.redactionsCount > 0 || formR.placeholdersInjected > 0 || !freeform) return formR;
|
|
252
|
+
// A structured RPC envelope (XSSI/length-prefixed) is not flat text —
|
|
253
|
+
// flat-scanning it would corrupt the doubly-encoded JSON payloads it carries.
|
|
254
|
+
if (looksLikeRpcEnvelope(body)) return formR;
|
|
231
255
|
const freeformR = redactFreeformText(body);
|
|
232
256
|
return {
|
|
233
257
|
redacted: freeformR.redacted,
|
|
@@ -437,7 +461,11 @@ export function redactSession(
|
|
|
437
461
|
response.mimeType,
|
|
438
462
|
undefined,
|
|
439
463
|
undefined,
|
|
440
|
-
|
|
464
|
+
// Responses are key-based only: never value-pattern (freeform) scan a
|
|
465
|
+
// server body. Keeps redaction focused on real secrets (post-login
|
|
466
|
+
// cookies + user-entered PII) and avoids corrupting structured RPC
|
|
467
|
+
// envelopes whose payloads are doubly-encoded JSON.
|
|
468
|
+
false,
|
|
441
469
|
markerContext,
|
|
442
470
|
);
|
|
443
471
|
respBody = respBodyR.redacted;
|
|
@@ -15,6 +15,7 @@ import { join as pathJoin } from 'node:path';
|
|
|
15
15
|
import type { Browser, BrowserContext, Locator, Page } from 'playwright';
|
|
16
16
|
import { createLog } from './log.ts';
|
|
17
17
|
import type { CapturedReplayRequest } from './session-diff.ts';
|
|
18
|
+
import { getStealthChromium, getStealthExecutablePath } from './stealth-chromium.ts';
|
|
18
19
|
import type { CapturedEvent, Session } from './types.ts';
|
|
19
20
|
|
|
20
21
|
const log = createLog('replay-capture');
|
|
@@ -62,25 +63,17 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
|
|
|
62
63
|
|
|
63
64
|
let chromium: typeof import('playwright').chromium;
|
|
64
65
|
try {
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
(stealthMod as { default?: () => unknown }).default ??
|
|
69
|
-
(stealthMod as unknown as () => unknown);
|
|
70
|
-
pwExtra.chromium.use(stealthFactory() as never);
|
|
71
|
-
chromium = pwExtra.chromium as unknown as typeof import('playwright').chromium;
|
|
72
|
-
} catch {
|
|
73
|
-
try {
|
|
74
|
-
const pw = await import('playwright');
|
|
75
|
-
chromium = pw.chromium;
|
|
76
|
-
} catch (innerErr) {
|
|
77
|
-
return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
|
|
78
|
-
}
|
|
66
|
+
chromium = await getStealthChromium();
|
|
67
|
+
} catch (innerErr) {
|
|
68
|
+
return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
|
|
79
69
|
}
|
|
80
70
|
|
|
81
71
|
try {
|
|
82
72
|
replayLog(`launching browser (headed=${!!opts.headed})`);
|
|
83
|
-
browser = await chromium.launch({
|
|
73
|
+
browser = await chromium.launch({
|
|
74
|
+
headless: !opts.headed,
|
|
75
|
+
executablePath: getStealthExecutablePath(),
|
|
76
|
+
});
|
|
84
77
|
} catch (err) {
|
|
85
78
|
replayLog(`browser launch failed: ${errMsg(err)}`);
|
|
86
79
|
return { ok: false, requests: [], error: `Could not launch Chromium: ${errMsg(err)}` };
|
|
@@ -215,11 +208,19 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
|
|
|
215
208
|
opts.onProgress?.(i + 1, replayableEvents.length, captured.length);
|
|
216
209
|
}
|
|
217
210
|
|
|
218
|
-
// Allow final network requests to settle
|
|
211
|
+
// Allow final network requests to settle, but never block forever: on a
|
|
212
|
+
// large recording a single hung response-body read can stall allSettled
|
|
213
|
+
// indefinitely (there is no outer timeout on the replay stage). Cap the
|
|
214
|
+
// wait and proceed with whatever bodies are ready — replay-diff is
|
|
215
|
+
// best-effort, so partial captures are acceptable.
|
|
216
|
+
const SETTLE_TIMEOUT_MS = 15_000;
|
|
219
217
|
replayLog('waiting for networkidle...');
|
|
220
|
-
await page.waitForLoadState('networkidle').catch(() => {});
|
|
218
|
+
await page.waitForLoadState('networkidle', { timeout: SETTLE_TIMEOUT_MS }).catch(() => {});
|
|
221
219
|
await page.waitForTimeout(1000);
|
|
222
|
-
await Promise.
|
|
220
|
+
await Promise.race([
|
|
221
|
+
Promise.allSettled(pendingReads),
|
|
222
|
+
new Promise<void>((resolve) => setTimeout(resolve, SETTLE_TIMEOUT_MS)),
|
|
223
|
+
]);
|
|
223
224
|
captured.sort((a, b) => a.seq - b.seq);
|
|
224
225
|
|
|
225
226
|
replayLog(`replay complete: captured ${captured.length} requests total`);
|
package/src/imprint/runtime.ts
CHANGED
|
@@ -113,14 +113,23 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
113
113
|
(await loadCredentialStore(opts.workflow.site)) ??
|
|
114
114
|
emptyStore(opts.workflow.site);
|
|
115
115
|
|
|
116
|
-
// Validate required parameters are present
|
|
116
|
+
// Validate required parameters are present and merge declared defaults
|
|
117
|
+
// into the working params map. Without the merge, `parameter.default` would
|
|
118
|
+
// be a presence-sentinel only — the substitution layer at
|
|
119
|
+
// `resolvePlaceholder` would still throw STATE_MISSING because it reads
|
|
120
|
+
// from this map directly. The schema declares `default` as a real value
|
|
121
|
+
// (string | number | boolean), so honor it.
|
|
122
|
+
const params: Record<string, string | number | boolean> = { ...opts.params };
|
|
117
123
|
for (const p of opts.workflow.parameters) {
|
|
118
|
-
if (!(p.name in
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
+
if (!(p.name in params)) {
|
|
125
|
+
if (p.default === undefined) {
|
|
126
|
+
return {
|
|
127
|
+
ok: false,
|
|
128
|
+
error: 'UNKNOWN',
|
|
129
|
+
message: `Missing required parameter: ${p.name} (${p.description})`,
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
params[p.name] = p.default;
|
|
124
133
|
}
|
|
125
134
|
}
|
|
126
135
|
|
|
@@ -163,7 +172,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
163
172
|
if (!req) continue;
|
|
164
173
|
|
|
165
174
|
const subbedResult = substituteRequest(req, {
|
|
166
|
-
params
|
|
175
|
+
params,
|
|
167
176
|
credentials: liveCredentials,
|
|
168
177
|
responseSlots,
|
|
169
178
|
state,
|
|
@@ -180,7 +189,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
180
189
|
subbed.method,
|
|
181
190
|
subbed.url,
|
|
182
191
|
responseSlots.map((s) => s.raw),
|
|
183
|
-
|
|
192
|
+
params,
|
|
184
193
|
);
|
|
185
194
|
if (typeof transformResult === 'string') {
|
|
186
195
|
subbed.url = transformResult;
|
|
@@ -312,7 +321,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
|
|
|
312
321
|
};
|
|
313
322
|
}
|
|
314
323
|
finalData = mod.extract(finalData, {
|
|
315
|
-
params
|
|
324
|
+
params,
|
|
316
325
|
responses: responseSlots.map((s) => s.raw),
|
|
317
326
|
});
|
|
318
327
|
} catch (err) {
|
|
@@ -112,15 +112,126 @@ const SENSITIVE_KEYS = [
|
|
|
112
112
|
'dob',
|
|
113
113
|
];
|
|
114
114
|
|
|
115
|
-
|
|
115
|
+
// Key normalization rule: lowercase the name and drop every `_` and `-`.
// Set membership checks go through `normalizeKey` (defined below), so the
// entries stored in the lookup sets MUST be normalized with the same rule —
// otherwise a lookup for e.g. `j_password` (→ `jpassword`) would miss a
// stored `j_password`. The rule is declared here, ahead of the sets that
// need it, so the file can be read top to bottom.
const _normalize = (s: string): string => {
  const lowered = s.toLowerCase();
  return lowered.replace(/_|-/g, '');
};
|
|
121
|
+
|
|
122
|
+
const SENSITIVE_KEY_SET = new Set(SENSITIVE_KEYS.map(_normalize));
|
|
116
123
|
|
|
117
124
|
/**
 * Subset of SENSITIVE_KEYS that specifically denote a credential (not PII).
 * Used by credential-extract.ts when looking for the password half of a
 * login form pair — we don't want to treat e.g. `dob` as a password.
 *
 * Inclusion criterion: a key name that, when it appears in a request body
 * next to a username-like partner, almost always means "this is the password
 * the user typed at login time." Err on the liberal side — a false positive
 * costs the user one extra prompt confirmation, a false negative ships a
 * broken tool. New additions should reference a real recorded site that
 * broke without them.
 *
 * Sites observed needing each entry:
 *   - password / passwd / pwd:     most modern APIs
 *   - pin:                         bank / utility login forms
 *   - pass:                        legacy PHP forms (e.g. SMF)
 *   - secret:                      OAuth ROPC payloads
 *   - j_password:                  Java EE / Spring Security default form-login
 *   - userpassword / loginpassword / accountpassword:
 *                                  vendor SSO portals that namespace fields
 *   - patronpassword / patron_password:
 *                                  Discover & Go libraries (kept for back-compat)
 */
const PASSWORD_LIKE_ENTRIES = [
  'password',
  'passwd',
  'pwd',
  'pin',
  'pass',
  'secret',
  'j_password',
  'userpassword',
  'loginpassword',
  'accountpassword',
  'patronpassword',
  'patron_password',
];

// Lookup set holding the normalized form of every raw entry above, so that
// membership tests on normalized key names hit regardless of `_`/`-`/case.
const PASSWORD_LIKE_KEYS = new Set<string>();
for (const rawEntry of PASSWORD_LIKE_ENTRIES) {
  PASSWORD_LIKE_KEYS.add(_normalize(rawEntry));
}
|
|
160
|
+
|
|
161
|
+
/**
 * Subset of SENSITIVE_KEYS that specifically denote a username/email/login
 * identifier — the partner half of a username+password login pair.
 *
 * Same inclusion criterion as PASSWORD_LIKE_KEYS: liberal coverage of real
 * recorded forms, yet narrow enough not to match arbitrary identifiers. This
 * set is intentionally distinct from `email`, `phone` etc. in SENSITIVE_KEYS:
 * those are redacted as PII regardless, but only the names listed here
 * qualify as the "username partner" that the credential extractor pairs with
 * a password.
 *
 * Sites observed needing each entry:
 *   - user / username / user_name / userid / user_id:   most APIs
 *   - login / loginid / login_id / login_email:         REST endpoints that name
 *                                                       the form field after the action
 *   - email / emailaddress / email_address:             email-as-username flows
 *   - account / accountid / account_id:                 enterprise SSO portals
 *   - patron / patronnumber / patron_number / patronid / patron_id:
 *                                                       library systems (Discover & Go)
 *   - j_username:                                       Java EE / Spring Security
 *                                                       default form-login
 *   - signin / signinid / sign_in_id:                   vendor SSO portals (Okta-style)
 *   - usr / uid:                                        legacy CGI / older PHP
 *   - memberid / member_id / membername / member_name:  membership-driven sites
 *                                                       (gyms, clubs)
 *   - customerid / customer_id / customernumber / customer_number:
 *                                                       ecommerce account portals
 *   - clientid / client_id / clientnumber / client_number:
 *                                                       B2B portals (CAUTION: also matches
 *                                                       OAuth client_id; credential-extract.ts
 *                                                       gates on having a password partner in
 *                                                       the same parent, so OAuth token
 *                                                       endpoints that pass client_id without
 *                                                       a password won't match)
 */
const USERNAME_LIKE_KEYS = new Set(
  [
    // generic user identifiers
    'user', 'username', 'user_name', 'userid', 'user_id',
    // login-named fields
    'login', 'loginid', 'login_id', 'loginemail', 'login_email',
    // email-as-username
    'email', 'emailaddress', 'email_address',
    // account identifiers
    'account', 'accountid', 'account_id',
    // library patrons
    'patron', 'patronnumber', 'patron_number', 'patronid', 'patron_id',
    // Java EE / Spring Security
    'j_username',
    // sign-in named fields
    'signin', 'signinid', 'sign_in_id',
    // legacy short forms
    'usr', 'uid',
    // membership sites
    'memberid', 'member_id', 'membername', 'member_name',
    // ecommerce customers
    'customerid', 'customer_id', 'customernumber', 'customer_number',
    // B2B clients
    'clientid', 'client_id', 'clientnumber', 'client_number',
  ].map((rawEntry) => _normalize(rawEntry)),
);
|
|
125
236
|
|
|
126
237
|
const SENSITIVE_HEADERS = [
|
|
@@ -138,7 +249,7 @@ const SENSITIVE_HEADERS = [
|
|
|
138
249
|
|
|
139
250
|
const SENSITIVE_HEADER_SET = new Set(SENSITIVE_HEADERS.map((h) => h.toLowerCase()));
|
|
140
251
|
|
|
141
|
-
export const normalizeKey =
|
|
252
|
+
export const normalizeKey = _normalize;
|
|
142
253
|
|
|
143
254
|
/** True if the key name suggests a sensitive value (auth, payment, PII). */
|
|
144
255
|
export function isSensitiveKey(key: string): boolean {
|
|
@@ -151,6 +262,29 @@ export function isSensitiveCredentialKey(key: string): boolean {
|
|
|
151
262
|
return PASSWORD_LIKE_KEYS.has(normalizeKey(key));
|
|
152
263
|
}
|
|
153
264
|
|
|
265
|
+
/**
 * Tells whether a key name looks like a username/email/login identifier —
 * the partner half of a login pair. Used in credential extraction and in the
 * pre-emit guardrail that flags workflows templating credentials as plain
 * parameters.
 *
 * @param key Raw key name; it is normalized before the lookup.
 * @returns `true` when the normalized name is in USERNAME_LIKE_KEYS.
 */
export function isUsernameLikeKey(key: string): boolean {
  const normalized = normalizeKey(key);
  return USERNAME_LIKE_KEYS.has(normalized);
}
|
|
272
|
+
|
|
273
|
+
/**
 * Tells whether a key name is either half of a login pair (username or
 * password). Used by the pre-emit guardrail and the post-redact pairing
 * audit, which both need to decide "is this parameter name
 * credential-shaped?" without caring which half it is.
 *
 * @param key Raw key name; it is normalized once and checked against both sets.
 * @returns `true` when the normalized name is password-like or username-like.
 */
export function isLoginFieldKey(key: string): boolean {
  const normalized = normalizeKey(key);
  if (PASSWORD_LIKE_KEYS.has(normalized)) return true;
  return USERNAME_LIKE_KEYS.has(normalized);
}
|
|
281
|
+
|
|
282
|
+
/**
 * Exposes the raw (pre-normalization) password-like key strings, for callers
 * that need substring matching against raw body text rather than a parsed
 * key lookup.
 *
 * @returns The PASSWORD_LIKE_ENTRIES list itself, typed read-only.
 */
export function passwordLikeTokens(): readonly string[] {
  const rawTokens: readonly string[] = PASSWORD_LIKE_ENTRIES;
  return rawTokens;
}
|
|
287
|
+
|
|
154
288
|
/**
 * Tells whether an HTTP header name is in the sensitive-header set.
 * The comparison is case-insensitive: the name is lowercased before lookup.
 *
 * @param header Header name as captured.
 * @returns `true` when the lowercased name is in SENSITIVE_HEADER_SET.
 */
export function isSensitiveHeader(header: string): boolean {
  const lowered = header.toLowerCase();
  return SENSITIVE_HEADER_SET.has(lowered);
}
|
|
@@ -318,6 +318,17 @@ function suggestStateName(location: string): string {
|
|
|
318
318
|
.toLowerCase();
|
|
319
319
|
}
|
|
320
320
|
|
|
321
|
+
/**
 * Heuristic: does `v` look like an opaque token/id rather than human text, a
 * city name, or a date? Gates provenance-tagging of stable values so that an
 * incidental constant (a UI label, the echoed query) isn't treated as a
 * server-provided token. Shared with the build-plan token detector.
 *
 * A value qualifies when it is at least 12 characters long, contains no
 * whitespace, is not a bare `YYYY-MM-DD` date, and contains at least one
 * digit or one of the separators `:`, `|`, `_`, `-`.
 *
 * @param v Candidate value.
 * @returns `true` when the value passes every check above.
 */
export function looksLikeToken(v: string): boolean {
  const tooShort = v.length < 12;
  const hasWhitespace = /\s/.test(v); // multi-word / free text
  const isBareDate = /^\d{4}-\d{2}-\d{2}$/.test(v); // dates
  if (tooShort || hasWhitespace || isBareDate) return false;

  // Opaque ids carry a digit or a typical id separator.
  return /[\d:|_-]/.test(v);
}
|
|
331
|
+
|
|
321
332
|
// ─── Main diff ──────────────────────────────────────────────────────────────
|
|
322
333
|
|
|
323
334
|
export function diffTriagedSessions(
|
|
@@ -327,6 +338,12 @@ export function diffTriagedSessions(
|
|
|
327
338
|
const pairs = alignRequests(original.requests, replay.requests);
|
|
328
339
|
const pairedOrigSeqs = new Set(pairs.map((p) => p.originalSeq));
|
|
329
340
|
const pairedReplaySeqs = new Set(pairs.map((p) => p.replaySeq));
|
|
341
|
+
// `searchPriorResponses` over the replay returns a producer in REPLAY-seq
|
|
342
|
+
// space, but `originalSeq` and every downstream consumer (capture hints,
|
|
343
|
+
// build-plan token detection, the planner) work in ORIGINAL-seq space — so a
|
|
344
|
+
// replay producer must be translated back via the alignment pairs.
|
|
345
|
+
const replayToOriginal = new Map(pairs.map((p) => [p.replaySeq, p.originalSeq]));
|
|
346
|
+
const toOriginalSeq = (replaySeq: number): number => replayToOriginal.get(replaySeq) ?? replaySeq;
|
|
330
347
|
|
|
331
348
|
const classifications: ClassifiedValue[] = [];
|
|
332
349
|
|
|
@@ -347,17 +364,28 @@ export function diffTriagedSessions(
|
|
|
347
364
|
if (v2Value === undefined) continue; // field only in run 1
|
|
348
365
|
|
|
349
366
|
if (v1.value === v2Value) {
|
|
367
|
+
// Stable across runs. Normally a constant — but an OPAQUE stable value
|
|
368
|
+
// that also appears in a PRIOR response is a server-PROVIDED token (e.g.
|
|
369
|
+
// a per-entity id minted by a sibling search tool). The same-flow replay
|
|
370
|
+
// can't expose it by variance (same entity → same token), so recover its
|
|
371
|
+
// provenance from the original responses (already original-seq space).
|
|
372
|
+
// A cross-tool consumer then sources it as a param instead of hardcoding.
|
|
373
|
+
const provider = looksLikeToken(v1.value)
|
|
374
|
+
? searchPriorResponses(v1.value, original.requests, pair.originalSeq)
|
|
375
|
+
: null;
|
|
350
376
|
classifications.push({
|
|
351
377
|
classification: 'constant',
|
|
352
378
|
location: v1.location,
|
|
353
379
|
originalSeq: pair.originalSeq,
|
|
354
380
|
value1: v1.value,
|
|
355
381
|
value2: v2Value,
|
|
382
|
+
...(provider ? { producerSeq: provider.seq, producerPath: provider.path } : {}),
|
|
356
383
|
});
|
|
357
384
|
continue;
|
|
358
385
|
}
|
|
359
386
|
|
|
360
|
-
// Value differs — check if it came from a prior response in run 2
|
|
387
|
+
// Value differs — check if it came from a prior response in run 2,
|
|
388
|
+
// translating the replay producer back to original-seq space.
|
|
361
389
|
const producer = searchPriorResponses(v2Value, replay.requests, pair.replaySeq);
|
|
362
390
|
|
|
363
391
|
if (producer) {
|
|
@@ -368,7 +396,7 @@ export function diffTriagedSessions(
|
|
|
368
396
|
originalSeq: pair.originalSeq,
|
|
369
397
|
value1: v1.value,
|
|
370
398
|
value2: v2Value,
|
|
371
|
-
producerSeq: producer.seq,
|
|
399
|
+
producerSeq: toOriginalSeq(producer.seq),
|
|
372
400
|
producerPath: producer.path,
|
|
373
401
|
suggestedStateName: name || undefined,
|
|
374
402
|
});
|
|
@@ -407,3 +435,52 @@ export function triageByAlignment(
|
|
|
407
435
|
const aligned = alignRequests(run1TriagedRequests, run2AllRequests);
|
|
408
436
|
return aligned.filter((pair) => pair.confidence >= 0.5).map((pair) => pair.replaySeq);
|
|
409
437
|
}
|
|
438
|
+
|
|
439
|
+
/**
 * Severity order — a value seen varying in ANY pass outranks one seen constant.
 * server_derived (traceable to a response) wins over browser_minted.
 */
const CLASSIFICATION_RANK: Record<ValueClassification, number> = {
  constant: 0,
  browser_minted: 1,
  server_derived: 2,
};

/**
 * Merge `ClassifiedValue`s from several diff passes that all share the SAME
 * `original` recording (so `originalSeq` is a stable join key across passes).
 *
 * Each pass diffs the original recording against one other run — the automated
 * browser replay AND every other real recording of the site. Anti-bot edges
 * (Akamai, DataDome, …) often block the automated replay at the page level, so
 * the replay reproduces only a fraction of the recording's requests and their
 * functional values (GraphQL safelisting signatures, persisted-query hashes,
 * app keys) never get classified. Real recordings come from a trusted browser
 * and DO carry those requests, so diffing recordings against each other
 * recovers the missing signal.
 *
 * Merge rule per (originalSeq, location):
 *   - a value that VARIES in any pass is ephemeral — the strongest non-constant
 *     classification wins (server_derived > browser_minted), preserving its
 *     producer provenance;
 *   - a value constant in every pass that observed it is `constant`.
 * A value the replay never observed (because it was blocked) but that is
 * identical across time-separated recordings is therefore kept as `constant`,
 * not silently dropped.
 *
 * On equal rank the entry from the earliest pass is kept.
 *
 * @param passes One array of classifications per diff pass.
 * @returns One entry per distinct (originalSeq, location), in first-seen order.
 */
export function mergeClassifications(passes: ClassifiedValue[][]): ClassifiedValue[] {
  const byKey = new Map<string, ClassifiedValue>();
  for (const pass of passes) {
    for (const cv of pass) {
      // The NUL separator keeps the composite key unambiguous: without it,
      // (1, "2.x") and (12, ".x") would both map to "12.x" and one of the two
      // entries would be silently dropped.
      const key = `${cv.originalSeq}\u0000${cv.location}`;
      const prev = byKey.get(key);
      if (
        !prev ||
        CLASSIFICATION_RANK[cv.classification] > CLASSIFICATION_RANK[prev.classification]
      ) {
        byKey.set(key, cv);
      }
    }
  }
  return [...byKey.values()];
}
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* pipeline consumes unchanged.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
11
11
|
import { join as pathJoin } from 'node:path';
|
|
12
12
|
import { localSessionsDir } from './paths.ts';
|
|
13
13
|
import { friendlySessionTimestamp } from './teach-state.ts';
|
|
@@ -34,10 +34,13 @@ interface SessionInfo {
|
|
|
34
34
|
}
|
|
35
35
|
|
|
36
36
|
/**
 * Returns the session entries found in the local sessions directory of `site`.
 *
 * Thin wrapper: resolves the directory with `localSessionsDir` and hands it to
 * `listSessionsInDir`, which does the actual listing.
 */
export function listSiteSessions(site: string): SessionInfo[] {
  const siteSessionsDir = localSessionsDir(site);
  return listSessionsInDir(siteSessionsDir);
}
|
|
39
|
+
|
|
40
|
+
export function listSessionsInDir(dir: string): SessionInfo[] {
|
|
41
|
+
if (!existsSync(dir)) return [];
|
|
39
42
|
|
|
40
|
-
const files = readdirSync(
|
|
43
|
+
const files = readdirSync(dir).filter(
|
|
41
44
|
(f) =>
|
|
42
45
|
f.endsWith('.json') &&
|
|
43
46
|
!f.includes('.redacted') &&
|
|
@@ -47,7 +50,7 @@ export function listSiteSessions(site: string): SessionInfo[] {
|
|
|
47
50
|
|
|
48
51
|
const infos: SessionInfo[] = [];
|
|
49
52
|
for (const filename of files) {
|
|
50
|
-
const absPath = pathJoin(
|
|
53
|
+
const absPath = pathJoin(dir, filename);
|
|
51
54
|
try {
|
|
52
55
|
const raw = JSON.parse(readFileSync(absPath, 'utf8'));
|
|
53
56
|
const session = SessionSchema.parse(raw);
|
|
@@ -190,6 +193,7 @@ export function mergeSessions(sessions: Session[]): Session {
|
|
|
190
193
|
|
|
191
194
|
export function writeCombinedSession(site: string, combined: Session): string {
|
|
192
195
|
const sessDir = localSessionsDir(site);
|
|
196
|
+
mkdirSync(sessDir, { recursive: true });
|
|
193
197
|
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
194
198
|
const filename = `combined-${timestamp}.json`;
|
|
195
199
|
const absPath = pathJoin(sessDir, filename);
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { findChromium } from './chromium.ts';
|
|
2
|
+
|
|
3
|
+
/**
 * In-flight or completed load shared by every getStealthChromium() caller.
 * Reset to `undefined` when a load rejects so a later call can retry.
 */
let cachedStealthChromium: Promise<typeof import('playwright').chromium> | undefined;

/**
 * Performs the actual load: stealth-patched chromium when the optional
 * packages resolve, vanilla Playwright chromium otherwise.
 */
async function loadStealthChromium(): Promise<typeof import('playwright').chromium> {
  try {
    const pwExtra = await import('playwright-extra');
    const stealthMod = await import('puppeteer-extra-plugin-stealth');
    // The plugin factory is either the module's default export or the module
    // itself, depending on how the module is exposed to the importer.
    const stealthFactory =
      (stealthMod as { default?: () => unknown }).default ??
      (stealthMod as unknown as () => unknown);
    pwExtra.chromium.use(stealthFactory() as never);
    return pwExtra.chromium as unknown as typeof import('playwright').chromium;
  } catch {
    // Any failure on the stealth path (missing package, plugin setup error)
    // degrades to vanilla Playwright instead of failing the caller.
    const pw = await import('playwright');
    return pw.chromium;
  }
}

/**
 * Shared loader for Playwright's chromium with the stealth plugin applied.
 *
 * Stealth patches navigator.webdriver, plugin enumeration, WebGL vendor
 * strings, and other headless-Chrome telltales that anti-bot services
 * (Akamai, Cloudflare, PerimeterX) detect. Vanilla headless Playwright
 * gets tarpitted or 403'd by these services; the stealth-patched chromium
 * loads the same pages in seconds.
 *
 * Falls back to vanilla `playwright` if `playwright-extra` /
 * `puppeteer-extra-plugin-stealth` are not installed (preserves the
 * graceful-degrade behavior of the original duplicated loaders in
 * playbook-runner, replay-capture, and backend-ladder).
 *
 * The load runs at most once per successful attempt: the resulting promise is
 * cached, so repeated (or concurrent) calls share one chromium object and the
 * stealth plugin is registered with `.use()` a single time rather than once
 * per call on playwright-extra's shared `chromium` export. A rejected load is
 * not cached; the next call tries again.
 *
 * @returns The chromium browser type — stealth-patched when the optional
 *   packages are available, vanilla otherwise.
 * @throws If no Playwright is available at all — callers translate the
 *   thrown error into their own result shape.
 */
export async function getStealthChromium(): Promise<typeof import('playwright').chromium> {
  if (cachedStealthChromium === undefined) {
    const pending = loadStealthChromium();
    cachedStealthChromium = pending;
    // Do not pin a failure: drop the cache entry if this load rejects. The
    // rejection itself still reaches callers through `pending`.
    void pending.catch(() => {
      if (cachedStealthChromium === pending) {
        cachedStealthChromium = undefined;
      }
    });
  }
  return cachedStealthChromium;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Reports whether both `playwright-extra` and `puppeteer-extra-plugin-stealth`
 * can be imported, i.e. whether getStealthChromium() WILL apply the stealth
 * plugin instead of taking the vanilla-Playwright fallback.
 *
 * Intended use: deciding whether a manual `navigator.webdriver` patch is
 * needed. With the plugin present the patch must be skipped — the plugin
 * removes the property the way a real Chrome does (it simply lacks
 * `webdriver`), while an extra
 * `Object.defineProperty(navigator,'webdriver',{get:()=>false})` leaves a
 * non-native property descriptor that is ITSELF a fingerprinting tell. Only
 * the vanilla fallback, where nothing else protects the page, should apply
 * the manual patch.
 *
 * Import resolution is cached, so probing here is cheap.
 *
 * @returns `true` when both imports succeed, `false` as soon as either fails.
 */
export async function isStealthPluginAvailable(): Promise<boolean> {
  let bothResolved = false;
  try {
    // Probe in the same order the loader imports them; the first failure
    // jumps to the catch and leaves the flag false.
    await import('playwright-extra');
    await import('puppeteer-extra-plugin-stealth');
    bothResolved = true;
  } catch {
    bothResolved = false;
  }
  return bothResolved;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Locates the Chromium binary that `imprint record` uses for the user's
 * recording session. In order of preference that is Playwright's bundled
 * "Google Chrome for Testing" (full Chrome build), the system Chrome on
 * macOS, or a Linux distro Chrome/Chromium package.
 *
 * Why this matters: by default Playwright's
 * `chromium.launch({ headless: true })` picks `chrome-headless-shell` — a
 * separate stripped-down binary that Akamai / Cloudflare / PerimeterX class
 * anti-bot services detect at the binary/TLS-fingerprint layer, regardless of
 * how thoroughly the stealth plugin patches JS-level signals such as
 * `navigator.webdriver`. The recording browser runs the FULL Chrome binary
 * and Akamai trusts it; a replay browser on chrome-headless-shell looks like
 * a bot. Launching the SAME binary for both removes that asymmetry.
 *
 * @returns The executable path reported by `findChromium()`, or `undefined`
 *   when `findChromium()` throws — callers should then let Playwright fall
 *   back to whatever default it finds.
 */
export function getStealthExecutablePath(): string | undefined {
  let executablePath: string | undefined;
  try {
    executablePath = findChromium();
  } catch {
    // Lookup failure is not an error for callers; signal "not found" instead.
    executablePath = undefined;
  }
  return executablePath;
}
|