imprint-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/README.md +165 -201
  2. package/examples/discoverandgo/README.md +1 -1
  3. package/examples/echo/README.md +1 -1
  4. package/examples/google-flights/README.md +28 -0
  5. package/examples/google-flights/_shared/batchexecute.ts +63 -0
  6. package/examples/google-flights/_shared/flights_request.ts +95 -0
  7. package/examples/google-flights/_shared/package.json +9 -0
  8. package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
  9. package/examples/google-flights/get_flight_booking_details/package.json +9 -0
  10. package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
  11. package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
  12. package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
  13. package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
  14. package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
  15. package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
  16. package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
  17. package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
  18. package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
  19. package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
  20. package/examples/google-flights/lookup_airport/index.ts +101 -0
  21. package/examples/google-flights/lookup_airport/package.json +9 -0
  22. package/examples/google-flights/lookup_airport/parser.ts +66 -0
  23. package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
  24. package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
  25. package/examples/google-flights/lookup_airport/workflow.json +57 -0
  26. package/examples/google-flights/search_flights/index.ts +219 -0
  27. package/examples/google-flights/search_flights/package.json +9 -0
  28. package/examples/google-flights/search_flights/parser.ts +169 -0
  29. package/examples/google-flights/search_flights/playbook.yaml +184 -0
  30. package/examples/google-flights/search_flights/request-transform.ts +119 -0
  31. package/examples/google-flights/search_flights/workflow.json +143 -0
  32. package/examples/google-hotels/README.md +29 -0
  33. package/examples/google-hotels/_shared/batchexecute.ts +73 -0
  34. package/examples/google-hotels/_shared/freq.ts +158 -0
  35. package/examples/google-hotels/_shared/package.json +9 -0
  36. package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
  37. package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
  38. package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
  39. package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
  40. package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
  41. package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
  42. package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
  43. package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
  44. package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
  45. package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
  46. package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
  47. package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
  48. package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
  49. package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
  50. package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
  51. package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
  52. package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
  53. package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
  54. package/examples/google-hotels/search_hotels/index.ts +207 -0
  55. package/examples/google-hotels/search_hotels/package.json +9 -0
  56. package/examples/google-hotels/search_hotels/parser.ts +260 -0
  57. package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
  58. package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
  59. package/examples/google-hotels/search_hotels/workflow.json +127 -0
  60. package/package.json +3 -2
  61. package/prompts/audit-agent.md +71 -0
  62. package/prompts/build-planning.md +74 -0
  63. package/prompts/compile-agent.md +132 -28
  64. package/prompts/prereq-builder.md +64 -0
  65. package/prompts/prereq-planner.md +34 -0
  66. package/prompts/tool-planning.md +39 -0
  67. package/src/cli.ts +111 -4
  68. package/src/imprint/agent.ts +5 -0
  69. package/src/imprint/audit.ts +996 -0
  70. package/src/imprint/backend-ladder.ts +1214 -184
  71. package/src/imprint/build-plan.ts +1051 -0
  72. package/src/imprint/cdp-browser-fetch.ts +589 -0
  73. package/src/imprint/cdp-jar-cache.ts +320 -0
  74. package/src/imprint/chromium.ts +135 -0
  75. package/src/imprint/claude-cli-compile.ts +125 -25
  76. package/src/imprint/codex-cli-compile.ts +26 -23
  77. package/src/imprint/compile-agent-types.ts +38 -0
  78. package/src/imprint/compile-agent.ts +65 -27
  79. package/src/imprint/compile-tools.ts +1656 -64
  80. package/src/imprint/compile.ts +14 -2
  81. package/src/imprint/concurrency.ts +87 -0
  82. package/src/imprint/credential-extract.ts +174 -25
  83. package/src/imprint/cron.ts +1 -0
  84. package/src/imprint/doctor.ts +39 -0
  85. package/src/imprint/emit.ts +85 -0
  86. package/src/imprint/freeform-redact.ts +5 -4
  87. package/src/imprint/integrations.ts +2 -2
  88. package/src/imprint/llm.ts +56 -8
  89. package/src/imprint/mcp-compile-server.ts +43 -10
  90. package/src/imprint/mcp-maintenance.ts +9 -101
  91. package/src/imprint/mcp-server.ts +73 -7
  92. package/src/imprint/multi-progress.ts +7 -2
  93. package/src/imprint/param-grounding.ts +367 -0
  94. package/src/imprint/paths.ts +29 -0
  95. package/src/imprint/playbook-runner.ts +101 -40
  96. package/src/imprint/prereq-builder.ts +651 -0
  97. package/src/imprint/probe-backends.ts +6 -3
  98. package/src/imprint/record.ts +10 -1
  99. package/src/imprint/redact.ts +30 -2
  100. package/src/imprint/replay-capture.ts +19 -18
  101. package/src/imprint/runtime.ts +19 -10
  102. package/src/imprint/sensitive-keys.ts +141 -7
  103. package/src/imprint/session-diff.ts +79 -2
  104. package/src/imprint/session-merge.ts +9 -5
  105. package/src/imprint/stealth-chromium.ts +81 -0
  106. package/src/imprint/stealth-fetch.ts +309 -29
  107. package/src/imprint/stealth-token-cache.ts +88 -0
  108. package/src/imprint/teach-plan.ts +251 -0
  109. package/src/imprint/teach-state.ts +17 -0
  110. package/src/imprint/teach.ts +582 -147
  111. package/src/imprint/tool-candidates.ts +72 -14
  112. package/src/imprint/tool-plan.ts +313 -0
  113. package/src/imprint/tracing.ts +135 -6
  114. package/src/imprint/types.ts +61 -3
  115. package/examples/google-flights/search_google_flights/index.ts +0 -101
  116. package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
  117. package/examples/google-flights/search_google_flights/parser.ts +0 -189
  118. package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
  119. package/examples/google-flights/search_google_flights/workflow.json +0 -48
  120. package/examples/google-hotels/search_google_hotels/index.ts +0 -194
  121. package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
  122. package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
  123. package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
  124. package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
  125. package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
  126. package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
  127. package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
  128. package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
  129. package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
@@ -21,6 +21,22 @@ import type { CapturedRequest, Session } from './types.ts';
21
21
  const USER_INTERACTION_TYPES = new Set(['click', 'input', 'change', 'submit']);
22
22
  const MULTI_VALUE_HEADERS = new Set(['cookie', 'set-cookie']);
23
23
 
24
/**
 * Reports whether `body` begins like a structured RPC envelope rather than
 * flat text or top-level JSON — e.g. Google `batchexecute`, which prefixes the
 * payload with the `)]}'` anti-XSSI guard and/or sends `<len>\n[...]`
 * length-prefixed frames carrying doubly-encoded JSON as string payloads.
 *
 * The flat-text freeform scanner must not run over such a body: it injects
 * `[REDACTED]` into bare numeric IDs/coordinates inside the inner JSON and
 * makes it unparseable. Structure-aware key-based redaction still applies to
 * clean-JSON bodies; this predicate only gates the flat-text scan.
 *
 * Only the first 64 characters of `body` are inspected; leading whitespace
 * inside that window is ignored.
 *
 * @param body Raw body text.
 * @returns `true` when the inspected prefix starts with the XSSI guard
 *   (`)]}'`, which also covers `)]}',`) or with a 1–9 digit length line
 *   followed by `[` (e.g. `219006\n[`).
 */
export function looksLikeRpcEnvelope(body: string): boolean {
  const prefix = body.slice(0, 64).trimStart();
  const hasXssiGuard = prefix.startsWith(")]}'");
  const hasLengthPrefixedFrame = /^\d{1,9}\r?\n\[/.test(prefix);
  return hasXssiGuard || hasLengthPrefixedFrame;
}
39
+
24
40
  /**
25
41
  * Detect sensitive headers whose values are page-minted constants — baked
26
42
  * into the site's JavaScript, not per-user secrets. The recording starts
@@ -179,7 +195,12 @@ export function redactJsonBody(
179
195
  const visited = visit(inner, [...pathSoFar, k]);
180
196
  out[k] = JSON.stringify(visited);
181
197
  } catch {
182
- const r = freeform ? redactFreeformText(v) : { redacted: v, redactionsCount: 0 };
198
+ // Nested string that isn't parseable JSON: scan it as free text,
199
+ // unless it's a structured RPC envelope (flat-scanning corrupts it).
200
+ const r =
201
+ freeform && !looksLikeRpcEnvelope(v)
202
+ ? redactFreeformText(v)
203
+ : { redacted: v, redactionsCount: 0 };
183
204
  freeformCount += r.redactionsCount;
184
205
  out[k] = r.redacted;
185
206
  }
@@ -228,6 +249,9 @@ export function redactBody(
228
249
  } catch {
229
250
  const formR = redactFormBody(body, formPlaceholders, markerContext);
230
251
  if (formR.redactionsCount > 0 || formR.placeholdersInjected > 0 || !freeform) return formR;
252
+ // A structured RPC envelope (XSSI/length-prefixed) is not flat text —
253
+ // flat-scanning it would corrupt the doubly-encoded JSON payloads it carries.
254
+ if (looksLikeRpcEnvelope(body)) return formR;
231
255
  const freeformR = redactFreeformText(body);
232
256
  return {
233
257
  redacted: freeformR.redacted,
@@ -437,7 +461,11 @@ export function redactSession(
437
461
  response.mimeType,
438
462
  undefined,
439
463
  undefined,
440
- useFreeform,
464
+ // Responses are key-based only: never value-pattern (freeform) scan a
465
+ // server body. Keeps redaction focused on real secrets (post-login
466
+ // cookies + user-entered PII) and avoids corrupting structured RPC
467
+ // envelopes whose payloads are doubly-encoded JSON.
468
+ false,
441
469
  markerContext,
442
470
  );
443
471
  respBody = respBodyR.redacted;
@@ -15,6 +15,7 @@ import { join as pathJoin } from 'node:path';
15
15
  import type { Browser, BrowserContext, Locator, Page } from 'playwright';
16
16
  import { createLog } from './log.ts';
17
17
  import type { CapturedReplayRequest } from './session-diff.ts';
18
+ import { getStealthChromium, getStealthExecutablePath } from './stealth-chromium.ts';
18
19
  import type { CapturedEvent, Session } from './types.ts';
19
20
 
20
21
  const log = createLog('replay-capture');
@@ -62,25 +63,17 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
62
63
 
63
64
  let chromium: typeof import('playwright').chromium;
64
65
  try {
65
- const pwExtra = await import('playwright-extra');
66
- const stealthMod = await import('puppeteer-extra-plugin-stealth');
67
- const stealthFactory =
68
- (stealthMod as { default?: () => unknown }).default ??
69
- (stealthMod as unknown as () => unknown);
70
- pwExtra.chromium.use(stealthFactory() as never);
71
- chromium = pwExtra.chromium as unknown as typeof import('playwright').chromium;
72
- } catch {
73
- try {
74
- const pw = await import('playwright');
75
- chromium = pw.chromium;
76
- } catch (innerErr) {
77
- return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
78
- }
66
+ chromium = await getStealthChromium();
67
+ } catch (innerErr) {
68
+ return { ok: false, requests: [], error: `Playwright not available: ${errMsg(innerErr)}` };
79
69
  }
80
70
 
81
71
  try {
82
72
  replayLog(`launching browser (headed=${!!opts.headed})`);
83
- browser = await chromium.launch({ headless: !opts.headed });
73
+ browser = await chromium.launch({
74
+ headless: !opts.headed,
75
+ executablePath: getStealthExecutablePath(),
76
+ });
84
77
  } catch (err) {
85
78
  replayLog(`browser launch failed: ${errMsg(err)}`);
86
79
  return { ok: false, requests: [], error: `Could not launch Chromium: ${errMsg(err)}` };
@@ -215,11 +208,19 @@ export async function replayRawSession(opts: RawReplayOptions): Promise<ReplayCa
215
208
  opts.onProgress?.(i + 1, replayableEvents.length, captured.length);
216
209
  }
217
210
 
218
- // Allow final network requests to settle
211
+ // Allow final network requests to settle, but never block forever: on a
212
+ // large recording a single hung response-body read can stall allSettled
213
+ // indefinitely (there is no outer timeout on the replay stage). Cap the
214
+ // wait and proceed with whatever bodies are ready — replay-diff is
215
+ // best-effort, so partial captures are acceptable.
216
+ const SETTLE_TIMEOUT_MS = 15_000;
219
217
  replayLog('waiting for networkidle...');
220
- await page.waitForLoadState('networkidle').catch(() => {});
218
+ await page.waitForLoadState('networkidle', { timeout: SETTLE_TIMEOUT_MS }).catch(() => {});
221
219
  await page.waitForTimeout(1000);
222
- await Promise.allSettled(pendingReads);
220
+ await Promise.race([
221
+ Promise.allSettled(pendingReads),
222
+ new Promise<void>((resolve) => setTimeout(resolve, SETTLE_TIMEOUT_MS)),
223
+ ]);
223
224
  captured.sort((a, b) => a.seq - b.seq);
224
225
 
225
226
  replayLog(`replay complete: captured ${captured.length} requests total`);
@@ -113,14 +113,23 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
113
113
  (await loadCredentialStore(opts.workflow.site)) ??
114
114
  emptyStore(opts.workflow.site);
115
115
 
116
- // Validate required parameters are present.
116
+ // Validate required parameters are present and merge declared defaults
117
+ // into the working params map. Without the merge, `parameter.default` would
118
+ // be a presence-sentinel only — the substitution layer at
119
+ // `resolvePlaceholder` would still throw STATE_MISSING because it reads
120
+ // from this map directly. The schema declares `default` as a real value
121
+ // (string | number | boolean), so honor it.
122
+ const params: Record<string, string | number | boolean> = { ...opts.params };
117
123
  for (const p of opts.workflow.parameters) {
118
- if (!(p.name in opts.params) && p.default === undefined) {
119
- return {
120
- ok: false,
121
- error: 'UNKNOWN',
122
- message: `Missing required parameter: ${p.name} (${p.description})`,
123
- };
124
+ if (!(p.name in params)) {
125
+ if (p.default === undefined) {
126
+ return {
127
+ ok: false,
128
+ error: 'UNKNOWN',
129
+ message: `Missing required parameter: ${p.name} (${p.description})`,
130
+ };
131
+ }
132
+ params[p.name] = p.default;
124
133
  }
125
134
  }
126
135
 
@@ -163,7 +172,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
163
172
  if (!req) continue;
164
173
 
165
174
  const subbedResult = substituteRequest(req, {
166
- params: opts.params,
175
+ params,
167
176
  credentials: liveCredentials,
168
177
  responseSlots,
169
178
  state,
@@ -180,7 +189,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
180
189
  subbed.method,
181
190
  subbed.url,
182
191
  responseSlots.map((s) => s.raw),
183
- opts.params,
192
+ params,
184
193
  );
185
194
  if (typeof transformResult === 'string') {
186
195
  subbed.url = transformResult;
@@ -312,7 +321,7 @@ export async function executeWorkflow<T = unknown>(opts: ExecuteOptions): Promis
312
321
  };
313
322
  }
314
323
  finalData = mod.extract(finalData, {
315
- params: opts.params,
324
+ params,
316
325
  responses: responseSlots.map((s) => s.raw),
317
326
  });
318
327
  } catch (err) {
@@ -112,15 +112,126 @@ const SENSITIVE_KEYS = [
112
112
  'dob',
113
113
  ];
114
114
 
115
- const SENSITIVE_KEY_SET = new Set(SENSITIVE_KEYS.map((k) => k.toLowerCase()));
115
// Key-name canonicalizer: lowercases the name and drops every `-` and `_`.
// The key sets in this file store their entries in this normalized form, and
// lookups go through `normalizeKey` (exported further down as an alias of
// this function). A stored raw entry such as `j_password` would otherwise
// never match a lookup, which arrives as `jpassword`. It is declared here,
// ahead of the sets, so they can call it at construction time without
// reordering the file.
const _normalize = (s: string): string => {
  const lowered = s.toLowerCase();
  return lowered.split(/[-_]/).join('');
};
121
+
122
+ const SENSITIVE_KEY_SET = new Set(SENSITIVE_KEYS.map(_normalize));
116
123
 
117
124
  /** Subset of SENSITIVE_KEYS that specifically denote a credential (not PII).
118
125
  * Used by credential-extract.ts when looking for the password half of a
119
- * login form pair — we don't want to treat e.g. `dob` as a password. */
120
- const PASSWORD_LIKE_KEYS = new Set(
121
- ['password', 'passwd', 'pwd', 'pin', 'patronpassword', 'patron_password'].map((k) =>
122
- k.toLowerCase(),
123
- ),
126
+ * login form pair — we don't want to treat e.g. `dob` as a password.
127
+ *
128
+ * Inclusion criterion: a key name that, when present in a request body
129
+ * alongside a username-like partner, almost always means "this is the
130
+ * password the user typed at login time." Be liberal here — false positives
131
+ * cost the user one extra prompt confirmation; false negatives ship broken
132
+ * tools. New additions should reference a real recorded site that broke
133
+ * without them.
134
+ *
135
+ * Sites observed needing each entry:
136
+ * - password / passwd / pwd: most modern APIs
137
+ * - pin: bank / utility login forms
138
+ * - pass: legacy PHP forms (e.g. SMF)
139
+ * - secret: OAuth ROPC payloads
140
+ * - j_password: Java EE / Spring Security default form-login
141
+ * - userpassword / loginpassword / accountpassword:
142
+ * vendor SSO portals that namespace fields
143
+ * - patronpassword / patron_password: Discover & Go libraries (kept for back-compat)
144
+ */
145
+ const PASSWORD_LIKE_ENTRIES = [
146
+ 'password',
147
+ 'passwd',
148
+ 'pwd',
149
+ 'pin',
150
+ 'pass',
151
+ 'secret',
152
+ 'j_password',
153
+ 'userpassword',
154
+ 'loginpassword',
155
+ 'accountpassword',
156
+ 'patronpassword',
157
+ 'patron_password',
158
+ ];
159
+ const PASSWORD_LIKE_KEYS = new Set(PASSWORD_LIKE_ENTRIES.map(_normalize));
160
+
161
+ /** Subset of SENSITIVE_KEYS that specifically denote a username/email/login
162
+ * identifier — the partner half of a username+password login pair.
163
+ *
164
+ * Same inclusion criterion as PASSWORD_LIKE_KEYS: liberal coverage of real
165
+ * recorded forms, narrow enough not to match arbitrary identifiers. Note
166
+ * this set is intentionally distinct from `email`, `phone` etc. in
167
+ * SENSITIVE_KEYS — those get redacted as PII regardless, but only the
168
+ * subset here qualifies as the "username partner" the credential extractor
169
+ * pairs with a password.
170
+ *
171
+ * Sites observed needing each entry:
172
+ * - user / username / user_name / userid / user_id:
173
+ * most APIs
174
+ * - login / loginid / login_id / login_email:
175
+ * REST endpoints that name the form field after the action
176
+ * - email / emailaddress / email_address: email-as-username flows
177
+ * - account / accountid / account_id: enterprise SSO portals
178
+ * - patron / patronnumber / patron_number / patronid / patron_id:
179
+ * library systems (Discover & Go)
180
+ * - j_username: Java EE / Spring Security default form-login
181
+ * - signin / signinid / sign_in_id: vendor SSO portals (Okta-style)
182
+ * - usr / uid: legacy CGI / older PHP
183
+ * - memberid / member_id / membername / member_name:
184
+ * membership-driven sites (gyms, clubs)
185
+ * - customerid / customer_id / customernumber / customer_number:
186
+ * ecommerce account portals
187
+ * - clientid / client_id / clientnumber / client_number:
188
+ * B2B portals (CAUTION: also matches OAuth client_id;
189
+ * credential-extract.ts gates on having a password
190
+ * partner in the same parent, so OAuth token endpoints
191
+ * that pass client_id without a password won't match)
192
+ */
193
+ const USERNAME_LIKE_KEYS = new Set(
194
+ [
195
+ 'user',
196
+ 'username',
197
+ 'user_name',
198
+ 'userid',
199
+ 'user_id',
200
+ 'login',
201
+ 'loginid',
202
+ 'login_id',
203
+ 'loginemail',
204
+ 'login_email',
205
+ 'email',
206
+ 'emailaddress',
207
+ 'email_address',
208
+ 'account',
209
+ 'accountid',
210
+ 'account_id',
211
+ 'patron',
212
+ 'patronnumber',
213
+ 'patron_number',
214
+ 'patronid',
215
+ 'patron_id',
216
+ 'j_username',
217
+ 'signin',
218
+ 'signinid',
219
+ 'sign_in_id',
220
+ 'usr',
221
+ 'uid',
222
+ 'memberid',
223
+ 'member_id',
224
+ 'membername',
225
+ 'member_name',
226
+ 'customerid',
227
+ 'customer_id',
228
+ 'customernumber',
229
+ 'customer_number',
230
+ 'clientid',
231
+ 'client_id',
232
+ 'clientnumber',
233
+ 'client_number',
234
+ ].map(_normalize),
124
235
  );
125
236
 
126
237
  const SENSITIVE_HEADERS = [
@@ -138,7 +249,7 @@ const SENSITIVE_HEADERS = [
138
249
 
139
250
  const SENSITIVE_HEADER_SET = new Set(SENSITIVE_HEADERS.map((h) => h.toLowerCase()));
140
251
 
141
- export const normalizeKey = (s: string): string => s.toLowerCase().replace(/[-_]/g, '');
252
+ export const normalizeKey = _normalize;
142
253
 
143
254
  /** True if the key name suggests a sensitive value (auth, payment, PII). */
144
255
  export function isSensitiveKey(key: string): boolean {
@@ -151,6 +262,29 @@ export function isSensitiveCredentialKey(key: string): boolean {
151
262
  return PASSWORD_LIKE_KEYS.has(normalizeKey(key));
152
263
  }
153
264
 
265
/**
 * Tests whether a key name denotes a username/email/login identifier — the
 * partner half of a login pair. Used in credential extraction and in the
 * pre-emit guardrail that flags workflows templating credentials as plain
 * parameters.
 *
 * @param key Raw key name; it is passed through `normalizeKey` before the
 *   lookup in `USERNAME_LIKE_KEYS`.
 * @returns `true` when the normalized key is in `USERNAME_LIKE_KEYS`.
 */
export function isUsernameLikeKey(key: string): boolean {
  const normalized = normalizeKey(key);
  return USERNAME_LIKE_KEYS.has(normalized);
}
272
+
273
/**
 * Tests whether a key name is either half of a login pair (username or
 * password). Used by the pre-emit guardrail and the post-redact pairing
 * audit, which both need to decide "is this parameter name
 * credential-shaped?" without caring which half it is.
 *
 * @param key Raw key name; normalized once via `normalizeKey` and then looked
 *   up in both `PASSWORD_LIKE_KEYS` and `USERNAME_LIKE_KEYS`.
 * @returns `true` when the normalized key is in either set.
 */
export function isLoginFieldKey(key: string): boolean {
  const normalized = normalizeKey(key);
  return [PASSWORD_LIKE_KEYS, USERNAME_LIKE_KEYS].some((keySet) => keySet.has(normalized));
}
281
+
282
/**
 * Exposes the raw (pre-normalization) password-like key strings for callers
 * that need substring matching against raw body text rather than a parsed-key
 * lookup.
 *
 * @returns The module's `PASSWORD_LIKE_ENTRIES` array itself (not a copy),
 *   typed read-only.
 */
export function passwordLikeTokens(): readonly string[] {
  const rawEntries: readonly string[] = PASSWORD_LIKE_ENTRIES;
  return rawEntries;
}
287
+
154
288
  export function isSensitiveHeader(header: string): boolean {
155
289
  return SENSITIVE_HEADER_SET.has(header.toLowerCase());
156
290
  }
@@ -318,6 +318,17 @@ function suggestStateName(location: string): string {
318
318
  .toLowerCase();
319
319
  }
320
320
 
321
/**
 * Heuristic: does `v` look like an opaque token/id rather than human text, a
 * city name, or a date? Gates provenance-tagging of stable values so an
 * incidental constant (a UI label, the echoed query) isn't treated as a
 * server-provided token. Shared with the build-plan token detector.
 *
 * @param v Candidate value.
 * @returns `true` only when `v` is at least 12 characters long, contains no
 *   whitespace, is not a bare `YYYY-MM-DD` date, and contains at least one
 *   digit or one of `:`, `|`, `_`, `-`.
 */
export function looksLikeToken(v: string): boolean {
  const longEnough = v.length >= 12;
  const hasWhitespace = /\s/.test(v); // multi-word / free text
  // A bare date is 10 characters, so the length guard already rejects it;
  // the explicit check is kept to document intent.
  const isBareDate = /^\d{4}-\d{2}-\d{2}$/.test(v);
  if (!longEnough || hasWhitespace || isBareDate) return false;
  return /[:|_\d-]/.test(v);
}
331
+
321
332
  // ─── Main diff ──────────────────────────────────────────────────────────────
322
333
 
323
334
  export function diffTriagedSessions(
@@ -327,6 +338,12 @@ export function diffTriagedSessions(
327
338
  const pairs = alignRequests(original.requests, replay.requests);
328
339
  const pairedOrigSeqs = new Set(pairs.map((p) => p.originalSeq));
329
340
  const pairedReplaySeqs = new Set(pairs.map((p) => p.replaySeq));
341
+ // `searchPriorResponses` over the replay returns a producer in REPLAY-seq
342
+ // space, but `originalSeq` and every downstream consumer (capture hints,
343
+ // build-plan token detection, the planner) work in ORIGINAL-seq space — so a
344
+ // replay producer must be translated back via the alignment pairs.
345
+ const replayToOriginal = new Map(pairs.map((p) => [p.replaySeq, p.originalSeq]));
346
+ const toOriginalSeq = (replaySeq: number): number => replayToOriginal.get(replaySeq) ?? replaySeq;
330
347
 
331
348
  const classifications: ClassifiedValue[] = [];
332
349
 
@@ -347,17 +364,28 @@ export function diffTriagedSessions(
347
364
  if (v2Value === undefined) continue; // field only in run 1
348
365
 
349
366
  if (v1.value === v2Value) {
367
+ // Stable across runs. Normally a constant — but an OPAQUE stable value
368
+ // that also appears in a PRIOR response is a server-PROVIDED token (e.g.
369
+ // a per-entity id minted by a sibling search tool). The same-flow replay
370
+ // can't expose it by variance (same entity → same token), so recover its
371
+ // provenance from the original responses (already original-seq space).
372
+ // A cross-tool consumer then sources it as a param instead of hardcoding.
373
+ const provider = looksLikeToken(v1.value)
374
+ ? searchPriorResponses(v1.value, original.requests, pair.originalSeq)
375
+ : null;
350
376
  classifications.push({
351
377
  classification: 'constant',
352
378
  location: v1.location,
353
379
  originalSeq: pair.originalSeq,
354
380
  value1: v1.value,
355
381
  value2: v2Value,
382
+ ...(provider ? { producerSeq: provider.seq, producerPath: provider.path } : {}),
356
383
  });
357
384
  continue;
358
385
  }
359
386
 
360
- // Value differs — check if it came from a prior response in run 2
387
+ // Value differs — check if it came from a prior response in run 2,
388
+ // translating the replay producer back to original-seq space.
361
389
  const producer = searchPriorResponses(v2Value, replay.requests, pair.replaySeq);
362
390
 
363
391
  if (producer) {
@@ -368,7 +396,7 @@ export function diffTriagedSessions(
368
396
  originalSeq: pair.originalSeq,
369
397
  value1: v1.value,
370
398
  value2: v2Value,
371
- producerSeq: producer.seq,
399
+ producerSeq: toOriginalSeq(producer.seq),
372
400
  producerPath: producer.path,
373
401
  suggestedStateName: name || undefined,
374
402
  });
@@ -407,3 +435,52 @@ export function triageByAlignment(
407
435
  const aligned = alignRequests(run1TriagedRequests, run2AllRequests);
408
436
  return aligned.filter((pair) => pair.confidence >= 0.5).map((pair) => pair.replaySeq);
409
437
  }
438
+
439
/**
 * Severity order — a value seen varying in ANY pass outranks one seen constant.
 * server_derived (traceable to a response) wins over browser_minted.
 */
const CLASSIFICATION_RANK: Record<ValueClassification, number> = {
  constant: 0,
  browser_minted: 1,
  server_derived: 2,
};

/**
 * Merge `ClassifiedValue`s from several diff passes that all share the SAME
 * `original` recording (so `originalSeq` is a stable join key across passes).
 *
 * Each pass diffs the original recording against one other run — the automated
 * browser replay AND every other real recording of the site. Anti-bot edges
 * (Akamai, DataDome, …) often block the automated replay at the page level, so
 * the replay reproduces only a fraction of the recording's requests and their
 * functional values (GraphQL safelisting signatures, persisted-query hashes,
 * app keys) never get classified. Real recordings come from a trusted browser
 * and DO carry those requests, so diffing recordings against each other
 * recovers the missing signal.
 *
 * Merge rule per (originalSeq, location):
 *   - a value that VARIES in any pass is ephemeral — the strongest non-constant
 *     classification wins (server_derived > browser_minted), preserving its
 *     producer provenance;
 *   - a value constant in every pass that observed it is `constant`.
 * A value the replay never observed (because it was blocked) but that is
 * identical across time-separated recordings is therefore kept as `constant`,
 * not silently dropped.
 *
 * On equal rank the entry from the earliest pass is kept. Output order is the
 * order in which each (originalSeq, location) pair was first seen.
 *
 * @param passes One `ClassifiedValue[]` per diff pass; may be empty.
 * @returns One entry per distinct (originalSeq, location) pair.
 */
export function mergeClassifications(passes: ClassifiedValue[][]): ClassifiedValue[] {
  const byKey = new Map<string, ClassifiedValue>();
  for (const cv of passes.flat()) {
    // The two key parts are NUL-delimited. Plain concatenation is ambiguous:
    // (seq 1, "2.x") and (seq 12, ".x") would both map to "12.x" and one of
    // the two classifications would be silently discarded.
    const key = `${cv.originalSeq}\u0000${cv.location}`;
    const prev = byKey.get(key);
    const outranksPrev =
      prev !== undefined &&
      CLASSIFICATION_RANK[cv.classification] > CLASSIFICATION_RANK[prev.classification];
    if (prev === undefined || outranksPrev) {
      byKey.set(key, cv);
    }
  }
  return [...byKey.values()];
}
@@ -7,7 +7,7 @@
7
7
  * pipeline consumes unchanged.
8
8
  */
9
9
 
10
- import { existsSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
10
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
11
11
  import { join as pathJoin } from 'node:path';
12
12
  import { localSessionsDir } from './paths.ts';
13
13
  import { friendlySessionTimestamp } from './teach-state.ts';
@@ -34,10 +34,13 @@ interface SessionInfo {
34
34
  }
35
35
 
36
36
  export function listSiteSessions(site: string): SessionInfo[] {
37
- const sessDir = localSessionsDir(site);
38
- if (!existsSync(sessDir)) return [];
37
+ return listSessionsInDir(localSessionsDir(site));
38
+ }
39
+
40
+ export function listSessionsInDir(dir: string): SessionInfo[] {
41
+ if (!existsSync(dir)) return [];
39
42
 
40
- const files = readdirSync(sessDir).filter(
43
+ const files = readdirSync(dir).filter(
41
44
  (f) =>
42
45
  f.endsWith('.json') &&
43
46
  !f.includes('.redacted') &&
@@ -47,7 +50,7 @@ export function listSiteSessions(site: string): SessionInfo[] {
47
50
 
48
51
  const infos: SessionInfo[] = [];
49
52
  for (const filename of files) {
50
- const absPath = pathJoin(sessDir, filename);
53
+ const absPath = pathJoin(dir, filename);
51
54
  try {
52
55
  const raw = JSON.parse(readFileSync(absPath, 'utf8'));
53
56
  const session = SessionSchema.parse(raw);
@@ -190,6 +193,7 @@ export function mergeSessions(sessions: Session[]): Session {
190
193
 
191
194
  export function writeCombinedSession(site: string, combined: Session): string {
192
195
  const sessDir = localSessionsDir(site);
196
+ mkdirSync(sessDir, { recursive: true });
193
197
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
194
198
  const filename = `combined-${timestamp}.json`;
195
199
  const absPath = pathJoin(sessDir, filename);
@@ -0,0 +1,81 @@
1
+ import { findChromium } from './chromium.ts';
2
+
3
/**
 * In-flight or completed result of the first successful `getStealthChromium()`
 * load. Reset to `undefined` when a load fails so a later call can retry.
 */
let stealthChromiumLoad: Promise<typeof import('playwright').chromium> | undefined;

/**
 * Performs the actual load: try `playwright-extra` with the stealth plugin
 * registered, otherwise fall back to vanilla `playwright`. Rejects only when
 * vanilla `playwright` cannot be imported either.
 */
async function loadStealthChromium(): Promise<typeof import('playwright').chromium> {
  try {
    const pwExtra = await import('playwright-extra');
    const stealthMod = await import('puppeteer-extra-plugin-stealth');
    // The plugin factory is either the module's default export or the module
    // itself, depending on how the package is resolved.
    const stealthFactory =
      (stealthMod as { default?: () => unknown }).default ??
      (stealthMod as unknown as () => unknown);
    pwExtra.chromium.use(stealthFactory() as never);
    return pwExtra.chromium as unknown as typeof import('playwright').chromium;
  } catch {
    // Deliberate graceful degrade: any failure on the stealth path (missing
    // package or plugin registration error) falls back to vanilla Playwright.
    const pw = await import('playwright');
    return pw.chromium;
  }
}

/**
 * Shared loader for Playwright's chromium with the stealth plugin applied.
 *
 * Stealth patches navigator.webdriver, plugin enumeration, WebGL vendor
 * strings, and other headless-Chrome telltales that anti-bot services
 * (Akamai, Cloudflare, PerimeterX) detect. Vanilla headless Playwright
 * gets tarpitted or 403'd by these services; the stealth-patched chromium
 * loads the same pages in seconds.
 *
 * Falls back to vanilla `playwright` if `playwright-extra` /
 * `puppeteer-extra-plugin-stealth` are not installed (preserves the
 * graceful-degrade behavior of the original duplicated loaders in
 * playbook-runner, replay-capture, and backend-ladder).
 *
 * The load is memoized: this loader is shared by several callers, and
 * `pwExtra.chromium` is a shared object, so calling `.use()` on every
 * invocation would register the stealth plugin again each time. Concurrent
 * callers share one in-flight load.
 *
 * @returns The chromium launcher (stealth-patched when available).
 * @throws If no Playwright is available at all — callers translate the
 *   thrown error into their own result shape. A failed load is not cached.
 */
export async function getStealthChromium(): Promise<typeof import('playwright').chromium> {
  if (stealthChromiumLoad === undefined) {
    stealthChromiumLoad = loadStealthChromium().catch((err: unknown) => {
      // Forget the failure so a later call retries instead of replaying it.
      stealthChromiumLoad = undefined;
      throw err;
    });
  }
  return stealthChromiumLoad;
}
34
+
35
/**
 * Probes whether both `playwright-extra` and `puppeteer-extra-plugin-stealth`
 * can be imported, i.e. whether getStealthChromium() will take the stealth
 * path rather than the vanilla-Playwright fallback.
 *
 * Callers use this to avoid stacking a manual `navigator.webdriver` patch on
 * top of the plugin's: the stealth plugin removes the property the way a real
 * Chrome does (it simply lacks `webdriver`), whereas a redundant
 * `Object.defineProperty(navigator,'webdriver',{get:()=>false})` leaves a
 * non-native property descriptor that is ITSELF a fingerprinting tell. So the
 * manual patch should only run on the vanilla fallback, where it's the only
 * protection. Import resolution is cached, so probing here is cheap.
 *
 * @returns `true` when both imports succeed, `false` if either one throws.
 */
export async function isStealthPluginAvailable(): Promise<boolean> {
  let available = true;
  try {
    await import('playwright-extra');
    await import('puppeteer-extra-plugin-stealth');
  } catch {
    available = false;
  }
  return available;
}
56
+
57
/**
 * Resolves the path of the same Chromium binary `imprint record` uses for the
 * user's recording session — Playwright's bundled "Google Chrome for Testing"
 * (full Chrome build), the system Chrome on macOS, or a Linux distro
 * Chrome/Chromium package, in that order of preference. The lookup itself is
 * delegated to `findChromium()`.
 *
 * Why this matters: by default Playwright's `chromium.launch({ headless: true })`
 * picks `chrome-headless-shell` — a separate stripped-down binary that
 * Akamai / Cloudflare / PerimeterX class anti-bot services detect at the
 * binary/TLS-fingerprint layer regardless of how thoroughly the JS-level
 * `navigator.webdriver` etc. are patched by the stealth plugin. The
 * recording browser uses the FULL Chrome binary and Akamai trusts it; the
 * replay browser using chrome-headless-shell looks like a bot. Using the
 * SAME binary for both eliminates the binary asymmetry.
 *
 * @returns The executable path, or `undefined` when `findChromium()` throws —
 *   callers should then let Playwright fall back to whatever default it finds.
 */
export function getStealthExecutablePath(): string | undefined {
  let executablePath: string | undefined;
  try {
    executablePath = findChromium();
  } catch {
    executablePath = undefined;
  }
  return executablePath;
}