imprint-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/README.md +165 -201
  2. package/examples/discoverandgo/README.md +1 -1
  3. package/examples/echo/README.md +1 -1
  4. package/examples/google-flights/README.md +28 -0
  5. package/examples/google-flights/_shared/batchexecute.ts +63 -0
  6. package/examples/google-flights/_shared/flights_request.ts +95 -0
  7. package/examples/google-flights/_shared/package.json +9 -0
  8. package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
  9. package/examples/google-flights/get_flight_booking_details/package.json +9 -0
  10. package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
  11. package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
  12. package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
  13. package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
  14. package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
  15. package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
  16. package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
  17. package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
  18. package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
  19. package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
  20. package/examples/google-flights/lookup_airport/index.ts +101 -0
  21. package/examples/google-flights/lookup_airport/package.json +9 -0
  22. package/examples/google-flights/lookup_airport/parser.ts +66 -0
  23. package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
  24. package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
  25. package/examples/google-flights/lookup_airport/workflow.json +57 -0
  26. package/examples/google-flights/search_flights/index.ts +219 -0
  27. package/examples/google-flights/search_flights/package.json +9 -0
  28. package/examples/google-flights/search_flights/parser.ts +169 -0
  29. package/examples/google-flights/search_flights/playbook.yaml +184 -0
  30. package/examples/google-flights/search_flights/request-transform.ts +119 -0
  31. package/examples/google-flights/search_flights/workflow.json +143 -0
  32. package/examples/google-hotels/README.md +29 -0
  33. package/examples/google-hotels/_shared/batchexecute.ts +73 -0
  34. package/examples/google-hotels/_shared/freq.ts +158 -0
  35. package/examples/google-hotels/_shared/package.json +9 -0
  36. package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
  37. package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
  38. package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
  39. package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
  40. package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
  41. package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
  42. package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
  43. package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
  44. package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
  45. package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
  46. package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
  47. package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
  48. package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
  49. package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
  50. package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
  51. package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
  52. package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
  53. package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
  54. package/examples/google-hotels/search_hotels/index.ts +207 -0
  55. package/examples/google-hotels/search_hotels/package.json +9 -0
  56. package/examples/google-hotels/search_hotels/parser.ts +260 -0
  57. package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
  58. package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
  59. package/examples/google-hotels/search_hotels/workflow.json +127 -0
  60. package/package.json +3 -2
  61. package/prompts/audit-agent.md +71 -0
  62. package/prompts/build-planning.md +74 -0
  63. package/prompts/compile-agent.md +132 -28
  64. package/prompts/prereq-builder.md +64 -0
  65. package/prompts/prereq-planner.md +34 -0
  66. package/prompts/tool-planning.md +39 -0
  67. package/src/cli.ts +111 -4
  68. package/src/imprint/agent.ts +5 -0
  69. package/src/imprint/audit.ts +996 -0
  70. package/src/imprint/backend-ladder.ts +1214 -184
  71. package/src/imprint/build-plan.ts +1051 -0
  72. package/src/imprint/cdp-browser-fetch.ts +589 -0
  73. package/src/imprint/cdp-jar-cache.ts +320 -0
  74. package/src/imprint/chromium.ts +135 -0
  75. package/src/imprint/claude-cli-compile.ts +125 -25
  76. package/src/imprint/codex-cli-compile.ts +26 -23
  77. package/src/imprint/compile-agent-types.ts +38 -0
  78. package/src/imprint/compile-agent.ts +65 -27
  79. package/src/imprint/compile-tools.ts +1656 -64
  80. package/src/imprint/compile.ts +14 -2
  81. package/src/imprint/concurrency.ts +87 -0
  82. package/src/imprint/credential-extract.ts +174 -25
  83. package/src/imprint/cron.ts +1 -0
  84. package/src/imprint/doctor.ts +39 -0
  85. package/src/imprint/emit.ts +85 -0
  86. package/src/imprint/freeform-redact.ts +5 -4
  87. package/src/imprint/integrations.ts +2 -2
  88. package/src/imprint/llm.ts +56 -8
  89. package/src/imprint/mcp-compile-server.ts +43 -10
  90. package/src/imprint/mcp-maintenance.ts +9 -101
  91. package/src/imprint/mcp-server.ts +73 -7
  92. package/src/imprint/multi-progress.ts +7 -2
  93. package/src/imprint/param-grounding.ts +367 -0
  94. package/src/imprint/paths.ts +29 -0
  95. package/src/imprint/playbook-runner.ts +101 -40
  96. package/src/imprint/prereq-builder.ts +651 -0
  97. package/src/imprint/probe-backends.ts +6 -3
  98. package/src/imprint/record.ts +10 -1
  99. package/src/imprint/redact.ts +30 -2
  100. package/src/imprint/replay-capture.ts +19 -18
  101. package/src/imprint/runtime.ts +19 -10
  102. package/src/imprint/sensitive-keys.ts +141 -7
  103. package/src/imprint/session-diff.ts +79 -2
  104. package/src/imprint/session-merge.ts +9 -5
  105. package/src/imprint/stealth-chromium.ts +81 -0
  106. package/src/imprint/stealth-fetch.ts +309 -29
  107. package/src/imprint/stealth-token-cache.ts +88 -0
  108. package/src/imprint/teach-plan.ts +251 -0
  109. package/src/imprint/teach-state.ts +17 -0
  110. package/src/imprint/teach.ts +582 -147
  111. package/src/imprint/tool-candidates.ts +72 -14
  112. package/src/imprint/tool-plan.ts +313 -0
  113. package/src/imprint/tracing.ts +135 -6
  114. package/src/imprint/types.ts +61 -3
  115. package/examples/google-flights/search_google_flights/index.ts +0 -101
  116. package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
  117. package/examples/google-flights/search_google_flights/parser.ts +0 -189
  118. package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
  119. package/examples/google-flights/search_google_flights/workflow.json +0 -48
  120. package/examples/google-hotels/search_google_hotels/index.ts +0 -194
  121. package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
  122. package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
  123. package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
  124. package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
  125. package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
  126. package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
  127. package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
  128. package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
  129. package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
@@ -0,0 +1,320 @@
1
+ /**
2
+ * File-backed cache for a cdp-browser-minted Akamai jar (MintedJar), so the
3
+ * "bootstrap-then-fetch" path launches a real Chrome ONCE per validity window
4
+ * and then replays many searches via plain fetch with the cached jar.
5
+ *
6
+ * Validity window: Akamai's ak_bmsc + bm_sv expire ~2h FIXED from first page
7
+ * load (non-sliding — activity does not extend it), so we operate well under
8
+ * that and re-mint after 90 min (JAR_MAX_AGE_SECONDS). A jar is only reusable
9
+ * while its `_abck` is still validated (`~0~`); a jar that has gone stale
10
+ * self-heals via the reactive `clearJar` on a replay 401/403/428/429.
11
+ *
12
+ * The file holds a LIVE session credential (validated _abck + session cookies).
13
+ * It lives under ~/.imprint/<site>/ (never the repo), is gitignored, and must
14
+ * never be copied into examples/ fixtures, PRs, or screenshots.
15
+ */
16
+
17
+ import {
18
+ existsSync,
19
+ mkdirSync,
20
+ readFileSync,
21
+ readdirSync,
22
+ renameSync,
23
+ rmSync,
24
+ statSync,
25
+ writeFileSync,
26
+ } from 'node:fs';
27
+ import { join as pathJoin } from 'node:path';
28
+ import { type MintedJar, jarCookiesValidated } from './cdp-browser-fetch.ts';
29
+ import { createLog } from './log.ts';
30
+
31
+ const log = createLog('cdp-jar');
32
+
33
+ const JAR_FILE = '.cdp-jar.json';
34
+
35
/** Default re-mint threshold: 90 minutes. The ak_bmsc/bm_sv cookies carry a
 * fixed ~2h TTL, so 90 min keeps headroom for snapshot-issuance skew. */
export const JAR_MAX_AGE_SECONDS = 5400;

/**
 * Resolve the maximum jar/recording age (in seconds) in effect for this call.
 *
 * When IMPRINT_JAR_MAX_AGE_SECONDS parses to a finite positive number it
 * overrides JAR_MAX_AGE_SECONDS — intended for a long single-IP teach where the
 * recording has to stay seedable for the whole compile. The override is capped
 * at 7200s (the ~2h cookie TTL) so a typo cannot outlive the real expiry. Any
 * other value (unset, empty, non-numeric, zero, negative) yields the default.
 * The environment is consulted on every call rather than cached.
 */
function jarMaxAgeSeconds(): number {
  const override = Number(process.env.IMPRINT_JAR_MAX_AGE_SECONDS);
  const usable = Number.isFinite(override) && override > 0;
  if (!usable) return JAR_MAX_AGE_SECONDS;
  return override > 7200 ? 7200 : override;
}
49
+
50
/** Location of the jar cache file (JAR_FILE) inside the given site directory. */
function jarPath(siteDir: string): string {
  const location = pathJoin(siteDir, JAR_FILE);
  return location;
}
53
+
54
/**
 * Load the cached jar for `siteDir`.
 *
 * Returns null when the cache file is absent, unreadable, malformed, has an
 * implausible timestamp, is older than the effective max age, or is not
 * validated. The cached `ua` is returned verbatim for replay; no browser is
 * launched here to re-check it.
 *
 * @param siteDir Site directory that holds the jar cache file.
 * @returns The cached jar, or null when the caller should re-mint.
 */
export function loadJar(siteDir: string): MintedJar | null {
  const p = jarPath(siteDir);
  if (!existsSync(p)) return null;
  try {
    const raw = JSON.parse(readFileSync(p, 'utf8')) as Partial<MintedJar> | null;
    if (!raw || !Array.isArray(raw.cookies) || typeof raw.bootstrapEpoch !== 'number') return null;
    const ageSeconds = (Date.now() - raw.bootstrapEpoch) / 1000;
    // A non-finite or future-dated epoch (corrupt file, clock jump) produces a
    // negative/NaN age that would never reach the max-age check below, keeping
    // the jar "fresh" indefinitely. Tolerate a little clock skew, reject the rest.
    const futureSkewToleranceSeconds = 60;
    if (!Number.isFinite(ageSeconds) || ageSeconds < -futureSkewToleranceSeconds) {
      log(
        `cached jar in ${siteDir} has an implausible bootstrapEpoch (${raw.bootstrapEpoch}) — re-mint`,
      );
      return null;
    }
    const maxAge = jarMaxAgeSeconds();
    if (ageSeconds >= maxAge) {
      log(`cached jar in ${siteDir} is ${Math.round(ageSeconds)}s old (>= ${maxAge}s) — re-mint`);
      return null;
    }
    // Prefer the explicit `validated` field; caches written before that field
    // existed only carry `abckFlag`, where '0' marks a validated `_abck`.
    const validated = raw.validated ?? raw.abckFlag === '0';
    if (!validated) {
      log(`cached jar not validated (_abck~${raw.abckFlag}~, no bm_sv) — re-mint`);
      return null;
    }
    return raw as MintedJar;
  } catch (err) {
    // Still best-effort (the caller re-mints), but leave a trace so a
    // permanently unreadable cache file is diagnosable.
    log(
      `cached jar in ${siteDir} is unreadable (${err instanceof Error ? err.message : String(err)}) — re-mint`,
    );
    return null;
  }
}
83
+
84
/**
 * Persist a minted jar to the site's cache file. Best-effort: failures are
 * logged, never thrown.
 *
 * The jar is written to a per-process temp file and then renamed over the
 * final path, so readers never observe a half-written file. Because the file
 * contains a live session credential, the temp file is created owner-only
 * (mode 0o600); the rename carries that mode to the final path.
 *
 * @param siteDir Site directory to write into (created if missing).
 * @param jar The jar to serialize as a single JSON line.
 */
export function saveJar(siteDir: string, jar: MintedJar): void {
  let tmp: string | undefined;
  try {
    mkdirSync(siteDir, { recursive: true });
    const p = jarPath(siteDir);
    tmp = `${p}.${process.pid}.tmp`;
    writeFileSync(tmp, `${JSON.stringify(jar)}\n`, { encoding: 'utf8', mode: 0o600 });
    renameSync(tmp, p);
  } catch (err) {
    log(`failed to persist jar to ${siteDir}: ${err instanceof Error ? err.message : String(err)}`);
    if (tmp) {
      try {
        // Don't leave a stray credential-bearing temp file behind.
        rmSync(tmp, { force: true });
      } catch {
        // best-effort cleanup
      }
    }
  }
}
96
+
97
/**
 * Delete the cached jar for `siteDir`, ignoring any failure.
 *
 * Intended for the reactive self-heal path (a replay answered 401/403/428/429,
 * so the next call should re-mint) and for cleanup when a site's teach run ends.
 */
export function clearJar(siteDir: string): void {
  try {
    const target = jarPath(siteDir);
    rmSync(target, { force: true });
  } catch {
    // Deliberately swallowed: removal is best-effort.
  }
}
106
+
107
/**
 * Find the newest raw recorded session under `<siteDir>/sessions`.
 *
 * Lets callers prefer a fresh recording over an older cached jar — e.g. after
 * the user re-records on a new IP, the new recording must supersede the jar
 * bound to the old IP.
 *
 * Skipped entries:
 *  - anything not ending in `.json`, plus `.redacted.json` / `.triaged.json`;
 *  - synthetic `combined-*` merges (the jar must come from a genuine single
 *    browser recording; teach writes a fresh combined file that would
 *    otherwise always be "newest");
 *  - entries that cannot be stat'ed (dangling symlink, file removed mid-scan)
 *    or are not regular files — one bad entry no longer hides every other
 *    recording.
 *
 * @param siteDir Site directory containing the `sessions` folder.
 * @returns Path and mtime (ms) of the newest candidate, or null if none.
 */
export function newestRecording(siteDir: string): { path: string; mtimeMs: number } | null {
  const sessionsDir = pathJoin(siteDir, 'sessions');
  if (!existsSync(sessionsDir)) return null;
  let names: string[];
  try {
    names = readdirSync(sessionsDir);
  } catch {
    return null;
  }
  let path = '';
  let mtimeMs = 0;
  for (const f of names) {
    if (!f.endsWith('.json') || f.endsWith('.redacted.json') || f.endsWith('.triaged.json')) {
      continue;
    }
    if (f.startsWith('combined-')) continue;
    const p = pathJoin(sessionsDir, f);
    let m: number;
    try {
      const stats = statSync(p);
      if (!stats.isFile()) continue;
      m = stats.mtimeMs;
    } catch {
      // Unreadable entry: skip it rather than abandoning the whole scan.
      continue;
    }
    if (m > mtimeMs) {
      mtimeMs = m;
      path = p;
    }
  }
  return path ? { path, mtimeMs } : null;
}
139
+
140
/**
 * Pull the first non-empty User-Agent value out of a recorded request's
 * headers. Accepts either a `{ name: value }` map or a `[{ name, value }]`
 * list; the header name is matched case-insensitively. Returns '' if absent.
 */
function recordedUserAgent(headers: unknown): string {
  if (typeof headers !== 'object' || headers === null) return '';
  const pairs: Array<[unknown, unknown]> = Array.isArray(headers)
    ? headers.map((h): [unknown, unknown] => [h?.name, h?.value])
    : Object.entries(headers);
  for (const [name, value] of pairs) {
    if (
      typeof name === 'string' &&
      name.toLowerCase() === 'user-agent' &&
      typeof value === 'string' &&
      value
    ) {
      return value;
    }
  }
  return '';
}

/**
 * Seed the jar cache from the most recent RECORDED session for this site, if
 * that recording is fresh and validated. The recording is a real browser
 * session, so replay can reuse the session the user already validated via
 * plain fetch instead of minting a new one.
 *
 * Steps: take the newest raw session (not `.redacted`/`.triaged`), reject it if
 * older than the effective max age, read its "end" cookieSnapshot (or the last
 * snapshot when none is labelled "end"), require `jarCookiesValidated`, pick
 * the recording's User-Agent and bootstrap HTML, and save the result as a jar
 * with `source: 'recording'`. The jar's `bootstrapEpoch` is the recording
 * file's mtime.
 *
 * A session file that is unreadable, not a JSON object, or has no usable
 * cookie snapshot makes this return false instead of throwing.
 *
 * @param siteDir `~/.imprint/<site>`.
 * @param precomputed A `newestRecording()` result the caller already has
 *   (avoids a second readdir+stat and the small TOCTOU window between the
 *   caller's check and the seed). A fresh lookup is done when omitted/null.
 * @param bootstrapUrl The workflow's bootstrap page URL, if any, so the seeded
 *   `html` comes from the recorded response for that page; otherwise the
 *   largest recorded HTML document is used.
 * @returns true if the recording passed every gate and a jar save was
 *   attempted; `saveJar` is best-effort, so a write failure is logged but does
 *   not change the return value.
 */
export function seedJarFromRecording(
  siteDir: string,
  precomputed?: { path: string; mtimeMs: number } | null,
  bootstrapUrl?: string,
): boolean {
  const found = precomputed ?? newestRecording(siteDir);
  if (!found) return false;
  const newest = found.path;
  const newestMtime = found.mtimeMs;
  const ageSeconds = (Date.now() - newestMtime) / 1000;
  const maxAge = jarMaxAgeSeconds();
  if (ageSeconds >= maxAge) {
    log(`newest recording is ${Math.round(ageSeconds)}s old (>= ${maxAge}s) — not seeding`);
    return false;
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(newest, 'utf8'));
  } catch {
    return false;
  }
  // Valid JSON is not necessarily an object (`null`, a number, a string…);
  // dereferencing it below would throw.
  if (typeof parsed !== 'object' || parsed === null) return false;
  const session = parsed as {
    cookieSnapshots?: Array<{ label?: string; cookies?: Array<Record<string, unknown>> }>;
    requests?: Array<{
      requestHeaders?: unknown;
      headers?: unknown;
      url?: string;
      method?: string;
      resourceType?: string;
      response?: { status?: number; mimeType?: string; body?: string };
    }>;
  };
  const snaps = Array.isArray(session.cookieSnapshots) ? session.cookieSnapshots : [];
  const requests = Array.isArray(session.requests) ? session.requests : [];
  const end = snaps.find((s) => s?.label === 'end') ?? snaps[snaps.length - 1];
  if (!end || !Array.isArray(end.cookies)) return false;
  const cookies = end.cookies
    .filter((c) => typeof c === 'object' && c !== null)
    .map((c) => ({
      name: c.name as string,
      value: c.value as string,
      domain: c.domain as string,
      path: (c.path as string) ?? '/',
      expires: typeof c.expires === 'number' && c.expires > 0 ? (c.expires as number) : undefined,
      httpOnly: c.httpOnly as boolean | undefined,
      secure: c.secure as boolean | undefined,
      sameSite: c.sameSite as string | undefined,
    }));
  const abck = cookies.find((c) => c.name === '_abck')?.value;
  // The flag is the second `~`-separated field of `_abck`; '?' when unknown.
  const abckFlag = (typeof abck === 'string' ? abck.split('~')[1] : undefined) ?? '?';
  // Validation is delegated to jarCookiesValidated (accepts `_abck~0~` OR a
  // `bm_sv` cookie): a working recording often ends with `_abck~-1~` + `bm_sv`,
  // so gating on the `_abck` flag alone would wrongly reject it.
  if (!jarCookiesValidated(cookies)) {
    log(`newest recording is not validated (_abck~${abckFlag}~, no bm_sv) — not seeding`);
    return false;
  }
  let ua = '';
  for (const r of requests) {
    const candidate = recordedUserAgent(r?.requestHeaders ?? r?.headers);
    if (candidate) {
      ua = candidate;
      break;
    }
  }
  if (!ua) {
    // An empty UA means replay falls back to the runtime default UA, which may
    // not match the UA the recording's jar was bound to. Surface it so a
    // mysteriously-rejected jar is debuggable rather than silently degraded.
    log(
      `WARNING: no User-Agent found in recording ${newest}; seeded jar has no UA (replay will use the default UA — may not match the jar)`,
    );
  }
  // Seed jar.html from the recorded bootstrap page so html_regex bootstrap
  // captures (csrf / csp-nonce scraped from the page) resolve on this path the
  // same way they would from a freshly captured page.
  const html = pickBootstrapHtml(requests, bootstrapUrl);
  saveJar(siteDir, {
    cookies,
    ua,
    html,
    bootstrapEpoch: Math.round(newestMtime),
    abckFlag,
    validated: true, // gated above on jarCookiesValidated
    source: 'recording',
  });
  log(
    `seeded jar from recording ${newest} (${cookies.length} cookies, _abck~${abckFlag}~, bm_sv-validated, ua=${ua ? `${ua.slice(0, 40)}…` : '(none)'}, html=${html.length}b)`,
  );
  return true;
}
258
+
259
/**
 * Select the recorded HTML body to store as jar.html for html_regex bootstrap
 * captures.
 *
 * Candidate pool: responses with `resourceType === 'Document'` and a non-empty
 * string body. If the recording has none (older recordings may lack
 * resourceType), fall back to text/html GET responses with a non-empty body
 * whose URL does not look like an XHR endpoint (`.act` or `/api/`).
 *
 * Pick order within the pool:
 *  1. the first candidate whose URL equals `bootstrapUrl` exactly;
 *  2. the first candidate with the same origin + pathname (query ignored);
 *  3. the candidate with the largest body (first one wins a tie).
 *
 * @returns The chosen body, or '' when the pool is empty.
 */
function pickBootstrapHtml(
  requests: Array<{
    url?: string;
    method?: string;
    resourceType?: string;
    response?: { mimeType?: string; body?: string };
  }>,
  bootstrapUrl?: string,
): string {
  type Recorded = (typeof requests)[number];
  // '' stands for "no usable body" — a missing, non-string or empty body.
  const bodyText = (r: Recorded): string => {
    const b = r.response?.body;
    return typeof b === 'string' ? b : '';
  };
  const xhrShapedUrl = /\.act(\?|$)|\/api\//i;

  let candidates = requests.filter((r) => r.resourceType === 'Document' && bodyText(r) !== '');
  if (candidates.length === 0) {
    // No top-level Documents recorded: widen to HTML-typed GET responses, still
    // keeping XHR fragments out so one is never mistaken for the page.
    candidates = requests.filter((r) => {
      if (!(r.response?.mimeType ?? '').includes('text/html')) return false;
      if (bodyText(r) === '') return false;
      if ((r.method ?? 'GET').toUpperCase() !== 'GET') return false;
      return !xhrShapedUrl.test(r.url ?? '');
    });
  }
  if (candidates.length === 0) return '';

  if (bootstrapUrl) {
    for (const r of candidates) {
      if (r.url === bootstrapUrl) return bodyText(r);
    }
    let target: URL | undefined;
    try {
      target = new URL(bootstrapUrl);
    } catch {
      // bootstrapUrl is not a valid URL — the largest-body fallback applies.
      target = undefined;
    }
    if (target) {
      for (const r of candidates) {
        let recorded: URL;
        try {
          recorded = new URL(r.url ?? '');
        } catch {
          continue;
        }
        if (recorded.origin === target.origin && recorded.pathname === target.pathname) {
          return bodyText(r);
        }
      }
    }
  }

  let largest = candidates[0];
  for (const r of candidates) {
    if (bodyText(r).length > bodyText(largest).length) largest = r;
  }
  return bodyText(largest);
}
@@ -21,6 +21,45 @@ interface LaunchOptions {
21
21
  userDataDir?: string;
22
22
  /** Extra Chromium flags (advanced). */
23
23
  extraArgs?: string[];
24
+ /** X display for HEADED Chrome on Linux (e.g. ":0", ":99"). Defaults to
25
+ * `process.env.DISPLAY`; if that's also empty AND we're launching headed on
26
+ * Linux, a virtual framebuffer (Xvfb) is started automatically and torn down
27
+ * on close(). Ignored on macOS/Windows (they use the native window server)
28
+ * and for headless launches (which need no display). */
29
+ display?: string;
30
+ /** Upstream proxy for ALL of this Chrome's traffic, e.g.
31
+ * "http://host:port" or "socks5://host:port". Use to egress the trusted
32
+ * bootstrap + in-page requests through a RESIDENTIAL IP — Akamai (and most
33
+ * bot defenses) heavily penalize datacenter/cloud egress, so minting a
34
+ * high-trust `_abck` from an AWS/GCP box needs a residential proxy here.
35
+ * Defaults to `proxyUrl()` (IMPRINT_PROXY env). Note: Chrome's
36
+ * `--proxy-server` takes no inline credentials; use an IP-authed proxy or a
37
+ * scheme://host:port URL (auth is handled separately if needed). */
38
+ proxy?: string;
39
+ }
40
+
41
/**
 * Read the configured upstream proxy from IMPRINT_PROXY.
 *
 * Kept in one place so the browser launch and every plain-fetch replay path
 * can egress through the same address — a jar minted through the proxy but
 * replayed from a different IP risks being dropped on the mismatch.
 *
 * @returns The trimmed value, or undefined when unset or blank.
 */
export function proxyUrl(): string | undefined {
  const configured = process.env.IMPRINT_PROXY?.trim();
  if (!configured) return undefined;
  return configured;
}
49
+
50
/**
 * Convert a proxy setting into a value usable with Chrome's `--proxy-server`,
 * which does not accept inline credentials.
 *
 * - `scheme://[user:pass@]host[:port][/…]` becomes `scheme://host[:port]`
 *   (credentials, path and query dropped).
 * - A bare `host:port` is passed through unchanged.
 * - Surrounding whitespace is ignored.
 *
 * @param proxy Proxy setting as configured by the user.
 * @returns The sanitized argument, or null when the input is unparseable or
 *   names no host (e.g. `socks5://`), so callers never pass Chrome a bare
 *   `scheme://`.
 */
export function chromeProxyArg(proxy: string): string | null {
  const candidate = proxy.trim();
  if (candidate.includes('://')) {
    try {
      const u = new URL(candidate);
      // Non-special schemes parse successfully with an empty host; that is not
      // a usable proxy address.
      if (!u.host) return null;
      return `${u.protocol}//${u.host}`;
    } catch {
      return null;
    }
  }
  // Plain host:port (new URL would misparse the host as a scheme).
  return /^[\w.-]+:\d+$/.test(candidate) ? candidate : null;
}
25
64
 
26
65
  interface LaunchedChromium {
@@ -128,6 +167,76 @@ export function findChromium(): string {
128
167
  );
129
168
  }
130
169
 
170
/** A running virtual X display together with the means to shut it down. */
interface XvfbHandle {
  /** X display string, e.g. ":99". */
  display: string;
  /** Stop the Xvfb process backing this display. */
  close(): Promise<void>;
}

/** Remediation text appended to every Xvfb start-up failure. */
const XVFB_HINT = [
  'The trusted-browser replay needs a display. Install Xvfb (Debian/Ubuntu: ',
  '`apt-get install xvfb`), or run with an existing display: `DISPLAY=:0 imprint …`. ',
  'Run `imprint doctor` to check.',
].join('');

/**
 * Turn a failure from spawning Xvfb into a user-facing message.
 *
 * An `ENOENT` code is reported as "Xvfb not found on PATH."; anything else is
 * reported with the error's own message (or its string form when it is not an
 * Error). The remediation hint follows on the next line in both cases.
 */
function xvfbErrorMessage(err: unknown): string {
  const { code } = (err ?? {}) as { code?: string };
  if (code === 'ENOENT') {
    return `Xvfb not found on PATH.\n${XVFB_HINT}`;
  }
  const detail = err instanceof Error ? err.message : String(err);
  return `Failed to start Xvfb: ${detail}\n${XVFB_HINT}`;
}
185
+
186
/**
 * Spawn a virtual X framebuffer (Xvfb) so HEADED Chrome can run on a Linux
 * machine with no physical display, and return a handle to tear it down.
 *
 * Picks a display number in :99–:120 that has neither an X socket
 * (`/tmp/.X11-unix/X<n>`) nor an X lock file (`/tmp/.X<n>-lock`; a stale lock
 * makes the X server refuse to start), launches Xvfb on it with a 1920x1080x24
 * screen and TCP listening disabled, then polls up to 5s for the socket.
 *
 * @returns The display string and a `close()` that stops the Xvfb process
 *   (SIGTERM, then SIGKILL if it has not exited within ~1s).
 * @throws Error when no display number is free, Xvfb cannot be spawned, Xvfb
 *   exits or is killed before the display is ready, or the socket does not
 *   appear within 5s. Every message ends with the install/usage hint.
 */
async function startXvfb(): Promise<XvfbHandle> {
  const firstDisplay = 99;
  const lastDisplay = 120;
  const socketPath = (n: number): string => `/tmp/.X11-unix/X${n}`;
  const lockPath = (n: number): string => `/tmp/.X${n}-lock`;

  // Pick a display number that is not in use (socket) and not blocked by a
  // leftover lock file.
  let displayNum = -1;
  for (let n = firstDisplay; n <= lastDisplay; n++) {
    if (!existsSync(socketPath(n)) && !existsSync(lockPath(n))) {
      displayNum = n;
      break;
    }
  }
  if (displayNum < 0) {
    throw new Error(
      `No free X display between :${firstDisplay} and :${lastDisplay} — could not start a virtual display.\n${XVFB_HINT}`,
    );
  }
  const display = `:${displayNum}`;
  const proc = spawn('Xvfb', [display, '-screen', '0', '1920x1080x24', '-nolisten', 'tcp'], {
    stdio: ['ignore', 'ignore', isDebug() ? 'pipe' : 'ignore'],
    detached: false,
  });
  let spawnError: unknown;
  proc.on('error', (err) => {
    spawnError = err;
  });
  if (isDebug()) proc.stderr?.on('data', (chunk) => process.stderr.write(chunk));

  // A child terminated by a signal keeps exitCode === null and reports the
  // signal in signalCode, so both must be checked to know it is gone.
  const hasExited = (): boolean => proc.exitCode !== null || proc.signalCode !== null;

  const teardown = async (): Promise<void> => {
    if (hasExited()) return;
    proc.kill('SIGTERM');
    await Promise.race([
      new Promise<void>((resolve) => proc.once('exit', () => resolve())),
      sleep(1000),
    ]);
    if (!hasExited()) proc.kill('SIGKILL');
  };

  // Wait for the X socket to appear (or the process to fail).
  const deadline = Date.now() + 5000;
  while (Date.now() < deadline) {
    if (spawnError) throw new Error(xvfbErrorMessage(spawnError));
    if (hasExited()) {
      const how =
        proc.exitCode !== null ? `code ${proc.exitCode}` : `signal ${String(proc.signalCode)}`;
      throw new Error(
        `Xvfb exited early (${how}) — could not start a virtual display.\n${XVFB_HINT}`,
      );
    }
    if (existsSync(socketPath(displayNum))) {
      return { display, close: teardown };
    }
    await sleep(100);
  }
  await teardown();
  throw new Error(`Xvfb did not create display ${display} within 5s.\n${XVFB_HINT}`);
}
239
+
131
240
  async function pickFreePort(): Promise<number> {
132
241
  return new Promise((resolve, reject) => {
133
242
  const server = createServer();
@@ -178,12 +287,36 @@ export async function launchChromium(opts: LaunchOptions = {}): Promise<Launched
178
287
  '--use-mock-keychain',
179
288
  ];
180
289
  if (opts.headless) args.push('--headless=new');
290
+ const proxy = opts.proxy ?? proxyUrl();
291
+ if (proxy) {
292
+ const arg = chromeProxyArg(proxy);
293
+ if (arg) {
294
+ args.push(`--proxy-server=${arg}`);
295
+ // Route ALL hosts through the proxy (don't let Chrome bypass any) so the
296
+ // egress IP is uniform; without this Chrome may direct-connect some hosts.
297
+ args.push('--proxy-bypass-list=<-loopback>');
298
+ }
299
+ }
181
300
  if (opts.extraArgs) args.push(...opts.extraArgs);
182
301
  args.push(opts.url ?? 'about:blank');
183
302
 
303
+ // Resolve a display for HEADED Chrome. macOS/Windows use the native window
304
+ // server, so DISPLAY is meaningless there — this only applies on Linux. An
305
+ // existing physical/forwarded display ($DISPLAY, or an explicit opts.display)
306
+ // is used as-is; on a headless Linux server with none, spin up a virtual
307
+ // framebuffer so the trusted headed-Chrome replay still works. A headless
308
+ // launch needs no display.
309
+ let xvfb: XvfbHandle | undefined;
310
+ let display = opts.display ?? process.env.DISPLAY;
311
+ if (process.platform === 'linux' && !opts.headless && !display) {
312
+ xvfb = await startXvfb();
313
+ display = xvfb.display;
314
+ }
315
+
184
316
  const child = spawn(exe, args, {
185
317
  stdio: ['ignore', 'ignore', 'pipe'],
186
318
  detached: false,
319
+ env: display ? { ...process.env, DISPLAY: display } : process.env,
187
320
  });
188
321
 
189
322
  // Chromium is noisy — only surface stderr under IMPRINT_DEBUG.
@@ -205,6 +338,8 @@ export async function launchChromium(opts: LaunchOptions = {}): Promise<Launched
205
338
  ]);
206
339
  if (child.exitCode === null) child.kill('SIGKILL');
207
340
  }
341
+ // Tear down the virtual display we started for this launch (if any).
342
+ await xvfb?.close().catch(() => {});
208
343
  };
209
344
 
210
345
  return { process: child, port, userDataDir, ready, close };