imprint-mcp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -201
- package/examples/discoverandgo/README.md +1 -1
- package/examples/echo/README.md +1 -1
- package/examples/google-flights/README.md +28 -0
- package/examples/google-flights/_shared/batchexecute.ts +63 -0
- package/examples/google-flights/_shared/flights_request.ts +95 -0
- package/examples/google-flights/_shared/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/index.ts +159 -0
- package/examples/google-flights/get_flight_booking_details/package.json +9 -0
- package/examples/google-flights/get_flight_booking_details/parser.ts +182 -0
- package/examples/google-flights/get_flight_booking_details/playbook.yaml +138 -0
- package/examples/google-flights/get_flight_booking_details/request-transform.ts +86 -0
- package/examples/google-flights/get_flight_booking_details/workflow.json +98 -0
- package/examples/google-flights/get_flight_calendar_prices/index.ts +131 -0
- package/examples/google-flights/get_flight_calendar_prices/package.json +9 -0
- package/examples/google-flights/get_flight_calendar_prices/parser.ts +86 -0
- package/examples/google-flights/get_flight_calendar_prices/playbook.yaml +97 -0
- package/examples/google-flights/get_flight_calendar_prices/request-transform.ts +31 -0
- package/examples/google-flights/get_flight_calendar_prices/workflow.json +76 -0
- package/examples/google-flights/lookup_airport/index.ts +101 -0
- package/examples/google-flights/lookup_airport/package.json +9 -0
- package/examples/google-flights/lookup_airport/parser.ts +66 -0
- package/examples/google-flights/lookup_airport/playbook.yaml +47 -0
- package/examples/google-flights/lookup_airport/request-transform.ts +20 -0
- package/examples/google-flights/lookup_airport/workflow.json +57 -0
- package/examples/google-flights/search_flights/index.ts +219 -0
- package/examples/google-flights/search_flights/package.json +9 -0
- package/examples/google-flights/search_flights/parser.ts +169 -0
- package/examples/google-flights/search_flights/playbook.yaml +184 -0
- package/examples/google-flights/search_flights/request-transform.ts +119 -0
- package/examples/google-flights/search_flights/workflow.json +143 -0
- package/examples/google-hotels/README.md +29 -0
- package/examples/google-hotels/_shared/batchexecute.ts +73 -0
- package/examples/google-hotels/_shared/freq.ts +158 -0
- package/examples/google-hotels/_shared/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/index.ts +80 -0
- package/examples/google-hotels/autocomplete_hotel_location/package.json +9 -0
- package/examples/google-hotels/autocomplete_hotel_location/parser.ts +71 -0
- package/examples/google-hotels/autocomplete_hotel_location/playbook.yaml +36 -0
- package/examples/google-hotels/autocomplete_hotel_location/request-transform.ts +37 -0
- package/examples/google-hotels/autocomplete_hotel_location/workflow.json +36 -0
- package/examples/google-hotels/get_hotel_booking_options/index.ts +143 -0
- package/examples/google-hotels/get_hotel_booking_options/package.json +9 -0
- package/examples/google-hotels/get_hotel_booking_options/parser.ts +271 -0
- package/examples/google-hotels/get_hotel_booking_options/playbook.yaml +154 -0
- package/examples/google-hotels/get_hotel_booking_options/request-transform.ts +154 -0
- package/examples/google-hotels/get_hotel_booking_options/workflow.json +84 -0
- package/examples/google-hotels/get_hotel_reviews/index.ts +81 -0
- package/examples/google-hotels/get_hotel_reviews/package.json +9 -0
- package/examples/google-hotels/get_hotel_reviews/parser.ts +128 -0
- package/examples/google-hotels/get_hotel_reviews/playbook.yaml +64 -0
- package/examples/google-hotels/get_hotel_reviews/request-transform.ts +42 -0
- package/examples/google-hotels/get_hotel_reviews/workflow.json +37 -0
- package/examples/google-hotels/search_hotels/index.ts +207 -0
- package/examples/google-hotels/search_hotels/package.json +9 -0
- package/examples/google-hotels/search_hotels/parser.ts +260 -0
- package/examples/google-hotels/search_hotels/playbook.yaml +87 -0
- package/examples/google-hotels/search_hotels/request-transform.ts +197 -0
- package/examples/google-hotels/search_hotels/workflow.json +127 -0
- package/package.json +3 -2
- package/prompts/audit-agent.md +71 -0
- package/prompts/build-planning.md +74 -0
- package/prompts/compile-agent.md +131 -27
- package/prompts/prereq-builder.md +64 -0
- package/prompts/prereq-planner.md +34 -0
- package/prompts/tool-planning.md +39 -0
- package/src/cli.ts +109 -2
- package/src/imprint/agent.ts +5 -0
- package/src/imprint/audit.ts +996 -0
- package/src/imprint/backend-ladder.ts +1214 -184
- package/src/imprint/build-plan.ts +1051 -0
- package/src/imprint/cdp-browser-fetch.ts +589 -0
- package/src/imprint/cdp-jar-cache.ts +320 -0
- package/src/imprint/chromium.ts +135 -0
- package/src/imprint/claude-cli-compile.ts +125 -25
- package/src/imprint/codex-cli-compile.ts +26 -23
- package/src/imprint/compile-agent-types.ts +38 -0
- package/src/imprint/compile-agent.ts +63 -25
- package/src/imprint/compile-tools.ts +1656 -64
- package/src/imprint/compile.ts +13 -1
- package/src/imprint/concurrency.ts +87 -0
- package/src/imprint/cron.ts +1 -0
- package/src/imprint/doctor.ts +39 -0
- package/src/imprint/freeform-redact.ts +5 -4
- package/src/imprint/integrations.ts +2 -2
- package/src/imprint/llm.ts +56 -8
- package/src/imprint/mcp-compile-server.ts +43 -10
- package/src/imprint/mcp-maintenance.ts +9 -101
- package/src/imprint/mcp-server.ts +73 -7
- package/src/imprint/multi-progress.ts +7 -2
- package/src/imprint/param-grounding.ts +367 -0
- package/src/imprint/paths.ts +29 -0
- package/src/imprint/playbook-runner.ts +101 -40
- package/src/imprint/prereq-builder.ts +651 -0
- package/src/imprint/probe-backends.ts +6 -3
- package/src/imprint/record.ts +10 -1
- package/src/imprint/redact.ts +30 -2
- package/src/imprint/replay-capture.ts +19 -18
- package/src/imprint/runtime.ts +19 -10
- package/src/imprint/session-diff.ts +79 -2
- package/src/imprint/session-merge.ts +9 -5
- package/src/imprint/stealth-chromium.ts +81 -0
- package/src/imprint/stealth-fetch.ts +309 -29
- package/src/imprint/stealth-token-cache.ts +88 -0
- package/src/imprint/teach-plan.ts +251 -0
- package/src/imprint/teach-state.ts +10 -0
- package/src/imprint/teach.ts +456 -142
- package/src/imprint/tool-candidates.ts +72 -14
- package/src/imprint/tool-plan.ts +313 -0
- package/src/imprint/tracing.ts +135 -6
- package/src/imprint/types.ts +61 -3
- package/examples/google-flights/search_google_flights/index.ts +0 -101
- package/examples/google-flights/search_google_flights/parser.test.ts +0 -140
- package/examples/google-flights/search_google_flights/parser.ts +0 -189
- package/examples/google-flights/search_google_flights/playbook.yaml +0 -130
- package/examples/google-flights/search_google_flights/workflow.json +0 -48
- package/examples/google-hotels/search_google_hotels/index.ts +0 -194
- package/examples/google-hotels/search_google_hotels/parser.test.ts +0 -168
- package/examples/google-hotels/search_google_hotels/parser.ts +0 -330
- package/examples/google-hotels/search_google_hotels/playbook.yaml +0 -125
- package/examples/google-hotels/search_google_hotels/workflow.json +0 -111
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +0 -144
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +0 -380
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +0 -50
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +0 -136
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +0 -97
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File-backed cache for a cdp-browser-minted Akamai jar (MintedJar), so the
|
|
3
|
+
* "bootstrap-then-fetch" path launches a real Chrome ONCE per validity window
|
|
4
|
+
* and then replays many searches via plain fetch with the cached jar.
|
|
5
|
+
*
|
|
6
|
+
* Validity window: Akamai's ak_bmsc + bm_sv expire ~2h FIXED from first page
|
|
7
|
+
* load (non-sliding — activity does not extend it), so we operate well under
|
|
8
|
+
* that and re-mint after 90 min (JAR_MAX_AGE_SECONDS). A jar is only reusable
|
|
9
|
+
 * while it is still validated (`_abck` flag `~0~`, or a `bm_sv` cookie present); a jar that has gone stale
|
|
10
|
+
* self-heals via the reactive `clearJar` on a replay 401/403/428/429.
|
|
11
|
+
*
|
|
12
|
+
* The file holds a LIVE session credential (validated _abck + session cookies).
|
|
13
|
+
* It lives under ~/.imprint/<site>/ (never the repo), is gitignored, and must
|
|
14
|
+
* never be copied into examples/ fixtures, PRs, or screenshots.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import {
|
|
18
|
+
existsSync,
|
|
19
|
+
mkdirSync,
|
|
20
|
+
readFileSync,
|
|
21
|
+
readdirSync,
|
|
22
|
+
renameSync,
|
|
23
|
+
rmSync,
|
|
24
|
+
statSync,
|
|
25
|
+
writeFileSync,
|
|
26
|
+
} from 'node:fs';
|
|
27
|
+
import { join as pathJoin } from 'node:path';
|
|
28
|
+
import { type MintedJar, jarCookiesValidated } from './cdp-browser-fetch.ts';
|
|
29
|
+
import { createLog } from './log.ts';
|
|
30
|
+
|
|
31
|
+
const log = createLog('cdp-jar');
|
|
32
|
+
|
|
33
|
+
const JAR_FILE = '.cdp-jar.json';
|
|
34
|
+
|
|
35
|
+
/** Default jar lifetime in seconds (90 minutes). The hard ceiling is the ~2h
 * fixed ak_bmsc/bm_sv TTL; re-minting at 90 min leaves margin for
 * snapshot-issuance skew. */
export const JAR_MAX_AGE_SECONDS = 90 * 60;

/**
 * Resolve the maximum jar/recording age, in seconds, for the current call.
 *
 * `IMPRINT_JAR_MAX_AGE_SECONDS` overrides the default when it parses to a
 * finite number greater than zero; the override is capped at 7200s (the ~2h
 * ceiling) so a typo cannot push past the real cookie expiry. Any other value
 * (unset, empty, non-numeric, zero, negative) yields JAR_MAX_AGE_SECONDS.
 * The environment variable is re-read on every call.
 */
function jarMaxAgeSeconds(): number {
  const override = Number(process.env.IMPRINT_JAR_MAX_AGE_SECONDS);
  const usable = Number.isFinite(override) && override > 0;
  return usable ? Math.min(override, 7200) : JAR_MAX_AGE_SECONDS;
}
|
|
49
|
+
|
|
50
|
+
/** Location of the jar cache file (JAR_FILE) inside `siteDir`. */
function jarPath(siteDir: string): string {
  const location = pathJoin(siteDir, JAR_FILE);
  return location;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Read the cached jar for `siteDir`.
 *
 * Returns null when the file is absent, unreadable or malformed, when the jar
 * is at least `jarMaxAgeSeconds()` old (measured from `bootstrapEpoch`), or
 * when it is not validated. The cached `ua` is handed back verbatim; a UA
 * drift (Chrome auto-update mid-window) is rare and self-heals reactively on a
 * replay 403, so Chrome is NOT launched here just to gate on the UA.
 */
export function loadJar(siteDir: string): MintedJar | null {
  const file = jarPath(siteDir);
  if (!existsSync(file)) return null;
  try {
    const cached = JSON.parse(readFileSync(file, 'utf8')) as Partial<MintedJar>;
    if (!cached) return null;
    const { cookies, bootstrapEpoch, validated, abckFlag } = cached;
    if (!Array.isArray(cookies) || typeof bootstrapEpoch !== 'number') return null;

    const ageSeconds = (Date.now() - bootstrapEpoch) / 1000;
    const maxAge = jarMaxAgeSeconds();
    if (ageSeconds >= maxAge) {
      log(`cached jar in ${siteDir} is ${Math.round(ageSeconds)}s old (>= ${maxAge}s) — re-mint`);
      return null;
    }

    // Validated = `_abck~0~` OR `bm_sv` present (the latter survives `_abck`
    // rotating back to `~-1~`). Caches written before the `validated` field
    // existed only carry abckFlag, so fall back to that.
    const isValidated = validated ?? abckFlag === '0';
    if (!isValidated) {
      log(`cached jar not validated (_abck~${abckFlag}~, no bm_sv) — re-mint`);
      return null;
    }
    return cached as MintedJar;
  } catch {
    // Unreadable or non-JSON cache file: treat as "no jar".
    return null;
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Persist a minted jar under `siteDir` (atomic temp file + rename). Best-effort:
 * failures are logged, never thrown.
 *
 * The jar is a LIVE session credential, so the temp file is created with mode
 * 0o600 (owner read/write only) instead of the umask default; the rename keeps
 * that mode on the final file. If the write or rename fails, the temp file is
 * removed so partial credential data is not left behind.
 */
export function saveJar(siteDir: string, jar: MintedJar): void {
  let tmp: string | undefined;
  try {
    mkdirSync(siteDir, { recursive: true });
    const p = jarPath(siteDir);
    // Per-process temp name so concurrent writers don't clobber each other's temp file.
    tmp = `${p}.${process.pid}.tmp`;
    writeFileSync(tmp, `${JSON.stringify(jar)}\n`, { encoding: 'utf8', mode: 0o600 });
    renameSync(tmp, p);
  } catch (err) {
    log(`failed to persist jar to ${siteDir}: ${err instanceof Error ? err.message : String(err)}`);
    if (tmp) {
      try {
        rmSync(tmp, { force: true });
      } catch {
        // best-effort cleanup
      }
    }
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Delete the cached jar for `siteDir`, ignoring any failure.
 *
 * Call on a replay 401/403/428/429 so the next call re-mints (reactive
 * self-heal), or when a site's teach run ends.
 */
export function clearJar(siteDir: string): void {
  try {
    const target = jarPath(siteDir);
    // `force` makes a missing file a no-op rather than an error.
    rmSync(target, { force: true });
  } catch {
    // best-effort
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Path + mtime of the newest raw recorded session under `<siteDir>/sessions`
 * (excludes `.redacted.json` / `.triaged.json` and synthetic `combined-*`
 * merges), or null when there is none. Lets callers prefer a fresh recording
 * over an older cached jar — e.g. after the user re-records on a new IP, the
 * fresh recording must supersede the stale (old-IP) cached jar, which would
 * otherwise tarpit.
 *
 * Entries that cannot be stat'ed (dangling symlink, file removed mid-scan) or
 * that are not regular files are skipped individually, so one bad entry does
 * not hide the other recordings.
 */
export function newestRecording(siteDir: string): { path: string; mtimeMs: number } | null {
  const sessionsDir = pathJoin(siteDir, 'sessions');
  if (!existsSync(sessionsDir)) return null;

  let entries: string[];
  try {
    entries = readdirSync(sessionsDir);
  } catch {
    return null;
  }

  let newest: { path: string; mtimeMs: number } | null = null;
  for (const f of entries) {
    if (!f.endsWith('.json') || f.endsWith('.redacted.json') || f.endsWith('.triaged.json')) {
      continue;
    }
    // Skip synthetic `combined-*` merges — the jar must come from a GENUINE
    // single browser recording whose `end` cookieSnapshot carries the real
    // validated session (bm_sv). A combined session is a merge for tool
    // detection and may not preserve a usable validated snapshot. (teach
    // writes a fresh combined-*.json, so without this it'd be "newest".)
    if (f.startsWith('combined-')) continue;

    const p = pathJoin(sessionsDir, f);
    let mtimeMs: number;
    try {
      const st = statSync(p);
      if (!st.isFile()) continue;
      mtimeMs = st.mtimeMs;
    } catch {
      // Vanished or unreadable entry: ignore it and keep scanning.
      continue;
    }
    if (mtimeMs > (newest?.mtimeMs ?? 0)) newest = { path: p, mtimeMs };
  }
  return newest;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Seed the jar cache from the most recent RECORDED session for this site, if
 * fresh + validated. The recording is a REAL browser session, so its `_abck`
 * is HIGH-TRUST (genuine interaction → many sequential .act succeed) — strictly
 * better than a synthetic cdp-browser mint, whose quickly-validated `_abck` is
 * low-trust and gets rate-tarpitted. This is the imprint-native pure-API path:
 * "the recording IS the executable" — replay reuses the session the user already
 * validated, via plain fetch.
 *
 * Reads the newest raw session (not `.redacted`/`.triaged`), takes the "end"
 * cookieSnapshot (or the last snapshot if none is labelled "end") + the
 * recording's UA, and saves a MintedJar. Bound to the recording's IP/UA, so a
 * later 403 (IP/UA changed, or expiry) self-heals to a fresh mint via clearJar.
 *
 * Malformed recordings (JSON that is not an object, non-array snapshot/request
 * lists, null or nameless cookie entries) are tolerated: unusable pieces are
 * skipped and, if nothing usable remains, the function returns false rather
 * than throwing.
 *
 * @param siteDir `~/.imprint/<site>`.
 * @returns true once a validated jar has been handed to saveJar. saveJar is
 *   best-effort and only logs write failures, so true does not guarantee the
 *   file reached disk.
 */
export function seedJarFromRecording(
  siteDir: string,
  // Reuse a newestRecording() result the caller already computed (avoids a
  // second readdir+stat and closes the tiny TOCTOU window between the
  // supersede check and the seed). Falls back to a fresh lookup if omitted.
  precomputed?: { path: string; mtimeMs: number } | null,
  // The workflow's bootstrap page URL (if any), so jar.html is seeded from the
  // recorded response for THAT page — the same page a fresh cdp mint would
  // navigate to. Falls back to the largest recorded text/html Document body.
  bootstrapUrl?: string,
): boolean {
  type RecordedRequest = {
    requestHeaders?: unknown;
    headers?: unknown;
    url?: string;
    method?: string;
    resourceType?: string;
    response?: { status?: number; mimeType?: string; body?: string };
  };
  type RecordedSession = {
    cookieSnapshots?: Array<{ label?: string; cookies?: Array<Record<string, unknown>> }>;
    requests?: RecordedRequest[];
  };

  const found = precomputed ?? newestRecording(siteDir);
  if (!found) return false;
  const newest = found.path;
  const newestMtime = found.mtimeMs;
  const ageSeconds = (Date.now() - newestMtime) / 1000;
  const maxAge = jarMaxAgeSeconds();
  if (ageSeconds >= maxAge) {
    log(`newest recording is ${Math.round(ageSeconds)}s old (>= ${maxAge}s) — not seeding`);
    return false;
  }

  let session: RecordedSession;
  try {
    const parsed: unknown = JSON.parse(readFileSync(newest, 'utf8'));
    // JSON.parse can legally yield null or a primitive; neither is a session.
    if (typeof parsed !== 'object' || parsed === null) return false;
    session = parsed as RecordedSession;
  } catch {
    return false;
  }

  const snaps = Array.isArray(session.cookieSnapshots) ? session.cookieSnapshots : [];
  const end = snaps.find((s) => s?.label === 'end') ?? snaps[snaps.length - 1];
  if (!end || !Array.isArray(end.cookies)) return false;
  const cookies = end.cookies
    // Drop entries that cannot be a cookie (null, or missing a string name/value).
    .filter(
      (c): c is Record<string, unknown> & { name: string; value: string } =>
        typeof c === 'object' &&
        c !== null &&
        typeof c.name === 'string' &&
        typeof c.value === 'string',
    )
    .map((c) => ({
      name: c.name,
      value: c.value,
      domain: c.domain as string,
      path: (c.path as string) ?? '/',
      expires: typeof c.expires === 'number' && c.expires > 0 ? (c.expires as number) : undefined,
      httpOnly: c.httpOnly as boolean | undefined,
      secure: c.secure as boolean | undefined,
      sameSite: c.sameSite as string | undefined,
    }));
  const abck = cookies.find((c) => c.name === '_abck')?.value;
  const abckFlag = abck?.split('~')[1] ?? '?';
  // Validated = `_abck~0~` OR a `bm_sv` cookie (Akamai's validated-session
  // marker). `_abck` rotates back to `~-1~` after clearing a request, so a real
  // working recording often ends with `_abck~-1~` + `bm_sv` — that jar replays
  // fine (verified live: 609KB results). Gating on `_abck==='0'` alone wrongly
  // rejects such recordings.
  if (!jarCookiesValidated(cookies)) {
    log(`newest recording is not validated (_abck~${abckFlag}~, no bm_sv) — not seeding`);
    return false;
  }

  const requests = (Array.isArray(session.requests) ? session.requests : []).filter(
    (r) => typeof r === 'object' && r !== null,
  );

  // First non-empty User-Agent request header in the recording. Headers may be
  // stored as an object map or as a [{ name, value }] list; header names are
  // matched case-insensitively.
  let ua = '';
  for (const r of requests) {
    const raw: unknown = r.requestHeaders ?? r.headers ?? {};
    const pairs: Array<[unknown, unknown]> = Array.isArray(raw)
      ? raw.map((x): [unknown, unknown] => [x?.name, x?.value])
      : Object.entries(raw as Record<string, unknown>);
    const hit = pairs.find(
      ([k, v]) =>
        typeof k === 'string' &&
        k.toLowerCase() === 'user-agent' &&
        typeof v === 'string' &&
        v.length > 0,
    );
    if (hit) {
      ua = hit[1] as string;
      break;
    }
  }
  if (!ua) {
    // Replay (makeJarUaFetch) gates on a non-empty UA, so an empty one means
    // the wire UA falls back to the runtime default — which may not match the
    // UA the recording's jar was bound to and can get the jar dropped on a
    // UA-sensitive (Akamai) origin. Surface it so a mysteriously-rejected jar
    // is debuggable rather than silently degraded.
    log(
      `WARNING: no User-Agent found in recording ${newest}; seeded jar has no UA (replay will use the default UA — may not match the jar)`,
    );
  }
  // Seed jar.html from the recorded bootstrap page so html_regex bootstrap
  // captures (csrf / csp-nonce scraped from the page) resolve on the
  // recording-seed path — exactly as they would from a fresh cdp mint's
  // captured HTML. Without this (the old `html: ''`), any workflow whose
  // requests reference `${state.X}` from an html_regex capture STATE_MISSINGs.
  const html = pickBootstrapHtml(requests, bootstrapUrl);
  saveJar(siteDir, {
    cookies,
    ua,
    html,
    bootstrapEpoch: Math.round(newestMtime),
    abckFlag,
    validated: true, // gated above on jarCookiesValidated
    source: 'recording',
  });
  log(
    `seeded jar from recording ${newest} (${cookies.length} cookies, _abck~${abckFlag}~, bm_sv-validated, ua=${ua ? `${ua.slice(0, 40)}…` : '(none)'}, html=${html.length}b)`,
  );
  return true;
}
|
|
258
|
+
|
|
259
|
+
/**
 * Select the recorded HTML body to store as jar.html for html_regex bootstrap
 * captures.
 *
 * Candidate pool: requests with `resourceType === 'Document'` and a non-empty
 * body. If there are none (older recordings may lack resourceType), the pool
 * becomes GET requests with a text/html mimeType and a non-empty body whose
 * URL does not look like an XHR endpoint (`.act` or `/api/`), so an XHR
 * fragment (e.g. rentalCarDetails.act) is never seeded as the page.
 *
 * Choice within the pool, in order:
 *   1. the first candidate whose URL equals `bootstrapUrl` exactly;
 *   2. the first candidate with the same origin + pathname (query ignored —
 *      the bootstrap URL may carry params the recording didn't);
 *   3. the candidate with the longest body (first one wins a tie).
 *
 * Returns '' when the pool is empty.
 */
function pickBootstrapHtml(
  requests: Array<{
    url?: string;
    method?: string;
    resourceType?: string;
    response?: { mimeType?: string; body?: string };
  }>,
  bootstrapUrl?: string,
): string {
  type Recorded = (typeof requests)[number];
  const bodyOf = (r: Recorded): string =>
    typeof r.response?.body === 'string' ? r.response.body : '';
  const xhrShaped = /\.act(\?|$)|\/api\//i;

  // A bootstrap "page" is a top-level navigation Document, not an XHR fragment.
  let candidates = requests.filter((r) => r.resourceType === 'Document' && bodyOf(r).length > 0);
  if (candidates.length === 0) {
    candidates = requests.filter((r) => {
      if (!(r.response?.mimeType ?? '').includes('text/html')) return false;
      if (bodyOf(r).length === 0) return false;
      if ((r.method ?? 'GET').toUpperCase() !== 'GET') return false;
      return !xhrShaped.test(r.url ?? '');
    });
  }
  if (candidates.length === 0) return '';

  if (bootstrapUrl) {
    const exact = candidates.find((r) => r.url === bootstrapUrl);
    if (exact) return bodyOf(exact);

    let wanted: URL | null;
    try {
      wanted = new URL(bootstrapUrl);
    } catch {
      // bootstrapUrl not a valid URL — fall through to largest-body
      wanted = null;
    }
    if (wanted !== null) {
      for (const r of candidates) {
        let seen: URL;
        try {
          seen = new URL(r.url ?? '');
        } catch {
          continue;
        }
        if (seen.origin === wanted.origin && seen.pathname === wanted.pathname) return bodyOf(r);
      }
    }
  }

  // Largest body: the app shell / fully-rendered page most likely to carry csrf/nonce.
  let largest = '';
  for (const r of candidates) {
    const body = bodyOf(r);
    if (body.length > largest.length) largest = body;
  }
  return largest;
}
|
package/src/imprint/chromium.ts
CHANGED
|
@@ -21,6 +21,45 @@ interface LaunchOptions {
|
|
|
21
21
|
userDataDir?: string;
|
|
22
22
|
/** Extra Chromium flags (advanced). */
|
|
23
23
|
extraArgs?: string[];
|
|
24
|
+
/** X display for HEADED Chrome on Linux (e.g. ":0", ":99"). Defaults to
|
|
25
|
+
* `process.env.DISPLAY`; if that's also empty AND we're launching headed on
|
|
26
|
+
* Linux, a virtual framebuffer (Xvfb) is started automatically and torn down
|
|
27
|
+
* on close(). Ignored on macOS/Windows (they use the native window server)
|
|
28
|
+
* and for headless launches (which need no display). */
|
|
29
|
+
display?: string;
|
|
30
|
+
/** Upstream proxy for ALL of this Chrome's traffic, e.g.
|
|
31
|
+
* "http://host:port" or "socks5://host:port". Use to egress the trusted
|
|
32
|
+
* bootstrap + in-page requests through a RESIDENTIAL IP — Akamai (and most
|
|
33
|
+
* bot defenses) heavily penalize datacenter/cloud egress, so minting a
|
|
34
|
+
* high-trust `_abck` from an AWS/GCP box needs a residential proxy here.
|
|
35
|
+
* Defaults to `proxyUrl()` (IMPRINT_PROXY env). Note: Chrome's
|
|
36
|
+
* `--proxy-server` takes no inline credentials; use an IP-authed proxy or a
|
|
37
|
+
* scheme://host:port URL (auth is handled separately if needed). */
|
|
38
|
+
proxy?: string;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
 * Read the upstream proxy from the IMPRINT_PROXY environment variable.
 *
 * Centralized so the browser launch and every plain-fetch replay path egress
 * through the SAME IP — otherwise a jar minted via the proxy would be replayed
 * from the box's (datacenter) IP and Akamai would drop it on the mismatch.
 *
 * @returns the trimmed value, or undefined when the variable is unset or blank.
 */
export function proxyUrl(): string | undefined {
  const configured = (process.env.IMPRINT_PROXY ?? '').trim();
  return configured === '' ? undefined : configured;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Turn a proxy setting into a value usable with Chrome's `--proxy-server`,
 * which rejects inline credentials.
 *
 * - `scheme://[user:pass@]host[:port]` → `scheme://host[:port]` (as normalized
 *   by the URL parser).
 * - `[user:pass@]host:port` (no scheme) → `host:port`.
 *
 * @returns the credential-free value, or null when the input cannot be parsed
 *   or has no host (e.g. `socks5://`).
 */
export function chromeProxyArg(proxy: string): string | null {
  if (proxy.includes('://')) {
    try {
      const u = new URL(proxy);
      // Non-special schemes parse successfully with an empty host; that is not
      // a usable proxy, so report it as unparseable.
      return u.host ? `${u.protocol}//${u.host}` : null;
    } catch {
      return null;
    }
  }
  // Plain host:port (new URL would misparse the host as a scheme). Anything up
  // to the last '@' is treated as credentials and dropped.
  const hostPort = proxy.slice(proxy.lastIndexOf('@') + 1);
  return /^[\w.-]+:\d+$/.test(hostPort) ? hostPort : null;
}
|
|
25
64
|
|
|
26
65
|
interface LaunchedChromium {
|
|
@@ -128,6 +167,76 @@ export function findChromium(): string {
|
|
|
128
167
|
);
|
|
129
168
|
}
|
|
130
169
|
|
|
170
|
+
/** Handle for a virtual X display started by `startXvfb`. */
interface XvfbHandle {
  /** X display name in `:NN` form, suitable for the DISPLAY environment variable. */
  display: string;
  /** Stop the Xvfb process backing this display. */
  close(): Promise<void>;
}
|
|
174
|
+
|
|
175
|
+
/** Remediation text appended to every Xvfb failure message. */
const XVFB_HINT = [
  'The trusted-browser replay needs a display. Install Xvfb (Debian/Ubuntu: ',
  '`apt-get install xvfb`), or run with an existing display: `DISPLAY=:0 imprint …`. ',
  'Run `imprint doctor` to check.',
].join('');

/**
 * Build the user-facing message for a failed Xvfb spawn.
 *
 * An error whose `code` is 'ENOENT' is reported as "not found on PATH"; any
 * other value is reported with its message (or its string form when it is not
 * an Error). XVFB_HINT is appended on a new line in both cases.
 */
function xvfbErrorMessage(err: unknown): string {
  const missingBinary = (err as { code?: string } | null)?.code === 'ENOENT';
  if (missingBinary) return `Xvfb not found on PATH.\n${XVFB_HINT}`;
  const detail = err instanceof Error ? err.message : String(err);
  return `Failed to start Xvfb: ${detail}\n${XVFB_HINT}`;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Spawn a virtual X framebuffer so HEADED Chrome can run on a Linux server with
 * no physical display. Headed real Chrome (not `--headless`) is the only config
 * some behavioral anti-bot services trust — it has a real GPU/compositor and
 * real window geometry, none of which a headless build exposes. Xvfb is
 * transparent to Chrome: same window + GPU code path, just no monitor.
 *
 * Picks the first display in `:99`–`:120` whose socket under /tmp/.X11-unix
 * does not exist, spawns Xvfb on it, and polls (every 100ms, up to 5s) for the
 * socket to appear.
 *
 * @returns the display name plus a teardown handle (SIGTERM, then SIGKILL if
 *   the process has not exited after 1s).
 * @throws Error when no display number is free, Xvfb cannot be spawned, it
 *   exits (or is killed) before the socket appears, or the socket does not
 *   appear within 5s.
 */
async function startXvfb(): Promise<XvfbHandle> {
  const socketFor = (n: number): string => `/tmp/.X11-unix/X${n}`;

  // Pick a display number whose socket doesn't already exist. Every number in
  // the range is checked: if a taken display were used, the readiness poll
  // below would succeed immediately on someone else's socket.
  let displayNum = -1;
  for (let n = 99; n <= 120; n++) {
    if (!existsSync(socketFor(n))) {
      displayNum = n;
      break;
    }
  }
  if (displayNum < 0) {
    throw new Error(
      'No free X display between :99 and :120 for Xvfb — set DISPLAY to an existing display or remove stale sockets in /tmp/.X11-unix.',
    );
  }

  const display = `:${displayNum}`;
  const proc = spawn('Xvfb', [display, '-screen', '0', '1920x1080x24', '-nolisten', 'tcp'], {
    stdio: ['ignore', 'ignore', isDebug() ? 'pipe' : 'ignore'],
    detached: false,
  });
  let spawnError: unknown;
  proc.on('error', (err) => {
    spawnError = err;
  });
  if (isDebug()) proc.stderr?.on('data', (chunk) => process.stderr.write(chunk));

  // A process terminated by a signal keeps exitCode === null and reports the
  // signal in signalCode, so both must be consulted.
  const hasExited = (): boolean => proc.exitCode !== null || proc.signalCode !== null;

  const teardown = async (): Promise<void> => {
    if (!hasExited()) {
      proc.kill('SIGTERM');
      await Promise.race([
        new Promise<void>((resolve) => proc.once('exit', () => resolve())),
        sleep(1000),
      ]);
      if (!hasExited()) proc.kill('SIGKILL');
    }
  };

  // Wait for the X socket to appear (or the process to fail).
  const deadline = Date.now() + 5000;
  while (Date.now() < deadline) {
    if (spawnError) throw new Error(xvfbErrorMessage(spawnError));
    if (hasExited()) {
      throw new Error(
        `Xvfb exited early (code ${proc.exitCode ?? proc.signalCode}) — could not start a virtual display.\n${XVFB_HINT}`,
      );
    }
    if (existsSync(socketFor(displayNum))) {
      return { display, close: teardown };
    }
    await sleep(100);
  }
  await teardown();
  throw new Error(`Xvfb did not create display ${display} within 5s.\n${XVFB_HINT}`);
}
|
|
239
|
+
|
|
131
240
|
async function pickFreePort(): Promise<number> {
|
|
132
241
|
return new Promise((resolve, reject) => {
|
|
133
242
|
const server = createServer();
|
|
@@ -178,12 +287,36 @@ export async function launchChromium(opts: LaunchOptions = {}): Promise<Launched
|
|
|
178
287
|
'--use-mock-keychain',
|
|
179
288
|
];
|
|
180
289
|
if (opts.headless) args.push('--headless=new');
|
|
290
|
+
const proxy = opts.proxy ?? proxyUrl();
|
|
291
|
+
if (proxy) {
|
|
292
|
+
const arg = chromeProxyArg(proxy);
|
|
293
|
+
if (arg) {
|
|
294
|
+
args.push(`--proxy-server=${arg}`);
|
|
295
|
+
// Route ALL hosts through the proxy (don't let Chrome bypass any) so the
|
|
296
|
+
// egress IP is uniform; without this Chrome may direct-connect some hosts.
|
|
297
|
+
args.push('--proxy-bypass-list=<-loopback>');
|
|
298
|
+
}
|
|
299
|
+
}
|
|
181
300
|
if (opts.extraArgs) args.push(...opts.extraArgs);
|
|
182
301
|
args.push(opts.url ?? 'about:blank');
|
|
183
302
|
|
|
303
|
+
// Resolve a display for HEADED Chrome. macOS/Windows use the native window
|
|
304
|
+
// server, so DISPLAY is meaningless there — this only applies on Linux. An
|
|
305
|
+
// existing physical/forwarded display ($DISPLAY, or an explicit opts.display)
|
|
306
|
+
// is used as-is; on a headless Linux server with none, spin up a virtual
|
|
307
|
+
// framebuffer so the trusted headed-Chrome replay still works. A headless
|
|
308
|
+
// launch needs no display.
|
|
309
|
+
let xvfb: XvfbHandle | undefined;
|
|
310
|
+
let display = opts.display ?? process.env.DISPLAY;
|
|
311
|
+
if (process.platform === 'linux' && !opts.headless && !display) {
|
|
312
|
+
xvfb = await startXvfb();
|
|
313
|
+
display = xvfb.display;
|
|
314
|
+
}
|
|
315
|
+
|
|
184
316
|
const child = spawn(exe, args, {
|
|
185
317
|
stdio: ['ignore', 'ignore', 'pipe'],
|
|
186
318
|
detached: false,
|
|
319
|
+
env: display ? { ...process.env, DISPLAY: display } : process.env,
|
|
187
320
|
});
|
|
188
321
|
|
|
189
322
|
// Chromium is noisy — only surface stderr under IMPRINT_DEBUG.
|
|
@@ -205,6 +338,8 @@ export async function launchChromium(opts: LaunchOptions = {}): Promise<Launched
|
|
|
205
338
|
]);
|
|
206
339
|
if (child.exitCode === null) child.kill('SIGKILL');
|
|
207
340
|
}
|
|
341
|
+
// Tear down the virtual display we started for this launch (if any).
|
|
342
|
+
await xvfb?.close().catch(() => {});
|
|
208
343
|
};
|
|
209
344
|
|
|
210
345
|
return { process: child, port, userDataDir, ready, close };
|