imprint-mcp 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +168 -0
- package/LICENSE +21 -0
- package/README.md +322 -0
- package/examples/discoverandgo/README.md +57 -0
- package/examples/discoverandgo/book_discoverandgo_museum_pass/cron.json +8 -0
- package/examples/discoverandgo/book_discoverandgo_museum_pass/index.ts +89 -0
- package/examples/discoverandgo/book_discoverandgo_museum_pass/workflow.json +39 -0
- package/examples/echo/README.md +37 -0
- package/examples/echo/echo_test/index.ts +31 -0
- package/examples/google-flights/search_google_flights/index.ts +101 -0
- package/examples/google-flights/search_google_flights/parser.test.ts +140 -0
- package/examples/google-flights/search_google_flights/parser.ts +189 -0
- package/examples/google-flights/search_google_flights/playbook.yaml +130 -0
- package/examples/google-flights/search_google_flights/workflow.json +48 -0
- package/examples/google-hotels/search_google_hotels/index.ts +194 -0
- package/examples/google-hotels/search_google_hotels/parser.test.ts +168 -0
- package/examples/google-hotels/search_google_hotels/parser.ts +330 -0
- package/examples/google-hotels/search_google_hotels/playbook.yaml +125 -0
- package/examples/google-hotels/search_google_hotels/workflow.json +111 -0
- package/examples/namecheap-domains/search_namecheap_domains/index.ts +144 -0
- package/examples/namecheap-domains/search_namecheap_domains/parser.ts +380 -0
- package/examples/namecheap-domains/search_namecheap_domains/playbook.yaml +50 -0
- package/examples/namecheap-domains/search_namecheap_domains/request-transform.ts +136 -0
- package/examples/namecheap-domains/search_namecheap_domains/workflow.json +97 -0
- package/examples/southwest/README.md +81 -0
- package/examples/southwest/search_southwest_flights/backends.json +23 -0
- package/examples/southwest/search_southwest_flights/cron.json +19 -0
- package/examples/southwest/search_southwest_flights/index.ts +110 -0
- package/examples/southwest/search_southwest_flights/playbook.yaml +46 -0
- package/examples/southwest/search_southwest_flights/workflow.json +54 -0
- package/package.json +78 -0
- package/prompts/compile-agent.md +580 -0
- package/prompts/intent-detection.md +198 -0
- package/prompts/playbook-compilation.md +279 -0
- package/prompts/request-triage.md +74 -0
- package/prompts/tool-candidate-detection.md +104 -0
- package/src/cli.ts +1287 -0
- package/src/imprint/agent.ts +468 -0
- package/src/imprint/app-api-hosts.ts +53 -0
- package/src/imprint/backend-ladder.ts +568 -0
- package/src/imprint/check.ts +136 -0
- package/src/imprint/chromium.ts +211 -0
- package/src/imprint/claude-cli-compile.ts +640 -0
- package/src/imprint/cli-credential.ts +394 -0
- package/src/imprint/codex-cli-compile.ts +712 -0
- package/src/imprint/compile-agent-types.ts +40 -0
- package/src/imprint/compile-agent.ts +404 -0
- package/src/imprint/compile-tools.ts +1389 -0
- package/src/imprint/compile.ts +720 -0
- package/src/imprint/cookie-jar.ts +246 -0
- package/src/imprint/credential-bundle.ts +195 -0
- package/src/imprint/credential-extract.ts +290 -0
- package/src/imprint/credential-store.ts +707 -0
- package/src/imprint/cron.ts +312 -0
- package/src/imprint/doctor.ts +223 -0
- package/src/imprint/emit.ts +154 -0
- package/src/imprint/etld.ts +134 -0
- package/src/imprint/freeform-redact.ts +216 -0
- package/src/imprint/inject-listener.ts +137 -0
- package/src/imprint/install.ts +795 -0
- package/src/imprint/integrations.ts +385 -0
- package/src/imprint/is-compiled.ts +2 -0
- package/src/imprint/json-path.ts +100 -0
- package/src/imprint/llm.ts +998 -0
- package/src/imprint/load-json.ts +54 -0
- package/src/imprint/log.ts +33 -0
- package/src/imprint/login.ts +166 -0
- package/src/imprint/mcp-compile-server.ts +282 -0
- package/src/imprint/mcp-maintenance.ts +1790 -0
- package/src/imprint/mcp-server.ts +350 -0
- package/src/imprint/multi-progress.ts +69 -0
- package/src/imprint/notify.ts +155 -0
- package/src/imprint/paths.ts +64 -0
- package/src/imprint/playbook-parser.ts +21 -0
- package/src/imprint/playbook-runner.ts +465 -0
- package/src/imprint/probe-backends.ts +251 -0
- package/src/imprint/progress.ts +28 -0
- package/src/imprint/record.ts +470 -0
- package/src/imprint/redact.ts +550 -0
- package/src/imprint/replay-capture.ts +387 -0
- package/src/imprint/request-context.ts +66 -0
- package/src/imprint/runtime-link.ts +73 -0
- package/src/imprint/runtime.ts +942 -0
- package/src/imprint/sensitive-keys.ts +156 -0
- package/src/imprint/session-diff.ts +409 -0
- package/src/imprint/session-merge.ts +198 -0
- package/src/imprint/session-writer.ts +149 -0
- package/src/imprint/sites.ts +27 -0
- package/src/imprint/stealth-fetch.ts +434 -0
- package/src/imprint/teach-state.ts +235 -0
- package/src/imprint/teach.ts +2120 -0
- package/src/imprint/tool-candidates.ts +423 -0
- package/src/imprint/tool-loader.ts +186 -0
- package/src/imprint/tool-selection.ts +70 -0
- package/src/imprint/tracing.ts +508 -0
- package/src/imprint/types.ts +472 -0
- package/src/imprint/version.ts +21 -0
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bypass bot detection without keeping a browser alive.
|
|
3
|
+
*
|
|
4
|
+
* 1. Bootstrap: brief headless Chromium navigation to mint cookies +
|
|
5
|
+
* sensor headers the bot-detection JS (Akamai/Cloudflare/etc) injects.
|
|
6
|
+
* 2. Fetch: native fetch() with those cookies + sensor headers.
|
|
7
|
+
* 3. Refresh: re-bootstrap proactively after maxTokenAgeSeconds AND
|
|
8
|
+
* reactively on 403.
|
|
9
|
+
*
|
|
10
|
+
* ~12s bootstrap one-time, ~1s per API call after.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { Browser } from 'playwright';
|
|
14
|
+
import { isSameRegistrableDomain, registrableDomain } from './etld.ts';
|
|
15
|
+
import { createLog } from './log.ts';
|
|
16
|
+
|
|
17
|
+
/**
 * Caller-facing configuration for `createStealthFetch`. Only `baseUrl` is
 * required; every other field falls back to the default noted on it.
 */
export interface StealthFetchOptions {
  /** Homepage URL to load during bootstrap (triggers bot-detection JS).
   *  Also sent as the `Referer` of every replayed request, and its origin is
   *  used to absolutize request URLs that do not start with `http`. */
  baseUrl: string;
  /** Seconds to wait after page load for sensor initialization. Default 3. */
  sensorWaitSeconds?: number;
  /** Launch headed for debugging. Default false. */
  headed?: boolean;
  /** Custom user agent, used for both the bootstrap browser context and the
   *  `User-Agent` header of replayed requests. Defaults to `DEFAULT_UA`. */
  userAgent?: string;
  /** Max number of auto-re-bootstraps on 403 per fetch call. Default 1. */
  maxRetries?: number;
  /** Proactive refresh threshold. Default 600s (10min) — Akamai's _abck
   *  lifetime varies; this amortizes the bootstrap without risking expiry. */
  maxTokenAgeSeconds?: number;
  /** Stop auto-retrying after this many consecutive 403s so the ladder
   *  can escalate to playbook. Default 3. */
  maxConsecutiveFailures?: number;
}
|
|
35
|
+
|
|
36
|
+
/** Request description handed to the underlying network call. */
export interface FetchInit {
  /** HTTP method; the underlying fetch treats a missing value as `GET`. */
  method?: string;
  /** Request headers as a plain name → value map. */
  headers?: Record<string, string>;
  /** Anything `fetch()` accepts as a body. The retry loop reads this
   *  once per attempt via globalThis.fetch, so non-replayable bodies
   *  (ReadableStream consumed once, hand-rolled iterables) won't survive
   *  a 403 retry — callers that need retry-after-bot-bootstrap should
   *  pass a string, Blob, ArrayBuffer, FormData, or URLSearchParams. */
  body?: RequestInit['body'];
}
|
|
46
|
+
|
|
47
|
+
/** Fully-buffered outcome of one underlying network call. */
interface FetchResult {
  /** HTTP status code; 403 is what triggers a re-bootstrap in the retry loop. */
  status: number;
  /** Mirrors `Response.ok` from the underlying response. */
  ok: boolean;
  /** Response body, already read as text. */
  body: string;
  /** Response headers flattened to a plain name → value map. */
  headers: Record<string, string>;
}
|
|
53
|
+
|
|
54
|
+
/** Bot-detection material captured by one bootstrap and replayed on every
 *  request until it is refreshed. */
export interface TokenCache {
  /** Cookies sent as the `Cookie` header (caller-supplied cookies are merged
   *  on top of these and win on a name clash). */
  cookies: Array<{ name: string; value: string }>;
  /** Headers observed on the probe request that are not in the known/standard
   *  set — treated as sensor-injected and replayed verbatim. */
  sensorHeaders: Record<string, string>;
  /** `Date.now()` (epoch milliseconds) at which the bootstrap finished. */
  bootstrappedAt: number;
}
|
|
59
|
+
|
|
60
|
+
/** Handle returned by `createStealthFetch`. */
export interface StealthFetch {
  /** typeof fetch wrapper that auto-bootstraps + adds sensor headers. */
  readonly fetchImpl: typeof fetch;
  /** Drop cached tokens; next fetch re-bootstraps. Also clears the
   *  failure streak. */
  invalidate(): void;
  /** Token age in seconds; -1 whenever no tokens are cached (not
   *  bootstrapped yet, or dropped by `invalidate()` / a 403 refresh). */
  readonly tokenAgeSeconds: number;
  /** Consecutive 403s; resets on any non-403 response, on every successful
   *  bootstrap, and on `invalidate()`. */
  readonly failureStreak: number;
  /** Future-proof teardown hook. Today: no-op (defaultBootstrap closes
   *  its Browser inside its own try/finally; nothing else to release).
   *  Reserved for an architecture where StealthFetch owns a long-lived
   *  Browser across calls — callers can wire `await sf.close()` into
   *  shutdown handlers now and it'll Just Work later. */
  close(): Promise<void>;
}
|
|
76
|
+
|
|
77
|
+
/** Arguments passed to a bootstrap implementation for one token-minting run. */
interface BootstrapArgs {
  /** Page to navigate to so the bot-detection script runs. */
  baseUrl: string;
  /** URL of the request being made; the default bootstrap fires its
   *  header-capturing probe at this URL, falling back to
   *  `<origin>/api/__stealth_probe__` when absent. */
  probeUrl?: string;
  /** User agent applied to the browser context. */
  userAgent: string;
  /** When true the browser is launched with a visible window. */
  headed: boolean;
  /** Seconds to idle after navigation before probing. */
  sensorWaitSeconds: number;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Test-only seam for swapping the Playwright bootstrap and the
 * sensor-headered network call. Production code never passes these —
 * defaults are real Chromium + globalThis.fetch.
 */
interface StealthFetchInternals {
  /** Replacement for the browser bootstrap; must resolve to a fresh TokenCache. */
  bootstrap?: (args: BootstrapArgs) => Promise<TokenCache>;
  /** Replacement for the network call. Receives the absolute URL, the fully
   *  merged request (headers already include cookies and sensor headers), and
   *  the token cache that was used to build it. */
  underlyingFetch?: (url: string, init: FetchInit, tokens: TokenCache) => Promise<FetchResult>;
}
|
|
94
|
+
|
|
95
|
+
/** User agent used when the caller does not supply one: desktop Chrome 131 on
 *  macOS. The same string is applied to the bootstrap browser context and to
 *  replayed requests. */
const DEFAULT_UA =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
|
|
97
|
+
|
|
98
|
+
/** Standard headers the runtime sets — anything outbound NOT in this set
 *  was injected by sensor JS and is what we capture for replay.
 *  Entries are lower-case because lookups lower-case the header name first. */
const STANDARD_HEADERS = new Set([
  'accept',
  'accept-encoding',
  'accept-language',
  'connection',
  'content-length',
  'content-type',
  'host',
  'origin',
  'referer',
  'sec-ch-ua',
  'sec-ch-ua-mobile',
  'sec-ch-ua-platform',
  'sec-fetch-dest',
  'sec-fetch-mode',
  'sec-fetch-site',
  'user-agent',
  'cookie',
]);
|
|
119
|
+
|
|
120
|
+
/** Regenerate as fresh UUIDs per call. Sites validate these as
 *  unique-per-request and reject replay (verified vs. Southwest's
 *  X-User-Experience-ID → 400 VALIDATION__FIELD__INVALID).
 *  Entries are lower-case; callers lower-case the header name before lookup. */
const FRESH_UUID_HEADERS = new Set([
  'x-user-experience-id',
  'x-request-id',
  'x-correlation-id',
  'x-trace-id',
]);
|
|
129
|
+
|
|
130
|
+
// Module logger created under the 'stealth' name.
const log = createLog('stealth');
|
|
131
|
+
|
|
132
|
+
/**
 * Create a `fetch`-compatible client that replays bot-detection tokens minted
 * by a short-lived browser bootstrap.
 *
 * Tokens (cookies + sensor headers) are obtained lazily on the first request,
 * refreshed proactively once they are `maxTokenAgeSeconds` old, and refreshed
 * reactively when a request comes back 403.
 *
 * @param optsOrUrl Full options, or just the homepage URL to bootstrap against.
 * @param internals Optional replacements for the bootstrap and the network
 *   call; when omitted the real Chromium bootstrap and `globalThis.fetch` are used.
 * @returns The fetch wrapper plus token-age / failure-streak introspection.
 */
export function createStealthFetch(
  optsOrUrl: StealthFetchOptions | string,
  internals?: StealthFetchInternals,
): StealthFetch {
  const o: StealthFetchOptions =
    typeof optsOrUrl === 'string' ? { baseUrl: optsOrUrl } : optsOrUrl;
  const opts = {
    baseUrl: o.baseUrl,
    sensorWaitSeconds: o.sensorWaitSeconds ?? 3,
    headed: o.headed ?? false,
    userAgent: o.userAgent ?? DEFAULT_UA,
    maxRetries: o.maxRetries ?? 1,
    maxTokenAgeSeconds: o.maxTokenAgeSeconds ?? 600,
    maxConsecutiveFailures: o.maxConsecutiveFailures ?? 3,
  };
  const bootstrapFn = internals?.bootstrap ?? defaultBootstrap;
  const underlyingFetchFn = internals?.underlyingFetch ?? defaultUnderlyingFetch;

  let tokens: TokenCache | null = null;
  let consecutiveFailures = 0;

  /** Whole seconds since the cached tokens were minted; -1 when none are cached. */
  const tokenAge = (): number => {
    if (!tokens) return -1;
    return Math.floor((Date.now() - tokens.bootstrappedAt) / 1000);
  };

  /** Make sure `tokens` is populated and not older than the refresh threshold. */
  async function ensureTokens(probeUrl?: string): Promise<void> {
    if (tokens && tokenAge() >= opts.maxTokenAgeSeconds) {
      log(`tokens ${tokenAge()}s old (>= ${opts.maxTokenAgeSeconds}s), refreshing proactively`);
      tokens = null;
    }
    if (tokens) return;
    const t0 = Date.now();
    log('bootstrapping…');
    tokens = await bootstrapFn({
      baseUrl: opts.baseUrl,
      probeUrl,
      userAgent: opts.userAgent,
      headed: opts.headed,
      sensorWaitSeconds: opts.sensorWaitSeconds,
    });
    // NOTE(review): because a 403 retry re-bootstraps and lands here, the
    // streak is zeroed on every retry. With maxRetries >= 1 it looks like
    // `maxConsecutiveFailures` can never be reached — confirm this is intended.
    consecutiveFailures = 0; // fresh tokens → past failures don't count
    log(
      `bootstrapped in ${Date.now() - t0}ms — ${tokens.cookies.length} cookies, ${Object.keys(tokens.sensorHeaders).length} sensor headers`,
    );
  }

  /**
   * Issue one request with the cached tokens, re-bootstrapping and retrying on
   * 403 up to `maxRetries` times. Returns the last result (including a 403)
   * rather than throwing, so the caller can decide how to escalate.
   */
  async function fetchWithRetry(url: string, init?: FetchInit): Promise<FetchResult> {
    const fullUrl = url.startsWith('http') ? url : `${new URL(opts.baseUrl).origin}${url}`;
    await ensureTokens(fullUrl);
    let retries = 0;
    while (true) {
      const t = tokens;
      if (!t) throw new Error('No tokens (bootstrap failed?)');
      const { headers: initHeaders, cookieHeader } = splitCookieHeader(init?.headers ?? {});

      const defaults: Record<string, string> = {
        'User-Agent': opts.userAgent,
        Accept: 'application/json, text/javascript, */*; q=0.01',
        Cookie: mergeCookieHeader(
          t.cookies.map((c) => `${c.name}=${c.value}`).join('; '),
          cookieHeader,
        ),
        Origin: new URL(fullUrl).origin,
        Referer: opts.baseUrl,
      };
      // fetch() derives Content-Type (multipart boundary, form encoding, Blob
      // type) from these body kinds only when the header is absent, so the
      // JSON default must not be forced onto them.
      if (!bodyDerivesContentType(init?.body)) {
        defaults['Content-Type'] = 'application/json';
      }

      const result = await underlyingFetchFn(
        fullUrl,
        {
          method: init?.method ?? 'GET',
          // Later layers win. The merge is case-insensitive so a caller's
          // `content-type` replaces the default `Content-Type` instead of
          // travelling beside it and being combined into one value by fetch().
          headers: mergeHeaderLayers(defaults, t.sensorHeaders, initHeaders),
          body: init?.body,
        },
        t,
      );

      if (result.status === 403) {
        consecutiveFailures++;
        if (consecutiveFailures >= opts.maxConsecutiveFailures) {
          log(
            `${consecutiveFailures} consecutive 403s — giving up on this site (caller should escalate)`,
          );
          return result;
        }
        if (retries < opts.maxRetries) {
          log(`got 403 — re-bootstrapping (attempt ${retries + 1}/${opts.maxRetries})`);
          tokens = null;
          await ensureTokens(fullUrl);
          retries++;
          continue;
        }
        return result;
      }

      // Any non-403 (success or different error) resets the streak.
      consecutiveFailures = 0;
      return result;
    }
  }

  const fetchImpl: typeof fetch = (async (
    input: string | URL | Request,
    init?: RequestInit,
  ): Promise<Response> => {
    // A Request input carries its own method/headers/body; `init` overrides
    // them, matching the standard fetch(input, init) contract.
    const request = typeof input === 'string' || input instanceof URL ? undefined : input;
    const url = request ? request.url : String(input);

    const headers: Record<string, string> = {};
    for (const source of [request?.headers, init?.headers]) {
      if (!source) continue;
      // Headers normalizes names to lower-case, so the later source wins.
      new Headers(source).forEach((v, k) => {
        headers[k] = v;
      });
    }
    // Regenerate per-call UUIDs (captured statics get rejected as stale).
    // Always inject x-user-experience-id — Southwest requires it even
    // when the recorded workflow omits it.
    const present = new Set(Object.keys(headers).map((k) => k.toLowerCase()));
    for (const k of Object.keys(headers)) {
      if (FRESH_UUID_HEADERS.has(k.toLowerCase())) {
        headers[k] = crypto.randomUUID();
      }
    }
    if (!present.has('x-user-experience-id')) {
      headers['X-User-Experience-ID'] = crypto.randomUUID();
    }

    // Pass BodyInit through unchanged; globalThis.fetch handles every
    // accepted shape (string, Blob, ArrayBuffer, FormData, URLSearchParams,
    // ReadableStream). A Request's own body is buffered so it stays
    // replayable across a 403 retry.
    let body: FetchInit['body'] = init?.body ?? undefined;
    if (init?.body === undefined && request && request.body !== null && !request.bodyUsed) {
      body = await request.clone().arrayBuffer();
    }

    const result = await fetchWithRetry(url, {
      method: typeof init?.method === 'string' ? init.method : (request?.method ?? 'GET'),
      headers,
      body,
    });

    // The Response constructor throws when a null-body status is paired with
    // a non-null body, and the buffered body is '' (non-null) for those.
    const isNullBodyStatus =
      result.status === 204 || result.status === 205 || result.status === 304;
    return new Response(isNullBodyStatus ? null : result.body, {
      status: result.status,
      headers: new Headers(result.headers),
    });
  }) as typeof fetch;

  return {
    fetchImpl,
    invalidate(): void {
      tokens = null;
      consecutiveFailures = 0;
    },
    get tokenAgeSeconds(): number {
      return tokenAge();
    },
    get failureStreak(): number {
      return consecutiveFailures;
    },
    // Intentional no-op: the default bootstrap closes its own browser, so
    // there is nothing to release here today.
    // Don't reset tokens/failures here: callers that hit close() are
    // shutting down, not invalidating, and the difference matters if
    // the future architecture grows real cleanup work.
    async close(): Promise<void> {},
  };
}

/** Overlay header maps left → right, matching names case-insensitively so a
 *  later layer replaces (rather than duplicates) an earlier header. The key
 *  spelling of the winning layer is kept. */
function mergeHeaderLayers(...layers: Array<Record<string, string>>): Record<string, string> {
  const merged: Record<string, string> = {};
  const keyByLowerName = new Map<string, string>();
  for (const layer of layers) {
    for (const [name, value] of Object.entries(layer)) {
      const lower = name.toLowerCase();
      const previousKey = keyByLowerName.get(lower);
      if (previousKey !== undefined) delete merged[previousKey];
      merged[name] = value;
      keyByLowerName.set(lower, name);
    }
  }
  return merged;
}

/** True for body kinds whose Content-Type fetch() computes itself when the
 *  header is not set explicitly (FormData, URLSearchParams, typed Blob). */
function bodyDerivesContentType(body: FetchInit['body']): boolean {
  if (body === undefined || body === null) return false;
  if (typeof FormData !== 'undefined' && body instanceof FormData) return true;
  if (typeof URLSearchParams !== 'undefined' && body instanceof URLSearchParams) return true;
  if (typeof Blob !== 'undefined' && body instanceof Blob) return body.type !== '';
  return false;
}
|
|
290
|
+
|
|
291
|
+
/**
 * Separate the cookie header from the rest of a header map.
 *
 * The name is matched case-insensitively. If several spellings of `cookie`
 * are present, the one encountered last is returned. All other entries are
 * copied unchanged, in their original order.
 */
function splitCookieHeader(headers: Record<string, string>): {
  headers: Record<string, string>;
  cookieHeader: string | undefined;
} {
  const passthrough: Record<string, string> = {};
  let cookieHeader: string | undefined;
  for (const [name, value] of Object.entries(headers)) {
    if (name.toLowerCase() !== 'cookie') {
      passthrough[name] = value;
      continue;
    }
    cookieHeader = value;
  }
  return { headers: passthrough, cookieHeader };
}
|
|
306
|
+
|
|
307
|
+
/**
 * Combine two `Cookie` header values into one.
 *
 * Pairs from `runtimeCookie` override same-named pairs from `browserCookie`
 * while keeping the position of the first occurrence. Segments that are empty,
 * have no `=`, or have an empty name are dropped. Only the first `=` splits
 * name from value, so values may themselves contain `=`.
 */
function mergeCookieHeader(browserCookie: string, runtimeCookie: string | undefined): string {
  const jar = new Map<string, string>();
  const segments = [...browserCookie.split(';'), ...(runtimeCookie ?? '').split(';')];
  for (const segment of segments) {
    const pair = segment.trim();
    const separator = pair.indexOf('=');
    // -1 (no '=', including the empty segment) and 0 (empty name) are both skipped.
    if (separator <= 0) continue;
    jar.set(pair.slice(0, separator), pair.slice(separator + 1));
  }
  return Array.from(jar, ([name, value]) => `${name}=${value}`).join('; ');
}
|
|
322
|
+
|
|
323
|
+
/**
 * Default token bootstrap, backed by a real Chromium launched via Playwright.
 *
 * Sequence: launch → open `baseUrl` → idle for `sensorWaitSeconds` → install a
 * catch-all route that records unknown request headers and aborts the request
 * → fire one probe `fetch` from inside the page → read the context's cookies
 * for the site's registrable domain. The browser is always closed on the way
 * out, whether or not the run succeeded.
 *
 * @returns A TokenCache stamped with the current time.
 */
async function defaultBootstrap(args: BootstrapArgs): Promise<TokenCache> {
  const { chromium } = await import('playwright');
  const { baseUrl, probeUrl, userAgent, headed, sensorWaitSeconds } = args;
  let browser: Browser | undefined;
  try {
    browser = await chromium.launch({
      headless: !headed,
      args: ['--disable-blink-features=AutomationControlled', '--no-sandbox'],
    });

    const context = await browser.newContext({
      userAgent,
      viewport: { width: 1440, height: 900 },
      screen: { width: 2560, height: 1440 },
      locale: 'en-US',
      timezoneId: 'America/Los_Angeles',
    });

    const page = await context.newPage();
    // Make `navigator.webdriver` report false before any page script runs.
    await page.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
    });

    // Wait only for 'domcontentloaded': SPAs hold connections open, so
    // 'networkidle' may never arrive. The fixed pause that follows gives the
    // bot-detection script time to run.
    await page.goto(baseUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    await page.waitForTimeout(sensorWaitSeconds * 1000);

    // The probe is sent with a known header set; whatever else shows up on
    // the wire is assumed to have been added by the sensor and is captured.
    const probeHeaders: Record<string, string> = {
      'Content-Type': 'application/json',
      'X-API-Key': 'x',
      'X-App-ID': 'x',
      'X-Channel-ID': 'x',
      'X-User-Experience-ID': 'x',
    };
    const knownHeaderNames = new Set(STANDARD_HEADERS);
    for (const name of Object.keys(probeHeaders)) {
      knownHeaderNames.add(name.toLowerCase());
    }

    const sensorHeaders: Record<string, string> = {};
    await page.route('**/*', async (route) => {
      for (const [name, value] of Object.entries(route.request().headers())) {
        if (knownHeaderNames.has(name.toLowerCase())) continue;
        sensorHeaders[name] = value;
      }
      // Nothing needs to reach the server; only the outgoing headers matter.
      await route.abort();
    });

    const probeTarget = probeUrl ?? `${new URL(baseUrl).origin}/api/__stealth_probe__`;
    await page.evaluate(
      async (probe: { url: string; headers: Record<string, string> }) => {
        try {
          await fetch(probe.url, {
            method: 'POST',
            headers: probe.headers,
            body: '{}',
          });
        } catch {
          // Expected: the route handler aborts the request once headers are recorded.
        }
      },
      { url: probeTarget, headers: probeHeaders },
    );

    await page.waitForTimeout(300);

    // Keep only cookies whose domain shares the site's registrable domain
    // (eTLD+1), so multi-part suffixes such as .co.uk are handled correctly.
    const jar = await context.cookies();
    const siteRoot = registrableDomain(new URL(baseUrl).hostname);
    const cookies: TokenCache['cookies'] = [];
    for (const cookie of jar) {
      const cookieHost = cookie.domain.replace(/^\./, '');
      if (!isSameRegistrableDomain(cookieHost, siteRoot)) continue;
      cookies.push({ name: cookie.name, value: cookie.value });
    }

    return { cookies, sensorHeaders, bootstrappedAt: Date.now() };
  } finally {
    await browser?.close().catch(() => {});
  }
}
|
|
417
|
+
|
|
418
|
+
/**
 * Default network call: forwards the prepared request to `globalThis.fetch`
 * and buffers the response into a plain `FetchResult`.
 *
 * @param url Absolute request URL.
 * @param init Method, headers and body, already merged by the caller.
 * @param _tokens Unused here; part of the signature so replacement
 *   implementations can inspect the tokens behind the request.
 */
async function defaultUnderlyingFetch(
  url: string,
  init: FetchInit,
  _tokens: TokenCache,
): Promise<FetchResult> {
  const { method = 'GET', headers: requestHeaders, body: requestBody } = init;
  const response = await globalThis.fetch(url, {
    method,
    headers: requestHeaders,
    body: requestBody,
  });
  const text = await response.text();
  const responseHeaders: Record<string, string> = {};
  response.headers.forEach((value, name) => {
    responseHeaders[name] = value;
  });
  return {
    status: response.status,
    ok: response.ok,
    body: text,
    headers: responseHeaders,
  };
}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared persistence helpers for `imprint teach` checkpoint state.
|
|
3
|
+
*
|
|
4
|
+
* The state file is intentionally small JSON today, but callers should go
|
|
5
|
+
* through this module so a future DB-backed implementation can keep the same
|
|
6
|
+
* behavior at the CLI boundary.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import {
|
|
10
|
+
existsSync,
|
|
11
|
+
mkdirSync,
|
|
12
|
+
readFileSync,
|
|
13
|
+
readdirSync,
|
|
14
|
+
renameSync,
|
|
15
|
+
statSync,
|
|
16
|
+
unlinkSync,
|
|
17
|
+
writeFileSync,
|
|
18
|
+
} from 'node:fs';
|
|
19
|
+
import {
|
|
20
|
+
basename as pathBasename,
|
|
21
|
+
isAbsolute as pathIsAbsolute,
|
|
22
|
+
join as pathJoin,
|
|
23
|
+
resolve as pathResolve,
|
|
24
|
+
} from 'node:path';
|
|
25
|
+
import {
|
|
26
|
+
localSessionsDir,
|
|
27
|
+
localSiteDir,
|
|
28
|
+
relativeToLocalSite,
|
|
29
|
+
resolveLocalSitePath,
|
|
30
|
+
} from './paths.ts';
|
|
31
|
+
import type { SharedCompileContext, ToolCandidate } from './tool-candidates.ts';
|
|
32
|
+
|
|
33
|
+
/** Ordered checkpoint names for `imprint teach`. The order matters:
 *  `nextTeachStep` uses each entry's index to pick the step that follows it. */
export const TEACH_STEPS = [
  'record',
  'redact',
  'replay-and-diff',
  'triage',
  'detect-candidates',
  'generate',
  'compile-playbook',
  'emit',
  'register',
] as const;

/** Union of the step names listed in `TEACH_STEPS`. */
export type TeachStep = (typeof TEACH_STEPS)[number];
|
|
46
|
+
|
|
47
|
+
/** Persisted checkpoint for a single workflow. */
export interface WorkflowState {
  /** Stored path of the recorded session. Written by this module either
   *  relative to the local site directory or as `_external_/<basename>`;
   *  absolute after a legacy state file has been normalized. */
  sessionPath: string;
  /** Stored path of the redacted session, in the same forms as `sessionPath`. */
  redactedPath?: string;
  /** Presumably the stored path of the triaged session — written outside
   *  this module; verify against the caller. */
  triagedPath?: string;
  /** Presumably the stored path of the classification output — written
   *  outside this module; verify against the caller. */
  classificationsPath?: string;
  /** Steps finished so far; `nextTeachStep` looks only at the last entry. */
  completedSteps: TeachStep[];
  /** NOTE(review): not set anywhere in this module — presumably the message
   *  of the last failure; confirm with the CLI. */
  error?: string;
  /** ISO-8601 timestamp string (`Date#toISOString`) of creation. */
  startedAt: string;
  /** ISO-8601 timestamp string (`Date#toISOString`) of the latest update. */
  updatedAt: string;
  /** NOTE(review): not read or written in this module — confirm semantics
   *  in tool-candidates. */
  candidate?: ToolCandidate;
  /** NOTE(review): not read or written in this module — confirm semantics
   *  in tool-candidates. */
  sharedContext?: SharedCompileContext;
}
|
|
59
|
+
|
|
60
|
+
/** Root of the persisted `.teach-state.json` document. */
export interface TeachState {
  /** Per-workflow checkpoints, keyed by a string id. An empty map causes
   *  `saveTeachState` to delete the state file instead of writing it. */
  workflows: Record<string, WorkflowState>;
}
|
|
63
|
+
|
|
64
|
+
/** Location of the checkpoint file (`.teach-state.json`) inside the site's
 *  local directory. */
export function teachStatePath(site: string): string {
  const siteRoot = localSiteDir(site);
  return pathJoin(siteRoot, '.teach-state.json');
}
|
|
67
|
+
|
|
68
|
+
/** Location of the checkpoint file in the old layout:
 *  `examples/<site>/.teach-state.json`, resolved against the current working
 *  directory. */
function legacyStatePath(site: string): string {
  const segments = ['examples', site, '.teach-state.json'];
  return pathResolve(...segments);
}
|
|
71
|
+
|
|
72
|
+
/**
 * Read the checkpoint state for `site`.
 *
 * The current location is preferred; the legacy `examples/<site>` file is used
 * only when the current one does not exist, and its relative paths are then
 * re-anchored to the legacy directory. A missing, unreadable, unparsable or
 * structurally invalid file yields an empty state instead of throwing.
 *
 * @returns The stored state, or `{ workflows: {} }` when nothing usable exists.
 */
export function loadTeachState(site: string): TeachState {
  const path = teachStatePath(site);
  const isLegacy = !existsSync(path) && existsSync(legacyStatePath(site));
  const loadPath = isLegacy ? legacyStatePath(site) : path;
  if (!existsSync(loadPath)) return { workflows: {} };
  try {
    const parsed: unknown = JSON.parse(readFileSync(loadPath, 'utf8'));
    // Valid JSON is not necessarily a valid state document (`null`, `[]`,
    // `{}`); handing those back would make callers crash on `state.workflows`.
    if (!isTeachStateShape(parsed)) return { workflows: {} };
    return isLegacy ? normalizeLegacyTeachState(site, parsed) : parsed;
  } catch {
    return { workflows: {} };
  }
}

/** Minimal structural check: an object whose `workflows` is a non-array
 *  object. Individual workflow entries are not validated. */
function isTeachStateShape(value: unknown): value is TeachState {
  if (typeof value !== 'object' || value === null) return false;
  const workflows = (value as { workflows?: unknown }).workflows;
  return typeof workflows === 'object' && workflows !== null && !Array.isArray(workflows);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Rewrite a legacy state in place so its relative `sessionPath` and
 * `redactedPath` values become absolute paths under `examples/<site>`.
 * Values that are already absolute, or empty, are left as they are.
 *
 * NOTE(review): `triagedPath` and `classificationsPath` are not re-anchored
 * here — confirm whether legacy files can contain them.
 *
 * @returns The same `state` object that was passed in.
 */
function normalizeLegacyTeachState(site: string, state: TeachState): TeachState {
  const legacyRoot = pathResolve('examples', site);
  const anchor = (stored: string): string =>
    pathIsAbsolute(stored) ? stored : pathResolve(legacyRoot, stored);

  for (const workflow of Object.values(state.workflows)) {
    if (workflow.sessionPath) workflow.sessionPath = anchor(workflow.sessionPath);
    if (workflow.redactedPath) workflow.redactedPath = anchor(workflow.redactedPath);
  }
  return state;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Persist the checkpoint state for `site`.
 *
 * An empty state removes the file rather than writing it. Otherwise the JSON
 * is written to `<file>.tmp` and renamed over the target; if the rename
 * fails, the target is overwritten directly from the temp file and the temp
 * file is then removed on a best-effort basis.
 */
export function saveTeachState(site: string, state: TeachState): void {
  const target = teachStatePath(site);
  // `<target>/..` is the directory that holds the state file.
  mkdirSync(pathJoin(target, '..'), { recursive: true });

  const hasWorkflows = Object.keys(state.workflows).length > 0;
  if (!hasWorkflows) {
    try {
      unlinkSync(target);
    } catch {
      // Nothing to delete is fine.
    }
    return;
  }

  const staging = `${target}.tmp`;
  const serialized = `${JSON.stringify(state, null, 2)}\n`;
  writeFileSync(staging, serialized, 'utf8');

  try {
    renameSync(staging, target);
    return;
  } catch {
    // On Windows a rename can fail when the destination exists; fall
    // through to a plain overwrite.
  }

  writeFileSync(target, readFileSync(staging, 'utf8'), 'utf8');
  try {
    unlinkSync(staging);
  } catch {
    /* ignore */
  }
}
|
|
123
|
+
|
|
124
|
+
/**
 * Turn a path stored in the state file into a usable path.
 *
 * @param site Site whose local directory anchors relative paths.
 * @param storedPath Value read from the state file; surrounding whitespace is ignored.
 * @returns `null` for a missing or blank value, the value itself when it is
 *   absolute, otherwise the value resolved through `resolveLocalSitePath`.
 */
export function resolveTeachStatePath(
  site: string,
  storedPath: string | null | undefined,
): string | null {
  const trimmed = storedPath?.trim();
  if (trimmed === undefined || trimmed === '') return null;
  return pathIsAbsolute(trimmed) ? trimmed : resolveLocalSitePath(site, trimmed);
}
|
|
133
|
+
|
|
134
|
+
/**
 * Convert a path into the form stored in the state file: whatever
 * `relativeToLocalSite` returns when that is truthy, otherwise
 * `_external_/<basename>`.
 */
export function toRelativeTeachStatePath(site: string, absPath: string): string {
  return relativeToLocalSite(site, absPath) || `_external_/${pathBasename(absPath)}`;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Create the initial checkpoint for a recorded session.
 *
 * `record` is always marked complete; `redact` is added, and `redactedPath`
 * stored, only when a redacted file path is supplied. Both timestamps are set
 * to the same current instant.
 */
export function buildTeachStateFromSession(
  site: string,
  sessionPath: string,
  redactedPath: string | null,
): WorkflowState {
  const stamp = new Date().toISOString();
  const workflow: WorkflowState = {
    sessionPath: toRelativeTeachStatePath(site, sessionPath),
    completedSteps: ['record'],
    startedAt: stamp,
    updatedAt: stamp,
  };
  if (!redactedPath) return workflow;

  workflow.completedSteps.push('redact');
  workflow.redactedPath = toRelativeTeachStatePath(site, redactedPath);
  return workflow;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Pick the step to run next, based only on the LAST entry of `completed`.
 *
 * Returns `'record'` when nothing is completed, when the last entry is not a
 * known step, or when the last entry is the final step; otherwise returns the
 * step that follows it in `TEACH_STEPS`.
 */
export function nextTeachStep(completed: TeachStep[]): TeachStep {
  const latest = completed.length > 0 ? completed[completed.length - 1] : undefined;
  if (!latest) return 'record';

  const position = TEACH_STEPS.indexOf(latest);
  const isUnknownOrFinal = position < 0 || position >= TEACH_STEPS.length - 1;
  return isUnknownOrFinal ? 'record' : (TEACH_STEPS[position + 1] as TeachStep);
}
|
|
164
|
+
|
|
165
|
+
/**
 * List the workflows under the site's local directory that finished emitting.
 *
 * A directory entry counts when it is a directory, is not `sessions`,
 * `_shared` or dot-prefixed, and contains an `index.ts`. Entries that cannot
 * be stat'ed are skipped. Returns `[]` when the site directory is missing.
 */
export function discoverCompletedWorkflows(site: string): string[] {
  const siteDir = localSiteDir(site);
  if (!existsSync(siteDir)) return [];

  const isReserved = (name: string): boolean =>
    name === 'sessions' || name === '_shared' || name.startsWith('.');

  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  return readdirSync(siteDir).filter((entry) => {
    if (isReserved(entry)) return false;
    const dir = pathResolve(siteDir, entry);
    return isDirectory(dir) && existsSync(pathJoin(dir, 'index.ts'));
  });
}
|
|
186
|
+
|
|
187
|
+
/**
 * Find the newest recorded session in the site's sessions directory that no
 * workflow in `state` refers to, and describe it as a fresh checkpoint.
 *
 * Only raw `*.json` recordings are considered (names containing `.redacted`
 * or `.triaged` are ignored). "Newest" means first in descending file-name
 * order. `redact` is marked complete when a sibling `.redacted.json` exists.
 *
 * @returns A new WorkflowState for the orphan, or `null` when there is none.
 */
export function discoverOrphanSession(site: string, state: TeachState): WorkflowState | null {
  const tracked = new Set<string>();
  for (const workflow of Object.values(state.workflows)) {
    tracked.add(workflow.sessionPath);
  }

  const sessDir = localSessionsDir(site);
  if (!existsSync(sessDir)) return null;

  const recordings = readdirSync(sessDir)
    .filter(
      (name) => name.endsWith('.json') && !(name.includes('.redacted') || name.includes('.triaged')),
    )
    .sort((a, b) => b.localeCompare(a));

  for (const name of recordings) {
    const absPath = pathJoin(sessDir, name);
    const relPath = toRelativeTeachStatePath(site, absPath);
    // State may hold either the stored (relative) form or an absolute path.
    const alreadyTracked = tracked.has(relPath) || tracked.has(absPath);
    if (alreadyTracked) continue;

    const redactedPath = absPath.replace(/\.json$/, '.redacted.json');
    const hasRedacted = existsSync(redactedPath);
    const completedSteps: TeachStep[] = hasRedacted ? ['record', 'redact'] : ['record'];

    return {
      sessionPath: relPath,
      redactedPath: hasRedacted ? toRelativeTeachStatePath(site, redactedPath) : undefined,
      completedSteps,
      startedAt: new Date().toISOString(),
      updatedAt: new Date().toISOString(),
    };
  }
  return null;
}
|
|
221
|
+
|
|
222
|
+
/**
 * Type guard: true only when `path` is a non-empty string naming an existing
 * regular file. Any `statSync` failure (missing entry, permission error, …)
 * is reported as `false`.
 */
export function isExistingTeachFile(path: string | null | undefined): path is string {
  if (path === null || path === undefined || path === '') return false;
  let stats;
  try {
    stats = statSync(path);
  } catch {
    return false;
  }
  return stats.isFile();
}
|
|
230
|
+
|
|
231
|
+
/**
 * Render a session path as `YYYY-MM-DD HH:MM` using the first
 * `YYYY-MM-DDTHH-MM` stamp found anywhere in the path. Falls back to the
 * path's basename when no such stamp is present.
 */
export function friendlySessionTimestamp(sessionPath: string): string {
  const stamp = /(\d{4}-\d{2}-\d{2})T(\d{2})-(\d{2})/.exec(sessionPath);
  if (stamp === null) return pathBasename(sessionPath);
  const [, day, hour, minute] = stamp;
  return `${day} ${hour}:${minute}`;
}
|