@oh-my-pi/pi-coding-agent 14.5.11 → 14.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,554 @@
1
+ import * as fs from "node:fs";
2
+ import * as os from "node:os";
3
+ import * as path from "node:path";
4
+ import { $which, getPuppeteerDir, logger } from "@oh-my-pi/pi-utils";
5
+ import * as browsers from "@puppeteer/browsers";
6
+ import type { Browser, CDPSession, Page, default as Puppeteer } from "puppeteer-core";
7
+ import { PUPPETEER_REVISIONS } from "puppeteer-core/internal/revisions.js";
8
+ import stealthTamperingScript from "../puppeteer/00_stealth_tampering.txt" with { type: "text" };
9
+ import stealthActivityScript from "../puppeteer/01_stealth_activity.txt" with { type: "text" };
10
+ import stealthHairlineScript from "../puppeteer/02_stealth_hairline.txt" with { type: "text" };
11
+ import stealthBotdScript from "../puppeteer/03_stealth_botd.txt" with { type: "text" };
12
+ import stealthIframeScript from "../puppeteer/04_stealth_iframe.txt" with { type: "text" };
13
+ import stealthWebglScript from "../puppeteer/05_stealth_webgl.txt" with { type: "text" };
14
+ import stealthScreenScript from "../puppeteer/06_stealth_screen.txt" with { type: "text" };
15
+ import stealthFontsScript from "../puppeteer/07_stealth_fonts.txt" with { type: "text" };
16
+ import stealthAudioScript from "../puppeteer/08_stealth_audio.txt" with { type: "text" };
17
+ import stealthLocaleScript from "../puppeteer/09_stealth_locale.txt" with { type: "text" };
18
+ import stealthPluginsScript from "../puppeteer/10_stealth_plugins.txt" with { type: "text" };
19
+ import stealthHardwareScript from "../puppeteer/11_stealth_hardware.txt" with { type: "text" };
20
+ import stealthCodecsScript from "../puppeteer/12_stealth_codecs.txt" with { type: "text" };
21
+ import stealthWorkerScript from "../puppeteer/13_stealth_worker.txt" with { type: "text" };
22
+ import { ToolError } from "../tool-errors";
23
+
24
/** Viewport used when a caller supplies none; also the source of the fallback `deviceScaleFactor`. */
export const DEFAULT_VIEWPORT = { width: 1365, height: 768, deviceScaleFactor: 1.25 };
/** Chrome flags that `launchHeadlessBrowser` hands to `puppeteer.launch` as `ignoreDefaultArgs`. */
export const STEALTH_IGNORE_DEFAULT_ARGS = [
	"--disable-extensions",
	"--disable-default-apps",
	"--disable-component-extensions-with-background-pages",
];
/** `acceptLanguage` value placed in the user-agent override built by `resolveUserAgentOverride`. */
export const STEALTH_ACCEPT_LANGUAGE = "en-US,en";

/** Marker that `patchSourceUrl` strips from `Runtime.evaluate` / `Runtime.callFunctionOn` payloads. */
const PUPPETEER_SOURCE_URL_SUFFIX = "//# sourceURL=__puppeteer_evaluation_script__";
33
+
34
/**
 * Lazy-import puppeteer from a safe CWD so cosmiconfig doesn't choke
 * on malformed package.json files in the user's project tree.
 *
 * Dynamic import is required because puppeteer-core probes the cwd at module
 * load time; we must `process.chdir` to a safe scratch dir before loading and
 * restore cwd afterwards. A static import would run at module-init time before
 * cwd is safe.
 *
 * Concurrent callers share a single in-flight load. Without that, a second
 * call starting while the first is still inside the scratch dir would record
 * the scratch dir as the "previous" cwd and restore to it, leaving the process
 * stranded there. A failed load clears the shared promise so a later call can
 * retry.
 *
 * @returns The default export of `puppeteer-core`.
 */
let puppeteerModule: typeof Puppeteer | undefined;
let puppeteerLoadPromise: Promise<typeof Puppeteer> | undefined;
export async function loadPuppeteer(): Promise<typeof Puppeteer> {
	if (puppeteerModule) return puppeteerModule;
	if (!puppeteerLoadPromise) {
		puppeteerLoadPromise = (async () => {
			// Captured synchronously on the first call, i.e. before any chdir performed here.
			const prev = process.cwd();
			const safeDir = getPuppeteerDir();
			await Bun.write(path.join(safeDir, "package.json"), "{}");
			try {
				process.chdir(safeDir);
				puppeteerModule = (await import("puppeteer-core")).default;
				return puppeteerModule;
			} finally {
				process.chdir(prev);
			}
		})().catch((err: unknown) => {
			// Runs after the assignment above, so the reset is never overwritten.
			puppeteerLoadPromise = undefined;
			throw err;
		});
	}
	return puppeteerLoadPromise;
}
57
+
58
/**
 * Resolve the Chrome/Chromium binary to launch, downloading one on first use
 * via @puppeteer/browsers when nothing else is available.
 *
 * Resolution order:
 * 1. `PUPPETEER_EXECUTABLE_PATH` - explicit configuration wins over auto-detection.
 * 2. A system browser located by `resolveSystemChromium()`.
 * 3. The build pinned by `PUPPETEER_REVISIONS.chrome`, cached under
 *    ~/.omp/puppeteer (getPuppeteerDir) and downloaded when missing.
 *
 * Concurrent callers share one in-flight download; a failed attempt clears the
 * shared promise so the next call retries.
 *
 * @returns Path to the executable, or `undefined` when the browser platform
 *   cannot be detected (puppeteer's own default resolution is then relied on).
 * @throws ToolError when resolving or installing the pinned build fails.
 */
let chromiumExecutablePromise: Promise<string | undefined> | undefined;
export async function ensureChromiumExecutable(): Promise<string | undefined> {
	// An explicitly configured path must not be shadowed by whatever browser happens to be installed.
	const envPath = process.env.PUPPETEER_EXECUTABLE_PATH;
	if (envPath) return envPath;
	const sysChrome = resolveSystemChromium();
	if (sysChrome) return sysChrome;
	if (chromiumExecutablePromise) return chromiumExecutablePromise;

	chromiumExecutablePromise = (async () => {
		const platform = browsers.detectBrowserPlatform();
		if (!platform) {
			logger.warn("Could not detect browser platform; relying on puppeteer default resolution");
			return undefined;
		}
		const cacheDir = getPuppeteerDir();
		const buildId = await browsers.resolveBuildId(browsers.Browser.CHROME, platform, PUPPETEER_REVISIONS.chrome);
		const executablePath = browsers.computeExecutablePath({
			browser: browsers.Browser.CHROME,
			buildId,
			cacheDir,
			platform,
		});
		// Already installed by an earlier run: nothing to download.
		if (fs.existsSync(executablePath)) return executablePath;

		logger.warn("Downloading Chromium for puppeteer (first browser use)", {
			buildId,
			platform,
			cacheDir,
		});
		let lastReportedPercent = -1;
		await browsers.install({
			browser: browsers.Browser.CHROME,
			buildId,
			cacheDir,
			platform,
			downloadProgressCallback: (downloaded, total) => {
				if (total <= 0) return;
				const pct = Math.floor((downloaded / total) * 100);
				// Throttle progress logs to roughly every 10%, plus the final tick.
				if (pct >= lastReportedPercent + 10 || downloaded === total) {
					lastReportedPercent = pct;
					logger.debug(
						`Chromium download: ${pct}% (${Math.round(downloaded / 1_000_000)} / ${Math.round(total / 1_000_000)} MB)`,
					);
				}
			},
		});
		return executablePath;
	})().catch((err: unknown) => {
		chromiumExecutablePromise = undefined;
		// Rejections are not guaranteed to be Error instances; avoid printing "undefined".
		const reason = err instanceof Error ? err.message : String(err);
		throw new ToolError(
			`Failed to install Chromium for puppeteer: ${reason}. ` +
				"Set PUPPETEER_EXECUTABLE_PATH to use an existing Chrome/Chromium binary, or install one manually.",
		);
	});
	return chromiumExecutablePromise;
}
119
+
120
/** Memoized result of `resolveSystemChromium()`: `undefined` = not probed yet, `null` = probed and nothing found. */
let _resolvedChromium: string | null | undefined;
121
+
122
/**
 * Report whether `p` is a regular file that the current process may execute.
 *
 * Being a file is not enough: a non-executable file at a candidate path would
 * otherwise be selected and only fail later at launch time.
 *
 * @param p Path to probe (symlinks are followed by `statSync`).
 * @returns `true` for an executable regular file; `false` for directories,
 *   missing paths, non-executable files, or any filesystem error.
 */
function isExecutableFile(p: string): boolean {
	try {
		if (!fs.statSync(p).isFile()) return false;
		// X_OK has no effect on Windows (it degrades to an existence check), so .exe candidates still pass.
		fs.accessSync(p, fs.constants.X_OK);
		return true;
	} catch {
		return false;
	}
}
130
+
131
/**
 * Build the ordered list of paths at which a Chrome-family browser may be
 * installed on the current platform. Entries are not checked for existence
 * and may contain duplicates; unsupported platforms yield an empty list.
 */
function systemChromiumCandidates(): string[] {
	const homeDir = os.homedir();

	if (process.platform === "darwin") {
		const bundleBinaries = [
			"Google Chrome.app/Contents/MacOS/Google Chrome",
			"Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
			"Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
			"Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
			"Chromium.app/Contents/MacOS/Chromium",
			"Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
		];
		// System-wide Applications first, then the per-user folder.
		return ["/Applications", path.join(homeDir, "Applications")].flatMap(appRoot =>
			bundleBinaries.map(binary => path.join(appRoot, binary)),
		);
	}

	if (process.platform === "linux") {
		// Whatever is on PATH comes before the well-known absolute locations.
		const onPath = ["google-chrome-stable", "google-chrome", "chromium", "chromium-browser", "chrome"].flatMap(
			binaryName => {
				const located = $which(binaryName);
				return located ? [located] : [];
			},
		);
		const wellKnown = [
			"/usr/bin/google-chrome-stable",
			"/usr/bin/google-chrome",
			"/usr/bin/chromium",
			"/usr/bin/chromium-browser",
			"/snap/bin/chromium",
			"/var/lib/flatpak/exports/bin/com.google.Chrome",
			"/var/lib/flatpak/exports/bin/org.chromium.Chromium",
		];
		let nixosDetected = false;
		try {
			nixosDetected = fs.existsSync("/etc/NIXOS");
		} catch {}
		const nixPaths = nixosDetected
			? [path.join(homeDir, ".nix-profile/bin/chromium"), "/run/current-system/sw/bin/chromium"]
			: [];
		return [...onPath, ...wellKnown, ...nixPaths];
	}

	if (process.platform === "win32") {
		const pf = process.env.ProgramFiles ?? "C:\\Program Files";
		const pfX86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
		const localData = process.env.LOCALAPPDATA ?? path.join(homeDir, "AppData\\Local");
		return [
			path.join(pf, "Google\\Chrome\\Application\\chrome.exe"),
			path.join(pfX86, "Google\\Chrome\\Application\\chrome.exe"),
			path.join(localData, "Google\\Chrome\\Application\\chrome.exe"),
			path.join(pf, "Chromium\\Application\\chrome.exe"),
			path.join(localData, "Chromium\\Application\\chrome.exe"),
			path.join(pf, "Microsoft\\Edge\\Application\\msedge.exe"),
			path.join(pfX86, "Microsoft\\Edge\\Application\\msedge.exe"),
		];
	}

	return [];
}
190
+
191
/**
 * Find the first usable system Chrome/Chromium binary among the platform
 * candidates. The outcome (hit or miss) is memoized in `_resolvedChromium`,
 * so the filesystem is probed at most once per process.
 *
 * @returns Path of the first candidate accepted by `isExecutableFile`, or
 *   `undefined` when none qualifies.
 */
export function resolveSystemChromium(): string | undefined {
	if (_resolvedChromium === null) return undefined;
	if (_resolvedChromium !== undefined) return _resolvedChromium;

	// A Set keeps first-seen order while dropping repeated paths.
	const uniqueCandidates = [...new Set(systemChromiumCandidates())];
	const match = uniqueCandidates.find(entry => Boolean(entry) && isExecutableFile(entry));
	if (match === undefined) {
		_resolvedChromium = null;
		return undefined;
	}

	_resolvedChromium = match;
	logger.debug("Using system Chrome/Chromium", { path: match });
	return match;
}
206
+
207
/** Options accepted by `launchHeadlessBrowser`. */
export interface LaunchHeadlessOptions {
	/** Forwarded to `puppeteer.launch` as `headless`; when false the launch uses `defaultViewport: null`. */
	headless: boolean;
	/** Initial viewport; `DEFAULT_VIEWPORT` is used when omitted, and supplies `deviceScaleFactor` when that field is missing. */
	viewport?: { width: number; height: number; deviceScaleFactor?: number };
}
211
+
212
/**
 * Launch a browser through puppeteer with the stealth launch arguments.
 *
 * Environment variables read:
 * - `PUPPETEER_PROXY`: adds `--proxy-server=<value>`.
 * - `PUPPETEER_PROXY_BYPASS_LOOPBACK`: when truthy (and a proxy is set) adds
 *   `--proxy-bypass-list=<-loopback>` so localhost traffic also goes through the proxy.
 * - `PUPPETEER_PROXY_IGNORE_CERT_ERRORS`: when truthy adds `--ignore-certificate-errors`.
 *
 * Truthy means one of `true`, `1`, `yes`, `on` (case-insensitive).
 *
 * @param opts Headless flag and optional initial viewport.
 * @returns The launched browser.
 */
export async function launchHeadlessBrowser(opts: LaunchHeadlessOptions): Promise<Browser> {
	const requested = opts.viewport ?? DEFAULT_VIEWPORT;
	const width = requested.width;
	const height = requested.height;
	const deviceScaleFactor = requested.deviceScaleFactor ?? DEFAULT_VIEWPORT.deviceScaleFactor;

	const pptr = await loadPuppeteer();

	const flagEnabled = (raw: string | undefined): boolean => {
		const lowered = raw?.toLowerCase();
		return lowered === "true" || lowered === "1" || lowered === "yes" || lowered === "on";
	};

	const chromeArgs = [
		"--no-sandbox",
		"--disable-setuid-sandbox",
		"--disable-blink-features=AutomationControlled",
		`--window-size=${width},${height}`,
	];

	const proxyServer = process.env.PUPPETEER_PROXY;
	if (proxyServer) {
		chromeArgs.push(`--proxy-server=${proxyServer}`);
		// Chrome (since v72) skips the proxy for localhost unless <-loopback> is in the bypass list.
		if (flagEnabled(process.env.PUPPETEER_PROXY_BYPASS_LOOPBACK)) {
			chromeArgs.push("--proxy-bypass-list=<-loopback>");
		}
	}
	if (flagEnabled(process.env.PUPPETEER_PROXY_IGNORE_CERT_ERRORS)) {
		chromeArgs.push("--ignore-certificate-errors");
	}

	const executablePath = await ensureChromiumExecutable();
	return await pptr.launch({
		headless: opts.headless,
		defaultViewport: opts.headless ? { width, height, deviceScaleFactor } : null,
		executablePath,
		args: chromeArgs,
		ignoreDefaultArgs: [...STEALTH_IGNORE_DEFAULT_ARGS],
	});
}
248
+
249
/**
 * Set the page viewport.
 *
 * @param page Page to resize.
 * @param viewport Desired size; when omitted `DEFAULT_VIEWPORT` is applied as-is,
 *   and a missing `deviceScaleFactor` falls back to the default's value.
 */
export async function applyViewport(
	page: Page,
	viewport?: { width: number; height: number; deviceScaleFactor?: number },
): Promise<void> {
	const resolved = viewport
		? {
				width: viewport.width,
				height: viewport.height,
				deviceScaleFactor: viewport.deviceScaleFactor ?? DEFAULT_VIEWPORT.deviceScaleFactor,
			}
		: DEFAULT_VIEWPORT;
	await page.setViewport(resolved);
}
263
+
264
+ // =====================================================================
265
+ // Stealth patches
266
+ // =====================================================================
267
+
268
/** Minimal CDP client surface this module relies on: a single `send(method, params)` call. */
interface PuppeteerCdpClient {
	/** Send one protocol command with optional parameters and resolve with its raw result. */
	send: (method: string, params?: Record<string, unknown>) => Promise<unknown>;
}
271
+
272
/**
 * Payload sent as the parameters of both `Network.setUserAgentOverride` and
 * `Emulation.setUserAgentOverride`; built by `resolveUserAgentOverride`.
 */
export interface UserAgentOverride {
	/** Full user-agent string with `HeadlessChrome/` rewritten to `Chrome/`. */
	userAgent: string;
	/** Short platform token derived from the user agent: `MacIntel`, `Android`, `Linux` or `Win32`. */
	platform: string;
	/** Accept-Language value; populated from `STEALTH_ACCEPT_LANGUAGE`. */
	acceptLanguage: string;
	/** Structured metadata mirroring the values above. */
	userAgentMetadata: {
		/** Brand list: a GREASE-style placeholder brand plus `Chromium` and `Google Chrome`. */
		brands: Array<{ brand: string; version: string }>;
		/** Full browser version parsed from the user agent (`"0"` when unparseable). */
		fullVersion: string;
		/** Long platform name: `Mac OS X`, `Android`, `Linux` or `Windows`. */
		platform: string;
		/** OS version parsed from the user agent, or an empty string. */
		platformVersion: string;
		/** `x86` for non-Android user agents, empty for Android. */
		architecture: string;
		/** Device model for Android user agents, otherwise empty. */
		model: string;
		/** True when the user agent is an Android one. */
		mobile: boolean;
	};
}
286
+
287
/**
 * Reach the page's underlying CDP client through its private `_client` member,
 * which may be either the client object itself or a method returning it.
 *
 * @returns The client, or `null` when the page exposes no `_client`.
 */
function resolvePageClient(page: Page): PuppeteerCdpClient | null {
	type ClientMember = (() => PuppeteerCdpClient) | PuppeteerCdpClient;
	const host = page as Page & { _client?: ClientMember };
	if (!host._client) return null;
	if (typeof host._client === "function") {
		// Invoke as a method so `this` stays bound to the page.
		return host._client();
	}
	return host._client;
}
294
+
295
/** CDP clients whose `send` has already been wrapped by `patchSourceUrl`. */
const patchedClients = new WeakSet<object>();

/**
 * Wrap the page's CDP `send` so that:
 * - the puppeteer `sourceURL` marker is removed from `Runtime.evaluate`
 *   expressions and `Runtime.callFunctionOn` function declarations, and
 * - a `Network.getResponseBody` "No resource with given identifier found"
 *   protocol error resolves to `undefined` instead of rejecting.
 *
 * Each client is wrapped at most once; pages without a client are left untouched.
 */
function patchSourceUrl(page: Page): void {
	const cdp = resolvePageClient(page);
	if (!cdp || patchedClients.has(cdp)) return;
	patchedClients.add(cdp);

	const rawSend = cdp.send.bind(cdp);
	const missingBodyError = "Protocol error (Network.getResponseBody): No resource with given identifier found";
	// Which parameter carries script source for each method that needs scrubbing.
	const sourceFieldByMethod = new Map<string, string>([
		["Runtime.evaluate", "expression"],
		["Runtime.callFunctionOn", "functionDeclaration"],
	]);

	const forward = async (method: string, payload?: Record<string, unknown>) => {
		try {
			return await rawSend(method, payload);
		} catch (failure) {
			if (failure instanceof Error && failure.message.includes(missingBodyError)) {
				return undefined;
			}
			throw failure;
		}
	};

	cdp.send = async (method: string, params?: Record<string, unknown>) => {
		const field = method && params ? sourceFieldByMethod.get(method) : undefined;
		const source = field && params ? params[field] : undefined;
		if (!field || !params || typeof source !== "string" || !source.includes(PUPPETEER_SOURCE_URL_SUFFIX)) {
			return forward(method, params);
		}
		return forward(method, { ...params, [field]: source.replace(PUPPETEER_SOURCE_URL_SUFFIX, "") });
	};
}
340
+
341
/**
 * Derive a non-headless-looking user-agent override from the browser's real
 * user agent.
 *
 * - `HeadlessChrome/` is rewritten to `Chrome/`.
 * - A desktop Linux user agent has its first parenthesised platform group
 *   replaced with a Windows 10 x64 one.
 * - Platform, version, architecture and model fields are parsed back out of
 *   the resulting string.
 * - The brand list is ordered by a permutation selected from the Chrome major
 *   version and contains a placeholder "Not A Brand" style entry.
 *
 * @param page Any page of the browser whose user agent should be mirrored.
 * @returns The override payload for the CDP user-agent commands.
 */
async function resolveUserAgentOverride(page: Page): Promise<UserAgentOverride> {
	const rawUserAgent = await page.browser().userAgent();
	let userAgent = rawUserAgent.replace("HeadlessChrome/", "Chrome/");
	if (userAgent.includes("Linux") && !userAgent.includes("Android")) {
		userAgent = userAgent.replace(/\(([^)]+)\)/, "(Windows NT 10.0; Win64; x64)");
	}

	// Version character classes are digits and dots only; a `|` inside [] would be a literal pipe.
	const uaVersionMatch = userAgent.match(/Chrome\/([\d.]+)/);
	const versionMatch = uaVersionMatch ?? (await page.browser().version()).match(/\/([\d.]+)/);
	const uaVersion = versionMatch?.[1] ?? "0";
	const majorVersion = Number.parseInt(uaVersion.split(".")[0] ?? "0", 10) || 0;

	const isMac = userAgent.includes("Mac OS X");
	const isAndroid = userAgent.includes("Android");
	const isLinux = userAgent.includes("Linux");

	const platform = isMac ? "MacIntel" : isAndroid ? "Android" : isLinux ? "Linux" : "Win32";
	const platformFull = isMac ? "Mac OS X" : isAndroid ? "Android" : isLinux ? "Linux" : "Windows";
	// The trailing space in each probe requires a version token to follow the OS name.
	const platformVersion = userAgent.includes("Mac OS X ")
		? (userAgent.match(/Mac OS X ([^)]+)/)?.[1] ?? "")
		: userAgent.includes("Android ")
			? (userAgent.match(/Android ([^;]+)/)?.[1] ?? "")
			: userAgent.includes("Windows ")
				? (userAgent.match(/Windows .*?([\d.]+);?/)?.[1] ?? "")
				: "";
	const architecture = isAndroid ? "" : "x86";
	const model = isAndroid ? (userAgent.match(/Android.*?;\s([^)]+)/)?.[1] ?? "") : "";

	// All six permutations of three slots; the major version picks one.
	const brandOrders = [
		[0, 1, 2],
		[0, 2, 1],
		[1, 0, 2],
		[1, 2, 0],
		[2, 0, 1],
		[2, 1, 0],
	];
	const order = brandOrders[majorVersion % brandOrders.length] ?? brandOrders[0]!;
	const escapedChars = [" ", " ", ";"];
	const greaseyBrand = `${escapedChars[order[0]!]}Not${escapedChars[order[1]!]}A${escapedChars[order[2]!]}Brand`;
	// `order` is a permutation of 0..2, so all three slots get filled.
	const brands: { brand: string; version: string }[] = [];
	brands[order[0]!] = { brand: greaseyBrand, version: "99" };
	brands[order[1]!] = { brand: "Chromium", version: String(majorVersion) };
	brands[order[2]!] = { brand: "Google Chrome", version: String(majorVersion) };

	return {
		userAgent,
		platform,
		acceptLanguage: STEALTH_ACCEPT_LANGUAGE,
		userAgentMetadata: {
			brands,
			fullVersion: uaVersion,
			platform: platformFull,
			platformVersion,
			architecture,
			model,
			mobile: isAndroid,
		},
	};
}
408
+
409
/** Adapt a puppeteer `CDPSession` to the loosely typed `PuppeteerCdpClient` shape. */
function wrapSession(session: CDPSession): PuppeteerCdpClient {
	const send: PuppeteerCdpClient["send"] = async (method, params) =>
		session.send(method as never, params as never);
	return { send };
}
414
+
415
/**
 * Push the user-agent override through one CDP client, best-effort.
 *
 * `Network.enable` is attempted first and its failure is ignored. The override
 * is then sent via both the Network and the Emulation domain; a failure of
 * either is logged at debug level and never thrown.
 */
async function sendUserAgentOverride(client: PuppeteerCdpClient, override: UserAgentOverride): Promise<void> {
	const payload = override as unknown as Record<string, unknown>;
	const describe = (failure: unknown): string => (failure instanceof Error ? failure.message : String(failure));

	try {
		await client.send("Network.enable");
	} catch {}

	const attempts = [
		{ method: "Network.setUserAgentOverride", failureLog: "Failed to apply Network user agent override" },
		{ method: "Emulation.setUserAgentOverride", failureLog: "Failed to apply Emulation user agent override" },
	];
	for (const { method, failureLog } of attempts) {
		try {
			await client.send(method, payload);
		} catch (failure) {
			logger.debug(failureLog, { error: describe(failure) });
		}
	}
}
434
+
435
/**
 * A resolved user-agent override paired with the browser-level CDP session.
 * NOTE(review): not referenced elsewhere in this file; presumably held by
 * callers of `applyStealthPatches` - confirm.
 */
export interface UserAgentSession {
	/** The override payload to apply to targets. */
	override: UserAgentOverride;
	/** Browser-level CDP session, or `null` when none has been created. */
	browserSession: CDPSession | null;
}
439
+
440
/**
 * Configure UA override on the browser + auto-attach to new targets.
 *
 * On first use (no `state.browserSession` yet) a browser-level CDP session is
 * created, auto-attach is enabled, and every subsequently attached target has
 * the override applied. The override is then applied to every target that
 * currently exists.
 *
 * Applying to existing targets is best-effort per target: a target that cannot
 * be attached (for example one that closed in the meantime) is logged at debug
 * level and skipped rather than failing the whole setup.
 *
 * @param browser Browser whose targets should receive the override.
 * @param state Mutable holder; `browserSession` is filled in when created here.
 */
async function configureUserAgentTargets(
	browser: Browser,
	state: { browserSession: CDPSession | null; override: UserAgentOverride },
): Promise<void> {
	if (!state.browserSession) {
		state.browserSession = await browser.target().createCDPSession();
		await state.browserSession.send("Target.setAutoAttach", {
			autoAttach: true,
			waitForDebuggerOnStart: false,
			flatten: true,
		});
		state.browserSession.on("Target.attachedToTarget", async (event: { sessionId: string }) => {
			const connection = state.browserSession?.connection();
			const session = connection?.session(event.sessionId);
			if (!session) return;
			await sendUserAgentOverride(wrapSession(session), state.override);
		});
	}

	const targets = browser.targets();
	await Promise.all(
		targets.map(async target => {
			try {
				const session = await target.createCDPSession();
				await sendUserAgentOverride(wrapSession(session), state.override);
			} catch (error) {
				// One unreachable target must not reject Promise.all for all the others.
				logger.debug("Failed to attach to target for user agent override", {
					error: error instanceof Error ? error.message : String(error),
				});
			}
		}),
	);
}
468
+
469
/**
 * Register the bundled stealth scripts on `page` via `evaluateOnNewDocument`.
 *
 * The scripts are concatenated, in the order listed below, into one
 * self-invoking function. That function first creates a hidden iframe and
 * captures a set of built-ins from the iframe's window into local constants,
 * then runs each script, then removes the iframe.
 * Presumably the stealth scripts reference those constants by name - verify
 * against the script files before renaming any of them.
 *
 * NOTE(review): when `iframe.contentWindow` is null the function returns early
 * and the iframe is not removed.
 */
async function injectStealthScripts(page: Page): Promise<void> {
	const scripts = [
		stealthTamperingScript,
		stealthActivityScript,
		stealthHairlineScript,
		stealthBotdScript,
		stealthIframeScript,
		stealthWebglScript,
		stealthScreenScript,
		stealthFontsScript,
		stealthAudioScript,
		stealthLocaleScript,
		stealthPluginsScript,
		stealthHardwareScript,
		stealthCodecsScript,
		stealthWorkerScript,
	];

	// Each script gets its own try/catch so one throwing does not stop the ones after it.
	const joint = scripts
		.map(
			script => `
try {
${script};
} catch (e) {}
`,
		)
		.join(";\n");

	// Everything inside the template literal below is page-side code; it is sent as a string.
	await page.evaluateOnNewDocument(`(() => {
// Native function cache - captured before any tampering
const iframe = document.createElement("iframe");
iframe.style.display = "none";
document.head.appendChild(iframe);
const nativeWindow = iframe.contentWindow;
if (!nativeWindow) return;

// Cache pristine native functions
const Function_toString = nativeWindow.Function.prototype.toString;
const Object_getOwnPropertyDescriptor = nativeWindow.Object.getOwnPropertyDescriptor;
const Object_getOwnPropertyDescriptors = nativeWindow.Object.getOwnPropertyDescriptors;
const Object_getPrototypeOf = nativeWindow.Object.getPrototypeOf;
const Object_defineProperty = nativeWindow.Object.defineProperty;
const Object_getOwnPropertyDescriptorOriginal = nativeWindow.Object.getOwnPropertyDescriptor;
const Object_create = nativeWindow.Object.create;
const Object_keys = nativeWindow.Object.keys;
const Object_getOwnPropertyNames = nativeWindow.Object.getOwnPropertyNames;
const Object_entries = nativeWindow.Object.entries;
const Object_setPrototypeOf = nativeWindow.Object.setPrototypeOf;
const Object_assign = nativeWindow.Object.assign;
const Window_setTimeout = nativeWindow.setTimeout;
const Math_random = nativeWindow.Math.random;
const Math_floor = nativeWindow.Math.floor;
const Math_max = nativeWindow.Math.max;
const Math_min = nativeWindow.Math.min;
const Window_Event = nativeWindow.Event;
const Promise_resolve = nativeWindow.Promise.resolve.bind(nativeWindow.Promise);
const Window_Blob = nativeWindow.Blob;
const Window_Proxy = nativeWindow.Proxy;
const Intl_DateTimeFormat = nativeWindow.Intl.DateTimeFormat;
const Date_constructor = nativeWindow.Date;


${joint}

document.head.removeChild(iframe);})();`);
}
535
+
536
/** Pages on which the new-document stealth script has already been registered. */
const stealthInjectedPages = new WeakSet<Page>();

/**
 * Apply stealth patches + UA override to a headless page. Idempotent within a tab.
 *
 * Steps, in order: wrap the page's CDP `send`, resolve the UA override once
 * and cache it on `state`, push the override to the page's own client, apply
 * it to all browser targets (creating `state.browserSession` on first use),
 * and register the stealth script for every new document.
 *
 * The script registration is guarded per page: registering it again would make
 * the stealth scripts run more than once in each new document.
 *
 * @param browser Browser that owns `page`.
 * @param page Page to patch.
 * @param state Mutable cache shared across calls; both fields are filled in here when null.
 */
export async function applyStealthPatches(
	browser: Browser,
	page: Page,
	state: { browserSession: CDPSession | null; override: UserAgentOverride | null },
): Promise<void> {
	patchSourceUrl(page);
	if (!state.override) {
		state.override = await resolveUserAgentOverride(page);
	}
	const client = resolvePageClient(page);
	if (client) {
		await sendUserAgentOverride(client, state.override);
	}
	const targetState = { browserSession: state.browserSession, override: state.override };
	await configureUserAgentTargets(browser, targetState);
	state.browserSession = targetState.browserSession;

	if (stealthInjectedPages.has(page)) return;
	// Mark before awaiting so an overlapping call for the same page does not register twice.
	stealthInjectedPages.add(page);
	try {
		await injectStealthScripts(page);
	} catch (error) {
		// Registration failed: allow a later call to try again.
		stealthInjectedPages.delete(page);
		throw error;
	}
}
@@ -0,0 +1,90 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { parseHTML } from "linkedom";
3
+ import { htmlToBasicMarkdown } from "../../web/scrapers/types";
4
+
5
/** Output flavour requested from `extractReadableFromHtml`. */
export type ReadableFormat = "text" | "markdown";

/** Readable content extracted from a page, plus the metadata found with it. */
export interface ReadableResult {
	/** URL passed in by the caller, returned unchanged. */
	url: string;
	/** Trimmed title, omitted when empty. */
	title?: string;
	/** Trimmed byline, omitted when empty. */
	byline?: string;
	/** Trimmed excerpt, omitted when empty. */
	excerpt?: string;
	/** Length reported by the extractor; otherwise the trimmed text length, then the markdown length, then 0. */
	contentLength: number;
	/** Plain text; set only when the requested format is `"text"`. */
	text?: string;
	/** Markdown; set only when the requested format is `"markdown"`. */
	markdown?: string;
}
16
+
17
/** Strip surrounding whitespace; yield `undefined` for null, undefined, or whitespace-only input. */
function normalize(text: string | null | undefined): string | undefined {
	if (text == null) return undefined;
	const stripped = text.trim();
	return stripped === "" ? undefined : stripped;
}
22
+
23
/**
 * Pull the readable part out of an HTML string.
 *
 * Readability is tried first. If it yields nothing usable, a fixed list of
 * likely content containers (ending with `<body>`) is walked on the same
 * parsed document and the first one with both markup and text wins.
 *
 * NOTE(review): Readability's `parse()` is documented to modify the DOM it is
 * given, so the fallback may see an altered tree - confirm this is intended.
 *
 * @param html Raw page markup.
 * @param url Page URL, echoed into the result.
 * @param format Whether the result carries `text` or `markdown`.
 * @returns The extracted content, or `null` when neither path produces any.
 */
export function extractReadableFromHtml(html: string, url: string, format: ReadableFormat): ReadableResult | null {
	const { document } = parseHTML(html);

	// Primary path: Readability article extraction.
	const parsed = new Readability(document).parse();
	const primary = parsed
		? toReadableResult(url, format, parsed.textContent, parsed.content, {
				title: parsed.title,
				byline: parsed.byline,
				excerpt: parsed.excerpt,
				length: parsed.length,
			})
		: null;
	if (primary) return primary;

	// Fallback path: most specific container first, whole body last.
	const containerSelectors = ["[data-pagefind-body]", "main article", "article", "main", "[role='main']"];
	const fallbackRoots = [...containerSelectors.map(selector => document.querySelector(selector)), document.body];
	for (const root of fallbackRoots) {
		const markup = root?.innerHTML?.trim();
		const plain = root?.textContent?.trim();
		if (!markup || !plain) continue;
		const fallback = toReadableResult(url, format, plain, markup, {
			title: document.title,
			excerpt: plain.slice(0, 240),
			length: plain.length,
		});
		if (fallback) return fallback;
	}

	return null;
}
68
+
69
/**
 * Assemble a `ReadableResult` from extracted text/HTML and metadata.
 *
 * For `"markdown"` the HTML is converted and, if that comes out empty, the
 * trimmed plain text is used instead. For `"text"` only the trimmed plain text
 * is returned. Yields `null` when the requested format ends up with no content.
 */
function toReadableResult(
	url: string,
	format: ReadableFormat,
	textContent: string | null | undefined,
	htmlContent: string | null | undefined,
	meta: { title?: string | null; byline?: string | null; excerpt?: string | null; length?: number | null },
): ReadableResult | null {
	const plainText = normalize(textContent);

	let text: string | undefined;
	let markdown: string | undefined;
	if (format === "markdown") {
		markdown = normalize(htmlToBasicMarkdown(htmlContent ?? "")) ?? plainText;
	}
	if (format === "text") {
		text = plainText;
	}
	if (!text && !markdown) return null;

	// Length prefers the extractor's figure, then the plain text, then the markdown.
	const contentLength = meta.length ?? plainText?.length ?? markdown?.length ?? 0;
	return {
		url,
		title: normalize(meta.title),
		byline: normalize(meta.byline),
		excerpt: normalize(meta.excerpt),
		contentLength,
		text,
		markdown,
	};
}
90
+ }