@apmantza/greedysearch-pi 1.9.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,645 +1,837 @@
1
- // extractors/common.mjs — shared utilities for CDP-based extractors
2
- // Extracts common patterns: cdp wrapper, tab management, clipboard interception, source parsing
3
-
4
- import { randomInt } from "node:crypto";
5
- import { spawn } from "node:child_process";
6
- import { dirname, join } from "node:path";
7
- import { fileURLToPath } from "node:url";
8
-
9
- const __dir = dirname(fileURLToPath(import.meta.url));
10
- const CDP = join(__dir, "..", "bin", "cdp.mjs");
11
-
12
- // ============================================================================
13
- // CDP wrapper
14
- // ============================================================================
15
-
16
/**
 * Run one command through the cdp.mjs CLI in a child Node process.
 *
 * The child is started with the current Node binary (`process.execPath`) and
 * its stdout/stderr are buffered in full before the promise settles.
 *
 * @param {string[]} args - Arguments forwarded to cdp.mjs; args[0] names the command
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many ms
 * @returns {Promise<string>} Trimmed stdout when the child exits with code 0
 * @throws {Error} Rejects on timeout (`cdp timeout: <command>`), when the child
 *   cannot be spawned, or on a non-zero exit (trimmed stderr, or
 *   `cdp exit <code>` when stderr is empty)
 */
export function cdp(args, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn(process.execPath, [CDP, ...args], {
      stdio: ["ignore", "pipe", "pipe"],
    });
    let out = "";
    let err = "";
    proc.stdout.on("data", (d) => (out += d));
    proc.stderr.on("data", (d) => (err += d));
    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);
    // A ChildProcess without an "error" listener turns a failed spawn into an
    // uncaught exception; surface it as a rejection instead.
    proc.on("error", (cause) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${cause.message}`, { cause }));
    });
    proc.on("close", (code) => {
      clearTimeout(timer);
      // After a timeout the promise is already rejected; settling again is a no-op.
      if (code === 0) resolve(out.trim());
      else reject(new Error(err.trim() || `cdp exit ${code}`));
    });
  });
}
42
-
43
- // ============================================================================
44
- // Tab management
45
- // ============================================================================
46
-
47
/**
 * Return the caller's tab prefix, or open a fresh about:blank tab.
 *
 * A new tab is created through `Target.createTarget`, issued against the
 * first tab reported by `cdp list`. The first 8 characters of the new target
 * id are used as the tab identifier.
 *
 * @param {string|null} tabPrefix - Existing tab prefix, or null to create a new tab
 * @returns {Promise<string>} Tab identifier
 * @throws {Error} When `cdp list` reports no tabs, or a CDP call fails
 */
export async function getOrOpenTab(tabPrefix) {
  if (tabPrefix) return tabPrefix;
  // Always open a fresh tab to avoid SPA navigation issues
  const list = await cdp(["list"]);
  const anchor = list.split("\n")[0]?.slice(0, 8);
  if (!anchor)
    throw new Error(
      "No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?",
    );
  const raw = await cdp([
    "evalraw",
    anchor,
    "Target.createTarget",
    '{"url":"about:blank"}',
  ]);
  const { targetId } = JSON.parse(raw);
  await cdp(["list"]); // refresh cache
  const tid = targetId.slice(0, 8);
  // Register the stealth script BEFORE handing the tab back. Firing it without
  // awaiting lets the caller's first navigation race the registration, so the
  // first document may load without the patches. Injection stays best-effort:
  // a failure is reported on stderr but does not prevent using the tab.
  try {
    await injectHeadlessStealth(tid);
  } catch (e) {
    process.stderr.write(
      `[getOrOpenTab] stealth injection failed: ${e.message}\n`,
    );
  }
  return tid;
}
74
-
75
- // ============================================================================
76
- // Clipboard interception (for extractors that use copy-to-clipboard)
77
- // ============================================================================
78
-
79
/**
 * Inject a clipboard interceptor that records text the page copies.
 *
 * `navigator.clipboard.writeText` and `navigator.clipboard.write` are wrapped
 * so the copied text is stored in `window[globalVar]` before the original
 * method is called. Each engine passes its own variable name to avoid clashes.
 *
 * The injected code runs inside an IIFE so its `const` bindings never land in
 * the page's global scope; injecting twice into the same document therefore
 * cannot fail with "Identifier has already been declared".
 *
 * @param {string} tab - Tab identifier
 * @param {string} globalVar - Global variable name (e.g., '__pplxClipboard', '__geminiClipboard')
 * @returns {Promise<void>}
 */
export async function injectClipboardInterceptor(tab, globalVar) {
  const code = `
    (() => {
      window.${globalVar} = null;
      const clipboard = navigator.clipboard;
      // navigator.clipboard is undefined on insecure origins; nothing to patch.
      if (!clipboard) return;
      const origWriteText = typeof clipboard.writeText === 'function'
        ? clipboard.writeText.bind(clipboard)
        : null;
      const origWrite = typeof clipboard.write === 'function'
        ? clipboard.write.bind(clipboard)
        : null;

      clipboard.writeText = function(text) {
        window.${globalVar} = text;
        if (!origWriteText) return Promise.resolve();
        // The real clipboard write may be denied (unfocused or automated tab).
        // The text is already captured, so do not let the page see a failure.
        return Promise.resolve(origWriteText(text)).catch(() => undefined);
      };

      clipboard.write = async function(items) {
        try {
          for (const item of items || []) {
            if (item.types && item.types.includes('text/plain')) {
              const blob = await item.getType('text/plain');
              window.${globalVar} = await blob.text();
              break;
            }
          }
        } catch(e) {}
        if (!origWrite) return undefined;
        try { return await origWrite(items); }
        catch (_) { return undefined; }
      };
    })();
  `;
  await cdp(["eval", tab, code]);
}
109
-
110
- // ============================================================================
111
- // Headless stealth injection
112
- // ============================================================================
113
-
114
/**
 * Build the anti-detection script that is registered on new documents.
 *
 * The script is written with String.raw on purpose: in an ordinary template
 * literal the escapes `\/` and `\d` inside the embedded regular expressions
 * are consumed by the template itself, leaving `/Chrome/(d+)/` — a syntax
 * error that prevents the whole script from running.
 *
 * @returns {string} JavaScript source of a self-invoking function
 */
function buildStealthScript() {
  return String.raw`
    (function() {
      // ── Runtime.enable / CDP detection masking ──────────────
      try { delete window.__REBROWSER_RUNTIME_ENABLE; } catch(_) {}
      try { delete window.__REBROWSER_DEVTOOLS; } catch(_) {}
      try { delete window.__nightmare; } catch(_) {}
      try { delete window.__phantom; } catch(_) {}
      try { delete window.callPhantom; } catch(_) {}
      try { delete window._phantom; } catch(_) {}
      try { delete window.Buffer; } catch(_) {}

      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      Object.defineProperty(navigator, 'plugins', {
        get: () => {
          var p = [
            { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
            { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
            { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
          ];
          p.length = 3;
          return p;
        },
      });
      Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
      if (!window.chrome) {
        window.chrome = {
          runtime: { connect: () => {}, sendMessage: () => {}, onMessage: { addListener: () => {} } },
          loadTimes: () => ({}),
          csi: () => ({}),
        };
      }
      var perms = navigator.permissions;
      var origQuery = perms && perms.query;
      if (origQuery) {
        perms.query = function(params) {
          if (params && params.name === 'notifications') return Promise.resolve({ state: Notification.permission });
          // Permissions.query must be invoked with its Permissions receiver;
          // a bare call throws "Illegal invocation".
          return origQuery.call(perms, params);
        };
      }
      try {
        var getParam = WebGLRenderingContext.prototype.getParameter;
        WebGLRenderingContext.prototype.getParameter = function(p) {
          if (p === 37445) return 'Intel Inc.';
          if (p === 37446) return 'Intel Iris OpenGL Engine';
          return getParam.call(this, p);
        };
      } catch(_) {}
      Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
      Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });

      // ── Canvas fingerprint noise ─────────────────────────
      // Headless rendering engines produce slightly different canvas output
      // than headed Chrome. Subtle noise breaks hash-based fingerprinting.
      try {
        var origFill = CanvasRenderingContext2D.prototype.fillText;
        CanvasRenderingContext2D.prototype.fillText = function() {
          this.globalAlpha = 1 - (Math.random() * 0.001);
          return origFill.apply(this, arguments);
        };
      } catch(_) {}
      try {
        var origStroke = CanvasRenderingContext2D.prototype.strokeText;
        CanvasRenderingContext2D.prototype.strokeText = function() {
          this.globalAlpha = 1 - (Math.random() * 0.001);
          return origStroke.apply(this, arguments);
        };
      } catch(_) {}
      try {
        var origToDataURL = HTMLCanvasElement.prototype.toDataURL;
        HTMLCanvasElement.prototype.toDataURL = function() {
          // The noise step is best-effort: it must never make toDataURL fail
          // in a way the unpatched method would not.
          try {
            var ctx = this.getContext('2d');
            if (ctx) {
              // Add 1px noise pixel in corner (invisible but changes hash)
              var imgData = ctx.getImageData(0, 0, 1, 1);
              if (imgData) {
                imgData.data[0] ^= (Math.random() < 0.5 ? 1 : 0);
                ctx.putImageData(imgData, 0, 0);
              }
            }
          } catch(_) {}
          return origToDataURL.apply(this, arguments);
        };
      } catch(_) {}

      // ── window outer dimensions ──────────────────────────
      // outerWidth/Height = 0 in headless — a well-known bot signal.
      // Mirror innerWidth/Height (set by --window-size flag) so the ratio is sane.
      try {
        if (!window.outerWidth) Object.defineProperty(window, 'outerWidth', { get: () => window.innerWidth || 1920, configurable: true });
        if (!window.outerHeight) Object.defineProperty(window, 'outerHeight', { get: () => window.innerHeight || 1080, configurable: true });
      } catch(_) {}

      // ── screen properties ─────────────────────────────────
      try {
        if (!screen.colorDepth) Object.defineProperty(screen, 'colorDepth', { get: () => 24, configurable: true });
        if (!screen.pixelDepth) Object.defineProperty(screen, 'pixelDepth', { get: () => 24, configurable: true });
      } catch(_) {}

      // ── navigator.userAgentData (UA Client Hints) ─────────
      // Derive version from the UA string already set by --user-agent flag so the
      // two APIs are always consistent. Removes any "HeadlessChrome" brand entry.
      try {
        var _uaMajor = (navigator.userAgent.match(/Chrome\/(\d+)/) || [])[1] || '136';
        var _uaFull = (navigator.userAgent.match(/Chrome\/([\d.]+)/) || [])[1] || (_uaMajor + '.0.0.0');
        var _brands = [
          { brand: 'Not)A;Brand', version: '99' },
          { brand: 'Google Chrome', version: _uaMajor },
          { brand: 'Chromium', version: _uaMajor },
        ];
        Object.defineProperty(navigator, 'userAgentData', {
          get: function() {
            return {
              brands: _brands, mobile: false, platform: 'Windows',
              getHighEntropyValues: function() {
                return Promise.resolve({
                  architecture: 'x86', bitness: '64',
                  brands: _brands,
                  fullVersionList: [
                    { brand: 'Not)A;Brand', version: '99.0.0.0' },
                    { brand: 'Google Chrome', version: _uaFull },
                    { brand: 'Chromium', version: _uaFull },
                  ],
                  mobile: false, model: '', platform: 'Windows',
                  platformVersion: '15.0.0', uaFullVersion: _uaFull, wow64: false,
                });
              },
              toJSON: function() { return { brands: _brands, mobile: false, platform: 'Windows' }; },
            };
          },
          configurable: true,
        });
      } catch(_) {}

      // ── CDP Runtime serialization guard ──────────────────
      // Sites detect CDP by putting a getter on Error.prototype.stack
      // and checking if console.log triggers it (only happens when
      // Runtime domain is enabled). We monkey-patch console methods to
      // strip custom getters from arguments before they reach CDP.
      try {
        var _origLog = console.log, _origError = console.error,
            _origWarn = console.warn, _origDebug = console.debug,
            _origInfo = console.info;
        var _safeArg = function(a) {
          if (a instanceof Error) {
            try { return new Error(a.message); } catch(_) { return a; }
          }
          return a;
        };
        console.log = function() { return _origLog.apply(console, Array.prototype.map.call(arguments, _safeArg)); };
        console.error = function() { return _origError.apply(console, Array.prototype.map.call(arguments, _safeArg)); };
        console.warn = function() { return _origWarn.apply(console, Array.prototype.map.call(arguments, _safeArg)); };
        console.debug = function() { return _origDebug.apply(console, Array.prototype.map.call(arguments, _safeArg)); };
        console.info = function() { return _origInfo.apply(console, Array.prototype.map.call(arguments, _safeArg)); };
      } catch(_) {}
    })();
  `;
}

/**
 * Register the anti-detection script on a tab so it runs in every new
 * document, via `Page.addScriptToEvaluateOnNewDocument`.
 * Based on production patterns from screenshotrun.com.
 *
 * @param {string} tab - Tab identifier
 * @returns {Promise<void>}
 * @throws {Error} When the underlying CDP command fails
 */
export async function injectHeadlessStealth(tab) {
  await cdp([
    "evalraw",
    tab,
    "Page.addScriptToEvaluateOnNewDocument",
    JSON.stringify({ source: buildStealthScript() }),
  ]);
}
277
-
278
- // ============================================================================
279
- // Source extraction from markdown
280
- // ============================================================================
281
-
282
/**
 * Walk backwards from the `]` of a `](` pair to its matching `[`.
 * Nested bracket pairs inside the link text are skipped.
 * @returns {number} Index of the opening bracket, or -1 if none at or after `floor`
 */
function findOpeningBracket(text, closeBracket, floor) {
  let depth = 0;
  for (let i = closeBracket; i >= floor; i--) {
    const ch = text[i];
    if (ch === "]") depth++;
    else if (ch === "[" && --depth === 0) return i;
  }
  return -1;
}

/**
 * Find the `)` that ends a link target starting at `from`.
 * @returns {number} Index of the closing paren, or -1 if whitespace or the
 *   end of the text is reached first (not a valid inline link)
 */
function findUrlEnd(text, from) {
  for (let p = from; p < text.length; p++) {
    const ch = text[p];
    if (ch === ")") return p;
    if (/\s/.test(ch)) return -1; // whitespace in URL = invalid markdown link
  }
  return -1;
}

/**
 * Parse Markdown links from text to extract sources.
 *
 * Only `[title](url)` links with a non-empty title and an http(s) URL are
 * kept. Results are de-duplicated by URL, in order of first appearance.
 * The title is the bracket pair that directly precedes `](`, so unrelated
 * brackets earlier in the text (e.g. `[1]` citations) are not swept into it.
 *
 * @param {string} text - Text containing Markdown links like [title](url)
 * @param {number} [maxSources=10] - Stop after collecting this many sources
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdown(text, maxSources = 10) {
  if (!text) return [];
  const results = [];
  const seenUrls = new Set();
  let floor = 0; // nothing before this index may be reused as link text
  while (floor < text.length && results.length < maxSources) {
    const closeBracket = text.indexOf("](", floor);
    if (closeBracket === -1) break;
    const openBracket = findOpeningBracket(text, closeBracket, floor);
    const openParen = closeBracket + 2;
    const closeParen = findUrlEnd(text, openParen);
    if (openBracket === -1 || closeParen === -1) {
      // Not a usable link; resume scanning just past this "](" pair.
      floor = openParen;
      continue;
    }
    const title = text.slice(openBracket + 1, closeBracket);
    const url = text.slice(openParen, closeParen);
    if (/^https?:\/\//i.test(url) && title && !seenUrls.has(url)) {
      seenUrls.add(url);
      results.push({ title, url });
    }
    floor = closeParen + 1;
  }
  return results;
}
323
-
324
- // ============================================================================
325
- // Timing constants
326
- // ============================================================================
327
-
328
/** Settle and polling delays, in milliseconds, shared by the extractors. */
export const TIMING = {
  // settle after navigation
  postNav: 800,
  // settle after slower navigations (Bing, Gemini)
  postNavSlow: 1200,
  // settle after a UI click
  postClick: 300,
  // settle after typing
  postType: 300,
  // polling interval when waiting for input to appear
  inputPoll: 400,
  // polling interval when waiting for copy button
  copyPoll: 600,
  // settle after a verification challenge completes
  afterVerify: 1500,
};
337
-
338
- // ============================================================================
339
- // Copy button polling
340
- // ============================================================================
341
-
342
/**
 * Wait for a copy button to appear in the DOM.
 *
 * Polls the page with one `cdp eval` per tick; each tick is preceded by a
 * jittered sleep based on TIMING.copyPoll. A failed eval counts as
 * "not found" and polling continues.
 *
 * @param {string} tab - Tab identifier
 * @param {string} selector - CSS selector for the copy button
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Max wait in ms
 * @param {Function} [options.onPoll] - Optional callback (sync or async) run on
 *   each poll tick with the 1-based tick number (e.g. to scroll); its errors
 *   are ignored
 * @returns {Promise<void>}
 * @throws {Error} When the selector does not match before the timeout
 */
export async function waitForCopyButton(tab, selector, options = {}) {
  const { timeout = 60000, onPoll } = options;
  // JSON.stringify yields a valid JS string literal, so selectors that contain
  // quotes (e.g. button[aria-label='Copy']) no longer break the expression.
  const probe = `!!document.querySelector(${JSON.stringify(selector)})`;
  const deadline = Date.now() + timeout;
  let tick = 0;
  while (Date.now() < deadline) {
    await new Promise((r) => setTimeout(r, jitter(TIMING.copyPoll)));
    if (onPoll) {
      tick += 1;
      try {
        // Works for sync and async callbacks alike; the hook is best-effort.
        await onPoll(tick);
      } catch {
        // ignored on purpose: a failing hook must not abort the wait
      }
    }
    const found = await cdp(["eval", tab, probe]).catch(() => "false");
    if (found === "true") return;
  }
  throw new Error(
    `Copy button ('${selector}') did not appear within ${timeout}ms`,
  );
}
369
-
370
- // ============================================================================
371
- // Timing jitter
372
- // ============================================================================
373
-
374
/**
 * Add ±20% random jitter to a timing value to avoid bot-like regularity.
 * Also floors at 50ms minimum to prevent micro-polling.
 *
 * The offset is a whole number of milliseconds drawn uniformly from
 * [-floor(0.2·|ms|), +floor(0.2·|ms|)].
 *
 * @param {number} ms - Base interval in milliseconds
 * @returns {number} Jittered interval (integer, never below 50)
 * @throws {TypeError} When `ms` is not a finite number
 */
export function jitter(ms) {
  if (!Number.isFinite(ms)) {
    throw new TypeError(`jitter: ms must be a finite number, got ${ms}`);
  }
  // 0.2 of the base on each side gives the documented ±20% band.
  const spread = Math.floor(Math.abs(ms) * 0.2);
  // randomInt's upper bound is exclusive, hence the +1.
  const offset = spread > 0 ? randomInt(-spread, spread + 1) : 0;
  return Math.max(50, Math.round(ms + offset));
}
385
-
386
- // ============================================================================
387
- // Stream completion detection
388
- // ============================================================================
389
-
390
/**
 * Apply defaults to the stream-wait options and coerce the numeric ones.
 * @returns {{timeout: number, interval: number, stableRounds: number, minLength: number, selector: string}}
 * @throws {TypeError} When a numeric option is not a finite, non-negative number
 */
function normalizeStreamOptions(options = {}) {
  const {
    timeout = 20000,
    interval = 600,
    stableRounds = 3,
    selector = "document.body",
    minLength = 0,
  } = options;
  const numeric = { timeout, interval, stableRounds, minLength };
  for (const [name, raw] of Object.entries(numeric)) {
    const value = Number(raw);
    if (!Number.isFinite(value) || value < 0) {
      throw new TypeError(
        `waitForStreamComplete: ${name} must be a non-negative number, got ${raw}`,
      );
    }
    numeric[name] = value;
  }
  return { ...numeric, selector: String(selector) };
}

/**
 * Build the browser-side expression that polls text length until it is stable.
 * The expression evaluates to a Promise resolving with the final length.
 */
function buildStreamWaitScript(options) {
  const { timeout, interval, stableRounds, minLength, selector } =
    normalizeStreamOptions(options);
  return String.raw`
    new Promise((resolve, reject) => {
      const _deadline = Date.now() + ${timeout};
      const _baseInterval = ${interval};
      const _stableRounds = ${stableRounds};
      const _minLength = ${minLength};
      let _lastLen = -1;
      let _stableCount = 0;

      function _jitter(ms) {
        return Math.max(50, ms + (Math.random() * ms * 0.4 - ms * 0.2));
      }

      function _poll() {
        try {
          // Re-query DOM each tick — element may not exist at eval start
          const el = ${selector};
          const cur = el?.innerText?.length ?? 0;
          if (cur >= _minLength) {
            if (cur === _lastLen) {
              _stableCount++;
              if (_stableCount >= _stableRounds) { resolve(cur); return; }
            } else {
              _lastLen = cur;
              _stableCount = 0;
            }
          }
          if (Date.now() < _deadline) {
            setTimeout(_poll, _jitter(_baseInterval));
          } else {
            if (_lastLen >= _minLength) { resolve(_lastLen); }
            else { reject(new Error('Generation did not stabilise within ${timeout}ms')); }
          }
        } catch(e) { reject(e); }
      }

      _poll();
    })
  `;
}

/**
 * Wait for generation/streaming to complete by monitoring text length stability.
 *
 * Uses a SINGLE Runtime.evaluate call with awaitPromise: true — the stability
 * polling runs entirely inside the browser context, emitting no CDP traffic
 * during the wait. This avoids the CDP Runtime serialization detection vector
 * that would otherwise fire on every poll tick (~50 evals → 1 eval).
 *
 * If the deadline passes before the length is stable, the last length seen is
 * returned as long as it reached `minLength`.
 *
 * @param {string} tab - Tab identifier
 * @param {object} options - Options
 * @param {number} [options.timeout=20000] - Maximum wait time in ms
 * @param {number} [options.interval=600] - Polling interval in ms (jittered ±20%)
 * @param {number} [options.stableRounds=3] - Required stable rounds to consider complete
 * @param {string} [options.selector='document.body'] - JavaScript expression,
 *   evaluated in the page on every tick, yielding the element to monitor.
 *   This is NOT a CSS selector; it is inserted into the script verbatim.
 * @param {number} [options.minLength=0] - Lengths below this are ignored
 * @returns {Promise<number>} Final text length
 * @throws {TypeError} When a numeric option is not a finite, non-negative number
 * @throws {Error} When the text never reaches `minLength` within the timeout,
 *   or the CDP command fails
 */
export async function waitForStreamComplete(tab, options = {}) {
  // Coerced numbers keep `timeout + 10000` arithmetic even for string input.
  const { timeout, minLength } = normalizeStreamOptions(options);

  // Single self-contained eval — polling runs in the browser, no CDP chatter.
  // The promise resolves when stability is reached or timeout expires.
  const code = buildStreamWaitScript(options);

  // Use eval (which has awaitPromise:true in cdp.mjs) with generous timeout.
  // This is ONE Runtime.evaluate call — the polling loop runs in the browser.
  const lenStr = await cdp(["eval", tab, code], timeout + 10000);
  const currentLen = Number.parseInt(lenStr, 10) || 0;

  if (currentLen >= minLength) return currentLen;
  throw new Error(`Generation did not stabilise within ${timeout}ms`);
}
465
-
466
- // ============================================================================
467
- // DOM selector waiting (single eval, no polling)
468
- // ============================================================================
469
-
470
/**
 * Build the browser-side expression that polls for a CSS selector.
 * The expression evaluates to a Promise<boolean>.
 * @throws {TypeError} When timeoutMs or interval is not a finite number
 */
function buildSelectorWaitScript(selector, timeoutMs, interval) {
  const budgetMs = Number(timeoutMs);
  const stepMs = Number(interval);
  if (!Number.isFinite(budgetMs) || !Number.isFinite(stepMs)) {
    throw new TypeError(
      `waitForSelector: timeoutMs and interval must be finite numbers, got ${timeoutMs} and ${interval}`,
    );
  }
  // JSON.stringify produces a valid JS string literal, so selectors that
  // contain quotes or backslashes (e.g. [aria-label='Copy']) stay intact.
  const selectorLiteral = JSON.stringify(String(selector));
  return String.raw`
    new Promise((resolve) => {
      const _deadline = Date.now() + ${budgetMs};
      const _baseInterval = ${stepMs};

      function _jitter(ms) {
        return Math.max(50, ms + (Math.random() * ms * 0.4 - ms * 0.2));
      }

      function _poll() {
        try {
          if (document.querySelector(${selectorLiteral})) { resolve(true); return; }
          if (Date.now() < _deadline) { setTimeout(_poll, _jitter(_baseInterval)); }
          else { resolve(false); }
        } catch(_) { resolve(false); }
      }

      _poll();
    })
  `;
}

/**
 * Wait for a CSS selector to appear in the DOM using a single self-contained
 * eval. The polling loop runs in the browser — zero CDP traffic until done.
 *
 * @param {string} tab - Tab identifier
 * @param {string} selector - CSS selector to wait for
 * @param {number} [timeoutMs=15000] - Maximum wait time in ms
 * @param {number} [interval=500] - Base polling interval in ms (jittered ±20%)
 * @returns {Promise<boolean>} true if selector was found, false on timeout or
 *   when querying the selector throws in the page (e.g. invalid selector)
 * @throws {TypeError} When timeoutMs or interval is not a finite number
 * @throws {Error} When the CDP command itself fails
 */
export async function waitForSelector(
  tab,
  selector,
  timeoutMs = 15000,
  interval = 500,
) {
  const code = buildSelectorWaitScript(selector, timeoutMs, interval);
  // Number() keeps the grace period additive even if a string was passed.
  const result = await cdp(["eval", tab, code], Number(timeoutMs) + 5000);
  return result === "true";
}
510
-
511
- // ============================================================================
512
- // CLI argument parsing
513
- // ============================================================================
514
-
515
/**
 * Resolve a `--stdin` placeholder before argument parsing.
 *
 * When `--stdin` is present, all of standard input is read as UTF-8, trimmed,
 * and put in the position of the first `--stdin` in a copy of the args. This
 * keeps queries and prompts out of the process table. Without the flag the
 * original array is returned untouched. Call this before parseArgs().
 *
 * @param {string[]} args - process.argv.slice(2)
 * @returns {Promise<string[]>} args with the stdin text in place of --stdin
 */
export async function prepareArgs(args) {
  const flagAt = args.indexOf("--stdin");
  if (flagAt < 0) return args;

  const piped = await new Promise((done) => {
    const parts = [];
    process.stdin.setEncoding("utf8");
    process.stdin.on("data", (part) => {
      parts.push(part);
    });
    process.stdin.on("end", () => {
      done(parts.join("").trim());
    });
  });

  // parseArgs will pick the substituted text up as (part of) the query.
  const substituted = Array.from(args);
  substituted[flagAt] = piped;
  return substituted;
}
539
-
540
/**
 * Remove the first occurrence of `flag` and the token after it.
 * @returns {{value: string|null, remaining: string[]}} `value` is null when
 *   the flag is absent or has no token after it
 */
function takeFlagValue(tokens, flag) {
  const at = tokens.indexOf(flag);
  if (at === -1) return { value: null, remaining: tokens };
  return {
    // A trailing flag with no value yields null rather than undefined.
    value: tokens[at + 1] ?? null,
    remaining: tokens.filter((_, i) => i !== at && i !== at + 1),
  };
}

/**
 * Parse standard extractor CLI arguments.
 *
 * Every `--short` token is removed; `--tab <prefix>` and `--locale <code>`
 * are extracted (first occurrence each, `--tab` first). All remaining tokens
 * are joined with single spaces to form the query.
 *
 * @param {string[]} args - process.argv.slice(2)
 * @returns {{query: string, tabPrefix: string|null, short: boolean, locale: string|null}}
 */
export function parseArgs(args) {
  const short = args.includes("--short");
  const withoutShort = args.filter((a) => a !== "--short");

  const tab = takeFlagValue(withoutShort, "--tab");
  const loc = takeFlagValue(tab.remaining, "--locale");

  return {
    query: loc.remaining.join(" "),
    tabPrefix: tab.value,
    short,
    locale: loc.value,
  };
}
564
-
565
/**
 * Guard for extractor entry points: require at least one argument.
 *
 * When the argument list is empty, or its first entry is `--help`, the usage
 * text is written to stderr and the process exits with status 1. Otherwise
 * the function returns without doing anything.
 *
 * @param {string[]} args - process.argv.slice(2)
 * @param {string} usage - Usage string for error message
 */
export function validateQuery(args, usage) {
  if (args.length && args[0] !== "--help") return;
  process.stderr.write(usage);
  process.exit(1);
}
576
-
577
- // ============================================================================
578
- // Output formatting
579
- // ============================================================================
580
-
581
/**
 * Truncate answer if short mode is enabled.
 *
 * In short mode, answers longer than `maxLen` are cut to at most `maxLen`
 * UTF-16 code units, then trimmed back to the last space (if there is one
 * after position 0) and suffixed with an ellipsis.
 *
 * @param {string} answer - Full answer text
 * @param {boolean} short - Whether to truncate
 * @param {number} [maxLen=300] - Maximum length in short mode
 * @returns {string} Formatted answer
 */
export function formatAnswer(answer, short, maxLen = 300) {
  if (!short || answer.length <= maxLen) return answer;
  let truncated = answer.slice(0, maxLen);
  // slice() counts UTF-16 code units, so the cut may land between the two
  // halves of a surrogate pair (emoji etc.). Drop a dangling high surrogate
  // instead of emitting a broken character.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }
  const lastSpace = truncated.lastIndexOf(" ");
  return lastSpace > 0 ? `${truncated.slice(0, lastSpace)}…` : `${truncated}…`;
}
594
-
595
/**
 * Write a value to stdout as pretty-printed JSON (2-space indent) followed
 * by a newline.
 * @param {object} data - Data to output
 */
export function outputJson(data) {
  const pretty = JSON.stringify(data, null, 2);
  process.stdout.write(pretty + "\n");
}
602
-
603
/**
 * Assemble the diagnostic envelope from values the extractor already has.
 * No CDP calls are made here.
 *
 * Fields left out default to null, except `mode` (defaults to "headless") and
 * `engine` (no default). Unknown fields are dropped.
 *
 * @param {object} [fields] - Any of: engine, mode, clipboardEmpty,
 *   fallbackUsed, blockedBy, verificationResult, inputReady, durationMs
 * @returns {object} Envelope with exactly those eight keys, in that order
 */
export function buildEnvelope(fields = {}) {
  const {
    engine,
    mode = "headless",
    clipboardEmpty = null,
    fallbackUsed = null,
    blockedBy = null,
    verificationResult = null,
    inputReady = null,
    durationMs = null,
  } = fields;

  const envelope = { engine, mode };
  envelope.clipboardEmpty = clipboardEmpty;
  envelope.fallbackUsed = fallbackUsed;
  envelope.blockedBy = blockedBy;
  envelope.verificationResult = verificationResult;
  envelope.inputReady = inputReady;
  envelope.durationMs = durationMs;
  return envelope;
}
630
-
631
/**
 * Report a fatal error and terminate with exit status 1.
 *
 * When an envelope is supplied, a single-line JSON object
 * `{ _envelope, error }` goes to stdout first so the runner can still parse
 * structured diagnostics. The human-readable message always goes to stderr.
 *
 * @param {Error} error - Error to handle
 * @param {object} [envelope] - Optional envelope object
 */
export function handleError(error, envelope = null) {
  const { message } = error;
  if (envelope) {
    const payload = { _envelope: envelope, error: message };
    process.stdout.write(JSON.stringify(payload) + "\n");
  }
  process.stderr.write("Error: " + message + "\n");
  process.exit(1);
}
1
+ // extractors/common.mjs — shared utilities for CDP-based extractors
2
+ // Extracts common patterns: cdp wrapper, tab management, clipboard interception, source parsing
3
+
4
+ import { randomInt } from "node:crypto";
5
+ import { spawn } from "node:child_process";
6
+ import { dirname, join } from "node:path";
7
+ import { fileURLToPath } from "node:url";
8
+
9
+ const __dir = dirname(fileURLToPath(import.meta.url));
10
+ const CDP = join(__dir, "..", "bin", "cdp.mjs");
11
+
12
+ // ============================================================================
13
+ // CDP wrapper
14
+ // ============================================================================
15
+
16
/**
 * Execute a CDP command through the cdp.mjs CLI
 * @param {string[]} args - Command arguments
 * @param {number} [timeoutMs=30000] - Timeout in milliseconds
 * @returns {Promise<string>} Command output
 */
export function cdp(args, timeoutMs = 30000) {
  return cdpWithInput(args, null, timeoutMs);
}

/**
 * Execute a CDP command through the cdp.mjs CLI, optionally feeding data to
 * the child's stdin.
 *
 * The child is started with the current Node binary (`process.execPath`).
 * When `input` is null or undefined the child's stdin is not opened at all;
 * otherwise `input` is written to it and the pipe is closed.
 *
 * @param {string[]} args - Command arguments; args[0] names the command
 * @param {string|Buffer|null} [input=null] - Data to send on stdin, if any
 * @param {number} [timeoutMs=30000] - Kill the child and reject after this many ms
 * @returns {Promise<string>} Trimmed stdout when the child exits with code 0
 * @throws {Error} Rejects on timeout (`cdp timeout: <command>`), when the child
 *   cannot be spawned, or on a non-zero exit (trimmed stderr, or
 *   `cdp exit <code>` when stderr is empty)
 */
export function cdpWithInput(args, input = null, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const proc = spawn(process.execPath, [CDP, ...args], {
      stdio: [input == null ? "ignore" : "pipe", "pipe", "pipe"],
    });
    if (input != null) {
      // If the child exits before draining stdin the pipe emits EPIPE. With no
      // listener that is an uncaught exception; the exit code and stderr
      // handled in "close" already describe the failure, so ignore it here.
      proc.stdin.on("error", () => {});
      proc.stdin.end(input);
    }
    let out = "";
    let err = "";
    proc.stdout.on("data", (d) => (out += d));
    proc.stderr.on("data", (d) => (err += d));
    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);
    // A ChildProcess without an "error" listener turns a failed spawn into an
    // uncaught exception; surface it as a rejection instead.
    proc.on("error", (cause) => {
      clearTimeout(timer);
      reject(new Error(`cdp spawn failed: ${cause.message}`, { cause }));
    });
    proc.on("close", (code) => {
      clearTimeout(timer);
      // After a timeout the promise is already rejected; settling again is a no-op.
      if (code === 0) resolve(out.trim());
      else reject(new Error(err.trim() || `cdp exit ${code}`));
    });
  });
}
50
+
51
+ // ============================================================================
52
+ // Tab management
53
+ // ============================================================================
54
+
55
/**
 * Resolve the tab an extractor should drive.
 *
 * A truthy `tabPrefix` is returned as is. Otherwise a new about:blank target
 * is created (anchored on the first tab from `cdp list`), the stealth script
 * is registered on it, and the first 8 characters of its target id are
 * returned.
 *
 * @param {string|null} tabPrefix - Existing tab prefix, or null to create new
 * @returns {Promise<string>} Tab identifier
 * @throws {Error} When `cdp list` reports no tabs, or a CDP call fails
 */
export async function getOrOpenTab(tabPrefix) {
  if (tabPrefix) {
    return tabPrefix;
  }

  // A fresh tab sidesteps SPA navigation issues in reused ones.
  const listing = await cdp(["list"]);
  const [firstLine = ""] = listing.split("\n");
  const anchor = firstLine.slice(0, 8);
  if (!anchor) {
    throw new Error(
      "No Chrome tabs found. Is Chrome running with --remote-debugging-port=9222?",
    );
  }

  const createArgs = [
    "evalraw",
    anchor,
    "Target.createTarget",
    '{"url":"about:blank"}',
  ];
  const created = JSON.parse(await cdp(createArgs));
  await cdp(["list"]); // refresh cache
  const tid = created.targetId.slice(0, 8);

  // The stealth registration is awaited on purpose: commands are processed
  // concurrently, so an un-awaited registration can lose the race against the
  // caller's first Page.navigate and the first document would load unpatched.
  // Failure is reported but does not prevent the tab from being used.
  await injectHeadlessStealth(tid).catch((e) => {
    process.stderr.write(
      `[getOrOpenTab] stealth injection failed: ${e.message}\n`,
    );
  });
  return tid;
}
94
+
95
+ // ============================================================================
96
+ // Clipboard interception (for extractors that use copy-to-clipboard)
97
+ // ============================================================================
98
+
99
/**
 * Install a clipboard interceptor in a tab.
 *
 * The injected script wraps `navigator.clipboard.writeText` and
 * `navigator.clipboard.write` so that copied text is stored in
 * `window[globalVar]`; failures of the real clipboard write are hidden from
 * the page. If `navigator.clipboard` is missing, only the variable is reset
 * to null. Each engine passes its own variable name to avoid conflicts.
 *
 * @param {string} tab - Tab identifier
 * @param {string} globalVar - Global variable name (e.g., '__pplxClipboard', '__geminiClipboard')
 * @returns {Promise<void>}
 */
export async function injectClipboardInterceptor(tab, globalVar) {
  const interceptorSource = `
    (() => {
      window.${globalVar} = null;
      const _clipboard = navigator.clipboard;
      if (!_clipboard) return;
      const _origWriteText = typeof _clipboard.writeText === 'function'
        ? _clipboard.writeText.bind(_clipboard)
        : null;
      const _origWrite = typeof _clipboard.write === 'function'
        ? _clipboard.write.bind(_clipboard)
        : null;

      _clipboard.writeText = function(text) {
        window.${globalVar} = String(text ?? '');
        if (!_origWriteText) return Promise.resolve();
        // The OS/browser clipboard write may be denied in automated Chrome or
        // when the tab is not focused. We only need the captured text; returning
        // a resolved promise prevents the page from surfacing a misleading
        // "failed to copy" toast after our interceptor already succeeded.
        return Promise.resolve(_origWriteText(text)).catch(() => undefined);
      };

      _clipboard.write = async function(items) {
        try {
          for (const item of items || []) {
            if (item.types && item.types.includes('text/plain')) {
              const blob = await item.getType('text/plain');
              window.${globalVar} = await blob.text();
              break;
            }
          }
        } catch(e) {}
        if (!_origWrite) return undefined;
        try { return await _origWrite(items); }
        catch (_) { return undefined; }
      };
    })();
  `;
  const evalArgs = ["eval", tab, interceptorSource];
  await cdp(evalArgs);
}
146
+
147
+ // ============================================================================
148
+ // Headless stealth injection
149
+ // ============================================================================
150
+
151
/**
 * Inject anti-detection patches into a page in headless mode.
 * Based on production patterns from screenshotrun.com.
 *
 * The patch script is not evaluated directly: it is registered with the
 * `Page.addScriptToEvaluateOnNewDocument` CDP method, sent through the
 * `evalraw` command of the `cdp` helper.
 *
 * The script is safe to run more than once against the same document: every
 * navigator override is configurable, and the patched functions guard their
 * own failure modes so they never throw where the native API would not.
 *
 * @param {string} tab - Tab identifier passed through to `cdp`
 * @returns {Promise<void>} Resolves once the registration command completes
 */
export async function injectHeadlessStealth(tab) {
  const code = `
    (function() {
      // ── Runtime.enable / CDP detection masking ──────────────
      try { delete window.__REBROWSER_RUNTIME_ENABLE; } catch(_) {}
      try { delete window.__REBROWSER_DEVTOOLS; } catch(_) {}
      try { delete window.__nightmare; } catch(_) {}
      try { delete window.__phantom; } catch(_) {}
      try { delete window.callPhantom; } catch(_) {}
      try { delete window._phantom; } catch(_) {}
      try { delete window.Buffer; } catch(_) {}

      // Real Chrome without automation does not expose a useful webdriver value.
      // A literal false value is itself a common stealth tell; prefer undefined and
      // make the descriptor configurable like native browser properties.
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined, configurable: true });
      Object.defineProperty(navigator, 'vendor', { get: () => 'Google Inc.', configurable: true });
      Object.defineProperty(navigator, 'platform', { get: () => 'Win32', configurable: true });
      Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0, configurable: true });
      Object.defineProperty(navigator, 'pdfViewerEnabled', { get: () => true, configurable: true });
      Object.defineProperty(navigator, 'plugins', {
        get: () => {
          var p = [
            { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
            { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
            { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
          ];
          p.length = 3;
          return p;
        },
        // Configurable like every other override here, so running this script
        // twice on one document does not throw "Cannot redefine property".
        configurable: true,
      });
      Object.defineProperty(navigator, 'mimeTypes', {
        get: () => {
          var m = [
            { type: 'application/pdf', suffixes: 'pdf', description: 'Portable Document Format', enabledPlugin: null },
            { type: 'text/pdf', suffixes: 'pdf', description: 'Portable Document Format', enabledPlugin: null },
          ];
          m.item = function(i) { return m[i] || null; };
          m.namedItem = function(name) { return m.find(function(x) { return x.type === name; }) || null; };
          return m;
        },
        configurable: true,
      });
      Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'], configurable: true });
      try {
        Object.defineProperty(navigator, 'connection', { get: () => ({ effectiveType: '4g', rtt: 50, downlink: 10, saveData: false }), configurable: true });
      } catch(_) {}
      if (!navigator.mediaDevices) {
        Object.defineProperty(navigator, 'mediaDevices', {
          get: () => ({
            enumerateDevices: () => Promise.resolve([
              { deviceId: 'default', kind: 'audioinput', label: '', groupId: 'default' },
              { deviceId: 'default', kind: 'audiooutput', label: '', groupId: 'default' },
              { deviceId: '', kind: 'videoinput', label: '', groupId: '' },
            ]),
            getUserMedia: () => Promise.reject(new DOMException('NotAllowedError')),
            getDisplayMedia: () => Promise.reject(new DOMException('NotAllowedError')),
          }),
          configurable: true,
        });
      }
      if (!window.chrome) {
        window.chrome = {
          app: { isInstalled: false, InstallState: {}, RunningState: {} },
          runtime: {
            OnInstalledReason: {}, OnRestartRequiredReason: {}, PlatformArch: {}, PlatformNaclArch: {}, PlatformOs: {}, RequestUpdateCheckStatus: {},
            connect: () => ({}), sendMessage: () => {}, onMessage: { addListener: () => {} }
          },
          loadTimes: () => ({}),
          csi: () => ({}),
        };
      }
      var __greedyNativeFns = [];
      function __markNative(fn) { try { __greedyNativeFns.push(fn); } catch(_) {} return fn; }

      var origQuery = navigator.permissions?.query;
      if (origQuery) {
        navigator.permissions.query = __markNative(function query(params) {
          if (params && params.name === 'notifications') {
            // Notification is not defined in every context; a bare reference
            // would throw a ReferenceError out of the patched query().
            var perm = (typeof Notification !== 'undefined' && Notification.permission) || 'default';
            return Promise.resolve({ state: perm, onchange: null });
          }
          return origQuery.apply(this, arguments);
        });
      }
      try {
        var getParam = WebGLRenderingContext.prototype.getParameter;
        WebGLRenderingContext.prototype.getParameter = __markNative(function getParameter(p) {
          if (p === 37445) return 'Intel Inc.';
          if (p === 37446) return 'Intel Iris OpenGL Engine';
          return getParam.call(this, p);
        });
      } catch(_) {}
      Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8, configurable: true });
      Object.defineProperty(navigator, 'deviceMemory', { get: () => 8, configurable: true });

      // ── Canvas fingerprint noise ─────────────────────────
      // Headless rendering engines produce slightly different canvas output
      // than headed Chrome. Subtle noise breaks hash-based fingerprinting.
      try {
        var __canvasNoise = ((Date.now() % 997) + Math.floor(Math.random() * 997)) & 1;
        var origFill = CanvasRenderingContext2D.prototype.fillText;
        CanvasRenderingContext2D.prototype.fillText = __markNative(function fillText() {
          this.globalAlpha = 0.9995;
          return origFill.apply(this, arguments);
        });
      } catch(_) {}
      try {
        var origStroke = CanvasRenderingContext2D.prototype.strokeText;
        CanvasRenderingContext2D.prototype.strokeText = __markNative(function strokeText() {
          this.globalAlpha = 0.9995;
          return origStroke.apply(this, arguments);
        });
      } catch(_) {}
      try {
        var origToDataURL = HTMLCanvasElement.prototype.toDataURL;
        HTMLCanvasElement.prototype.toDataURL = __markNative(function toDataURL() {
          // The noise step is best-effort: getImageData throws on zero-sized or
          // tainted canvases, and that must not turn a working toDataURL()
          // call into an exception the native API would not have raised.
          try {
            var ctx = this.getContext('2d');
            if (ctx) {
              // Add 1px noise pixel in corner (invisible but changes hash)
              var imgData = ctx.getImageData(0, 0, 1, 1);
              if (imgData) {
                imgData.data[0] ^= __canvasNoise;
                ctx.putImageData(imgData, 0, 0);
              }
            }
          } catch(_) {}
          return origToDataURL.apply(this, arguments);
        });
      } catch(_) {}

      // ── window outer dimensions ──────────────────────────
      // outerWidth/Height = 0 in headless — a well-known bot signal.
      // Mirror innerWidth/Height (set by --window-size flag) so the ratio is sane.
      try {
        if (!window.outerWidth) Object.defineProperty(window, 'outerWidth', { get: () => window.innerWidth || 1920, configurable: true });
        if (!window.outerHeight) Object.defineProperty(window, 'outerHeight', { get: () => window.innerHeight || 1080, configurable: true });
      } catch(_) {}

      // ── screen properties ─────────────────────────────────
      try {
        if (!screen.colorDepth) Object.defineProperty(screen, 'colorDepth', { get: () => 24, configurable: true });
        if (!screen.pixelDepth) Object.defineProperty(screen, 'pixelDepth', { get: () => 24, configurable: true });
      } catch(_) {}

      // ── navigator.userAgentData (UA Client Hints) ─────────
      // Derive version from the UA string already set by --user-agent flag so the
      // two APIs are always consistent. Removes any "HeadlessChrome" brand entry.
      try {
        var _uaMajor = (navigator.userAgent.match(new RegExp('Chrome/([0-9]+)')) || [])[1] || '136';
        var _uaFull = (navigator.userAgent.match(new RegExp('Chrome/([0-9.]+)')) || [])[1] || (_uaMajor + '.0.0.0');
        var _brands = [
          { brand: 'Not)A;Brand', version: '99' },
          { brand: 'Google Chrome', version: _uaMajor },
          { brand: 'Chromium', version: _uaMajor },
        ];
        Object.defineProperty(navigator, 'userAgentData', {
          get: function() {
            return {
              brands: _brands, mobile: false, platform: 'Windows',
              getHighEntropyValues: function() {
                return Promise.resolve({
                  architecture: 'x86', bitness: '64',
                  brands: _brands,
                  fullVersionList: [
                    { brand: 'Not)A;Brand', version: '99.0.0.0' },
                    { brand: 'Google Chrome', version: _uaFull },
                    { brand: 'Chromium', version: _uaFull },
                  ],
                  mobile: false, model: '', platform: 'Windows',
                  platformVersion: '15.0.0', uaFullVersion: _uaFull, wow64: false,
                });
              },
              toJSON: function() { return { brands: _brands, mobile: false, platform: 'Windows' }; },
            };
          },
          configurable: true,
        });
      } catch(_) {}

      // ── CDP Runtime serialization guard ──────────────────
      // Sites detect CDP by putting a getter on Error.prototype.stack
      // and checking if console.log triggers it (only happens when
      // Runtime domain is enabled). We monkey-patch console methods to
      // strip custom getters from arguments before they reach CDP.
      try {
        var _origLog = console.log, _origError = console.error,
            _origWarn = console.warn, _origDebug = console.debug,
            _origInfo = console.info;
        var _safeArg = function(a) {
          if (a instanceof Error) {
            try { return new Error(a.message); } catch(_) { return a; }
          }
          return a;
        };
        console.log = __markNative(function log() { return _origLog.apply(console, Array.prototype.map.call(arguments, _safeArg)); });
        console.error = __markNative(function error() { return _origError.apply(console, Array.prototype.map.call(arguments, _safeArg)); });
        console.warn = __markNative(function warn() { return _origWarn.apply(console, Array.prototype.map.call(arguments, _safeArg)); });
        console.debug = __markNative(function debug() { return _origDebug.apply(console, Array.prototype.map.call(arguments, _safeArg)); });
        console.info = __markNative(function info() { return _origInfo.apply(console, Array.prototype.map.call(arguments, _safeArg)); });
      } catch(_) {}

      // ── Native function masking ──────────────────────────
      // Patched APIs should not stringify as user-defined stealth code.
      // The replacement toString is registered as well, otherwise
      // Function.prototype.toString.toString() would print this very patch.
      try {
        var __nativeToString = Function.prototype.toString;
        Function.prototype.toString = __markNative(function toString() {
          if (__greedyNativeFns.indexOf(this) !== -1) {
            var name = this.name || '';
            return 'function ' + name + '() { [native code] }';
          }
          return __nativeToString.call(this);
        });
      } catch(_) {}
    })();
  `;
  await cdp([
    "evalraw",
    tab,
    "Page.addScriptToEvaluateOnNewDocument",
    JSON.stringify({ source: code }),
  ]);
}
372
+
373
+ // ============================================================================
374
+ // Source extraction from markdown
375
+ // ============================================================================
376
+
377
/**
 * Locate the "[" that opens the link text ending at `closeBracket`.
 * Scans backwards with bracket balancing so nested brackets stay inside the
 * title and earlier, unrelated brackets (e.g. a "[1]" citation) are ignored.
 * @param {string} text - Text being scanned
 * @param {number} closeBracket - Index of the "]" in the "](" pair
 * @param {number} floor - Lowest index that may belong to this link
 * @returns {number} Index of the opening "[", or -1 if there is none
 */
function findLinkTextStart(text, closeBracket, floor) {
  let depth = 0;
  for (let i = closeBracket - 1; i >= floor; i--) {
    const ch = text[i];
    if (ch === "]") {
      depth++;
    } else if (ch === "[") {
      if (depth === 0) return i;
      depth--;
    }
  }
  return -1;
}

/**
 * Locate the ")" that closes a link destination starting at `openParen`.
 * Balanced parentheses inside the URL (e.g. ".../wiki/Foo_(bar)") are kept;
 * any whitespace means this is not a bare-URL markdown link.
 * @param {string} text - Text being scanned
 * @param {number} openParen - Index of the first character of the URL
 * @returns {number} Index of the closing ")", or -1 if the link is invalid
 */
function findLinkDestinationEnd(text, openParen) {
  let depth = 0;
  for (let p = openParen; p < text.length; p++) {
    const ch = text[p];
    if (/\s/.test(ch)) return -1; // whitespace in URL = invalid markdown link
    if (ch === "(") {
      depth++;
    } else if (ch === ")") {
      if (depth === 0) return p;
      depth--;
    }
  }
  return -1;
}

/**
 * Parse inline Markdown links from text to extract sources.
 *
 * Only links with a non-empty title and an http(s) URL are kept. Results are
 * de-duplicated by URL, returned in order of first appearance, and capped at
 * 10 entries.
 *
 * @param {string} text - Text containing Markdown links like [title](url)
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdown(text) {
  if (!text) return [];
  const maxSources = 10;
  const results = [];
  const seenUrls = new Set();
  let idx = 0;
  while (idx < text.length && results.length < maxSources) {
    // Anchor on "](" and work outwards, so the title is always the bracket
    // group immediately before the URL.
    const closeBracket = text.indexOf("](", idx);
    if (closeBracket === -1) break;
    const openParen = closeBracket + 2;
    const openBracket = findLinkTextStart(text, closeBracket, idx);
    const closeParen =
      openBracket === -1 ? -1 : findLinkDestinationEnd(text, openParen);
    if (closeParen === -1) {
      // Not a usable link; resume after this "](" pair.
      idx = openParen;
      continue;
    }
    const title = text.slice(openBracket + 1, closeBracket);
    const url = text.slice(openParen, closeParen);
    if (title && /^https?:\/\//i.test(url) && !seenUrls.has(url)) {
      seenUrls.add(url);
      results.push({ title, url });
    }
    idx = closeParen + 1;
  }
  return results;
}
418
+
419
/**
 * Report whether a string is non-empty and made up solely of ASCII digits.
 * Behaves like /^\d+$/ but is implemented as a single pass over the UTF-16
 * code units, so parseSourcesFromMarkdownRefStyle can stay regex-free in its
 * inline scan (SonarCloud hotspot js:S5852).
 * @param {string} s
 * @returns {boolean}
 */
function isAllDigits(s) {
  if (!s) return false;
  const codeOfZero = 48;
  const codeOfNine = 57;
  let pos = 0;
  while (pos < s.length) {
    const unit = s.charCodeAt(pos);
    if (unit < codeOfZero || unit > codeOfNine) return false;
    pos += 1;
  }
  return true;
}
435
+
436
/**
 * Parse reference-style markdown links: [text][num] inline, resolved against
 * [num]: url "title" definitions that start a line.
 * ChatGPT uses this format for its inline citations.
 *
 * Only inline references whose number has a matching definition produce a
 * source. Results are de-duplicated by URL in order of first inline use. The
 * title is the inline text, falling back to the definition's quoted title.
 *
 * @param {string} text - Markdown text
 * @returns {Array<{title: string, url: string}>} Extracted sources
 */
export function parseSourcesFromMarkdownRefStyle(text) {
  if (!text) return [];

  // Collect all reference definitions: [num]: url "title"
  const refMap = new Map();
  const refRegex = /^\[(\d+)\]:\s*(https?:\/\/[^\s"]+)(?:\s+"([^"]*)")?/gm;
  for (const [, num, url, title = ""] of text.matchAll(refRegex)) {
    refMap.set(num, { url, title });
  }
  // Without definitions no inline reference can resolve.
  if (refMap.size === 0) return [];

  // Find inline references of the form [text][num]. A linear scan via indexOf
  // avoids the ReDoS-prone /\[([^\]]*)\]\[(\d+)\]/g pattern (SonarCloud
  // hotspot js:S5852).
  const results = [];
  const seenUrls = new Set();
  let cursor = 0;
  while (cursor < text.length) {
    const open = text.indexOf("[", cursor);
    if (open === -1) break;
    const close = text.indexOf("]", open + 1);
    if (close === -1) break;
    if (text[close + 1] !== "[") {
      // Every "[" between open and close would hit this same "]" and fail
      // the same way, so skip past it. Retrying from open + 1 is what made
      // inputs like "[[[[[...]" quadratic.
      cursor = close + 1;
      continue;
    }
    const close2 = text.indexOf("]", close + 2);
    if (close2 === -1) break;

    const numStr = text.slice(close + 2, close2);
    if (!isAllDigits(numStr)) {
      // The second bracket may itself be the text of a reference, as in
      // "[a][b][1]", so resume scanning at it rather than after it.
      cursor = close + 1;
      continue;
    }

    const ref = refMap.get(numStr);
    if (ref && !seenUrls.has(ref.url)) {
      seenUrls.add(ref.url);
      const inner = text.slice(open + 1, close);
      results.push({
        title: inner.trim() || ref.title || "",
        url: ref.url,
      });
    }
    cursor = close2 + 1;
  }

  return results;
}
491
+
492
+ // ============================================================================
493
+ // Timing constants
494
+ // ============================================================================
495
+
496
/**
 * Base delays, in milliseconds, used to pace automation steps.
 * Within this module `copyPoll` is the interval waitForCopyButton sleeps
 * between checks (after passing it through jitter()).
 */
export const TIMING = {
  postNav: 800, // settle after navigation
  postNavSlow: 1200, // settle after slower navigations (Bing, Gemini)
  postClick: 300, // settle after a UI click
  postType: 300, // settle after typing
  inputPoll: 400, // polling interval when waiting for input to appear
  copyPoll: 600, // polling interval when waiting for copy button
  afterVerify: 1500, // settle after a verification challenge completes
};
505
+
506
+ // ============================================================================
507
+ // Copy button polling
508
+ // ============================================================================
509
+
510
/**
 * Wait for a copy button to appear in the DOM.
 *
 * Sleeps one jittered `TIMING.copyPoll` interval, runs the optional `onPoll`
 * hook, then asks the page whether `selector` matches; repeats until the
 * element is found or the deadline passes. Failures of `onPoll` and of the
 * page query are treated as "not found yet" (best-effort polling).
 *
 * @param {string} tab - Tab identifier
 * @param {string} selector - CSS selector for the copy button (may contain quotes)
 * @param {object} [options]
 * @param {number} [options.timeout=60000] - Max wait in ms
 * @param {Function} [options.onPoll] - Optional sync or async callback invoked
 *   with the 1-based tick number on each poll (e.g. to scroll)
 * @returns {Promise<void>}
 * @throws {Error} If the button has not appeared within `timeout` ms
 */
export async function waitForCopyButton(tab, selector, options = {}) {
  const { timeout = 60000, onPoll } = options;
  const deadline = Date.now() + timeout;
  // JSON.stringify produces a valid JS string literal for any selector, so
  // quotes or backslashes in it cannot break the evaluated expression.
  const probe = `!!document.querySelector(${JSON.stringify(selector)})`;
  let tick = 0;
  while (Date.now() < deadline) {
    await new Promise((r) => setTimeout(r, jitter(TIMING.copyPoll)));
    if (onPoll) {
      tick += 1;
      try {
        await onPoll(tick);
      } catch {
        // Deliberately ignored: the hook is a best-effort nudge (e.g. scroll)
        // and must not abort the wait.
      }
    }
    const found = await cdp(["eval", tab, probe]).catch(() => "false");
    if (found === "true") return;
  }
  throw new Error(
    `Copy button ('${selector}') did not appear within ${timeout}ms`,
  );
}
537
+
538
+ // ============================================================================
539
+ // Timing jitter
540
+ // ============================================================================
541
+
542
/**
 * Add ±20% random jitter to a timing value to avoid bot-like regularity.
 * Also floors at 50ms minimum to prevent micro-polling.
 *
 * The offset is a uniformly distributed integer in [-20%, +20%] of `ms`,
 * matching the in-page `_jitter` helpers used by the single-eval waiters.
 *
 * @param {number} ms - Base interval in milliseconds
 * @returns {number} Jittered interval (integer, never below 50)
 */
export function jitter(ms) {
  // Math.abs keeps the randomInt range valid (min < max) for negative input.
  const spread = Math.floor(Math.abs(ms) * 0.2);
  // randomInt's upper bound is exclusive, hence the + 1.
  const offset = randomInt(0 - spread, spread + 1);
  return Math.max(50, Math.round(ms + offset));
}
553
+
554
+ // ============================================================================
555
+ // Stream completion detection
556
+ // ============================================================================
557
+
558
/**
 * Wait for generation/streaming to complete by monitoring text length stability.
 *
 * Uses a SINGLE Runtime.evaluate call with awaitPromise: true — the stability
 * polling runs entirely inside the browser context, emitting no CDP traffic
 * during the wait. This avoids the CDP Runtime serialization detection vector
 * that would otherwise fire on every poll tick (~50 evals → 1 eval).
 *
 * The monitored value is `innerText.length` of the element produced by
 * `selector`. A reading only counts once it is at least `minLength`; the wait
 * ends when the same length has been seen on `stableRounds` consecutive polls
 * after the first sighting.
 *
 * @param {string} tab - Tab identifier
 * @param {object} options - Options
 * @param {number} [options.timeout=20000] - Maximum wait time in ms
 * @param {number} [options.interval=600] - Polling interval in ms (jittered ±20%)
 * @param {number} [options.stableRounds=3] - Required stable rounds to consider complete
 * @param {string} [options.selector='document.body'] - JavaScript expression,
 *   evaluated in the page on every poll, that yields the element to monitor
 *   (it is spliced into the script as code, not passed to querySelector)
 * @param {number} [options.minLength=0] - Minimum text length before a reading
 *   is considered at all
 * @returns {Promise<number>} Final text length
 * @throws {Error} If no reading of at least `minLength` was obtained before
 *   the timeout, or if the underlying `cdp` call fails
 */
export async function waitForStreamComplete(tab, options = {}) {
  const {
    timeout = 20000,
    interval = 600,
    stableRounds = 3,
    selector = "document.body",
    minLength = 0,
  } = options;

  // Single self-contained eval: the polling runs in the browser, no CDP chatter.
  // The promise resolves when stability is reached or the timeout expires with
  // a usable reading; it rejects if the timeout expires without one.
  const code = String.raw`
    new Promise((resolve, reject) => {
      const _deadline = Date.now() + ${timeout};
      const _baseInterval = ${interval};
      const _stableRounds = ${stableRounds};
      const _minLength = ${minLength};
      let _lastLen = -1;
      let _stableCount = 0;

      function _jitter(ms) {
        return Math.max(50, ms + (Math.random() * ms * 0.4 - ms * 0.2));
      }

      function _poll() {
        try {
          // Re-query DOM each tick — element may not exist at eval start
          const el = ${selector};
          const cur = el?.innerText?.length ?? 0;
          if (cur >= _minLength) {
            if (cur === _lastLen) {
              _stableCount++;
              if (_stableCount >= _stableRounds) { resolve(cur); return; }
            } else {
              _lastLen = cur;
              _stableCount = 0;
            }
          }
          if (Date.now() < _deadline) {
            setTimeout(_poll, _jitter(_baseInterval));
          } else {
            if (_lastLen >= _minLength) { resolve(_lastLen); }
            else { reject(new Error('Generation did not stabilise within ${timeout}ms')); }
          }
        } catch(e) { reject(e); }
      }

      _poll();
    })
  `;

  // Use eval (which has awaitPromise:true in cdp.mjs) with generous timeout.
  // This is ONE Runtime.evaluate call — the polling loop runs in the browser.
  // The extra 10s keeps the cdp call alive past the in-page deadline.
  const lenStr = await cdp(["eval", tab, code], timeout + 10000);
  // Non-numeric output is treated as length 0.
  const currentLen = parseInt(lenStr, 10) || 0;

  if (currentLen >= minLength) return currentLen;
  throw new Error(`Generation did not stabilise within ${timeout}ms`);
}
633
+
634
+ // ============================================================================
635
+ // DOM selector waiting (single eval; polling runs in-page, not over CDP)
636
+ // ============================================================================
637
+
638
/**
 * Wait for a CSS selector to appear in the DOM using a single self-contained
 * eval. The polling loop runs in the browser — zero CDP traffic until done.
 *
 * The selector is embedded as a JSON-encoded string literal, so it may
 * contain quotes or backslashes (e.g. `button[aria-label='Copy']`).
 *
 * @param {string} tab - Tab identifier
 * @param {string} selector - CSS selector to wait for
 * @param {number} [timeoutMs=15000] - Maximum wait time in ms
 * @param {number} [interval=500] - Base polling interval in ms (jittered ±20%)
 * @returns {Promise<boolean>} true if selector was found, false on timeout
 *   or if querying the selector throws inside the page
 */
export async function waitForSelector(
  tab,
  selector,
  timeoutMs = 15000,
  interval = 500,
) {
  // JSON.stringify yields a valid JS string literal for any selector; wrapping
  // it in single quotes by hand breaks as soon as the selector contains one.
  const selectorLiteral = JSON.stringify(selector);
  const code = String.raw`
    new Promise((resolve) => {
      const _deadline = Date.now() + ${timeoutMs};
      const _baseInterval = ${interval};

      function _jitter(ms) {
        return Math.max(50, ms + (Math.random() * ms * 0.4 - ms * 0.2));
      }

      function _poll() {
        try {
          if (document.querySelector(${selectorLiteral})) { resolve(true); return; }
          if (Date.now() < _deadline) { setTimeout(_poll, _jitter(_baseInterval)); }
          else { resolve(false); }
        } catch(_) { resolve(false); }
      }

      _poll();
    })
  `;

  // The extra 5s keeps the cdp call alive past the in-page deadline.
  const result = await cdp(["eval", tab, code], timeoutMs + 5000);
  return result === "true";
}
678
+
679
+ // ============================================================================
680
+ // CLI argument parsing
681
+ // ============================================================================
682
+
683
/**
 * Resolve a `--stdin` placeholder before argument parsing.
 *
 * When `--stdin` is among the arguments, all of stdin is read as UTF-8,
 * trimmed, and substituted for the flag, so queries and prompts never show
 * up on the command line (and therefore not in the process table).
 * Call this before parseArgs().
 *
 * @param {string[]} args - process.argv.slice(2)
 * @returns {Promise<string[]>} The same array when `--stdin` is absent,
 *   otherwise a copy with the stdin text in place of the flag
 */
export async function prepareArgs(args) {
  const flagPos = args.indexOf("--stdin");
  if (flagPos < 0) return args;

  const stdinText = await new Promise((done) => {
    const pieces = [];
    process.stdin.setEncoding("utf8");
    process.stdin.on("data", (piece) => {
      pieces.push(piece);
    });
    process.stdin.on("end", () => {
      done(pieces.join("").trim());
    });
  });

  // Swap the flag for the text read from stdin (parseArgs treats it as query).
  return args.map((arg, pos) => (pos === flagPos ? stdinText : arg));
}
707
+
708
/**
 * Remove the first occurrence of a value-taking flag from an argument list.
 * @param {string[]} argv - Arguments to search
 * @param {string} flag - Flag name, e.g. "--tab"
 * @returns {{value: string|null, rest: string[]}} The flag's value (null when
 *   the flag is absent or has no following argument) and the remaining args
 */
function takeFlagValue(argv, flag) {
  const flagIdx = argv.indexOf(flag);
  if (flagIdx === -1) return { value: null, rest: argv };
  return {
    // A trailing flag has no value; report null rather than undefined.
    value: argv[flagIdx + 1] ?? null,
    rest: argv.filter((_, i) => i !== flagIdx && i !== flagIdx + 1),
  };
}

/**
 * Parse standard extractor CLI arguments.
 *
 * Recognises `--short` (boolean, every occurrence removed), `--tab <prefix>`
 * and `--locale <locale>` (first occurrence of each). Everything left over is
 * joined with single spaces to form the query.
 *
 * @param {string[]} args - process.argv.slice(2)
 * @returns {{query: string, tabPrefix: string|null, short: boolean, locale: string|null}}
 */
export function parseArgs(args) {
  const short = args.includes("--short");
  const positional = args.filter((a) => a !== "--short");

  const tab = takeFlagValue(positional, "--tab");
  const localeFlag = takeFlagValue(tab.rest, "--locale");

  return {
    query: localeFlag.rest.join(" "),
    tabPrefix: tab.value,
    short,
    locale: localeFlag.value,
  };
}
732
+
733
/**
 * Ensure the CLI was given something to search for.
 * When there are no arguments, or the first one is `--help`, the usage text
 * is written to stderr and the process exits with status 1.
 * @param {string[]} args - process.argv.slice(2)
 * @param {string} usage - Usage string for error message
 */
export function validateQuery(args, usage) {
  const hasQuery = args.length > 0 && args[0] !== "--help";
  if (hasQuery) return;

  process.stderr.write(usage);
  process.exit(1);
}
744
+
745
+ // ============================================================================
746
+ // Output formatting
747
+ // ============================================================================
748
+
749
/**
 * Shorten an answer for short mode.
 * Answers are returned untouched unless `short` is set and the text exceeds
 * `maxLen`. Otherwise the text is cut at `maxLen`, backed up to the last
 * space when there is one past index 0, and suffixed with an ellipsis.
 * @param {string} answer - Full answer text
 * @param {boolean} short - Whether to truncate
 * @param {number} [maxLen=300] - Maximum length in short mode
 * @returns {string} Formatted answer
 */
export function formatAnswer(answer, short, maxLen = 300) {
  if (!short) return answer;
  if (answer.length <= maxLen) return answer;

  const head = answer.slice(0, maxLen);
  const wordBoundary = head.lastIndexOf(" ");
  let kept = head;
  if (wordBoundary > 0) {
    kept = head.slice(0, wordBoundary);
  }
  return `${kept}…`;
}
762
+
763
/**
 * Write a value to stdout as pretty-printed JSON (2-space indent) followed
 * by a newline.
 * @param {object} data - Data to output
 */
export function outputJson(data) {
  const serialized = JSON.stringify(data, null, 2);
  process.stdout.write(`${serialized}\n`);
}
770
+
771
/**
 * Note which stage an extractor has reached, for debugging and timeout
 * diagnostics.
 *
 * Sets `env.lastStage`, appends `{ stage, at }` to `env.stages` (creating the
 * array if it is missing or not an array), and prints
 * `[engine] stage: <name> (+<ms>ms)` to stderr; the elapsed part is omitted
 * when no start time is given. A non-object `env` makes the call a no-op.
 *
 * @param {object} env - The mutable env object the extractor is filling in.
 * @param {string} stage - Short, snake_case stage name (e.g. "nav", "type", "stream").
 * @param {number} [startTime] - Optional extractor start time for elapsed-ms logging.
 */
export function logStage(env, stage, startTime = null) {
  if (env === null || typeof env !== "object") return;

  let elapsedSuffix = "";
  if (startTime) {
    elapsedSuffix = ` (+${Date.now() - startTime}ms)`;
  }

  env.lastStage = stage;
  if (!Array.isArray(env.stages)) {
    env.stages = [];
  }
  env.stages.push({ stage, at: Date.now() });

  const label = env.engine || "extractor";
  console.error(`[${label}] stage: ${stage}${elapsedSuffix}`);
}
790
+
791
/**
 * Assemble the diagnostic envelope from values gathered during extraction.
 * Makes no CDP calls. Unknown keys in `fields` are dropped; a key that is
 * missing or `undefined` gets its default (`"headless"` for `mode`, `null`
 * for the rest; `engine` has no default).
 * @param {object} fields
 * @returns {object}
 */
export function buildEnvelope(fields = {}) {
  // Key order here is the key order of the returned envelope.
  const defaults = {
    engine: undefined,
    mode: "headless",
    clipboardEmpty: null,
    fallbackUsed: null,
    blockedBy: null,
    verificationResult: null,
    inputReady: null,
    durationMs: null,
    lastStage: null,
    stages: null,
  };

  const envelope = {};
  for (const key of Object.keys(defaults)) {
    const supplied = fields[key];
    envelope[key] = supplied === undefined ? defaults[key] : supplied;
  }
  return envelope;
}
822
+
823
/**
 * Report a fatal error and terminate with exit status 1.
 * When an envelope is supplied, a single JSON line
 * `{"_envelope": …, "error": <message>}` is written to stdout first, so the
 * runner can still parse structured diagnostics on failure. The
 * human-readable `Error: <message>` line always goes to stderr.
 * @param {Error} error - Error to handle
 * @param {object} [envelope] - Optional envelope object
 */
export function handleError(error, envelope = null) {
  const { message } = error;

  if (envelope) {
    const payload = { _envelope: envelope, error: message };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }

  process.stderr.write(`Error: ${message}\n`);
  process.exit(1);
}