@apmantza/greedysearch-pi 2.0.0 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -57,12 +57,24 @@ const VERIFY_DETECT_JS = `
57
57
 
58
58
  // --- Cloudflare Turnstile widget inside closed shadow DOM (Copilot, etc.) ---
59
59
  // The iframe is not queryable from main document, but the host container
60
- // (#cf-turnstile) and the hidden response input are.
61
- var cfTurnstileHost = document.querySelector('#cf-turnstile, [id^="cf-chl-widget-"]');
60
+ // (#cf-turnstile) and the hidden response input are. When only the
61
+ // hidden response input matches (no #cf-turnstile host and no visible
62
+ // iframe), the actual challenge widget is rendered inside a closed
63
+ // shadow DOM and cannot be auto-clicked. Return a sentinel so callers
64
+ // know to surface this as needs-human verification instead of wasting
65
+ // time on a doomed waitForSelector.
66
+ var cfTurnstileHost = document.querySelector('#cf-turnstile');
62
67
  if (cfTurnstileHost) {
63
68
  var r2 = cfTurnstileHost.getBoundingClientRect();
64
69
  return JSON.stringify({t:'xy',x:r2.left+r2.width/2,y:r2.top+r2.height/2});
65
70
  }
71
+ // Hidden cf-chl-widget-*_response input present but no visible host:
72
+ // the widget is in closed shadow DOM. Signal this so handleVerification
73
+ // can return 'needs-human' rather than 'clear'.
74
+ var cfResponseInput = document.querySelector('input[name="cf-turnstile-response"], [id^="cf-chl-widget-"][id$="_response"]');
75
+ if (cfResponseInput && cfResponseInput.value === '') {
76
+ return 'cf-closed-shadow-dom';
77
+ }
66
78
 
67
79
  // --- Cloudflare challenge page ---
68
80
  var cfCheckbox = document.querySelector('#cf-stage input[type="checkbox"], .ctp-checkbox-container input');
@@ -77,15 +89,28 @@ const VERIFY_DETECT_JS = `
77
89
  }
78
90
 
79
91
  // --- Generic verify/continue/proceed buttons (catch-all) ---
80
- // IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google")
92
+ // IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google",
93
+ // "Continue with email", "Login or sign up for free"). These appear on
94
+ // many sites (Perplexity, ChatGPT, etc.) when the user isn't logged in,
95
+ // and clicking them triggers a sign-in flow that takes us to a login
96
+ // wall — a much worse outcome than the original search failure we were
97
+ // trying to recover from. The exclusion list must cover both OAuth
98
+ // providers AND generic "sign in / log in / with email" patterns.
81
99
  var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
82
100
  var verify = btns.find(b => {
83
101
  var t = (b.innerText?.trim() || b.value || '').toLowerCase();
84
- var isVerifyLike = (t.includes('verify') || t.includes('human') || t.includes('robot') || t.includes('continue') || t.includes('proceed')) &&
102
+ var isVerifyLike = (t === 'continue' || t === 'proceed' || t === 'next' ||
103
+ t.startsWith('verify ') || t.startsWith('human ') || t === 'i am human' || t.includes('robot check')) &&
85
104
  !t.includes('verified') && !document.querySelector('iframe[src*="recaptcha"]');
86
105
  if (!isVerifyLike) return false;
87
106
  // Exclude OAuth / sign-in buttons to prevent accidental login flows
88
- var isSignIn = /sign.in|log.in|google|microsoft|apple|facebook|github|auth/i.test(t);
107
+ // covers "Continue with Google", "Continue with Apple", "Continue
108
+ // with email", "Login or sign up", "Log in", "Sign in", "Sign up",
109
+ // "Single sign-on", and the visible panel "Login or sign up for free"
110
+ // text. The previous list missed "email" and "sso" which let the
111
+ // auto-click land on the email/SSO sign-in buttons on Perplexity's
112
+ // anonymous-mode homepage, navigating us into a login flow.
113
+ var isSignIn = new RegExp("sign.?in|log.?in|sign.?up|with\\s+(google|apple|email|github|facebook|microsoft|sso)|sso|auth", "i").test(t);
89
114
  return !isSignIn;
90
115
  });
91
116
  if (verify) { verify.setAttribute('data-gs-verify','1'); return JSON.stringify({t:'sel',s:'[data-gs-verify="1"]',txt:verify.innerText?.trim()||verify.value}); }
@@ -327,16 +352,23 @@ export async function humanClickElement(tab, cdpFn, selector) {
327
352
 
328
353
  /**
329
354
  * Parse a detection result and perform a human click if it found something.
330
- * Returns true if a click was performed.
355
+ *
356
+ * Returns a Promise that resolves to one of three strings:
357
+ * - 'clicked' — a click was successfully dispatched
358
+ * - 'cant-click' — challenge was detected but we couldn't click it
359
+ * (zero-dimension element, OOPIF in closed shadow DOM, etc.)
360
+ * Caller should treat this as needs-human verification.
361
+ * - 'no-challenge' — no challenge detected, nothing to click
331
362
  */
332
- async function tryHumanClick(tab, cdp, detectResult) {
363
+ function tryHumanClick(tab, cdp, detectResult) {
333
364
  if (
334
365
  !detectResult ||
335
366
  detectResult === "null" ||
336
367
  detectResult === "cleared" ||
337
- detectResult === "still-verifying"
368
+ detectResult === "still-verifying" ||
369
+ detectResult === "cf-closed-shadow-dom"
338
370
  )
339
- return false;
371
+ return Promise.resolve("no-challenge");
340
372
 
341
373
  // JSON format: {t:"sel",s:"...",txt:"..."} or {t:"xy",x:...,y:...}
342
374
  try {
@@ -345,26 +377,138 @@ async function tryHumanClick(tab, cdp, detectResult) {
345
377
  process.stderr.write(
346
378
  `[greedysearch] Human-clicking "${info.txt}" via CDP...\n`,
347
379
  );
348
- const r = await humanClickElement(tab, cdp, info.s);
349
- return r !== null;
380
+ return humanClickElement(tab, cdp, info.s).then((r) =>
381
+ r !== null ? "clicked" : "cant-click",
382
+ );
350
383
  }
351
384
  if (info.t === "xy") {
352
385
  // Skip zero/invalid coordinates — element is off-screen or not rendered
353
- if (!info.x && !info.y) return false;
386
+ if (!info.x && !info.y) return Promise.resolve("cant-click");
354
387
  process.stderr.write(
355
388
  `[greedysearch] Human-clicking at (${info.x.toFixed(0)}, ${info.y.toFixed(0)})...\n`,
356
389
  );
357
- await humanClickXY(tab, cdp, info.x, info.y);
358
- return true;
390
+ return humanClickXY(tab, cdp, info.x, info.y).then(() => "clicked");
359
391
  }
360
392
  } catch {}
361
393
 
362
- return false;
394
+ return Promise.resolve("no-challenge");
363
395
  }
364
396
 
365
397
  export async function detectVerificationChallenge(tab, cdp) {
398
+ // Run the CDP-pierce probe FIRST so we get real click coordinates for
399
+ // Cloudflare iframes hidden inside closed shadow roots (chatgpt.com,
400
+ // perplexity.ai, etc.). The page-context probe falls back to a
401
+ // cf-closed-shadow-dom sentinel when the iframe is opaque to JS DOM
402
+ // queries, but that sentinel can't be auto-clicked.
403
+ const cfIframe = await findCloudflareIframeViaPierce(tab, cdp).catch(
404
+ () => null,
405
+ );
406
+ if (cfIframe) return cfIframe;
407
+
366
408
  const result = await cdp(["eval", tab, VERIFY_DETECT_JS]).catch(() => null);
367
- return result && result !== "null" ? result : null;
409
+ if (result && result !== "null") return result;
410
+
411
+ return null;
412
+ }
413
+
414
/**
 * Locate a Cloudflare Turnstile iframe that page-context JavaScript cannot
 * see (e.g. one rendered inside a closed shadow root) by requesting the
 * full DOM tree over CDP with `pierce: true` and walking it.
 *
 * The returned coordinates target the checkbox area of the widget (25% of
 * the iframe's width from its left edge, vertically centred) rather than
 * the iframe's geometric centre.
 *
 * @param {string} tab - Tab identifier, forwarded verbatim to `cdp`.
 * @param {Function} cdp - Async transport called as
 *   `cdp(["evalraw", tab, method, paramsJson])`; its resolved value is
 *   parsed with JSON.parse, so it is expected to be a JSON string.
 * @returns {Promise<string|null>} JSON text of the shape `{t:'xy', x, y}`,
 *   or null when no suitable iframe was found or any CDP step failed.
 */
async function findCloudflareIframeViaPierce(tab, cdp) {
  if (typeof cdp !== "function") return null;

  // Best effort: a failure to enable the DOM domain is not fatal by itself.
  await cdp(["evalraw", tab, "DOM.enable", "{}"]).catch(() => {});

  // depth:-1 requests the whole subtree; pierce:true includes shadow roots.
  const doc = await cdp([
    "evalraw",
    tab,
    "DOM.getDocument",
    JSON.stringify({ depth: -1, pierce: true }),
  ]).catch(() => null);
  if (!doc) return null;

  let docParsed;
  try {
    docParsed = JSON.parse(doc);
  } catch {
    return null;
  }
  // A JSON `null` payload or an error envelope both mean "nothing usable".
  if (!docParsed || docParsed.error) return null;

  // Accept both the bare payload and a `{result: …}` envelope, mirroring
  // how the DOM.getBoxModel response is read in walkForCfIframe.
  const root = docParsed.root ?? docParsed.result?.root;
  if (!root) return null;

  return walkForCfIframe(root, tab, cdp);
}

/**
 * Read one attribute from a CDP DOM node. CDP serialises attributes as a
 * flat `[name1, value1, name2, value2, …]` array, so only even indices are
 * names; a plain indexOf() could match a *value* that happens to equal the
 * requested name.
 *
 * @param {object} node - CDP DOM.Node-like object.
 * @param {string} name - Attribute name to look up.
 * @returns {string} The attribute value, or "" when absent.
 */
function cfNodeAttribute(node, name) {
  const attrs = Array.isArray(node.attributes) ? node.attributes : [];
  for (let i = 0; i + 1 < attrs.length; i += 2) {
    if (attrs[i] === name) return attrs[i + 1];
  }
  return "";
}

/**
 * Depth-first search (shadow roots before regular children) for an IFRAME
 * whose `src` mentions challenges.cloudflare.com or "turnstile" and whose
 * content box is at least 50x20.
 *
 * @param {object|null} node - CDP DOM.Node-like object to search below.
 * @param {string} tab - Tab identifier, forwarded verbatim to `cdp`.
 * @param {Function} cdp - Same transport as findCloudflareIframeViaPierce.
 * @returns {Promise<string|null>} JSON `{t:'xy', x, y}` text or null.
 */
async function walkForCfIframe(node, tab, cdp) {
  if (!node) return null;

  const candidates = [...(node.shadowRoots ?? []), ...(node.children ?? [])];

  for (const child of candidates) {
    if (child.nodeName === "IFRAME") {
      const src = cfNodeAttribute(child, "src");
      if (
        src &&
        /challenges\.cloudflare\.com|turnstile/i.test(src) &&
        child.backendNodeId
      ) {
        const boxRes = await cdp([
          "evalraw",
          tab,
          "DOM.getBoxModel",
          JSON.stringify({ backendNodeId: child.backendNodeId }),
        ]).catch(() => null);
        if (!boxRes) continue;

        let boxParsed;
        try {
          boxParsed = JSON.parse(boxRes);
        } catch {
          continue;
        }
        const content =
          boxParsed?.model?.content || boxParsed?.result?.model?.content;
        if (!content || content.length < 8) continue;

        // content = [x1, y1, x2, y2, x3, y3, x4, y4]; corners 1 and 3 are
        // diagonally opposite, which is enough for an axis-aligned box.
        const x1 = content[0];
        const y1 = content[1];
        const x3 = content[4];
        const y3 = content[5];
        // A malformed quad must read as "not found", not throw in toFixed().
        if (![x1, y1, x3, y3].every(Number.isFinite)) continue;

        const width = x3 - x1;
        const height = y3 - y1;
        // Skip degenerate boxes (hidden iframes).
        if (width < 50 || height < 20) continue;

        const checkboxX = x1 + width * 0.25;
        const checkboxY = y1 + height * 0.5;
        process.stderr.write(
          `[greedysearch] Found CF iframe via CDP pierce at (${x1.toFixed(0)}, ${y1.toFixed(0)}) ${width.toFixed(0)}x${height.toFixed(0)}, clicking checkbox at (${checkboxX.toFixed(0)}, ${checkboxY.toFixed(0)})\n`,
        );
        return JSON.stringify({ t: "xy", x: checkboxX, y: checkboxY });
      }
    }
    const deeper = await walkForCfIframe(child, tab, cdp);
    if (deeper) return deeper;
  }
  return null;
}
369
513
 
370
514
  // Returns 'clear' | 'clicked' | 'needs-human'
@@ -389,9 +533,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
389
533
  return "needs-human";
390
534
  }
391
535
 
536
+ // Cloudflare Turnstile rendered inside a closed shadow root (e.g.
537
+ // chatgpt.com). detectVerificationChallenge now uses CDP-level
538
+ // DOM.getDocument({pierce:true}) to walk into the closed root and
539
+ // locate the iframe's screen-space bounding box. The result here is
540
+ // a normal {t:'xy',x,y} coordinate payload that flows through the
541
+ // regular click path. The historical "cf-closed-shadow-dom" sentinel
542
+ // is kept in VERIFY_DETECT_JS only as a safety net for unusual pages.
543
+
392
544
  // Perform human click on detected element
393
- const clicked = await tryHumanClick(tab, cdp, result);
394
- if (clicked) {
545
+ const clickResult = await tryHumanClick(tab, cdp, result);
546
+ if (clickResult === "clicked") {
395
547
  await new Promise((r) => setTimeout(r, 2000));
396
548
 
397
549
  // Retry loop — keep checking until cleared or timeout
@@ -417,5 +569,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
417
569
  return "needs-human";
418
570
  }
419
571
 
572
+ // Challenge was detected but we couldn't auto-click it (zero-dimension
573
+ // element, OOPIF without coordinates, etc.). Surface this rather than
574
+ // silently returning 'clear' — the caller would otherwise proceed and
575
+ // fail downstream on a selector that won't appear until the challenge
576
+ // is solved.
577
+ if (clickResult === "cant-click") {
578
+ process.stderr.write(
579
+ "[greedysearch] Verification challenge detected but cannot be auto-clicked — please solve it manually in the visible browser window.\n",
580
+ );
581
+ return "needs-human";
582
+ }
583
+
420
584
  return "clear";
421
585
  }
@@ -85,39 +85,45 @@ async function scrollToBottom(tab) {
85
85
  * the assistant's response copy button).
86
86
  */
87
87
  async function extractAnswerFromDom(tab) {
88
- const raw = await cdp([
89
- "eval",
90
- tab,
91
- String.raw`
92
- (() => {
93
- // The model-response element is a custom element <model-response>.
94
- // Its innerText starts with the "Gemini said" label in the
95
- // current locale; strip that prefix and return the rest.
96
- const resp = document.querySelector('model-response');
97
- if (!resp) return JSON.stringify({ answer: '', sources: [] });
98
- const text = (resp.innerText || resp.textContent || '').trim();
99
- // Strip the locale-specific "Gemini said" label prefix.
100
- // It varies ("Το Gemini είπε" in Greek, "Gemini said" in
101
- // English, etc.) so we just look for the first newline and
102
- // take what follows.
103
- const idx = text.indexOf('\n');
104
- const answer = idx >= 0 ? text.slice(idx + 1).trim() : text;
105
- if (!answer) return JSON.stringify({ answer: '', sources: [] });
106
- // Extract source links from the response.
107
- const seen = new Set();
108
- const sources = [];
109
- for (const link of resp.querySelectorAll('a[href]')) {
110
- const url = link.href;
111
- if (!url || seen.has(url)) continue;
112
- seen.add(url);
113
- const title = (link.innerText || link.textContent || '').replace(/\s+/g, ' ').trim();
114
- sources.push({ title, url });
115
- if (sources.length >= 10) break;
116
- }
117
- return JSON.stringify({ answer, sources });
118
- })()
119
- `,
120
- ]);
88
+ const raw = await cdp(
89
+ [
90
+ "eval",
91
+ tab,
92
+ String.raw`
93
+ new Promise((resolve) => {
94
+ const _deadline = Date.now() + 6000;
95
+ function _tryExtract() {
96
+ const resp = document.querySelector('model-response');
97
+ if (resp) {
98
+ const text = (resp.innerText || resp.textContent || '').trim();
99
+ const idx = text.indexOf('\n');
100
+ const answer = idx >= 0 ? text.slice(idx + 1).trim() : text;
101
+ if (answer) {
102
+ const seen = new Set();
103
+ const sources = [];
104
+ for (const link of resp.querySelectorAll('a[href]')) {
105
+ const url = link.href;
106
+ if (!url || seen.has(url)) continue;
107
+ seen.add(url);
108
+ const title = (link.innerText || link.textContent || '').replace(/\s+/g, ' ').trim();
109
+ sources.push({ title, url });
110
+ if (sources.length >= 10) break;
111
+ }
112
+ return resolve(JSON.stringify({ answer, sources }));
113
+ }
114
+ }
115
+ if (Date.now() < _deadline) {
116
+ setTimeout(_tryExtract, 500);
117
+ } else {
118
+ resolve(JSON.stringify({ answer: '', sources: [] }));
119
+ }
120
+ }
121
+ _tryExtract();
122
+ })
123
+ `,
124
+ ],
125
+ 8000,
126
+ );
121
127
  try {
122
128
  return JSON.parse(raw);
123
129
  } catch {
@@ -133,6 +139,11 @@ async function extractAnswer(tab, query = "") {
133
139
  // count >= 2, which is unreliable: the Gemini UI has many copy
134
140
  // icons (copy link, copy code, etc.), and the last one on the page
135
141
  // is not always the assistant response copy button.
142
+ //
143
+ // minLength: 60 — Gemini renders a streaming header/prefix
144
+ // ("Gemini said" + UI chrome = ~25 chars) before the body arrives.
145
+ // The old 20-char threshold often resolved at the header stage and
146
+ // the copy button click then captured a partial/header-only result.
136
147
  let modelReady = false;
137
148
  const modelDeadline = Date.now() + 12000;
138
149
  while (Date.now() < modelDeadline) {
@@ -146,10 +157,10 @@ async function extractAnswer(tab, query = "") {
146
157
  // Must have content beyond the locale-specific label
147
158
  // ("Gemini said" / "Το Gemini είπε" / etc.) and ideally
148
159
  // a copy button rendered on the response.
149
- return t.length > 20;
160
+ return t.length > 60;
150
161
  })()`,
151
162
  ]);
152
- if (ready === true) {
163
+ if (ready === "true") {
153
164
  modelReady = true;
154
165
  break;
155
166
  }
@@ -310,7 +321,11 @@ async function main() {
310
321
  if (++pollTick % 10 === 0) scrollToBottom(tab).catch(() => null);
311
322
  }, 6000);
312
323
  try {
313
- await waitForStreamComplete(tab, { timeout: 45000, minLength: 50 });
324
+ await waitForStreamComplete(tab, {
325
+ timeout: 45000,
326
+ stableRounds: 5,
327
+ minLength: 60,
328
+ });
314
329
  } finally {
315
330
  clearInterval(scrollInterval);
316
331
  }
@@ -1,128 +1,129 @@
1
- #!/usr/bin/env node
2
-
3
- // extractors/google-ai.mjs
4
- // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
5
- //
6
- // Usage:
7
- // node extractors/google-ai.mjs "<query>" [--tab <prefix>]
8
- //
9
- // Output (stdout): JSON { answer, sources, query, url }
10
- // Errors go to stderr only — stdout is always clean JSON for piping.
11
-
12
- import {
13
- cdp,
14
- formatAnswer,
15
- getOrOpenTab,
16
- handleError,
17
- jitter,
18
- outputJson,
19
- parseArgs,
20
- prepareArgs,
21
- TIMING,
22
- validateQuery,
23
- waitForStreamComplete,
24
- } from "./common.mjs";
25
- import { dismissConsent, handleVerification } from "./consent.mjs";
26
- import { SELECTORS } from "./selectors.mjs";
27
-
28
- const S = SELECTORS.google;
29
-
30
- const MIN_ANSWER_LENGTH = 50;
31
-
32
- async function extractAnswer(tab) {
33
- const excludeFilter = S.sourceExclude
34
- .map((e) => `!a.href.includes('${e}')`)
35
- .join(" && ");
36
- const raw = await cdp([
37
- "eval",
38
- tab,
39
- String.raw`
40
- (function() {
41
- var el = document.querySelector('${S.answerContainer}');
42
- if (!el) return JSON.stringify({ answer: '', sources: [] });
43
- var answer = el.innerText.trim();
44
- var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
45
- .filter(a => ${excludeFilter})
46
- .map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
47
- .filter(s => s.url && s.url.length > 10)
48
- .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
49
- .slice(0, 10);
50
- return JSON.stringify({ answer, sources });
51
- })()
52
- `,
53
- ]);
54
- return JSON.parse(raw);
55
- }
56
-
57
- // ============================================================================
58
- // Main
59
- // ============================================================================
60
-
61
- const USAGE =
62
- 'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
63
-
64
- async function main() {
65
- const args = await prepareArgs(process.argv.slice(2));
66
- validateQuery(args, USAGE);
67
-
68
- const { query, tabPrefix, short, locale } = parseArgs(args);
69
-
70
- try {
71
- // Only refresh page list when creating a fresh tab (no prefix provided)
72
- if (!tabPrefix) await cdp(["list"]);
73
- const tab = await getOrOpenTab(tabPrefix);
74
-
75
- // Build URL with language parameter (default to English)
76
- const langParam = locale ? `&hl=${encodeURIComponent(locale)}` : "&hl=en";
77
- const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50${langParam}`;
78
- await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
79
- await dismissConsent(tab, cdp);
80
-
81
- // If consent redirected us away, navigate back
82
- const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
83
- () => "",
84
- );
85
- if (!currentUrl.includes("google.com/search")) {
86
- await cdp(["nav", tab, url], 20000);
87
- await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
88
- }
89
-
90
- // Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
91
- const verifyResult = await handleVerification(tab, cdp, 10000);
92
- if (verifyResult === "needs-human")
93
- throw new Error(
94
- "Google verification required — could not be completed automatically",
95
- );
96
- if (verifyResult === "clicked" || verifyResult === "cleared-by-user") {
97
- // Re-navigate to the search URL after verification
98
- await cdp(["nav", tab, url], 20000);
99
- await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
100
- }
101
-
102
- await waitForStreamComplete(tab, {
103
- timeout: 30000,
104
- selector: `document.querySelector('${S.answerContainer}')`,
105
- minLength: MIN_ANSWER_LENGTH,
106
- });
107
-
108
- const { answer, sources } = await extractAnswer(tab);
109
- if (!answer)
110
- throw new Error(
111
- "No answer extracted — Google AI Mode may not have responded",
112
- );
113
-
114
- const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
115
- () => url,
116
- );
117
- outputJson({
118
- query,
119
- url: finalUrl,
120
- answer: formatAnswer(answer, short),
121
- sources,
122
- });
123
- } catch (e) {
124
- handleError(e);
125
- }
126
- }
127
-
128
- main();
1
+ #!/usr/bin/env node
2
+
3
+ // extractors/google-ai.mjs
4
+ // Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
5
+ //
6
+ // Usage:
7
+ // node extractors/google-ai.mjs "<query>" [--tab <prefix>]
8
+ //
9
+ // Output (stdout): JSON { answer, sources, query, url }
10
+ // Errors go to stderr only — stdout is always clean JSON for piping.
11
+
12
+ import {
13
+ cdp,
14
+ formatAnswer,
15
+ getOrOpenTab,
16
+ handleError,
17
+ jitter,
18
+ outputJson,
19
+ parseArgs,
20
+ prepareArgs,
21
+ TIMING,
22
+ validateQuery,
23
+ waitForStreamComplete,
24
+ } from "./common.mjs";
25
+ import { dismissConsent, handleVerification } from "./consent.mjs";
26
+ import { SELECTORS } from "./selectors.mjs";
27
+
28
+ const S = SELECTORS.google;
29
+
30
+ const MIN_ANSWER_LENGTH = 50;
31
+
32
/**
 * Evaluate a small script in the page that reads the answer container's
 * text and collects up to ten de-duplicated source links, then parse the
 * JSON text it returns.
 *
 * @param {string} tab - Tab identifier passed to `cdp`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 * @throws {SyntaxError} When the evaluated script's result is not valid JSON.
 */
async function extractAnswer(tab) {
  // One `!a.href.includes('…')` clause per excluded substring, AND-ed
  // together inside the in-page filter callback.
  const exclusionClauses = S.sourceExclude.map(
    (e) => `!a.href.includes('${e}')`,
  );
  const excludeFilter = exclusionClauses.join(" && ");

  const pageScript = String.raw`
    (function() {
      var el = document.querySelector('${S.answerContainer}');
      if (!el) return JSON.stringify({ answer: '', sources: [] });
      var answer = el.innerText.trim();
      var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
        .filter(a => ${excludeFilter})
        .map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
        .filter(s => s.url && s.url.length > 10)
        .filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
        .slice(0, 10);
      return JSON.stringify({ answer, sources });
    })()
  `;

  const payload = await cdp(["eval", tab, pageScript]);
  return JSON.parse(payload);
}
56
+
57
+ // ============================================================================
58
+ // Main
59
+ // ============================================================================
60
+
61
+ const USAGE =
62
+ 'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
63
+
64
/**
 * CLI entry point: run one Google AI Mode query in a browser tab and emit
 * the extracted answer and sources via outputJson.
 *
 * Flow: resolve the tab, load the search URL, dismiss consent, recover if
 * consent moved us off the results page, handle any verification
 * challenge, wait for the answer to finish streaming, then extract it.
 * Any error raised inside the try block is delegated to handleError.
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short, locale } = parseArgs(args);

  try {
    // Only refresh page list when creating a fresh tab (no prefix provided)
    if (!tabPrefix) await cdp(["list"]);
    const tab = await getOrOpenTab(tabPrefix);

    // Build URL with language parameter (default to English)
    const langParam = locale ? `&hl=${encodeURIComponent(locale)}` : "&hl=en";
    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50${langParam}`;

    // Load the results page for THIS query before the post-navigation
    // delay. Without this, a reused tab that is already on some
    // google.com/search page passes the URL check below and is never
    // navigated, and consent dismissal runs on whatever page the tab held.
    await cdp(["nav", tab, url], 20000);
    await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    await dismissConsent(tab, cdp);

    // If consent redirected us away, navigate back
    const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => "",
    );
    if (!currentUrl.includes("google.com/search")) {
      await cdp(["nav", tab, url], 20000);
      await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    }

    // Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
    const verifyResult = await handleVerification(tab, cdp, 10000);
    if (verifyResult === "needs-human")
      throw new Error(
        "Google verification required — could not be completed automatically",
      );
    if (verifyResult === "clicked" || verifyResult === "cleared-by-user") {
      // Re-navigate to the search URL after verification
      await cdp(["nav", tab, url], 20000);
      await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
    }

    await waitForStreamComplete(tab, {
      timeout: 30000,
      stableRounds: 5,
      selector: `document.querySelector('${S.answerContainer}')`,
      minLength: MIN_ANSWER_LENGTH,
    });

    const { answer, sources } = await extractAnswer(tab);
    if (!answer)
      throw new Error(
        "No answer extracted — Google AI Mode may not have responded",
      );

    // Report the URL the tab actually ended on; fall back to the one we
    // requested if it cannot be read.
    const finalUrl = await cdp(["eval", tab, "document.location.href"]).catch(
      () => url,
    );
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}
128
+
129
+ main();