@apmantza/greedysearch-pi 2.0.0 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -3
- package/README.md +2 -2
- package/bin/search.mjs +121 -13
- package/extractors/bing-copilot.mjs +6 -14
- package/extractors/chatgpt.mjs +130 -13
- package/extractors/common.mjs +58 -1
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +51 -36
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +68 -6
- package/extractors/perplexity.mjs +547 -217
- package/package.json +2 -2
- package/skills/greedy-search/skill.md +20 -18
- package/src/fetcher.mjs +15 -0
- package/src/formatters/results.ts +24 -2
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/constants.mjs +5 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +25 -3
- package/src/search/research.mjs +366 -7
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/tools/greedy-search-handler.ts +8 -10
- package/src/tools/shared.ts +145 -20
- package/test.mjs +160 -12
package/extractors/consent.mjs
CHANGED
|
@@ -57,12 +57,24 @@ const VERIFY_DETECT_JS = `
|
|
|
57
57
|
|
|
58
58
|
// --- Cloudflare Turnstile widget inside closed shadow DOM (Copilot, etc.) ---
|
|
59
59
|
// The iframe is not queryable from main document, but the host container
|
|
60
|
-
// (#cf-turnstile) and the hidden response input are.
|
|
61
|
-
|
|
60
|
+
// (#cf-turnstile) and the hidden response input are. When only the
|
|
61
|
+
// hidden response input matches (no #cf-turnstile host and no visible
|
|
62
|
+
// iframe), the actual challenge widget is rendered inside a closed
|
|
63
|
+
// shadow DOM and cannot be auto-clicked. Return a sentinel so callers
|
|
64
|
+
// know to surface this as needs-human verification instead of wasting
|
|
65
|
+
// time on a doomed waitForSelector.
|
|
66
|
+
var cfTurnstileHost = document.querySelector('#cf-turnstile');
|
|
62
67
|
if (cfTurnstileHost) {
|
|
63
68
|
var r2 = cfTurnstileHost.getBoundingClientRect();
|
|
64
69
|
return JSON.stringify({t:'xy',x:r2.left+r2.width/2,y:r2.top+r2.height/2});
|
|
65
70
|
}
|
|
71
|
+
// Hidden cf-chl-widget-*_response input present but no visible host:
|
|
72
|
+
// the widget is in closed shadow DOM. Signal this so handleVerification
|
|
73
|
+
// can return 'needs-human' rather than 'clear'.
|
|
74
|
+
var cfResponseInput = document.querySelector('input[name="cf-turnstile-response"], [id^="cf-chl-widget-"][id$="_response"]');
|
|
75
|
+
if (cfResponseInput && cfResponseInput.value === '') {
|
|
76
|
+
return 'cf-closed-shadow-dom';
|
|
77
|
+
}
|
|
66
78
|
|
|
67
79
|
// --- Cloudflare challenge page ---
|
|
68
80
|
var cfCheckbox = document.querySelector('#cf-stage input[type="checkbox"], .ctp-checkbox-container input');
|
|
@@ -77,15 +89,28 @@ const VERIFY_DETECT_JS = `
|
|
|
77
89
|
}
|
|
78
90
|
|
|
79
91
|
// --- Generic verify/continue/proceed buttons (catch-all) ---
|
|
80
|
-
// IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google"
|
|
92
|
+
// IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google",
|
|
93
|
+
// "Continue with email", "Login or sign up for free"). These appear on
|
|
94
|
+
// many sites (Perplexity, ChatGPT, etc.) when the user isn't logged in,
|
|
95
|
+
// and clicking them triggers a sign-in flow that takes us to a login
|
|
96
|
+
// wall — a much worse outcome than the original search failure we were
|
|
97
|
+
// trying to recover from. The exclusion list must cover both OAuth
|
|
98
|
+
// providers AND generic "sign in / log in / with email" patterns.
|
|
81
99
|
var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
|
|
82
100
|
var verify = btns.find(b => {
|
|
83
101
|
var t = (b.innerText?.trim() || b.value || '').toLowerCase();
|
|
84
|
-
var isVerifyLike = (t
|
|
102
|
+
var isVerifyLike = (t === 'continue' || t === 'proceed' || t === 'next' ||
|
|
103
|
+
t.startsWith('verify ') || t.startsWith('human ') || t === 'i am human' || t.includes('robot check')) &&
|
|
85
104
|
!t.includes('verified') && !document.querySelector('iframe[src*="recaptcha"]');
|
|
86
105
|
if (!isVerifyLike) return false;
|
|
87
106
|
// Exclude OAuth / sign-in buttons to prevent accidental login flows
|
|
88
|
-
|
|
107
|
+
// — covers "Continue with Google", "Continue with Apple", "Continue
|
|
108
|
+
// with email", "Login or sign up", "Log in", "Sign in", "Sign up",
|
|
109
|
+
// "Single sign-on", and the visible panel "Login or sign up for free"
|
|
110
|
+
// text. The previous list missed "email" and "sso" which let the
|
|
111
|
+
// auto-click land on the email/SSO sign-in buttons on Perplexity's
|
|
112
|
+
// anonymous-mode homepage, navigating us into a login flow.
|
|
113
|
+
var isSignIn = new RegExp("sign.?in|log.?in|sign.?up|with\\s+(google|apple|email|github|facebook|microsoft|sso)|sso|auth", "i").test(t);
|
|
89
114
|
return !isSignIn;
|
|
90
115
|
});
|
|
91
116
|
if (verify) { verify.setAttribute('data-gs-verify','1'); return JSON.stringify({t:'sel',s:'[data-gs-verify="1"]',txt:verify.innerText?.trim()||verify.value}); }
|
|
@@ -327,16 +352,23 @@ export async function humanClickElement(tab, cdpFn, selector) {
|
|
|
327
352
|
|
|
328
353
|
/**
|
|
329
354
|
* Parse a detection result and perform a human click if it found something.
|
|
330
|
-
*
|
|
355
|
+
*
|
|
356
|
+
* Returns a tristate string:
|
|
357
|
+
* - 'clicked' — a click was successfully dispatched
|
|
358
|
+
* - 'cant-click' — challenge was detected but we couldn't click it
|
|
359
|
+
* (zero-dimension element, OOPIF in closed shadow DOM, etc.)
|
|
360
|
+
* Caller should treat this as needs-human verification.
|
|
361
|
+
* - 'no-challenge' — no challenge detected, nothing to click
|
|
331
362
|
*/
|
|
332
|
-
|
|
363
|
+
function tryHumanClick(tab, cdp, detectResult) {
|
|
333
364
|
if (
|
|
334
365
|
!detectResult ||
|
|
335
366
|
detectResult === "null" ||
|
|
336
367
|
detectResult === "cleared" ||
|
|
337
|
-
detectResult === "still-verifying"
|
|
368
|
+
detectResult === "still-verifying" ||
|
|
369
|
+
detectResult === "cf-closed-shadow-dom"
|
|
338
370
|
)
|
|
339
|
-
return
|
|
371
|
+
return Promise.resolve("no-challenge");
|
|
340
372
|
|
|
341
373
|
// JSON format: {t:"sel",s:"...",txt:"..."} or {t:"xy",x:...,y:...}
|
|
342
374
|
try {
|
|
@@ -345,26 +377,138 @@ async function tryHumanClick(tab, cdp, detectResult) {
|
|
|
345
377
|
process.stderr.write(
|
|
346
378
|
`[greedysearch] Human-clicking "${info.txt}" via CDP...\n`,
|
|
347
379
|
);
|
|
348
|
-
|
|
349
|
-
|
|
380
|
+
return humanClickElement(tab, cdp, info.s).then((r) =>
|
|
381
|
+
r !== null ? "clicked" : "cant-click",
|
|
382
|
+
);
|
|
350
383
|
}
|
|
351
384
|
if (info.t === "xy") {
|
|
352
385
|
// Skip zero/invalid coordinates — element is off-screen or not rendered
|
|
353
|
-
if (!info.x && !info.y) return
|
|
386
|
+
if (!info.x && !info.y) return Promise.resolve("cant-click");
|
|
354
387
|
process.stderr.write(
|
|
355
388
|
`[greedysearch] Human-clicking at (${info.x.toFixed(0)}, ${info.y.toFixed(0)})...\n`,
|
|
356
389
|
);
|
|
357
|
-
|
|
358
|
-
return true;
|
|
390
|
+
return humanClickXY(tab, cdp, info.x, info.y).then(() => "clicked");
|
|
359
391
|
}
|
|
360
392
|
} catch {}
|
|
361
393
|
|
|
362
|
-
return
|
|
394
|
+
return Promise.resolve("no-challenge");
|
|
363
395
|
}
|
|
364
396
|
|
|
365
397
|
/**
 * Probe a tab for a verification challenge that can be acted on.
 *
 * Two probes run in order:
 *   1. the CDP pierce probe, which can yield click coordinates for a
 *      Cloudflare iframe inside a closed shadow root;
 *   2. the in-page VERIFY_DETECT_JS probe, evaluated in the page context.
 *
 * A failure of either probe is treated as "nothing found" for that probe.
 *
 * @param {string} tab - tab identifier passed through to `cdp`.
 * @param {Function} cdp - async CDP command runner taking a command array.
 * @returns {Promise<string|null>} the first truthy probe result (the literal
 *   string "null" from the in-page probe counts as no result), else null.
 */
export async function detectVerificationChallenge(tab, cdp) {
  // Pierce probe first: its coordinates are clickable, whereas the in-page
  // probe can only report a sentinel for closed-shadow-DOM widgets.
  const piercedHit = await findCloudflareIframeViaPierce(tab, cdp).catch(
    () => null,
  );
  if (piercedHit) {
    return piercedHit;
  }

  const pageProbe = await cdp(["eval", tab, VERIFY_DETECT_JS]).catch(
    () => null,
  );
  return pageProbe && pageProbe !== "null" ? pageProbe : null;
}
|
|
413
|
+
|
|
414
|
+
/**
 * Walk the page DOM with pierce:true to locate a Cloudflare Turnstile
 * iframe that's hidden inside a closed shadow root. Returns JSON of the
 * shape `{t:'xy', x, y}` matching the main-document probe's convention,
 * OR null if nothing was found.
 *
 * The returned coords target the **checkbox area** of the Turnstile widget
 * (left ~25% of the 300x65 iframe, vertical center) rather than the
 * iframe's geometric center, because the visible "Verify you are human"
 * checkbox sits there in the standard widget layout.
 *
 * Every failure mode resolves to null instead of rejecting: `cdp` not being
 * a function, a rejected CDP call, an empty or unparseable response, a
 * response that parses to a non-object (e.g. the string "null"), an
 * `error` field in the response, or a response without a root node.
 *
 * @param {string} tab - tab identifier passed through to `cdp`.
 * @param {Function} cdp - async CDP command runner taking a command array;
 *   "evalraw" calls are expected to resolve to a JSON string.
 * @returns {Promise<string|null>} JSON coordinate payload, or null.
 */
async function findCloudflareIframeViaPierce(tab, cdp) {
  if (typeof cdp !== "function") return null;

  // Step 1: enable the DOM domain; a failure here is ignored and the
  // getDocument call below decides whether the probe can proceed.
  await cdp(["evalraw", tab, "DOM.enable", "{}"]).catch(() => {});

  // Step 2: get the full DOM tree with pierce — walks closed shadow roots
  const doc = await cdp([
    "evalraw",
    tab,
    "DOM.getDocument",
    JSON.stringify({ depth: -1, pierce: true }),
  ]).catch(() => null);
  if (!doc) return null;

  let docParsed;
  try {
    docParsed = JSON.parse(doc);
  } catch {
    return null;
  }
  // JSON.parse succeeds on "null", numbers and strings; property access on
  // null would throw, so reject anything that is not an object first.
  if (docParsed === null || typeof docParsed !== "object") return null;
  if (docParsed.error) return null;

  // Accept both a bare payload and one wrapped in `result`, mirroring how
  // walkForCfIframe reads the DOM.getBoxModel response.
  const root = docParsed.root ?? docParsed.result?.root;
  if (!root) return null;

  // Step 3: recursive walk looking for an iframe whose src points at
  // challenges.cloudflare.com / turnstile
  return walkForCfIframe(root, tab, cdp);
}
|
|
450
|
+
|
|
451
|
+
/**
 * Read one attribute from a flat `[name, value, name, value, ...]` list.
 *
 * Only even positions are treated as names, so a value that happens to
 * equal `name` (e.g. title="src") can never be mistaken for the attribute.
 *
 * @param {string[]} attrs - flat attribute list.
 * @param {string} name - attribute name to look up (exact match).
 * @returns {string} the attribute value, or "" when absent.
 */
function readFlatAttribute(attrs, name) {
  for (let i = 0; i + 1 < attrs.length; i += 2) {
    if (attrs[i] === name) return attrs[i + 1];
  }
  return "";
}

/**
 * Depth-first search of a DOM.getDocument node tree for a Cloudflare
 * Turnstile iframe, returning click coordinates for its checkbox.
 *
 * At each level the node's shadow roots are visited before its regular
 * children. An IFRAME qualifies when its `src` matches
 * challenges.cloudflare.com or "turnstile" and it carries a backendNodeId;
 * its box is then fetched with DOM.getBoxModel. Candidates whose box
 * cannot be fetched or parsed, or is smaller than 50x20, are skipped and
 * the search continues with the next sibling.
 *
 * @param {object|null|undefined} node - node from the DOM.getDocument tree.
 * @param {string} tab - tab identifier passed through to `cdp`.
 * @param {Function} cdp - async CDP command runner taking a command array;
 *   the DOM.getBoxModel call is expected to resolve to a JSON string.
 * @returns {Promise<string|null>} JSON `{t:"xy",x,y}` for the first usable
 *   iframe, or null when none is found.
 */
async function walkForCfIframe(node, tab, cdp) {
  if (!node) return null;
  const children = [];
  if (node.shadowRoots && node.shadowRoots.length > 0) {
    children.push(...node.shadowRoots);
  }
  if (node.children) {
    children.push(...node.children);
  }
  for (const child of children) {
    if (child.nodeName === "IFRAME") {
      const src = readFlatAttribute(child.attributes || [], "src");
      if (
        src &&
        /challenges\.cloudflare\.com|turnstile/i.test(src) &&
        child.backendNodeId
      ) {
        // Get bounding box via DOM.getBoxModel
        const boxRes = await cdp([
          "evalraw",
          tab,
          "DOM.getBoxModel",
          JSON.stringify({ backendNodeId: child.backendNodeId }),
        ]).catch(() => null);
        if (!boxRes) continue;
        let boxParsed;
        try {
          boxParsed = JSON.parse(boxRes);
        } catch {
          continue;
        }
        const content =
          boxParsed?.model?.content || boxParsed?.result?.model?.content;
        if (!content || content.length < 8) continue;
        // content = [x1, y1, x2, y2, x3, y3, x4, y4]; corners 1 and 3 are
        // used as opposite corners of the box.
        const x1 = content[0];
        const y1 = content[1];
        const x3 = content[4];
        const y3 = content[5];
        const width = x3 - x1;
        const height = y3 - y1;
        // Skip degenerate boxes (hidden iframes)
        if (width < 50 || height < 20) continue;
        // Click the checkbox: standard CF widget is 300x65 with the
        // checkbox centered at ~25% width, 50% height.
        const checkboxX = x1 + width * 0.25;
        const checkboxY = y1 + height * 0.5;
        process.stderr.write(
          `[greedysearch] Found CF iframe via CDP pierce at (${x1.toFixed(0)}, ${y1.toFixed(0)}) ${width.toFixed(0)}x${height.toFixed(0)}, clicking checkbox at (${checkboxX.toFixed(0)}, ${checkboxY.toFixed(0)})\n`,
        );
        return JSON.stringify({ t: "xy", x: checkboxX, y: checkboxY });
      }
    }
    const deeper = await walkForCfIframe(child, tab, cdp);
    if (deeper) return deeper;
  }
  return null;
}
|
|
369
513
|
|
|
370
514
|
// Returns 'clear' | 'clicked' | 'needs-human'
|
|
@@ -389,9 +533,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
|
|
|
389
533
|
return "needs-human";
|
|
390
534
|
}
|
|
391
535
|
|
|
536
|
+
// Cloudflare Turnstile rendered inside a closed shadow root (e.g.
|
|
537
|
+
// chatgpt.com). detectVerificationChallenge now uses CDP-level
|
|
538
|
+
// DOM.getDocument({pierce:true}) to walk into the closed root and
|
|
539
|
+
// locate the iframe's screen-space bounding box. The result here is
|
|
540
|
+
// a normal {t:'xy',x,y} coordinate payload that flows through the
|
|
541
|
+
// regular click path. The historical "cf-closed-shadow-dom" sentinel
|
|
542
|
+
// is kept in VERIFY_DETECT_JS only as a safety net for unusual pages.
|
|
543
|
+
|
|
392
544
|
// Perform human click on detected element
|
|
393
|
-
const
|
|
394
|
-
if (clicked) {
|
|
545
|
+
const clickResult = await tryHumanClick(tab, cdp, result);
|
|
546
|
+
if (clickResult === "clicked") {
|
|
395
547
|
await new Promise((r) => setTimeout(r, 2000));
|
|
396
548
|
|
|
397
549
|
// Retry loop — keep checking until cleared or timeout
|
|
@@ -417,5 +569,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
|
|
|
417
569
|
return "needs-human";
|
|
418
570
|
}
|
|
419
571
|
|
|
572
|
+
// Challenge was detected but we couldn't auto-click it (zero-dimension
|
|
573
|
+
// element, OOPIF without coordinates, etc.). Surface this rather than
|
|
574
|
+
// silently returning 'clear' — the caller would otherwise proceed and
|
|
575
|
+
// fail downstream on a selector that won't appear until the challenge
|
|
576
|
+
// is solved.
|
|
577
|
+
if (clickResult === "cant-click") {
|
|
578
|
+
process.stderr.write(
|
|
579
|
+
"[greedysearch] Verification challenge detected but cannot be auto-clicked — please solve it manually in the visible browser window.\n",
|
|
580
|
+
);
|
|
581
|
+
return "needs-human";
|
|
582
|
+
}
|
|
583
|
+
|
|
420
584
|
return "clear";
|
|
421
585
|
}
|
package/extractors/gemini.mjs
CHANGED
|
@@ -85,39 +85,45 @@ async function scrollToBottom(tab) {
|
|
|
85
85
|
* the assistant's response copy button).
|
|
86
86
|
*/
|
|
87
87
|
async function extractAnswerFromDom(tab) {
|
|
88
|
-
const raw = await cdp(
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
88
|
+
const raw = await cdp(
|
|
89
|
+
[
|
|
90
|
+
"eval",
|
|
91
|
+
tab,
|
|
92
|
+
String.raw`
|
|
93
|
+
new Promise((resolve) => {
|
|
94
|
+
const _deadline = Date.now() + 6000;
|
|
95
|
+
function _tryExtract() {
|
|
96
|
+
const resp = document.querySelector('model-response');
|
|
97
|
+
if (resp) {
|
|
98
|
+
const text = (resp.innerText || resp.textContent || '').trim();
|
|
99
|
+
const idx = text.indexOf('\n');
|
|
100
|
+
const answer = idx >= 0 ? text.slice(idx + 1).trim() : text;
|
|
101
|
+
if (answer) {
|
|
102
|
+
const seen = new Set();
|
|
103
|
+
const sources = [];
|
|
104
|
+
for (const link of resp.querySelectorAll('a[href]')) {
|
|
105
|
+
const url = link.href;
|
|
106
|
+
if (!url || seen.has(url)) continue;
|
|
107
|
+
seen.add(url);
|
|
108
|
+
const title = (link.innerText || link.textContent || '').replace(/\s+/g, ' ').trim();
|
|
109
|
+
sources.push({ title, url });
|
|
110
|
+
if (sources.length >= 10) break;
|
|
111
|
+
}
|
|
112
|
+
return resolve(JSON.stringify({ answer, sources }));
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
if (Date.now() < _deadline) {
|
|
116
|
+
setTimeout(_tryExtract, 500);
|
|
117
|
+
} else {
|
|
118
|
+
resolve(JSON.stringify({ answer: '', sources: [] }));
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
_tryExtract();
|
|
122
|
+
})
|
|
123
|
+
`,
|
|
124
|
+
],
|
|
125
|
+
8000,
|
|
126
|
+
);
|
|
121
127
|
try {
|
|
122
128
|
return JSON.parse(raw);
|
|
123
129
|
} catch {
|
|
@@ -133,6 +139,11 @@ async function extractAnswer(tab, query = "") {
|
|
|
133
139
|
// count >= 2, which is unreliable: the Gemini UI has many copy
|
|
134
140
|
// icons (copy link, copy code, etc.), and the last one on the page
|
|
135
141
|
// is not always the assistant response copy button.
|
|
142
|
+
//
|
|
143
|
+
// minLength: 60 — Gemini renders a streaming header/prefix
|
|
144
|
+
// ("Gemini said" + UI chrome = ~25 chars) before the body arrives.
|
|
145
|
+
// The old 20-char threshold often resolved at the header stage and
|
|
146
|
+
// the copy button click then captured a partial/header-only result.
|
|
136
147
|
let modelReady = false;
|
|
137
148
|
const modelDeadline = Date.now() + 12000;
|
|
138
149
|
while (Date.now() < modelDeadline) {
|
|
@@ -146,10 +157,10 @@ async function extractAnswer(tab, query = "") {
|
|
|
146
157
|
// Must have content beyond the locale-specific label
|
|
147
158
|
// ("Gemini said" / "Το Gemini είπε" / etc.) and ideally
|
|
148
159
|
// a copy button rendered on the response.
|
|
149
|
-
return t.length >
|
|
160
|
+
return t.length > 60;
|
|
150
161
|
})()`,
|
|
151
162
|
]);
|
|
152
|
-
if (ready === true) {
|
|
163
|
+
if (ready === "true") {
|
|
153
164
|
modelReady = true;
|
|
154
165
|
break;
|
|
155
166
|
}
|
|
@@ -310,7 +321,11 @@ async function main() {
|
|
|
310
321
|
if (++pollTick % 10 === 0) scrollToBottom(tab).catch(() => null);
|
|
311
322
|
}, 6000);
|
|
312
323
|
try {
|
|
313
|
-
await waitForStreamComplete(tab, {
|
|
324
|
+
await waitForStreamComplete(tab, {
|
|
325
|
+
timeout: 45000,
|
|
326
|
+
stableRounds: 5,
|
|
327
|
+
minLength: 60,
|
|
328
|
+
});
|
|
314
329
|
} finally {
|
|
315
330
|
clearInterval(scrollInterval);
|
|
316
331
|
}
|
package/extractors/google-ai.mjs
CHANGED
|
@@ -1,128 +1,129 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// extractors/google-ai.mjs
|
|
4
|
-
// Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
|
|
5
|
-
//
|
|
6
|
-
// Usage:
|
|
7
|
-
// node extractors/google-ai.mjs "<query>" [--tab <prefix>]
|
|
8
|
-
//
|
|
9
|
-
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
-
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
-
|
|
12
|
-
import {
|
|
13
|
-
cdp,
|
|
14
|
-
formatAnswer,
|
|
15
|
-
getOrOpenTab,
|
|
16
|
-
handleError,
|
|
17
|
-
jitter,
|
|
18
|
-
outputJson,
|
|
19
|
-
parseArgs,
|
|
20
|
-
prepareArgs,
|
|
21
|
-
TIMING,
|
|
22
|
-
validateQuery,
|
|
23
|
-
waitForStreamComplete,
|
|
24
|
-
} from "./common.mjs";
|
|
25
|
-
import { dismissConsent, handleVerification } from "./consent.mjs";
|
|
26
|
-
import { SELECTORS } from "./selectors.mjs";
|
|
27
|
-
|
|
28
|
-
const S = SELECTORS.google;
|
|
29
|
-
|
|
30
|
-
const MIN_ANSWER_LENGTH = 50;
|
|
31
|
-
|
|
32
|
-
async function extractAnswer(tab) {
|
|
33
|
-
const excludeFilter = S.sourceExclude
|
|
34
|
-
.map((e) => `!a.href.includes('${e}')`)
|
|
35
|
-
.join(" && ");
|
|
36
|
-
const raw = await cdp([
|
|
37
|
-
"eval",
|
|
38
|
-
tab,
|
|
39
|
-
String.raw`
|
|
40
|
-
(function() {
|
|
41
|
-
var el = document.querySelector('${S.answerContainer}');
|
|
42
|
-
if (!el) return JSON.stringify({ answer: '', sources: [] });
|
|
43
|
-
var answer = el.innerText.trim();
|
|
44
|
-
var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
|
|
45
|
-
.filter(a => ${excludeFilter})
|
|
46
|
-
.map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
|
|
47
|
-
.filter(s => s.url && s.url.length > 10)
|
|
48
|
-
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
|
|
49
|
-
.slice(0, 10);
|
|
50
|
-
return JSON.stringify({ answer, sources });
|
|
51
|
-
})()
|
|
52
|
-
`,
|
|
53
|
-
]);
|
|
54
|
-
return JSON.parse(raw);
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
// ============================================================================
|
|
58
|
-
// Main
|
|
59
|
-
// ============================================================================
|
|
60
|
-
|
|
61
|
-
const USAGE =
|
|
62
|
-
'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
|
|
63
|
-
|
|
64
|
-
async function main() {
|
|
65
|
-
const args = await prepareArgs(process.argv.slice(2));
|
|
66
|
-
validateQuery(args, USAGE);
|
|
67
|
-
|
|
68
|
-
const { query, tabPrefix, short, locale } = parseArgs(args);
|
|
69
|
-
|
|
70
|
-
try {
|
|
71
|
-
// Only refresh page list when creating a fresh tab (no prefix provided)
|
|
72
|
-
if (!tabPrefix) await cdp(["list"]);
|
|
73
|
-
const tab = await getOrOpenTab(tabPrefix);
|
|
74
|
-
|
|
75
|
-
// Build URL with language parameter (default to English)
|
|
76
|
-
const langParam = locale ? `&hl=${encodeURIComponent(locale)}` : "&hl=en";
|
|
77
|
-
const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50${langParam}`;
|
|
78
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
79
|
-
await dismissConsent(tab, cdp);
|
|
80
|
-
|
|
81
|
-
// If consent redirected us away, navigate back
|
|
82
|
-
const currentUrl = await cdp(["eval", tab, "document.location.href"]).catch(
|
|
83
|
-
() => "",
|
|
84
|
-
);
|
|
85
|
-
if (!currentUrl.includes("google.com/search")) {
|
|
86
|
-
await cdp(["nav", tab, url], 20000);
|
|
87
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
// Handle "verify you're human" — auto-click simple buttons, wait for user on hard CAPTCHA
|
|
91
|
-
const verifyResult = await handleVerification(tab, cdp, 10000);
|
|
92
|
-
if (verifyResult === "needs-human")
|
|
93
|
-
throw new Error(
|
|
94
|
-
"Google verification required — could not be completed automatically",
|
|
95
|
-
);
|
|
96
|
-
if (verifyResult === "clicked" || verifyResult === "cleared-by-user") {
|
|
97
|
-
// Re-navigate to the search URL after verification
|
|
98
|
-
await cdp(["nav", tab, url], 20000);
|
|
99
|
-
await new Promise((r) => setTimeout(r, jitter(TIMING.postNav)));
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
await waitForStreamComplete(tab, {
|
|
103
|
-
timeout: 30000,
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// extractors/google-ai.mjs
|
|
4
|
+
// Navigate Google AI Mode (udm=50), wait for answer, return clean answer + sources.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node extractors/google-ai.mjs "<query>" [--tab <prefix>]
|
|
8
|
+
//
|
|
9
|
+
// Output (stdout): JSON { answer, sources, query, url }
|
|
10
|
+
// Errors go to stderr only — stdout is always clean JSON for piping.
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
cdp,
|
|
14
|
+
formatAnswer,
|
|
15
|
+
getOrOpenTab,
|
|
16
|
+
handleError,
|
|
17
|
+
jitter,
|
|
18
|
+
outputJson,
|
|
19
|
+
parseArgs,
|
|
20
|
+
prepareArgs,
|
|
21
|
+
TIMING,
|
|
22
|
+
validateQuery,
|
|
23
|
+
waitForStreamComplete,
|
|
24
|
+
} from "./common.mjs";
|
|
25
|
+
import { dismissConsent, handleVerification } from "./consent.mjs";
|
|
26
|
+
import { SELECTORS } from "./selectors.mjs";
|
|
27
|
+
|
|
28
|
+
const S = SELECTORS.google;
|
|
29
|
+
|
|
30
|
+
const MIN_ANSWER_LENGTH = 50;
|
|
31
|
+
|
|
32
|
+
/**
 * Read the rendered answer and its source links out of the page.
 *
 * Builds a script from the Google selector set `S`, evaluates it in the
 * tab, and parses the JSON string the script returns. The in-page script
 * takes the answer container's trimmed innerText, collects source links
 * (dropping any whose href contains one of `S.sourceExclude`), strips URL
 * fragments, de-duplicates by URL and keeps at most 10 entries with titles
 * capped at 100 characters. When the container is missing it yields an
 * empty answer and no sources.
 *
 * @param {string} tab - tab identifier passed to `cdp`.
 * @returns {Promise<{answer: string, sources: Array<{url: string, title: string}>}>}
 * @throws {SyntaxError} when the eval result is not valid JSON.
 */
async function extractAnswer(tab) {
  const hrefChecks = [];
  for (const fragment of S.sourceExclude) {
    hrefChecks.push(`!a.href.includes('${fragment}')`);
  }
  const excludeFilter = hrefChecks.join(" && ");

  // String.raw keeps the backslash in split('\n') so the page receives the
  // escape sequence rather than a literal line break inside the quotes.
  const pageScript = String.raw`
(function() {
var el = document.querySelector('${S.answerContainer}');
if (!el) return JSON.stringify({ answer: '', sources: [] });
var answer = el.innerText.trim();
var sources = Array.from(document.querySelectorAll('${S.sourceLink}'))
.filter(a => ${excludeFilter})
.map(a => ({ url: a.href.split('#')[0], title: (a.closest('${S.sourceHeadingParent}')?.querySelector('h3, [role=heading]')?.innerText || a.innerText?.trim().split('\n')[0] || '').slice(0, 100) }))
.filter(s => s.url && s.url.length > 10)
.filter((v, i, arr) => arr.findIndex(x => x.url === v.url) === i)
.slice(0, 10);
return JSON.stringify({ answer, sources });
})()
`;

  const serialized = await cdp(["eval", tab, pageScript]);
  return JSON.parse(serialized);
}
|
|
56
|
+
|
|
57
|
+
// ============================================================================
|
|
58
|
+
// Main
|
|
59
|
+
// ============================================================================
|
|
60
|
+
|
|
61
|
+
const USAGE =
|
|
62
|
+
'Usage: node extractors/google-ai.mjs "<query>" [--tab <prefix>]\n';
|
|
63
|
+
|
|
64
|
+
/**
 * CLI entry point: run one Google AI Mode query and print the result.
 *
 * Flow: parse arguments, obtain a tab, dismiss any consent dialog, make
 * sure the tab is on a Google search page, handle a verification
 * challenge, wait for the answer to stop changing, extract it, and emit
 * `{ query, url, answer, sources }` through `outputJson`. Any error raised
 * after argument parsing is routed to `handleError`.
 */
async function main() {
  const args = await prepareArgs(process.argv.slice(2));
  validateQuery(args, USAGE);

  const { query, tabPrefix, short, locale } = parseArgs(args);

  // A fresh jittered delay is computed on every call.
  const settle = () =>
    new Promise((done) => setTimeout(done, jitter(TIMING.postNav)));
  const readLocation = () => cdp(["eval", tab_, "document.location.href"]);
  let tab_;

  try {
    // Only refresh the page list when no tab prefix was supplied.
    if (!tabPrefix) {
      await cdp(["list"]);
    }
    tab_ = await getOrOpenTab(tabPrefix);
    const tab = tab_;

    // Language parameter defaults to English.
    const hl = locale ? encodeURIComponent(locale) : "en";
    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&udm=50&hl=${hl}`;

    // NOTE(review): nothing visible here navigates to `url` before this
    // pause; the only unconditional-looking nav is the fallback below, which
    // is skipped when the tab already shows a google.com/search page —
    // confirm a reused tab cannot keep a previous query's results.
    await settle();
    await dismissConsent(tab, cdp);

    // Navigate (back) to the search URL when the tab is elsewhere.
    const currentUrl = await readLocation().catch(() => "");
    if (!currentUrl.includes("google.com/search")) {
      await cdp(["nav", tab, url], 20000);
      await settle();
    }

    // "Verify you're human": simple buttons are auto-clicked; anything that
    // needs a person aborts the run.
    const verifyResult = await handleVerification(tab, cdp, 10000);
    if (verifyResult === "needs-human") {
      throw new Error(
        "Google verification required — could not be completed automatically",
      );
    }
    const verificationHappened =
      verifyResult === "clicked" || verifyResult === "cleared-by-user";
    if (verificationHappened) {
      // Reload the search URL after verification.
      await cdp(["nav", tab, url], 20000);
      await settle();
    }

    const streamOptions = {
      timeout: 30000,
      stableRounds: 5,
      selector: `document.querySelector('${S.answerContainer}')`,
      minLength: MIN_ANSWER_LENGTH,
    };
    await waitForStreamComplete(tab, streamOptions);

    const { answer, sources } = await extractAnswer(tab);
    if (!answer) {
      throw new Error(
        "No answer extracted — Google AI Mode may not have responded",
      );
    }

    // Fall back to the requested URL when the live one cannot be read.
    const finalUrl = await readLocation().catch(() => url);
    outputJson({
      query,
      url: finalUrl,
      answer: formatAnswer(answer, short),
      sources,
    });
  } catch (e) {
    handleError(e);
  }
}
|
|
128
|
+
|
|
129
|
+
main();
|