@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/extractors/consent.mjs
CHANGED
|
@@ -57,12 +57,24 @@ const VERIFY_DETECT_JS = `
|
|
|
57
57
|
|
|
58
58
|
// --- Cloudflare Turnstile widget inside closed shadow DOM (Copilot, etc.) ---
|
|
59
59
|
// The iframe is not queryable from main document, but the host container
|
|
60
|
-
// (#cf-turnstile) and the hidden response input are.
|
|
61
|
-
|
|
60
|
+
// (#cf-turnstile) and the hidden response input are. When only the
|
|
61
|
+
// hidden response input matches (no #cf-turnstile host and no visible
|
|
62
|
+
// iframe), the actual challenge widget is rendered inside a closed
|
|
63
|
+
// shadow DOM and cannot be auto-clicked. Return a sentinel so callers
|
|
64
|
+
// know to surface this as needs-human verification instead of wasting
|
|
65
|
+
// time on a doomed waitForSelector.
|
|
66
|
+
var cfTurnstileHost = document.querySelector('#cf-turnstile');
|
|
62
67
|
if (cfTurnstileHost) {
|
|
63
68
|
var r2 = cfTurnstileHost.getBoundingClientRect();
|
|
64
69
|
return JSON.stringify({t:'xy',x:r2.left+r2.width/2,y:r2.top+r2.height/2});
|
|
65
70
|
}
|
|
71
|
+
// Hidden cf-chl-widget-*_response input present but no visible host:
|
|
72
|
+
// the widget is in closed shadow DOM. Signal this so handleVerification
|
|
73
|
+
// can return 'needs-human' rather than 'clear'.
|
|
74
|
+
var cfResponseInput = document.querySelector('input[name="cf-turnstile-response"], [id^="cf-chl-widget-"][id$="_response"]');
|
|
75
|
+
if (cfResponseInput && cfResponseInput.value === '') {
|
|
76
|
+
return 'cf-closed-shadow-dom';
|
|
77
|
+
}
|
|
66
78
|
|
|
67
79
|
// --- Cloudflare challenge page ---
|
|
68
80
|
var cfCheckbox = document.querySelector('#cf-stage input[type="checkbox"], .ctp-checkbox-container input');
|
|
@@ -77,15 +89,28 @@ const VERIFY_DETECT_JS = `
|
|
|
77
89
|
}
|
|
78
90
|
|
|
79
91
|
// --- Generic verify/continue/proceed buttons (catch-all) ---
|
|
80
|
-
// IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google"
|
|
92
|
+
// IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google",
|
|
93
|
+
// "Continue with email", "Login or sign up for free"). These appear on
|
|
94
|
+
// many sites (Perplexity, ChatGPT, etc.) when the user isn't logged in,
|
|
95
|
+
// and clicking them triggers a sign-in flow that takes us to a login
|
|
96
|
+
// wall — a much worse outcome than the original search failure we were
|
|
97
|
+
// trying to recover from. The exclusion list must cover both OAuth
|
|
98
|
+
// providers AND generic "sign in / log in / with email" patterns.
|
|
81
99
|
var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
|
|
82
100
|
var verify = btns.find(b => {
|
|
83
101
|
var t = (b.innerText?.trim() || b.value || '').toLowerCase();
|
|
84
|
-
var isVerifyLike = (t
|
|
102
|
+
var isVerifyLike = (t === 'continue' || t === 'proceed' || t === 'next' ||
|
|
103
|
+
t.startsWith('verify ') || t.startsWith('human ') || t === 'i am human' || t.includes('robot check')) &&
|
|
85
104
|
!t.includes('verified') && !document.querySelector('iframe[src*="recaptcha"]');
|
|
86
105
|
if (!isVerifyLike) return false;
|
|
87
106
|
// Exclude OAuth / sign-in buttons to prevent accidental login flows
|
|
88
|
-
|
|
107
|
+
// — covers "Continue with Google", "Continue with Apple", "Continue
|
|
108
|
+
// with email", "Login or sign up", "Log in", "Sign in", "Sign up",
|
|
109
|
+
// "Single sign-on", and the visible panel "Login or sign up for free"
|
|
110
|
+
// text. The previous list missed "email" and "sso" which let the
|
|
111
|
+
// auto-click land on the email/SSO sign-in buttons on Perplexity's
|
|
112
|
+
// anonymous-mode homepage, navigating us into a login flow.
|
|
113
|
+
var isSignIn = new RegExp("sign.?in|log.?in|sign.?up|with\\s+(google|apple|email|github|facebook|microsoft|sso)|sso|auth", "i").test(t);
|
|
89
114
|
return !isSignIn;
|
|
90
115
|
});
|
|
91
116
|
if (verify) { verify.setAttribute('data-gs-verify','1'); return JSON.stringify({t:'sel',s:'[data-gs-verify="1"]',txt:verify.innerText?.trim()||verify.value}); }
|
|
@@ -327,16 +352,23 @@ export async function humanClickElement(tab, cdpFn, selector) {
|
|
|
327
352
|
|
|
328
353
|
/**
|
|
329
354
|
* Parse a detection result and perform a human click if it found something.
|
|
330
|
-
*
|
|
355
|
+
*
|
|
356
|
+
* Returns a tristate string:
|
|
357
|
+
* - 'clicked' — a click was successfully dispatched
|
|
358
|
+
* - 'cant-click' — challenge was detected but we couldn't click it
|
|
359
|
+
* (zero-dimension element, OOPIF in closed shadow DOM, etc.)
|
|
360
|
+
* Caller should treat this as needs-human verification.
|
|
361
|
+
* - 'no-challenge' — no challenge detected, nothing to click
|
|
331
362
|
*/
|
|
332
|
-
|
|
363
|
+
function tryHumanClick(tab, cdp, detectResult) {
|
|
333
364
|
if (
|
|
334
365
|
!detectResult ||
|
|
335
366
|
detectResult === "null" ||
|
|
336
367
|
detectResult === "cleared" ||
|
|
337
|
-
detectResult === "still-verifying"
|
|
368
|
+
detectResult === "still-verifying" ||
|
|
369
|
+
detectResult === "cf-closed-shadow-dom"
|
|
338
370
|
)
|
|
339
|
-
return
|
|
371
|
+
return Promise.resolve("no-challenge");
|
|
340
372
|
|
|
341
373
|
// JSON format: {t:"sel",s:"...",txt:"..."} or {t:"xy",x:...,y:...}
|
|
342
374
|
try {
|
|
@@ -345,26 +377,138 @@ async function tryHumanClick(tab, cdp, detectResult) {
|
|
|
345
377
|
process.stderr.write(
|
|
346
378
|
`[greedysearch] Human-clicking "${info.txt}" via CDP...\n`,
|
|
347
379
|
);
|
|
348
|
-
|
|
349
|
-
|
|
380
|
+
return humanClickElement(tab, cdp, info.s).then((r) =>
|
|
381
|
+
r !== null ? "clicked" : "cant-click",
|
|
382
|
+
);
|
|
350
383
|
}
|
|
351
384
|
if (info.t === "xy") {
|
|
352
385
|
// Skip zero/invalid coordinates — element is off-screen or not rendered
|
|
353
|
-
if (!info.x && !info.y) return
|
|
386
|
+
if (!info.x && !info.y) return Promise.resolve("cant-click");
|
|
354
387
|
process.stderr.write(
|
|
355
388
|
`[greedysearch] Human-clicking at (${info.x.toFixed(0)}, ${info.y.toFixed(0)})...\n`,
|
|
356
389
|
);
|
|
357
|
-
|
|
358
|
-
return true;
|
|
390
|
+
return humanClickXY(tab, cdp, info.x, info.y).then(() => "clicked");
|
|
359
391
|
}
|
|
360
392
|
} catch {}
|
|
361
393
|
|
|
362
|
-
return
|
|
394
|
+
return Promise.resolve("no-challenge");
|
|
363
395
|
}
|
|
364
396
|
|
|
365
397
|
/**
 * Probe a tab for a verification challenge and describe how to click it.
 *
 * Two probes run in order:
 *   1. A CDP-level probe (`findCloudflareIframeViaPierce`) that can see
 *      Cloudflare iframes hidden inside closed shadow roots and yields real
 *      click coordinates.
 *   2. The page-context script `VERIFY_DETECT_JS`, evaluated in the tab.
 *
 * The CDP probe goes first because the page-context script can only report
 * the un-clickable `cf-closed-shadow-dom` sentinel for such iframes.
 *
 * @param {string} tab - Tab identifier forwarded to the CDP bridge.
 * @param {Function} cdp - Async CDP bridge, called as `cdp([command, tab, ...args])`.
 * @returns {Promise<string|null>} The first truthy probe result (the
 *   page-context result is additionally rejected when it is the literal
 *   string "null"), otherwise null. A failing probe counts as "nothing found".
 */
export async function detectVerificationChallenge(tab, cdp) {
  // Probe 1: closed-shadow-root Cloudflare iframe, located via CDP.
  const piercedHit = await findCloudflareIframeViaPierce(tab, cdp).catch(
    () => null,
  );
  if (piercedHit) {
    return piercedHit;
  }

  // Probe 2: in-page detection script; a rejected eval is treated as no hit.
  const pageHit = await cdp(["eval", tab, VERIFY_DETECT_JS]).catch(() => null);
  const hasPageHit = Boolean(pageHit) && pageHit !== "null";
  return hasPageHit ? pageHit : null;
}
|
|
413
|
+
|
|
414
|
+
/**
 * Walk the page DOM with pierce:true to locate a Cloudflare Turnstile
 * iframe that's hidden inside a closed shadow root.
 *
 * The returned coords target the **checkbox area** of the Turnstile widget
 * (left ~25% of the 300x65 iframe, vertical center) rather than the
 * iframe's geometric center, because the visible "Verify you are human"
 * checkbox sits there in the standard widget layout.
 *
 * @param {string} tab - Tab identifier forwarded to every `cdp` call.
 * @param {Function} cdp - Async CDP bridge, called as
 *   `cdp([command, tab, ...args])` and expected to resolve to a JSON string.
 * @returns {Promise<string|null>} JSON string of the shape `{t:'xy', x, y}`
 *   matching the main-document probe's convention, or null when `cdp` is not
 *   a function, the document cannot be fetched or parsed, the response
 *   carries an `error`, or no matching iframe is found.
 */
async function findCloudflareIframeViaPierce(tab, cdp) {
  if (typeof cdp !== "function") return null;

  // Step 1: enable DOM domain if needed (cheap idempotent call). Failures
  // are deliberately ignored: the getDocument call below decides the outcome.
  await cdp(["evalraw", tab, "DOM.enable", "{}"]).catch(() => {});

  // Step 2: get the full DOM tree with pierce — walks closed shadow roots
  const doc = await cdp([
    "evalraw",
    tab,
    "DOM.getDocument",
    JSON.stringify({ depth: -1, pierce: true }),
  ]).catch(() => null);
  if (!doc) return null;

  let docParsed;
  try {
    docParsed = JSON.parse(doc);
  } catch {
    return null;
  }

  // JSON.parse legally yields null or a primitive for payloads such as
  // "null" or "42"; reading properties off null would throw, so bail out.
  if (docParsed === null || typeof docParsed !== "object") return null;
  if (docParsed.error) return null;

  // Accept both the bare `{root}` shape and a `{result: {root}}` envelope,
  // mirroring how the DOM.getBoxModel response is read in walkForCfIframe.
  // NOTE(review): which shape `evalraw` actually returns is not visible
  // here — confirm against the cdp bridge.
  const root = docParsed.root ?? docParsed.result?.root;
  if (!root) return null;

  // Step 3: recursive walk looking for an iframe whose src points at
  // challenges.cloudflare.com / turnstile
  return walkForCfIframe(root, tab, cdp);
}
|
|
450
|
+
|
|
451
|
+
/**
 * Depth-first search of a CDP DOM node tree for a Cloudflare Turnstile
 * iframe, returning a click target for its checkbox.
 *
 * For each node, shadow roots are visited before regular children. An
 * IFRAME child whose `src` matches challenges.cloudflare.com / turnstile
 * and that has a `backendNodeId` is measured with `DOM.getBoxModel`; the
 * first one with a usable box wins.
 *
 * @param {object|null} node - CDP DOM node (as returned by DOM.getDocument)
 *   whose descendants are searched; the node itself is not tested.
 * @param {string} tab - Tab identifier forwarded to `cdp`.
 * @param {Function} cdp - Async CDP bridge, called as
 *   `cdp([command, tab, ...args])` and expected to resolve to a JSON string.
 * @returns {Promise<string|null>} JSON string `{t:'xy', x, y}` aimed at the
 *   widget checkbox, or null when no suitable iframe exists under `node`.
 */
async function walkForCfIframe(node, tab, cdp) {
  if (!node) return null;

  // Shadow roots first, then light-DOM children.
  const children = [...(node.shadowRoots ?? []), ...(node.children ?? [])];

  for (const child of children) {
    if (child.nodeName === "IFRAME") {
      // CDP reports attributes as a flat [name, value, name, value, ...]
      // array, so only even indices are names. A plain indexOf("src") would
      // also match an attribute *value* equal to "src" (e.g. name="src")
      // and then read the wrong element as the URL.
      const attrs = child.attributes || [];
      let src = "";
      for (let i = 0; i + 1 < attrs.length; i += 2) {
        if (attrs[i] === "src") {
          src = attrs[i + 1];
          break;
        }
      }

      if (
        src &&
        /challenges\.cloudflare\.com|turnstile/i.test(src) &&
        child.backendNodeId
      ) {
        // Get bounding box via DOM.getBoxModel
        const boxRes = await cdp([
          "evalraw",
          tab,
          "DOM.getBoxModel",
          JSON.stringify({ backendNodeId: child.backendNodeId }),
        ]).catch(() => null);
        if (!boxRes) continue;

        let boxParsed;
        try {
          boxParsed = JSON.parse(boxRes);
        } catch {
          continue;
        }

        // Tolerate both the bare result and a {result: ...} envelope.
        const content =
          boxParsed?.model?.content || boxParsed?.result?.model?.content;
        if (!content || content.length < 8) continue;

        // content = [x1, y1, x2, y2, x3, y3, x4, y4]; corners 1 and 3 are
        // diagonally opposite, which gives the box extent.
        const x1 = content[0];
        const y1 = content[1];
        const x3 = content[4];
        const y3 = content[5];
        const width = x3 - x1;
        const height = y3 - y1;

        // Skip degenerate boxes (hidden iframes)
        if (width < 50 || height < 20) continue;

        // Click the checkbox: standard CF widget is 300x65 with the
        // checkbox centered at ~25% width, 50% height.
        const checkboxX = x1 + width * 0.25;
        const checkboxY = y1 + height * 0.5;
        process.stderr.write(
          `[greedysearch] Found CF iframe via CDP pierce at (${x1.toFixed(0)}, ${y1.toFixed(0)}) ${width.toFixed(0)}x${height.toFixed(0)}, clicking checkbox at (${checkboxX.toFixed(0)}, ${checkboxY.toFixed(0)})\n`,
        );
        return JSON.stringify({ t: "xy", x: checkboxX, y: checkboxY });
      }
    }

    const deeper = await walkForCfIframe(child, tab, cdp);
    if (deeper) return deeper;
  }

  return null;
}
|
|
369
513
|
|
|
370
514
|
// Returns 'clear' | 'clicked' | 'needs-human'
|
|
@@ -389,9 +533,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
|
|
|
389
533
|
return "needs-human";
|
|
390
534
|
}
|
|
391
535
|
|
|
536
|
+
// Cloudflare Turnstile rendered inside a closed shadow root (e.g.
|
|
537
|
+
// chatgpt.com). detectVerificationChallenge now uses CDP-level
|
|
538
|
+
// DOM.getDocument({pierce:true}) to walk into the closed root and
|
|
539
|
+
// locate the iframe's screen-space bounding box. The result here is
|
|
540
|
+
// a normal {t:'xy',x,y} coordinate payload that flows through the
|
|
541
|
+
// regular click path. The historical "cf-closed-shadow-dom" sentinel
|
|
542
|
+
// is kept in VERIFY_DETECT_JS only as a safety net for unusual pages.
|
|
543
|
+
|
|
392
544
|
// Perform human click on detected element
|
|
393
|
-
const
|
|
394
|
-
if (clicked) {
|
|
545
|
+
const clickResult = await tryHumanClick(tab, cdp, result);
|
|
546
|
+
if (clickResult === "clicked") {
|
|
395
547
|
await new Promise((r) => setTimeout(r, 2000));
|
|
396
548
|
|
|
397
549
|
// Retry loop — keep checking until cleared or timeout
|
|
@@ -417,5 +569,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
|
|
|
417
569
|
return "needs-human";
|
|
418
570
|
}
|
|
419
571
|
|
|
572
|
+
// Challenge was detected but we couldn't auto-click it (zero-dimension
|
|
573
|
+
// element, OOPIF without coordinates, etc.). Surface this rather than
|
|
574
|
+
// silently returning 'clear' — the caller would otherwise proceed and
|
|
575
|
+
// fail downstream on a selector that won't appear until the challenge
|
|
576
|
+
// is solved.
|
|
577
|
+
if (clickResult === "cant-click") {
|
|
578
|
+
process.stderr.write(
|
|
579
|
+
"[greedysearch] Verification challenge detected but cannot be auto-clicked — please solve it manually in the visible browser window.\n",
|
|
580
|
+
);
|
|
581
|
+
return "needs-human";
|
|
582
|
+
}
|
|
583
|
+
|
|
420
584
|
return "clear";
|
|
421
585
|
}
|