@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -57,12 +57,24 @@ const VERIFY_DETECT_JS = `
57
57
 
58
58
  // --- Cloudflare Turnstile widget inside closed shadow DOM (Copilot, etc.) ---
59
59
  // The iframe is not queryable from main document, but the host container
60
- // (#cf-turnstile) and the hidden response input are.
61
- var cfTurnstileHost = document.querySelector('#cf-turnstile, [id^="cf-chl-widget-"]');
60
+ // (#cf-turnstile) and the hidden response input are. When only the
61
+ // hidden response input matches (no #cf-turnstile host and no visible
62
+ // iframe), the actual challenge widget is rendered inside a closed
63
+ // shadow DOM and cannot be auto-clicked. Return a sentinel so callers
64
+ // know to surface this as needs-human verification instead of wasting
65
+ // time on a doomed waitForSelector.
66
+ var cfTurnstileHost = document.querySelector('#cf-turnstile');
62
67
  if (cfTurnstileHost) {
63
68
  var r2 = cfTurnstileHost.getBoundingClientRect();
64
69
  return JSON.stringify({t:'xy',x:r2.left+r2.width/2,y:r2.top+r2.height/2});
65
70
  }
71
+ // Hidden cf-chl-widget-*_response input present but no visible host:
72
+ // the widget is in closed shadow DOM. Signal this so handleVerification
73
+ // can return 'needs-human' rather than 'clear'.
74
+ var cfResponseInput = document.querySelector('input[name="cf-turnstile-response"], [id^="cf-chl-widget-"][id$="_response"]');
75
+ if (cfResponseInput && cfResponseInput.value === '') {
76
+ return 'cf-closed-shadow-dom';
77
+ }
66
78
 
67
79
  // --- Cloudflare challenge page ---
68
80
  var cfCheckbox = document.querySelector('#cf-stage input[type="checkbox"], .ctp-checkbox-container input');
@@ -77,15 +89,28 @@ const VERIFY_DETECT_JS = `
77
89
  }
78
90
 
79
91
  // --- Generic verify/continue/proceed buttons (catch-all) ---
80
- // IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google")
92
+ // IMPORTANT: exclude sign-in / OAuth buttons (e.g. "Continue with Google",
93
+ // "Continue with email", "Login or sign up for free"). These appear on
94
+ // many sites (Perplexity, ChatGPT, etc.) when the user isn't logged in,
95
+ // and clicking them triggers a sign-in flow that takes us to a login
96
+ // wall — a much worse outcome than the original search failure we were
97
+ // trying to recover from. The exclusion list must cover both OAuth
98
+ // providers AND generic "sign in / log in / with email" patterns.
81
99
  var btns = Array.from(document.querySelectorAll('button, input[type=submit], a[role=button]'));
82
100
  var verify = btns.find(b => {
83
101
  var t = (b.innerText?.trim() || b.value || '').toLowerCase();
84
- var isVerifyLike = (t.includes('verify') || t.includes('human') || t.includes('robot') || t.includes('continue') || t.includes('proceed')) &&
102
+ var isVerifyLike = (t === 'continue' || t === 'proceed' || t === 'next' ||
103
+ t.startsWith('verify ') || t.startsWith('human ') || t === 'i am human' || t.includes('robot check')) &&
85
104
  !t.includes('verified') && !document.querySelector('iframe[src*="recaptcha"]');
86
105
  if (!isVerifyLike) return false;
87
106
  // Exclude OAuth / sign-in buttons to prevent accidental login flows
88
- var isSignIn = /sign.in|log.in|google|microsoft|apple|facebook|github|auth/i.test(t);
107
+ // covers "Continue with Google", "Continue with Apple", "Continue
108
+ // with email", "Login or sign up", "Log in", "Sign in", "Sign up",
109
+ // "Single sign-on", and the visible panel "Login or sign up for free"
110
+ // text. The previous list missed "email" and "sso" which let the
111
+ // auto-click land on the email/SSO sign-in buttons on Perplexity's
112
+ // anonymous-mode homepage, navigating us into a login flow.
113
+ var isSignIn = new RegExp("sign.?in|log.?in|sign.?up|with\\s+(google|apple|email|github|facebook|microsoft|sso)|sso|auth", "i").test(t);
89
114
  return !isSignIn;
90
115
  });
91
116
  if (verify) { verify.setAttribute('data-gs-verify','1'); return JSON.stringify({t:'sel',s:'[data-gs-verify="1"]',txt:verify.innerText?.trim()||verify.value}); }
@@ -327,16 +352,23 @@ export async function humanClickElement(tab, cdpFn, selector) {
327
352
 
328
353
  /**
329
354
  * Parse a detection result and perform a human click if it found something.
330
- * Returns true if a click was performed.
355
+ *
356
+ * Returns a tristate string:
357
+ * - 'clicked' — a click was successfully dispatched
358
+ * - 'cant-click' — challenge was detected but we couldn't click it
359
+ * (zero-dimension element, OOPIF in closed shadow DOM, etc.)
360
+ * Caller should treat this as needs-human verification.
361
+ * - 'no-challenge' — no challenge detected, nothing to click
331
362
  */
332
- async function tryHumanClick(tab, cdp, detectResult) {
363
+ function tryHumanClick(tab, cdp, detectResult) {
333
364
  if (
334
365
  !detectResult ||
335
366
  detectResult === "null" ||
336
367
  detectResult === "cleared" ||
337
- detectResult === "still-verifying"
368
+ detectResult === "still-verifying" ||
369
+ detectResult === "cf-closed-shadow-dom"
338
370
  )
339
- return false;
371
+ return Promise.resolve("no-challenge");
340
372
 
341
373
  // JSON format: {t:"sel",s:"...",txt:"..."} or {t:"xy",x:...,y:...}
342
374
  try {
@@ -345,26 +377,138 @@ async function tryHumanClick(tab, cdp, detectResult) {
345
377
  process.stderr.write(
346
378
  `[greedysearch] Human-clicking "${info.txt}" via CDP...\n`,
347
379
  );
348
- const r = await humanClickElement(tab, cdp, info.s);
349
- return r !== null;
380
+ return humanClickElement(tab, cdp, info.s).then((r) =>
381
+ r !== null ? "clicked" : "cant-click",
382
+ );
350
383
  }
351
384
  if (info.t === "xy") {
352
385
  // Skip zero/invalid coordinates — element is off-screen or not rendered
353
- if (!info.x && !info.y) return false;
386
+ if (!info.x && !info.y) return Promise.resolve("cant-click");
354
387
  process.stderr.write(
355
388
  `[greedysearch] Human-clicking at (${info.x.toFixed(0)}, ${info.y.toFixed(0)})...\n`,
356
389
  );
357
- await humanClickXY(tab, cdp, info.x, info.y);
358
- return true;
390
+ return humanClickXY(tab, cdp, info.x, info.y).then(() => "clicked");
359
391
  }
360
392
  } catch {}
361
393
 
362
- return false;
394
+ return Promise.resolve("no-challenge");
363
395
  }
364
396
 
365
397
  export async function detectVerificationChallenge(tab, cdp) {
398
+ // Run the CDP-pierce probe FIRST so we get real click coordinates for
399
+ // Cloudflare iframes hidden inside closed shadow roots (chatgpt.com,
400
+ // perplexity.ai, etc.). The page-context probe falls back to a
401
+ // cf-closed-shadow-dom sentinel when the iframe is opaque to JS DOM
402
+ // queries, but that sentinel can't be auto-clicked.
403
+ const cfIframe = await findCloudflareIframeViaPierce(tab, cdp).catch(
404
+ () => null,
405
+ );
406
+ if (cfIframe) return cfIframe;
407
+
366
408
  const result = await cdp(["eval", tab, VERIFY_DETECT_JS]).catch(() => null);
367
- return result && result !== "null" ? result : null;
409
+ if (result && result !== "null") return result;
410
+
411
+ return null;
412
+ }
413
+
414
+ /**
415
+ * Walk the page DOM with pierce:true to locate a Cloudflare Turnstile
416
+ * iframe that's hidden inside a closed shadow root. Returns JSON of the
417
+ * shape `{t:'xy', x, y}` matching the main-document probe's convention,
418
+ * OR null if nothing was found.
419
+ *
420
+ * The returned coords target the **checkbox area** of the Turnstile widget
421
+ * (left ~25% of the 300x65 iframe, vertical center) rather than the
422
+ * iframe's geometric center, because the visible "Verify you are human"
423
+ * checkbox sits there in the standard widget layout.
424
+ */
425
+ async function findCloudflareIframeViaPierce(tab, cdp) {
426
+ if (typeof cdp !== "function") return null;
427
+
428
+ // Step 1: enable DOM domain if needed (cheap idempotent call)
429
+ await cdp(["evalraw", tab, "DOM.enable", "{}"]).catch(() => {});
430
+
431
+ // Step 2: get the full DOM tree with pierce — walks closed shadow roots
432
+ const doc = await cdp(["evalraw", tab, "DOM.getDocument", JSON.stringify({ depth: -1, pierce: true })]).catch(
433
+ () => null,
434
+ );
435
+ if (!doc) return null;
436
+ let docParsed;
437
+ try {
438
+ docParsed = JSON.parse(doc);
439
+ } catch {
440
+ return null;
441
+ }
442
+ if (docParsed.error || !docParsed.root) return null;
443
+
444
+ // Step 3: recursive walk looking for an iframe whose src points at
445
+ // challenges.cloudflare.com / turnstile
446
+ const root = docParsed.root;
447
+ const found = await walkForCfIframe(root, tab, cdp);
448
+ return found;
449
+ }
450
+
451
+ async function walkForCfIframe(node, tab, cdp) {
452
+ if (!node) return null;
453
+ const children = [];
454
+ if (node.shadowRoots && node.shadowRoots.length > 0) {
455
+ for (const s of node.shadowRoots) {
456
+ children.push(s);
457
+ }
458
+ }
459
+ if (node.children) {
460
+ for (const c of node.children) children.push(c);
461
+ }
462
+ for (const child of children) {
463
+ if (child.nodeName === "IFRAME") {
464
+ const attrs = child.attributes || [];
465
+ const srcIdx = attrs.indexOf("src");
466
+ const src = srcIdx >= 0 ? attrs[srcIdx + 1] : "";
467
+ if (
468
+ src &&
469
+ /challenges\.cloudflare\.com|turnstile/i.test(src) &&
470
+ child.backendNodeId
471
+ ) {
472
+ // Get bounding box via DOM.getBoxModel
473
+ const boxRes = await cdp([
474
+ "evalraw",
475
+ tab,
476
+ "DOM.getBoxModel",
477
+ JSON.stringify({ backendNodeId: child.backendNodeId }),
478
+ ]).catch(() => null);
479
+ if (!boxRes) continue;
480
+ let boxParsed;
481
+ try {
482
+ boxParsed = JSON.parse(boxRes);
483
+ } catch {
484
+ continue;
485
+ }
486
+ const content =
487
+ boxParsed?.model?.content || boxParsed?.result?.model?.content;
488
+ if (!content || content.length < 8) continue;
489
+ // content = [x1, y1, x2, y2, x3, y3, x4, y4]
490
+ const x1 = content[0];
491
+ const y1 = content[1];
492
+ const x3 = content[4];
493
+ const y3 = content[5];
494
+ const width = x3 - x1;
495
+ const height = y3 - y1;
496
+ // Skip degenerate boxes (hidden iframes)
497
+ if (width < 50 || height < 20) continue;
498
+ // Click the checkbox: standard CF widget is 300x65 with the
499
+ // checkbox centered at ~25% width, 50% height.
500
+ const checkboxX = x1 + width * 0.25;
501
+ const checkboxY = y1 + height * 0.5;
502
+ process.stderr.write(
503
+ `[greedysearch] Found CF iframe via CDP pierce at (${x1.toFixed(0)}, ${y1.toFixed(0)}) ${width.toFixed(0)}x${height.toFixed(0)}, clicking checkbox at (${checkboxX.toFixed(0)}, ${checkboxY.toFixed(0)})\n`,
504
+ );
505
+ return JSON.stringify({ t: "xy", x: checkboxX, y: checkboxY });
506
+ }
507
+ }
508
+ const deeper = await walkForCfIframe(child, tab, cdp);
509
+ if (deeper) return deeper;
510
+ }
511
+ return null;
368
512
  }
369
513
 
370
514
  // Returns 'clear' | 'clicked' | 'needs-human'
@@ -389,9 +533,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
389
533
  return "needs-human";
390
534
  }
391
535
 
536
+ // Cloudflare Turnstile rendered inside a closed shadow root (e.g.
537
+ // chatgpt.com). detectVerificationChallenge now uses CDP-level
538
+ // DOM.getDocument({pierce:true}) to walk into the closed root and
539
+ // locate the iframe's screen-space bounding box. The result here is
540
+ // a normal {t:'xy',x,y} coordinate payload that flows through the
541
+ // regular click path. The historical "cf-closed-shadow-dom" sentinel
542
+ // is kept in VERIFY_DETECT_JS only as a safety net for unusual pages.
543
+
392
544
  // Perform human click on detected element
393
- const clicked = await tryHumanClick(tab, cdp, result);
394
- if (clicked) {
545
+ const clickResult = await tryHumanClick(tab, cdp, result);
546
+ if (clickResult === "clicked") {
395
547
  await new Promise((r) => setTimeout(r, 2000));
396
548
 
397
549
  // Retry loop — keep checking until cleared or timeout
@@ -417,5 +569,17 @@ export async function handleVerification(tab, cdp, waitMs = 30000) {
417
569
  return "needs-human";
418
570
  }
419
571
 
572
+ // Challenge was detected but we couldn't auto-click it (zero-dimension
573
+ // element, OOPIF without coordinates, etc.). Surface this rather than
574
+ // silently returning 'clear' — the caller would otherwise proceed and
575
+ // fail downstream on a selector that won't appear until the challenge
576
+ // is solved.
577
+ if (clickResult === "cant-click") {
578
+ process.stderr.write(
579
+ "[greedysearch] Verification challenge detected but cannot be auto-clicked — please solve it manually in the visible browser window.\n",
580
+ );
581
+ return "needs-human";
582
+ }
583
+
420
584
  return "clear";
421
585
  }