@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -116,13 +116,35 @@ function getProcessCommandLine(pid) {
116
116
  * @param {number} debugPort - expected debug port
117
117
  * @returns {boolean}
118
118
  */
119
- export function verifyBrowserProcess(pid, tempDir, debugPort = GREEDY_PORT) {
120
- const cmdLine = getProcessCommandLine(pid);
119
export function commandLineMatchesGreedyChrome(
	cmdLine,
	tempDir,
	debugPort = GREEDY_PORT,
) {
	// Pure predicate: decides from a command-line string alone whether the
	// process is the top-level GreedySearch Chrome (right profile dir, right
	// debug port, not a `--type=` child such as a renderer or GPU process).
	if (!cmdLine) return false;

	// Windows may report Chrome command lines with backslashes while the shared
	// GREEDY_PROFILE_DIR constant is normalized to forward slashes. Compare a
	// normalized form so child processes do not misclassify a live GreedySearch
	// Chrome as a ghost and kill it during cleanupStaleSessions().
	const normalize = (value) =>
		String(value || "")
			.replaceAll("\\", "/")
			.toLowerCase();
	const normalizedCmdLine = normalize(cmdLine);
	const normalizedTempDir = normalize(tempDir);

	// An empty/missing tempDir normalizes to "", and `includes("")` is always
	// true — that would accept any browser listening on the port. Without a
	// profile directory there is nothing to verify ownership against.
	if (!normalizedTempDir) return false;

	// True when `flag` occurs and is not immediately followed by another digit,
	// so port 9222 does not match `--remote-debugging-port=92223`.
	const hasExactPortFlag = (haystack, flag) => {
		for (
			let at = haystack.indexOf(flag);
			at !== -1;
			at = haystack.indexOf(flag, at + 1)
		) {
			const next = haystack.charAt(at + flag.length);
			if (!/\d/.test(next)) return true;
		}
		return false;
	};

	return (
		normalizedCmdLine.includes(normalizedTempDir) &&
		hasExactPortFlag(
			normalizedCmdLine,
			`--remote-debugging-port=${debugPort}`,
		) &&
		!normalizedCmdLine.includes("--type=")
	);
}
142
+
143
/**
 * Check whether the process with the given PID looks like the GreedySearch
 * Chrome instance, judged by its command line.
 *
 * Looks the command line up via getProcessCommandLine() and delegates the
 * actual matching to commandLineMatchesGreedyChrome(), which returns false
 * when no command line is available.
 *
 * @param {number} pid - process ID to inspect
 * @param {string} tempDir - profile directory expected on the command line
 * @param {number} debugPort - expected debug port
 * @returns {boolean}
 */
export function verifyBrowserProcess(pid, tempDir, debugPort = GREEDY_PORT) {
	const cmdLine = getProcessCommandLine(pid);
	return commandLineMatchesGreedyChrome(cmdLine, tempDir, debugPort);
}
128
150
 
@@ -0,0 +1,205 @@
1
+ // src/search/challenge-detect.mjs — Detect when a Cloudflare/Turnstile/captcha
2
+ // challenge has been solved so the extractor can auto-resume.
3
+ //
4
+ // Polls page state (URL, title, DOM markers, cookie presence) instead of waiting
5
+ // for a hard timeout. Resolves once the engine-specific "challenge cleared"
6
+ // signal is observed, or rejects with a clear error if the polling budget is
7
+ // exhausted before any progress.
8
+ //
9
+ // Usage:
10
+ // const cleared = await waitForChallengeCleared({ tab, engine: "chatgpt", timeoutMs: 300000 });
11
+ // if (!cleared) emit _needsHumanVerification; else re-run extractor.
12
+
13
+ import { cdp } from "../../extractors/common.mjs";
14
+
15
// Built-in wait budget used when the environment does not supply a usable one.
const FALLBACK_CHALLENGE_WAIT_MS = 300000; // 5 minutes

/**
 * Parse a challenge-wait override (milliseconds, base 10).
 *
 * An unset/empty value, a value that does not start with digits, or a negative
 * value yields `fallback`. Without this guard a typo in the environment
 * variable parses to NaN, and a NaN timeout makes the polling loop exit
 * immediately with a "within NaNs" message.
 *
 * @param {string | undefined} raw - raw environment value
 * @param {number} [fallback] - value used when `raw` is unusable
 * @returns {number} timeout in milliseconds
 */
function parseChallengeWaitMs(raw, fallback = FALLBACK_CHALLENGE_WAIT_MS) {
	const parsed = Number.parseInt(raw || String(fallback), 10);
	return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

// Default polling budget; override with GREEDY_SEARCH_CHALLENGE_WAIT_MS.
const DEFAULT_TIMEOUT_MS = parseChallengeWaitMs(
	process.env.GREEDY_SEARCH_CHALLENGE_WAIT_MS,
);
// Pause between two consecutive page probes.
const POLL_INTERVAL_MS = 3000;
20
+
21
// Page-side snippet for the ChatGPT tab: reports title, URL, whether the
// ProseMirror composer exists, whether Turnstile markers are still in the DOM,
// and how much visible body text there is. Returned as a JSON string.
const CHATGPT_PROBE_SCRIPT = `(() => {
	const title = document.title;
	const onChatGPT = location.hostname === "chatgpt.com";
	const hasProseMirror = !!document.querySelector("div.ProseMirror");
	const hasTurnstileInput =
		!!document.querySelector("input[name=\\"cf-turnstile-response\\"]") ||
		!!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]");
	// Body innerText is empty while on the Turnstile page.
	const bodyText = (document.body && document.body.innerText) || "";
	return JSON.stringify({
		title,
		url: location.href,
		hasProseMirror,
		hasTurnstileInput,
		bodyLen: bodyText.length,
		onChatGPT,
	});
})()`;

// Titles shown while the interstitial is still up (Greek and English variants).
const CHATGPT_INTERSTITIAL_TITLE =
	/περιμένετε|please wait|just a moment|verifying|checking/i;

// Page-side snippet for the Copilot tab: reports URL/title, whether either
// looks like a challenge page, whether a text input exists, whether Turnstile
// markers are present, and the visible body text length. Returned as JSON.
const BING_PROBE_SCRIPT = `(() => {
	const url = location.href;
	const title = document.title;
	const onCopilot = /copilot\\.microsoft\\.com/.test(location.hostname);
	const onChallenge =
		/challenge|turnstile|cdn-cgi\\/challenge/i.test(url) ||
		/verify|human|robot/i.test(title);
	const hasTextarea =
		!!document.querySelector("textarea") ||
		!!document.querySelector("div[contenteditable=\\"true\\"]");
	const hasTurnstileInput =
		!!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]") ||
		!!document.querySelector("input[name=\\"cf-turnstile-response\\"]");
	const bodyText = (document.body && document.body.innerText) || "";
	return JSON.stringify({
		url,
		title,
		onCopilot,
		onChallenge,
		hasTextarea,
		hasTurnstileInput,
		bodyLen: bodyText.length,
	});
})()`;

// Per-engine "challenge cleared" detectors. Each entry exposes
// `isCleared(tab)`, which evaluates a probe in the tab through `cdp` and
// resolves to a truthy value once the page no longer looks like a challenge.
// A failed eval or unparseable probe result counts as "not cleared".
const ENGINE_SIGNALS = {
	chatgpt: {
		name: "chatgpt",
		async isCleared(tab) {
			const raw = await cdp(["eval", tab, CHATGPT_PROBE_SCRIPT]).catch(
				() => null,
			);
			if (!raw) return false;

			let state;
			try {
				state = JSON.parse(raw);
			} catch {
				return false;
			}

			// Still blocked while we are off chatgpt.com, the title is one of the
			// interstitial placeholders, or a Turnstile marker is in the DOM.
			const stillChallenged =
				!state.onChatGPT ||
				(state.title && CHATGPT_INTERSTITIAL_TITLE.test(state.title)) ||
				state.hasTurnstileInput;
			if (stillChallenged) return false;

			// Cleared once the composer rendered or the page shows more than a
			// trivial amount of text.
			return state.hasProseMirror || state.bodyLen > 50;
		},
	},
	bing: {
		name: "bing",
		async isCleared(tab) {
			const raw = await cdp(["eval", tab, BING_PROBE_SCRIPT]).catch(
				() => null,
			);
			if (!raw) return false;

			let state;
			try {
				state = JSON.parse(raw);
			} catch {
				return false;
			}

			// Still blocked while off the Copilot host, while URL/title look like
			// a verification page, or while a Turnstile marker is in the DOM.
			const stillChallenged =
				!state.onCopilot || state.onChallenge || state.hasTurnstileInput;
			if (stillChallenged) return false;

			// Cleared once a chat input exists or the page shows more than a
			// trivial amount of text.
			return state.hasTextarea || state.bodyLen > 50;
		},
	},
};
123
+
124
/**
 * Generic fallback probe for engines without dedicated DOM signals: evaluates
 * `document.cookie` in the tab and reports whether a `cf_clearance` or
 * `__cf_bm` cookie is visible to page script.
 *
 * A failed eval, an empty result, or unparseable JSON all resolve to `false`.
 *
 * NOTE(review): cookies flagged HttpOnly never appear in `document.cookie`.
 * If Cloudflare sets these cookies as HttpOnly (believed to be the case —
 * confirm), this probe cannot observe them and a cookie query over the
 * DevTools protocol would be needed instead.
 *
 * @param {string} tab - tab identifier passed through to `cdp`
 * @returns {Promise<boolean>} truthy when either cookie is visible
 */
async function pollForCfClearanceCookie(tab) {
	const cookieProbeScript = `(() => {
		const cookies = document.cookie || "";
		return JSON.stringify({
			hasCfClearance: /(?:^|;\\s*)cf_clearance=/.test(cookies),
			hasCfBm: /(?:^|;\\s*)__cf_bm=/.test(cookies),
			cookiesLength: cookies.length,
		});
	})()`;

	const raw = await cdp(["eval", tab, cookieProbeScript]).catch(() => null);
	if (!raw) return false;

	try {
		const { hasCfClearance, hasCfBm } = JSON.parse(raw);
		return hasCfClearance || hasCfBm;
	} catch {
		return false;
	}
}
149
+
150
/**
 * Poll page state until a Cloudflare/Turnstile challenge is cleared.
 *
 * Uses the engine-specific detector from ENGINE_SIGNALS when one exists and
 * falls back to the cookie probe otherwise. A probe that rejects is treated
 * as "not cleared yet".
 *
 * @param {object} options
 * @param {string} options.tab - tab identifier handed to the probes
 * @param {string} options.engine - engine key, e.g. "chatgpt" or "bing"
 * @param {number} [options.timeoutMs] - total polling budget in milliseconds
 * @param {number} [options.intervalMs] - pause between probes in milliseconds
 * @param {{ aborted: boolean }} [options.signal] - optional abort signal;
 *   checked before every probe and, when it supports event listeners, also
 *   used to cut the pause between probes short
 * @param {(message: string) => void} [options.log] - progress sink (no-op by default)
 * @param {number} [options.heartbeatMs] - minimum time between "still
 *   waiting" messages; 0 or less disables them
 * @returns {Promise<{cleared: true, signal: string} | {cleared: false, reason: string}>}
 *   `{ cleared: true, signal }` — challenge cleared; safe to re-extract.
 *   `{ cleared: false, reason }` — aborted or timed out.
 */
export async function waitForChallengeCleared({
	tab,
	engine,
	timeoutMs = DEFAULT_TIMEOUT_MS,
	intervalMs = POLL_INTERVAL_MS,
	signal: externalSignal,
	log = () => {},
	heartbeatMs = 30000,
}) {
	const def = ENGINE_SIGNALS[engine];
	const start = Date.now();
	const deadline = start + timeoutMs;
	let lastHeartbeatAt = start;

	// Sleep for `ms`, waking early if the external signal aborts so callers are
	// not kept waiting for a full poll interval after cancelling.
	const pause = (ms) =>
		new Promise((resolve) => {
			if (typeof externalSignal?.addEventListener !== "function") {
				setTimeout(resolve, ms);
				return;
			}
			if (externalSignal.aborted) {
				resolve();
				return;
			}
			const onAbort = () => {
				clearTimeout(timer);
				resolve();
			};
			const timer = setTimeout(() => {
				externalSignal.removeEventListener("abort", onAbort);
				resolve();
			}, ms);
			externalSignal.addEventListener("abort", onAbort, { once: true });
		});

	while (Date.now() < deadline) {
		if (externalSignal?.aborted) {
			return { cleared: false, reason: "aborted" };
		}

		const cleared = await (
			def ? def.isCleared(tab) : pollForCfClearanceCookie(tab)
		).catch(() => false);

		const now = Date.now();
		const elapsed = Math.floor((now - start) / 1000);

		if (cleared) {
			log(
				`[greedysearch] ✅ ${engine} challenge cleared after ${elapsed}s — auto-resuming extraction.`,
			);
			return { cleared: true, signal: def ? "dom-marker" : "cookie" };
		}

		// Periodic heartbeat so the user knows we're still polling. Driven by
		// time since the previous heartbeat rather than `elapsed % 30 === 0`:
		// probe latency makes the sampled seconds drift, so an exact-multiple
		// check is rarely satisfied and the heartbeat would be skipped.
		if (heartbeatMs > 0 && now - lastHeartbeatAt >= heartbeatMs) {
			lastHeartbeatAt = now;
			log(
				`[greedysearch] ⏳ Waiting for ${engine} challenge to clear (${elapsed}s/${Math.floor(timeoutMs / 1000)}s)...`,
			);
		}

		// Never sleep past the deadline.
		const remaining = deadline - Date.now();
		if (remaining <= 0) break;
		await pause(Math.min(intervalMs, remaining));
	}

	return {
		cleared: false,
		reason: `Challenge not cleared within ${Math.floor(timeoutMs / 1000)}s`,
	};
}
204
+
205
// Engine keys that have a dedicated challenge-cleared detector in
// ENGINE_SIGNALS, in declaration order.
export const CHALLENGE_ENGINES = Object.entries(ENGINE_SIGNALS).map(
	([engineKey]) => engineKey,
);