@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
|
@@ -116,13 +116,35 @@ function getProcessCommandLine(pid) {
|
|
|
116
116
|
* @param {number} debugPort - expected debug port
|
|
117
117
|
* @returns {boolean}
|
|
118
118
|
*/
|
|
119
|
-
export function
|
|
120
|
-
|
|
119
|
+
export function commandLineMatchesGreedyChrome(
  cmdLine,
  tempDir,
  debugPort = GREEDY_PORT,
) {
  if (!cmdLine) return false;

  // Windows may report Chrome command lines with backslashes while the shared
  // GREEDY_PROFILE_DIR constant is normalized to forward slashes. Compare a
  // normalized form so child processes do not misclassify a live GreedySearch
  // Chrome as a ghost and kill it during cleanupStaleSessions().
  const normalize = (value) =>
    String(value || "")
      .replaceAll("\\", "/")
      .toLowerCase();
  const normalizedCmdLine = normalize(cmdLine);
  const normalizedTempDir = normalize(tempDir);

  // A plain substring test would accept "--remote-debugging-port=92220" when
  // the expected port is 9222. Require that the character following the port
  // is not another digit (end of string, whitespace, quote, ... are all fine).
  const portFlag = `--remote-debugging-port=${debugPort}`;
  let hasExactPortFlag = false;
  for (
    let at = normalizedCmdLine.indexOf(portFlag);
    at !== -1;
    at = normalizedCmdLine.indexOf(portFlag, at + 1)
  ) {
    const following = normalizedCmdLine.charAt(at + portFlag.length);
    if (!/\d/.test(following)) {
      hasExactPortFlag = true;
      break;
    }
  }

  // "--type=" marks Chrome helper processes (renderer, gpu, ...); only the
  // main browser process, which has no --type flag, should match.
  return (
    normalizedCmdLine.includes(normalizedTempDir) &&
    hasExactPortFlag &&
    !normalizedCmdLine.includes("--type=")
  );
}
|
|
142
|
+
|
|
143
|
+
/**
 * Check whether the process with the given pid looks like the GreedySearch
 * Chrome instance, by reading its command line and handing it to
 * commandLineMatchesGreedyChrome().
 *
 * @param {number} pid - process id to inspect
 * @param {string} tempDir - profile directory expected on the command line
 * @param {number} debugPort - expected debug port
 * @returns {boolean}
 */
export function verifyBrowserProcess(pid, tempDir, debugPort = GREEDY_PORT) {
  const processCmdLine = getProcessCommandLine(pid);
  const isGreedyChrome = commandLineMatchesGreedyChrome(
    processCmdLine,
    tempDir,
    debugPort,
  );
  return isGreedyChrome;
}
|
|
128
150
|
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
// src/search/challenge-detect.mjs — Detect when a Cloudflare/Turnstile/captcha
|
|
2
|
+
// challenge has been solved so the extractor can auto-resume.
|
|
3
|
+
//
|
|
4
|
+
// Polls page state (URL, title, DOM markers, cookie presence) instead of waiting
|
|
5
|
+
// for a hard timeout. Resolves once the engine-specific "challenge cleared"
|
|
6
|
+
// signal is observed, or resolves with { cleared: false, reason } if the polling budget is
|
|
7
|
+
// exhausted before any progress.
|
|
8
|
+
//
|
|
9
|
+
// Usage:
|
|
10
|
+
// const { cleared } = await waitForChallengeCleared({ tab, engine: "chatgpt", timeoutMs: 300000 });
|
|
11
|
+
// if (!cleared) emit _needsHumanVerification; else re-run extractor.
|
|
12
|
+
|
|
13
|
+
import { cdp } from "../../extractors/common.mjs";
|
|
14
|
+
|
|
15
|
+
// Default polling budget when GREEDY_SEARCH_CHALLENGE_WAIT_MS is unset or unusable.
const FALLBACK_CHALLENGE_WAIT_MS = 300000; // 5 minutes

/**
 * Turn the raw GREEDY_SEARCH_CHALLENGE_WAIT_MS value into a millisecond budget.
 *
 * An unset or empty value yields the 5-minute default. A value that does not
 * parse as an integer also yields the default: a NaN timeout would make the
 * polling loop's `elapsed < timeoutMs` check false on the first iteration and
 * silently disable challenge waiting.
 *
 * @param {string | undefined} raw - raw environment variable value
 * @returns {number} timeout in milliseconds
 */
function parseChallengeWaitMs(raw) {
  const parsed = Number.parseInt(
    raw || String(FALLBACK_CHALLENGE_WAIT_MS),
    10,
  );
  return Number.isNaN(parsed) ? FALLBACK_CHALLENGE_WAIT_MS : parsed;
}

const DEFAULT_TIMEOUT_MS = parseChallengeWaitMs(
  process.env.GREEDY_SEARCH_CHALLENGE_WAIT_MS,
);
// Delay between two consecutive page-state probes.
const POLL_INTERVAL_MS = 3000;
|
|
20
|
+
|
|
21
|
+
// Page titles shown while the chatgpt.com challenge interstitial is still up.
const CHATGPT_PENDING_TITLE_RE =
  /περιμένετε|please wait|just a moment|verifying|checking/i;

// In-page script reporting the chatgpt.com state as a JSON string.
const CHATGPT_PROBE_JS = `(() => {
          const title = document.title;
          const onChatGPT = location.hostname === "chatgpt.com";
          const hasProseMirror = !!document.querySelector("div.ProseMirror");
          const hasTurnstileInput =
            !!document.querySelector("input[name=\\"cf-turnstile-response\\"]") ||
            !!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]");
          // Body innerText is empty while on the Turnstile page.
          const bodyText = (document.body && document.body.innerText) || "";
          return JSON.stringify({
            title,
            url: location.href,
            hasProseMirror,
            hasTurnstileInput,
            bodyLen: bodyText.length,
            onChatGPT,
          });
        })()`;

// In-page script reporting the Copilot state as a JSON string.
const BING_PROBE_JS = `(() => {
          const url = location.href;
          const title = document.title;
          const onCopilot = /copilot\\.microsoft\\.com/.test(location.hostname);
          const onChallenge =
            /challenge|turnstile|cdn-cgi\\/challenge/i.test(url) ||
            /verify|human|robot/i.test(title);
          const hasTextarea =
            !!document.querySelector("textarea") ||
            !!document.querySelector("div[contenteditable=\\"true\\"]");
          const hasTurnstileInput =
            !!document.querySelector("iframe[id^=\\"cf-chl-widget-\\"]") ||
            !!document.querySelector("input[name=\\"cf-turnstile-response\\"]");
          const bodyText = (document.body && document.body.innerText) || "";
          return JSON.stringify({
            url,
            title,
            onCopilot,
            onChallenge,
            hasTextarea,
            hasTurnstileInput,
            bodyLen: bodyText.length,
          });
        })()`;

/**
 * Evaluate `expression` in `tab` through cdp and decode the JSON text it
 * returns. Yields `undefined` when the eval rejects, produces a falsy result,
 * or produces text that is not valid JSON.
 */
async function readJsonProbe(tab, expression) {
  const raw = await cdp(["eval", tab, expression]).catch(() => null);
  if (!raw) return undefined;
  try {
    return JSON.parse(raw);
  } catch {
    return undefined;
  }
}

/**
 * Per-engine "challenge cleared" detectors. Each entry exposes
 * `isCleared(tab)`, which probes the page once and resolves to a truthy value
 * when the challenge appears to be gone.
 */
const ENGINE_SIGNALS = {
  chatgpt: {
    name: "chatgpt",
    // Cleared when the page is on chatgpt.com, the title is not one of the
    // waiting/verifying placeholders, no Turnstile marker is present, and
    // either the ProseMirror editor exists or the body has more than 50
    // characters of text.
    isCleared: async (tab) => {
      const state = await readJsonProbe(tab, CHATGPT_PROBE_JS);
      if (state === undefined) return false;
      const stillChallenged =
        !state.onChatGPT ||
        (state.title && CHATGPT_PENDING_TITLE_RE.test(state.title)) ||
        state.hasTurnstileInput;
      if (stillChallenged) return false;
      return state.hasProseMirror || state.bodyLen > 50;
    },
  },
  bing: {
    name: "bing",
    // Cleared when the page is on copilot.microsoft.com, neither the URL nor
    // the title looks like a challenge, no Turnstile marker is present, and
    // either a text input exists or the body has more than 50 characters.
    isCleared: async (tab) => {
      const state = await readJsonProbe(tab, BING_PROBE_JS);
      if (state === undefined) return false;
      const stillChallenged =
        !state.onCopilot || state.onChallenge || state.hasTurnstileInput;
      if (stillChallenged) return false;
      return state.hasTextarea || state.bodyLen > 50;
    },
  },
};
|
|
123
|
+
|
|
124
|
+
/**
 * Generic fallback probe for engines without an ENGINE_SIGNALS entry: runs a
 * single in-page check of `document.cookie` for a `cf_clearance` or `__cf_bm`
 * cookie.
 *
 * NOTE(review): `document.cookie` does not expose HttpOnly cookies — confirm
 * that the cookies checked here are actually visible to page script.
 *
 * @param {*} tab - tab handle forwarded to cdp
 * @returns {Promise<boolean>} truthy when either cookie is visible; false when
 *   the probe fails or its output cannot be parsed
 */
async function pollForCfClearanceCookie(tab) {
  const cookieProbeJs = `(() => {
      const cookies = document.cookie || "";
      return JSON.stringify({
        hasCfClearance: /(?:^|;\\s*)cf_clearance=/.test(cookies),
        hasCfBm: /(?:^|;\\s*)__cf_bm=/.test(cookies),
        cookiesLength: cookies.length,
      });
    })()`;

  const raw = await cdp(["eval", tab, cookieProbeJs]).catch(() => null);
  if (raw) {
    try {
      const { hasCfClearance, hasCfBm } = JSON.parse(raw);
      return hasCfClearance || hasCfBm;
    } catch {
      // Unparseable probe output counts as "not cleared".
      return false;
    }
  }
  return false;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Sleep for `ms` milliseconds, waking early if `signal` fires "abort".
 * Always resolves (never rejects) and cleans up its timer and listener.
 */
function sleepUnlessAborted(ms, signal) {
  return new Promise((resolve) => {
    const finish = () => {
      clearTimeout(timer);
      signal?.removeEventListener?.("abort", finish);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    signal?.addEventListener?.("abort", finish, { once: true });
  });
}

/**
 * Poll page state until a Cloudflare/Turnstile challenge is cleared.
 *
 * Uses the engine-specific detector from ENGINE_SIGNALS when one exists for
 * `engine`; otherwise falls back to the cookie probe.
 *
 * @param {object} options
 * @param {*} options.tab - tab handle forwarded to the probes
 * @param {string} options.engine - engine name used to pick a detector
 * @param {number} [options.timeoutMs] - total polling budget in milliseconds
 * @param {number} [options.intervalMs] - delay between probes in milliseconds
 * @param {{aborted: boolean}} [options.signal] - optional abort signal; checked
 *   before every probe and, when it supports events, also interrupts the sleep
 * @param {(message: string) => void} [options.log] - sink for progress messages
 * @param {number} [options.heartbeatMs] - minimum time between "still waiting"
 *   messages (default 30 seconds)
 *
 * Returns:
 *   { cleared: true,  signal: "..." }   — challenge cleared; safe to re-extract.
 *   { cleared: false, reason: "..." }   — aborted or timed out.
 */
export async function waitForChallengeCleared({
  tab,
  engine,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  intervalMs = POLL_INTERVAL_MS,
  signal: externalSignal,
  log = () => {},
  heartbeatMs = 30000,
}) {
  // Own-property lookup so names like "constructor" or "toString" do not
  // resolve to Object.prototype members and get treated as detectors.
  const def = Object.prototype.hasOwnProperty.call(ENGINE_SIGNALS, engine)
    ? ENGINE_SIGNALS[engine]
    : undefined;
  const start = Date.now();
  let lastHeartbeatAt = start;

  while (Date.now() - start < timeoutMs) {
    if (externalSignal?.aborted) {
      return { cleared: false, reason: "aborted" };
    }
    const elapsed = Math.floor((Date.now() - start) / 1000);

    const cleared = def
      ? await def.isCleared(tab).catch(() => false)
      : await pollForCfClearanceCookie(tab).catch(() => false);
    if (cleared) {
      log(
        `[greedysearch] ✅ ${engine} challenge cleared after ${elapsed}s — auto-resuming extraction.`,
      );
      return { cleared: true, signal: def ? "dom-marker" : "cookie" };
    }

    // Periodic heartbeat so the user knows we're still polling. Driven by
    // time since the previous heartbeat rather than "elapsed % 30 === 0",
    // which is skipped whenever no poll happens to land on an exact
    // multiple-of-30 second (probe latency drift, intervals that do not
    // divide 30).
    const now = Date.now();
    if (now - lastHeartbeatAt >= heartbeatMs) {
      lastHeartbeatAt = now;
      log(
        `[greedysearch] ⏳ Waiting for ${engine} challenge to clear (${Math.floor((now - start) / 1000)}s/${Math.floor(timeoutMs / 1000)}s)...`,
      );
    }

    // Wake early on abort so cancellation does not wait out a full interval.
    await sleepUnlessAborted(intervalMs, externalSignal);
  }

  return {
    cleared: false,
    reason: `Challenge not cleared within ${Math.floor(timeoutMs / 1000)}s`,
  };
}
|
|
204
|
+
|
|
205
|
+
// Names of the engines that have a dedicated entry in ENGINE_SIGNALS.
export const CHALLENGE_ENGINES = Object.keys(ENGINE_SIGNALS);
|