@apmantza/greedysearch-pi 1.9.2 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -2
- package/README.md +82 -47
- package/bin/cdp.mjs +1153 -1108
- package/bin/launch.mjs +9 -0
- package/bin/search.mjs +318 -81
- package/extractors/bing-copilot.mjs +48 -18
- package/extractors/chatgpt.mjs +553 -0
- package/extractors/common.mjs +213 -22
- package/extractors/consensus.mjs +655 -0
- package/extractors/consent.mjs +182 -18
- package/extractors/gemini.mjs +350 -217
- package/extractors/google-ai.mjs +129 -128
- package/extractors/logically.mjs +629 -0
- package/extractors/perplexity.mjs +547 -217
- package/extractors/selectors.mjs +3 -2
- package/extractors/semantic-scholar.mjs +219 -0
- package/package.json +8 -4
- package/skills/greedy-search/skill.md +20 -12
- package/src/fetcher.mjs +23 -1
- package/src/formatters/results.ts +185 -128
- package/src/search/browser-lifecycle.mjs +27 -5
- package/src/search/challenge-detect.mjs +205 -0
- package/src/search/chrome.mjs +653 -590
- package/src/search/constants.mjs +155 -39
- package/src/search/engines.mjs +114 -76
- package/src/search/fetch-source.mjs +566 -451
- package/src/search/pdf.mjs +68 -0
- package/src/search/progress.mjs +145 -0
- package/src/search/recovery.mjs +73 -45
- package/src/search/research.mjs +1419 -62
- package/src/search/scale-aware.mjs +93 -0
- package/src/search/simple-research.mjs +520 -0
- package/src/search/sources.mjs +52 -22
- package/src/search/synthesis-runner.mjs +105 -26
- package/src/search/synthesis.mjs +286 -246
- package/src/tools/greedy-search-handler.ts +129 -59
- package/src/tools/shared.ts +312 -186
- package/src/types.ts +110 -104
- package/test.mjs +537 -18
package/bin/launch.mjs
CHANGED
|
@@ -76,6 +76,15 @@ const BASE_CHROME_FLAGS = [
|
|
|
76
76
|
"--window-size=1920,1080",
|
|
77
77
|
"--lang=en-US",
|
|
78
78
|
"--force-color-profile=srgb",
|
|
79
|
+
// Background-tab throttling kills parallel extractions: Chrome clamps
|
|
80
|
+
// setTimeout to ~1Hz in unfocused tabs, so a streaming response that
|
|
81
|
+
// finishes in 5s solo takes 60s+ when 4 engines share one Chrome.
|
|
82
|
+
// The trio below restores full-speed JS in every tab. Safe for our
|
|
83
|
+
// anti-bot stealth — Cloudflare detects CDP/webdriver artifacts, not
|
|
84
|
+
// timer-throttling behavior. Same flags Playwright/Puppeteer add.
|
|
85
|
+
"--disable-background-timer-throttling",
|
|
86
|
+
"--disable-renderer-backgrounding",
|
|
87
|
+
"--disable-backgrounding-occluded-windows",
|
|
79
88
|
];
|
|
80
89
|
|
|
81
90
|
function getChromeVersion(chromePath) {
|
package/bin/search.mjs
CHANGED
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
// node search.mjs gem "latest React features"
|
|
21
21
|
// node search.mjs all "how does TCP congestion control work"
|
|
22
22
|
|
|
23
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
23
|
+
import { appendFileSync, existsSync, readFileSync } from "node:fs";
|
|
24
24
|
// Config file for user defaults
|
|
25
25
|
import { homedir } from "node:os";
|
|
26
26
|
import { join } from "node:path";
|
|
@@ -33,12 +33,18 @@ import {
|
|
|
33
33
|
openNewTab,
|
|
34
34
|
touchActivity,
|
|
35
35
|
} from "../src/search/chrome.mjs";
|
|
36
|
-
import {
|
|
36
|
+
import {
|
|
37
|
+
ALL_ENGINES,
|
|
38
|
+
ENGINES,
|
|
39
|
+
SYNTHESIZER,
|
|
40
|
+
VISIBLE_RECOVERY_LOG,
|
|
41
|
+
} from "../src/search/constants.mjs";
|
|
37
42
|
import { runExtractor } from "../src/search/engines.mjs";
|
|
38
43
|
import {
|
|
39
44
|
fetchMultipleSources,
|
|
40
45
|
fetchTopSource,
|
|
41
46
|
} from "../src/search/fetch-source.mjs";
|
|
47
|
+
import { waitForChallengeCleared } from "../src/search/challenge-detect.mjs";
|
|
42
48
|
import { writeSourcesToFiles } from "../src/search/file-sources.mjs";
|
|
43
49
|
import { writeOutput } from "../src/search/output.mjs";
|
|
44
50
|
import {
|
|
@@ -51,7 +57,11 @@ import {
|
|
|
51
57
|
mergeFetchDataIntoSources,
|
|
52
58
|
} from "../src/search/sources.mjs";
|
|
53
59
|
import { buildConfidence } from "../src/search/synthesis.mjs";
|
|
54
|
-
import {
|
|
60
|
+
import {
|
|
61
|
+
getSynthesisStartUrl,
|
|
62
|
+
normalizeSynthesizer,
|
|
63
|
+
synthesizeResults,
|
|
64
|
+
} from "../src/search/synthesis-runner.mjs";
|
|
55
65
|
import { normalizeQuery } from "../src/search/query.mjs";
|
|
56
66
|
import { runResearchMode } from "../src/search/research.mjs";
|
|
57
67
|
|
|
@@ -69,6 +79,18 @@ function loadUserConfig() {
|
|
|
69
79
|
return {};
|
|
70
80
|
}
|
|
71
81
|
|
|
82
|
+
function logVisibleRecovery(event) {
|
|
83
|
+
try {
|
|
84
|
+
appendFileSync(
|
|
85
|
+
VISIBLE_RECOVERY_LOG,
|
|
86
|
+
`${JSON.stringify({ at: new Date().toISOString(), ...event })}\n`,
|
|
87
|
+
"utf8",
|
|
88
|
+
);
|
|
89
|
+
} catch {
|
|
90
|
+
// Best-effort diagnostics only. Never fail a search because logging failed.
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
72
94
|
/** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
|
|
73
95
|
async function readStdin() {
|
|
74
96
|
return new Promise((resolve) => {
|
|
@@ -89,16 +111,20 @@ async function main() {
|
|
|
89
111
|
`${[
|
|
90
112
|
'Usage: node search.mjs <engine> "<query>"',
|
|
91
113
|
"",
|
|
92
|
-
"Engines: perplexity (p),
|
|
114
|
+
"Engines: all, perplexity (p), google (g), chatgpt (gpt), gemini (gem), semantic-scholar (s2), logically (log), bing (b)",
|
|
93
115
|
"",
|
|
94
116
|
"Flags:",
|
|
95
|
-
" --
|
|
96
|
-
" --
|
|
97
|
-
" --
|
|
117
|
+
" --synthesize For engine=all: synthesize fetched sources",
|
|
118
|
+
" --synthesizer <engine> Synthesis engine (default from ~/.pi/greedyconfig)",
|
|
119
|
+
" --fast Legacy quick mode: no source fetching or synthesis",
|
|
120
|
+
" --depth <mode> Legacy: fast|standard|deep aliases, or research",
|
|
121
|
+
" --deep-research Deprecated alias for --research",
|
|
98
122
|
" --research Iterative query/learnings loop (alias: --depth research)",
|
|
99
123
|
" --breadth <n> Research mode query breadth, 1-5 (default: 3)",
|
|
100
124
|
" --iterations <n> Research mode rounds, 1-3 (default: 2)",
|
|
101
125
|
" --max-sources <n> Research mode fetched source cap, 3-12",
|
|
126
|
+
" --research-out-dir <dir> Write research bundle to a specific directory",
|
|
127
|
+
" --no-research-bundle Disable the default .pi/greedysearch-research bundle",
|
|
102
128
|
" --fetch-top-source Fetch content from top source",
|
|
103
129
|
" --inline Output JSON to stdout (for piping)",
|
|
104
130
|
" --locale <lang> Force results language (en, de, fr, etc.)",
|
|
@@ -112,10 +138,11 @@ async function main() {
|
|
|
112
138
|
" GREEDY_SEARCH_LOCALE Default locale (default: en)",
|
|
113
139
|
"",
|
|
114
140
|
"Examples:",
|
|
115
|
-
' node search.mjs all "Node.js streams"
|
|
116
|
-
' node search.mjs all "
|
|
141
|
+
' node search.mjs all "Node.js streams" # Grounded: engines + fetched sources',
|
|
142
|
+
' node search.mjs all "Node.js streams" --synthesize # Add Gemini synthesis',
|
|
143
|
+
' node search.mjs all "quick check" --fast # Legacy fast: no sources/synthesis',
|
|
117
144
|
' node search.mjs all "browser automation" --research --breadth 3 --iterations 2',
|
|
118
|
-
' node search.mjs p "what is memoization"
|
|
145
|
+
' node search.mjs p "what is memoization" # Single engine search',
|
|
119
146
|
].join("\n")}\n`,
|
|
120
147
|
);
|
|
121
148
|
process.exit(1);
|
|
@@ -129,6 +156,11 @@ async function main() {
|
|
|
129
156
|
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
130
157
|
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
131
158
|
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
159
|
+
} else if (process.env.GREEDY_SEARCH_VISIBLE !== "1") {
|
|
160
|
+
// Establish the desired mode BEFORE ensureChrome() so a stale visible
|
|
161
|
+
// recovery browser is switched back to headless before research planning
|
|
162
|
+
// and Gemini synthesis tabs are opened.
|
|
163
|
+
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
132
164
|
}
|
|
133
165
|
|
|
134
166
|
await ensureChrome();
|
|
@@ -136,41 +168,44 @@ async function main() {
|
|
|
136
168
|
// Track activity for headless idle timeout
|
|
137
169
|
touchActivity();
|
|
138
170
|
|
|
139
|
-
// Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
|
|
140
171
|
const depthIdx = args.indexOf("--depth");
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
} else if (args.includes("--fast")) {
|
|
146
|
-
depth = "fast"; // Explicit fast mode requested
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
// For single engine (not "all"), default to fast unless explicit
|
|
172
|
+
const legacyDepth =
|
|
173
|
+
depthIdx !== -1 && args[depthIdx + 1]
|
|
174
|
+
? args[depthIdx + 1].toLowerCase()
|
|
175
|
+
: null;
|
|
150
176
|
const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
177
|
+
const researchMode =
|
|
178
|
+
args.includes("--research") ||
|
|
179
|
+
args.includes("--deep-research") ||
|
|
180
|
+
legacyDepth === "research";
|
|
181
|
+
const legacyFast = args.includes("--fast") || legacyDepth === "fast";
|
|
182
|
+
const legacySynthesisDepth =
|
|
183
|
+
legacyDepth === "standard" ||
|
|
184
|
+
legacyDepth === "deep" ||
|
|
185
|
+
args.includes("--deep");
|
|
186
|
+
const shouldFetchSources = engineArg === "all" && !legacyFast;
|
|
187
|
+
const shouldSynthesize =
|
|
188
|
+
engineArg === "all" &&
|
|
189
|
+
!legacyFast &&
|
|
190
|
+
(args.includes("--synthesize") || legacySynthesisDepth);
|
|
191
|
+
const groundedSynthesis = legacyDepth === "deep" || args.includes("--deep");
|
|
154
192
|
|
|
155
|
-
// --deep-research / --deep flags map to deep mode (backward compat)
|
|
156
193
|
if (args.includes("--deep-research")) {
|
|
157
|
-
depth = "standard";
|
|
158
194
|
process.stderr.write(
|
|
159
|
-
"[greedysearch] --deep-research is deprecated; use --
|
|
195
|
+
"[greedysearch] --deep-research is deprecated; use --research or --depth research\n",
|
|
160
196
|
);
|
|
161
197
|
}
|
|
162
|
-
if (
|
|
163
|
-
depth = "deep";
|
|
164
|
-
}
|
|
165
|
-
if (args.includes("--research")) {
|
|
166
|
-
depth = "research";
|
|
167
|
-
}
|
|
168
|
-
if (args.includes("--synthesize")) {
|
|
198
|
+
if (legacySynthesisDepth) {
|
|
169
199
|
process.stderr.write(
|
|
170
|
-
"[greedysearch]
|
|
200
|
+
"[greedysearch] depth fast|standard|deep is deprecated; use default grounded search plus --synthesize when needed\n",
|
|
171
201
|
);
|
|
172
202
|
}
|
|
173
203
|
|
|
204
|
+
const synthesizerIdx = args.indexOf("--synthesizer");
|
|
205
|
+
const synthesizer = normalizeSynthesizer(
|
|
206
|
+
synthesizerIdx === -1 ? SYNTHESIZER : args[synthesizerIdx + 1],
|
|
207
|
+
);
|
|
208
|
+
|
|
174
209
|
const full = args.includes("--full");
|
|
175
210
|
const short = !full;
|
|
176
211
|
const fetchSource = args.includes("--fetch-top-source");
|
|
@@ -183,9 +218,10 @@ async function main() {
|
|
|
183
218
|
iterationsIdx === -1 ? undefined : args[iterationsIdx + 1];
|
|
184
219
|
const researchMaxSources =
|
|
185
220
|
maxSourcesIdx === -1 ? undefined : args[maxSourcesIdx + 1];
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
221
|
+
const researchOutDirIdx = args.indexOf("--research-out-dir");
|
|
222
|
+
const researchOutDir =
|
|
223
|
+
researchOutDirIdx === -1 ? undefined : args[researchOutDirIdx + 1];
|
|
224
|
+
const writeResearchBundle = !args.includes("--no-research-bundle");
|
|
189
225
|
const outIdx = args.indexOf("--out");
|
|
190
226
|
const outFile = outIdx === -1 ? null : args[outIdx + 1];
|
|
191
227
|
|
|
@@ -218,18 +254,23 @@ async function main() {
|
|
|
218
254
|
a !== "--visible" &&
|
|
219
255
|
a !== "--always-visible" &&
|
|
220
256
|
a !== "--depth" &&
|
|
257
|
+
a !== "--synthesizer" &&
|
|
221
258
|
a !== "--out" &&
|
|
222
259
|
a !== "--locale" &&
|
|
223
260
|
a !== "--breadth" &&
|
|
224
261
|
a !== "--iterations" &&
|
|
225
262
|
a !== "--max-sources" &&
|
|
263
|
+
a !== "--research-out-dir" &&
|
|
264
|
+
a !== "--no-research-bundle" &&
|
|
226
265
|
a !== "--help" &&
|
|
227
266
|
(depthIdx === -1 || i !== depthIdx + 1) &&
|
|
267
|
+
(synthesizerIdx === -1 || i !== synthesizerIdx + 1) &&
|
|
228
268
|
(outIdx === -1 || i !== outIdx + 1) &&
|
|
229
269
|
(localeIdx === -1 || i !== localeIdx + 1) &&
|
|
230
270
|
(breadthIdx === -1 || i !== breadthIdx + 1) &&
|
|
231
271
|
(iterationsIdx === -1 || i !== iterationsIdx + 1) &&
|
|
232
|
-
(maxSourcesIdx === -1 || i !== maxSourcesIdx + 1)
|
|
272
|
+
(maxSourcesIdx === -1 || i !== maxSourcesIdx + 1) &&
|
|
273
|
+
(researchOutDirIdx === -1 || i !== researchOutDirIdx + 1),
|
|
233
274
|
);
|
|
234
275
|
const engine = rest[0]?.toLowerCase();
|
|
235
276
|
// Read query from stdin when --stdin flag is set (avoids leaking query in process table)
|
|
@@ -241,7 +282,7 @@ async function main() {
|
|
|
241
282
|
query = rest.slice(1).join(" ");
|
|
242
283
|
}
|
|
243
284
|
|
|
244
|
-
if (
|
|
285
|
+
if (researchMode) {
|
|
245
286
|
if (engine !== "all") {
|
|
246
287
|
process.stderr.write(
|
|
247
288
|
`[greedysearch] Research mode uses all engines; ignoring engine "${engine}".\n`,
|
|
@@ -254,6 +295,8 @@ async function main() {
|
|
|
254
295
|
maxSources: researchMaxSources,
|
|
255
296
|
locale,
|
|
256
297
|
short,
|
|
298
|
+
writeBundle: writeResearchBundle,
|
|
299
|
+
researchOutDir,
|
|
257
300
|
});
|
|
258
301
|
writeOutput(out, outFile, {
|
|
259
302
|
inline,
|
|
@@ -270,8 +313,11 @@ async function main() {
|
|
|
270
313
|
// engine homepage so extractors can skip the initial navigation.
|
|
271
314
|
const ENGINE_START_URLS = {
|
|
272
315
|
perplexity: "https://www.perplexity.ai/",
|
|
273
|
-
bing: "https://copilot.microsoft.com/",
|
|
274
316
|
google: "https://www.google.com/",
|
|
317
|
+
"semantic-scholar": "https://www.semanticscholar.org/",
|
|
318
|
+
semanticscholar: "https://www.semanticscholar.org/",
|
|
319
|
+
s2: "https://www.semanticscholar.org/",
|
|
320
|
+
logically: "https://logically.app/research-assistant/",
|
|
275
321
|
};
|
|
276
322
|
const engineTabs = await Promise.all(
|
|
277
323
|
ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
|
|
@@ -280,11 +326,10 @@ async function main() {
|
|
|
280
326
|
await cdp(["list"]);
|
|
281
327
|
|
|
282
328
|
// Time-bounded per-engine extraction so slow engines don't stall the batch.
|
|
283
|
-
// Bing can take a little longer than Google/Perplexity under CDP contention;
|
|
284
|
-
// keep fast mode bounded while avoiding most false recovery trips.
|
|
285
329
|
const engineTimeoutFor = (engineName) => {
|
|
286
|
-
if (
|
|
287
|
-
|
|
330
|
+
if (!legacyFast) return 70000;
|
|
331
|
+
// ChatGPT needs ~25-30s solo; under CDP contention needs more headroom
|
|
332
|
+
return engineName === "chatgpt" ? 60000 : 35000;
|
|
288
333
|
};
|
|
289
334
|
|
|
290
335
|
try {
|
|
@@ -316,7 +361,19 @@ async function main() {
|
|
|
316
361
|
if (r.status === "fulfilled") {
|
|
317
362
|
out[r.value.engine] = r.value;
|
|
318
363
|
} else {
|
|
319
|
-
|
|
364
|
+
const err = r.reason;
|
|
365
|
+
const msg = err?.message || "unknown error";
|
|
366
|
+
out[ALL_ENGINES[i]] = { error: msg };
|
|
367
|
+
if (err?.lastStage) {
|
|
368
|
+
process.stderr.write(
|
|
369
|
+
`[greedysearch] ${ALL_ENGINES[i]} failed at stage '${err.lastStage}': ${msg}\n`,
|
|
370
|
+
);
|
|
371
|
+
}
|
|
372
|
+
if (err?.partialErr) {
|
|
373
|
+
process.stderr.write(
|
|
374
|
+
`[greedysearch] ${ALL_ENGINES[i]} tail stderr:\n${err.partialErr}\n`,
|
|
375
|
+
);
|
|
376
|
+
}
|
|
320
377
|
}
|
|
321
378
|
}
|
|
322
379
|
|
|
@@ -331,6 +388,20 @@ async function main() {
|
|
|
331
388
|
recoveryCandidates.length > 0 &&
|
|
332
389
|
process.env.GREEDY_SEARCH_VISIBLE !== "1"
|
|
333
390
|
) {
|
|
391
|
+
logVisibleRecovery({
|
|
392
|
+
scope: "all",
|
|
393
|
+
phase: "start",
|
|
394
|
+
engines: recoveryCandidates,
|
|
395
|
+
reasons: Object.fromEntries(
|
|
396
|
+
recoveryCandidates.map((engineName) => [
|
|
397
|
+
engineName,
|
|
398
|
+
{
|
|
399
|
+
error: out[engineName]?.error || null,
|
|
400
|
+
envelope: out[engineName]?._envelope || null,
|
|
401
|
+
},
|
|
402
|
+
]),
|
|
403
|
+
),
|
|
404
|
+
});
|
|
334
405
|
process.stderr.write(
|
|
335
406
|
`[greedysearch] 🔓 Headless ${recoveryCandidates.join(", ")} search hit timeout/verification/antibot signals — retrying visible to establish cookies...\n`,
|
|
336
407
|
);
|
|
@@ -431,22 +502,94 @@ async function main() {
|
|
|
431
502
|
stillBlocked.push(...secondStillBlocked);
|
|
432
503
|
}
|
|
433
504
|
|
|
505
|
+
logVisibleRecovery({
|
|
506
|
+
scope: "all",
|
|
507
|
+
phase: stillBlocked.length > 0 ? "needs-human" : "success",
|
|
508
|
+
engines: recoveryCandidates,
|
|
509
|
+
results: Object.fromEntries(
|
|
510
|
+
recoveryCandidates.map((engineName) => [
|
|
511
|
+
engineName,
|
|
512
|
+
{
|
|
513
|
+
mode: out[engineName]?._envelope?.mode || null,
|
|
514
|
+
durationMs: out[engineName]?._envelope?.durationMs || null,
|
|
515
|
+
lastStage: out[engineName]?._envelope?.lastStage || null,
|
|
516
|
+
error: out[engineName]?.error || null,
|
|
517
|
+
},
|
|
518
|
+
]),
|
|
519
|
+
),
|
|
520
|
+
});
|
|
521
|
+
|
|
434
522
|
if (stillBlocked.length > 0) {
|
|
435
523
|
for (const blockedEngine of stillBlocked) {
|
|
436
524
|
process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
|
|
437
525
|
}
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
526
|
+
|
|
527
|
+
// Poll for the user to solve any remaining challenges in
|
|
528
|
+
// visible Chrome. If a per-engine challenge clears, retry
|
|
529
|
+
// that engine's extractor on the cleared tab. Fall back to
|
|
530
|
+
// the existing _needsHumanVerification envelope only if the
|
|
531
|
+
// polling budget is exhausted.
|
|
532
|
+
const allPollResults = await Promise.all(
|
|
533
|
+
stillBlocked.map(async (blockedEngine) => {
|
|
534
|
+
const tab =
|
|
535
|
+
retryTabs[recoveryCandidates.indexOf(blockedEngine)];
|
|
536
|
+
const result = await waitForChallengeCleared({
|
|
537
|
+
tab,
|
|
538
|
+
engine: blockedEngine,
|
|
539
|
+
}).catch((pollErr) => ({
|
|
540
|
+
cleared: false,
|
|
541
|
+
reason: pollErr.message || String(pollErr),
|
|
542
|
+
}));
|
|
543
|
+
return { engine: blockedEngine, tab, ...result };
|
|
544
|
+
}),
|
|
545
|
+
);
|
|
546
|
+
const clearedEngines = allPollResults.filter((p) => p.cleared);
|
|
547
|
+
if (clearedEngines.length > 0) {
|
|
548
|
+
process.stderr.write(
|
|
549
|
+
`[greedysearch] 🔄 Auto-resuming ${clearedEngines.map((p) => p.engine).join(", ")} on cleared tabs...\n`,
|
|
550
|
+
);
|
|
551
|
+
await Promise.allSettled(
|
|
552
|
+
clearedEngines.map(async (p) => {
|
|
553
|
+
const script = ENGINES[p.engine];
|
|
554
|
+
try {
|
|
555
|
+
const result = await runExtractor(
|
|
556
|
+
script,
|
|
557
|
+
query,
|
|
558
|
+
p.tab,
|
|
559
|
+
short,
|
|
560
|
+
null,
|
|
561
|
+
locale,
|
|
562
|
+
);
|
|
563
|
+
out[p.engine] = result;
|
|
564
|
+
process.stderr.write(`PROGRESS:${p.engine}:done\n`);
|
|
565
|
+
} catch (resumeErr) {
|
|
566
|
+
process.stderr.write(
|
|
567
|
+
`[greedysearch] ⚠️ Resume extraction failed for ${p.engine}: ${resumeErr.message}\n`,
|
|
568
|
+
);
|
|
569
|
+
}
|
|
570
|
+
}),
|
|
571
|
+
);
|
|
572
|
+
}
|
|
573
|
+
const stillStillBlocked = stillBlocked.filter(
|
|
574
|
+
(e) => !clearedEngines.find((p) => p.engine === e),
|
|
446
575
|
);
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
576
|
+
if (stillStillBlocked.length === 0) {
|
|
577
|
+
// All blocked engines cleared and resumed successfully
|
|
578
|
+
keepVisibleForHuman = false;
|
|
579
|
+
} else {
|
|
580
|
+
keepVisibleForHuman = true;
|
|
581
|
+
out._needsHumanVerification = {
|
|
582
|
+
engines: stillStillBlocked,
|
|
583
|
+
message:
|
|
584
|
+
"Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge in the visible window to store cookies. Cookies persist for future runs.",
|
|
585
|
+
};
|
|
586
|
+
process.stderr.write(
|
|
587
|
+
`[greedysearch] 🔓 ${stillStillBlocked.join(", ")} still blocked — keeping visible Chrome open. Solve the challenge in the window to store cookies, then rerun.\n`,
|
|
588
|
+
);
|
|
589
|
+
// Visible Chrome stays open so the user can interact with any
|
|
590
|
+
// Turnstile/Cloudflare challenge. Once solved, cookies are stored
|
|
591
|
+
// in the shared profile and future headless runs will reuse them.
|
|
592
|
+
}
|
|
450
593
|
}
|
|
451
594
|
} finally {
|
|
452
595
|
if (keepVisibleForHuman) {
|
|
@@ -488,18 +631,9 @@ async function main() {
|
|
|
488
631
|
// Build a canonical source registry across all engines
|
|
489
632
|
out._sources = buildSourceRegistry(out, query);
|
|
490
633
|
|
|
491
|
-
// Pre-navigate Gemini tab in parallel with source fetch so the page
|
|
492
|
-
// is already loaded when synthesis starts — saves ~4s of nav time.
|
|
493
|
-
let geminiTabPromise = null;
|
|
494
|
-
if (depth !== "fast") {
|
|
495
|
-
geminiTabPromise = openNewTab("https://gemini.google.com/app").catch(
|
|
496
|
-
() => null,
|
|
497
|
-
);
|
|
498
|
-
}
|
|
499
|
-
|
|
500
634
|
// Source fetching: default for all "all" searches
|
|
501
635
|
// Fetch all sources in a single batch (concurrency = source count).
|
|
502
|
-
if (
|
|
636
|
+
if (shouldFetchSources && out._sources.length > 0) {
|
|
503
637
|
process.stderr.write("PROGRESS:source-fetch:start\n");
|
|
504
638
|
const fetchedSources = await fetchMultipleSources(
|
|
505
639
|
out._sources,
|
|
@@ -512,29 +646,41 @@ async function main() {
|
|
|
512
646
|
process.stderr.write("PROGRESS:source-fetch:done\n");
|
|
513
647
|
}
|
|
514
648
|
|
|
515
|
-
//
|
|
516
|
-
|
|
649
|
+
// Optional engine-agnostic synthesis for multi-engine searches.
|
|
650
|
+
// Open the synthesizer tab HERE (after source fetch) instead of
|
|
651
|
+
// pre-opening before source fetch. Pre-opening was fragile: Chrome could
|
|
652
|
+
// be killed during visible recovery or idle-timeout between source fetch
|
|
653
|
+
// and synthesis, leaving a stale tab ID that causes "No target matching prefix".
|
|
654
|
+
if (shouldSynthesize) {
|
|
517
655
|
process.stderr.write("PROGRESS:synthesis:start\n");
|
|
518
656
|
process.stderr.write(
|
|
519
|
-
|
|
657
|
+
`[greedysearch] Synthesizing results with ${synthesizer}...\n`,
|
|
520
658
|
);
|
|
659
|
+
let synthesisTab = null;
|
|
521
660
|
try {
|
|
522
|
-
|
|
523
|
-
const synthesis = await
|
|
524
|
-
grounded:
|
|
525
|
-
tabPrefix:
|
|
661
|
+
synthesisTab = await openNewTab(getSynthesisStartUrl(synthesizer));
|
|
662
|
+
const synthesis = await synthesizeResults(query, out, {
|
|
663
|
+
grounded: groundedSynthesis,
|
|
664
|
+
tabPrefix: synthesisTab,
|
|
665
|
+
visible: process.env.GREEDY_SEARCH_VISIBLE === "1",
|
|
666
|
+
synthesizer,
|
|
526
667
|
});
|
|
527
668
|
out._synthesis = {
|
|
528
669
|
...synthesis,
|
|
529
670
|
synthesized: true,
|
|
530
671
|
};
|
|
531
|
-
await closeTab(geminiTab);
|
|
532
672
|
process.stderr.write("PROGRESS:synthesis:done\n");
|
|
533
673
|
} catch (e) {
|
|
534
674
|
process.stderr.write(
|
|
535
675
|
`[greedysearch] Synthesis failed: ${e.message}\n`,
|
|
536
676
|
);
|
|
537
|
-
out._synthesis = {
|
|
677
|
+
out._synthesis = {
|
|
678
|
+
error: e.message,
|
|
679
|
+
synthesized: false,
|
|
680
|
+
synthesizedBy: synthesizer,
|
|
681
|
+
};
|
|
682
|
+
} finally {
|
|
683
|
+
if (synthesisTab) await closeTab(synthesisTab);
|
|
538
684
|
}
|
|
539
685
|
}
|
|
540
686
|
|
|
@@ -544,12 +690,12 @@ async function main() {
|
|
|
544
690
|
out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
|
|
545
691
|
}
|
|
546
692
|
|
|
547
|
-
//
|
|
548
|
-
if (
|
|
693
|
+
// Include confidence metrics for grounded multi-engine searches.
|
|
694
|
+
if (!legacyFast) out._confidence = buildConfidence(out);
|
|
549
695
|
|
|
550
696
|
writeOutput(out, outFile, {
|
|
551
697
|
inline,
|
|
552
|
-
synthesize:
|
|
698
|
+
synthesize: shouldSynthesize,
|
|
553
699
|
query,
|
|
554
700
|
});
|
|
555
701
|
return;
|
|
@@ -585,13 +731,31 @@ async function main() {
|
|
|
585
731
|
? "bing"
|
|
586
732
|
: script.includes("perplexity")
|
|
587
733
|
? "perplexity"
|
|
588
|
-
:
|
|
734
|
+
: script.includes("chatgpt")
|
|
735
|
+
? "chatgpt"
|
|
736
|
+
: script.includes("semantic-scholar")
|
|
737
|
+
? "semantic-scholar"
|
|
738
|
+
: script.includes("logically")
|
|
739
|
+
? "logically"
|
|
740
|
+
: null;
|
|
589
741
|
const canRetryVisible =
|
|
590
742
|
recoveryEngine &&
|
|
591
743
|
process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
|
|
592
744
|
isHeadlessBlockedResult(e);
|
|
593
745
|
|
|
594
746
|
if (canRetryVisible) {
|
|
747
|
+
logVisibleRecovery({
|
|
748
|
+
scope: "single",
|
|
749
|
+
phase: "start",
|
|
750
|
+
engines: [recoveryEngine],
|
|
751
|
+
reasons: {
|
|
752
|
+
[recoveryEngine]: {
|
|
753
|
+
error: e.message || null,
|
|
754
|
+
envelope: e.envelope || null,
|
|
755
|
+
lastStage: e.lastStage || null,
|
|
756
|
+
},
|
|
757
|
+
},
|
|
758
|
+
});
|
|
595
759
|
process.stderr.write(
|
|
596
760
|
`[greedysearch] 🔓 ${recoveryEngine} blocked in headless — retrying visible to establish cookies...\n`,
|
|
597
761
|
);
|
|
@@ -612,14 +776,87 @@ async function main() {
|
|
|
612
776
|
null,
|
|
613
777
|
locale,
|
|
614
778
|
);
|
|
779
|
+
logVisibleRecovery({
|
|
780
|
+
scope: "single",
|
|
781
|
+
phase: "success",
|
|
782
|
+
engines: [recoveryEngine],
|
|
783
|
+
result: {
|
|
784
|
+
engine: recoveryEngine,
|
|
785
|
+
mode: result._envelope?.mode || null,
|
|
786
|
+
durationMs: result._envelope?.durationMs || null,
|
|
787
|
+
lastStage: result._envelope?.lastStage || null,
|
|
788
|
+
},
|
|
789
|
+
});
|
|
615
790
|
if (fetchSource && result.sources?.length > 0) {
|
|
616
791
|
result.topSource = await fetchTopSource(result.sources[0].url);
|
|
617
792
|
}
|
|
618
793
|
writeOutput(result, outFile, { inline, synthesize: false, query });
|
|
619
794
|
return;
|
|
620
795
|
} catch (retryErr) {
|
|
621
|
-
|
|
622
|
-
|
|
796
|
+
logVisibleRecovery({
|
|
797
|
+
scope: "single",
|
|
798
|
+
phase: "needs-human",
|
|
799
|
+
engines: [recoveryEngine],
|
|
800
|
+
result: {
|
|
801
|
+
engine: recoveryEngine,
|
|
802
|
+
error: retryErr.message || String(retryErr),
|
|
803
|
+
envelope: retryErr.envelope || null,
|
|
804
|
+
},
|
|
805
|
+
});
|
|
806
|
+
// Any visible retry failure: poll for the user to solve the challenge in
|
|
807
|
+
// visible Chrome. If the page transitions past the challenge (cookies
|
|
808
|
+
// cleared, chat UI rendered, Turnstile iframe gone), automatically retry
|
|
809
|
+
// the extractor so the user does not need to rerun manually. Fall back
|
|
810
|
+
// to the existing _needsHumanVerification envelope only if the polling
|
|
811
|
+
// budget is exhausted.
|
|
812
|
+
const pollResult = await waitForChallengeCleared({
|
|
813
|
+
tab: retryTab,
|
|
814
|
+
engine: recoveryEngine,
|
|
815
|
+
}).catch((pollErr) => ({
|
|
816
|
+
cleared: false,
|
|
817
|
+
reason: pollErr.message || String(pollErr),
|
|
818
|
+
}));
|
|
819
|
+
|
|
820
|
+
if (pollResult.cleared) {
|
|
821
|
+
process.stderr.write(
|
|
822
|
+
`[greedysearch] 🔄 Auto-resuming ${recoveryEngine} extraction on the now-cleared tab...\n`,
|
|
823
|
+
);
|
|
824
|
+
try {
|
|
825
|
+
const result = await runExtractor(
|
|
826
|
+
script,
|
|
827
|
+
query,
|
|
828
|
+
retryTab,
|
|
829
|
+
short,
|
|
830
|
+
null,
|
|
831
|
+
locale,
|
|
832
|
+
);
|
|
833
|
+
logVisibleRecovery({
|
|
834
|
+
scope: "single",
|
|
835
|
+
phase: "success-after-poll",
|
|
836
|
+
engines: [recoveryEngine],
|
|
837
|
+
result: {
|
|
838
|
+
engine: recoveryEngine,
|
|
839
|
+
mode: result._envelope?.mode || null,
|
|
840
|
+
durationMs: result._envelope?.durationMs || null,
|
|
841
|
+
lastStage: result._envelope?.lastStage || null,
|
|
842
|
+
},
|
|
843
|
+
});
|
|
844
|
+
if (fetchSource && result.sources?.length > 0) {
|
|
845
|
+
result.topSource = await fetchTopSource(result.sources[0].url);
|
|
846
|
+
}
|
|
847
|
+
writeOutput(result, outFile, { inline, synthesize: false, query });
|
|
848
|
+
return;
|
|
849
|
+
} catch (resumeErr) {
|
|
850
|
+
process.stderr.write(
|
|
851
|
+
`[greedysearch] ⚠️ Resume extraction failed: ${resumeErr.message}\n`,
|
|
852
|
+
);
|
|
853
|
+
// Fall through to needs-human with the resume error context
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
// Polling timed out (or resume extraction failed) — keep Chrome open so the
|
|
858
|
+
// user can solve Turnstile. Once solved, cookies are stored in the shared
|
|
859
|
+
// profile for future headless runs.
|
|
623
860
|
keepVisibleForHuman = true;
|
|
624
861
|
writeOutput(
|
|
625
862
|
{
|