@apmantza/greedysearch-pi 1.8.7 → 1.8.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/README.md +5 -4
- package/bin/search.mjs +668 -623
- package/extractors/bing-aria.mjs +539 -0
- package/extractors/bing-copilot.mjs +1 -1
- package/extractors/common.mjs +561 -529
- package/extractors/gemini.mjs +150 -150
- package/extractors/selectors.mjs +54 -54
- package/package.json +1 -1
- package/skills/greedy-search/skill.md +26 -53
- package/src/fetcher.mjs +652 -652
- package/src/search/browser-lifecycle.mjs +615 -0
- package/src/search/chrome.mjs +529 -449
- package/src/search/constants.mjs +44 -43
- package/src/search/engines.mjs +3 -2
- package/src/search/sources.mjs +5 -1
- package/src/search/synthesis.mjs +235 -223
- package/src/utils/content.mjs +5 -1
- package/src/utils/system-cmds.mjs +101 -0
package/bin/search.mjs
CHANGED
|
@@ -1,623 +1,668 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// search.mjs - unified CLI for GreedySearch extractors
|
|
4
|
-
//
|
|
5
|
-
// Usage:
|
|
6
|
-
// node search.mjs <engine> "<query>"
|
|
7
|
-
// node search.mjs all "<query>"
|
|
8
|
-
//
|
|
9
|
-
// Engines:
|
|
10
|
-
// perplexity | pplx | p
|
|
11
|
-
// bing | copilot | b
|
|
12
|
-
// google | g
|
|
13
|
-
// gemini | gem
|
|
14
|
-
// all - fan-out to all engines in parallel
|
|
15
|
-
//
|
|
16
|
-
// Output: JSON to stdout, errors to stderr
|
|
17
|
-
//
|
|
18
|
-
// Examples:
|
|
19
|
-
// node search.mjs p "what is memoization"
|
|
20
|
-
// node search.mjs gem "latest React features"
|
|
21
|
-
// node search.mjs all "how does TCP congestion control work"
|
|
22
|
-
|
|
23
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
24
|
-
// Config file for user defaults
|
|
25
|
-
import { homedir } from "node:os";
|
|
26
|
-
import { join } from "node:path";
|
|
27
|
-
import {
|
|
28
|
-
activateTab,
|
|
29
|
-
cdp,
|
|
30
|
-
closeTab,
|
|
31
|
-
closeTabs,
|
|
32
|
-
ensureChrome,
|
|
33
|
-
killHeadlessChrome,
|
|
34
|
-
openNewTab,
|
|
35
|
-
touchActivity,
|
|
36
|
-
} from "../src/search/chrome.mjs";
|
|
37
|
-
import { ALL_ENGINES, ENGINES } from "../src/search/constants.mjs";
|
|
38
|
-
import { runExtractor } from "../src/search/engines.mjs";
|
|
39
|
-
import {
|
|
40
|
-
fetchMultipleSources,
|
|
41
|
-
fetchTopSource,
|
|
42
|
-
} from "../src/search/fetch-source.mjs";
|
|
43
|
-
import { writeOutput } from "../src/search/output.mjs";
|
|
44
|
-
import {
|
|
45
|
-
findHeadlessBlockedEngines,
|
|
46
|
-
isHeadlessBlockedError,
|
|
47
|
-
isManualVerificationError,
|
|
48
|
-
} from "../src/search/recovery.mjs";
|
|
49
|
-
import {
|
|
50
|
-
buildSourceRegistry,
|
|
51
|
-
mergeFetchDataIntoSources,
|
|
52
|
-
} from "../src/search/sources.mjs";
|
|
53
|
-
import { buildConfidence } from "../src/search/synthesis.mjs";
|
|
54
|
-
import { synthesizeWithGemini } from "../src/search/synthesis-runner.mjs";
|
|
55
|
-
|
|
56
|
-
const CONFIG_DIR = join(homedir(), ".config", "greedysearch");
|
|
57
|
-
const CONFIG_FILE = join(CONFIG_DIR, "config.json");
|
|
58
|
-
|
|
59
|
-
function loadUserConfig() {
|
|
60
|
-
try {
|
|
61
|
-
if (existsSync(CONFIG_FILE)) {
|
|
62
|
-
return JSON.parse(readFileSync(CONFIG_FILE, "utf8"));
|
|
63
|
-
}
|
|
64
|
-
} catch {
|
|
65
|
-
// Ignore errors
|
|
66
|
-
}
|
|
67
|
-
return {};
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
/** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
|
|
71
|
-
async function readStdin() {
|
|
72
|
-
return new Promise((resolve) => {
|
|
73
|
-
let data = "";
|
|
74
|
-
process.stdin.setEncoding("utf8");
|
|
75
|
-
process.stdin.on("data", (chunk) => (data += chunk));
|
|
76
|
-
process.stdin.on("end", () => resolve(data.trim()));
|
|
77
|
-
if (process.stdin.isTTY) resolve("");
|
|
78
|
-
});
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// ─── Main ──────────────────────────────────────────────────────────────────
|
|
82
|
-
|
|
83
|
-
async function main() {
|
|
84
|
-
const args = process.argv.slice(2);
|
|
85
|
-
if (args.length < 2 || args[0] === "--help") {
|
|
86
|
-
process.stderr.write(
|
|
87
|
-
`${[
|
|
88
|
-
'Usage: node search.mjs <engine> "<query>"',
|
|
89
|
-
"",
|
|
90
|
-
"Engines: perplexity (p), bing (b), google (g), gemini (gem), all",
|
|
91
|
-
"",
|
|
92
|
-
"Flags:",
|
|
93
|
-
" --fast Quick mode: no source fetching or synthesis",
|
|
94
|
-
" --synthesize Deprecated: synthesis is now default for multi-engine",
|
|
95
|
-
" --deep-research Deprecated: source fetching is now default",
|
|
96
|
-
" --fetch-top-source Fetch content from top source",
|
|
97
|
-
" --inline Output JSON to stdout (for piping)",
|
|
98
|
-
" --locale <lang> Force results language (en, de, fr, etc.)",
|
|
99
|
-
" --visible Always use visible Chrome for this search",
|
|
100
|
-
" --always-visible Alias for --visible",
|
|
101
|
-
" --stdin Read query from stdin (avoids command-line leakage)",
|
|
102
|
-
"",
|
|
103
|
-
"Environment:",
|
|
104
|
-
" GREEDY_SEARCH_VISIBLE Set to 1 to show Chrome window (disables headless)",
|
|
105
|
-
" GREEDY_SEARCH_ALWAYS_VISIBLE Set to 1 to force visible mode for all runs",
|
|
106
|
-
" GREEDY_SEARCH_LOCALE Default locale (default: en)",
|
|
107
|
-
"",
|
|
108
|
-
"Examples:",
|
|
109
|
-
' node search.mjs all "Node.js streams" # Default: sources + synthesis',
|
|
110
|
-
' node search.mjs all "quick check" --fast # Fast: no sources/synthesis',
|
|
111
|
-
' node search.mjs p "what is memoization" # Single engine: fast mode',
|
|
112
|
-
].join("\n")}\n`,
|
|
113
|
-
);
|
|
114
|
-
process.exit(1);
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
const alwaysVisible =
|
|
118
|
-
args.includes("--visible") ||
|
|
119
|
-
args.includes("--always-visible") ||
|
|
120
|
-
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE === "1";
|
|
121
|
-
if (alwaysVisible) {
|
|
122
|
-
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
123
|
-
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
124
|
-
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
await ensureChrome();
|
|
128
|
-
|
|
129
|
-
// Track activity for headless idle timeout
|
|
130
|
-
touchActivity();
|
|
131
|
-
|
|
132
|
-
// Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
|
|
133
|
-
const depthIdx = args.indexOf("--depth");
|
|
134
|
-
let depth = "standard"; // DEFAULT: synthesis + source fetch
|
|
135
|
-
|
|
136
|
-
if (depthIdx !== -1 && args[depthIdx + 1]) {
|
|
137
|
-
depth = args[depthIdx + 1];
|
|
138
|
-
} else if (args.includes("--fast")) {
|
|
139
|
-
depth = "fast"; // Explicit fast mode requested
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
// For single engine (not "all"), default to fast unless explicit
|
|
143
|
-
const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
|
|
144
|
-
if (engineArg !== "all" && depthIdx === -1 && !args.includes("--fast")) {
|
|
145
|
-
depth = "fast";
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
// Backward compat: --deep-research maps to standard depth; --deep maps to deep mode
|
|
149
|
-
if (args.includes("--deep-research")) {
|
|
150
|
-
depth = "standard";
|
|
151
|
-
process.stderr.write(
|
|
152
|
-
"[greedysearch] --deep-research is deprecated; use --depth standard (now default)\n",
|
|
153
|
-
);
|
|
154
|
-
}
|
|
155
|
-
if (args.includes("--deep")) {
|
|
156
|
-
depth = "deep";
|
|
157
|
-
}
|
|
158
|
-
if (args.includes("--synthesize")) {
|
|
159
|
-
process.stderr.write(
|
|
160
|
-
"[greedysearch] --synthesize is deprecated; synthesis is now default for multi-engine\n",
|
|
161
|
-
);
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
const full = args.includes("--full");
|
|
165
|
-
const short = !full;
|
|
166
|
-
const fetchSource = args.includes("--fetch-top-source");
|
|
167
|
-
const inline = args.includes("--inline");
|
|
168
|
-
// Headless is the default — only disable if GREEDY_SEARCH_VISIBLE=1
|
|
169
|
-
if (process.env.GREEDY_SEARCH_VISIBLE !== "1")
|
|
170
|
-
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
171
|
-
const outIdx = args.indexOf("--out");
|
|
172
|
-
const outFile = outIdx === -1 ? null : args[outIdx + 1];
|
|
173
|
-
|
|
174
|
-
// Locale handling: CLI flag > env var > config file > default (en)
|
|
175
|
-
const localeIdx = args.indexOf("--locale");
|
|
176
|
-
const envLocale = process.env.GREEDY_SEARCH_LOCALE;
|
|
177
|
-
const userConfig = loadUserConfig();
|
|
178
|
-
let locale = "en"; // Default to English
|
|
179
|
-
|
|
180
|
-
if (localeIdx !== -1 && args[localeIdx + 1]) {
|
|
181
|
-
locale = args[localeIdx + 1];
|
|
182
|
-
} else if (envLocale) {
|
|
183
|
-
locale = envLocale;
|
|
184
|
-
} else if (userConfig.locale) {
|
|
185
|
-
locale = userConfig.locale;
|
|
186
|
-
}
|
|
187
|
-
const rest = args.filter(
|
|
188
|
-
(a, i) =>
|
|
189
|
-
a !== "--full" &&
|
|
190
|
-
a !== "--short" &&
|
|
191
|
-
a !== "--fast" &&
|
|
192
|
-
a !== "--fetch-top-source" &&
|
|
193
|
-
a !== "--synthesize" &&
|
|
194
|
-
a !== "--deep-research" &&
|
|
195
|
-
a !== "--deep" &&
|
|
196
|
-
a !== "--inline" &&
|
|
197
|
-
a !== "--stdin" &&
|
|
198
|
-
a !== "--headless" &&
|
|
199
|
-
a !== "--visible" &&
|
|
200
|
-
a !== "--always-visible" &&
|
|
201
|
-
a !== "--depth" &&
|
|
202
|
-
a !== "--out" &&
|
|
203
|
-
a !== "--help" &&
|
|
204
|
-
(depthIdx === -1 || i !== depthIdx + 1) &&
|
|
205
|
-
(outIdx === -1 || i !== outIdx + 1),
|
|
206
|
-
);
|
|
207
|
-
const engine = rest[0]?.toLowerCase();
|
|
208
|
-
// Read query from stdin when --stdin flag is set (avoids leaking query in process table)
|
|
209
|
-
const useStdin = args.includes("--stdin");
|
|
210
|
-
let query;
|
|
211
|
-
if (useStdin) {
|
|
212
|
-
query = await readStdin();
|
|
213
|
-
} else {
|
|
214
|
-
query = rest.slice(1).join(" ");
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
if (engine === "all") {
|
|
218
|
-
await cdp(["list"]); // refresh pages cache
|
|
219
|
-
|
|
220
|
-
// Create fresh tabs for each engine in parallel, seeded directly to the
|
|
221
|
-
// engine homepage so extractors can skip the initial navigation.
|
|
222
|
-
const ENGINE_START_URLS = {
|
|
223
|
-
perplexity: "https://www.perplexity.ai/",
|
|
224
|
-
bing: "https://copilot.microsoft.com/",
|
|
225
|
-
google: "https://www.google.com/",
|
|
226
|
-
};
|
|
227
|
-
const engineTabs = await Promise.all(
|
|
228
|
-
ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
|
|
229
|
-
);
|
|
230
|
-
// Refresh cache so the new tabs are discoverable by cdp.mjs
|
|
231
|
-
await cdp(["list"]);
|
|
232
|
-
|
|
233
|
-
// Time-bounded per-engine extraction so slow engines don't stall the batch.
|
|
234
|
-
// Fast mode: 30s per engine (matches engineTimeoutMs below).
|
|
235
|
-
// Standard/deep: 55s per engine (matches engineTimeoutMs below).
|
|
236
|
-
const engineTimeoutMs = depth === "fast" ? 30000 : 55000;
|
|
237
|
-
|
|
238
|
-
try {
|
|
239
|
-
const results = await Promise.allSettled(
|
|
240
|
-
ALL_ENGINES.map((e, i) =>
|
|
241
|
-
runExtractor(
|
|
242
|
-
ENGINES[e],
|
|
243
|
-
query,
|
|
244
|
-
engineTabs[i],
|
|
245
|
-
short,
|
|
246
|
-
engineTimeoutMs,
|
|
247
|
-
locale,
|
|
248
|
-
)
|
|
249
|
-
.then((r) => {
|
|
250
|
-
process.stderr.write(`PROGRESS:${e}:done\n`);
|
|
251
|
-
return { engine: e, ...r };
|
|
252
|
-
})
|
|
253
|
-
.catch((err) => {
|
|
254
|
-
process.stderr.write(`PROGRESS:${e}:error\n`);
|
|
255
|
-
throw err;
|
|
256
|
-
}),
|
|
257
|
-
),
|
|
258
|
-
);
|
|
259
|
-
|
|
260
|
-
const out = {};
|
|
261
|
-
for (let i = 0; i < results.length; i++) {
|
|
262
|
-
const r = results[i];
|
|
263
|
-
if (r.status === "fulfilled") {
|
|
264
|
-
out[r.value.engine] = r.value;
|
|
265
|
-
} else {
|
|
266
|
-
out[ALL_ENGINES[i]] = { error: r.reason?.message || "unknown error" };
|
|
267
|
-
}
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
// Cloudflare/verification recovery: if Perplexity or Bing were blocked
|
|
271
|
-
// in headless mode, retry in visible Chrome to establish cookies,
|
|
272
|
-
// then continue headless with the profile now carrying valid session state.
|
|
273
|
-
// Recovery is allowed even in fast mode because verification failure would
|
|
274
|
-
// otherwise produce no usable result.
|
|
275
|
-
const cfBlocked = findHeadlessBlockedEngines(out);
|
|
276
|
-
|
|
277
|
-
if (cfBlocked.length > 0 && process.env.GREEDY_SEARCH_VISIBLE !== "1") {
|
|
278
|
-
process.stderr.write(
|
|
279
|
-
`[greedysearch] 🔓 Cloudflare/verification blocked ${cfBlocked.join(", ")} in headless — retrying visible to establish cookies...\n`,
|
|
280
|
-
);
|
|
281
|
-
for (const blockedEngine of cfBlocked) {
|
|
282
|
-
process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
|
|
283
|
-
}
|
|
284
|
-
// Close headless tabs, kill headless Chrome
|
|
285
|
-
await closeTabs(engineTabs);
|
|
286
|
-
await killHeadlessChrome();
|
|
287
|
-
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
288
|
-
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
289
|
-
await ensureChrome();
|
|
290
|
-
await cdp(["list"]);
|
|
291
|
-
|
|
292
|
-
// Retry blocked engines in visible Chrome
|
|
293
|
-
const retryTabs = [];
|
|
294
|
-
let keepVisibleForHuman = false;
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
process.stderr.write(
|
|
328
|
-
`[greedysearch]
|
|
329
|
-
);
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
);
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
ws
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// search.mjs - unified CLI for GreedySearch extractors
|
|
4
|
+
//
|
|
5
|
+
// Usage:
|
|
6
|
+
// node search.mjs <engine> "<query>"
|
|
7
|
+
// node search.mjs all "<query>"
|
|
8
|
+
//
|
|
9
|
+
// Engines:
|
|
10
|
+
// perplexity | pplx | p
|
|
11
|
+
// bing | copilot | b
|
|
12
|
+
// google | g
|
|
13
|
+
// gemini | gem
|
|
14
|
+
// all - fan-out to all engines in parallel
|
|
15
|
+
//
|
|
16
|
+
// Output: JSON to stdout, errors to stderr
|
|
17
|
+
//
|
|
18
|
+
// Examples:
|
|
19
|
+
// node search.mjs p "what is memoization"
|
|
20
|
+
// node search.mjs gem "latest React features"
|
|
21
|
+
// node search.mjs all "how does TCP congestion control work"
|
|
22
|
+
|
|
23
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
24
|
+
// Config file for user defaults
|
|
25
|
+
import { homedir } from "node:os";
|
|
26
|
+
import { join } from "node:path";
|
|
27
|
+
import {
|
|
28
|
+
activateTab,
|
|
29
|
+
cdp,
|
|
30
|
+
closeTab,
|
|
31
|
+
closeTabs,
|
|
32
|
+
ensureChrome,
|
|
33
|
+
killHeadlessChrome,
|
|
34
|
+
openNewTab,
|
|
35
|
+
touchActivity,
|
|
36
|
+
} from "../src/search/chrome.mjs";
|
|
37
|
+
import { ALL_ENGINES, ENGINES } from "../src/search/constants.mjs";
|
|
38
|
+
import { runExtractor } from "../src/search/engines.mjs";
|
|
39
|
+
import {
|
|
40
|
+
fetchMultipleSources,
|
|
41
|
+
fetchTopSource,
|
|
42
|
+
} from "../src/search/fetch-source.mjs";
|
|
43
|
+
import { writeOutput } from "../src/search/output.mjs";
|
|
44
|
+
import {
|
|
45
|
+
findHeadlessBlockedEngines,
|
|
46
|
+
isHeadlessBlockedError,
|
|
47
|
+
isManualVerificationError,
|
|
48
|
+
} from "../src/search/recovery.mjs";
|
|
49
|
+
import {
|
|
50
|
+
buildSourceRegistry,
|
|
51
|
+
mergeFetchDataIntoSources,
|
|
52
|
+
} from "../src/search/sources.mjs";
|
|
53
|
+
import { buildConfidence } from "../src/search/synthesis.mjs";
|
|
54
|
+
import { synthesizeWithGemini } from "../src/search/synthesis-runner.mjs";
|
|
55
|
+
|
|
56
|
+
const CONFIG_DIR = join(homedir(), ".config", "greedysearch");
|
|
57
|
+
const CONFIG_FILE = join(CONFIG_DIR, "config.json");
|
|
58
|
+
|
|
59
|
+
function loadUserConfig() {
|
|
60
|
+
try {
|
|
61
|
+
if (existsSync(CONFIG_FILE)) {
|
|
62
|
+
return JSON.parse(readFileSync(CONFIG_FILE, "utf8"));
|
|
63
|
+
}
|
|
64
|
+
} catch {
|
|
65
|
+
// Ignore errors
|
|
66
|
+
}
|
|
67
|
+
return {};
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
|
|
71
|
+
async function readStdin() {
|
|
72
|
+
return new Promise((resolve) => {
|
|
73
|
+
let data = "";
|
|
74
|
+
process.stdin.setEncoding("utf8");
|
|
75
|
+
process.stdin.on("data", (chunk) => (data += chunk));
|
|
76
|
+
process.stdin.on("end", () => resolve(data.trim()));
|
|
77
|
+
if (process.stdin.isTTY) resolve("");
|
|
78
|
+
});
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// ─── Main ──────────────────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
async function main() {
|
|
84
|
+
const args = process.argv.slice(2);
|
|
85
|
+
if (args.length < 2 || args[0] === "--help") {
|
|
86
|
+
process.stderr.write(
|
|
87
|
+
`${[
|
|
88
|
+
'Usage: node search.mjs <engine> "<query>"',
|
|
89
|
+
"",
|
|
90
|
+
"Engines: perplexity (p), bing (b), google (g), gemini (gem), all",
|
|
91
|
+
"",
|
|
92
|
+
"Flags:",
|
|
93
|
+
" --fast Quick mode: no source fetching or synthesis",
|
|
94
|
+
" --synthesize Deprecated: synthesis is now default for multi-engine",
|
|
95
|
+
" --deep-research Deprecated: source fetching is now default",
|
|
96
|
+
" --fetch-top-source Fetch content from top source",
|
|
97
|
+
" --inline Output JSON to stdout (for piping)",
|
|
98
|
+
" --locale <lang> Force results language (en, de, fr, etc.)",
|
|
99
|
+
" --visible Always use visible Chrome for this search",
|
|
100
|
+
" --always-visible Alias for --visible",
|
|
101
|
+
" --stdin Read query from stdin (avoids command-line leakage)",
|
|
102
|
+
"",
|
|
103
|
+
"Environment:",
|
|
104
|
+
" GREEDY_SEARCH_VISIBLE Set to 1 to show Chrome window (disables headless)",
|
|
105
|
+
" GREEDY_SEARCH_ALWAYS_VISIBLE Set to 1 to force visible mode for all runs",
|
|
106
|
+
" GREEDY_SEARCH_LOCALE Default locale (default: en)",
|
|
107
|
+
"",
|
|
108
|
+
"Examples:",
|
|
109
|
+
' node search.mjs all "Node.js streams" # Default: sources + synthesis',
|
|
110
|
+
' node search.mjs all "quick check" --fast # Fast: no sources/synthesis',
|
|
111
|
+
' node search.mjs p "what is memoization" # Single engine: fast mode',
|
|
112
|
+
].join("\n")}\n`,
|
|
113
|
+
);
|
|
114
|
+
process.exit(1);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
const alwaysVisible =
|
|
118
|
+
args.includes("--visible") ||
|
|
119
|
+
args.includes("--always-visible") ||
|
|
120
|
+
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE === "1";
|
|
121
|
+
if (alwaysVisible) {
|
|
122
|
+
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
123
|
+
process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
|
|
124
|
+
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
await ensureChrome();
|
|
128
|
+
|
|
129
|
+
// Track activity for headless idle timeout
|
|
130
|
+
touchActivity();
|
|
131
|
+
|
|
132
|
+
// Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
|
|
133
|
+
const depthIdx = args.indexOf("--depth");
|
|
134
|
+
let depth = "standard"; // DEFAULT: synthesis + source fetch
|
|
135
|
+
|
|
136
|
+
if (depthIdx !== -1 && args[depthIdx + 1]) {
|
|
137
|
+
depth = args[depthIdx + 1];
|
|
138
|
+
} else if (args.includes("--fast")) {
|
|
139
|
+
depth = "fast"; // Explicit fast mode requested
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// For single engine (not "all"), default to fast unless explicit
|
|
143
|
+
const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
|
|
144
|
+
if (engineArg !== "all" && depthIdx === -1 && !args.includes("--fast")) {
|
|
145
|
+
depth = "fast";
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Backward compat: --deep-research maps to standard depth; --deep maps to deep mode
|
|
149
|
+
if (args.includes("--deep-research")) {
|
|
150
|
+
depth = "standard";
|
|
151
|
+
process.stderr.write(
|
|
152
|
+
"[greedysearch] --deep-research is deprecated; use --depth standard (now default)\n",
|
|
153
|
+
);
|
|
154
|
+
}
|
|
155
|
+
if (args.includes("--deep")) {
|
|
156
|
+
depth = "deep";
|
|
157
|
+
}
|
|
158
|
+
if (args.includes("--synthesize")) {
|
|
159
|
+
process.stderr.write(
|
|
160
|
+
"[greedysearch] --synthesize is deprecated; synthesis is now default for multi-engine\n",
|
|
161
|
+
);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
const full = args.includes("--full");
|
|
165
|
+
const short = !full;
|
|
166
|
+
const fetchSource = args.includes("--fetch-top-source");
|
|
167
|
+
const inline = args.includes("--inline");
|
|
168
|
+
// Headless is the default — only disable if GREEDY_SEARCH_VISIBLE=1
|
|
169
|
+
if (process.env.GREEDY_SEARCH_VISIBLE !== "1")
|
|
170
|
+
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
171
|
+
const outIdx = args.indexOf("--out");
|
|
172
|
+
const outFile = outIdx === -1 ? null : args[outIdx + 1];
|
|
173
|
+
|
|
174
|
+
// Locale handling: CLI flag > env var > config file > default (en)
|
|
175
|
+
const localeIdx = args.indexOf("--locale");
|
|
176
|
+
const envLocale = process.env.GREEDY_SEARCH_LOCALE;
|
|
177
|
+
const userConfig = loadUserConfig();
|
|
178
|
+
let locale = "en"; // Default to English
|
|
179
|
+
|
|
180
|
+
if (localeIdx !== -1 && args[localeIdx + 1]) {
|
|
181
|
+
locale = args[localeIdx + 1];
|
|
182
|
+
} else if (envLocale) {
|
|
183
|
+
locale = envLocale;
|
|
184
|
+
} else if (userConfig.locale) {
|
|
185
|
+
locale = userConfig.locale;
|
|
186
|
+
}
|
|
187
|
+
const rest = args.filter(
|
|
188
|
+
(a, i) =>
|
|
189
|
+
a !== "--full" &&
|
|
190
|
+
a !== "--short" &&
|
|
191
|
+
a !== "--fast" &&
|
|
192
|
+
a !== "--fetch-top-source" &&
|
|
193
|
+
a !== "--synthesize" &&
|
|
194
|
+
a !== "--deep-research" &&
|
|
195
|
+
a !== "--deep" &&
|
|
196
|
+
a !== "--inline" &&
|
|
197
|
+
a !== "--stdin" &&
|
|
198
|
+
a !== "--headless" &&
|
|
199
|
+
a !== "--visible" &&
|
|
200
|
+
a !== "--always-visible" &&
|
|
201
|
+
a !== "--depth" &&
|
|
202
|
+
a !== "--out" &&
|
|
203
|
+
a !== "--help" &&
|
|
204
|
+
(depthIdx === -1 || i !== depthIdx + 1) &&
|
|
205
|
+
(outIdx === -1 || i !== outIdx + 1),
|
|
206
|
+
);
|
|
207
|
+
const engine = rest[0]?.toLowerCase();
|
|
208
|
+
// Read query from stdin when --stdin flag is set (avoids leaking query in process table)
|
|
209
|
+
const useStdin = args.includes("--stdin");
|
|
210
|
+
let query;
|
|
211
|
+
if (useStdin) {
|
|
212
|
+
query = await readStdin();
|
|
213
|
+
} else {
|
|
214
|
+
query = rest.slice(1).join(" ");
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if (engine === "all") {
|
|
218
|
+
await cdp(["list"]); // refresh pages cache
|
|
219
|
+
|
|
220
|
+
// Create fresh tabs for each engine in parallel, seeded directly to the
|
|
221
|
+
// engine homepage so extractors can skip the initial navigation.
|
|
222
|
+
const ENGINE_START_URLS = {
|
|
223
|
+
perplexity: "https://www.perplexity.ai/",
|
|
224
|
+
bing: "https://copilot.microsoft.com/",
|
|
225
|
+
google: "https://www.google.com/",
|
|
226
|
+
};
|
|
227
|
+
const engineTabs = await Promise.all(
|
|
228
|
+
ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
|
|
229
|
+
);
|
|
230
|
+
// Refresh cache so the new tabs are discoverable by cdp.mjs
|
|
231
|
+
await cdp(["list"]);
|
|
232
|
+
|
|
233
|
+
// Time-bounded per-engine extraction so slow engines don't stall the batch.
|
|
234
|
+
// Fast mode: 30s per engine (matches engineTimeoutMs below).
|
|
235
|
+
// Standard/deep: 55s per engine (matches engineTimeoutMs below).
|
|
236
|
+
const engineTimeoutMs = depth === "fast" ? 30000 : 55000;
|
|
237
|
+
|
|
238
|
+
try {
|
|
239
|
+
const results = await Promise.allSettled(
|
|
240
|
+
ALL_ENGINES.map((e, i) =>
|
|
241
|
+
runExtractor(
|
|
242
|
+
ENGINES[e],
|
|
243
|
+
query,
|
|
244
|
+
engineTabs[i],
|
|
245
|
+
short,
|
|
246
|
+
engineTimeoutMs,
|
|
247
|
+
locale,
|
|
248
|
+
)
|
|
249
|
+
.then((r) => {
|
|
250
|
+
process.stderr.write(`PROGRESS:${e}:done\n`);
|
|
251
|
+
return { engine: e, ...r };
|
|
252
|
+
})
|
|
253
|
+
.catch((err) => {
|
|
254
|
+
process.stderr.write(`PROGRESS:${e}:error\n`);
|
|
255
|
+
throw err;
|
|
256
|
+
}),
|
|
257
|
+
),
|
|
258
|
+
);
|
|
259
|
+
|
|
260
|
+
const out = {};
|
|
261
|
+
for (let i = 0; i < results.length; i++) {
|
|
262
|
+
const r = results[i];
|
|
263
|
+
if (r.status === "fulfilled") {
|
|
264
|
+
out[r.value.engine] = r.value;
|
|
265
|
+
} else {
|
|
266
|
+
out[ALL_ENGINES[i]] = { error: r.reason?.message || "unknown error" };
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// Cloudflare/verification recovery: if Perplexity or Bing were blocked
|
|
271
|
+
// in headless mode, retry in visible Chrome to establish cookies,
|
|
272
|
+
// then continue headless with the profile now carrying valid session state.
|
|
273
|
+
// Recovery is allowed even in fast mode because verification failure would
|
|
274
|
+
// otherwise produce no usable result.
|
|
275
|
+
const cfBlocked = findHeadlessBlockedEngines(out);
|
|
276
|
+
|
|
277
|
+
if (cfBlocked.length > 0 && process.env.GREEDY_SEARCH_VISIBLE !== "1") {
|
|
278
|
+
process.stderr.write(
|
|
279
|
+
`[greedysearch] 🔓 Cloudflare/verification blocked ${cfBlocked.join(", ")} in headless — retrying visible to establish cookies...\n`,
|
|
280
|
+
);
|
|
281
|
+
for (const blockedEngine of cfBlocked) {
|
|
282
|
+
process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
|
|
283
|
+
}
|
|
284
|
+
// Close headless tabs, kill headless Chrome
|
|
285
|
+
await closeTabs(engineTabs);
|
|
286
|
+
await killHeadlessChrome();
|
|
287
|
+
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
288
|
+
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
289
|
+
await ensureChrome();
|
|
290
|
+
await cdp(["list"]);
|
|
291
|
+
|
|
292
|
+
// Retry blocked engines in visible Chrome
|
|
293
|
+
const retryTabs = [];
|
|
294
|
+
let keepVisibleForHuman = false;
|
|
295
|
+
let recovered = 0;
|
|
296
|
+
for (let i = 0; i < cfBlocked.length; i++) {
|
|
297
|
+
const tab = await openNewTab();
|
|
298
|
+
retryTabs.push(tab);
|
|
299
|
+
}
|
|
300
|
+
try {
|
|
301
|
+
// First visible retry: navigate to the engine page.
|
|
302
|
+
// Cloudflare/Turnstile may resolve and redirect, disrupting the CDP session
|
|
303
|
+
// ("Inspected target navigated or closed"). If so, the cookies are now cached
|
|
304
|
+
// and a second retry on the same tab should succeed.
|
|
305
|
+
const retries = await Promise.allSettled(
|
|
306
|
+
cfBlocked.map((e, i) =>
|
|
307
|
+
runExtractor(ENGINES[e], query, retryTabs[i], short, null, locale)
|
|
308
|
+
.then((r) => ({ engine: e, ...r }))
|
|
309
|
+
.catch((err) => ({ engine: e, error: err.message })),
|
|
310
|
+
),
|
|
311
|
+
);
|
|
312
|
+
const stillBlocked = [];
|
|
313
|
+
const manualVerification = [];
|
|
314
|
+
for (const r of retries) {
|
|
315
|
+
if (r.status === "fulfilled" && !r.value.error) {
|
|
316
|
+
out[r.value.engine] = r.value;
|
|
317
|
+
recovered++;
|
|
318
|
+
} else if (r.status === "fulfilled") {
|
|
319
|
+
out[r.value.engine] = r.value;
|
|
320
|
+
stillBlocked.push(r.value.engine);
|
|
321
|
+
if (isManualVerificationError(r.value.error)) {
|
|
322
|
+
manualVerification.push(r.value.engine);
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
if (recovered > 0) {
|
|
327
|
+
process.stderr.write(
|
|
328
|
+
`[greedysearch] ✅ ${recovered}/${cfBlocked.length} engine(s) recovered — cookies cached for future headless runs.\n`,
|
|
329
|
+
);
|
|
330
|
+
} else {
|
|
331
|
+
process.stderr.write(
|
|
332
|
+
`[greedysearch] ⚠️ Recovery attempt failed — ${cfBlocked.join(", ")} still blocked in visible mode.\n`,
|
|
333
|
+
);
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
// Second retry for still-blocked engines: the first retry may have resolved
|
|
337
|
+
// Cloudflare/Turnstile (navigating through the challenge), so cookies are now
|
|
338
|
+
// cached and the page should load without the blocking challenge.
|
|
339
|
+
if (stillBlocked.length > 0) {
|
|
340
|
+
process.stderr.write(
|
|
341
|
+
`[greedysearch] Second visible retry for ${stillBlocked.join(", ")} — Turnstile may have resolved on first attempt...\n`,
|
|
342
|
+
);
|
|
343
|
+
const secondRetries = await Promise.allSettled(
|
|
344
|
+
stillBlocked.map((e) => {
|
|
345
|
+
const idx = cfBlocked.indexOf(e);
|
|
346
|
+
return runExtractor(
|
|
347
|
+
ENGINES[e],
|
|
348
|
+
query,
|
|
349
|
+
retryTabs[idx],
|
|
350
|
+
short,
|
|
351
|
+
null,
|
|
352
|
+
locale,
|
|
353
|
+
)
|
|
354
|
+
.then((r) => ({ engine: e, ...r }))
|
|
355
|
+
.catch((err) => ({ engine: e, error: err.message }));
|
|
356
|
+
}),
|
|
357
|
+
);
|
|
358
|
+
const secondStillBlocked = [];
|
|
359
|
+
for (const r of secondRetries) {
|
|
360
|
+
if (r.status === "fulfilled" && !r.value.error) {
|
|
361
|
+
out[r.value.engine] = r.value;
|
|
362
|
+
recovered++;
|
|
363
|
+
process.stderr.write(
|
|
364
|
+
`[greedysearch] ✅ ${r.value.engine} recovered on second visible retry.\n`,
|
|
365
|
+
);
|
|
366
|
+
} else {
|
|
367
|
+
secondStillBlocked.push(r.value?.engine || "unknown");
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
stillBlocked.length = 0;
|
|
371
|
+
stillBlocked.push(...secondStillBlocked);
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
if (stillBlocked.length > 0) {
|
|
375
|
+
keepVisibleForHuman = true;
|
|
376
|
+
out._needsHumanVerification = {
|
|
377
|
+
engines: stillBlocked,
|
|
378
|
+
message:
|
|
379
|
+
"Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge in the visible window to store cookies. Cookies persist for future runs.",
|
|
380
|
+
};
|
|
381
|
+
process.stderr.write(
|
|
382
|
+
`[greedysearch] 🔓 ${stillBlocked.join(", ")} still blocked — keeping visible Chrome open. Solve the challenge in the window to store cookies, then rerun.\n`,
|
|
383
|
+
);
|
|
384
|
+
// Visible Chrome stays open so the user can interact with any
|
|
385
|
+
// Turnstile/Cloudflare challenge. Once solved, cookies are stored
|
|
386
|
+
// in the shared profile and future headless runs will reuse them.
|
|
387
|
+
}
|
|
388
|
+
} finally {
|
|
389
|
+
// Keep visible Chrome alive if engines were recovered (cookies now cached)
|
|
390
|
+
// or if the user needs to solve verification manually.
|
|
391
|
+
// Killing Chrome with taskkill /F would lose the cookie database writes.
|
|
392
|
+
if (!keepVisibleForHuman && recovered === 0) {
|
|
393
|
+
// Kill visible Chrome, relaunch headless for remaining pipeline
|
|
394
|
+
await closeTabs(retryTabs);
|
|
395
|
+
process.stderr.write(
|
|
396
|
+
"[greedysearch] Switching back to headless Chrome...\n",
|
|
397
|
+
);
|
|
398
|
+
await killHeadlessChrome();
|
|
399
|
+
delete process.env.GREEDY_SEARCH_VISIBLE;
|
|
400
|
+
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
401
|
+
await ensureChrome();
|
|
402
|
+
await cdp(["list"]);
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// Minimize visible Chrome if it was kept alive (recovery succeeded or needs-human)
|
|
407
|
+
if (keepVisibleForHuman || recovered > 0) {
|
|
408
|
+
minimizeChrome().catch(() => {});
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
// Clear engineTabs — finally{} closeTabs handles empty arrays gracefully
|
|
412
|
+
engineTabs.length = 0;
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
// Build a canonical source registry across all engines
|
|
416
|
+
out._sources = buildSourceRegistry(out, query);
|
|
417
|
+
|
|
418
|
+
// Source fetching: default for all "all" searches
|
|
419
|
+
if (depth !== "fast" && out._sources.length > 0) {
|
|
420
|
+
process.stderr.write("PROGRESS:source-fetch:start\n");
|
|
421
|
+
const fetchedSources = await fetchMultipleSources(
|
|
422
|
+
out._sources,
|
|
423
|
+
5,
|
|
424
|
+
8000,
|
|
425
|
+
);
|
|
426
|
+
|
|
427
|
+
out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
|
|
428
|
+
out._fetchedSources = fetchedSources;
|
|
429
|
+
process.stderr.write("PROGRESS:source-fetch:done\n");
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
// Synthesize with Gemini for all non-fast modes
|
|
433
|
+
if (depth !== "fast") {
|
|
434
|
+
process.stderr.write("PROGRESS:synthesis:start\n");
|
|
435
|
+
process.stderr.write(
|
|
436
|
+
"[greedysearch] Synthesizing results with Gemini...\n",
|
|
437
|
+
);
|
|
438
|
+
try {
|
|
439
|
+
const geminiTab = await openNewTab();
|
|
440
|
+
await activateTab(geminiTab);
|
|
441
|
+
const synthesis = await synthesizeWithGemini(query, out, {
|
|
442
|
+
grounded: depth === "deep",
|
|
443
|
+
tabPrefix: geminiTab,
|
|
444
|
+
});
|
|
445
|
+
out._synthesis = {
|
|
446
|
+
...synthesis,
|
|
447
|
+
synthesized: true,
|
|
448
|
+
};
|
|
449
|
+
await closeTab(geminiTab);
|
|
450
|
+
process.stderr.write("PROGRESS:synthesis:done\n");
|
|
451
|
+
} catch (e) {
|
|
452
|
+
process.stderr.write(
|
|
453
|
+
`[greedysearch] Synthesis failed: ${e.message}\n`,
|
|
454
|
+
);
|
|
455
|
+
out._synthesis = { error: e.message, synthesized: false };
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
if (fetchSource) {
|
|
460
|
+
const top = pickTopSource(out);
|
|
461
|
+
if (top)
|
|
462
|
+
out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
// Always include confidence metrics for non-fast searches
|
|
466
|
+
if (depth !== "fast") out._confidence = buildConfidence(out);
|
|
467
|
+
|
|
468
|
+
writeOutput(out, outFile, {
|
|
469
|
+
inline,
|
|
470
|
+
synthesize: depth !== "fast",
|
|
471
|
+
query,
|
|
472
|
+
});
|
|
473
|
+
return;
|
|
474
|
+
} finally {
|
|
475
|
+
await closeTabs(engineTabs);
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
// Single engine
|
|
480
|
+
const script = ENGINES[engine];
|
|
481
|
+
if (!script) {
|
|
482
|
+
process.stderr.write(
|
|
483
|
+
`Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(", ")}\n`,
|
|
484
|
+
);
|
|
485
|
+
process.exit(1);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
try {
|
|
489
|
+
const result = await runExtractor(script, query, null, short, null, locale);
|
|
490
|
+
if (fetchSource && result.sources?.length > 0) {
|
|
491
|
+
result.topSource = await fetchTopSource(result.sources[0].url);
|
|
492
|
+
}
|
|
493
|
+
writeOutput(result, outFile, { inline, synthesize: false, query });
|
|
494
|
+
} catch (e) {
|
|
495
|
+
const recoveryEngine = script.includes("bing")
|
|
496
|
+
? "bing"
|
|
497
|
+
: script.includes("perplexity")
|
|
498
|
+
? "perplexity"
|
|
499
|
+
: null;
|
|
500
|
+
const canRetryVisible =
|
|
501
|
+
recoveryEngine &&
|
|
502
|
+
process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
|
|
503
|
+
isHeadlessBlockedError(e.message);
|
|
504
|
+
|
|
505
|
+
if (canRetryVisible) {
|
|
506
|
+
process.stderr.write(
|
|
507
|
+
`[greedysearch] 🔓 ${recoveryEngine} blocked in headless — retrying visible to establish cookies...\n`,
|
|
508
|
+
);
|
|
509
|
+
await killHeadlessChrome();
|
|
510
|
+
process.env.GREEDY_SEARCH_VISIBLE = "1";
|
|
511
|
+
delete process.env.GREEDY_SEARCH_HEADLESS;
|
|
512
|
+
await ensureChrome();
|
|
513
|
+
await cdp(["list"]);
|
|
514
|
+
|
|
515
|
+
const retryTab = await openNewTab();
|
|
516
|
+
let keepVisibleForHuman = false;
|
|
517
|
+
try {
|
|
518
|
+
const result = await runExtractor(
|
|
519
|
+
script,
|
|
520
|
+
query,
|
|
521
|
+
retryTab,
|
|
522
|
+
short,
|
|
523
|
+
null,
|
|
524
|
+
locale,
|
|
525
|
+
);
|
|
526
|
+
if (fetchSource && result.sources?.length > 0) {
|
|
527
|
+
result.topSource = await fetchTopSource(result.sources[0].url);
|
|
528
|
+
}
|
|
529
|
+
writeOutput(result, outFile, { inline, synthesize: false, query });
|
|
530
|
+
return;
|
|
531
|
+
} catch (retryErr) {
|
|
532
|
+
// Any visible retry failure: keep Chrome open so user can solve Turnstile.
|
|
533
|
+
// Once solved, cookies are stored in the shared profile for future headless runs.
|
|
534
|
+
keepVisibleForHuman = true;
|
|
535
|
+
writeOutput(
|
|
536
|
+
{
|
|
537
|
+
query,
|
|
538
|
+
error: retryErr.message,
|
|
539
|
+
_needsHumanVerification: {
|
|
540
|
+
engines: [recoveryEngine],
|
|
541
|
+
message:
|
|
542
|
+
"Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge to store cookies. Cookies persist for future runs.",
|
|
543
|
+
},
|
|
544
|
+
},
|
|
545
|
+
outFile,
|
|
546
|
+
{ inline, synthesize: false, query },
|
|
547
|
+
);
|
|
548
|
+
return;
|
|
549
|
+
} finally {
|
|
550
|
+
if (!keepVisibleForHuman) {
|
|
551
|
+
await closeTab(retryTab);
|
|
552
|
+
await killHeadlessChrome();
|
|
553
|
+
delete process.env.GREEDY_SEARCH_VISIBLE;
|
|
554
|
+
process.env.GREEDY_SEARCH_HEADLESS = "1";
|
|
555
|
+
} else {
|
|
556
|
+
// Minimize the visible window so it's out of the way
|
|
557
|
+
minimizeChrome().catch(() => {});
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
process.stderr.write(`Error: ${e.message}\n`);
|
|
563
|
+
process.exit(1);
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
/**
 * Choose the single source to treat as the "top" one for a combined result.
 *
 * The canonical registry (`out._sources`) wins whenever it is a non-empty
 * array. Otherwise the per-engine results are consulted in a fixed order
 * (perplexity, then google, then bing) and the first source of the first
 * engine that reports any sources is used.
 *
 * @param {object} out - Combined result object keyed by engine name.
 * @returns {*} The chosen source entry, or `null` when nothing has sources.
 */
function pickTopSource(out) {
  const registry = out._sources;
  if (Array.isArray(registry) && registry.length > 0) {
    return registry[0];
  }

  // Fallback order is significant: earlier engines take precedence.
  const fallbackOrder = ["perplexity", "google", "bing"];
  const firstWithSources = fallbackOrder
    .map((name) => out[name])
    .find((result) => result?.sources?.length > 0);

  return firstWithSources ? firstWithSources.sources[0] : null;
}
|
|
576
|
+
|
|
577
|
+
/**
 * Minimize the visible Chrome window via the DevTools protocol.
 *
 * Strictly best-effort: any failure (Chrome not reachable, malformed
 * response, protocol error, no global `WebSocket`) is swallowed so that a
 * cosmetic step can never fail or crash a search run.
 *
 * Does nothing when `GREEDY_SEARCH_HEADLESS` is "1" (no window to minimize).
 *
 * The returned promise resolves once the DevTools socket has been opened and
 * wired up; the minimize commands themselves complete in the background and
 * the socket is closed as soon as they finish (or after 3 s at the latest).
 *
 * @returns {Promise<void>}
 */
async function minimizeChrome() {
  if (process.env.GREEDY_SEARCH_HEADLESS === "1") return;

  try {
    const http = await import("node:http");

    // Ask Chrome for its browser-level WebSocket endpoint.
    const version = await new Promise((resolve, reject) => {
      const req = http.get(
        `http://localhost:9222/json/version`,
        { timeout: 3000 },
        (res) => {
          let body = "";
          // Decode as UTF-8 up front so multi-byte characters split across
          // chunks are not corrupted by per-chunk string conversion.
          res.setEncoding("utf8");
          res.on("data", (chunk) => {
            body += chunk;
          });
          res.on("error", reject);
          res.on("end", () => {
            // Parsing happens inside an event callback, so a throw here would
            // escape the surrounding try/catch; route it through reject.
            try {
              resolve(JSON.parse(body));
            } catch (err) {
              reject(err);
            }
          });
        },
      );
      // "timeout" only signals idleness; the request must be torn down by hand.
      req.on("timeout", () => {
        req.destroy(new Error("Timed out waiting for /json/version"));
      });
      req.on("error", reject);
    });

    const wsUrl = version.webSocketDebuggerUrl;
    const WebSocket = globalThis.WebSocket;
    if (!WebSocket) return;

    const ws = new WebSocket(wsUrl);
    const pending = new Map();
    let requestId = 0;

    // Hard upper bound on how long the socket may stay open.
    const fallbackTimer = setTimeout(() => ws.close(), 3000);

    // Send one protocol command and settle with its result or error.
    const call = (method, params = {}) =>
      new Promise((resolve, reject) => {
        const id = ++requestId;
        pending.set(id, { resolve, reject });
        ws.send(JSON.stringify({ id, method, params }));
      });

    ws.onmessage = (event) => {
      let msg;
      try {
        msg = JSON.parse(event.data);
      } catch {
        return; // Ignore frames that are not valid JSON.
      }
      const entry = pending.get(msg?.id);
      if (!entry) return;
      pending.delete(msg.id);
      if (msg.error) entry.reject(msg.error);
      else entry.resolve(msg.result);
    };

    ws.onerror = () => ws.close();

    ws.onclose = () => {
      // Release the timer so it does not keep the process alive, and unblock
      // any command still waiting for a reply.
      clearTimeout(fallbackTimer);
      for (const { reject } of pending.values()) {
        reject(new Error("DevTools socket closed"));
      }
      pending.clear();
    };

    ws.onopen = async () => {
      try {
        const targets = await call("Target.getTargets");
        const pageTarget = (targets?.targetInfos ?? []).find(
          (t) => t.type === "page",
        );
        if (!pageTarget) return;

        const { windowId } = await call("Browser.getWindowForTarget", {
          targetId: pageTarget.targetId,
        });
        await call("Browser.setWindowBounds", {
          windowId,
          bounds: { windowState: "minimized" },
        });
      } catch {
        // Best-effort
      } finally {
        ws.close();
      }
    };
  } catch {
    // Best-effort
  }
}
|
|
662
|
+
|
|
663
|
+
// Entry point. Whatever the outcome of main(), run the end-of-run housekeeping.
main().finally(() => {
  // Record activity so the headless idle timeout is measured from this run.
  touchActivity();
  // Fire-and-forget: minimizing the window is cosmetic and must never block
  // or fail the process.
  void minimizeChrome().catch(() => {});
});
|