@apmantza/greedysearch-pi 1.9.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/search.mjs CHANGED
@@ -1,674 +1,886 @@
1
- #!/usr/bin/env node
2
-
3
- // search.mjs - unified CLI for GreedySearch extractors
4
- //
5
- // Usage:
6
- // node search.mjs <engine> "<query>"
7
- // node search.mjs all "<query>"
8
- //
9
- // Engines:
10
- // perplexity | pplx | p
11
- // bing | copilot | b
12
- // google | g
13
- // gemini | gem
14
- // all - fan-out to all engines in parallel
15
- //
16
- // Output: JSON to stdout, errors to stderr
17
- //
18
- // Examples:
19
- // node search.mjs p "what is memoization"
20
- // node search.mjs gem "latest React features"
21
- // node search.mjs all "how does TCP congestion control work"
22
-
23
- import { existsSync, readFileSync } from "node:fs";
24
- // Config file for user defaults
25
- import { homedir } from "node:os";
26
- import { join } from "node:path";
27
- import {
28
- cdp,
29
- closeTab,
30
- closeTabs,
31
- ensureChrome,
32
- killHeadlessChrome,
33
- openNewTab,
34
- touchActivity,
35
- } from "../src/search/chrome.mjs";
36
- import { ALL_ENGINES, ENGINES } from "../src/search/constants.mjs";
37
- import { runExtractor } from "../src/search/engines.mjs";
38
- import {
39
- fetchMultipleSources,
40
- fetchTopSource,
41
- } from "../src/search/fetch-source.mjs";
42
- import { writeSourcesToFiles } from "../src/search/file-sources.mjs";
43
- import { writeOutput } from "../src/search/output.mjs";
44
- import {
45
- findHeadlessBlockedEngines,
46
- isHeadlessBlockedResult,
47
- isManualVerificationError,
48
- } from "../src/search/recovery.mjs";
49
- import {
50
- buildSourceRegistry,
51
- mergeFetchDataIntoSources,
52
- } from "../src/search/sources.mjs";
53
- import { buildConfidence } from "../src/search/synthesis.mjs";
54
- import { synthesizeWithGemini } from "../src/search/synthesis-runner.mjs";
55
- import { normalizeQuery } from "../src/search/query.mjs";
56
-
57
- const CONFIG_DIR = join(homedir(), ".config", "greedysearch");
58
- const CONFIG_FILE = join(CONFIG_DIR, "config.json");
59
-
60
- function loadUserConfig() {
61
- try {
62
- if (existsSync(CONFIG_FILE)) {
63
- return JSON.parse(readFileSync(CONFIG_FILE, "utf8"));
64
- }
65
- } catch {
66
- // Ignore errors
67
- }
68
- return {};
69
- }
70
-
71
- /** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
72
- async function readStdin() {
73
- return new Promise((resolve) => {
74
- let data = "";
75
- process.stdin.setEncoding("utf8");
76
- process.stdin.on("data", (chunk) => (data += chunk));
77
- process.stdin.on("end", () => resolve(data.trim()));
78
- if (process.stdin.isTTY) resolve("");
79
- });
80
- }
81
-
82
- // ─── Main ──────────────────────────────────────────────────────────────────
83
-
84
- async function main() {
85
- const args = process.argv.slice(2);
86
- if (args.length < 2 || args[0] === "--help") {
87
- process.stderr.write(
88
- `${[
89
- 'Usage: node search.mjs <engine> "<query>"',
90
- "",
91
- "Engines: perplexity (p), bing (b), google (g), gemini (gem), all",
92
- "",
93
- "Flags:",
94
- " --fast Quick mode: no source fetching or synthesis",
95
- " --synthesize Deprecated: synthesis is now default for multi-engine",
96
- " --deep-research Deprecated: source fetching is now default",
97
- " --fetch-top-source Fetch content from top source",
98
- " --inline Output JSON to stdout (for piping)",
99
- " --locale <lang> Force results language (en, de, fr, etc.)",
100
- " --visible Always use visible Chrome for this search",
101
- " --always-visible Alias for --visible",
102
- " --stdin Read query from stdin (avoids command-line leakage)",
103
- "",
104
- "Environment:",
105
- " GREEDY_SEARCH_VISIBLE Set to 1 to show Chrome window (disables headless)",
106
- " GREEDY_SEARCH_ALWAYS_VISIBLE Set to 1 to force visible mode for all runs",
107
- " GREEDY_SEARCH_LOCALE Default locale (default: en)",
108
- "",
109
- "Examples:",
110
- ' node search.mjs all "Node.js streams" # Default: sources + synthesis',
111
- ' node search.mjs all "quick check" --fast # Fast: no sources/synthesis',
112
- ' node search.mjs p "what is memoization" # Single engine: fast mode',
113
- ].join("\n")}\n`,
114
- );
115
- process.exit(1);
116
- }
117
-
118
- const alwaysVisible =
119
- args.includes("--visible") ||
120
- args.includes("--always-visible") ||
121
- process.env.GREEDY_SEARCH_ALWAYS_VISIBLE === "1";
122
- if (alwaysVisible) {
123
- process.env.GREEDY_SEARCH_VISIBLE = "1";
124
- process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
125
- delete process.env.GREEDY_SEARCH_HEADLESS;
126
- }
127
-
128
- await ensureChrome();
129
-
130
- // Track activity for headless idle timeout
131
- touchActivity();
132
-
133
- // Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
134
- const depthIdx = args.indexOf("--depth");
135
- let depth = "standard"; // DEFAULT: synthesis + source fetch
136
-
137
- if (depthIdx !== -1 && args[depthIdx + 1]) {
138
- depth = args[depthIdx + 1];
139
- } else if (args.includes("--fast")) {
140
- depth = "fast"; // Explicit fast mode requested
141
- }
142
-
143
- // For single engine (not "all"), default to fast unless explicit
144
- const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
145
- if (engineArg !== "all" && depthIdx === -1 && !args.includes("--fast")) {
146
- depth = "fast";
147
- }
148
-
149
- // --deep-research / --deep flags map to deep mode (backward compat)
150
- if (args.includes("--deep-research")) {
151
- depth = "standard";
152
- process.stderr.write(
153
- "[greedysearch] --deep-research is deprecated; use --depth standard (now default)\n",
154
- );
155
- }
156
- if (args.includes("--deep")) {
157
- depth = "deep";
158
- }
159
- if (args.includes("--synthesize")) {
160
- process.stderr.write(
161
- "[greedysearch] --synthesize is deprecated; synthesis is now default for multi-engine\n",
162
- );
163
- }
164
-
165
- const full = args.includes("--full");
166
- const short = !full;
167
- const fetchSource = args.includes("--fetch-top-source");
168
- const inline = args.includes("--inline");
169
- // Headless is the default — only disable if GREEDY_SEARCH_VISIBLE=1
170
- if (process.env.GREEDY_SEARCH_VISIBLE !== "1")
171
- process.env.GREEDY_SEARCH_HEADLESS = "1";
172
- const outIdx = args.indexOf("--out");
173
- const outFile = outIdx === -1 ? null : args[outIdx + 1];
174
-
175
- // Locale handling: CLI flag > env var > config file > default (en)
176
- const localeIdx = args.indexOf("--locale");
177
- const envLocale = process.env.GREEDY_SEARCH_LOCALE;
178
- const userConfig = loadUserConfig();
179
- let locale = "en"; // Default to English
180
-
181
- if (localeIdx !== -1 && args[localeIdx + 1]) {
182
- locale = args[localeIdx + 1];
183
- } else if (envLocale) {
184
- locale = envLocale;
185
- } else if (userConfig.locale) {
186
- locale = userConfig.locale;
187
- }
188
- const rest = args.filter(
189
- (a, i) =>
190
- a !== "--full" &&
191
- a !== "--short" &&
192
- a !== "--fast" &&
193
- a !== "--fetch-top-source" &&
194
- a !== "--synthesize" &&
195
- a !== "--deep-research" &&
196
- a !== "--deep" &&
197
- a !== "--inline" &&
198
- a !== "--stdin" &&
199
- a !== "--headless" &&
200
- a !== "--visible" &&
201
- a !== "--always-visible" &&
202
- a !== "--depth" &&
203
- a !== "--out" &&
204
- a !== "--help" &&
205
- (depthIdx === -1 || i !== depthIdx + 1) &&
206
- (outIdx === -1 || i !== outIdx + 1),
207
- );
208
- const engine = rest[0]?.toLowerCase();
209
- // Read query from stdin when --stdin flag is set (avoids leaking query in process table)
210
- const useStdin = args.includes("--stdin");
211
- let query;
212
- if (useStdin) {
213
- query = await readStdin();
214
- } else {
215
- query = rest.slice(1).join(" ");
216
- }
217
-
218
- if (engine === "all") {
219
- await cdp(["list"]); // refresh pages cache
220
-
221
- // Create fresh tabs for each engine in parallel, seeded directly to the
222
- // engine homepage so extractors can skip the initial navigation.
223
- const ENGINE_START_URLS = {
224
- perplexity: "https://www.perplexity.ai/",
225
- bing: "https://copilot.microsoft.com/",
226
- google: "https://www.google.com/",
227
- };
228
- const engineTabs = await Promise.all(
229
- ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
230
- );
231
- // Refresh cache so the new tabs are discoverable by cdp.mjs
232
- await cdp(["list"]);
233
-
234
- // Time-bounded per-engine extraction so slow engines don't stall the batch.
235
- // Fast mode: 22s per engine (total budget ~25s incl overhead).
236
- // Standard/deep: 35s per engine (total budget ~40s incl overhead).
237
- const engineTimeoutMs = depth === "fast" ? 30000 : 55000;
238
-
239
- try {
240
- const results = await Promise.allSettled(
241
- ALL_ENGINES.map((e, i) =>
242
- runExtractor(
243
- ENGINES[e],
244
- normalizeQuery(query),
245
- engineTabs[i],
246
- short,
247
- engineTimeoutMs,
248
- locale,
249
- )
250
- .then((r) => {
251
- process.stderr.write(`PROGRESS:${e}:done\n`);
252
- return { engine: e, ...r };
253
- })
254
- .catch((err) => {
255
- process.stderr.write(`PROGRESS:${e}:error\n`);
256
- throw err;
257
- }),
258
- ),
259
- );
260
-
261
- const out = {};
262
- for (let i = 0; i < results.length; i++) {
263
- const r = results[i];
264
- if (r.status === "fulfilled") {
265
- out[r.value.engine] = r.value;
266
- } else {
267
- out[ALL_ENGINES[i]] = { error: r.reason?.message || "unknown error" };
268
- }
269
- }
270
-
271
- // Cloudflare/verification recovery: if Perplexity or Bing were blocked
272
- // in headless mode, retry in visible Chrome to establish cookies,
273
- // then continue headless with the profile now carrying valid session state.
274
- // Recovery is allowed even in fast mode because verification failure would
275
- // otherwise produce no usable result.
276
- const cfBlocked = findHeadlessBlockedEngines(out);
277
-
278
- if (cfBlocked.length > 0 && process.env.GREEDY_SEARCH_VISIBLE !== "1") {
279
- process.stderr.write(
280
- `[greedysearch] 🔓 Cloudflare/verification blocked ${cfBlocked.join(", ")} in headless — retrying visible to establish cookies...\n`,
281
- );
282
- for (const blockedEngine of cfBlocked) {
283
- process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
284
- }
285
- // Close headless tabs, kill headless Chrome
286
- await closeTabs(engineTabs);
287
- await killHeadlessChrome();
288
- process.env.GREEDY_SEARCH_VISIBLE = "1";
289
- delete process.env.GREEDY_SEARCH_HEADLESS;
290
- await ensureChrome();
291
- await cdp(["list"]);
292
-
293
- // Retry blocked engines in visible Chrome
294
- const retryTabs = [];
295
- let keepVisibleForHuman = false;
296
- let recovered = 0;
297
- for (let i = 0; i < cfBlocked.length; i++) {
298
- const tab = await openNewTab();
299
- retryTabs.push(tab);
300
- }
301
- try {
302
- // First visible retry: navigate to the engine page.
303
- // Cloudflare/Turnstile may resolve and redirect, disrupting the CDP session
304
- // ("Inspected target navigated or closed"). If so, the cookies are now cached
305
- // and a second retry on the same tab should succeed.
306
- const retries = await Promise.allSettled(
307
- cfBlocked.map((e, i) =>
308
- runExtractor(ENGINES[e], query, retryTabs[i], short, null, locale)
309
- .then((r) => ({ engine: e, ...r }))
310
- .catch((err) => ({ engine: e, error: err.message })),
311
- ),
312
- );
313
- const stillBlocked = [];
314
- const manualVerification = [];
315
- for (const r of retries) {
316
- if (r.status === "fulfilled" && !r.value.error) {
317
- out[r.value.engine] = r.value;
318
- recovered++;
319
- } else if (r.status === "fulfilled") {
320
- out[r.value.engine] = r.value;
321
- stillBlocked.push(r.value.engine);
322
- if (isManualVerificationError(r.value.error)) {
323
- manualVerification.push(r.value.engine);
324
- }
325
- }
326
- }
327
- if (recovered > 0) {
328
- process.stderr.write(
329
- `[greedysearch] ✅ ${recovered}/${cfBlocked.length} engine(s) recovered — cookies cached for future headless runs.\n`,
330
- );
331
- } else {
332
- process.stderr.write(
333
- `[greedysearch] ⚠️ Recovery attempt failed — ${cfBlocked.join(", ")} still blocked in visible mode.\n`,
334
- );
335
- }
336
-
337
- // Second retry for still-blocked engines: the first retry may have resolved
338
- // Cloudflare/Turnstile (navigating through the challenge), so cookies are now
339
- // cached and the page should load without the blocking challenge.
340
- if (stillBlocked.length > 0) {
341
- process.stderr.write(
342
- `[greedysearch] Second visible retry for ${stillBlocked.join(", ")} — Turnstile may have resolved on first attempt...\n`,
343
- );
344
- const secondRetries = await Promise.allSettled(
345
- stillBlocked.map((e) => {
346
- const idx = cfBlocked.indexOf(e);
347
- return runExtractor(
348
- ENGINES[e],
349
- query,
350
- retryTabs[idx],
351
- short,
352
- null,
353
- locale,
354
- )
355
- .then((r) => ({ engine: e, ...r }))
356
- .catch((err) => ({ engine: e, error: err.message }));
357
- }),
358
- );
359
- const secondStillBlocked = [];
360
- for (const r of secondRetries) {
361
- if (r.status === "fulfilled" && !r.value.error) {
362
- out[r.value.engine] = r.value;
363
- recovered++;
364
- process.stderr.write(
365
- `[greedysearch] ${r.value.engine} recovered on second visible retry.\n`,
366
- );
367
- } else {
368
- secondStillBlocked.push(r.value?.engine || "unknown");
369
- }
370
- }
371
- stillBlocked.length = 0;
372
- stillBlocked.push(...secondStillBlocked);
373
- }
374
-
375
- if (stillBlocked.length > 0) {
376
- keepVisibleForHuman = true;
377
- out._needsHumanVerification = {
378
- engines: stillBlocked,
379
- message:
380
- "Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge in the visible window to store cookies. Cookies persist for future runs.",
381
- };
382
- process.stderr.write(
383
- `[greedysearch] 🔓 ${stillBlocked.join(", ")} still blocked — keeping visible Chrome open. Solve the challenge in the window to store cookies, then rerun.\n`,
384
- );
385
- // Visible Chrome stays open so the user can interact with any
386
- // Turnstile/Cloudflare challenge. Once solved, cookies are stored
387
- // in the shared profile and future headless runs will reuse them.
388
- }
389
- } finally {
390
- if (keepVisibleForHuman) {
391
- // User must interact — keep visible Chrome open but out of the way
392
- minimizeChrome().catch(() => {});
393
- } else {
394
- // Switch back to headless for synthesis + source fetch.
395
- // killHeadlessChrome() sends Browser.close first so Chrome flushes
396
- // its cookie database before the force-kill — cookies are preserved.
397
- await closeTabs(retryTabs);
398
- process.stderr.write(
399
- "[greedysearch] Switching back to headless Chrome...\n",
400
- );
401
- await killHeadlessChrome();
402
- delete process.env.GREEDY_SEARCH_VISIBLE;
403
- process.env.GREEDY_SEARCH_HEADLESS = "1";
404
- await ensureChrome();
405
- await cdp(["list"]);
406
- }
407
- }
408
-
409
- // Clear engineTabs — finally{} closeTabs handles empty arrays gracefully
410
- engineTabs.length = 0;
411
- }
412
-
413
- // Build a canonical source registry across all engines
414
- out._sources = buildSourceRegistry(out, query);
415
-
416
- // Pre-navigate Gemini tab in parallel with source fetch so the page
417
- // is already loaded when synthesis starts — saves ~4s of nav time.
418
- let geminiTabPromise = null;
419
- if (depth !== "fast") {
420
- geminiTabPromise = openNewTab("https://gemini.google.com/app")
421
- .catch(() => null);
422
- }
423
-
424
- // Source fetching: default for all "all" searches
425
- // Fetch all sources in a single batch (concurrency = source count).
426
- if (depth !== "fast" && out._sources.length > 0) {
427
- process.stderr.write("PROGRESS:source-fetch:start\n");
428
- const fetchedSources = await fetchMultipleSources(
429
- out._sources,
430
- 5,
431
- 8000,
432
- );
433
-
434
- out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
435
- out._fetchedSources = writeSourcesToFiles(fetchedSources);
436
- process.stderr.write("PROGRESS:source-fetch:done\n");
437
- }
438
-
439
- // Synthesize with Gemini for all non-fast modes
440
- if (depth !== "fast") {
441
- process.stderr.write("PROGRESS:synthesis:start\n");
442
- process.stderr.write(
443
- "[greedysearch] Synthesizing results with Gemini...\n",
444
- );
445
- try {
446
- const geminiTab = await geminiTabPromise ?? await openNewTab();
447
- const synthesis = await synthesizeWithGemini(query, out, {
448
- grounded: depth === "deep",
449
- tabPrefix: geminiTab,
450
- });
451
- out._synthesis = {
452
- ...synthesis,
453
- synthesized: true,
454
- };
455
- await closeTab(geminiTab);
456
- process.stderr.write("PROGRESS:synthesis:done\n");
457
- } catch (e) {
458
- process.stderr.write(
459
- `[greedysearch] Synthesis failed: ${e.message}\n`,
460
- );
461
- out._synthesis = { error: e.message, synthesized: false };
462
- }
463
- }
464
-
465
- if (fetchSource) {
466
- const top = pickTopSource(out);
467
- if (top)
468
- out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
469
- }
470
-
471
- // Always include confidence metrics for non-fast searches
472
- if (depth !== "fast") out._confidence = buildConfidence(out);
473
-
474
- writeOutput(out, outFile, {
475
- inline,
476
- synthesize: depth !== "fast",
477
- query,
478
- });
479
- return;
480
- } finally {
481
- await closeTabs(engineTabs);
482
- }
483
- }
484
-
485
- // Single engine
486
- const script = ENGINES[engine];
487
- if (!script) {
488
- process.stderr.write(
489
- `Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(", ")}\n`,
490
- );
491
- process.exit(1);
492
- }
493
-
494
- try {
495
- const result = await runExtractor(script, normalizeQuery(query), null, short, null, locale);
496
- if (fetchSource && result.sources?.length > 0) {
497
- result.topSource = await fetchTopSource(result.sources[0].url);
498
- }
499
- writeOutput(result, outFile, { inline, synthesize: false, query });
500
- } catch (e) {
501
- const recoveryEngine = script.includes("bing")
502
- ? "bing"
503
- : script.includes("perplexity")
504
- ? "perplexity"
505
- : null;
506
- const canRetryVisible =
507
- recoveryEngine &&
508
- process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
509
- isHeadlessBlockedResult(e);
510
-
511
- if (canRetryVisible) {
512
- process.stderr.write(
513
- `[greedysearch] 🔓 ${recoveryEngine} blocked in headless — retrying visible to establish cookies...\n`,
514
- );
515
- await killHeadlessChrome();
516
- process.env.GREEDY_SEARCH_VISIBLE = "1";
517
- delete process.env.GREEDY_SEARCH_HEADLESS;
518
- await ensureChrome();
519
- await cdp(["list"]);
520
-
521
- const retryTab = await openNewTab();
522
- let keepVisibleForHuman = false;
523
- try {
524
- const result = await runExtractor(
525
- script,
526
- query,
527
- retryTab,
528
- short,
529
- null,
530
- locale,
531
- );
532
- if (fetchSource && result.sources?.length > 0) {
533
- result.topSource = await fetchTopSource(result.sources[0].url);
534
- }
535
- writeOutput(result, outFile, { inline, synthesize: false, query });
536
- return;
537
- } catch (retryErr) {
538
- // Any visible retry failure: keep Chrome open so user can solve Turnstile.
539
- // Once solved, cookies are stored in the shared profile for future headless runs.
540
- keepVisibleForHuman = true;
541
- writeOutput(
542
- {
543
- query,
544
- error: retryErr.message,
545
- _needsHumanVerification: {
546
- engines: [recoveryEngine],
547
- message:
548
- "Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge to store cookies. Cookies persist for future runs.",
549
- },
550
- },
551
- outFile,
552
- { inline, synthesize: false, query },
553
- );
554
- return;
555
- } finally {
556
- if (!keepVisibleForHuman) {
557
- await closeTab(retryTab);
558
- await killHeadlessChrome();
559
- delete process.env.GREEDY_SEARCH_VISIBLE;
560
- process.env.GREEDY_SEARCH_HEADLESS = "1";
561
- } else {
562
- // Minimize the visible window so it's out of the way
563
- minimizeChrome().catch(() => {});
564
- }
565
- }
566
- }
567
-
568
- process.stderr.write(`Error: ${e.message}\n`);
569
- process.exit(1);
570
- }
571
- }
572
-
573
- function pickTopSource(out) {
574
- if (Array.isArray(out._sources) && out._sources.length > 0)
575
- return out._sources[0];
576
- for (const engine of ["perplexity", "google", "bing"]) {
577
- const r = out[engine];
578
- if (r?.sources?.length > 0) return r.sources[0];
579
- }
580
- return null;
581
- }
582
-
583
- /**
584
- * Minimize Chrome window via CDP after search completes.
585
- * Called at the end of search to keep window minimized.
586
- * Skipped in headless mode (no window to minimize).
587
- */
588
- async function minimizeChrome() {
589
- // In headless mode (default), there's no window to minimize
590
- if (process.env.GREEDY_SEARCH_HEADLESS === "1") return;
591
-
592
- try {
593
- const http = await import("node:http");
594
- const version = await new Promise((resolve, reject) => {
595
- http
596
- .get(`http://localhost:9222/json/version`, (res) => {
597
- let body = "";
598
- res.on("data", (d) => (body += d));
599
- res.on("end", () => resolve(JSON.parse(body)));
600
- })
601
- .on("error", reject);
602
- });
603
-
604
- const wsUrl = version.webSocketDebuggerUrl;
605
- const WebSocket = globalThis.WebSocket;
606
- if (!WebSocket) return;
607
-
608
- const ws = new WebSocket(wsUrl);
609
- let requestId = 0;
610
- const pending = new Map();
611
-
612
- ws.onopen = () => {
613
- const id = ++requestId;
614
- pending.set(id, {
615
- resolve: (result) => {
616
- const targets = result.targetInfos || [];
617
- const pageTarget = targets.find((t) => t.type === "page");
618
- if (!pageTarget) {
619
- ws.close();
620
- return;
621
- }
622
-
623
- const winId = ++requestId;
624
- pending.set(winId, {
625
- resolve: (winResult) => {
626
- const windowId = winResult.windowId;
627
- const minId = ++requestId;
628
- pending.set(minId, { resolve: () => {}, reject: () => {} });
629
- ws.send(
630
- JSON.stringify({
631
- id: minId,
632
- method: "Browser.setWindowBounds",
633
- params: { windowId, bounds: { windowState: "minimized" } },
634
- }),
635
- );
636
- setTimeout(() => ws.close(), 500);
637
- },
638
- reject: () => ws.close(),
639
- });
640
- ws.send(
641
- JSON.stringify({
642
- id: winId,
643
- method: "Browser.getWindowForTarget",
644
- params: { targetId: pageTarget.targetId },
645
- }),
646
- );
647
- },
648
- reject: () => ws.close(),
649
- });
650
- ws.send(JSON.stringify({ id, method: "Target.getTargets", params: {} }));
651
- };
652
-
653
- ws.onmessage = (event) => {
654
- const msg = JSON.parse(event.data);
655
- if (msg.id && pending.has(msg.id)) {
656
- const { resolve, reject } = pending.get(msg.id);
657
- pending.delete(msg.id);
658
- if (msg.error) reject?.(msg.error);
659
- else resolve?.(msg.result);
660
- }
661
- };
662
-
663
- setTimeout(() => ws.close(), 3000);
664
- } catch {
665
- // Best-effort
666
- }
667
- }
668
-
669
- main().finally(async () => {
670
- // Touch activity timestamp for headless idle timeout
671
- touchActivity();
672
- // Ensure window is minimized after search completes (best-effort, non-blocking)
673
- minimizeChrome().catch(() => {});
674
- });
1
+ #!/usr/bin/env node
2
+
3
+ // search.mjs - unified CLI for GreedySearch extractors
4
+ //
5
+ // Usage:
6
+ // node search.mjs <engine> "<query>"
7
+ // node search.mjs all "<query>"
8
+ //
9
+ // Engines:
10
+ // perplexity | pplx | p
11
+ // bing | copilot | b
12
+ // google | g
13
+ // gemini | gem
14
+ // all - fan-out to all engines in parallel
15
+ //
16
+ // Output: JSON to stdout, errors to stderr
17
+ //
18
+ // Examples:
19
+ // node search.mjs p "what is memoization"
20
+ // node search.mjs gem "latest React features"
21
+ // node search.mjs all "how does TCP congestion control work"
22
+
23
+ import { appendFileSync, existsSync, readFileSync } from "node:fs";
24
+ // Config file for user defaults
25
+ import { homedir } from "node:os";
26
+ import { join } from "node:path";
27
+ import {
28
+ cdp,
29
+ closeTab,
30
+ closeTabs,
31
+ ensureChrome,
32
+ killHeadlessChrome,
33
+ openNewTab,
34
+ touchActivity,
35
+ } from "../src/search/chrome.mjs";
36
+ import {
37
+ ALL_ENGINES,
38
+ ENGINES,
39
+ SYNTHESIZER,
40
+ VISIBLE_RECOVERY_LOG,
41
+ } from "../src/search/constants.mjs";
42
+ import { runExtractor } from "../src/search/engines.mjs";
43
+ import {
44
+ fetchMultipleSources,
45
+ fetchTopSource,
46
+ } from "../src/search/fetch-source.mjs";
47
+ import { writeSourcesToFiles } from "../src/search/file-sources.mjs";
48
+ import { writeOutput } from "../src/search/output.mjs";
49
+ import {
50
+ findHeadlessBlockedEngines,
51
+ isHeadlessBlockedResult,
52
+ isManualVerificationError,
53
+ } from "../src/search/recovery.mjs";
54
+ import {
55
+ buildSourceRegistry,
56
+ mergeFetchDataIntoSources,
57
+ } from "../src/search/sources.mjs";
58
+ import { buildConfidence } from "../src/search/synthesis.mjs";
59
+ import {
60
+ getSynthesisStartUrl,
61
+ normalizeSynthesizer,
62
+ synthesizeResults,
63
+ } from "../src/search/synthesis-runner.mjs";
64
+ import { normalizeQuery } from "../src/search/query.mjs";
65
+ import { runResearchMode } from "../src/search/research.mjs";
66
+
67
+ const CONFIG_DIR = join(homedir(), ".config", "greedysearch");
68
+ const CONFIG_FILE = join(CONFIG_DIR, "config.json");
69
+
70
+ function loadUserConfig() {
71
+ try {
72
+ if (existsSync(CONFIG_FILE)) {
73
+ return JSON.parse(readFileSync(CONFIG_FILE, "utf8"));
74
+ }
75
+ } catch {
76
+ // Ignore errors
77
+ }
78
+ return {};
79
+ }
80
+
81
+ function logVisibleRecovery(event) {
82
+ try {
83
+ appendFileSync(
84
+ VISIBLE_RECOVERY_LOG,
85
+ `${JSON.stringify({ at: new Date().toISOString(), ...event })}\n`,
86
+ "utf8",
87
+ );
88
+ } catch {
89
+ // Best-effort diagnostics only. Never fail a search because logging failed.
90
+ }
91
+ }
92
+
93
+ /** Read query/prompt from stdin (used with --stdin to avoid command-line leakage) */
94
+ async function readStdin() {
95
+ return new Promise((resolve) => {
96
+ let data = "";
97
+ process.stdin.setEncoding("utf8");
98
+ process.stdin.on("data", (chunk) => (data += chunk));
99
+ process.stdin.on("end", () => resolve(data.trim()));
100
+ if (process.stdin.isTTY) resolve("");
101
+ });
102
+ }
103
+
104
+ // ─── Main ──────────────────────────────────────────────────────────────────
105
+
106
+ async function main() {
107
+ const args = process.argv.slice(2);
108
+ if (args.length < 2 || args[0] === "--help") {
109
+ process.stderr.write(
110
+ `${[
111
+ 'Usage: node search.mjs <engine> "<query>"',
112
+ "",
113
+ "Engines: all, perplexity (p), google (g), chatgpt (gpt), gemini (gem), semantic-scholar (s2), logically (log), bing (b)",
114
+ "",
115
+ "Flags:",
116
+ " --synthesize For engine=all: synthesize fetched sources",
117
+ " --synthesizer <engine> Synthesis engine (default from ~/.pi/greedyconfig)",
118
+ " --fast Legacy quick mode: no source fetching or synthesis",
119
+ " --depth <mode> Legacy: fast|standard|deep aliases, or research",
120
+ " --deep-research Deprecated alias for --research",
121
+ " --research Iterative query/learnings loop (alias: --depth research)",
122
+ " --breadth <n> Research mode query breadth, 1-5 (default: 3)",
123
+ " --iterations <n> Research mode rounds, 1-3 (default: 2)",
124
+ " --max-sources <n> Research mode fetched source cap, 3-12",
125
+ " --research-out-dir <dir> Write research bundle to a specific directory",
126
+ " --no-research-bundle Disable the default .pi/greedysearch-research bundle",
127
+ " --fetch-top-source Fetch content from top source",
128
+ " --inline Output JSON to stdout (for piping)",
129
+ " --locale <lang> Force results language (en, de, fr, etc.)",
130
+ " --visible Always use visible Chrome for this search",
131
+ " --always-visible Alias for --visible",
132
+ " --stdin Read query from stdin (avoids command-line leakage)",
133
+ "",
134
+ "Environment:",
135
+ " GREEDY_SEARCH_VISIBLE Set to 1 to show Chrome window (disables headless)",
136
+ " GREEDY_SEARCH_ALWAYS_VISIBLE Set to 1 to force visible mode for all runs",
137
+ " GREEDY_SEARCH_LOCALE Default locale (default: en)",
138
+ "",
139
+ "Examples:",
140
+ ' node search.mjs all "Node.js streams" # Grounded: engines + fetched sources',
141
+ ' node search.mjs all "Node.js streams" --synthesize # Add Gemini synthesis',
142
+ ' node search.mjs all "quick check" --fast # Legacy fast: no sources/synthesis',
143
+ ' node search.mjs all "browser automation" --research --breadth 3 --iterations 2',
144
+ ' node search.mjs p "what is memoization" # Single engine search',
145
+ ].join("\n")}\n`,
146
+ );
147
+ process.exit(1);
148
+ }
149
+
150
+ const alwaysVisible =
151
+ args.includes("--visible") ||
152
+ args.includes("--always-visible") ||
153
+ process.env.GREEDY_SEARCH_ALWAYS_VISIBLE === "1";
154
+ if (alwaysVisible) {
155
+ process.env.GREEDY_SEARCH_VISIBLE = "1";
156
+ process.env.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
157
+ delete process.env.GREEDY_SEARCH_HEADLESS;
158
+ } else if (process.env.GREEDY_SEARCH_VISIBLE !== "1") {
159
+ // Establish the desired mode BEFORE ensureChrome() so a stale visible
160
+ // recovery browser is switched back to headless before research planning
161
+ // and Gemini synthesis tabs are opened.
162
+ process.env.GREEDY_SEARCH_HEADLESS = "1";
163
+ }
164
+
165
+ await ensureChrome();
166
+
167
+ // Track activity for headless idle timeout
168
+ touchActivity();
169
+
170
+ const depthIdx = args.indexOf("--depth");
171
+ const legacyDepth =
172
+ depthIdx !== -1 && args[depthIdx + 1]
173
+ ? args[depthIdx + 1].toLowerCase()
174
+ : null;
175
+ const engineArg = args.find((a) => !a.startsWith("--"))?.toLowerCase();
176
+ const researchMode =
177
+ args.includes("--research") ||
178
+ args.includes("--deep-research") ||
179
+ legacyDepth === "research";
180
+ const legacyFast = args.includes("--fast") || legacyDepth === "fast";
181
+ const legacySynthesisDepth =
182
+ legacyDepth === "standard" ||
183
+ legacyDepth === "deep" ||
184
+ args.includes("--deep");
185
+ const shouldFetchSources = engineArg === "all" && !legacyFast;
186
+ const shouldSynthesize =
187
+ engineArg === "all" &&
188
+ !legacyFast &&
189
+ (args.includes("--synthesize") || legacySynthesisDepth);
190
+ const groundedSynthesis = legacyDepth === "deep" || args.includes("--deep");
191
+
192
+ if (args.includes("--deep-research")) {
193
+ process.stderr.write(
194
+ "[greedysearch] --deep-research is deprecated; use --research or --depth research\n",
195
+ );
196
+ }
197
+ if (legacySynthesisDepth) {
198
+ process.stderr.write(
199
+ "[greedysearch] depth fast|standard|deep is deprecated; use default grounded search plus --synthesize when needed\n",
200
+ );
201
+ }
202
+
203
+ const synthesizerIdx = args.indexOf("--synthesizer");
204
+ const synthesizer = normalizeSynthesizer(
205
+ synthesizerIdx === -1 ? SYNTHESIZER : args[synthesizerIdx + 1],
206
+ );
207
+
208
+ const full = args.includes("--full");
209
+ const short = !full;
210
+ const fetchSource = args.includes("--fetch-top-source");
211
+ const inline = args.includes("--inline");
212
+ const breadthIdx = args.indexOf("--breadth");
213
+ const iterationsIdx = args.indexOf("--iterations");
214
+ const maxSourcesIdx = args.indexOf("--max-sources");
215
+ const researchBreadth = breadthIdx === -1 ? undefined : args[breadthIdx + 1];
216
+ const researchIterations =
217
+ iterationsIdx === -1 ? undefined : args[iterationsIdx + 1];
218
+ const researchMaxSources =
219
+ maxSourcesIdx === -1 ? undefined : args[maxSourcesIdx + 1];
220
+ const researchOutDirIdx = args.indexOf("--research-out-dir");
221
+ const researchOutDir =
222
+ researchOutDirIdx === -1 ? undefined : args[researchOutDirIdx + 1];
223
+ const writeResearchBundle = !args.includes("--no-research-bundle");
224
+ const outIdx = args.indexOf("--out");
225
+ const outFile = outIdx === -1 ? null : args[outIdx + 1];
226
+
227
+ // Locale handling: CLI flag > env var > config file > default (en)
228
+ const localeIdx = args.indexOf("--locale");
229
+ const envLocale = process.env.GREEDY_SEARCH_LOCALE;
230
+ const userConfig = loadUserConfig();
231
+ let locale = "en"; // Default to English
232
+
233
+ if (localeIdx !== -1 && args[localeIdx + 1]) {
234
+ locale = args[localeIdx + 1];
235
+ } else if (envLocale) {
236
+ locale = envLocale;
237
+ } else if (userConfig.locale) {
238
+ locale = userConfig.locale;
239
+ }
240
+ const rest = args.filter(
241
+ (a, i) =>
242
+ a !== "--full" &&
243
+ a !== "--short" &&
244
+ a !== "--fast" &&
245
+ a !== "--fetch-top-source" &&
246
+ a !== "--synthesize" &&
247
+ a !== "--deep-research" &&
248
+ a !== "--deep" &&
249
+ a !== "--research" &&
250
+ a !== "--inline" &&
251
+ a !== "--stdin" &&
252
+ a !== "--headless" &&
253
+ a !== "--visible" &&
254
+ a !== "--always-visible" &&
255
+ a !== "--depth" &&
256
+ a !== "--synthesizer" &&
257
+ a !== "--out" &&
258
+ a !== "--locale" &&
259
+ a !== "--breadth" &&
260
+ a !== "--iterations" &&
261
+ a !== "--max-sources" &&
262
+ a !== "--research-out-dir" &&
263
+ a !== "--no-research-bundle" &&
264
+ a !== "--help" &&
265
+ (depthIdx === -1 || i !== depthIdx + 1) &&
266
+ (synthesizerIdx === -1 || i !== synthesizerIdx + 1) &&
267
+ (outIdx === -1 || i !== outIdx + 1) &&
268
+ (localeIdx === -1 || i !== localeIdx + 1) &&
269
+ (breadthIdx === -1 || i !== breadthIdx + 1) &&
270
+ (iterationsIdx === -1 || i !== iterationsIdx + 1) &&
271
+ (maxSourcesIdx === -1 || i !== maxSourcesIdx + 1) &&
272
+ (researchOutDirIdx === -1 || i !== researchOutDirIdx + 1),
273
+ );
274
+ const engine = rest[0]?.toLowerCase();
275
+ // Read query from stdin when --stdin flag is set (avoids leaking query in process table)
276
+ const useStdin = args.includes("--stdin");
277
+ let query;
278
+ if (useStdin) {
279
+ query = await readStdin();
280
+ } else {
281
+ query = rest.slice(1).join(" ");
282
+ }
283
+
284
+ if (researchMode) {
285
+ if (engine !== "all") {
286
+ process.stderr.write(
287
+ `[greedysearch] Research mode uses all engines; ignoring engine "${engine}".\n`,
288
+ );
289
+ }
290
+ const out = await runResearchMode({
291
+ query: normalizeQuery(query),
292
+ breadth: researchBreadth,
293
+ iterations: researchIterations,
294
+ maxSources: researchMaxSources,
295
+ locale,
296
+ short,
297
+ writeBundle: writeResearchBundle,
298
+ researchOutDir,
299
+ });
300
+ writeOutput(out, outFile, {
301
+ inline,
302
+ synthesize: true,
303
+ query,
304
+ });
305
+ return;
306
+ }
307
+
308
+ if (engine === "all") {
309
+ await cdp(["list"]); // refresh pages cache
310
+
311
+ // Create fresh tabs for each engine in parallel, seeded directly to the
312
+ // engine homepage so extractors can skip the initial navigation.
313
+ const ENGINE_START_URLS = {
314
+ perplexity: "https://www.perplexity.ai/",
315
+ google: "https://www.google.com/",
316
+ "semantic-scholar": "https://www.semanticscholar.org/",
317
+ semanticscholar: "https://www.semanticscholar.org/",
318
+ s2: "https://www.semanticscholar.org/",
319
+ logically: "https://logically.app/research-assistant/",
320
+ };
321
+ const engineTabs = await Promise.all(
322
+ ALL_ENGINES.map((e) => openNewTab(ENGINE_START_URLS[e])),
323
+ );
324
+ // Refresh cache so the new tabs are discoverable by cdp.mjs
325
+ await cdp(["list"]);
326
+
327
+ // Time-bounded per-engine extraction so slow engines don't stall the batch.
328
+ const engineTimeoutFor = (engineName) => {
329
+ if (!legacyFast) return 70000;
330
+ // ChatGPT needs ~25-30s solo; under CDP contention needs more headroom
331
+ return engineName === "chatgpt" ? 60000 : 35000;
332
+ };
333
+
334
+ try {
335
+ const results = await Promise.allSettled(
336
+ ALL_ENGINES.map((e, i) =>
337
+ runExtractor(
338
+ ENGINES[e],
339
+ normalizeQuery(query),
340
+ engineTabs[i],
341
+ short,
342
+ engineTimeoutFor(e),
343
+ locale,
344
+ )
345
+ .then((r) => {
346
+ process.stderr.write(`PROGRESS:${e}:done\n`);
347
+ return { engine: e, ...r };
348
+ })
349
+ .catch((err) => {
350
+ // Do not emit PROGRESS:error yet: Bing/Perplexity may recover in
351
+ // visible mode. Emit the final status after recovery has run.
352
+ throw err;
353
+ }),
354
+ ),
355
+ );
356
+
357
+ const out = {};
358
+ for (let i = 0; i < results.length; i++) {
359
+ const r = results[i];
360
+ if (r.status === "fulfilled") {
361
+ out[r.value.engine] = r.value;
362
+ } else {
363
+ const err = r.reason;
364
+ const msg = err?.message || "unknown error";
365
+ out[ALL_ENGINES[i]] = { error: msg };
366
+ if (err?.lastStage) {
367
+ process.stderr.write(
368
+ `[greedysearch] ${ALL_ENGINES[i]} failed at stage '${err.lastStage}': ${msg}\n`,
369
+ );
370
+ }
371
+ if (err?.partialErr) {
372
+ process.stderr.write(
373
+ `[greedysearch] ${ALL_ENGINES[i]} tail stderr:\n${err.partialErr}\n`,
374
+ );
375
+ }
376
+ }
377
+ }
378
+
379
+ // Cloudflare/verification recovery: if Perplexity or Bing were blocked
380
+ // in headless mode, retry in visible Chrome to establish cookies,
381
+ // then continue headless with the profile now carrying valid session state.
382
+ // Recovery is allowed even in fast mode because verification failure would
383
+ // otherwise produce no usable result.
384
+ const recoveryCandidates = findHeadlessBlockedEngines(out);
385
+
386
+ if (
387
+ recoveryCandidates.length > 0 &&
388
+ process.env.GREEDY_SEARCH_VISIBLE !== "1"
389
+ ) {
390
+ logVisibleRecovery({
391
+ scope: "all",
392
+ phase: "start",
393
+ engines: recoveryCandidates,
394
+ reasons: Object.fromEntries(
395
+ recoveryCandidates.map((engineName) => [
396
+ engineName,
397
+ {
398
+ error: out[engineName]?.error || null,
399
+ envelope: out[engineName]?._envelope || null,
400
+ },
401
+ ]),
402
+ ),
403
+ });
404
+ process.stderr.write(
405
+ `[greedysearch] 🔓 Headless ${recoveryCandidates.join(", ")} search hit timeout/verification/antibot signals — retrying visible to establish cookies...\n`,
406
+ );
407
+ for (const blockedEngine of recoveryCandidates) {
408
+ process.stderr.write(
409
+ `[greedysearch] ${blockedEngine} recovery starting in visible mode...\n`,
410
+ );
411
+ }
412
+ // Close headless tabs, kill headless Chrome
413
+ await closeTabs(engineTabs);
414
+ await killHeadlessChrome();
415
+ process.env.GREEDY_SEARCH_VISIBLE = "1";
416
+ delete process.env.GREEDY_SEARCH_HEADLESS;
417
+ await ensureChrome();
418
+ await cdp(["list"]);
419
+
420
+ // Retry blocked engines in visible Chrome
421
+ const retryTabs = [];
422
+ let keepVisibleForHuman = false;
423
+ let recovered = 0;
424
+ for (let i = 0; i < recoveryCandidates.length; i++) {
425
+ const tab = await openNewTab();
426
+ retryTabs.push(tab);
427
+ }
428
+ try {
429
+ // First visible retry: navigate to the engine page.
430
+ // Cloudflare/Turnstile may resolve and redirect, disrupting the CDP session
431
+ // ("Inspected target navigated or closed"). If so, the cookies are now cached
432
+ // and a second retry on the same tab should succeed.
433
+ const retries = await Promise.allSettled(
434
+ recoveryCandidates.map((e, i) =>
435
+ runExtractor(ENGINES[e], query, retryTabs[i], short, null, locale)
436
+ .then((r) => ({ engine: e, ...r }))
437
+ .catch((err) => ({ engine: e, error: err.message })),
438
+ ),
439
+ );
440
+ const stillBlocked = [];
441
+ const manualVerification = [];
442
+ for (const r of retries) {
443
+ if (r.status === "fulfilled" && !r.value.error) {
444
+ out[r.value.engine] = r.value;
445
+ recovered++;
446
+ process.stderr.write(`PROGRESS:${r.value.engine}:done\n`);
447
+ } else if (r.status === "fulfilled") {
448
+ out[r.value.engine] = r.value;
449
+ stillBlocked.push(r.value.engine);
450
+ if (isManualVerificationError(r.value.error)) {
451
+ manualVerification.push(r.value.engine);
452
+ }
453
+ }
454
+ }
455
+ if (recovered > 0) {
456
+ process.stderr.write(
457
+ `[greedysearch] ✅ ${recovered}/${recoveryCandidates.length} engine(s) recovered — cookies cached for future headless runs.\n`,
458
+ );
459
+ } else {
460
+ process.stderr.write(
461
+ `[greedysearch] ⚠️ Recovery attempt did not extract an answer — ${recoveryCandidates.join(", ")} may still need manual verification or a DOM fallback.\n`,
462
+ );
463
+ }
464
+
465
+ // Second retry for still-blocked engines: the first retry may have resolved
466
+ // Cloudflare/Turnstile (navigating through the challenge), so cookies are now
467
+ // cached and the page should load without the blocking challenge.
468
+ if (stillBlocked.length > 0) {
469
+ process.stderr.write(
470
+ `[greedysearch] Second visible retry for ${stillBlocked.join(", ")} — Turnstile may have resolved on first attempt...\n`,
471
+ );
472
+ const secondRetries = await Promise.allSettled(
473
+ stillBlocked.map((e) => {
474
+ const idx = recoveryCandidates.indexOf(e);
475
+ return runExtractor(
476
+ ENGINES[e],
477
+ query,
478
+ retryTabs[idx],
479
+ short,
480
+ null,
481
+ locale,
482
+ )
483
+ .then((r) => ({ engine: e, ...r }))
484
+ .catch((err) => ({ engine: e, error: err.message }));
485
+ }),
486
+ );
487
+ const secondStillBlocked = [];
488
+ for (const r of secondRetries) {
489
+ if (r.status === "fulfilled" && !r.value.error) {
490
+ out[r.value.engine] = r.value;
491
+ recovered++;
492
+ process.stderr.write(`PROGRESS:${r.value.engine}:done\n`);
493
+ process.stderr.write(
494
+ `[greedysearch] ✅ ${r.value.engine} recovered on second visible retry.\n`,
495
+ );
496
+ } else {
497
+ secondStillBlocked.push(r.value?.engine || "unknown");
498
+ }
499
+ }
500
+ stillBlocked.length = 0;
501
+ stillBlocked.push(...secondStillBlocked);
502
+ }
503
+
504
+ logVisibleRecovery({
505
+ scope: "all",
506
+ phase: stillBlocked.length > 0 ? "needs-human" : "success",
507
+ engines: recoveryCandidates,
508
+ results: Object.fromEntries(
509
+ recoveryCandidates.map((engineName) => [
510
+ engineName,
511
+ {
512
+ mode: out[engineName]?._envelope?.mode || null,
513
+ durationMs: out[engineName]?._envelope?.durationMs || null,
514
+ lastStage: out[engineName]?._envelope?.lastStage || null,
515
+ error: out[engineName]?.error || null,
516
+ },
517
+ ]),
518
+ ),
519
+ });
520
+
521
+ if (stillBlocked.length > 0) {
522
+ for (const blockedEngine of stillBlocked) {
523
+ process.stderr.write(`PROGRESS:${blockedEngine}:needs-human\n`);
524
+ }
525
+ keepVisibleForHuman = true;
526
+ out._needsHumanVerification = {
527
+ engines: stillBlocked,
528
+ message:
529
+ "Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge in the visible window to store cookies. Cookies persist for future runs.",
530
+ };
531
+ process.stderr.write(
532
+ `[greedysearch] 🔓 ${stillBlocked.join(", ")} still blocked — keeping visible Chrome open. Solve the challenge in the window to store cookies, then rerun.\n`,
533
+ );
534
+ // Visible Chrome stays open so the user can interact with any
535
+ // Turnstile/Cloudflare challenge. Once solved, cookies are stored
536
+ // in the shared profile and future headless runs will reuse them.
537
+ }
538
+ } finally {
539
+ if (keepVisibleForHuman) {
540
+ // User must interact — keep visible Chrome open but out of the way
541
+ minimizeChrome().catch(() => {});
542
+ } else {
543
+ // Switch back to headless for synthesis + source fetch.
544
+ // killHeadlessChrome() sends Browser.close first so Chrome flushes
545
+ // its cookie database before the force-kill — cookies are preserved.
546
+ await closeTabs(retryTabs);
547
+ process.stderr.write(
548
+ "[greedysearch] Switching back to headless Chrome...\n",
549
+ );
550
+ await killHeadlessChrome();
551
+ delete process.env.GREEDY_SEARCH_VISIBLE;
552
+ process.env.GREEDY_SEARCH_HEADLESS = "1";
553
+ await ensureChrome();
554
+ await cdp(["list"]);
555
+ }
556
+ }
557
+
558
+ // Clear engineTabs — finally{} closeTabs handles empty arrays gracefully
559
+ engineTabs.length = 0;
560
+ }
561
+
562
+ for (const engineName of ALL_ENGINES) {
563
+ if (!out[engineName]?.error) continue;
564
+ if (recoveryCandidates.includes(engineName)) {
565
+ if (process.env.GREEDY_SEARCH_VISIBLE === "1") {
566
+ process.stderr.write(
567
+ `PROGRESS:${engineName}:${isManualVerificationError(out[engineName].error) ? "needs-human" : "error"}\n`,
568
+ );
569
+ }
570
+ continue;
571
+ }
572
+ process.stderr.write(`PROGRESS:${engineName}:error\n`);
573
+ }
574
+
575
+ // Build a canonical source registry across all engines
576
+ out._sources = buildSourceRegistry(out, query);
577
+
578
+ // Source fetching: default for all "all" searches
579
+ // Fetch all sources in a single batch (concurrency = source count).
580
+ if (shouldFetchSources && out._sources.length > 0) {
581
+ process.stderr.write("PROGRESS:source-fetch:start\n");
582
+ const fetchedSources = await fetchMultipleSources(
583
+ out._sources,
584
+ 5,
585
+ 8000,
586
+ );
587
+
588
+ out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
589
+ out._fetchedSources = writeSourcesToFiles(fetchedSources);
590
+ process.stderr.write("PROGRESS:source-fetch:done\n");
591
+ }
592
+
593
+ // Optional engine-agnostic synthesis for multi-engine searches.
594
+ // Open the synthesizer tab HERE (after source fetch) instead of
595
+ // pre-opening before source fetch. Pre-opening was fragile: Chrome could
596
+ // be killed during visible recovery or idle-timeout between source fetch
597
+ // and synthesis, leaving a stale tab ID that causes "No target matching prefix".
598
+ if (shouldSynthesize) {
599
+ process.stderr.write("PROGRESS:synthesis:start\n");
600
+ process.stderr.write(
601
+ `[greedysearch] Synthesizing results with ${synthesizer}...\n`,
602
+ );
603
+ let synthesisTab = null;
604
+ try {
605
+ synthesisTab = await openNewTab(getSynthesisStartUrl(synthesizer));
606
+ const synthesis = await synthesizeResults(query, out, {
607
+ grounded: groundedSynthesis,
608
+ tabPrefix: synthesisTab,
609
+ visible: process.env.GREEDY_SEARCH_VISIBLE === "1",
610
+ synthesizer,
611
+ });
612
+ out._synthesis = {
613
+ ...synthesis,
614
+ synthesized: true,
615
+ };
616
+ process.stderr.write("PROGRESS:synthesis:done\n");
617
+ } catch (e) {
618
+ process.stderr.write(
619
+ `[greedysearch] Synthesis failed: ${e.message}\n`,
620
+ );
621
+ out._synthesis = {
622
+ error: e.message,
623
+ synthesized: false,
624
+ synthesizedBy: synthesizer,
625
+ };
626
+ } finally {
627
+ if (synthesisTab) await closeTab(synthesisTab);
628
+ }
629
+ }
630
+
631
+ if (fetchSource) {
632
+ const top = pickTopSource(out);
633
+ if (top)
634
+ out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
635
+ }
636
+
637
+ // Include confidence metrics for grounded multi-engine searches.
638
+ if (!legacyFast) out._confidence = buildConfidence(out);
639
+
640
+ writeOutput(out, outFile, {
641
+ inline,
642
+ synthesize: shouldSynthesize,
643
+ query,
644
+ });
645
+ return;
646
+ } finally {
647
+ await closeTabs(engineTabs);
648
+ }
649
+ }
650
+
651
+ // Single engine
652
+ const script = ENGINES[engine];
653
+ if (!script) {
654
+ process.stderr.write(
655
+ `Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(", ")}\n`,
656
+ );
657
+ process.exit(1);
658
+ }
659
+
660
+ try {
661
+ const result = await runExtractor(
662
+ script,
663
+ normalizeQuery(query),
664
+ null,
665
+ short,
666
+ null,
667
+ locale,
668
+ );
669
+ if (fetchSource && result.sources?.length > 0) {
670
+ result.topSource = await fetchTopSource(result.sources[0].url);
671
+ }
672
+ writeOutput(result, outFile, { inline, synthesize: false, query });
673
+ } catch (e) {
674
+ const recoveryEngine = script.includes("bing")
675
+ ? "bing"
676
+ : script.includes("perplexity")
677
+ ? "perplexity"
678
+ : script.includes("chatgpt")
679
+ ? "chatgpt"
680
+ : script.includes("semantic-scholar")
681
+ ? "semantic-scholar"
682
+ : script.includes("logically")
683
+ ? "logically"
684
+ : null;
685
+ const canRetryVisible =
686
+ recoveryEngine &&
687
+ process.env.GREEDY_SEARCH_VISIBLE !== "1" &&
688
+ isHeadlessBlockedResult(e);
689
+
690
+ if (canRetryVisible) {
691
+ logVisibleRecovery({
692
+ scope: "single",
693
+ phase: "start",
694
+ engines: [recoveryEngine],
695
+ reasons: {
696
+ [recoveryEngine]: {
697
+ error: e.message || null,
698
+ envelope: e.envelope || null,
699
+ lastStage: e.lastStage || null,
700
+ },
701
+ },
702
+ });
703
+ process.stderr.write(
704
+ `[greedysearch] 🔓 ${recoveryEngine} blocked in headless — retrying visible to establish cookies...\n`,
705
+ );
706
+ await killHeadlessChrome();
707
+ process.env.GREEDY_SEARCH_VISIBLE = "1";
708
+ delete process.env.GREEDY_SEARCH_HEADLESS;
709
+ await ensureChrome();
710
+ await cdp(["list"]);
711
+
712
+ const retryTab = await openNewTab();
713
+ let keepVisibleForHuman = false;
714
+ try {
715
+ const result = await runExtractor(
716
+ script,
717
+ query,
718
+ retryTab,
719
+ short,
720
+ null,
721
+ locale,
722
+ );
723
+ logVisibleRecovery({
724
+ scope: "single",
725
+ phase: "success",
726
+ engines: [recoveryEngine],
727
+ result: {
728
+ engine: recoveryEngine,
729
+ mode: result._envelope?.mode || null,
730
+ durationMs: result._envelope?.durationMs || null,
731
+ lastStage: result._envelope?.lastStage || null,
732
+ },
733
+ });
734
+ if (fetchSource && result.sources?.length > 0) {
735
+ result.topSource = await fetchTopSource(result.sources[0].url);
736
+ }
737
+ writeOutput(result, outFile, { inline, synthesize: false, query });
738
+ return;
739
+ } catch (retryErr) {
740
+ logVisibleRecovery({
741
+ scope: "single",
742
+ phase: "needs-human",
743
+ engines: [recoveryEngine],
744
+ result: {
745
+ engine: recoveryEngine,
746
+ error: retryErr.message || String(retryErr),
747
+ envelope: retryErr.envelope || null,
748
+ },
749
+ });
750
+ // Any visible retry failure: keep Chrome open so user can solve Turnstile.
751
+ // Once solved, cookies are stored in the shared profile for future headless runs.
752
+ keepVisibleForHuman = true;
753
+ writeOutput(
754
+ {
755
+ query,
756
+ error: retryErr.message,
757
+ _needsHumanVerification: {
758
+ engines: [recoveryEngine],
759
+ message:
760
+ "Visible Chrome is open with the engine page loaded. Solve the Turnstile checkbox or other challenge to store cookies. Cookies persist for future runs.",
761
+ },
762
+ },
763
+ outFile,
764
+ { inline, synthesize: false, query },
765
+ );
766
+ return;
767
+ } finally {
768
+ if (!keepVisibleForHuman) {
769
+ await closeTab(retryTab);
770
+ await killHeadlessChrome();
771
+ delete process.env.GREEDY_SEARCH_VISIBLE;
772
+ process.env.GREEDY_SEARCH_HEADLESS = "1";
773
+ } else {
774
+ // Minimize the visible window so it's out of the way
775
+ minimizeChrome().catch(() => {});
776
+ }
777
+ }
778
+ }
779
+
780
+ process.stderr.write(`Error: ${e.message}\n`);
781
+ process.exit(1);
782
+ }
783
+ }
784
+
785
/**
 * Pick the one source that should be fetched as the "top source" of a
 * multi-engine result object.
 *
 * The canonical registry stored on `out._sources` wins when it is a non-empty
 * array. Otherwise the per-engine results are consulted in a fixed order
 * (perplexity, google, bing) and the first source of the first engine that
 * reported any sources is used.
 *
 * @param {object} out - Result object keyed by engine name, optionally
 *   carrying a `_sources` array.
 * @returns {object|null} The chosen source entry, or `null` when nothing is
 *   available.
 */
function pickTopSource(out) {
  const registry = out._sources;
  if (Array.isArray(registry) && registry.length > 0) {
    return registry[0];
  }

  // No registry entries: fall back to the raw per-engine source lists.
  const fallbackOrder = ["perplexity", "google", "bing"];
  const firstWithSources = fallbackOrder
    .map((engineName) => out[engineName])
    .find((engineResult) => engineResult?.sources?.length > 0);

  return firstWithSources ? firstWithSources.sources[0] : null;
}
794
+
795
/**
 * Best-effort: minimize the Chrome window through the DevTools protocol.
 *
 * Does nothing when GREEDY_SEARCH_HEADLESS is "1" (no window exists). Otherwise
 * it reads the browser WebSocket URL from the DevTools HTTP endpoint on
 * localhost:9222, connects, looks up the window of the first "page" target and
 * asks the browser to set that window's state to "minimized".
 *
 * The returned promise settles once the WebSocket exchange has been started;
 * the exchange itself finishes in the background and is capped at 3 seconds.
 * Every failure (endpoint unreachable, malformed reply, no global WebSocket,
 * protocol error) is swallowed on purpose: minimizing is cosmetic and must
 * never fail a search.
 *
 * @returns {Promise<void>}
 */
async function minimizeChrome() {
  // In headless mode (default), there's no window to minimize
  if (process.env.GREEDY_SEARCH_HEADLESS === "1") return;

  try {
    const http = await import("node:http");
    const version = await new Promise((resolve, reject) => {
      const req = http.get(
        "http://localhost:9222/json/version",
        { timeout: 3000 },
        (res) => {
          let body = "";
          res.setEncoding("utf8");
          res.on("data", (chunk) => {
            body += chunk;
          });
          res.on("error", reject);
          res.on("end", () => {
            // Parse inside try/catch: a throw from this callback would escape
            // the promise and surface as an uncaught exception.
            try {
              resolve(JSON.parse(body));
            } catch (err) {
              reject(err);
            }
          });
        },
      );
      req.on("error", reject);
      // The `timeout` option only emits an event; the request must be torn
      // down explicitly so the promise rejects instead of hanging.
      req.on("timeout", () => {
        req.destroy(new Error("DevTools endpoint timed out"));
      });
    });

    const wsUrl = version?.webSocketDebuggerUrl;
    const WebSocket = globalThis.WebSocket;
    if (!wsUrl || !WebSocket) return;

    const ws = new WebSocket(wsUrl);
    const pending = new Map();
    let requestId = 0;

    // Hard cap on the whole exchange; cleared on close so the timer does not
    // keep the process alive after the work is done.
    const deadline = setTimeout(() => ws.close(), 3000);

    // Send one CDP command and resolve/reject with its matching response.
    const call = (method, params = {}) =>
      new Promise((resolve, reject) => {
        const id = ++requestId;
        pending.set(id, { resolve, reject });
        ws.send(JSON.stringify({ id, method, params }));
      });

    ws.onmessage = (event) => {
      let msg;
      try {
        msg = JSON.parse(event.data);
      } catch {
        return; // Ignore frames that are not JSON
      }
      const waiter = msg.id ? pending.get(msg.id) : undefined;
      if (!waiter) return;
      pending.delete(msg.id);
      if (msg.error) waiter.reject(msg.error);
      else waiter.resolve(msg.result);
    };

    ws.onerror = () => ws.close();
    ws.onclose = () => clearTimeout(deadline);

    ws.onopen = async () => {
      try {
        const targets = (await call("Target.getTargets"))?.targetInfos ?? [];
        const pageTarget = targets.find((t) => t.type === "page");
        if (!pageTarget) return;

        const { windowId } = await call("Browser.getWindowForTarget", {
          targetId: pageTarget.targetId,
        });
        await call("Browser.setWindowBounds", {
          windowId,
          bounds: { windowState: "minimized" },
        });
      } catch {
        // Best-effort
      } finally {
        ws.close();
      }
    };
  } catch {
    // Best-effort
  }
}
880
+
881
/**
 * Post-run housekeeping, executed whether main() fulfilled or rejected.
 */
const finalizeRun = async () => {
  // Touch activity timestamp for headless idle timeout
  touchActivity();
  // Minimize the window without waiting for it; failures are ignored because
  // minimizing is purely cosmetic.
  minimizeChrome().catch(() => {});
};

// Script entry point: run the CLI, then always perform the housekeeping.
main().finally(finalizeRun);