@apmantza/greedysearch-pi 1.7.0 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1504 +1,1550 @@
1
- #!/usr/bin/env node
2
- // search.mjs — unified CLI for GreedySearch extractors
3
- //
4
- // Usage:
5
- // node search.mjs <engine> "<query>"
6
- // node search.mjs all "<query>"
7
- //
8
- // Engines:
9
- // perplexity | pplx | p
10
- // bing | copilot | b
11
- // google | g
12
- // gemini | gem
13
- // all — fan-out to all engines in parallel
14
- //
15
- // Output: JSON to stdout, errors to stderr
16
- //
17
- // Examples:
18
- // node search.mjs p "what is memoization"
19
- // node search.mjs gem "latest React features"
20
- // node search.mjs all "how does TCP congestion control work"
21
-
22
- import { spawn } from "node:child_process";
23
- import {
24
- existsSync,
25
- mkdirSync,
26
- readFileSync,
27
- renameSync,
28
- unlinkSync,
29
- writeFileSync,
30
- } from "node:fs";
31
- import http from "node:http";
32
- import { tmpdir } from "node:os";
33
- import { dirname, join } from "node:path";
34
- import { fileURLToPath } from "node:url";
35
- import { fetchSourceHttp, shouldUseBrowser } from "./src/fetcher.mjs";
36
- import { fetchGitHubContent, parseGitHubUrl } from "./src/github.mjs";
37
- import { trimContentHeadTail } from "./src/utils/content.mjs";
38
-
39
- const __dir = dirname(fileURLToPath(import.meta.url));
40
- const CDP = join(__dir, "cdp.mjs");
41
- const PAGES_CACHE = `${tmpdir().replace(/\\/g, "/")}/cdp-pages.json`;
42
-
1
+ #!/usr/bin/env node
2
+ // search.mjs — unified CLI for GreedySearch extractors
3
+ //
4
+ // Usage:
5
+ // node search.mjs <engine> "<query>"
6
+ // node search.mjs all "<query>"
7
+ //
8
+ // Engines:
9
+ // perplexity | pplx | p
10
+ // bing | copilot | b
11
+ // google | g
12
+ // gemini | gem
13
+ // all — fan-out to all engines in parallel
14
+ //
15
+ // Output: JSON to stdout, errors to stderr
16
+ //
17
+ // Examples:
18
+ // node search.mjs p "what is memoization"
19
+ // node search.mjs gem "latest React features"
20
+ // node search.mjs all "how does TCP congestion control work"
21
+
22
+ import { spawn } from "node:child_process";
23
+ import {
24
+ existsSync,
25
+ mkdirSync,
26
+ readFileSync,
27
+ renameSync,
28
+ unlinkSync,
29
+ writeFileSync,
30
+ } from "node:fs";
31
+ import http from "node:http";
32
+ import { tmpdir } from "node:os";
33
+ import { dirname, join } from "node:path";
34
+ import { fileURLToPath } from "node:url";
35
+ import { fetchSourceHttp, shouldUseBrowser } from "../src/fetcher.mjs";
36
+ import { fetchGitHubContent, parseGitHubUrl } from "../src/github.mjs";
37
+ import { trimContentHeadTail } from "../src/utils/content.mjs";
38
+
39
+ const __dir = dirname(fileURLToPath(import.meta.url));
40
+ const CDP = join(__dir, "cdp.mjs");
41
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, "/")}/cdp-pages.json`;
42
+
43
43
  const GREEDY_PORT = 9222;
44
-
45
- const ENGINES = {
46
- perplexity: "perplexity.mjs",
47
- pplx: "perplexity.mjs",
48
- p: "perplexity.mjs",
49
- bing: "bing-copilot.mjs",
50
- copilot: "bing-copilot.mjs",
51
- b: "bing-copilot.mjs",
52
- google: "google-ai.mjs",
53
- g: "google-ai.mjs",
54
- gemini: "gemini.mjs",
55
- gem: "gemini.mjs",
56
- };
57
-
58
- const ALL_ENGINES = ["perplexity", "bing", "google"];
59
-
60
- const ENGINE_DOMAINS = {
61
- perplexity: "perplexity.ai",
62
- bing: "copilot.microsoft.com",
63
- google: "google.com",
64
- gemini: "gemini.google.com",
65
- };
66
-
67
- const TRACKING_PARAMS = [
68
- "fbclid",
69
- "gclid",
70
- "ref",
71
- "ref_src",
72
- "ref_url",
73
- "source",
74
- "utm_campaign",
75
- "utm_content",
76
- "utm_medium",
77
- "utm_source",
78
- "utm_term",
79
- ];
80
-
81
- const COMMUNITY_HOSTS = [
82
- "dev.to",
83
- "hashnode.com",
84
- "medium.com",
85
- "reddit.com",
86
- "stackoverflow.com",
87
- "stackexchange.com",
88
- "substack.com",
89
- ];
90
-
91
- const NEWS_HOSTS = [
92
- "arstechnica.com",
93
- "techcrunch.com",
94
- "theverge.com",
95
- "venturebeat.com",
96
- "wired.com",
97
- "zdnet.com",
98
- ];
99
-
100
- /**
101
- * Infer preferred domains based on query keywords
102
- * Returns domains that should be boosted for this query
103
- */
104
- function inferPreferredDomains(query) {
105
- const normalized = query.toLowerCase();
106
- const matches = [];
107
-
108
- if (
109
- normalized.includes("openai") ||
110
- normalized.includes("gpt") ||
111
- normalized.includes("chatgpt")
112
- ) {
113
- matches.push("openai.com", "platform.openai.com", "help.openai.com");
114
- }
115
- if (normalized.includes("anthropic") || normalized.includes("claude")) {
116
- matches.push("anthropic.com", "docs.anthropic.com");
117
- }
118
- if (normalized.includes("bun")) {
119
- matches.push("bun.sh", "bun.com");
120
- }
121
- if (normalized.includes("next.js") || normalized.includes("nextjs")) {
122
- matches.push("nextjs.org", "vercel.com");
123
- }
124
- if (normalized.includes("playwright")) {
125
- matches.push("playwright.dev");
126
- }
127
- if (normalized.includes("supabase")) {
128
- matches.push("supabase.com", "supabase.io");
129
- }
130
- if (normalized.includes("prisma")) {
131
- matches.push("prisma.io");
132
- }
133
- if (normalized.includes("tailwind")) {
134
- matches.push("tailwindcss.com");
135
- }
136
- if (normalized.includes("vite")) {
137
- matches.push("vitejs.dev", "vite.dev");
138
- }
139
- if (normalized.includes("astro")) {
140
- matches.push("astro.build");
141
- }
142
- if (normalized.includes("svelte")) {
143
- matches.push("svelte.dev");
144
- }
145
- if (normalized.includes("solid")) {
146
- matches.push("solidjs.com");
147
- }
148
- if (normalized.includes("vue") || normalized.includes("nuxt")) {
149
- matches.push("vuejs.org", "nuxt.com");
150
- }
151
- if (normalized.includes("react") || normalized.includes("react native")) {
152
- matches.push("react.dev", "reactnative.dev");
153
- }
154
- if (normalized.includes("angular")) {
155
- matches.push("angular.io", "angular.dev");
156
- }
157
- if (normalized.includes("node.js") || normalized.includes("nodejs")) {
158
- matches.push("nodejs.org", "nodejs.dev", "npmjs.com");
159
- }
160
- if (normalized.includes("deno")) {
161
- matches.push("deno.land", "deno.com");
162
- }
163
- if (normalized.includes("fresh")) {
164
- matches.push("fresh.deno.dev");
165
- }
166
- if (normalized.includes("typescript") || normalized.includes("ts")) {
167
- matches.push("typescriptlang.org");
168
- }
169
- if (normalized.includes("python")) {
170
- matches.push("python.org", "docs.python.org");
171
- }
172
- if (normalized.includes("rust")) {
173
- matches.push("rust-lang.org", "docs.rs", "crates.io");
174
- }
175
- if (normalized.includes("go") || normalized.includes("golang")) {
176
- matches.push("go.dev", "golang.org", "pkg.go.dev");
177
- }
178
- if (normalized.includes("zig")) {
179
- matches.push("ziglang.org");
180
- }
181
- if (normalized.includes("docker")) {
182
- matches.push("docker.com", "docs.docker.com", "hub.docker.com");
183
- }
184
- if (normalized.includes("kubernetes") || normalized.includes("k8s")) {
185
- matches.push("kubernetes.io", "k8s.io");
186
- }
187
- if (normalized.includes("postgres") || normalized.includes("postgresql")) {
188
- matches.push("postgresql.org", "neon.tech", "supabase.com");
189
- }
190
- if (normalized.includes("redis")) {
191
- matches.push("redis.io");
192
- }
193
- if (normalized.includes("sqlite")) {
194
- matches.push("sqlite.org");
195
- }
196
- if (normalized.includes("cloudflare")) {
197
- matches.push("developers.cloudflare.com", "cloudflare.com");
198
- }
199
- if (normalized.includes("vercel")) {
200
- matches.push("vercel.com", "nextjs.org");
201
- }
202
- if (normalized.includes("netlify")) {
203
- matches.push("netlify.com", "docs.netlify.com");
204
- }
205
- if (normalized.includes("stripe")) {
206
- matches.push("stripe.com", "docs.stripe.com");
207
- }
208
- if (normalized.includes("github")) {
209
- matches.push("github.com", "docs.github.com");
210
- }
211
- if (normalized.includes("gitlab")) {
212
- matches.push("gitlab.com", "docs.gitlab.com");
213
- }
214
- if (normalized.includes("aws")) {
215
- matches.push("aws.amazon.com", "docs.aws.amazon.com");
216
- }
217
- if (normalized.includes("azure")) {
218
- matches.push("azure.microsoft.com", "learn.microsoft.com");
219
- }
220
- if (normalized.includes("gcp") || normalized.includes("google cloud")) {
221
- matches.push("cloud.google.com", "developers.google.com");
222
- }
223
- if (normalized.includes("gemini") || normalized.includes("google ai")) {
224
- matches.push("ai.google.dev", "developers.google.com");
225
- }
226
-
227
- return [...new Set(matches)];
228
- }
229
-
230
- /**
231
- * Check if a domain matches a preferred domain (exact or subdomain)
232
- */
233
- function domainMatches(hostname, candidate) {
234
- return hostname === candidate || hostname.endsWith(`.${candidate}`);
235
- }
236
-
237
- function trimText(text = "", maxChars = 240) {
238
- const clean = String(text).replace(/\s+/g, " ").trim();
239
- if (clean.length <= maxChars) return clean;
240
- return `${clean.slice(0, maxChars).replace(/\s+\S*$/, "")}...`;
241
- }
242
-
243
- function normalizeSourceTitle(title = "") {
244
- const clean = trimText(title, 180);
245
- if (!clean) return "";
246
- if (/^https?:\/\//i.test(clean)) return "";
247
-
248
- const wordCount = clean.split(/\s+/).filter(Boolean).length;
249
- const hasUppercase = /[A-Z]/.test(clean);
250
- const hasDigit = /\d/.test(clean);
251
- const looksLikeFragment =
252
- clean === clean.toLowerCase() &&
253
- wordCount <= 4 &&
254
- !hasUppercase &&
255
- !hasDigit;
256
- return looksLikeFragment ? "" : clean;
257
- }
258
-
259
- function pickPreferredTitle(currentTitle = "", nextTitle = "") {
260
- const current = normalizeSourceTitle(currentTitle);
261
- const next = normalizeSourceTitle(nextTitle);
262
- if (!next) return current;
263
- if (!current) return next;
264
- const currentLooksLikeUrl = /^https?:\/\//i.test(current);
265
- const nextLooksLikeUrl = /^https?:\/\//i.test(next);
266
- if (currentLooksLikeUrl && !nextLooksLikeUrl) return next;
267
- if (!currentLooksLikeUrl && nextLooksLikeUrl) return current;
268
- return next.length > current.length ? next : current;
269
- }
270
-
271
- function normalizeUrl(rawUrl) {
272
- if (!rawUrl) return null;
273
- try {
274
- const url = new URL(rawUrl);
275
- if (!["http:", "https:"].includes(url.protocol)) return null;
276
- url.hash = "";
277
- url.hostname = url.hostname.toLowerCase();
278
- if (
279
- (url.protocol === "https:" && url.port === "443") ||
280
- (url.protocol === "http:" && url.port === "80")
281
- ) {
282
- url.port = "";
283
- }
284
- for (const key of [...url.searchParams.keys()]) {
285
- const lower = key.toLowerCase();
286
- if (TRACKING_PARAMS.includes(lower) || lower.startsWith("utm_")) {
287
- url.searchParams.delete(key);
288
- }
289
- }
290
- url.searchParams.sort();
291
- const normalizedPath = url.pathname.replace(/\/+$/, "") || "/";
292
- url.pathname = normalizedPath;
293
- const normalized = url.toString();
294
- return normalizedPath === "/" ? normalized.replace(/\/$/, "") : normalized;
295
- } catch {
296
- return null;
297
- }
298
- }
299
-
300
- function getDomain(rawUrl) {
301
- try {
302
- const domain = new URL(rawUrl).hostname.toLowerCase();
303
- return domain.replace(/^www\./, "");
304
- } catch {
305
- return "";
306
- }
307
- }
308
-
309
- function matchesDomain(domain, hosts) {
310
- return hosts.some((host) => domain === host || domain.endsWith(`.${host}`));
311
- }
312
-
313
- function classifySourceType(domain, title = "", rawUrl = "") {
314
- const lowerTitle = title.toLowerCase();
315
- const lowerUrl = rawUrl.toLowerCase();
316
-
317
- if (domain === "github.com" || domain === "gitlab.com") return "repo";
318
- if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
319
- if (matchesDomain(domain, NEWS_HOSTS)) return "news";
320
- if (
321
- domain.startsWith("docs.") ||
322
- domain.startsWith("developer.") ||
323
- domain.startsWith("developers.") ||
324
- domain.startsWith("api.") ||
325
- lowerTitle.includes("documentation") ||
326
- lowerTitle.includes("docs") ||
327
- lowerTitle.includes("reference") ||
328
- lowerUrl.includes("/docs/") ||
329
- lowerUrl.includes("/reference/") ||
330
- lowerUrl.includes("/api/")
331
- ) {
332
- return "official-docs";
333
- }
334
- if (domain.startsWith("blog.") || lowerUrl.includes("/blog/"))
335
- return "maintainer-blog";
336
- return "website";
337
- }
338
-
339
- function sourceTypePriority(sourceType) {
340
- switch (sourceType) {
341
- case "official-docs":
342
- return 5;
343
- case "repo":
344
- return 4;
345
- case "maintainer-blog":
346
- return 3;
347
- case "website":
348
- return 2;
349
- case "community":
350
- return 1;
351
- case "news":
352
- return 0;
353
- default:
354
- return 0;
355
- }
356
- }
357
-
358
- function bestRank(source) {
359
- const ranks = Object.values(source.perEngine || {}).map((v) => v?.rank || 99);
360
- return ranks.length ? Math.min(...ranks) : 99;
361
- }
362
-
363
- function buildSourceRegistry(out, query = "") {
364
- const seen = new Map();
365
- const engineOrder = ["perplexity", "bing", "google"];
366
-
367
- // Get preferred domains for this query
368
- const preferredDomains = inferPreferredDomains(query);
369
-
370
- for (const engine of engineOrder) {
371
- const result = out[engine];
372
- if (!result?.sources) continue;
373
-
374
- for (let i = 0; i < result.sources.length; i++) {
375
- const source = result.sources[i];
376
- const canonicalUrl = normalizeUrl(source.url);
377
- if (!canonicalUrl || canonicalUrl.length < 10) continue;
378
-
379
- const title = normalizeSourceTitle(source.title || "");
380
- const domain = getDomain(canonicalUrl);
381
- const sourceType = classifySourceType(domain, title, canonicalUrl);
382
-
383
- // Calculate smart score boost
384
- let smartScore = 0;
385
-
386
- // Boost preferred domains for this query
387
- if (preferredDomains.some((pd) => domainMatches(domain, pd))) {
388
- smartScore += 10; // Strong boost for query-relevant official docs
389
- }
390
-
391
- // Boost docs/developer sites
392
- if (sourceType === "official-docs") {
393
- smartScore += 3;
394
- }
395
-
396
- // Boost based on URL path patterns
397
- const lowerUrl = canonicalUrl.toLowerCase();
398
- if (
399
- /\/docs\/|\/documentation\/|\.dev\/|\/api\/|\/reference\//.test(
400
- lowerUrl,
401
- )
402
- ) {
403
- smartScore += 2;
404
- }
405
-
406
- // Penalize community/discussion sites for technical queries
407
- if (sourceType === "community" && preferredDomains.length > 0) {
408
- smartScore -= 2;
409
- }
410
-
411
- const existing = seen.get(canonicalUrl) || {
412
- id: "",
413
- canonicalUrl,
414
- displayUrl: source.url || canonicalUrl,
415
- domain,
416
- title: "",
417
- engines: [],
418
- engineCount: 0,
419
- perEngine: {},
420
- sourceType,
421
- isOfficial: sourceType === "official-docs",
422
- smartScore: 0,
423
- };
424
-
425
- existing.title = pickPreferredTitle(existing.title, title);
426
- existing.displayUrl = existing.displayUrl || source.url || canonicalUrl;
427
- existing.sourceType = existing.sourceType || sourceType;
428
- existing.isOfficial =
429
- existing.isOfficial || sourceType === "official-docs";
430
- existing.smartScore = Math.max(existing.smartScore, smartScore);
431
-
432
- if (!existing.engines.includes(engine)) {
433
- existing.engines.push(engine);
434
- }
435
- existing.perEngine[engine] = {
436
- rank: i + 1,
437
- title: pickPreferredTitle(
438
- existing.perEngine[engine]?.title || "",
439
- title,
440
- ),
441
- };
442
-
443
- seen.set(canonicalUrl, existing);
444
- }
445
- }
446
-
447
- const sources = Array.from(seen.values())
448
- .map((source) => ({
449
- ...source,
450
- engineCount: source.engines.length,
451
- }))
452
- .sort((a, b) => {
453
- // Primary: smart score (query-aware domain boosting)
454
- if (b.smartScore !== a.smartScore) return b.smartScore - a.smartScore;
455
-
456
- // Secondary: consensus (sources found by more engines)
457
- if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;
458
-
459
- // Tertiary: source type priority
460
- if (
461
- sourceTypePriority(b.sourceType) !== sourceTypePriority(a.sourceType)
462
- ) {
463
- return (
464
- sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType)
465
- );
466
- }
467
-
468
- // Quaternary: best rank across engines
469
- if (bestRank(a) !== bestRank(b)) return bestRank(a) - bestRank(b);
470
-
471
- return a.domain.localeCompare(b.domain);
472
- })
473
- .slice(0, 12)
474
- .map((source, index) => ({
475
- ...source,
476
- id: `S${index + 1}`,
477
- title: source.title || source.domain || source.canonicalUrl,
478
- }));
479
-
480
- return sources;
481
- }
482
-
483
- function mergeFetchDataIntoSources(sources, fetchedSources) {
484
- const byId = new Map(fetchedSources.map((source) => [source.id, source]));
485
- return sources.map((source) => {
486
- const fetched = byId.get(source.id);
487
- if (!fetched) return source;
488
-
489
- const title = pickPreferredTitle(source.title, fetched.title || "");
490
- return {
491
- ...source,
492
- title: title || source.title,
493
- fetch: {
494
- attempted: true,
495
- ok: !fetched.error && fetched.contentChars > 100,
496
- status: fetched.status || null,
497
- finalUrl: fetched.finalUrl || fetched.url || source.canonicalUrl,
498
- contentType: fetched.contentType || "",
499
- lastModified: fetched.lastModified || "",
500
- title: fetched.title || "",
501
- snippet: fetched.snippet || "",
502
- contentChars: fetched.contentChars || 0,
503
- source: fetched.source || "unknown", // "http" | "browser"
504
- duration: fetched.duration || 0,
505
- error: fetched.error || "",
506
- },
507
- };
508
- });
509
- }
510
-
511
- function parseStructuredJson(text) {
512
- if (!text) return null;
513
- const trimmed = String(text).trim();
514
- const candidates = [
515
- trimmed,
516
- trimmed
517
- .replace(/^```json\s*/i, "")
518
- .replace(/^```\s*/i, "")
519
- .replace(/```$/i, "")
520
- .trim(),
521
- ];
522
-
523
- const objectMatch = trimmed.match(/\{[\s\S]*\}/);
524
- if (objectMatch) candidates.push(objectMatch[0]);
525
-
526
- for (const candidate of candidates) {
527
- try {
528
- return JSON.parse(candidate);
529
- } catch {
530
- // try next candidate
531
- }
532
- }
533
- return null;
534
- }
535
-
536
- function normalizeSynthesisPayload(payload, sources, fallbackAnswer = "") {
537
- const sourceIds = new Set(sources.map((source) => source.id));
538
- const agreementLevel = [
539
- "high",
540
- "medium",
541
- "low",
542
- "mixed",
543
- "conflicting",
544
- ].includes(payload?.agreement?.level)
545
- ? payload.agreement.level
546
- : "mixed";
547
- const claims = Array.isArray(payload?.claims)
548
- ? payload.claims
549
- .map((claim) => ({
550
- claim: trimText(claim?.claim || "", 260),
551
- support: ["strong", "moderate", "weak", "conflicting"].includes(
552
- claim?.support,
553
- )
554
- ? claim.support
555
- : "moderate",
556
- sourceIds: Array.isArray(claim?.sourceIds)
557
- ? claim.sourceIds.filter((id) => sourceIds.has(id))
558
- : [],
559
- }))
560
- .filter((claim) => claim.claim)
561
- : [];
562
- const recommendedSources = Array.isArray(payload?.recommendedSources)
563
- ? payload.recommendedSources.filter((id) => sourceIds.has(id)).slice(0, 6)
564
- : [];
565
-
566
- return {
567
- answer: trimText(payload?.answer || fallbackAnswer, 4000),
568
- agreement: {
569
- level: agreementLevel,
570
- summary: trimText(payload?.agreement?.summary || "", 280),
571
- },
572
- differences: Array.isArray(payload?.differences)
573
- ? payload.differences
574
- .map((item) => trimText(item, 220))
575
- .filter(Boolean)
576
- .slice(0, 5)
577
- : [],
578
- caveats: Array.isArray(payload?.caveats)
579
- ? payload.caveats
580
- .map((item) => trimText(item, 220))
581
- .filter(Boolean)
582
- .slice(0, 5)
583
- : [],
584
- claims,
585
- recommendedSources,
586
- };
587
- }
588
-
589
- function buildSynthesisPrompt(
590
- query,
591
- results,
592
- sources,
593
- { grounded = false } = {},
594
- ) {
595
- const engineSummaries = {};
596
- for (const engine of ["perplexity", "bing", "google"]) {
597
- const result = results[engine];
598
- if (!result) continue;
599
- if (result.error) {
600
- engineSummaries[engine] = {
601
- status: "error",
602
- error: String(result.error),
603
- };
604
- continue;
605
- }
606
-
607
- engineSummaries[engine] = {
608
- status: "ok",
609
- answer: trimText(result.answer || "", grounded ? 4500 : 2200),
610
- sourceIds: sources
611
- .filter((source) => source.engines.includes(engine))
612
- .sort(
613
- (a, b) =>
614
- (a.perEngine[engine]?.rank || 99) -
615
- (b.perEngine[engine]?.rank || 99),
616
- )
617
- .map((source) => source.id)
618
- .slice(0, 6),
619
- };
620
- }
621
-
622
- const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
623
- id: source.id,
624
- title: source.title,
625
- domain: source.domain,
626
- canonicalUrl: source.canonicalUrl,
627
- sourceType: source.sourceType,
628
- isOfficial: source.isOfficial,
629
- engines: source.engines,
630
- engineCount: source.engineCount,
631
- perEngine: source.perEngine,
632
- fetch:
633
- grounded && source.fetch?.attempted
634
- ? {
635
- ok: source.fetch.ok,
636
- status: source.fetch.status,
637
- lastModified: source.fetch.lastModified,
638
- snippet: trimText(source.fetch.snippet || "", 700),
639
- }
640
- : undefined,
641
- }));
642
-
643
- return [
644
- "You are synthesizing results from Perplexity, Bing Copilot, and Google AI.",
645
- grounded
646
- ? "Use the fetched source snippets as the strongest evidence. Use engine answers for perspective and conflict detection."
647
- : "Use the engine answers for perspective. Use the source registry for provenance and citations.",
648
- "Prefer official docs, release notes, repositories, and maintainer-authored sources when available.",
649
- "If the engines disagree, say so explicitly.",
650
- "Do not invent sources. Only reference source IDs from the source registry.",
651
- "Return valid JSON only. No markdown fences, no prose outside the JSON object.",
652
- "",
653
- "JSON schema:",
654
- "{",
655
- ' "answer": "short direct answer",',
656
- ' "agreement": { "level": "high|medium|low|mixed|conflicting", "summary": "..." },',
657
- ' "differences": ["..."],',
658
- ' "caveats": ["..."],',
659
- ' "claims": [',
660
- ' { "claim": "...", "support": "strong|moderate|weak|conflicting", "sourceIds": ["S1"] }',
661
- " ],",
662
- ' "recommendedSources": ["S1", "S2"]',
663
- "}",
664
- "",
665
- `User query: ${query}`,
666
- "",
667
- `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
668
- "",
669
- `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
670
- ].join("\n");
671
- }
672
-
673
- function buildConfidence(out) {
674
- const sources = Array.isArray(out._sources) ? out._sources : [];
675
- const topConsensus = sources.length > 0 ? sources[0]?.engineCount || 0 : 0;
676
- const officialSourceCount = sources.filter(
677
- (source) => source.isOfficial,
678
- ).length;
679
- const firstPartySourceCount = sources.filter(
680
- (source) => source.isOfficial || source.sourceType === "maintainer-blog",
681
- ).length;
682
- const fetchedAttempted = sources.filter(
683
- (source) => source.fetch?.attempted,
684
- ).length;
685
- const fetchedSucceeded = sources.filter((source) => source.fetch?.ok).length;
686
- const sourceTypeBreakdown = sources.reduce((acc, source) => {
687
- acc[source.sourceType] = (acc[source.sourceType] || 0) + 1;
688
- return acc;
689
- }, {});
690
- const synthesisLevel = out._synthesis?.agreement?.level;
691
-
692
- return {
693
- sourcesCount: sources.length,
694
- topSourceConsensus: topConsensus,
695
- agreementLevel:
696
- synthesisLevel ||
697
- (topConsensus >= 3 ? "high" : topConsensus >= 2 ? "medium" : "low"),
698
- enginesResponded: ALL_ENGINES.filter(
699
- (engine) => out[engine]?.answer && !out[engine]?.error,
700
- ),
701
- enginesFailed: ALL_ENGINES.filter((engine) => out[engine]?.error),
702
- officialSourceCount,
703
- firstPartySourceCount,
704
- fetchedSourceSuccessRate:
705
- fetchedAttempted > 0
706
- ? Number((fetchedSucceeded / fetchedAttempted).toFixed(2))
707
- : 0,
708
- sourceTypeBreakdown,
709
- };
710
- }
711
-
712
- function getFullTabFromCache(engine) {
713
- try {
714
- if (!existsSync(PAGES_CACHE)) return null;
715
- const pages = JSON.parse(readFileSync(PAGES_CACHE, "utf8"));
716
- const found = pages.find((p) => p.url.includes(ENGINE_DOMAINS[engine]));
717
- return found ? found.targetId : null;
718
- } catch {
719
- return null;
720
- }
721
- }
722
-
723
- function cdp(args, timeoutMs = 15000) {
724
- return new Promise((resolve, reject) => {
725
- const proc = spawn("node", [CDP, ...args], {
726
- stdio: ["ignore", "pipe", "pipe"],
727
- });
728
- let out = "",
729
- err = "";
730
- proc.stdout.on("data", (d) => (out += d));
731
- proc.stderr.on("data", (d) => (err += d));
732
- const t = setTimeout(() => {
733
- proc.kill();
734
- reject(new Error(`cdp timeout: ${args[0]}`));
735
- }, timeoutMs);
736
- proc.on("close", (code) => {
737
- clearTimeout(t);
738
- if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
739
- else resolve(out.trim());
740
- });
741
- });
742
- }
743
-
744
- async function getAnyTab() {
745
- const list = await cdp(["list"]);
746
- const first = list.split("\n")[0];
747
- if (!first) throw new Error("No Chrome tabs found");
748
- return first.slice(0, 8);
749
- }
750
-
751
- async function _getOrReuseBlankTab() {
752
- // Reuse an existing about:blank tab rather than always creating a new one
753
- const listOut = await cdp(["list"]);
754
- const lines = listOut.split("\n").filter(Boolean);
755
- for (const line of lines) {
756
- if (line.includes("about:blank")) {
757
- return line.slice(0, 8); // prefix of the blank tab's targetId
758
- }
759
- }
760
- // No blank tab — open a new one
761
- const anchor = await getAnyTab();
762
- const raw = await cdp([
763
- "evalraw",
764
- anchor,
765
- "Target.createTarget",
766
- '{"url":"about:blank"}',
767
- ]);
768
- const { targetId } = JSON.parse(raw);
769
- return targetId;
770
- }
771
-
772
- async function openNewTab() {
773
- const anchor = await getAnyTab();
774
- const raw = await cdp([
775
- "evalraw",
776
- anchor,
777
- "Target.createTarget",
778
- '{"url":"about:blank"}',
779
- ]);
780
- const { targetId } = JSON.parse(raw);
781
- return targetId;
782
- }
783
-
784
- async function _getOrOpenEngineTab(engine) {
785
- await cdp(["list"]);
786
- return getFullTabFromCache(engine) || openNewTab();
787
- }
788
-
789
- async function activateTab(targetId) {
790
- try {
791
- const anchor = await getAnyTab();
792
- await cdp([
793
- "evalraw",
794
- anchor,
795
- "Target.activateTarget",
796
- JSON.stringify({ targetId }),
797
- ]);
798
- } catch {
799
- // best-effort
800
- }
801
- }
802
-
803
- async function closeTabs(targetIds = []) {
804
- for (const targetId of targetIds) {
805
- if (!targetId) continue;
806
- await closeTab(targetId);
807
- }
808
- if (targetIds.length > 0) {
809
- await new Promise((r) => setTimeout(r, 300));
810
- await cdp(["list"]).catch(() => null);
811
- }
812
- }
813
-
814
- async function closeTab(targetId) {
815
- try {
816
- const anchor = await getAnyTab();
817
- await cdp([
818
- "evalraw",
819
- anchor,
820
- "Target.closeTarget",
821
- JSON.stringify({ targetId }),
822
- ]);
823
- } catch {
824
- /* best-effort */
825
- }
826
- }
827
-
828
- function runExtractor(
829
- script,
830
- query,
831
- tabPrefix = null,
832
- short = false,
833
- timeoutMs = null, // null = auto-select based on engine
834
- ) {
835
- // Gemini is slower - use longer timeout
836
- if (timeoutMs === null) {
837
- timeoutMs = script.includes("gemini") ? 180000 : 90000;
838
- }
839
- const extraArgs = [
840
- ...(tabPrefix ? ["--tab", tabPrefix] : []),
841
- ...(short ? ["--short"] : []),
842
- ];
44
+ const SOURCE_FETCH_CONCURRENCY = Math.max(
45
+ 1,
46
+ parseInt(process.env.GREEDY_FETCH_CONCURRENCY || "2", 10) || 2,
47
+ );
48
+
49
+ const ENGINES = {
50
+ perplexity: "perplexity.mjs",
51
+ pplx: "perplexity.mjs",
52
+ p: "perplexity.mjs",
53
+ bing: "bing-copilot.mjs",
54
+ copilot: "bing-copilot.mjs",
55
+ b: "bing-copilot.mjs",
56
+ google: "google-ai.mjs",
57
+ g: "google-ai.mjs",
58
+ gemini: "gemini.mjs",
59
+ gem: "gemini.mjs",
60
+ };
61
+
62
+ const ALL_ENGINES = ["perplexity", "bing", "google"];
63
+
64
+ const ENGINE_DOMAINS = {
65
+ perplexity: "perplexity.ai",
66
+ bing: "copilot.microsoft.com",
67
+ google: "google.com",
68
+ gemini: "gemini.google.com",
69
+ };
70
+
71
+ const TRACKING_PARAMS = [
72
+ "fbclid",
73
+ "gclid",
74
+ "ref",
75
+ "ref_src",
76
+ "ref_url",
77
+ "source",
78
+ "utm_campaign",
79
+ "utm_content",
80
+ "utm_medium",
81
+ "utm_source",
82
+ "utm_term",
83
+ ];
84
+
85
+ const COMMUNITY_HOSTS = [
86
+ "dev.to",
87
+ "hashnode.com",
88
+ "medium.com",
89
+ "reddit.com",
90
+ "stackoverflow.com",
91
+ "stackexchange.com",
92
+ "substack.com",
93
+ ];
94
+
95
+ const NEWS_HOSTS = [
96
+ "arstechnica.com",
97
+ "techcrunch.com",
98
+ "theverge.com",
99
+ "venturebeat.com",
100
+ "wired.com",
101
+ "zdnet.com",
102
+ ];
103
+
104
+ /**
105
+ * Infer preferred domains based on query keywords
106
+ * Returns domains that should be boosted for this query
107
+ */
108
+ function inferPreferredDomains(query) {
109
+ const normalized = query.toLowerCase();
110
+ const matches = [];
111
+
112
+ if (
113
+ normalized.includes("openai") ||
114
+ normalized.includes("gpt") ||
115
+ normalized.includes("chatgpt")
116
+ ) {
117
+ matches.push("openai.com", "platform.openai.com", "help.openai.com");
118
+ }
119
+ if (normalized.includes("anthropic") || normalized.includes("claude")) {
120
+ matches.push("anthropic.com", "docs.anthropic.com");
121
+ }
122
+ if (normalized.includes("bun")) {
123
+ matches.push("bun.sh", "bun.com");
124
+ }
125
+ if (normalized.includes("next.js") || normalized.includes("nextjs")) {
126
+ matches.push("nextjs.org", "vercel.com");
127
+ }
128
+ if (normalized.includes("playwright")) {
129
+ matches.push("playwright.dev");
130
+ }
131
+ if (normalized.includes("supabase")) {
132
+ matches.push("supabase.com", "supabase.io");
133
+ }
134
+ if (normalized.includes("prisma")) {
135
+ matches.push("prisma.io");
136
+ }
137
+ if (normalized.includes("tailwind")) {
138
+ matches.push("tailwindcss.com");
139
+ }
140
+ if (normalized.includes("vite")) {
141
+ matches.push("vitejs.dev", "vite.dev");
142
+ }
143
+ if (normalized.includes("astro")) {
144
+ matches.push("astro.build");
145
+ }
146
+ if (normalized.includes("svelte")) {
147
+ matches.push("svelte.dev");
148
+ }
149
+ if (normalized.includes("solid")) {
150
+ matches.push("solidjs.com");
151
+ }
152
+ if (normalized.includes("vue") || normalized.includes("nuxt")) {
153
+ matches.push("vuejs.org", "nuxt.com");
154
+ }
155
+ if (normalized.includes("react") || normalized.includes("react native")) {
156
+ matches.push("react.dev", "reactnative.dev");
157
+ }
158
+ if (normalized.includes("angular")) {
159
+ matches.push("angular.io", "angular.dev");
160
+ }
161
+ if (normalized.includes("node.js") || normalized.includes("nodejs")) {
162
+ matches.push("nodejs.org", "nodejs.dev", "npmjs.com");
163
+ }
164
+ if (normalized.includes("deno")) {
165
+ matches.push("deno.land", "deno.com");
166
+ }
167
+ if (normalized.includes("fresh")) {
168
+ matches.push("fresh.deno.dev");
169
+ }
170
+ if (normalized.includes("typescript") || normalized.includes("ts")) {
171
+ matches.push("typescriptlang.org");
172
+ }
173
+ if (normalized.includes("python")) {
174
+ matches.push("python.org", "docs.python.org");
175
+ }
176
+ if (normalized.includes("rust")) {
177
+ matches.push("rust-lang.org", "docs.rs", "crates.io");
178
+ }
179
+ if (normalized.includes("go") || normalized.includes("golang")) {
180
+ matches.push("go.dev", "golang.org", "pkg.go.dev");
181
+ }
182
+ if (normalized.includes("zig")) {
183
+ matches.push("ziglang.org");
184
+ }
185
+ if (normalized.includes("docker")) {
186
+ matches.push("docker.com", "docs.docker.com", "hub.docker.com");
187
+ }
188
+ if (normalized.includes("kubernetes") || normalized.includes("k8s")) {
189
+ matches.push("kubernetes.io", "k8s.io");
190
+ }
191
+ if (normalized.includes("postgres") || normalized.includes("postgresql")) {
192
+ matches.push("postgresql.org", "neon.tech", "supabase.com");
193
+ }
194
+ if (normalized.includes("redis")) {
195
+ matches.push("redis.io");
196
+ }
197
+ if (normalized.includes("sqlite")) {
198
+ matches.push("sqlite.org");
199
+ }
200
+ if (normalized.includes("cloudflare")) {
201
+ matches.push("developers.cloudflare.com", "cloudflare.com");
202
+ }
203
+ if (normalized.includes("vercel")) {
204
+ matches.push("vercel.com", "nextjs.org");
205
+ }
206
+ if (normalized.includes("netlify")) {
207
+ matches.push("netlify.com", "docs.netlify.com");
208
+ }
209
+ if (normalized.includes("stripe")) {
210
+ matches.push("stripe.com", "docs.stripe.com");
211
+ }
212
+ if (normalized.includes("github")) {
213
+ matches.push("github.com", "docs.github.com");
214
+ }
215
+ if (normalized.includes("gitlab")) {
216
+ matches.push("gitlab.com", "docs.gitlab.com");
217
+ }
218
+ if (normalized.includes("aws")) {
219
+ matches.push("aws.amazon.com", "docs.aws.amazon.com");
220
+ }
221
+ if (normalized.includes("azure")) {
222
+ matches.push("azure.microsoft.com", "learn.microsoft.com");
223
+ }
224
+ if (normalized.includes("gcp") || normalized.includes("google cloud")) {
225
+ matches.push("cloud.google.com", "developers.google.com");
226
+ }
227
+ if (normalized.includes("gemini") || normalized.includes("google ai")) {
228
+ matches.push("ai.google.dev", "developers.google.com");
229
+ }
230
+
231
+ return [...new Set(matches)];
232
+ }
233
+
234
/**
 * Report whether `hostname` is `candidate` itself or one of its subdomains.
 *
 * @param {string} hostname - Host name to test.
 * @param {string} candidate - Preferred domain to compare against.
 * @returns {boolean} True on an exact match or when `hostname` ends with `.candidate`.
 */
function domainMatches(hostname, candidate) {
  if (hostname === candidate) return true;
  const subdomainSuffix = `.${candidate}`;
  return hostname.endsWith(subdomainSuffix);
}
240
+
241
/**
 * Collapse whitespace runs to single spaces and cap the text length.
 *
 * When the collapsed text is longer than `maxChars`, it is cut at `maxChars`,
 * the trailing (possibly partial) word is dropped, and "..." is appended.
 *
 * @param {*} text - Value to clean; coerced with `String()`.
 * @param {number} maxChars - Maximum length before truncation applies.
 * @returns {string} The cleaned, possibly truncated text.
 */
function trimText(text = "", maxChars = 240) {
  const collapsed = String(text).replace(/\s+/g, " ").trim();
  if (collapsed.length > maxChars) {
    const head = collapsed.slice(0, maxChars);
    return `${head.replace(/\s+\S*$/, "")}...`;
  }
  return collapsed;
}
246
+
247
/**
 * Clean a source title and blank out values that are not usable as titles.
 *
 * Returns "" when the trimmed title is empty, starts with http(s)://, or is a
 * short fragment: at most four words, entirely lowercase, and without digits.
 *
 * @param {string} title - Raw title reported for a source.
 * @returns {string} The title trimmed to 180 characters, or "".
 */
function normalizeSourceTitle(title = "") {
  const candidate = trimText(title, 180);
  if (!candidate || /^https?:\/\//i.test(candidate)) return "";

  const words = candidate.split(/\s+/).filter(Boolean);
  const isAllLowercase = candidate === candidate.toLowerCase();
  const containsUppercase = /[A-Z]/.test(candidate);
  const containsDigit = /\d/.test(candidate);

  const isFragment =
    isAllLowercase &&
    words.length <= 4 &&
    !containsUppercase &&
    !containsDigit;
  if (isFragment) return "";
  return candidate;
}
262
+
263
/**
 * Choose the better of two candidate titles for the same source.
 *
 * Both titles are normalized first. An empty candidate loses to a non-empty
 * one, a URL-looking candidate loses to a non-URL one, and otherwise the
 * longer title wins (the current title is kept on a tie).
 *
 * @param {string} currentTitle - Title already stored for the source.
 * @param {string} nextTitle - Newly seen title.
 * @returns {string} The preferred normalized title (may be "").
 */
function pickPreferredTitle(currentTitle = "", nextTitle = "") {
  const isUrlLike = (value) => /^https?:\/\//i.test(value);

  const current = normalizeSourceTitle(currentTitle);
  const next = normalizeSourceTitle(nextTitle);
  if (!next) return current;
  if (!current) return next;

  // normalizeSourceTitle already blanks URL-looking titles, so this check
  // acts only as a safeguard.
  const currentIsUrl = isUrlLike(current);
  const nextIsUrl = isUrlLike(next);
  if (currentIsUrl !== nextIsUrl) {
    return currentIsUrl ? next : current;
  }

  if (next.length > current.length) return next;
  return current;
}
274
+
275
/**
 * Produce a canonical form of an http(s) URL for de-duplication.
 *
 * Drops the fragment, lowercases the host, removes default ports, strips
 * tracking parameters (anything in TRACKING_PARAMS or starting with "utm_"),
 * sorts the remaining query parameters and removes trailing slashes from
 * the path.
 *
 * @param {string} rawUrl - URL as reported by an engine.
 * @returns {string|null} Canonical URL, or null for empty, unparsable or
 *   non-http(s) input.
 */
function normalizeUrl(rawUrl) {
  if (!rawUrl) return null;
  try {
    const parsed = new URL(rawUrl);
    const { protocol } = parsed;
    if (protocol !== "http:" && protocol !== "https:") return null;

    parsed.hash = "";
    parsed.hostname = parsed.hostname.toLowerCase();

    const defaultPort = protocol === "https:" ? "443" : "80";
    if (parsed.port === defaultPort) {
      parsed.port = "";
    }

    // Snapshot the keys first: deleting while iterating the live list would
    // skip entries.
    const paramNames = Array.from(parsed.searchParams.keys());
    paramNames.forEach((name) => {
      const lowered = name.toLowerCase();
      if (TRACKING_PARAMS.includes(lowered) || lowered.startsWith("utm_")) {
        parsed.searchParams.delete(name);
      }
    });
    parsed.searchParams.sort();

    const trimmedPath = parsed.pathname.replace(/\/+$/, "") || "/";
    parsed.pathname = trimmedPath;

    const href = parsed.toString();
    if (trimmedPath !== "/") return href;
    // Root path: drop the lone trailing slash when nothing follows it.
    return href.replace(/\/$/, "");
  } catch {
    return null;
  }
}
303
+
304
/**
 * Extract the lowercase host name of a URL without a leading "www.".
 *
 * @param {string} rawUrl - URL to inspect.
 * @returns {string} The host name, or "" when the URL cannot be parsed.
 */
function getDomain(rawUrl) {
  let host;
  try {
    host = new URL(rawUrl).hostname;
  } catch {
    return "";
  }
  return host.toLowerCase().replace(/^www\./, "");
}
312
+
313
/**
 * Test whether `domain` equals, or is a subdomain of, any entry in `hosts`.
 *
 * @param {string} domain - Domain to test.
 * @param {string[]} hosts - Host names to compare against.
 * @returns {boolean} True when at least one host matches.
 */
function matchesDomain(domain, hosts) {
  const isSameOrSubdomain = (host) => {
    if (domain === host) return true;
    return domain.endsWith(`.${host}`);
  };
  return hosts.some(isSameOrSubdomain);
}
316
+
317
/**
 * Assign a coarse source type to a search result.
 *
 * Checks run in this order: repo hosts, community hosts, news hosts,
 * documentation signals (subdomain prefix, title keyword or URL path), blog
 * signals, and finally the generic "website".
 *
 * @param {string} domain - Domain of the source.
 * @param {string} title - Source title.
 * @param {string} rawUrl - Source URL.
 * @returns {"repo"|"community"|"news"|"official-docs"|"maintainer-blog"|"website"}
 */
function classifySourceType(domain, title = "", rawUrl = "") {
  const titleLower = title.toLowerCase();
  const urlLower = rawUrl.toLowerCase();

  if (domain === "github.com" || domain === "gitlab.com") return "repo";
  if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
  if (matchesDomain(domain, NEWS_HOSTS)) return "news";

  const hasDocsSubdomain = ["docs.", "developer.", "developers.", "api."].some(
    (prefix) => domain.startsWith(prefix),
  );
  const hasDocsTitle = ["documentation", "docs", "reference"].some((word) =>
    titleLower.includes(word),
  );
  const hasDocsPath = ["/docs/", "/reference/", "/api/"].some((segment) =>
    urlLower.includes(segment),
  );
  if (hasDocsSubdomain || hasDocsTitle || hasDocsPath) {
    return "official-docs";
  }

  const looksLikeBlog = domain.startsWith("blog.") || urlLower.includes("/blog/");
  return looksLikeBlog ? "maintainer-blog" : "website";
}
342
+
343
/**
 * Map a source type to its sort priority (higher sorts first).
 *
 * @param {string} sourceType - Value produced by classifySourceType.
 * @returns {number} Priority from 5 (official docs) down to 0 (news or unknown).
 */
function sourceTypePriority(sourceType) {
  const priorities = new Map([
    ["official-docs", 5],
    ["repo", 4],
    ["maintainer-blog", 3],
    ["website", 2],
    ["community", 1],
    ["news", 0],
  ]);
  return priorities.get(sourceType) ?? 0;
}
361
+
362
/**
 * Find the best (lowest) rank a source achieved across engines.
 *
 * @param {{perEngine?: Object<string, {rank?: number}>}} source - Registry entry.
 * @returns {number} The lowest rank; missing or falsy ranks count as 99, and
 *   99 is returned when there are no per-engine entries.
 */
function bestRank(source) {
  const entries = Object.values(source.perEngine || {});
  if (entries.length === 0) return 99;
  return entries.reduce(
    (best, entry) => Math.min(best, entry?.rank || 99),
    Infinity,
  );
}
366
+
367
/**
 * Merge the sources reported by each engine into one ranked registry.
 *
 * Sources are de-duplicated by canonical URL, scored with query-aware
 * heuristics, sorted, capped at 12 entries and given ids S1..Sn.
 *
 * Sort order: smart score, number of engines citing the source, source type
 * priority, best per-engine rank, then domain name.
 *
 * @param {Object} out - Per-engine results keyed by engine name; each may
 *   carry a `sources` array of `{ url, title }` objects.
 * @param {string} query - User query, used to infer preferred domains.
 * @returns {Array<Object>} Ranked registry entries.
 */
function buildSourceRegistry(out, query = "") {
  const seen = new Map();
  const engineOrder = ["perplexity", "bing", "google"];

  // Domains treated as authoritative for this particular query.
  const preferredDomains = inferPreferredDomains(query);

  for (const engine of engineOrder) {
    const result = out[engine];
    if (!result?.sources) continue;

    for (let i = 0; i < result.sources.length; i++) {
      const source = result.sources[i];
      const canonicalUrl = normalizeUrl(source.url);
      if (!canonicalUrl || canonicalUrl.length < 10) continue;

      const title = normalizeSourceTitle(source.title || "");
      const domain = getDomain(canonicalUrl);
      const sourceType = classifySourceType(domain, title, canonicalUrl);

      let smartScore = 0;

      // Strong boost for domains that are official for the query topic.
      if (preferredDomains.some((pd) => domainMatches(domain, pd))) {
        smartScore += 10;
      }

      // Boost docs/developer sites.
      if (sourceType === "official-docs") {
        smartScore += 3;
      }

      // Boost based on URL path patterns.
      const lowerUrl = canonicalUrl.toLowerCase();
      if (
        /\/docs\/|\/documentation\/|\.dev\/|\/api\/|\/reference\//.test(
          lowerUrl,
        )
      ) {
        smartScore += 2;
      }

      // Penalize community/discussion sites for technical queries.
      if (sourceType === "community" && preferredDomains.length > 0) {
        smartScore -= 2;
      }

      const previous = seen.get(canonicalUrl);
      const existing = previous || {
        id: "",
        canonicalUrl,
        displayUrl: source.url || canonicalUrl,
        domain,
        title: "",
        engines: [],
        engineCount: 0,
        perEngine: {},
        sourceType,
        isOfficial: sourceType === "official-docs",
        smartScore,
      };

      existing.title = pickPreferredTitle(existing.title, title);
      existing.displayUrl = existing.displayUrl || source.url || canonicalUrl;
      existing.sourceType = existing.sourceType || sourceType;
      existing.isOfficial =
        existing.isOfficial || sourceType === "official-docs";
      // A new entry keeps its computed score as-is. Merging against a 0
      // baseline would clamp negative scores and silently drop the
      // community penalty.
      existing.smartScore = previous
        ? Math.max(previous.smartScore, smartScore)
        : smartScore;

      if (!existing.engines.includes(engine)) {
        existing.engines.push(engine);
      }
      existing.perEngine[engine] = {
        // Positions only increase within an engine's list, so the rank
        // already recorded for this engine is the best one; keep it when
        // the same canonical URL shows up again.
        rank: existing.perEngine[engine]?.rank ?? i + 1,
        title: pickPreferredTitle(
          existing.perEngine[engine]?.title || "",
          title,
        ),
      };

      seen.set(canonicalUrl, existing);
    }
  }

  const sources = Array.from(seen.values())
    .map((source) => ({
      ...source,
      engineCount: source.engines.length,
    }))
    .sort((a, b) => {
      // Primary: smart score (query-aware domain boosting).
      if (b.smartScore !== a.smartScore) return b.smartScore - a.smartScore;

      // Secondary: consensus (sources found by more engines).
      if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;

      // Tertiary: source type priority.
      const typeDelta =
        sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType);
      if (typeDelta !== 0) return typeDelta;

      // Quaternary: best rank across engines.
      const rankDelta = bestRank(a) - bestRank(b);
      if (rankDelta !== 0) return rankDelta;

      return a.domain.localeCompare(b.domain);
    })
    .slice(0, 12)
    .map((source, index) => ({
      ...source,
      id: `S${index + 1}`,
      title: source.title || source.domain || source.canonicalUrl,
    }));

  return sources;
}
486
+
487
/**
 * Attach fetch results to the registry entries they belong to.
 *
 * Entries without a fetch result (matched by `id`) are returned untouched.
 * Matched entries get a `fetch` summary and, when the fetched page offers a
 * better title, an upgraded `title`.
 *
 * @param {Array<Object>} sources - Registry entries.
 * @param {Array<Object>} fetchedSources - Fetch results carrying the same ids.
 * @returns {Array<Object>} New array of registry entries.
 */
function mergeFetchDataIntoSources(sources, fetchedSources) {
  const fetchedById = new Map();
  for (const item of fetchedSources) {
    fetchedById.set(item.id, item);
  }

  return sources.map((source) => {
    const fetched = fetchedById.get(source.id);
    if (!fetched) return source;

    const mergedTitle = pickPreferredTitle(source.title, fetched.title || "");
    const fetchSummary = {
      attempted: true,
      // A fetch counts as successful only with no error and more than 100
      // characters of content.
      ok: !fetched.error && fetched.contentChars > 100,
      status: fetched.status || null,
      finalUrl: fetched.finalUrl || fetched.url || source.canonicalUrl,
      contentType: fetched.contentType || "",
      lastModified: fetched.lastModified || "",
      title: fetched.title || "",
      snippet: fetched.snippet || "",
      contentChars: fetched.contentChars || 0,
      source: fetched.source || "unknown", // "http" | "browser"
      duration: fetched.duration || 0,
      error: fetched.error || "",
    };

    return {
      ...source,
      title: mergedTitle || source.title,
      fetch: fetchSummary,
    };
  });
}
514
+
515
/**
 * Leniently parse JSON out of a model response.
 *
 * Tries, in order: the trimmed text, the text with Markdown code fences
 * stripped, and the span from the first "{" to the last "}".
 *
 * @param {*} text - Raw response text.
 * @returns {*} The first value that parses, or null when nothing does (or
 *   when `text` is falsy).
 */
function parseStructuredJson(text) {
  if (!text) return null;

  const raw = String(text).trim();
  const unfenced = raw
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/```$/i, "")
    .trim();
  const embedded = raw.match(/\{[\s\S]*\}/);
  const attempts = embedded ? [raw, unfenced, embedded[0]] : [raw, unfenced];

  const tryParse = (candidate) => {
    try {
      return { ok: true, value: JSON.parse(candidate) };
    } catch {
      return { ok: false, value: null };
    }
  };

  for (const attempt of attempts) {
    const outcome = tryParse(attempt);
    if (outcome.ok) return outcome.value;
  }
  return null;
}
539
+
540
/**
 * Coerce a parsed synthesis response into the expected shape.
 *
 * Unknown enum values fall back to defaults ("mixed" agreement, "moderate"
 * support), text fields are length-capped, list fields are limited in size,
 * and source ids that are not in the registry are discarded.
 *
 * @param {Object|null} payload - Parsed model output (may be null).
 * @param {Array<{id: string}>} sources - Registry defining the valid ids.
 * @param {string} fallbackAnswer - Answer used when the payload has none.
 * @returns {Object} Payload with answer, agreement, differences, caveats,
 *   claims and recommendedSources.
 */
function normalizeSynthesisPayload(payload, sources, fallbackAnswer = "") {
  const agreementLevels = ["high", "medium", "low", "mixed", "conflicting"];
  const supportLevels = ["strong", "moderate", "weak", "conflicting"];

  const knownIds = new Set(sources.map((source) => source.id));
  const keepKnownIds = (ids) => ids.filter((id) => knownIds.has(id));
  const toTrimmedList = (items) =>
    Array.isArray(items)
      ? items
          .map((item) => trimText(item, 220))
          .filter(Boolean)
          .slice(0, 5)
      : [];

  let level = "mixed";
  if (agreementLevels.includes(payload?.agreement?.level)) {
    level = payload.agreement.level;
  }

  let claims = [];
  if (Array.isArray(payload?.claims)) {
    claims = payload.claims
      .map((entry) => ({
        claim: trimText(entry?.claim || "", 260),
        support: supportLevels.includes(entry?.support)
          ? entry.support
          : "moderate",
        sourceIds: Array.isArray(entry?.sourceIds)
          ? keepKnownIds(entry.sourceIds)
          : [],
      }))
      // Drop claims whose text is empty after trimming.
      .filter((entry) => entry.claim);
  }

  let recommendedSources = [];
  if (Array.isArray(payload?.recommendedSources)) {
    recommendedSources = keepKnownIds(payload.recommendedSources).slice(0, 6);
  }

  return {
    answer: trimText(payload?.answer || fallbackAnswer, 4000),
    agreement: {
      level,
      summary: trimText(payload?.agreement?.summary || "", 280),
    },
    differences: toTrimmedList(payload?.differences),
    caveats: toTrimmedList(payload?.caveats),
    claims,
    recommendedSources,
  };
}
592
+
593
+ function buildSynthesisPrompt(
594
+ query,
595
+ results,
596
+ sources,
597
+ { grounded = false } = {},
598
+ ) {
599
+ const engineSummaries = {};
600
+ for (const engine of ["perplexity", "bing", "google"]) {
601
+ const result = results[engine];
602
+ if (!result) continue;
603
+ if (result.error) {
604
+ engineSummaries[engine] = {
605
+ status: "error",
606
+ error: String(result.error),
607
+ };
608
+ continue;
609
+ }
610
+
611
+ engineSummaries[engine] = {
612
+ status: "ok",
613
+ answer: trimText(result.answer || "", grounded ? 4500 : 2200),
614
+ sourceIds: sources
615
+ .filter((source) => source.engines.includes(engine))
616
+ .sort(
617
+ (a, b) =>
618
+ (a.perEngine[engine]?.rank || 99) -
619
+ (b.perEngine[engine]?.rank || 99),
620
+ )
621
+ .map((source) => source.id)
622
+ .slice(0, 6),
623
+ };
624
+ }
625
+
626
+ const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
627
+ id: source.id,
628
+ title: source.title,
629
+ domain: source.domain,
630
+ canonicalUrl: source.canonicalUrl,
631
+ sourceType: source.sourceType,
632
+ isOfficial: source.isOfficial,
633
+ engines: source.engines,
634
+ engineCount: source.engineCount,
635
+ perEngine: source.perEngine,
636
+ fetch:
637
+ grounded && source.fetch?.attempted
638
+ ? {
639
+ ok: source.fetch.ok,
640
+ status: source.fetch.status,
641
+ lastModified: source.fetch.lastModified,
642
+ snippet: trimText(source.fetch.snippet || "", 700),
643
+ }
644
+ : undefined,
645
+ }));
646
+
647
+ return [
648
+ "You are synthesizing results from Perplexity, Bing Copilot, and Google AI.",
649
+ grounded
650
+ ? "Use the fetched source snippets as the strongest evidence. Use engine answers for perspective and conflict detection."
651
+ : "Use the engine answers for perspective. Use the source registry for provenance and citations.",
652
+ "Prefer official docs, release notes, repositories, and maintainer-authored sources when available.",
653
+ "If the engines disagree, say so explicitly.",
654
+ "Do not invent sources. Only reference source IDs from the source registry.",
655
+ "Return valid JSON only. No markdown fences, no prose outside the JSON object.",
656
+ "",
657
+ "JSON schema:",
658
+ "{",
659
+ ' "answer": "short direct answer",',
660
+ ' "agreement": { "level": "high|medium|low|mixed|conflicting", "summary": "..." },',
661
+ ' "differences": ["..."],',
662
+ ' "caveats": ["..."],',
663
+ ' "claims": [',
664
+ ' { "claim": "...", "support": "strong|moderate|weak|conflicting", "sourceIds": ["S1"] }',
665
+ " ],",
666
+ ' "recommendedSources": ["S1", "S2"]',
667
+ "}",
668
+ "",
669
+ `User query: ${query}`,
670
+ "",
671
+ `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
672
+ "",
673
+ `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
674
+ ].join("\n");
675
+ }
676
+
677
/**
 * Summarize how trustworthy a combined search result looks.
 *
 * Reads `out._sources` and `out._synthesis` plus the per-engine results to
 * report source counts, consensus, which engines responded or failed, the
 * fetch success rate and a breakdown by source type.
 *
 * @param {Object} out - Combined result object.
 * @returns {Object} Confidence metrics.
 */
function buildConfidence(out) {
  const sources = Array.isArray(out._sources) ? out._sources : [];

  let officialSourceCount = 0;
  let firstPartySourceCount = 0;
  let fetchedAttempted = 0;
  let fetchedSucceeded = 0;
  const sourceTypeBreakdown = {};

  for (const source of sources) {
    if (source.isOfficial) officialSourceCount += 1;
    if (source.isOfficial || source.sourceType === "maintainer-blog") {
      firstPartySourceCount += 1;
    }
    if (source.fetch?.attempted) fetchedAttempted += 1;
    if (source.fetch?.ok) fetchedSucceeded += 1;
    sourceTypeBreakdown[source.sourceType] =
      (sourceTypeBreakdown[source.sourceType] || 0) + 1;
  }

  // Consensus of the top-ranked source; 0 when there are no sources.
  const topConsensus = sources[0]?.engineCount || 0;

  let agreementLevel = out._synthesis?.agreement?.level;
  if (!agreementLevel) {
    // No synthesis verdict: derive one from the top source's consensus.
    if (topConsensus >= 3) agreementLevel = "high";
    else if (topConsensus >= 2) agreementLevel = "medium";
    else agreementLevel = "low";
  }

  let fetchedSourceSuccessRate = 0;
  if (fetchedAttempted > 0) {
    fetchedSourceSuccessRate = Number(
      (fetchedSucceeded / fetchedAttempted).toFixed(2),
    );
  }

  return {
    sourcesCount: sources.length,
    topSourceConsensus: topConsensus,
    agreementLevel,
    enginesResponded: ALL_ENGINES.filter(
      (engine) => out[engine]?.answer && !out[engine]?.error,
    ),
    enginesFailed: ALL_ENGINES.filter((engine) => out[engine]?.error),
    officialSourceCount,
    firstPartySourceCount,
    fetchedSourceSuccessRate,
    sourceTypeBreakdown,
  };
}
715
+
716
/**
 * Look up the cached tab whose URL contains the given engine's domain.
 *
 * Reads the JSON page list stored at PAGES_CACHE. Any failure (missing file,
 * unreadable JSON, unexpected shape) yields null.
 *
 * @param {string} engine - Key into ENGINE_DOMAINS.
 * @returns {string|null} The matching page's `targetId`, or null.
 */
function getFullTabFromCache(engine) {
  try {
    if (existsSync(PAGES_CACHE)) {
      const cachedPages = JSON.parse(readFileSync(PAGES_CACHE, "utf8"));
      const match = cachedPages.find((page) =>
        page.url.includes(ENGINE_DOMAINS[engine]),
      );
      if (match) return match.targetId;
    }
    return null;
  } catch {
    return null;
  }
}
726
+
727
/**
 * Run the CDP helper script (path held in `CDP`) as a child `node` process.
 *
 * @param {string[]} args - Arguments passed to the helper; `args[0]` is the
 *   command name and is used in the timeout message.
 * @param {number} timeoutMs - Time after which the child is killed.
 * @returns {Promise<string>} Trimmed stdout when the child exits with code 0.
 *   Rejects on a non-zero exit (with trimmed stderr when available), on
 *   timeout, or when the process cannot be spawned.
 */
function cdp(args, timeoutMs = 15000) {
  return new Promise((resolve, reject) => {
    const proc = spawn("node", [CDP, ...args], {
      stdio: ["ignore", "pipe", "pipe"],
    });
    let out = "";
    let err = "";
    proc.stdout.on("data", (d) => {
      out += d;
    });
    proc.stderr.on("data", (d) => {
      err += d;
    });

    const timer = setTimeout(() => {
      proc.kill();
      reject(new Error(`cdp timeout: ${args[0]}`));
    }, timeoutMs);

    // Without an 'error' listener a failed spawn would surface as an
    // unhandled 'error' event instead of a rejected promise.
    proc.on("error", (spawnError) => {
      clearTimeout(timer);
      reject(spawnError);
    });

    proc.on("close", (code) => {
      clearTimeout(timer);
      // After a timeout or spawn error the promise is already settled, so
      // these calls are no-ops.
      if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
      else resolve(out.trim());
    });
  });
}
747
+
748
/**
 * Return an identifier for the first tab reported by `cdp list`.
 *
 * @returns {Promise<string>} First 8 characters of the first output line.
 * @throws {Error} When the listing is empty.
 */
async function getAnyTab() {
  const listing = await cdp(["list"]);
  const [firstLine] = listing.split("\n");
  if (!firstLine) {
    throw new Error("No Chrome tabs found");
  }
  return firstLine.slice(0, 8);
}
754
+
755
/**
 * Return a blank tab, reusing an existing about:blank tab when there is one.
 *
 * @returns {Promise<string>} The 8-character prefix of a reused tab's listing
 *   line, or the full `targetId` of a newly created tab.
 */
async function _getOrReuseBlankTab() {
  const listing = await cdp(["list"]);
  const blankLine = listing
    .split("\n")
    .filter(Boolean)
    .find((line) => line.includes("about:blank"));
  if (blankLine) {
    // Prefix of the blank tab's targetId.
    return blankLine.slice(0, 8);
  }

  // No blank tab available: create one through any existing tab.
  const anchor = await getAnyTab();
  const createArgs = [
    "evalraw",
    anchor,
    "Target.createTarget",
    '{"url":"about:blank"}',
  ];
  const response = JSON.parse(await cdp(createArgs));
  return response.targetId;
}
775
+
776
/**
 * Open a new about:blank tab via `Target.createTarget`.
 *
 * @returns {Promise<string>} The `targetId` of the created tab.
 */
async function openNewTab() {
  const anchor = await getAnyTab();
  const createArgs = [
    "evalraw",
    anchor,
    "Target.createTarget",
    '{"url":"about:blank"}',
  ];
  const response = JSON.parse(await cdp(createArgs));
  return response.targetId;
}
787
+
788
/**
 * Return the cached tab for an engine, opening a new tab when none is cached.
 *
 * @param {string} engine - Engine key understood by getFullTabFromCache.
 * @returns {Promise<string>} Target id of the tab to use.
 */
async function _getOrOpenEngineTab(engine) {
  // Run `list` first, before consulting the page cache.
  await cdp(["list"]);
  const cachedTargetId = getFullTabFromCache(engine);
  if (cachedTargetId) return cachedTargetId;
  return openNewTab();
}
792
+
793
/**
 * Bring a tab to the foreground via `Target.activateTarget`.
 *
 * Best-effort: every failure is ignored.
 *
 * @param {string} targetId - Target id of the tab to activate.
 * @returns {Promise<void>}
 */
async function activateTab(targetId) {
  try {
    const anchor = await getAnyTab();
    const params = JSON.stringify({ targetId });
    await cdp(["evalraw", anchor, "Target.activateTarget", params]);
  } catch {
    // best-effort
  }
}
806
+
807
/**
 * Close several tabs one after another, then run `cdp list` again.
 *
 * Falsy ids are skipped. When the list is non-empty the function waits
 * 300 ms and runs `cdp list`, ignoring any failure of that call.
 *
 * @param {Array<string>} targetIds - Target ids to close.
 * @returns {Promise<void>}
 */
async function closeTabs(targetIds = []) {
  for (const id of targetIds) {
    if (id) {
      await closeTab(id);
    }
  }

  if (!(targetIds.length > 0)) return;

  await new Promise((resolve) => setTimeout(resolve, 300));
  await cdp(["list"]).catch(() => null);
}
817
+
818
/**
 * Close a tab via `Target.closeTarget`.
 *
 * Best-effort: every failure is ignored.
 *
 * @param {string} targetId - Target id of the tab to close.
 * @returns {Promise<void>}
 */
async function closeTab(targetId) {
  try {
    const anchor = await getAnyTab();
    const params = JSON.stringify({ targetId });
    await cdp(["evalraw", anchor, "Target.closeTarget", params]);
  } catch {
    /* best-effort */
  }
}
831
+
832
+ function runExtractor(
833
+ script,
834
+ query,
835
+ tabPrefix = null,
836
+ short = false,
837
+ timeoutMs = null, // null = auto-select based on engine
838
+ ) {
839
+ // Gemini is slower - use longer timeout
840
+ if (timeoutMs === null) {
841
+ timeoutMs = script.includes("gemini") ? 180000 : 90000;
842
+ }
843
+ const extraArgs = [
844
+ ...(tabPrefix ? ["--tab", tabPrefix] : []),
845
+ ...(short ? ["--short"] : []),
846
+ ];
843
847
  return new Promise((resolve, reject) => {
844
848
  const proc = spawn(
845
849
  "node",
846
- [join(__dir, "extractors", script), query, ...extraArgs],
850
+ [join(__dir, "..", "extractors", script), query, ...extraArgs],
847
851
  {
848
852
  stdio: ["ignore", "pipe", "pipe"],
849
853
  env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
850
- },
851
- );
852
- let out = "";
853
- let err = "";
854
- proc.stdout.on("data", (d) => (out += d));
855
- proc.stderr.on("data", (d) => (err += d));
856
- const t = setTimeout(() => {
857
- proc.kill();
858
- reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
859
- }, timeoutMs);
860
- proc.on("close", (code) => {
861
- clearTimeout(t);
862
- if (code !== 0) reject(new Error(err.trim() || `extractor exit ${code}`));
863
- else {
864
- try {
865
- resolve(JSON.parse(out.trim()));
866
- } catch {
867
- reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
868
- }
869
- }
870
- });
871
- });
872
- }
873
-
874
- async function fetchTopSource(url) {
875
- const tab = await openNewTab();
876
- await cdp(["list"]); // refresh cache so the new tab is findable
877
- try {
878
- await cdp(["nav", tab, url], 30000);
879
- await new Promise((r) => setTimeout(r, 1500));
880
- const content = await cdp([
881
- "eval",
882
- tab,
883
- `
884
- (function(){
885
- var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
886
- var text = (el || document.body).innerText;
887
- return text.replace(/\\s+/g, ' ').trim();
888
- })()
889
- `,
890
- ]);
891
- return { url, content };
892
- } catch (e) {
893
- return { url, content: null, error: e.message };
894
- } finally {
895
- await closeTab(tab);
896
- }
897
- }
898
-
899
- /**
900
- * Fetch source content via HTTP with Readability extraction.
901
- * Falls back to browser if HTTP fails or content quality is low.
902
- * @param {string} url - URL to fetch
903
- * @param {number} maxChars - Max characters to return
904
- * @returns {Promise<object>} Fetch result
905
- */
906
- async function fetchSourceContent(url, maxChars = 8000) {
907
- const start = Date.now();
908
-
909
- // Check if it's a GitHub URL (tree/root - use clone, blob - let fetcher handle via raw)
910
- if (parseGitHubUrl(url)) {
911
- const parsed = parseGitHubUrl(url);
912
- // Use cloning for tree/root URLs, or blob URLs that might need exploration
913
- if (
914
- parsed &&
915
- (parsed.type === "root" ||
916
- parsed.type === "tree" ||
917
- (parsed.type === "blob" && !parsed.path?.includes(".")))
918
- ) {
919
- const ghResult = await fetchGitHubContent(url);
920
- if (ghResult.ok) {
921
- const content = trimContentHeadTail(ghResult.content, maxChars);
922
- return {
923
- url,
924
- finalUrl: url,
925
- status: 200,
926
- contentType: "text/markdown",
927
- lastModified: "",
928
- title: ghResult.title,
929
- snippet: content.slice(0, 320),
930
- content,
931
- contentChars: content.length,
932
- source: "github-clone",
933
- localPath: ghResult.localPath,
934
- ...(ghResult.tree && { tree: ghResult.tree }),
935
- duration: Date.now() - start,
936
- };
937
- }
938
- // If GitHub clone failed, fall through to HTTP (which will use raw for blobs)
939
- process.stderr.write(
940
- `[greedysearch] GitHub clone failed, trying HTTP: ${ghResult.error}\n`,
941
- );
942
- }
943
- }
944
-
945
- // Try HTTP first
946
- const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });
947
-
948
- if (httpResult.ok) {
949
- const content = trimContentHeadTail(httpResult.markdown, maxChars);
950
- return {
951
- url,
952
- finalUrl: httpResult.finalUrl,
953
- status: httpResult.status,
954
- contentType: "text/markdown",
955
- lastModified: "",
956
- title: httpResult.title,
957
- snippet: httpResult.excerpt,
958
- content,
959
- contentChars: content.length,
960
- source: "http",
961
- duration: Date.now() - start,
962
- };
963
- }
854
+ },
855
+ );
856
+ let out = "";
857
+ let err = "";
858
+ proc.stdout.on("data", (d) => (out += d));
859
+ proc.stderr.on("data", (d) => (err += d));
860
+ const t = setTimeout(() => {
861
+ proc.kill();
862
+ reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
863
+ }, timeoutMs);
864
+ proc.on("close", (code) => {
865
+ clearTimeout(t);
866
+ if (code !== 0) reject(new Error(err.trim() || `extractor exit ${code}`));
867
+ else {
868
+ try {
869
+ resolve(JSON.parse(out.trim()));
870
+ } catch {
871
+ reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
872
+ }
873
+ }
874
+ });
875
+ });
876
+ }
877
+
878
/**
 * Load a URL in a fresh browser tab and return the page's main text.
 *
 * The tab is always closed afterwards. Failures after the tab is opened are
 * reported in the result rather than thrown.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<{url: string, content: string|null, error?: string}>}
 *   The extracted text, or `content: null` with an `error` message.
 */
async function fetchTopSource(url) {
  const tab = await openNewTab();
  try {
    // Refresh cache so the new tab is findable. This runs inside the try so
    // that a failing `list` still closes the tab instead of leaking it.
    await cdp(["list"]);
    await cdp(["nav", tab, url], 30000);
    // Pause 1.5 s after navigation before reading the page text.
    await new Promise((r) => setTimeout(r, 1500));
    const content = await cdp([
      "eval",
      tab,
      `
(function(){
var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
var text = (el || document.body).innerText;
return text.replace(/\\s+/g, ' ').trim();
})()
`,
    ]);
    return { url, content };
  } catch (e) {
    return { url, content: null, error: e.message };
  } finally {
    await closeTab(tab);
  }
}
902
+
903
/**
 * Fetch the content of a source URL.
 *
 * Strategy, in order:
 *  1. GitHub root/tree URLs, and blob URLs whose path contains no ".", are
 *     fetched with fetchGitHubContent.
 *  2. Otherwise, or when step 1 fails, the URL is fetched over HTTP.
 *  3. When the HTTP result is not ok, the browser-based fetch is used.
 *
 * @param {string} url - URL to fetch
 * @param {number} maxChars - Max characters of content to return
 * @returns {Promise<object>} Fetch result
 */
async function fetchSourceContent(url, maxChars = 8000) {
  const start = Date.now();

  // Parse once and reuse the result for every check below.
  const parsed = parseGitHubUrl(url);
  const shouldClone =
    parsed &&
    (parsed.type === "root" ||
      parsed.type === "tree" ||
      // Blob paths without a "." may need exploration rather than a raw fetch.
      (parsed.type === "blob" && !parsed.path?.includes(".")));

  if (shouldClone) {
    const ghResult = await fetchGitHubContent(url);
    if (ghResult.ok) {
      const content = trimContentHeadTail(ghResult.content, maxChars);
      return {
        url,
        finalUrl: url,
        status: 200,
        contentType: "text/markdown",
        lastModified: "",
        title: ghResult.title,
        snippet: content.slice(0, 320),
        content,
        contentChars: content.length,
        source: "github-clone",
        localPath: ghResult.localPath,
        ...(ghResult.tree && { tree: ghResult.tree }),
        duration: Date.now() - start,
      };
    }
    // If GitHub clone failed, fall through to HTTP (which will use raw for blobs)
    process.stderr.write(
      `[greedysearch] GitHub clone failed, trying HTTP: ${ghResult.error}\n`,
    );
  }

  // Try HTTP first
  const httpResult = await fetchSourceHttp(url, { timeoutMs: 15000 });

  if (httpResult.ok) {
    const content = trimContentHeadTail(httpResult.markdown, maxChars);
    return {
      url,
      finalUrl: httpResult.finalUrl,
      status: httpResult.status,
      contentType: "text/markdown",
      lastModified: "",
      title: httpResult.title,
      snippet: httpResult.excerpt,
      content,
      contentChars: content.length,
      source: "http",
      duration: Date.now() - start,
    };
  }

  // HTTP failed or blocked - fall back to browser
  process.stderr.write(
    `[greedysearch] HTTP failed for ${url.slice(0, 60)}, trying browser...\n`,
  );
  return await fetchSourceContentBrowser(url, maxChars);
}
975
+
976
/**
 * Browser-based source fetch (CDP), used when the HTTP fetch is not ok.
 *
 * Opens a new tab, navigates to `url`, pauses briefly, then extracts the
 * page title, main text and final location. The tab is always closed.
 * Failures after the tab is opened are returned as a result object with
 * `error` set rather than thrown.
 *
 * @param {string} url - Page to load.
 * @param {number} maxChars - Max characters of content to return.
 * @returns {Promise<object>} Fetch result with `source: "browser"`.
 */
async function fetchSourceContentBrowser(url, maxChars = 8000) {
  const startedAt = Date.now();
  const tab = await openNewTab();

  const extractionScript = `
(function(){
var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
var text = (el || document.body).innerText;
return JSON.stringify({
title: document.title,
content: text.replace(/\\s+/g, ' ').trim(),
url: location.href
});
})()
`;

  try {
    await cdp(["nav", tab, url], 30000);
    // Pause 1.5 s after navigation before extracting.
    await new Promise((resolve) => setTimeout(resolve, 1500));

    const rawPage = await cdp(["eval", tab, extractionScript]);
    const page = JSON.parse(rawPage);
    const finalContent = trimContentHeadTail(page.content, maxChars);

    return {
      url,
      finalUrl: page.url || url,
      status: 200,
      contentType: "text/plain",
      lastModified: "",
      title: page.title,
      snippet: trimText(finalContent, 320),
      content: finalContent,
      contentChars: finalContent.length,
      source: "browser",
      duration: Date.now() - startedAt,
    };
  } catch (error) {
    return {
      url,
      title: "",
      content: null,
      snippet: "",
      contentChars: 0,
      error: error.message,
      source: "browser",
      duration: Date.now() - startedAt,
    };
  } finally {
    await closeTab(tab);
  }
}
1034
+
1035
/**
 * Fetch page content for the first `maxSources` entries of `sources` using a
 * bounded pool of concurrent workers.
 *
 * Results are returned in the same order as the input slice. A failure while
 * fetching one source (missing URL, exception from `fetchSourceContent`) is
 * recorded as an error entry for that source instead of aborting the batch.
 *
 * @param {Array<{id?: string, url?: string, canonicalUrl?: string}>} sources
 *   Candidate sources; `canonicalUrl` is preferred over `url`.
 * @param {number} [maxSources=5] Maximum number of sources to fetch.
 * @param {number} [maxChars=8000] Passed through to `fetchSourceContent`.
 * @param {number|string} [concurrency=SOURCE_FETCH_CONCURRENCY]
 *   Requested number of parallel workers; clamped to [1, sources to fetch].
 * @returns {Promise<Array<object>>} One `{ id, ...fetchResult }` per source.
 */
async function fetchMultipleSources(
  sources,
  maxSources = 5,
  maxChars = 8000,
  concurrency = SOURCE_FETCH_CONCURRENCY,
) {
  const toFetch = Array.isArray(sources) ? sources.slice(0, maxSources) : [];
  if (toFetch.length === 0) return [];

  // Fall back to the module default when the caller passes garbage, and make
  // sure a bad default can never produce zero (or NaN) workers.
  const requested =
    Number.parseInt(String(concurrency), 10) || SOURCE_FETCH_CONCURRENCY;
  const limit =
    Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 1;
  const workerCount = Math.min(toFetch.length, limit);

  process.stderr.write(
    `[greedysearch] Fetching content from ${toFetch.length} sources via HTTP (concurrency ${workerCount})...\n`,
  );

  const fetched = new Array(toFetch.length);
  let nextIndex = 0;
  let completed = 0;

  // Each worker repeatedly claims the next unclaimed index until none remain.
  // Claiming is synchronous, so two workers can never take the same index.
  async function worker() {
    while (true) {
      const index = nextIndex++;
      if (index >= toFetch.length) return;

      const s = toFetch[index];
      const url = s?.canonicalUrl || s?.url;
      const startedAt = Date.now();
      let result;

      try {
        if (!url) throw new Error("source has no URL");
        process.stderr.write(
          `[greedysearch] [${index + 1}/${toFetch.length}] Fetching: ${url.slice(0, 60)}...\n`,
        );
        result = await fetchSourceContent(url, maxChars);
        if (!result || typeof result !== "object") {
          throw new Error("fetch returned no result");
        }
      } catch (error) {
        // Keep the batch alive: record the failure in the slot for this source.
        result = {
          url: typeof url === "string" ? url : "",
          title: "",
          content: null,
          snippet: "",
          contentChars: 0,
          error: error?.message || String(error),
          source: "none",
          duration: Date.now() - startedAt,
        };
      }

      fetched[index] = {
        id: s?.id,
        ...result,
      };

      if (result.content && result.content.length > 100) {
        process.stderr.write(
          `[greedysearch] ✓ ${result.source}: ${result.content.length} chars\n`,
        );
      } else if (result.error) {
        process.stderr.write(
          `[greedysearch] ✗ ${String(result.error).slice(0, 80)}\n`,
        );
      }

      completed += 1;
      process.stderr.write(`PROGRESS:fetch:${completed}/${toFetch.length}\n`);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  // Log summary
  const successful = fetched.filter((f) => f.content && f.content.length > 100);
  const httpCount = fetched.filter((f) => f.source === "http").length;
  const browserCount = fetched.filter((f) => f.source === "browser").length;

  process.stderr.write(
    `[greedysearch] Fetched ${successful.length}/${fetched.length} sources ` +
      `(HTTP: ${httpCount}, Browser: ${browserCount})\n`,
  );

  return fetched;
}
1101
+
1102
/**
 * Choose the single best source from a search result bundle.
 *
 * The merged `_sources` registry wins when it is a non-empty array; otherwise
 * the first source of the first engine (perplexity, then google, then bing)
 * that reported any sources is used.
 *
 * @param {object} out Result bundle keyed by engine name.
 * @returns {object|null} The chosen source, or null when none exist.
 */
function pickTopSource(out) {
  const registry = out._sources;
  if (Array.isArray(registry) && registry.length > 0) {
    return registry[0];
  }

  const engineWithSources = ["perplexity", "google", "bing"]
    .map((name) => out[name])
    .find((engineResult) => engineResult?.sources?.length > 0);

  return engineWithSources ? engineWithSources.sources[0] : null;
}
1111
+
1112
/**
 * Ask the Gemini extractor (run as a child `node` process) to synthesize the
 * per-engine results into a single answer.
 *
 * @param {string} query The user's search query.
 * @param {object} results Result bundle; `results._sources` is used as the
 *   source registry when it is an array, otherwise one is built.
 * @param {object} [options]
 * @param {boolean} [options.grounded=false] Forwarded to the prompt builder.
 * @param {string|number|null} [options.tabPrefix=null] When set, passed to the
 *   extractor as `--tab <value>`.
 * @returns {Promise<object>} The normalized synthesis payload plus
 *   `rawAnswer` and `geminiSources`.
 * @throws {Error} When the extractor cannot be started, exits non-zero, times
 *   out after 180s, or prints output that is not valid JSON.
 */
async function synthesizeWithGemini(
  query,
  results,
  { grounded = false, tabPrefix = null } = {},
) {
  const sources = Array.isArray(results._sources)
    ? results._sources
    : buildSourceRegistry(results);
  const prompt = buildSynthesisPrompt(query, results, sources, { grounded });

  return new Promise((resolve, reject) => {
    const extraArgs = tabPrefix ? ["--tab", String(tabPrefix)] : [];
    const proc = spawn(
      "node",
      [join(__dir, "..", "extractors", "gemini.mjs"), prompt, ...extraArgs],
      {
        stdio: ["ignore", "pipe", "pipe"],
        env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
      },
    );
    let out = "";
    let err = "";
    let settled = false;

    // Settle exactly once and always cancel the watchdog timer.
    const finish = (fn, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(t);
      fn(value);
    };

    proc.stdout.on("data", (d) => (out += d));
    proc.stderr.on("data", (d) => (err += d));

    const t = setTimeout(() => {
      proc.kill();
      finish(reject, new Error("Gemini synthesis timed out after 180s"));
    }, 180000);

    // Without this listener a spawn failure (e.g. `node` not on PATH) is an
    // unhandled 'error' event and takes the whole process down.
    proc.on("error", (e) => {
      finish(
        reject,
        new Error(`failed to start gemini extractor: ${e.message}`),
      );
    });

    proc.on("close", (code) => {
      if (code !== 0) {
        finish(reject, new Error(err.trim() || "gemini extractor failed"));
        return;
      }
      try {
        const raw = JSON.parse(out.trim());
        const structured = parseStructuredJson(raw.answer || "");
        finish(resolve, {
          ...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
          rawAnswer: raw.answer || "",
          geminiSources: raw.sources || [],
        });
      } catch {
        finish(reject, new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
      }
    });
  });
}
1160
+
1161
/**
 * Turn a free-text query into a filename-safe slug: lowercase ASCII letters
 * and digits separated by single hyphens, at most 60 characters, with no
 * leading or trailing hyphen.
 *
 * @param {string} query Text to slugify; null/undefined is treated as "".
 * @returns {string} The slug (may be empty).
 */
function slugify(query) {
  return (
    String(query ?? "")
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, "-")
      .replace(/^-/, "")
      .slice(0, 60)
      // Trim after truncating: cutting at 60 chars can land right on a hyphen.
      .replace(/-$/, "")
  );
}
1168
+
1144
1169
/**
 * Resolve the `results` directory that sits next to this script's parent
 * directory, creating it (and any missing ancestors) on demand.
 *
 * @returns {string} Path of the results directory.
 */
function resultsDir() {
  const target = join(__dir, "..", "results");
  // recursive: true also makes this a no-op when the directory already exists.
  mkdirSync(target, { recursive: true });
  return target;
}
1149
-
1150
- function writeOutput(
1151
- data,
1152
- outFile,
1153
- { inline = false, synthesize = false, query = "" } = {},
1154
- ) {
1155
- const json = `${JSON.stringify(data, null, 2)}\n`;
1156
-
1157
- if (outFile) {
1158
- writeFileSync(outFile, json, "utf8");
1159
- process.stderr.write(`Results written to ${outFile}\n`);
1160
- return;
1161
- }
1162
-
1163
- if (inline) {
1164
- process.stdout.write(json);
1165
- return;
1166
- }
1167
-
1168
- const ts = new Date()
1169
- .toISOString()
1170
- .replace("T", "_")
1171
- .replace(/[:.]/g, "-")
1172
- .slice(0, 19);
1173
- const slug = slugify(query);
1174
- const base = join(resultsDir(), `${ts}_${slug}`);
1175
-
1176
- writeFileSync(`${base}.json`, json, "utf8");
1177
-
1178
- if (synthesize && data._synthesis?.answer) {
1179
- writeFileSync(`${base}-synthesis.md`, data._synthesis.answer, "utf8");
1180
- process.stdout.write(`${base}-synthesis.md\n`);
1181
- } else {
1182
- process.stdout.write(`${base}.json\n`);
1183
- }
1184
- }
1185
-
1186
- const GREEDY_PROFILE_DIR = `${tmpdir().replace(/\\/g, "/")}/greedysearch-chrome-profile`;
1187
- const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
1188
-
1189
- // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort,
1190
- // so searches never accidentally attach to the user's main Chrome session.
1191
- process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
1192
-
1193
- function probeGreedyChrome(timeoutMs = 3000) {
1194
- return new Promise((resolve) => {
1195
- const req = http.get(
1196
- `http://localhost:${GREEDY_PORT}/json/version`,
1197
- (res) => {
1198
- res.resume();
1199
- resolve(res.statusCode === 200);
1200
- },
1201
- );
1202
- req.on("error", () => resolve(false));
1203
- req.setTimeout(timeoutMs, () => {
1204
- req.destroy();
1205
- resolve(false);
1206
- });
1207
- });
1208
- }
1209
-
1210
- // Write (or refresh) the DevToolsActivePort file for the GreedySearch Chrome so
1211
- // cdp.mjs always connects to the right port rather than the user's main Chrome.
1212
- // Uses atomic write (write to temp + rename) to prevent corruption from parallel processes.
1174
+
1175
/**
 * Emit a search result as pretty-printed JSON.
 *
 * Destination, in priority order:
 *  1. `outFile` given: write there and note the path on stderr.
 *  2. `inline`: print the JSON itself to stdout.
 *  3. otherwise: write `<timestamp>_<slug>.json` into the results directory
 *     and print the path of the written artifact to stdout. When `synthesize`
 *     is set and a synthesis answer exists, the answer is additionally written
 *     to `<timestamp>_<slug>-synthesis.md` and that path is printed instead.
 *
 * @param {object} data Result payload to serialize.
 * @param {string|null|undefined} outFile Explicit output path, if any.
 * @param {object} [options]
 * @param {boolean} [options.inline=false]
 * @param {boolean} [options.synthesize=false]
 * @param {string} [options.query=""] Used to derive the file name slug.
 */
function writeOutput(
  data,
  outFile,
  { inline = false, synthesize = false, query = "" } = {},
) {
  const serialized = `${JSON.stringify(data, null, 2)}\n`;

  if (outFile) {
    writeFileSync(outFile, serialized, "utf8");
    process.stderr.write(`Results written to ${outFile}\n`);
    return;
  }

  if (inline) {
    process.stdout.write(serialized);
    return;
  }

  // e.g. 2024-01-02T03:04:05.678Z -> 2024-01-02_03-04-05
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace("T", "_").replace(/[:.]/g, "-").slice(0, 19);
  const querySlug = slugify(query);
  const basePath = join(resultsDir(), `${stamp}_${querySlug}`);
  const jsonPath = `${basePath}.json`;

  writeFileSync(jsonPath, serialized, "utf8");

  const synthesisAnswer = synthesize ? data._synthesis?.answer : undefined;
  if (!synthesisAnswer) {
    process.stdout.write(`${jsonPath}\n`);
    return;
  }

  const markdownPath = `${basePath}-synthesis.md`;
  writeFileSync(markdownPath, synthesisAnswer, "utf8");
  process.stdout.write(`${markdownPath}\n`);
}
1210
+
1211
// Dedicated Chrome profile directory under the OS temp dir, normalized to
// forward slashes so the same string works on Windows and POSIX.
const GREEDY_PROFILE_DIR = [
  tmpdir().split("\\").join("/"),
  "greedysearch-chrome-profile",
].join("/");

// File inside that profile directory holding the DevTools port information.
const ACTIVE_PORT_FILE = [GREEDY_PROFILE_DIR, "DevToolsActivePort"].join("/");

// Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort,
// so searches never accidentally attach to the user's main Chrome session.
process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
1217
+
1218
/**
 * Check whether a DevTools endpoint answers on `GREEDY_PORT`.
 *
 * Never rejects: any connection error or a timeout resolves to `false`.
 *
 * @param {number} [timeoutMs=3000] Socket timeout for the probe request.
 * @returns {Promise<boolean>} true only when `/json/version` replies HTTP 200.
 */
function probeGreedyChrome(timeoutMs = 3000) {
  return new Promise((done) => {
    const endpoint = `http://localhost:${GREEDY_PORT}/json/version`;

    const onResponse = (response) => {
      // Drain the body so the socket is released; only the status matters.
      response.resume();
      done(response.statusCode === 200);
    };
    const onTimeout = () => {
      request.destroy();
      done(false);
    };

    const request = http.get(endpoint, onResponse);
    request.on("error", () => done(false));
    request.setTimeout(timeoutMs, onTimeout);
  });
}
1234
+
1235
+ // Write (or refresh) the DevToolsActivePort file for the GreedySearch Chrome so
1236
+ // cdp.mjs always connects to the right port rather than the user's main Chrome.
1237
+ // Uses atomic write (write to temp + rename) to prevent corruption from parallel processes.
1213
1238
/**
 * Rewrite the DevToolsActivePort file from the live `/json/version` endpoint
 * on `GREEDY_PORT`.
 *
 * Concurrent callers are serialized with a lock file created exclusively
 * (`wx`). A lock whose recorded timestamp is older than LOCK_STALE_MS is
 * removed so a crashed holder cannot block others. If the lock cannot be
 * obtained within LOCK_WAIT_MS the refresh is skipped. The whole operation is
 * best-effort: failures are swallowed and the existing port file is left
 * untouched.
 *
 * @returns {Promise<void>}
 */
async function refreshPortFile() {
  const LOCK_FILE = `${ACTIVE_PORT_FILE}.lock`;
  const TEMP_FILE = `${ACTIVE_PORT_FILE}.tmp`;
  const LOCK_STALE_MS = 5000;
  const LOCK_WAIT_MS = 1000;
  const LOCK_RETRY_MS = 50;

  // Remove the lock file when its recorded timestamp says the holder is gone.
  // Accepts both the JSON payload written below and a bare-number payload.
  const removeLockIfStale = () => {
    try {
      const lockRaw = readFileSync(LOCK_FILE, "utf8").trim();
      const parsed = lockRaw.startsWith("{")
        ? JSON.parse(lockRaw)
        : { ts: Number(lockRaw) };
      const lockTime = Number(parsed?.ts) || 0;

      if (lockTime > 0 && Date.now() - lockTime > LOCK_STALE_MS) {
        unlinkSync(LOCK_FILE);
      }
    } catch {
      // Lock vanished or is unreadable/half-written; the next attempt decides.
    }
  };

  // File-based lock with exclusive create + stale lock recovery
  const lockAcquired = await new Promise((resolve) => {
    const start = Date.now();

    const retryOrGiveUp = () => {
      if (Date.now() - start < LOCK_WAIT_MS) {
        setTimeout(tryLock, LOCK_RETRY_MS);
      } else {
        resolve(false);
      }
    };

    const tryLock = () => {
      try {
        const payload = JSON.stringify({ pid: process.pid, ts: Date.now() });
        writeFileSync(LOCK_FILE, payload, { encoding: "utf8", flag: "wx" });
        resolve(true);
        return;
      } catch (e) {
        // EEXIST: someone else holds the lock. Anything else: just retry.
        if (e?.code === "EEXIST") removeLockIfStale();
      }
      retryOrGiveUp();
    };

    tryLock();
  });

  // Without the lock nothing may be written, so skip the HTTP round trip too.
  if (!lockAcquired) return;

  try {
    const body = await new Promise((res, rej) => {
      const req = http.get(
        `http://localhost:${GREEDY_PORT}/json/version`,
        (r) => {
          let b = "";
          r.on("data", (d) => (b += d));
          r.on("end", () => res(b));
          r.on("error", rej);
        },
      );
      req.on("error", rej);
      req.setTimeout(3000, () => {
        req.destroy();
        rej(new Error("timeout"));
      });
    });
    const { webSocketDebuggerUrl } = JSON.parse(body);
    const wsPath = new URL(webSocketDebuggerUrl).pathname;

    // Atomic write: write to temp file, then rename over the target. The
    // target is NOT unlinked first, so readers never observe a missing file.
    writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, "utf8");
    try {
      renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
    } catch {
      // Fallback for environments where rename cannot replace the target.
      try {
        unlinkSync(ACTIVE_PORT_FILE);
      } catch {
        // Target already absent.
      }
      renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
    }
  } catch {
    /* best-effort — launch.mjs already wrote the file on first start */
    try {
      unlinkSync(TEMP_FILE);
    } catch {
      // No temp file was left behind.
    }
  } finally {
    try {
      unlinkSync(LOCK_FILE);
    } catch {
      // Lock already removed (e.g. treated as stale by another process).
    }
  }
}
1329
+
1330
/**
 * Make sure the dedicated GreedySearch Chrome is reachable.
 *
 * When the probe succeeds the port file is refreshed so cdp.mjs picks up the
 * right port even if the file is stale from a previous session. Otherwise
 * `launch.mjs` is run as a child process and awaited.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the launcher cannot be started or exits non-zero.
 */
async function ensureChrome() {
  const ready = await probeGreedyChrome();
  if (ready) {
    // Chrome already running — refresh the port file so cdp.mjs always picks
    // up the right port, even if the file was stale from a previous session.
    await refreshPortFile();
    return;
  }

  process.stderr.write(
    `GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`,
  );
  await new Promise((resolve, reject) => {
    const proc = spawn("node", [join(__dir, "launch.mjs")], {
      stdio: ["ignore", process.stderr, process.stderr],
    });
    // A spawn failure emits 'error' (and possibly never 'close'); without a
    // listener it would be an unhandled event and crash the process.
    proc.on("error", (e) =>
      reject(new Error(`launch.mjs failed to start: ${e.message}`)),
    );
    proc.on("close", (code) =>
      code === 0 ? resolve() : reject(new Error("launch.mjs failed")),
    );
  });
}
1350
+
1351
/**
 * CLI entry point: parse arguments, make sure Chrome is available, run one
 * engine (or all engines in parallel), and write the result.
 *
 * Depth selection:
 *  - `--depth <value>` wins when given with a value;
 *  - else `--fast` selects "fast";
 *  - a single engine with neither flag defaults to "fast";
 *  - `--deep-research` always maps to "standard";
 *  - otherwise "standard" (source fetching + synthesis).
 *
 * Exits the process with status 1 on usage errors, unknown engines, or a
 * failed single-engine extraction.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const usage = `${[
    'Usage: node search.mjs <engine> "<query>"',
    "",
    "Engines: perplexity (p), bing (b), google (g), gemini (gem), all",
    "",
    "Flags:",
    " --fast Quick mode: no source fetching or synthesis",
    " --synthesize Deprecated: synthesis is now default for multi-engine",
    " --deep-research Deprecated: source fetching is now default",
    " --fetch-top-source Fetch content from top source",
    " --inline Output JSON to stdout (for piping)",
    "",
    "Examples:",
    ' node search.mjs all "Node.js streams" # Default: sources + synthesis',
    ' node search.mjs all "quick check" --fast # Fast: no sources/synthesis',
    ' node search.mjs p "what is memoization" # Single engine: fast mode',
  ].join("\n")}\n`;

  const args = process.argv.slice(2);
  if (args.length < 2 || args[0] === "--help") {
    process.stderr.write(usage);
    process.exit(1);
  }

  const depthIdx = args.indexOf("--depth");
  const outIdx = args.indexOf("--out");
  const full = args.includes("--full");
  const short = !full;
  const fetchSource = args.includes("--fetch-top-source");
  const inline = args.includes("--inline");
  const outFile = outIdx !== -1 ? args[outIdx + 1] : null;

  // Positional arguments: everything that is neither a known flag nor the
  // value belonging to --depth / --out.
  const rest = args.filter(
    (a, i) =>
      a !== "--full" &&
      a !== "--short" &&
      a !== "--fast" &&
      a !== "--fetch-top-source" &&
      a !== "--synthesize" &&
      a !== "--deep-research" &&
      a !== "--inline" &&
      a !== "--depth" &&
      a !== "--out" &&
      (depthIdx === -1 || i !== depthIdx + 1) &&
      (outIdx === -1 || i !== outIdx + 1),
  );

  // Only flags were given, or the query is missing.
  if (rest.length < 2) {
    process.stderr.write(usage);
    process.exit(1);
  }

  const engine = rest[0].toLowerCase();
  const query = rest.slice(1).join(" ");

  // Fail fast on a typo before spending time launching Chrome.
  const script = ENGINES[engine];
  if (engine !== "all" && !script) {
    process.stderr.write(
      `Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(", ")}\n`,
    );
    process.exit(1);
  }

  // Depth modes: fast (no synthesis/fetch), standard (synthesis+fetch 5 sources)
  let depth = "standard"; // DEFAULT: all "all" searches now include synthesis + source fetch
  if (depthIdx !== -1 && args[depthIdx + 1]) {
    depth = args[depthIdx + 1];
  } else if (args.includes("--fast")) {
    depth = "fast"; // Explicit fast mode requested
  }

  // Single engine: default to fast for speed (no synthesis overhead). Uses the
  // parsed positional engine so a flag value such as the --out path is never
  // mistaken for the engine name.
  if (engine !== "all" && depthIdx === -1 && !args.includes("--fast")) {
    depth = "fast";
  }

  // --deep-research flag maps to standard (backward compat)
  if (args.includes("--deep-research")) {
    depth = "standard";
  }

  await ensureChrome();

  if (engine === "all") {
    await cdp(["list"]); // refresh pages cache

    // PARALLEL-SAFE: Always create fresh tabs for each engine to avoid race conditions
    // when multiple "all" searches run concurrently. Previously, reusing cached tabs
    // caused ERR_ABORTED and Uncaught errors as multiple processes fought over the same tab.
    const engineTabs = [];

    try {
      // Opened inside the try so a failure part-way through still closes the
      // tabs that were already created.
      for (let i = 0; i < ALL_ENGINES.length; i++) {
        if (i > 0) await new Promise((r) => setTimeout(r, 300)); // small delay between tab opens
        const tab = await openNewTab();
        engineTabs.push(tab);
      }

      // All tabs assigned — run extractors in parallel
      const results = await Promise.allSettled(
        ALL_ENGINES.map((e, i) =>
          runExtractor(ENGINES[e], query, engineTabs[i], short)
            .then((r) => {
              process.stderr.write(`PROGRESS:${e}:done\n`);
              return { engine: e, ...r };
            })
            .catch((err) => {
              process.stderr.write(`PROGRESS:${e}:error\n`);
              throw err;
            }),
        ),
      );

      const out = {};
      for (let i = 0; i < results.length; i++) {
        const r = results[i];
        if (r.status === "fulfilled") {
          out[r.value.engine] = r.value;
        } else {
          out[ALL_ENGINES[i]] = { error: r.reason?.message || "unknown error" };
        }
      }

      // Release the engine tabs before the slower fetch/synthesis phase, and
      // empty the list so the finally block does not close them a second time.
      await closeTabs(engineTabs);
      engineTabs.length = 0;

      // Build a canonical source registry across all engines
      out._sources = buildSourceRegistry(out, query);

      // Source fetching: default for all "all" searches (was deep-research only)
      if (depth !== "fast" && out._sources.length > 0) {
        process.stderr.write("PROGRESS:source-fetch:start\n");
        const fetchedSources = await fetchMultipleSources(
          out._sources,
          5,
          8000,
        );

        out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
        out._fetchedSources = fetchedSources;
        process.stderr.write("PROGRESS:source-fetch:done\n");
      }

      // Synthesize with Gemini for all non-fast modes (now default)
      if (depth !== "fast") {
        process.stderr.write("PROGRESS:synthesis:start\n");
        process.stderr.write(
          "[greedysearch] Synthesizing results with Gemini...\n",
        );
        try {
          const geminiTab = await getOrOpenEngineTab("gemini");
          await activateTab(geminiTab);
          const synthesis = await synthesizeWithGemini(query, out, {
            grounded: depth === "deep",
            tabPrefix: geminiTab,
          });
          await activateTab(geminiTab);
          out._synthesis = {
            ...synthesis,
            synthesized: true,
          };
          process.stderr.write("PROGRESS:synthesis:done\n");
        } catch (e) {
          // Synthesis is optional: record the failure and still emit results.
          process.stderr.write(
            `[greedysearch] Synthesis failed: ${e.message}\n`,
          );
          out._synthesis = { error: e.message, synthesized: false };
        }
      }

      if (fetchSource) {
        const top = pickTopSource(out);
        if (top)
          out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
      }

      // Always include confidence metrics for non-fast searches
      if (depth !== "fast") out._confidence = buildConfidence(out);

      writeOutput(out, outFile, {
        inline,
        synthesize: depth !== "fast",
        query,
      });
      return;
    } finally {
      if (engineTabs.length > 0) await closeTabs(engineTabs);
    }
  }

  try {
    const result = await runExtractor(script, query, null, short);
    if (fetchSource && result.sources?.length > 0) {
      result.topSource = await fetchTopSource(result.sources[0].url);
    }
    writeOutput(result, outFile, { inline, synthesize: false, query });
  } catch (e) {
    process.stderr.write(`Error: ${e.message}\n`);
    process.exit(1);
  }
}
1549
+
1550
// Run the CLI. Any error that escapes main() is reported in the same
// "Error: ..." form main() itself uses, with a non-zero exit status, instead
// of surfacing as an unhandled promise rejection.
main().catch((e) => {
  process.stderr.write(`Error: ${e?.message || e}\n`);
  process.exit(1);
});