@apmantza/greedysearch-pi 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/search.mjs CHANGED
@@ -1,996 +1,997 @@
1
- #!/usr/bin/env node
2
- // search.mjs — unified CLI for GreedySearch extractors
3
- //
4
- // Usage:
5
- // node search.mjs <engine> "<query>"
6
- // node search.mjs all "<query>"
7
- //
8
- // Engines:
9
- // perplexity | pplx | p
10
- // bing | copilot | b
11
- // google | g
12
- // gemini | gem
13
- // all — fan-out to all engines in parallel
14
- //
15
- // Output: JSON to stdout, errors to stderr
16
- //
17
- // Examples:
18
- // node search.mjs p "what is memoization"
19
- // node search.mjs gem "latest React features"
20
- // node search.mjs all "how does TCP congestion control work"
21
-
22
- import { spawn } from 'child_process';
23
- import { fileURLToPath } from 'url';
24
- import { join, dirname } from 'path';
25
- import { readFileSync, existsSync, writeFileSync, mkdirSync, renameSync, unlinkSync } from 'fs';
26
- import { tmpdir, homedir } from 'os';
27
- import http from 'http';
28
-
29
- const __dir = dirname(fileURLToPath(import.meta.url));
30
- const CDP = join(__dir, 'cdp.mjs');
31
- const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`;
32
-
33
- const GREEDY_PORT = 9222;
34
-
35
- const ENGINES = {
36
- perplexity: 'perplexity.mjs',
37
- pplx: 'perplexity.mjs',
38
- p: 'perplexity.mjs',
39
- bing: 'bing-copilot.mjs',
40
- copilot: 'bing-copilot.mjs',
41
- b: 'bing-copilot.mjs',
42
- google: 'google-ai.mjs',
43
- g: 'google-ai.mjs',
44
- gemini: 'gemini.mjs',
45
- gem: 'gemini.mjs',
46
- };
47
-
48
- const ALL_ENGINES = ['perplexity', 'bing', 'google'];
49
-
50
- const ENGINE_DOMAINS = {
51
- perplexity: 'perplexity.ai',
52
- bing: 'copilot.microsoft.com',
53
- google: 'google.com',
54
- gemini: 'gemini.google.com',
55
- };
56
-
57
- const TRACKING_PARAMS = [
58
- 'fbclid',
59
- 'gclid',
60
- 'ref',
61
- 'ref_src',
62
- 'ref_url',
63
- 'source',
64
- 'utm_campaign',
65
- 'utm_content',
66
- 'utm_medium',
67
- 'utm_source',
68
- 'utm_term',
69
- ];
70
-
71
- const COMMUNITY_HOSTS = [
72
- 'dev.to',
73
- 'hashnode.com',
74
- 'medium.com',
75
- 'reddit.com',
76
- 'stackoverflow.com',
77
- 'stackexchange.com',
78
- 'substack.com',
79
- ];
80
-
81
- const NEWS_HOSTS = [
82
- 'arstechnica.com',
83
- 'techcrunch.com',
84
- 'theverge.com',
85
- 'venturebeat.com',
86
- 'wired.com',
87
- 'zdnet.com',
88
- ];
89
-
90
- function trimText(text = '', maxChars = 240) {
91
- const clean = String(text).replace(/\s+/g, ' ').trim();
92
- if (clean.length <= maxChars) return clean;
93
- return clean.slice(0, maxChars).replace(/\s+\S*$/, '') + '...';
94
- }
95
-
96
- function normalizeSourceTitle(title = '') {
97
- const clean = trimText(title, 180);
98
- if (!clean) return '';
99
- if (/^https?:\/\//i.test(clean)) return '';
100
-
101
- const wordCount = clean.split(/\s+/).filter(Boolean).length;
102
- const hasUppercase = /[A-Z]/.test(clean);
103
- const hasDigit = /\d/.test(clean);
104
- const looksLikeFragment = clean === clean.toLowerCase() && wordCount <= 4 && !hasUppercase && !hasDigit;
105
- return looksLikeFragment ? '' : clean;
106
- }
107
-
108
- function pickPreferredTitle(currentTitle = '', nextTitle = '') {
109
- const current = normalizeSourceTitle(currentTitle);
110
- const next = normalizeSourceTitle(nextTitle);
111
- if (!next) return current;
112
- if (!current) return next;
113
- const currentLooksLikeUrl = /^https?:\/\//i.test(current);
114
- const nextLooksLikeUrl = /^https?:\/\//i.test(next);
115
- if (currentLooksLikeUrl && !nextLooksLikeUrl) return next;
116
- if (!currentLooksLikeUrl && nextLooksLikeUrl) return current;
117
- return next.length > current.length ? next : current;
118
- }
119
-
120
- function normalizeUrl(rawUrl) {
121
- if (!rawUrl) return null;
122
- try {
123
- const url = new URL(rawUrl);
124
- if (!['http:', 'https:'].includes(url.protocol)) return null;
125
- url.hash = '';
126
- url.hostname = url.hostname.toLowerCase();
127
- if ((url.protocol === 'https:' && url.port === '443') || (url.protocol === 'http:' && url.port === '80')) {
128
- url.port = '';
129
- }
130
- for (const key of [...url.searchParams.keys()]) {
131
- const lower = key.toLowerCase();
132
- if (TRACKING_PARAMS.includes(lower) || lower.startsWith('utm_')) {
133
- url.searchParams.delete(key);
134
- }
135
- }
136
- url.searchParams.sort();
137
- const normalizedPath = url.pathname.replace(/\/+$/, '') || '/';
138
- url.pathname = normalizedPath;
139
- const normalized = url.toString();
140
- return normalizedPath === '/' ? normalized.replace(/\/$/, '') : normalized;
141
- } catch {
142
- return null;
143
- }
144
- }
145
-
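For illustration (not part of the package), the expected effect of normalizeUrl() above, based on its implementation: lowercase the host, drop the fragment and known tracking parameters, sort the remaining query keys, and trim trailing slashes.

    normalizeUrl('https://Example.com/Docs/?utm_source=x&b=2&a=1#frag')
    // → 'https://example.com/Docs?a=1&b=2'
    normalizeUrl('http://example.com/?fbclid=abc')
    // → 'http://example.com'
    normalizeUrl('ftp://example.com/file')
    // → null (only http/https URLs are kept)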
146
- function getDomain(rawUrl) {
147
- try {
148
- const domain = new URL(rawUrl).hostname.toLowerCase();
149
- return domain.replace(/^www\./, '');
150
- } catch {
151
- return '';
152
- }
153
- }
154
-
155
- function matchesDomain(domain, hosts) {
156
- return hosts.some(host => domain === host || domain.endsWith(`.${host}`));
157
- }
158
-
159
- function classifySourceType(domain, title = '', rawUrl = '') {
160
- const lowerTitle = title.toLowerCase();
161
- const lowerUrl = rawUrl.toLowerCase();
162
-
163
- if (domain === 'github.com' || domain === 'gitlab.com') return 'repo';
164
- if (matchesDomain(domain, COMMUNITY_HOSTS)) return 'community';
165
- if (matchesDomain(domain, NEWS_HOSTS)) return 'news';
166
- if (
167
- domain.startsWith('docs.') ||
168
- domain.startsWith('developer.') ||
169
- domain.startsWith('developers.') ||
170
- domain.startsWith('api.') ||
171
- lowerTitle.includes('documentation') ||
172
- lowerTitle.includes('docs') ||
173
- lowerTitle.includes('reference') ||
174
- lowerUrl.includes('/docs/') ||
175
- lowerUrl.includes('/reference/') ||
176
- lowerUrl.includes('/api/')
177
- ) {
178
- return 'official-docs';
179
- }
180
- if (domain.startsWith('blog.') || lowerUrl.includes('/blog/')) return 'maintainer-blog';
181
- return 'website';
182
- }
183
-
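For illustration (not in the package), how the heuristics above classify a few typical sources:

    classifySourceType('github.com', 'facebook/react', 'https://github.com/facebook/react')          // → 'repo'
    classifySourceType('stackoverflow.com', 'How do I...', 'https://stackoverflow.com/questions/1')  // → 'community'
    classifySourceType('docs.python.org', '', 'https://docs.python.org/3/')                          // → 'official-docs'
    classifySourceType('example.com', 'Release notes', 'https://example.com/blog/v2')                // → 'maintainer-blog'
    classifySourceType('example.com', '', 'https://example.com/pricing')                             // → 'website'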
184
- function sourceTypePriority(sourceType) {
185
- switch (sourceType) {
186
- case 'official-docs': return 5;
187
- case 'repo': return 4;
188
- case 'maintainer-blog': return 3;
189
- case 'website': return 2;
190
- case 'community': return 1;
191
- case 'news': return 0;
192
- default: return 0;
193
- }
194
- }
195
-
196
- function bestRank(source) {
197
- const ranks = Object.values(source.perEngine || {}).map(v => v?.rank || 99);
198
- return ranks.length ? Math.min(...ranks) : 99;
199
- }
200
-
201
- function buildSourceRegistry(out) {
202
- const seen = new Map();
203
- const engineOrder = ['perplexity', 'bing', 'google'];
204
-
205
- for (const engine of engineOrder) {
206
- const result = out[engine];
207
- if (!result?.sources) continue;
208
-
209
- for (let i = 0; i < result.sources.length; i++) {
210
- const source = result.sources[i];
211
- const canonicalUrl = normalizeUrl(source.url);
212
- if (!canonicalUrl || canonicalUrl.length < 10) continue;
213
-
214
- const title = normalizeSourceTitle(source.title || '');
215
- const domain = getDomain(canonicalUrl);
216
- const sourceType = classifySourceType(domain, title, canonicalUrl);
217
- const existing = seen.get(canonicalUrl) || {
218
- id: '',
219
- canonicalUrl,
220
- displayUrl: source.url || canonicalUrl,
221
- domain,
222
- title: '',
223
- engines: [],
224
- engineCount: 0,
225
- perEngine: {},
226
- sourceType,
227
- isOfficial: sourceType === 'official-docs',
228
- };
229
-
230
- existing.title = pickPreferredTitle(existing.title, title);
231
- existing.displayUrl = existing.displayUrl || source.url || canonicalUrl;
232
- existing.sourceType = existing.sourceType || sourceType;
233
- existing.isOfficial = existing.isOfficial || sourceType === 'official-docs';
234
-
235
- if (!existing.engines.includes(engine)) {
236
- existing.engines.push(engine);
237
- }
238
- existing.perEngine[engine] = {
239
- rank: i + 1,
240
- title: pickPreferredTitle(existing.perEngine[engine]?.title || '', title),
241
- };
242
-
243
- seen.set(canonicalUrl, existing);
244
- }
245
- }
246
-
247
- const sources = Array.from(seen.values())
248
- .map(source => ({
249
- ...source,
250
- engineCount: source.engines.length,
251
- }))
252
- .sort((a, b) => {
253
- if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;
254
- if (sourceTypePriority(b.sourceType) !== sourceTypePriority(a.sourceType)) {
255
- return sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType);
256
- }
257
- if (bestRank(a) !== bestRank(b)) return bestRank(a) - bestRank(b);
258
- return a.domain.localeCompare(b.domain);
259
- })
260
- .slice(0, 12)
261
- .map((source, index) => ({
262
- ...source,
263
- id: `S${index + 1}`,
264
- title: source.title || source.domain || source.canonicalUrl,
265
- }));
266
-
267
- return sources;
268
- }
269
-
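Illustrative shape of a single _sources entry produced by buildSourceRegistry() above (all values invented for the example). Entries are deduplicated by canonical URL, sorted by engine consensus, then source-type priority, then best per-engine rank, capped at 12, and assigned ids S1..S12:

    {
      id: 'S1',
      canonicalUrl: 'https://docs.example.com/guide',
      displayUrl: 'https://docs.example.com/guide?utm_source=x',
      domain: 'docs.example.com',
      title: 'Example Guide',
      engines: ['perplexity', 'google'],
      engineCount: 2,
      perEngine: { perplexity: { rank: 1, title: 'Example Guide' }, google: { rank: 3, title: 'Example Guide' } },
      sourceType: 'official-docs',
      isOfficial: true,
    }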
270
- function mergeFetchDataIntoSources(sources, fetchedSources) {
271
- const byId = new Map(fetchedSources.map(source => [source.id, source]));
272
- return sources.map(source => {
273
- const fetched = byId.get(source.id);
274
- if (!fetched) return source;
275
-
276
- const title = pickPreferredTitle(source.title, fetched.title || '');
277
- return {
278
- ...source,
279
- title: title || source.title,
280
- fetch: {
281
- attempted: true,
282
- ok: !fetched.error,
283
- status: fetched.status || null,
284
- finalUrl: fetched.finalUrl || fetched.url || source.canonicalUrl,
285
- contentType: fetched.contentType || '',
286
- lastModified: fetched.lastModified || '',
287
- title: fetched.title || '',
288
- snippet: fetched.snippet || '',
289
- contentChars: fetched.contentChars || 0,
290
- error: fetched.error || '',
291
- },
292
- };
293
- });
294
- }
295
-
296
- function parseStructuredJson(text) {
297
- if (!text) return null;
298
- const trimmed = String(text).trim();
299
- const candidates = [
300
- trimmed,
301
- trimmed.replace(/^```json\s*/i, '').replace(/^```\s*/i, '').replace(/```$/i, '').trim(),
302
- ];
303
-
304
- const objectMatch = trimmed.match(/\{[\s\S]*\}/);
305
- if (objectMatch) candidates.push(objectMatch[0]);
306
-
307
- for (const candidate of candidates) {
308
- try {
309
- return JSON.parse(candidate);
310
- } catch {
311
- // try next candidate
312
- }
313
- }
314
- return null;
315
- }
316
-
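Worked example (not in the package) of the fallback parsing above: a reply wrapped in prose still parses via the outermost-object match.

    parseStructuredJson('Here you go: {"answer": "yes", "claims": []} Hope that helps')
    // → { answer: 'yes', claims: [] }
    parseStructuredJson('no JSON at all')
    // → null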
317
- function normalizeSynthesisPayload(payload, sources, fallbackAnswer = '') {
318
- const sourceIds = new Set(sources.map(source => source.id));
319
- const agreementLevel = ['high', 'medium', 'low', 'mixed', 'conflicting'].includes(payload?.agreement?.level)
320
- ? payload.agreement.level
321
- : 'mixed';
322
- const claims = Array.isArray(payload?.claims)
323
- ? payload.claims.map(claim => ({
324
- claim: trimText(claim?.claim || '', 260),
325
- support: ['strong', 'moderate', 'weak', 'conflicting'].includes(claim?.support) ? claim.support : 'moderate',
326
- sourceIds: Array.isArray(claim?.sourceIds) ? claim.sourceIds.filter(id => sourceIds.has(id)) : [],
327
- })).filter(claim => claim.claim)
328
- : [];
329
- const recommendedSources = Array.isArray(payload?.recommendedSources)
330
- ? payload.recommendedSources.filter(id => sourceIds.has(id)).slice(0, 6)
331
- : [];
332
-
333
- return {
334
- answer: trimText(payload?.answer || fallbackAnswer, 4000),
335
- agreement: {
336
- level: agreementLevel,
337
- summary: trimText(payload?.agreement?.summary || '', 280),
338
- },
339
- differences: Array.isArray(payload?.differences)
340
- ? payload.differences.map(item => trimText(item, 220)).filter(Boolean).slice(0, 5)
341
- : [],
342
- caveats: Array.isArray(payload?.caveats)
343
- ? payload.caveats.map(item => trimText(item, 220)).filter(Boolean).slice(0, 5)
344
- : [],
345
- claims,
346
- recommendedSources,
347
- };
348
- }
349
-
350
- function buildSynthesisPrompt(query, results, sources, { grounded = false } = {}) {
351
- const engineSummaries = {};
352
- for (const engine of ['perplexity', 'bing', 'google']) {
353
- const result = results[engine];
354
- if (!result) continue;
355
- if (result.error) {
356
- engineSummaries[engine] = { status: 'error', error: String(result.error) };
357
- continue;
358
- }
359
-
360
- engineSummaries[engine] = {
361
- status: 'ok',
362
- answer: trimText(result.answer || '', grounded ? 4500 : 2200),
363
- sourceIds: sources
364
- .filter(source => source.engines.includes(engine))
365
- .sort((a, b) => (a.perEngine[engine]?.rank || 99) - (b.perEngine[engine]?.rank || 99))
366
- .map(source => source.id)
367
- .slice(0, 6),
368
- };
369
- }
370
-
371
- const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map(source => ({
372
- id: source.id,
373
- title: source.title,
374
- domain: source.domain,
375
- canonicalUrl: source.canonicalUrl,
376
- sourceType: source.sourceType,
377
- isOfficial: source.isOfficial,
378
- engines: source.engines,
379
- engineCount: source.engineCount,
380
- perEngine: source.perEngine,
381
- fetch: grounded && source.fetch?.attempted ? {
382
- ok: source.fetch.ok,
383
- status: source.fetch.status,
384
- lastModified: source.fetch.lastModified,
385
- snippet: trimText(source.fetch.snippet || '', 700),
386
- } : undefined,
387
- }));
388
-
389
- return [
390
- 'You are synthesizing results from Perplexity, Bing Copilot, and Google AI.',
391
- grounded
392
- ? 'Use the fetched source snippets as the strongest evidence. Use engine answers for perspective and conflict detection.'
393
- : 'Use the engine answers for perspective. Use the source registry for provenance and citations.',
394
- 'Prefer official docs, release notes, repositories, and maintainer-authored sources when available.',
395
- 'If the engines disagree, say so explicitly.',
396
- 'Do not invent sources. Only reference source IDs from the source registry.',
397
- 'Return valid JSON only. No markdown fences, no prose outside the JSON object.',
398
- '',
399
- 'JSON schema:',
400
- '{',
401
- ' "answer": "short direct answer",',
402
- ' "agreement": { "level": "high|medium|low|mixed|conflicting", "summary": "..." },',
403
- ' "differences": ["..."],',
404
- ' "caveats": ["..."],',
405
- ' "claims": [',
406
- ' { "claim": "...", "support": "strong|moderate|weak|conflicting", "sourceIds": ["S1"] }',
407
- ' ],',
408
- ' "recommendedSources": ["S1", "S2"]',
409
- '}',
410
- '',
411
- `User query: ${query}`,
412
- '',
413
- `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
414
- '',
415
- `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
416
- ].join('\n');
417
- }
418
-
419
- function buildConfidence(out) {
420
- const sources = Array.isArray(out._sources) ? out._sources : [];
421
- const topConsensus = sources.length > 0 ? sources[0]?.engineCount || 0 : 0;
422
- const officialSourceCount = sources.filter(source => source.isOfficial).length;
423
- const firstPartySourceCount = sources.filter(source => source.isOfficial || source.sourceType === 'maintainer-blog').length;
424
- const fetchedAttempted = sources.filter(source => source.fetch?.attempted).length;
425
- const fetchedSucceeded = sources.filter(source => source.fetch?.ok).length;
426
- const sourceTypeBreakdown = sources.reduce((acc, source) => {
427
- acc[source.sourceType] = (acc[source.sourceType] || 0) + 1;
428
- return acc;
429
- }, {});
430
- const synthesisLevel = out._synthesis?.agreement?.level;
431
-
432
- return {
433
- sourcesCount: sources.length,
434
- topSourceConsensus: topConsensus,
435
- agreementLevel: synthesisLevel || (topConsensus >= 3 ? 'high' : topConsensus >= 2 ? 'medium' : 'low'),
436
- enginesResponded: ALL_ENGINES.filter(engine => out[engine]?.answer && !out[engine]?.error),
437
- enginesFailed: ALL_ENGINES.filter(engine => out[engine]?.error),
438
- officialSourceCount,
439
- firstPartySourceCount,
440
- fetchedSourceSuccessRate: fetchedAttempted > 0 ? Number((fetchedSucceeded / fetchedAttempted).toFixed(2)) : 0,
441
- sourceTypeBreakdown,
442
- };
443
- }
444
-
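For illustration (values invented), the confidence summary above comes out shaped like this when all three engines respond:

    {
      sourcesCount: 12,
      topSourceConsensus: 3,
      agreementLevel: 'high',
      enginesResponded: ['perplexity', 'bing', 'google'],
      enginesFailed: [],
      officialSourceCount: 4,
      firstPartySourceCount: 4,
      fetchedSourceSuccessRate: 0.8,
      sourceTypeBreakdown: { 'official-docs': 4, repo: 1, community: 3, website: 4 },
    }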
445
- function getFullTabFromCache(engine) {
446
- try {
447
- if (!existsSync(PAGES_CACHE)) return null;
448
- const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
449
- const found = pages.find(p => p.url.includes(ENGINE_DOMAINS[engine]));
450
- return found ? found.targetId : null;
451
- } catch { return null; }
452
- }
453
-
454
- function cdp(args, timeoutMs = 15000) {
455
- return new Promise((resolve, reject) => {
456
- const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
457
- let out = '', err = '';
458
- proc.stdout.on('data', d => out += d);
459
- proc.stderr.on('data', d => err += d);
460
- const t = setTimeout(() => { proc.kill(); reject(new Error(`cdp timeout: ${args[0]}`)); }, timeoutMs);
461
- proc.on('close', code => {
462
- clearTimeout(t);
463
- if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
464
- else resolve(out.trim());
465
- });
466
- });
467
- }
468
-
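For reference (not part of the package), cdp() above takes plain argv arrays for cdp.mjs; these calls mirror the ones used elsewhere in this file:

    const list = await cdp(['list']);          // newline-separated tab list; also refreshes the pages cache
    const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
    const { targetId } = JSON.parse(raw);
    await cdp(['nav', tab, url], 30000);       // navigate with a 30s timeout instead of the 15s default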
469
- async function getAnyTab() {
470
- const list = await cdp(['list']);
471
- const first = list.split('\n')[0];
472
- if (!first) throw new Error('No Chrome tabs found');
473
- return first.slice(0, 8);
474
- }
475
-
476
- async function getOrReuseBlankTab() {
477
- // Reuse an existing about:blank tab rather than always creating a new one
478
- const listOut = await cdp(['list']);
479
- const lines = listOut.split('\n').filter(Boolean);
480
- for (const line of lines) {
481
- if (line.includes('about:blank')) {
482
- return line.slice(0, 8); // prefix of the blank tab's targetId
483
- }
484
- }
485
- // No blank tab — open a new one
486
- const anchor = await getAnyTab();
487
- const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
488
- const { targetId } = JSON.parse(raw);
489
- return targetId;
490
- }
491
-
492
- async function openNewTab() {
493
- const anchor = await getAnyTab();
494
- const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
495
- const { targetId } = JSON.parse(raw);
496
- return targetId;
497
- }
498
-
499
- async function getOrOpenEngineTab(engine) {
500
- await cdp(['list']);
501
- return getFullTabFromCache(engine) || openNewTab();
502
- }
503
-
504
- async function activateTab(targetId) {
505
- try {
506
- const anchor = await getAnyTab();
507
- await cdp(['evalraw', anchor, 'Target.activateTarget', JSON.stringify({ targetId })]);
508
- } catch {
509
- // best-effort
510
- }
511
- }
512
-
513
- async function closeTabs(targetIds = []) {
514
- for (const targetId of targetIds) {
515
- if (!targetId) continue;
516
- await closeTab(targetId);
517
- }
518
- if (targetIds.length > 0) {
519
- await new Promise(r => setTimeout(r, 300));
520
- await cdp(['list']).catch(() => null);
521
- }
522
- }
523
-
524
- async function closeTab(targetId) {
525
- try {
526
- const anchor = await getAnyTab();
527
- await cdp(['evalraw', anchor, 'Target.closeTarget', JSON.stringify({ targetId })]);
528
- } catch { /* best-effort */ }
529
- }
530
-
531
- function runExtractor(script, query, tabPrefix = null, short = false, timeoutMs = 90000) {
532
- const extraArgs = [
533
- ...(tabPrefix ? ['--tab', tabPrefix] : []),
534
- ...(short ? ['--short'] : []),
535
- ];
536
- return new Promise((resolve, reject) => {
537
- const proc = spawn('node', [join(__dir, 'extractors', script), query, ...extraArgs], {
538
- stdio: ['ignore', 'pipe', 'pipe'],
539
- });
540
- let out = '';
541
- let err = '';
542
- proc.stdout.on('data', d => out += d);
543
- proc.stderr.on('data', d => err += d);
544
- const t = setTimeout(() => {
545
- proc.kill();
546
- reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
547
- }, timeoutMs);
548
- proc.on('close', code => {
549
- clearTimeout(t);
550
- if (code !== 0) reject(new Error(err.trim() || `extractor exit ${code}`));
551
- else {
552
- try { resolve(JSON.parse(out.trim())); }
553
- catch { reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`)); }
554
- }
555
- });
556
- });
557
- }
558
-
559
-
560
- async function fetchTopSource(url) {
561
- const tab = await openNewTab();
562
- await cdp(['list']); // refresh cache so the new tab is findable
563
- try {
564
- await cdp(['nav', tab, url], 30000);
565
- await new Promise(r => setTimeout(r, 1500));
566
- const content = await cdp(['eval', tab, `
567
- (function(){
568
- var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
569
- var text = (el || document.body).innerText;
570
- return text.replace(/\\s+/g, ' ').trim();
571
- })()
572
- `]);
573
- return { url, content };
574
- } catch (e) {
575
- return { url, content: null, error: e.message };
576
- } finally {
577
- await closeTab(tab);
578
- }
579
- }
580
-
581
- async function fetchSourceContent(url, maxChars = 5000) {
582
- try {
583
- const controller = new AbortController();
584
- const timeout = setTimeout(() => controller.abort(), 15000);
585
-
586
- const res = await fetch(url, {
587
- signal: controller.signal,
588
- headers: {
589
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
590
- 'Accept': 'text/html,application/xhtml+xml',
591
- 'Accept-Language': 'en-US,en;q=0.9',
592
- },
593
- });
594
- clearTimeout(timeout);
595
-
596
- if (!res.ok) throw new Error(`HTTP ${res.status}`);
597
-
598
- const html = await res.text();
599
-
600
- // Simple HTML extraction - remove tags and extract text
601
- const content = html
602
- .replace(/<script[\s\S]*?<\/script>/gi, '')
603
- .replace(/<style[\s\S]*?<\/style>/gi, '')
604
- .replace(/<nav[\s\S]*?<\/nav>/gi, '')
605
- .replace(/<header[\s\S]*?<\/header>/gi, '')
606
- .replace(/<footer[\s\S]*?<\/footer>/gi, '')
607
- .replace(/<[^>]+>/g, ' ')
608
- .replace(/&[a-z]+;/gi, ' ')
609
- .replace(/\s+/g, ' ')
610
- .trim()
611
- .slice(0, maxChars);
612
-
613
- // Extract title
614
- const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
615
- const title = titleMatch ? titleMatch[1].trim() : '';
616
- const finalUrl = res.url || url;
617
- const snippet = trimText(content, 320);
618
-
619
- return {
620
- url,
621
- finalUrl,
622
- status: res.status,
623
- contentType: res.headers.get('content-type') || '',
624
- lastModified: res.headers.get('last-modified') || '',
625
- title,
626
- snippet,
627
- content,
628
- contentChars: content.length,
629
- };
630
- } catch (e) {
631
- return { url, title: '', content: null, snippet: '', contentChars: 0, error: e.message };
632
- }
633
- }
634
-
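For illustration (values invented), the two result shapes fetchSourceContent() above returns:

    // success
    { url, finalUrl, status: 200, contentType: 'text/html; charset=utf-8', lastModified: '',
      title: 'Example Guide', snippet: '...', content: '...', contentChars: 4873 }
    // failure
    { url, title: '', content: null, snippet: '', contentChars: 0, error: 'HTTP 403' }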
635
- async function fetchMultipleSources(sources, maxSources = 5, maxChars = 5000) {
636
- process.stderr.write(`[greedysearch] Fetching content from ${Math.min(sources.length, maxSources)} sources...\n`);
637
-
638
- // Fetch sources sequentially (CDP doesn't handle parallel tab operations well)
639
- const toFetch = sources.slice(0, maxSources);
640
- const fetched = [];
641
-
642
- for (let i = 0; i < toFetch.length; i++) {
643
- const s = toFetch[i];
644
- process.stderr.write(`[greedysearch] Fetching ${i + 1}/${toFetch.length}: ${(s.canonicalUrl || s.url).slice(0, 60)}...\n`);
645
- try {
646
- const result = await fetchSourceContent(s.canonicalUrl || s.url, maxChars);
647
- fetched.push({ id: s.id, ...result });
648
- if (result.content && result.content.length > 100) {
649
- process.stderr.write(`[greedysearch] ✓ Got ${result.content.length} chars\n`);
650
- } else {
651
- process.stderr.write(`[greedysearch] ✗ Empty or too short\n`);
652
- }
653
- } catch (e) {
654
- fetched.push({ id: s.id, url: s.canonicalUrl || s.url, error: e.message });
655
- process.stderr.write(`[greedysearch] ✗ Failed: ${e.message.slice(0, 80)}\n`);
656
- }
657
- process.stderr.write(`PROGRESS:fetch:${i + 1}/${toFetch.length}\n`);
658
- }
659
-
660
- return fetched;
661
- }
662
-
663
- function pickTopSource(out) {
664
- if (Array.isArray(out._sources) && out._sources.length > 0) return out._sources[0];
665
- for (const engine of ['perplexity', 'google', 'bing']) {
666
- const r = out[engine];
667
- if (r?.sources?.length > 0) return r.sources[0];
668
- }
669
- return null;
670
- }
671
-
672
- async function synthesizeWithGemini(query, results, { grounded = false, tabPrefix = null } = {}) {
673
- const sources = Array.isArray(results._sources) ? results._sources : buildSourceRegistry(results);
674
- const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
675
-
676
- return new Promise((resolve, reject) => {
677
- const extraArgs = tabPrefix ? ['--tab', String(tabPrefix)] : [];
678
- const proc = spawn('node', [join(__dir, 'extractors', 'gemini.mjs'), prompt, ...extraArgs], {
679
- stdio: ['ignore', 'pipe', 'pipe'],
680
- });
681
- let out = '';
682
- let err = '';
683
- proc.stdout.on('data', d => out += d);
684
- proc.stderr.on('data', d => err += d);
685
- const t = setTimeout(() => {
686
- proc.kill();
687
- reject(new Error('Gemini synthesis timed out after 180s'));
688
- }, 180000);
689
- proc.on('close', code => {
690
- clearTimeout(t);
691
- if (code !== 0) reject(new Error(err.trim() || 'gemini extractor failed'));
692
- else {
693
- try {
694
- const raw = JSON.parse(out.trim());
695
- const structured = parseStructuredJson(raw.answer || '');
696
- resolve({
697
- ...normalizeSynthesisPayload(structured, sources, raw.answer || ''),
698
- rawAnswer: raw.answer || '',
699
- geminiSources: raw.sources || [],
700
- });
701
- }
702
- catch {
703
- reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
704
- }
705
- }
706
- });
707
- });
708
- }
709
-
710
- function slugify(query) {
711
- return query.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '').slice(0, 60);
712
- }
713
-
714
- function resultsDir() {
715
- const dir = join(__dir, 'results');
716
- mkdirSync(dir, { recursive: true });
717
- return dir;
718
- }
719
-
720
- function writeOutput(data, outFile, { inline = false, synthesize = false, query = '' } = {}) {
721
- const json = JSON.stringify(data, null, 2) + '\n';
722
-
723
- if (outFile) {
724
- writeFileSync(outFile, json, 'utf8');
725
- process.stderr.write(`Results written to ${outFile}\n`);
726
- return;
727
- }
728
-
729
- if (inline) {
730
- process.stdout.write(json);
731
- return;
732
- }
733
-
734
- const ts = new Date().toISOString().replace('T', '_').replace(/[:.]/g, '-').slice(0, 19);
735
- const slug = slugify(query);
736
- const base = join(resultsDir(), `${ts}_${slug}`);
737
-
738
- writeFileSync(`${base}.json`, json, 'utf8');
739
-
740
- if (synthesize && data._synthesis?.answer) {
741
- writeFileSync(`${base}-synthesis.md`, data._synthesis.answer, 'utf8');
742
- process.stdout.write(`${base}-synthesis.md\n`);
743
- } else {
744
- process.stdout.write(`${base}.json\n`);
745
- }
746
- }
747
-
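Illustration (not in the package): with neither --out nor --inline, writeOutput() above saves into the results directory next to search.mjs and prints the saved path, for example:

    results/2024-05-01_09-30-12_tcp-congestion-control.json
    results/2024-05-01_09-30-12_tcp-congestion-control-synthesis.md   (also written, and printed instead, when --synthesize produced an answer)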
748
- const GREEDY_PROFILE_DIR = `${tmpdir().replace(/\\/g, '/')}/greedysearch-chrome-profile`;
749
- const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
750
-
751
- // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort,
752
- // so searches never accidentally attach to the user's main Chrome session.
753
- process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
754
-
755
- function probeGreedyChrome(timeoutMs = 3000) {
756
- return new Promise(resolve => {
757
- const req = http.get(`http://localhost:${GREEDY_PORT}/json/version`, res => {
758
- res.resume();
759
- resolve(res.statusCode === 200);
760
- });
761
- req.on('error', () => resolve(false));
762
- req.setTimeout(timeoutMs, () => { req.destroy(); resolve(false); });
763
- });
764
- }
765
-
766
- // Write (or refresh) the DevToolsActivePort file for the GreedySearch Chrome so
767
- // cdp.mjs always connects to the right port rather than the user's main Chrome.
768
- // Uses atomic write (write to temp + rename) to prevent corruption from parallel processes.
769
- async function refreshPortFile() {
770
- const LOCK_FILE = ACTIVE_PORT_FILE + '.lock';
771
- const TEMP_FILE = ACTIVE_PORT_FILE + '.tmp';
772
-
773
- // Simple file-based lock with timeout (prevents parallel writes from corrupting the port file)
774
- const lockAcquired = await new Promise((resolve) => {
775
- const start = Date.now();
776
- const tryLock = () => {
777
- try {
778
- writeFileSync(LOCK_FILE, `${process.pid}`, 'utf8');
779
- resolve(true);
780
- } catch {
781
- // Lock file exists - check if stale (older than 5 seconds)
782
- try {
783
- const lockTime = parseInt(readFileSync(LOCK_FILE, 'utf8'));
784
- if (Date.now() - lockTime > 5000) {
785
- // Stale lock - overwrite
786
- writeFileSync(LOCK_FILE, `${process.pid}`, 'utf8');
787
- resolve(true);
788
- } else if (Date.now() - start < 1000) {
789
- setTimeout(tryLock, 50);
790
- } else {
791
- resolve(false); // Give up after 1s
792
- }
793
- } catch {
794
- setTimeout(tryLock, 50);
795
- }
796
- }
797
- };
798
- tryLock();
799
- });
800
-
801
- try {
802
- const body = await new Promise((res, rej) => {
803
- const req = http.get(`http://localhost:${GREEDY_PORT}/json/version`, r => {
804
- let b = '';
805
- r.on('data', d => b += d);
806
- r.on('end', () => res(b));
807
- });
808
- req.on('error', rej);
809
- req.setTimeout(3000, () => { req.destroy(); rej(new Error('timeout')); });
810
- });
811
- const { webSocketDebuggerUrl } = JSON.parse(body);
812
- const wsPath = new URL(webSocketDebuggerUrl).pathname;
813
-
814
- // Atomic write: write to temp file, then rename
815
- if (lockAcquired) {
816
- writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, 'utf8');
817
- try { unlinkSync(ACTIVE_PORT_FILE); } catch {}
818
- renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
819
- }
820
- } catch { /* best-effort — launch.mjs already wrote the file on first start */ }
821
- finally {
822
- if (lockAcquired) {
823
- try { unlinkSync(LOCK_FILE); } catch {}
824
- }
825
- }
826
- }
827
-
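Illustrative contents (not part of the package) of the DevToolsActivePort file that refreshPortFile() above rewrites: the debug port on the first line, and the browser WebSocket path from /json/version on the second.

    9222
    /devtools/browser/<uuid from webSocketDebuggerUrl>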
828
- async function ensureChrome() {
829
- const ready = await probeGreedyChrome();
830
- if (!ready) {
831
- process.stderr.write(`GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`);
832
- await new Promise((resolve, reject) => {
833
- const proc = spawn('node', [join(__dir, 'launch.mjs')], { stdio: ['ignore', process.stderr, process.stderr] });
834
- proc.on('close', code => code === 0 ? resolve() : reject(new Error('launch.mjs failed')));
835
- });
836
- } else {
837
- // Chrome already running — refresh the port file so cdp.mjs always picks
838
- // up the right port, even if the file was stale from a previous session.
839
- await refreshPortFile();
840
- }
841
- }
842
-
843
- async function main() {
844
- const args = process.argv.slice(2);
845
- if (args.length < 2 || args[0] === '--help') {
846
- process.stderr.write([
847
- 'Usage: node search.mjs <engine> "<query>"',
848
- '',
849
- 'Engines: perplexity (p), bing (b), google (g), gemini (gem), all',
850
- '',
851
- 'Flags:',
852
- ' --full Return complete answers (~3000+ chars)',
853
- ' --synthesize Synthesize results via Gemini (adds ~30s)',
854
- ' --deep-research Full research: full answers + source fetching + synthesis',
855
- ' --fetch-top-source Fetch content from top source',
856
- ' --inline Output JSON to stdout (for piping)',
857
- '',
858
- 'Examples:',
859
- ' node search.mjs p "what is memoization"',
860
- ' node search.mjs all "TCP congestion control"',
861
- ' node search.mjs all "RAG vs fine-tuning" --deep-research',
862
- ].join('\n') + '\n');
863
- process.exit(1);
864
- }
865
-
866
- await ensureChrome();
867
-
868
- const full = args.includes('--full') || args.includes('--deep-research');
869
- const short = !full;
870
- const fetchSource = args.includes('--fetch-top-source');
871
- const synthesize = args.includes('--synthesize') || args.includes('--deep-research');
872
- const deepResearch = args.includes('--deep-research');
873
- const inline = args.includes('--inline');
874
- const outIdx = args.indexOf('--out');
875
- const outFile = outIdx !== -1 ? args[outIdx + 1] : null;
876
- const rest = args.filter((a, i) =>
877
- a !== '--full' &&
878
- a !== '--short' &&
879
- a !== '--fetch-top-source' &&
880
- a !== '--synthesize' &&
881
- a !== '--deep-research' &&
882
- a !== '--inline' &&
883
- a !== '--out' &&
884
- (outIdx === -1 || i !== outIdx + 1)
885
- );
886
- const engine = rest[0].toLowerCase();
887
- const query = rest.slice(1).join(' ');
888
-
889
- if (engine === 'all') {
890
- await cdp(['list']); // refresh pages cache
891
-
892
- // PARALLEL-SAFE: Always create fresh tabs for each engine to avoid race conditions
893
- // when multiple "all" searches run concurrently. Previously, reusing cached tabs
894
- // caused ERR_ABORTED and Uncaught errors as multiple processes fought over the same tab.
895
- const tabs = [];
896
- for (let i = 0; i < ALL_ENGINES.length; i++) {
897
- if (i > 0) await new Promise(r => setTimeout(r, 300)); // small delay between tab opens
898
- const tab = await openNewTab();
899
- tabs.push(tab);
900
- }
901
-
902
- // All tabs assigned — run extractors in parallel
903
- try {
904
- const results = await Promise.allSettled(
905
- ALL_ENGINES.map((e, i) =>
906
- runExtractor(ENGINES[e], query, tabs[i], short)
907
- .then(r => {
908
- process.stderr.write(`PROGRESS:${e}:done\n`);
909
- return { engine: e, ...r };
910
- })
911
- .catch(err => {
912
- process.stderr.write(`PROGRESS:${e}:error\n`);
913
- throw err;
914
- })
915
- )
916
- );
917
-
918
- const out = {};
919
- for (let i = 0; i < results.length; i++) {
920
- const r = results[i];
921
- if (r.status === 'fulfilled') {
922
- out[r.value.engine] = r.value;
923
- } else {
924
- out[ALL_ENGINES[i]] = { error: r.reason?.message || 'unknown error' };
925
- }
926
- }
927
-
928
- await closeTabs(tabs);
929
-
930
- // Build a canonical source registry across all engines
931
- out._sources = buildSourceRegistry(out);
932
-
933
- if (deepResearch) {
934
- process.stderr.write('PROGRESS:deep-research:start\n');
935
- const fetchedSources = out._sources.length > 0
936
- ? await fetchMultipleSources(out._sources, 5, 8000)
937
- : [];
938
-
939
- out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
940
- out._fetchedSources = fetchedSources;
941
- process.stderr.write(out._sources.length > 0 ? 'PROGRESS:deep-research:done\n' : 'PROGRESS:deep-research:no-sources\n');
942
- }
943
-
944
- // Synthesize with Gemini if requested
945
- if (synthesize) {
946
- process.stderr.write('PROGRESS:synthesis:start\n');
947
- process.stderr.write('[greedysearch] Synthesizing results with Gemini...\n');
948
- try {
949
- const geminiTab = await getOrOpenEngineTab('gemini');
950
- await activateTab(geminiTab);
951
- const synthesis = await synthesizeWithGemini(query, out, { grounded: deepResearch, tabPrefix: geminiTab });
952
- await activateTab(geminiTab);
953
- out._synthesis = {
954
- ...synthesis,
955
- synthesized: true,
956
- };
957
- process.stderr.write('PROGRESS:synthesis:done\n');
958
- } catch (e) {
959
- process.stderr.write(`[greedysearch] Synthesis failed: ${e.message}\n`);
960
- out._synthesis = { error: e.message, synthesized: false };
961
- }
962
- }
963
-
964
- if (fetchSource) {
965
- const top = pickTopSource(out);
966
- if (top) out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
967
- }
968
-
969
- if (deepResearch) out._confidence = buildConfidence(out);
970
-
971
- writeOutput(out, outFile, { inline, synthesize, query });
972
- return;
973
- } finally {
974
- await closeTabs(tabs);
975
- }
976
- }
977
-
978
- const script = ENGINES[engine];
979
- if (!script) {
980
- process.stderr.write(`Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(', ')}\n`);
981
- process.exit(1);
982
- }
983
-
984
- try {
985
- const result = await runExtractor(script, query, null, short);
986
- if (fetchSource && result.sources?.length > 0) {
987
- result.topSource = await fetchTopSource(result.sources[0].url);
988
- }
989
- writeOutput(result, outFile, { inline, synthesize, query });
990
- } catch (e) {
991
- process.stderr.write(`Error: ${e.message}\n`);
992
- process.exit(1);
993
- }
994
- }
995
-
996
- main();
1
+ #!/usr/bin/env node
2
+ // search.mjs — unified CLI for GreedySearch extractors
3
+ //
4
+ // Usage:
5
+ // node search.mjs <engine> "<query>"
6
+ // node search.mjs all "<query>"
7
+ //
8
+ // Engines:
9
+ // perplexity | pplx | p
10
+ // bing | copilot | b
11
+ // google | g
12
+ // gemini | gem
13
+ // all — fan-out to all engines in parallel
14
+ //
15
+ // Output: JSON to stdout, errors to stderr
16
+ //
17
+ // Examples:
18
+ // node search.mjs p "what is memoization"
19
+ // node search.mjs gem "latest React features"
20
+ // node search.mjs all "how does TCP congestion control work"
21
+
22
+ import { spawn } from 'child_process';
23
+ import { fileURLToPath } from 'url';
24
+ import { join, dirname } from 'path';
25
+ import { readFileSync, existsSync, writeFileSync, mkdirSync, renameSync, unlinkSync } from 'fs';
26
+ import { tmpdir, homedir } from 'os';
27
+ import http from 'http';
28
+
29
+ const __dir = dirname(fileURLToPath(import.meta.url));
30
+ const CDP = join(__dir, 'cdp.mjs');
31
+ const PAGES_CACHE = `${tmpdir().replace(/\\/g, '/')}/cdp-pages.json`;
32
+
33
+ const GREEDY_PORT = 9222;
34
+
35
+ const ENGINES = {
36
+ perplexity: 'perplexity.mjs',
37
+ pplx: 'perplexity.mjs',
38
+ p: 'perplexity.mjs',
39
+ bing: 'bing-copilot.mjs',
40
+ copilot: 'bing-copilot.mjs',
41
+ b: 'bing-copilot.mjs',
42
+ google: 'google-ai.mjs',
43
+ g: 'google-ai.mjs',
44
+ gemini: 'gemini.mjs',
45
+ gem: 'gemini.mjs',
46
+ };
47
+
48
+ const ALL_ENGINES = ['perplexity', 'bing', 'google'];
49
+
50
+ const ENGINE_DOMAINS = {
51
+ perplexity: 'perplexity.ai',
52
+ bing: 'copilot.microsoft.com',
53
+ google: 'google.com',
54
+ gemini: 'gemini.google.com',
55
+ };
56
+
57
+ const TRACKING_PARAMS = [
58
+ 'fbclid',
59
+ 'gclid',
60
+ 'ref',
61
+ 'ref_src',
62
+ 'ref_url',
63
+ 'source',
64
+ 'utm_campaign',
65
+ 'utm_content',
66
+ 'utm_medium',
67
+ 'utm_source',
68
+ 'utm_term',
69
+ ];
70
+
71
+ const COMMUNITY_HOSTS = [
72
+ 'dev.to',
73
+ 'hashnode.com',
74
+ 'medium.com',
75
+ 'reddit.com',
76
+ 'stackoverflow.com',
77
+ 'stackexchange.com',
78
+ 'substack.com',
79
+ ];
80
+
81
+ const NEWS_HOSTS = [
82
+ 'arstechnica.com',
83
+ 'techcrunch.com',
84
+ 'theverge.com',
85
+ 'venturebeat.com',
86
+ 'wired.com',
87
+ 'zdnet.com',
88
+ ];
89
+
90
+ function trimText(text = '', maxChars = 240) {
91
+ const clean = String(text).replace(/\s+/g, ' ').trim();
92
+ if (clean.length <= maxChars) return clean;
93
+ return clean.slice(0, maxChars).replace(/\s+\S*$/, '') + '...';
94
+ }
95
+
96
+ function normalizeSourceTitle(title = '') {
97
+ const clean = trimText(title, 180);
98
+ if (!clean) return '';
99
+ if (/^https?:\/\//i.test(clean)) return '';
100
+
101
+ const wordCount = clean.split(/\s+/).filter(Boolean).length;
102
+ const hasUppercase = /[A-Z]/.test(clean);
103
+ const hasDigit = /\d/.test(clean);
104
+ const looksLikeFragment = clean === clean.toLowerCase() && wordCount <= 4 && !hasUppercase && !hasDigit;
105
+ return looksLikeFragment ? '' : clean;
106
+ }
107
+
108
+ function pickPreferredTitle(currentTitle = '', nextTitle = '') {
109
+ const current = normalizeSourceTitle(currentTitle);
110
+ const next = normalizeSourceTitle(nextTitle);
111
+ if (!next) return current;
112
+ if (!current) return next;
113
+ const currentLooksLikeUrl = /^https?:\/\//i.test(current);
114
+ const nextLooksLikeUrl = /^https?:\/\//i.test(next);
115
+ if (currentLooksLikeUrl && !nextLooksLikeUrl) return next;
116
+ if (!currentLooksLikeUrl && nextLooksLikeUrl) return current;
117
+ return next.length > current.length ? next : current;
118
+ }
119
+
120
+ function normalizeUrl(rawUrl) {
121
+ if (!rawUrl) return null;
122
+ try {
123
+ const url = new URL(rawUrl);
124
+ if (!['http:', 'https:'].includes(url.protocol)) return null;
125
+ url.hash = '';
126
+ url.hostname = url.hostname.toLowerCase();
127
+ if ((url.protocol === 'https:' && url.port === '443') || (url.protocol === 'http:' && url.port === '80')) {
128
+ url.port = '';
129
+ }
130
+ for (const key of [...url.searchParams.keys()]) {
131
+ const lower = key.toLowerCase();
132
+ if (TRACKING_PARAMS.includes(lower) || lower.startsWith('utm_')) {
133
+ url.searchParams.delete(key);
134
+ }
135
+ }
136
+ url.searchParams.sort();
137
+ const normalizedPath = url.pathname.replace(/\/+$/, '') || '/';
138
+ url.pathname = normalizedPath;
139
+ const normalized = url.toString();
140
+ return normalizedPath === '/' ? normalized.replace(/\/$/, '') : normalized;
141
+ } catch {
142
+ return null;
143
+ }
144
+ }
145
+
146
+ function getDomain(rawUrl) {
147
+ try {
148
+ const domain = new URL(rawUrl).hostname.toLowerCase();
149
+ return domain.replace(/^www\./, '');
150
+ } catch {
151
+ return '';
152
+ }
153
+ }
154
+
155
+ function matchesDomain(domain, hosts) {
156
+ return hosts.some(host => domain === host || domain.endsWith(`.${host}`));
157
+ }
158
+
159
+ function classifySourceType(domain, title = '', rawUrl = '') {
160
+ const lowerTitle = title.toLowerCase();
161
+ const lowerUrl = rawUrl.toLowerCase();
162
+
163
+ if (domain === 'github.com' || domain === 'gitlab.com') return 'repo';
164
+ if (matchesDomain(domain, COMMUNITY_HOSTS)) return 'community';
165
+ if (matchesDomain(domain, NEWS_HOSTS)) return 'news';
166
+ if (
167
+ domain.startsWith('docs.') ||
168
+ domain.startsWith('developer.') ||
169
+ domain.startsWith('developers.') ||
170
+ domain.startsWith('api.') ||
171
+ lowerTitle.includes('documentation') ||
172
+ lowerTitle.includes('docs') ||
173
+ lowerTitle.includes('reference') ||
174
+ lowerUrl.includes('/docs/') ||
175
+ lowerUrl.includes('/reference/') ||
176
+ lowerUrl.includes('/api/')
177
+ ) {
178
+ return 'official-docs';
179
+ }
180
+ if (domain.startsWith('blog.') || lowerUrl.includes('/blog/')) return 'maintainer-blog';
181
+ return 'website';
182
+ }
183
+
184
+ function sourceTypePriority(sourceType) {
185
+ switch (sourceType) {
186
+ case 'official-docs': return 5;
187
+ case 'repo': return 4;
188
+ case 'maintainer-blog': return 3;
189
+ case 'website': return 2;
190
+ case 'community': return 1;
191
+ case 'news': return 0;
192
+ default: return 0;
193
+ }
194
+ }
195
+
196
+ function bestRank(source) {
197
+ const ranks = Object.values(source.perEngine || {}).map(v => v?.rank || 99);
198
+ return ranks.length ? Math.min(...ranks) : 99;
199
+ }
200
+
201
+ function buildSourceRegistry(out) {
202
+ const seen = new Map();
203
+ const engineOrder = ['perplexity', 'bing', 'google'];
204
+
205
+ for (const engine of engineOrder) {
206
+ const result = out[engine];
207
+ if (!result?.sources) continue;
208
+
209
+ for (let i = 0; i < result.sources.length; i++) {
210
+ const source = result.sources[i];
211
+ const canonicalUrl = normalizeUrl(source.url);
212
+ if (!canonicalUrl || canonicalUrl.length < 10) continue;
213
+
214
+ const title = normalizeSourceTitle(source.title || '');
215
+ const domain = getDomain(canonicalUrl);
216
+ const sourceType = classifySourceType(domain, title, canonicalUrl);
217
+ const existing = seen.get(canonicalUrl) || {
218
+ id: '',
219
+ canonicalUrl,
220
+ displayUrl: source.url || canonicalUrl,
221
+ domain,
222
+ title: '',
223
+ engines: [],
224
+ engineCount: 0,
225
+ perEngine: {},
226
+ sourceType,
227
+ isOfficial: sourceType === 'official-docs',
228
+ };
229
+
230
+ existing.title = pickPreferredTitle(existing.title, title);
231
+ existing.displayUrl = existing.displayUrl || source.url || canonicalUrl;
232
+ existing.sourceType = existing.sourceType || sourceType;
233
+ existing.isOfficial = existing.isOfficial || sourceType === 'official-docs';
234
+
235
+ if (!existing.engines.includes(engine)) {
236
+ existing.engines.push(engine);
237
+ }
238
+ existing.perEngine[engine] = {
239
+ rank: i + 1,
240
+ title: pickPreferredTitle(existing.perEngine[engine]?.title || '', title),
241
+ };
242
+
243
+ seen.set(canonicalUrl, existing);
244
+ }
245
+ }
246
+
247
+ const sources = Array.from(seen.values())
248
+ .map(source => ({
249
+ ...source,
250
+ engineCount: source.engines.length,
251
+ }))
252
+ .sort((a, b) => {
253
+ if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;
254
+ if (sourceTypePriority(b.sourceType) !== sourceTypePriority(a.sourceType)) {
255
+ return sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType);
256
+ }
257
+ if (bestRank(a) !== bestRank(b)) return bestRank(a) - bestRank(b);
258
+ return a.domain.localeCompare(b.domain);
259
+ })
260
+ .slice(0, 12)
261
+ .map((source, index) => ({
262
+ ...source,
263
+ id: `S${index + 1}`,
264
+ title: source.title || source.domain || source.canonicalUrl,
265
+ }));
266
+
267
+ return sources;
268
+ }
269
+
270
+ function mergeFetchDataIntoSources(sources, fetchedSources) {
271
+ const byId = new Map(fetchedSources.map(source => [source.id, source]));
272
+ return sources.map(source => {
273
+ const fetched = byId.get(source.id);
274
+ if (!fetched) return source;
275
+
276
+ const title = pickPreferredTitle(source.title, fetched.title || '');
277
+ return {
278
+ ...source,
279
+ title: title || source.title,
280
+ fetch: {
281
+ attempted: true,
282
+ ok: !fetched.error,
283
+ status: fetched.status || null,
284
+ finalUrl: fetched.finalUrl || fetched.url || source.canonicalUrl,
285
+ contentType: fetched.contentType || '',
286
+ lastModified: fetched.lastModified || '',
287
+ title: fetched.title || '',
288
+ snippet: fetched.snippet || '',
289
+ contentChars: fetched.contentChars || 0,
290
+ error: fetched.error || '',
291
+ },
292
+ };
293
+ });
294
+ }
295
+
296
+ function parseStructuredJson(text) {
297
+ if (!text) return null;
298
+ const trimmed = String(text).trim();
299
+ const candidates = [
300
+ trimmed,
301
+ trimmed.replace(/^```json\s*/i, '').replace(/^```\s*/i, '').replace(/```$/i, '').trim(),
302
+ ];
303
+
304
+ const objectMatch = trimmed.match(/\{[\s\S]*\}/);
305
+ if (objectMatch) candidates.push(objectMatch[0]);
306
+
307
+ for (const candidate of candidates) {
308
+ try {
309
+ return JSON.parse(candidate);
310
+ } catch {
311
+ // try next candidate
312
+ }
313
+ }
314
+ return null;
315
+ }
316
+
317
+ function normalizeSynthesisPayload(payload, sources, fallbackAnswer = '') {
318
+ const sourceIds = new Set(sources.map(source => source.id));
319
+ const agreementLevel = ['high', 'medium', 'low', 'mixed', 'conflicting'].includes(payload?.agreement?.level)
320
+ ? payload.agreement.level
321
+ : 'mixed';
322
+ const claims = Array.isArray(payload?.claims)
323
+ ? payload.claims.map(claim => ({
324
+ claim: trimText(claim?.claim || '', 260),
325
+ support: ['strong', 'moderate', 'weak', 'conflicting'].includes(claim?.support) ? claim.support : 'moderate',
326
+ sourceIds: Array.isArray(claim?.sourceIds) ? claim.sourceIds.filter(id => sourceIds.has(id)) : [],
327
+ })).filter(claim => claim.claim)
328
+ : [];
329
+ const recommendedSources = Array.isArray(payload?.recommendedSources)
330
+ ? payload.recommendedSources.filter(id => sourceIds.has(id)).slice(0, 6)
331
+ : [];
332
+
333
+ return {
334
+ answer: trimText(payload?.answer || fallbackAnswer, 4000),
335
+ agreement: {
336
+ level: agreementLevel,
337
+ summary: trimText(payload?.agreement?.summary || '', 280),
338
+ },
339
+ differences: Array.isArray(payload?.differences)
340
+ ? payload.differences.map(item => trimText(item, 220)).filter(Boolean).slice(0, 5)
341
+ : [],
342
+ caveats: Array.isArray(payload?.caveats)
343
+ ? payload.caveats.map(item => trimText(item, 220)).filter(Boolean).slice(0, 5)
344
+ : [],
345
+ claims,
346
+ recommendedSources,
347
+ };
348
+ }
349
+
350
+ function buildSynthesisPrompt(query, results, sources, { grounded = false } = {}) {
351
+ const engineSummaries = {};
352
+ for (const engine of ['perplexity', 'bing', 'google']) {
353
+ const result = results[engine];
354
+ if (!result) continue;
355
+ if (result.error) {
356
+ engineSummaries[engine] = { status: 'error', error: String(result.error) };
357
+ continue;
358
+ }
359
+
360
+ engineSummaries[engine] = {
361
+ status: 'ok',
362
+ answer: trimText(result.answer || '', grounded ? 4500 : 2200),
363
+ sourceIds: sources
364
+ .filter(source => source.engines.includes(engine))
365
+ .sort((a, b) => (a.perEngine[engine]?.rank || 99) - (b.perEngine[engine]?.rank || 99))
366
+ .map(source => source.id)
367
+ .slice(0, 6),
368
+ };
369
+ }
370
+
371
+ const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map(source => ({
372
+ id: source.id,
373
+ title: source.title,
374
+ domain: source.domain,
375
+ canonicalUrl: source.canonicalUrl,
376
+ sourceType: source.sourceType,
377
+ isOfficial: source.isOfficial,
378
+ engines: source.engines,
379
+ engineCount: source.engineCount,
380
+ perEngine: source.perEngine,
381
+ fetch: grounded && source.fetch?.attempted ? {
382
+ ok: source.fetch.ok,
383
+ status: source.fetch.status,
384
+ lastModified: source.fetch.lastModified,
385
+ snippet: trimText(source.fetch.snippet || '', 700),
386
+ } : undefined,
387
+ }));
388
+
389
+ return [
390
+ 'You are synthesizing results from Perplexity, Bing Copilot, and Google AI.',
391
+ grounded
392
+ ? 'Use the fetched source snippets as the strongest evidence. Use engine answers for perspective and conflict detection.'
393
+ : 'Use the engine answers for perspective. Use the source registry for provenance and citations.',
394
+ 'Prefer official docs, release notes, repositories, and maintainer-authored sources when available.',
395
+ 'If the engines disagree, say so explicitly.',
396
+ 'Do not invent sources. Only reference source IDs from the source registry.',
397
+ 'Return valid JSON only. No markdown fences, no prose outside the JSON object.',
398
+ '',
399
+ 'JSON schema:',
400
+ '{',
401
+ ' "answer": "short direct answer",',
402
+ ' "agreement": { "level": "high|medium|low|mixed|conflicting", "summary": "..." },',
403
+ ' "differences": ["..."],',
404
+ ' "caveats": ["..."],',
405
+ ' "claims": [',
406
+ ' { "claim": "...", "support": "strong|moderate|weak|conflicting", "sourceIds": ["S1"] }',
407
+ ' ],',
408
+ ' "recommendedSources": ["S1", "S2"]',
409
+ '}',
410
+ '',
411
+ `User query: ${query}`,
412
+ '',
413
+ `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
414
+ '',
415
+ `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
416
+ ].join('\n');
417
+ }
418
+
419
+ function buildConfidence(out) {
420
+ const sources = Array.isArray(out._sources) ? out._sources : [];
421
+ const topConsensus = sources.length > 0 ? sources[0]?.engineCount || 0 : 0;
422
+ const officialSourceCount = sources.filter(source => source.isOfficial).length;
423
+ const firstPartySourceCount = sources.filter(source => source.isOfficial || source.sourceType === 'maintainer-blog').length;
424
+ const fetchedAttempted = sources.filter(source => source.fetch?.attempted).length;
425
+ const fetchedSucceeded = sources.filter(source => source.fetch?.ok).length;
426
+ const sourceTypeBreakdown = sources.reduce((acc, source) => {
427
+ acc[source.sourceType] = (acc[source.sourceType] || 0) + 1;
428
+ return acc;
429
+ }, {});
430
+ const synthesisLevel = out._synthesis?.agreement?.level;
431
+
432
+ return {
433
+ sourcesCount: sources.length,
434
+ topSourceConsensus: topConsensus,
435
+ agreementLevel: synthesisLevel || (topConsensus >= 3 ? 'high' : topConsensus >= 2 ? 'medium' : 'low'),
436
+ enginesResponded: ALL_ENGINES.filter(engine => out[engine]?.answer && !out[engine]?.error),
437
+ enginesFailed: ALL_ENGINES.filter(engine => out[engine]?.error),
438
+ officialSourceCount,
439
+ firstPartySourceCount,
440
+ fetchedSourceSuccessRate: fetchedAttempted > 0 ? Number((fetchedSucceeded / fetchedAttempted).toFixed(2)) : 0,
441
+ sourceTypeBreakdown,
442
+ };
443
+ }
444
+
445
+ function getFullTabFromCache(engine) {
446
+ try {
447
+ if (!existsSync(PAGES_CACHE)) return null;
448
+ const pages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
449
+ const found = pages.find(p => p.url.includes(ENGINE_DOMAINS[engine]));
450
+ return found ? found.targetId : null;
451
+ } catch { return null; }
452
+ }
453
+
454
+ function cdp(args, timeoutMs = 15000) {
455
+ return new Promise((resolve, reject) => {
456
+ const proc = spawn('node', [CDP, ...args], { stdio: ['ignore', 'pipe', 'pipe'] });
457
+ let out = '', err = '';
458
+ proc.stdout.on('data', d => out += d);
459
+ proc.stderr.on('data', d => err += d);
460
+ const t = setTimeout(() => { proc.kill(); reject(new Error(`cdp timeout: ${args[0]}`)); }, timeoutMs);
461
+ proc.on('close', code => {
462
+ clearTimeout(t);
463
+ if (code !== 0) reject(new Error(err.trim() || `cdp exit ${code}`));
464
+ else resolve(out.trim());
465
+ });
466
+ });
467
+ }
468
+
469
+ async function getAnyTab() {
470
+ const list = await cdp(['list']);
471
+ const first = list.split('\n')[0];
472
+ if (!first) throw new Error('No Chrome tabs found');
473
+ return first.slice(0, 8);
474
+ }
475
+
476
+ async function getOrReuseBlankTab() {
477
+ // Reuse an existing about:blank tab rather than always creating a new one
478
+ const listOut = await cdp(['list']);
479
+ const lines = listOut.split('\n').filter(Boolean);
480
+ for (const line of lines) {
481
+ if (line.includes('about:blank')) {
482
+ return line.slice(0, 8); // prefix of the blank tab's targetId
483
+ }
484
+ }
485
+ // No blank tab — open a new one
486
+ const anchor = await getAnyTab();
487
+ const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
488
+ const { targetId } = JSON.parse(raw);
489
+ return targetId;
490
+ }
491
+
492
+ async function openNewTab() {
493
+ const anchor = await getAnyTab();
494
+ const raw = await cdp(['evalraw', anchor, 'Target.createTarget', '{"url":"about:blank"}']);
495
+ const { targetId } = JSON.parse(raw);
496
+ return targetId;
497
+ }
498
+
499
+ async function getOrOpenEngineTab(engine) {
500
+ await cdp(['list']);
501
+ return getFullTabFromCache(engine) || openNewTab();
502
+ }
503
+
504
+ async function activateTab(targetId) {
505
+ try {
506
+ const anchor = await getAnyTab();
507
+ await cdp(['evalraw', anchor, 'Target.activateTarget', JSON.stringify({ targetId })]);
508
+ } catch {
509
+ // best-effort
510
+ }
511
+ }
512
+
513
+ async function closeTabs(targetIds = []) {
514
+ for (const targetId of targetIds) {
515
+ if (!targetId) continue;
516
+ await closeTab(targetId);
517
+ }
518
+ if (targetIds.length > 0) {
519
+ await new Promise(r => setTimeout(r, 300));
520
+ await cdp(['list']).catch(() => null);
521
+ }
522
+ }
523
+
524
+ async function closeTab(targetId) {
525
+ try {
526
+ const anchor = await getAnyTab();
527
+ await cdp(['evalraw', anchor, 'Target.closeTarget', JSON.stringify({ targetId })]);
528
+ } catch { /* best-effort */ }
529
+ }
530
+
531
+ function runExtractor(script, query, tabPrefix = null, short = false, timeoutMs = 90000) {
532
+ const extraArgs = [
533
+ ...(tabPrefix ? ['--tab', tabPrefix] : []),
534
+ ...(short ? ['--short'] : []),
535
+ ];
536
+ return new Promise((resolve, reject) => {
537
+ const proc = spawn('node', [join(__dir, 'extractors', script), query, ...extraArgs], {
538
+ stdio: ['ignore', 'pipe', 'pipe'],
539
+ });
540
+ let out = '';
541
+ let err = '';
542
+ proc.stdout.on('data', d => out += d);
543
+ proc.stderr.on('data', d => err += d);
544
+ const t = setTimeout(() => {
545
+ proc.kill();
546
+ reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
547
+ }, timeoutMs);
548
+ proc.on('close', code => {
549
+ clearTimeout(t);
550
+ if (code !== 0) reject(new Error(err.trim() || `extractor exit ${code}`));
551
+ else {
552
+ try { resolve(JSON.parse(out.trim())); }
553
+ catch { reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`)); }
554
+ }
555
+ });
556
+ });
557
+ }
558
+
559
+
560
+ async function fetchTopSource(url) {
561
+ const tab = await openNewTab();
562
+ await cdp(['list']); // refresh cache so the new tab is findable
563
+ try {
564
+ await cdp(['nav', tab, url], 30000);
565
+ await new Promise(r => setTimeout(r, 1500));
566
+ const content = await cdp(['eval', tab, `
567
+ (function(){
568
+ var el = document.querySelector('article, [role="main"], main, .post-content, .article-body, #content, .content');
569
+ var text = (el || document.body).innerText;
570
+ return text.replace(/\\s+/g, ' ').trim();
571
+ })()
572
+ `]);
573
+ return { url, content };
574
+ } catch (e) {
575
+ return { url, content: null, error: e.message };
576
+ } finally {
577
+ await closeTab(tab);
578
+ }
579
+ }
580
+
581
+ async function fetchSourceContent(url, maxChars = 5000) {
582
+ try {
583
+ const controller = new AbortController();
584
+ const timeout = setTimeout(() => controller.abort(), 15000);
585
+
586
+ const res = await fetch(url, {
587
+ signal: controller.signal,
588
+ headers: {
589
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
590
+ 'Accept': 'text/html,application/xhtml+xml',
591
+ 'Accept-Language': 'en-US,en;q=0.9',
592
+ },
593
+ });
594
+ clearTimeout(timeout);
595
+
596
+ if (!res.ok) throw new Error(`HTTP ${res.status}`);
597
+
598
+ const html = await res.text();
599
+
600
+ // Simple HTML extraction - remove tags and extract text
601
+ const content = html
602
+ .replace(/<script[\s\S]*?<\/script>/gi, '')
603
+ .replace(/<style[\s\S]*?<\/style>/gi, '')
604
+ .replace(/<nav[\s\S]*?<\/nav>/gi, '')
605
+ .replace(/<header[\s\S]*?<\/header>/gi, '')
606
+ .replace(/<footer[\s\S]*?<\/footer>/gi, '')
607
+ .replace(/<[^>]+>/g, ' ')
608
+ .replace(/&[a-z]+;/gi, ' ')
609
+ .replace(/\s+/g, ' ')
610
+ .trim()
611
+ .slice(0, maxChars);
612
+
613
+ // Extract title
614
+ const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
615
+ const title = titleMatch ? titleMatch[1].trim() : '';
616
+ const finalUrl = res.url || url;
617
+ const snippet = trimText(content, 320);
618
+
619
+ return {
620
+ url,
621
+ finalUrl,
622
+ status: res.status,
623
+ contentType: res.headers.get('content-type') || '',
624
+ lastModified: res.headers.get('last-modified') || '',
625
+ title,
626
+ snippet,
627
+ content,
628
+ contentChars: content.length,
629
+ };
630
+ } catch (e) {
631
+ return { url, title: '', content: null, snippet: '', contentChars: 0, error: e.message };
632
+ }
633
+ }
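A sketch of calling the plain-fetch reader above; the success and failure shapes mirror the two return statements in the function.

    const page = await fetchSourceContent('https://example.com/article', 5000);
    // success: { url, finalUrl, status, contentType, lastModified, title, snippet, content, contentChars }
    // failure: { url, title: '', content: null, snippet: '', contentChars: 0, error }
    if (page.content) console.log(`${page.title}: ${page.contentChars} chars`);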
634
+
635
+ async function fetchMultipleSources(sources, maxSources = 5, maxChars = 5000) {
636
+ process.stderr.write(`[greedysearch] Fetching content from ${Math.min(sources.length, maxSources)} sources...\n`);
637
+
638
+ // Fetch sources sequentially (keeps per-source progress readable; note this path uses plain fetch(), not CDP tabs)
639
+ const toFetch = sources.slice(0, maxSources);
640
+ const fetched = [];
641
+
642
+ for (let i = 0; i < toFetch.length; i++) {
643
+ const s = toFetch[i];
644
+ process.stderr.write(`[greedysearch] Fetching ${i + 1}/${toFetch.length}: ${(s.canonicalUrl || s.url).slice(0, 60)}...\n`);
645
+ try {
646
+ const result = await fetchSourceContent(s.canonicalUrl || s.url, maxChars);
647
+ fetched.push({ id: s.id, ...result });
648
+ if (result.content && result.content.length > 100) {
649
+ process.stderr.write(`[greedysearch] ✓ Got ${result.content.length} chars\n`);
650
+ } else {
651
+ process.stderr.write(`[greedysearch] ✗ Empty or too short\n`);
652
+ }
653
+ } catch (e) {
654
+ fetched.push({ id: s.id, url: s.canonicalUrl || s.url, error: e.message });
655
+ process.stderr.write(`[greedysearch] ✗ Failed: ${e.message.slice(0, 80)}\n`);
656
+ }
657
+ process.stderr.write(`PROGRESS:fetch:${i + 1}/${toFetch.length}\n`);
658
+ }
659
+
660
+ return fetched;
661
+ }
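Both the fetch loop above and the per-engine runs below report machine-readable progress on stderr as `PROGRESS:<stage>:<detail>` lines. A minimal sketch of a wrapping process that consumes them; the wrapper itself is hypothetical, not part of this package, and it assumes each stderr chunk arrives as whole lines.

    import { spawn } from 'child_process';

    const child = spawn('node', ['search.mjs', 'all', 'TCP congestion control'], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    child.stderr.on('data', chunk => {
      for (const line of String(chunk).split('\n')) {
        const m = line.match(/^PROGRESS:([^:]+):(.+)$/);
        if (m) console.log(`stage=${m[1]} detail=${m[2]}`); // e.g. stage=fetch detail=3/5
      }
    });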
662
+
663
+ function pickTopSource(out) {
664
+ if (Array.isArray(out._sources) && out._sources.length > 0) return out._sources[0];
665
+ for (const engine of ['perplexity', 'google', 'bing']) {
666
+ const r = out[engine];
667
+ if (r?.sources?.length > 0) return r.sources[0];
668
+ }
669
+ return null;
670
+ }
671
+
672
+ async function synthesizeWithGemini(query, results, { grounded = false, tabPrefix = null } = {}) {
673
+ const sources = Array.isArray(results._sources) ? results._sources : buildSourceRegistry(results);
674
+ const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
675
+
676
+ return new Promise((resolve, reject) => {
677
+ const extraArgs = tabPrefix ? ['--tab', String(tabPrefix)] : [];
678
+ const proc = spawn('node', [join(__dir, 'extractors', 'gemini.mjs'), prompt, ...extraArgs], {
679
+ stdio: ['ignore', 'pipe', 'pipe'],
680
+ });
681
+ let out = '';
682
+ let err = '';
683
+ proc.stdout.on('data', d => out += d);
684
+ proc.stderr.on('data', d => err += d);
685
+ const t = setTimeout(() => {
686
+ proc.kill();
687
+ reject(new Error('Gemini synthesis timed out after 180s'));
688
+ }, 180000);
689
+ proc.on('close', code => {
690
+ clearTimeout(t);
691
+ if (code !== 0) reject(new Error(err.trim() || 'gemini extractor failed'));
692
+ else {
693
+ try {
694
+ const raw = JSON.parse(out.trim());
695
+ const structured = parseStructuredJson(raw.answer || '');
696
+ resolve({
697
+ ...normalizeSynthesisPayload(structured, sources, raw.answer || ''),
698
+ rawAnswer: raw.answer || '',
699
+ geminiSources: raw.sources || [],
700
+ });
701
+ }
702
+ catch {
703
+ reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
704
+ }
705
+ }
706
+ });
707
+ });
708
+ }
709
+
710
+ function slugify(query) {
711
+ return query.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '').slice(0, 60);
712
+ }
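Two worked examples of the slug transform:

    // slugify('How does TCP congestion control work?') -> 'how-does-tcp-congestion-control-work'
    // slugify('RAG vs. fine-tuning')                   -> 'rag-vs-fine-tuning'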
713
+
714
+ function resultsDir() {
715
+ const dir = join(__dir, 'results');
716
+ mkdirSync(dir, { recursive: true });
717
+ return dir;
718
+ }
719
+
720
+ function writeOutput(data, outFile, { inline = false, synthesize = false, query = '' } = {}) {
721
+ const json = JSON.stringify(data, null, 2) + '\n';
722
+
723
+ if (outFile) {
724
+ writeFileSync(outFile, json, 'utf8');
725
+ process.stderr.write(`Results written to ${outFile}\n`);
726
+ return;
727
+ }
728
+
729
+ if (inline) {
730
+ process.stdout.write(json);
731
+ return;
732
+ }
733
+
734
+ const ts = new Date().toISOString().replace('T', '_').replace(/[:.]/g, '-').slice(0, 19);
735
+ const slug = slugify(query);
736
+ const base = join(resultsDir(), `${ts}_${slug}`);
737
+
738
+ writeFileSync(`${base}.json`, json, 'utf8');
739
+
740
+ if (synthesize && data._synthesis?.answer) {
741
+ writeFileSync(`${base}-synthesis.md`, data._synthesis.answer, 'utf8');
742
+ process.stdout.write(`${base}-synthesis.md\n`);
743
+ } else {
744
+ process.stdout.write(`${base}.json\n`);
745
+ }
746
+ }
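To make the default naming concrete: a query of "RAG vs fine-tuning" issued at 2024-05-01T12:30:45Z (illustrative date) would produce paths like:

    // results/2024-05-01_12-30-45_rag-vs-fine-tuning.json
    // results/2024-05-01_12-30-45_rag-vs-fine-tuning-synthesis.md   (only when synthesis produced an answer)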
747
+
748
+ const GREEDY_PROFILE_DIR = `${tmpdir().replace(/\\/g, '/')}/greedysearch-chrome-profile`;
749
+ const ACTIVE_PORT_FILE = `${GREEDY_PROFILE_DIR}/DevToolsActivePort`;
750
+
751
+ // Tell cdp.mjs to prefer the GreedySearch Chrome profile's DevToolsActivePort,
752
+ // so searches never accidentally attach to the user's main Chrome session.
753
+ process.env.CDP_PROFILE_DIR = GREEDY_PROFILE_DIR;
754
+
755
+ function probeGreedyChrome(timeoutMs = 3000) {
756
+ return new Promise(resolve => {
757
+ const req = http.get(`http://localhost:${GREEDY_PORT}/json/version`, res => {
758
+ res.resume();
759
+ resolve(res.statusCode === 200);
760
+ });
761
+ req.on('error', () => resolve(false));
762
+ req.setTimeout(timeoutMs, () => { req.destroy(); resolve(false); });
763
+ });
764
+ }
765
+
766
+ // Write (or refresh) the DevToolsActivePort file for the GreedySearch Chrome so
767
+ // cdp.mjs always connects to the right port rather than the user's main Chrome.
768
+ // Uses atomic write (write to temp + rename) to prevent corruption from parallel processes.
769
+ async function refreshPortFile() {
770
+ const LOCK_FILE = ACTIVE_PORT_FILE + '.lock';
771
+ const TEMP_FILE = ACTIVE_PORT_FILE + '.tmp';
772
+
773
+ // Simple file-based lock with timeout (prevents parallel writes from corrupting the port file)
774
+ const lockAcquired = await new Promise((resolve) => {
775
+ const start = Date.now();
776
+ const tryLock = () => {
777
+ try {
778
+ writeFileSync(LOCK_FILE, `${process.pid}`, 'utf8');
779
+ resolve(true);
780
+ } catch {
781
+ // Lock file exists - check if stale (older than 5 seconds)
782
+ try {
783
+ const lockTime = parseInt(readFileSync(LOCK_FILE, 'utf8'));
784
+ if (Date.now() - lockTime > 5000) {
785
+ // Stale lock - overwrite
786
+ writeFileSync(LOCK_FILE, `${process.pid}`, 'utf8');
787
+ resolve(true);
788
+ } else if (Date.now() - start < 1000) {
789
+ setTimeout(tryLock, 50);
790
+ } else {
791
+ resolve(false); // Give up after 1s
792
+ }
793
+ } catch {
794
+ setTimeout(tryLock, 50);
795
+ }
796
+ }
797
+ };
798
+ tryLock();
799
+ });
800
+
801
+ try {
802
+ const body = await new Promise((res, rej) => {
803
+ const req = http.get(`http://localhost:${GREEDY_PORT}/json/version`, r => {
804
+ let b = '';
805
+ r.on('data', d => b += d);
806
+ r.on('end', () => res(b));
807
+ });
808
+ req.on('error', rej);
809
+ req.setTimeout(3000, () => { req.destroy(); rej(new Error('timeout')); });
810
+ });
811
+ const { webSocketDebuggerUrl } = JSON.parse(body);
812
+ const wsPath = new URL(webSocketDebuggerUrl).pathname;
813
+
814
+ // Atomic write: write to temp file, then rename
815
+ if (lockAcquired) {
816
+ writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, 'utf8');
817
+ try { unlinkSync(ACTIVE_PORT_FILE); } catch {}
818
+ renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
819
+ }
820
+ } catch { /* best-effort — launch.mjs already wrote the file on first start */ }
821
+ finally {
822
+ if (lockAcquired) {
823
+ try { unlinkSync(LOCK_FILE); } catch {}
824
+ }
825
+ }
826
+ }
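After a successful refresh, the file at `${GREEDY_PROFILE_DIR}/DevToolsActivePort` has the same two-line shape Chrome writes on launch: the debugging port, then the browser WebSocket path taken from /json/version. For example (UUID illustrative):

    9222
    /devtools/browser/7c3f1d2a-5b9e-4c61-9f3a-0d2e8c4b1a77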
827
+
828
+ async function ensureChrome() {
829
+ const ready = await probeGreedyChrome();
830
+ if (!ready) {
831
+ process.stderr.write(`GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`);
832
+ await new Promise((resolve, reject) => {
833
+ const proc = spawn('node', [join(__dir, 'launch.mjs')], { stdio: ['ignore', process.stderr, process.stderr] });
834
+ proc.on('close', code => code === 0 ? resolve() : reject(new Error('launch.mjs failed')));
835
+ });
836
+ } else {
837
+ // Chrome already running — refresh the port file so cdp.mjs always picks
838
+ // up the right port, even if the file was stale from a previous session.
839
+ await refreshPortFile();
840
+ }
841
+ }
842
+
843
+ async function main() {
844
+ const args = process.argv.slice(2);
845
+ if (args.length < 2 || args[0] === '--help') {
846
+ process.stderr.write([
847
+ 'Usage: node search.mjs <engine> "<query>"',
848
+ '',
849
+ 'Engines: perplexity (p), bing (b), google (g), gemini (gem), all',
850
+ '',
851
+ 'Flags:',
852
+ ' --full Return complete answers (~3000+ chars)',
853
+ ' --synthesize Synthesize results via Gemini (adds ~30s)',
854
+ ' --deep-research Full research: full answers + source fetching + synthesis',
855
+ ' --fetch-top-source Fetch content from top source',
856
+ ' --inline Output JSON to stdout (for piping)',
857
+ '',
858
+ 'Examples:',
859
+ ' node search.mjs p "what is memoization"',
860
+ ' node search.mjs all "TCP congestion control"',
861
+ ' node search.mjs all "RAG vs fine-tuning" --deep-research',
862
+ ].join('\n') + '\n');
863
+ process.exit(1);
864
+ }
865
+
866
+ await ensureChrome();
867
+
868
+ const full = args.includes('--full') || args.includes('--deep-research');
869
+ const short = !full;
870
+ const fetchSource = args.includes('--fetch-top-source');
871
+ const synthesize = args.includes('--synthesize') || args.includes('--deep-research');
872
+ const deepResearch = args.includes('--deep-research');
873
+ const inline = args.includes('--inline');
874
+ const outIdx = args.indexOf('--out');
875
+ const outFile = outIdx !== -1 ? args[outIdx + 1] : null;
876
+ const rest = args.filter((a, i) =>
877
+ a !== '--full' &&
878
+ a !== '--short' &&
879
+ a !== '--fetch-top-source' &&
880
+ a !== '--synthesize' &&
881
+ a !== '--deep-research' &&
882
+ a !== '--inline' &&
883
+ a !== '--out' &&
884
+ (outIdx === -1 || i !== outIdx + 1)
885
+ );
886
+ const engine = rest[0].toLowerCase();
887
+ const query = rest.slice(1).join(' ');
888
+
889
+ if (engine === 'all') {
890
+ await cdp(['list']); // refresh pages cache
891
+
892
+ // PARALLEL-SAFE: Always create fresh tabs for each engine to avoid race conditions
893
+ // when multiple "all" searches run concurrently. Previously, reusing cached tabs
894
+ // caused ERR_ABORTED and Uncaught errors as multiple processes fought over the same tab.
895
+ const tabs = [];
896
+ for (let i = 0; i < ALL_ENGINES.length; i++) {
897
+ if (i > 0) await new Promise(r => setTimeout(r, 300)); // small delay between tab opens
898
+ const tab = await openNewTab();
899
+ tabs.push(tab);
900
+ }
901
+
902
+ // All tabs assigned — run extractors in parallel
903
+ try {
904
+ const results = await Promise.allSettled(
905
+ ALL_ENGINES.map((e, i) =>
906
+ runExtractor(ENGINES[e], query, tabs[i], short)
907
+ .then(r => {
908
+ process.stderr.write(`PROGRESS:${e}:done\n`);
909
+ return { engine: e, ...r };
910
+ })
911
+ .catch(err => {
912
+ process.stderr.write(`PROGRESS:${e}:error\n`);
913
+ throw err;
914
+ })
915
+ )
916
+ );
917
+
918
+ const out = {};
919
+ for (let i = 0; i < results.length; i++) {
920
+ const r = results[i];
921
+ if (r.status === 'fulfilled') {
922
+ out[r.value.engine] = r.value;
923
+ } else {
924
+ out[ALL_ENGINES[i]] = { error: r.reason?.message || 'unknown error' };
925
+ }
926
+ }
927
+
928
+ await closeTabs(tabs);
929
+
930
+ // Build a canonical source registry across all engines
931
+ out._sources = buildSourceRegistry(out);
932
+
933
+ if (deepResearch) {
934
+ process.stderr.write('PROGRESS:deep-research:start\n');
935
+ const fetchedSources = out._sources.length > 0
936
+ ? await fetchMultipleSources(out._sources, 5, 8000)
937
+ : [];
938
+
939
+ out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
940
+ out._fetchedSources = fetchedSources;
941
+ process.stderr.write(out._sources.length > 0 ? 'PROGRESS:deep-research:done\n' : 'PROGRESS:deep-research:no-sources\n');
942
+ }
943
+
944
+ // Synthesize with Gemini if requested
945
+ if (synthesize) {
946
+ process.stderr.write('PROGRESS:synthesis:start\n');
947
+ process.stderr.write('[greedysearch] Synthesizing results with Gemini...\n');
948
+ try {
949
+ // Create a fresh Gemini tab per search (not cached) to avoid conflicts between parallel searches
950
+ const geminiTab = await openNewTab();
951
+ tabs.push(geminiTab); // ensure cleanup in finally block
952
+ await activateTab(geminiTab);
953
+ const synthesis = await synthesizeWithGemini(query, out, { grounded: deepResearch, tabPrefix: geminiTab });
954
+ out._synthesis = {
955
+ ...synthesis,
956
+ synthesized: true,
957
+ };
958
+ process.stderr.write('PROGRESS:synthesis:done\n');
959
+ } catch (e) {
960
+ process.stderr.write(`[greedysearch] Synthesis failed: ${e.message}\n`);
961
+ out._synthesis = { error: e.message, synthesized: false };
962
+ }
963
+ }
964
+
965
+ if (fetchSource) {
966
+ const top = pickTopSource(out);
967
+ if (top) out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
968
+ }
969
+
970
+ if (deepResearch) out._confidence = buildConfidence(out);
971
+
972
+ writeOutput(out, outFile, { inline, synthesize, query });
973
+ return;
974
+ } finally {
975
+ await closeTabs(tabs);
976
+ }
977
+ }
978
+
979
+ const script = ENGINES[engine];
980
+ if (!script) {
981
+ process.stderr.write(`Unknown engine: "${engine}"\nAvailable: ${Object.keys(ENGINES).join(', ')}\n`);
982
+ process.exit(1);
983
+ }
984
+
985
+ try {
986
+ const result = await runExtractor(script, query, null, short);
987
+ if (fetchSource && result.sources?.length > 0) {
988
+ result.topSource = await fetchTopSource(result.sources[0].url);
989
+ }
990
+ writeOutput(result, outFile, { inline, synthesize, query });
991
+ } catch (e) {
992
+ process.stderr.write(`Error: ${e.message}\n`);
993
+ process.exit(1);
994
+ }
995
+ }
996
+
997
+ main();
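For reference, a sketch of the top-level JSON that `all` mode writes, based on the keys assigned in main() (values abbreviated; optional keys noted). An engine that failed is replaced by `{ "error": "..." }`.

    {
      "perplexity": { "engine": "perplexity", ... },
      "bing":       { "engine": "bing", ... },
      "google":     { "engine": "google", ... },
      "_sources": [ ... ],                        // canonical cross-engine source registry
      "_fetchedSources": [ ... ],                 // --deep-research only
      "_synthesis": { "synthesized": true, ... }, // --synthesize / --deep-research
      "_topSource": { "url": ..., ... },          // --fetch-top-source only
      "_confidence": { ... }                      // --deep-research only
    }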