@apmantza/greedysearch-pi 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,12 +4,13 @@ Pi extension that adds a `greedy_search` tool — fans out queries to Perplexity
4
4
 
5
5
  Forked from [GreedySearch-claude](https://github.com/apmantza/GreedySearch-claude).
6
6
 
7
- ## What's New (v1.2.0)
7
+ ## What's New (v1.4.0)
8
8
 
9
- - **Fixed parallel search race condition** — multiple `greedy_search` calls can now run concurrently without tab conflicts
10
- - **Improved Bing Copilot verification** — better auto-handling of Turnstile challenges and modal dialogs
11
- - **Added test suite** — run `./test.sh` to verify all modes work correctly
12
- - **Atomic port file writes** — prevents corruption when multiple processes connect to Chrome
9
+ - **Grounded synthesis** — Gemini now receives a normalized source registry with stable source IDs, agreement summaries, caveats, and cited claims
10
+ - **Real deep research** — top sources are fetched before synthesis so deep research answers are grounded in fetched evidence, not just engine summaries
11
+ - **Richer source metadata** — source output now includes canonical URLs, domains, source types, per-engine attribution, and confidence metadata
12
+ - **Cleaner tab lifecycle** — temporary Perplexity, Bing, and Google tabs are closed after each fan-out search, and synthesis finishes on the Gemini tab
13
+ - **Isolated Chrome targeting** — GreedySearch now refuses to fall back to your normal Chrome session, preventing stray remote-debugging prompts
13
14
 
14
15
  ## Install
15
16
 
@@ -69,7 +70,15 @@ For complex research questions, use `synthesize: true` with `engine: "all"`:
69
70
  greedy_search({ query: "best auth patterns for SaaS in 2026", engine: "all", synthesize: true })
70
71
  ```
71
72
 
72
- This deduplicates sources across engines and feeds them to Gemini for one clean, synthesized answer. Adds ~30s but produces the highest quality output with deduped sources showing consensus scores (`[2/3]`, `[3/3]`).
73
+ This deduplicates sources across engines, builds a normalized source registry, and feeds that context to Gemini for one clean synthesized answer. Adds ~30s but now returns agreement summaries, caveats, key claims, and better-labeled top sources.
74
+
75
+ For the most grounded mode, use deep research from the CLI:
76
+
77
+ ```bash
78
+ node search.mjs all "best auth patterns for SaaS in 2026" --deep-research
79
+ ```
80
+
81
+ Deep research fetches top source pages before synthesis and reports source confidence metadata such as agreement level, fetched-source success rate, and source mix.
73
82
 
74
83
  **Use synthesis when:**
75
84
  - You need one definitive answer, not multiple perspectives
@@ -112,7 +121,7 @@ greedy_search({ query: "Error: Cannot find module 'react-dom/client' Next.js 15"
112
121
 
113
122
  ## Requirements
114
123
 
115
- - **Chrome** — must be installed. The extension auto-launches a dedicated Chrome instance on port 9222 (separate from your main browser session).
124
+ - **Chrome** — must be installed. The extension auto-launches a dedicated Chrome instance on port 9222 with its own isolated profile and DevTools port file, separate from your main browser session.
116
125
  - **Node.js 22+** — for built-in `fetch` and WebSocket support.
117
126
 
118
127
  ## Setup (first time)
package/cdp.mjs CHANGED
@@ -37,21 +37,22 @@ function getDevToolsActivePortPath() {
37
37
  return join(homedir(), '.config', 'google-chrome', 'DevToolsActivePort');
38
38
  }
39
39
 
40
- function getWsUrl() {
41
- // If CDP_PROFILE_DIR is set (by search.mjs), prefer that profile's port file
42
- // so GreedySearch targets its own Chrome, not the user's main session.
43
- const profileDir = process.env.CDP_PROFILE_DIR;
44
- if (profileDir) {
45
- const p = profileDir.replace(/\\/g, '/') + '/DevToolsActivePort';
46
- if (existsSync(p)) {
47
- const lines = readFileSync(p, 'utf8').trim().split('\n');
48
- return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
49
- }
50
- }
51
- const portFile = getDevToolsActivePortPath();
52
- const lines = readFileSync(portFile, 'utf8').trim().split('\n');
53
- return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
54
- }
40
+ function getWsUrl() {
41
+ // If CDP_PROFILE_DIR is set (by search.mjs), prefer that profile's port file
42
+ // so GreedySearch targets its own Chrome, not the user's main session.
43
+ const profileDir = process.env.CDP_PROFILE_DIR;
44
+ if (profileDir) {
45
+ const p = profileDir.replace(/\\/g, '/') + '/DevToolsActivePort';
46
+ if (existsSync(p)) {
47
+ const lines = readFileSync(p, 'utf8').trim().split('\n');
48
+ return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
49
+ }
50
+ throw new Error(`GreedySearch DevToolsActivePort not found at ${p}. Refusing to fall back to the main Chrome session.`);
51
+ }
52
+ const portFile = getDevToolsActivePortPath();
53
+ const lines = readFileSync(portFile, 'utf8').trim().split('\n');
54
+ return `ws://127.0.0.1:${lines[0]}${lines[1]}`;
55
+ }
55
56
 
56
57
  const sleep = (ms) => new Promise(r => setTimeout(r, ms));
57
58
 
package/index.ts CHANGED
@@ -68,31 +68,164 @@ function runSearch(
68
68
  });
69
69
  }
70
70
 
71
+ function formatEngineName(engine: string): string {
72
+ if (engine === "bing") return "Bing Copilot";
73
+ if (engine === "google") return "Google AI";
74
+ return engine.charAt(0).toUpperCase() + engine.slice(1);
75
+ }
76
+
77
+ function humanizeSourceType(sourceType: string): string {
78
+ if (!sourceType) return "";
79
+ if (sourceType === "official-docs") return "official docs";
80
+ return sourceType.replace(/-/g, " ");
81
+ }
82
+
83
+ function sourceUrl(source: Record<string, unknown>): string {
84
+ return String(source.displayUrl || source.canonicalUrl || source.url || "");
85
+ }
86
+
87
+ function sourceLabel(source: Record<string, unknown>): string {
88
+ return String(source.title || source.domain || sourceUrl(source) || "Untitled source");
89
+ }
90
+
91
+ function sourceConsensus(source: Record<string, unknown>): number {
92
+ if (typeof source.engineCount === "number") return source.engineCount;
93
+ const engines = Array.isArray(source.engines) ? (source.engines as string[]) : [];
94
+ return engines.length;
95
+ }
96
+
97
+ function formatAgreementLevel(level: string): string {
98
+ if (!level) return "Mixed";
99
+ return level.charAt(0).toUpperCase() + level.slice(1);
100
+ }
101
+
102
+ function getSourceMap(sources: Array<Record<string, unknown>>): Map<string, Record<string, unknown>> {
103
+ return new Map(
104
+ sources
105
+ .map((source) => [String(source.id || ""), source] as const)
106
+ .filter(([id]) => id),
107
+ );
108
+ }
109
+
110
+ function formatSourceLine(source: Record<string, unknown>): string {
111
+ const id = String(source.id || "?");
112
+ const url = sourceUrl(source);
113
+ const title = sourceLabel(source);
114
+ const domain = String(source.domain || "");
115
+ const engines = Array.isArray(source.engines) ? (source.engines as string[]) : [];
116
+ const consensus = sourceConsensus(source);
117
+ const typeLabel = humanizeSourceType(String(source.sourceType || ""));
118
+ const fetch = source.fetch as Record<string, unknown> | undefined;
119
+ const fetchStatus = fetch?.ok ? `fetched ${fetch.status || 200}` : fetch?.attempted ? "fetch failed" : "";
120
+ const pieces = [
121
+ `${id} - [${title}](${url})`,
122
+ domain,
123
+ typeLabel,
124
+ engines.length ? `cited by ${engines.map(formatEngineName).join(", ")} (${consensus}/3)` : `${consensus}/3`,
125
+ fetchStatus,
126
+ ].filter(Boolean);
127
+ return `- ${pieces.join(" - ")}`;
128
+ }
129
+
130
+ function renderSourceEvidence(lines: string[], source: Record<string, unknown>): void {
131
+ const fetch = source.fetch as Record<string, unknown> | undefined;
132
+ if (!fetch?.attempted) return;
133
+
134
+ const snippet = String(fetch.snippet || "").trim();
135
+ const lastModified = String(fetch.lastModified || "").trim();
136
+ if (snippet) lines.push(` Evidence: ${snippet}`);
137
+ if (lastModified) lines.push(` Last-Modified: ${lastModified}`);
138
+ if (fetch.error) lines.push(` Fetch error: ${String(fetch.error)}`);
139
+ }
140
+
141
+ function pickSources(
142
+ sources: Array<Record<string, unknown>>,
143
+ recommendedIds: string[] = [],
144
+ max = 6,
145
+ ): Array<Record<string, unknown>> {
146
+ if (!sources.length) return [];
147
+ const sourceMap = getSourceMap(sources);
148
+ const recommended = recommendedIds
149
+ .map((id) => sourceMap.get(id))
150
+ .filter((source): source is Record<string, unknown> => Boolean(source));
151
+ if (recommended.length > 0) return recommended.slice(0, max);
152
+ return sources.slice(0, max);
153
+ }
154
+
155
+ function renderSynthesis(
156
+ lines: string[],
157
+ synthesis: Record<string, unknown>,
158
+ sources: Array<Record<string, unknown>>,
159
+ maxSources = 6,
160
+ ): void {
161
+ if (synthesis.answer) {
162
+ lines.push("## Answer");
163
+ lines.push(String(synthesis.answer));
164
+ lines.push("");
165
+ }
166
+
167
+ const agreement = synthesis.agreement as Record<string, unknown> | undefined;
168
+ const agreementSummary = String(agreement?.summary || "").trim();
169
+ const agreementLevel = String(agreement?.level || "").trim();
170
+ if (agreementSummary || agreementLevel) {
171
+ lines.push("## Consensus");
172
+ lines.push(`- ${formatAgreementLevel(agreementLevel)}${agreementSummary ? ` - ${agreementSummary}` : ""}`);
173
+ lines.push("");
174
+ }
175
+
176
+ const differences = Array.isArray(synthesis.differences) ? (synthesis.differences as string[]) : [];
177
+ if (differences.length > 0) {
178
+ lines.push("## Where Engines Differ");
179
+ for (const difference of differences) lines.push(`- ${difference}`);
180
+ lines.push("");
181
+ }
182
+
183
+ const caveats = Array.isArray(synthesis.caveats) ? (synthesis.caveats as string[]) : [];
184
+ if (caveats.length > 0) {
185
+ lines.push("## Caveats");
186
+ for (const caveat of caveats) lines.push(`- ${caveat}`);
187
+ lines.push("");
188
+ }
189
+
190
+ const claims = Array.isArray(synthesis.claims)
191
+ ? (synthesis.claims as Array<Record<string, unknown>>)
192
+ : [];
193
+ if (claims.length > 0) {
194
+ lines.push("## Key Claims");
195
+ for (const claim of claims) {
196
+ const sourceIds = Array.isArray(claim.sourceIds) ? (claim.sourceIds as string[]) : [];
197
+ const support = String(claim.support || "moderate");
198
+ lines.push(`- ${String(claim.claim || "")} [${support}${sourceIds.length ? `; ${sourceIds.join(", ")}` : ""}]`);
199
+ }
200
+ lines.push("");
201
+ }
202
+
203
+ const recommendedIds = Array.isArray(synthesis.recommendedSources)
204
+ ? (synthesis.recommendedSources as string[])
205
+ : [];
206
+ const topSources = pickSources(sources, recommendedIds, maxSources);
207
+ if (topSources.length > 0) {
208
+ lines.push("## Top Sources");
209
+ for (const source of topSources) lines.push(formatSourceLine(source));
210
+ lines.push("");
211
+ }
212
+ }
213
+
71
214
  function formatResults(engine: string, data: Record<string, unknown>): string {
72
215
  const lines: string[] = [];
73
216
 
74
217
  if (engine === "all") {
75
- // Synthesized output: prefer _synthesis + _sources
76
218
  const synthesis = data._synthesis as Record<string, unknown> | undefined;
77
219
  const dedupedSources = data._sources as Array<Record<string, unknown>> | undefined;
78
220
  if (synthesis?.answer) {
79
- lines.push("## Synthesis");
80
- lines.push(String(synthesis.answer));
81
- if (dedupedSources?.length) {
82
- lines.push("\n**Top sources by consensus:**");
83
- for (const s of dedupedSources.slice(0, 6)) {
84
- const engines = (s.engines as string[]) || [];
85
- lines.push(`- [${s.title || s.url}](${s.url}) [${engines.length}/3]`);
86
- }
87
- }
88
- lines.push("\n---\n*Synthesized from Perplexity, Bing Copilot, and Google AI*");
221
+ renderSynthesis(lines, synthesis, dedupedSources || [], 6);
222
+ lines.push("*Synthesized from Perplexity, Bing Copilot, and Google AI*\n");
89
223
  return lines.join("\n").trim();
90
224
  }
91
225
 
92
- // Standard output: per-engine answers
93
226
  for (const [eng, result] of Object.entries(data)) {
94
227
  if (eng.startsWith("_")) continue;
95
- lines.push(`\n## ${eng.charAt(0).toUpperCase() + eng.slice(1)}`);
228
+ lines.push(`\n## ${formatEngineName(eng)}`);
96
229
  const r = result as Record<string, unknown>;
97
230
  if (r.error) {
98
231
  lines.push(`Error: ${r.error}`);
@@ -128,33 +261,42 @@ function formatResults(engine: string, data: Record<string, unknown>): string {
128
261
  function formatDeepResearch(data: Record<string, unknown>): string {
129
262
  const lines: string[] = [];
130
263
  const confidence = data._confidence as Record<string, unknown> | undefined;
131
- const fetchedSources = data._fetchedSources as Array<Record<string, unknown>> | undefined;
132
264
  const dedupedSources = data._sources as Array<Record<string, unknown>> | undefined;
265
+ const synthesis = data._synthesis as Record<string, unknown> | undefined;
133
266
 
134
267
  lines.push("# Deep Research Report\n");
135
268
 
136
- // Confidence summary
137
269
  if (confidence) {
138
270
  const enginesResponded = (confidence.enginesResponded as string[]) || [];
139
271
  const enginesFailed = (confidence.enginesFailed as string[]) || [];
140
- const consensusScore = confidence.consensusScore || 0;
272
+ const agreementLevel = String(confidence.agreementLevel || "mixed");
273
+ const firstPartySourceCount = Number(confidence.firstPartySourceCount || 0);
274
+ const sourceTypeBreakdown = confidence.sourceTypeBreakdown as Record<string, number> | undefined;
141
275
 
142
276
  lines.push("## Confidence\n");
143
- lines.push(`- **Engines responded:** ${enginesResponded.join(", ") || "none"}`);
277
+ lines.push(`- Agreement: ${formatAgreementLevel(agreementLevel)}`);
278
+ lines.push(`- Engines responded: ${enginesResponded.map(formatEngineName).join(", ") || "none"}`);
144
279
  if (enginesFailed.length > 0) {
145
- lines.push(`- **Engines failed:** ${enginesFailed.join(", ")}`);
280
+ lines.push(`- Engines failed: ${enginesFailed.map(formatEngineName).join(", ")}`);
281
+ }
282
+ lines.push(`- Top source consensus: ${confidence.topSourceConsensus || 0}/3 engines`);
283
+ lines.push(`- Total unique sources: ${confidence.sourcesCount || 0}`);
284
+ lines.push(`- Official sources: ${confidence.officialSourceCount || 0}`);
285
+ lines.push(`- First-party sources: ${firstPartySourceCount}`);
286
+ lines.push(`- Fetch success rate: ${confidence.fetchedSourceSuccessRate || 0}`);
287
+ if (sourceTypeBreakdown && Object.keys(sourceTypeBreakdown).length > 0) {
288
+ lines.push(`- Source mix: ${Object.entries(sourceTypeBreakdown).map(([type, count]) => `${humanizeSourceType(type)} ${count}`).join(", ")}`);
146
289
  }
147
- lines.push(`- **Top source consensus:** ${consensusScore}/3 engines`);
148
- lines.push(`- **Total unique sources:** ${confidence.sourcesCount || 0}`);
149
290
  lines.push("");
150
291
  }
151
292
 
152
- // Per-engine answers
153
- lines.push("## Findings\n");
293
+ if (synthesis?.answer) renderSynthesis(lines, synthesis, dedupedSources || [], 8);
294
+
295
+ lines.push("## Engine Perspectives\n");
154
296
  for (const engine of ["perplexity", "bing", "google"]) {
155
297
  const r = data[engine] as Record<string, unknown> | undefined;
156
298
  if (!r) continue;
157
- lines.push(`### ${engine.charAt(0).toUpperCase() + engine.slice(1)}`);
299
+ lines.push(`### ${formatEngineName(engine)}`);
158
300
  if (r.error) {
159
301
  lines.push(`⚠️ Error: ${r.error}`);
160
302
  } else if (r.answer) {
@@ -163,41 +305,15 @@ function formatDeepResearch(data: Record<string, unknown>): string {
163
305
  lines.push("");
164
306
  }
165
307
 
166
- // Synthesis
167
- const synthesis = data._synthesis as Record<string, unknown> | undefined;
168
- if (synthesis?.answer) {
169
- lines.push("## Synthesized Answer\n");
170
- lines.push(String(synthesis.answer));
171
- lines.push("");
172
- }
173
-
174
- // Deduplicated sources by consensus
175
308
  if (dedupedSources && dedupedSources.length > 0) {
176
- lines.push("## Sources (Ranked by Consensus)\n");
177
- for (const s of dedupedSources) {
178
- const engines = (s.engines as string[]) || [];
179
- const consensus = engines.length;
180
- lines.push(`- **[${consensus}/3]** [${s.title || "Untitled"}](${s.url})`);
309
+ lines.push("## Source Registry\n");
310
+ for (const source of dedupedSources) {
311
+ lines.push(formatSourceLine(source));
312
+ renderSourceEvidence(lines, source);
181
313
  }
182
314
  lines.push("");
183
315
  }
184
316
 
185
- // Fetched source content
186
- if (fetchedSources && fetchedSources.length > 0) {
187
- lines.push("## Source Content (Top Matches)\n");
188
- for (const fs of fetchedSources) {
189
- lines.push(`### ${fs.title || fs.url}`);
190
- lines.push(`*Source: ${fs.url}*`);
191
- lines.push("");
192
- if (fs.content) {
193
- lines.push(String(fs.content).slice(0, 3000));
194
- } else if (fs.error) {
195
- lines.push(`⚠️ Could not fetch: ${fs.error}`);
196
- }
197
- lines.push("\n---\n");
198
- }
199
- }
200
-
201
317
  return lines.join("\n").trim();
202
318
  }
203
319
 
package/launch.mjs CHANGED
@@ -5,9 +5,8 @@
5
5
  // the "Allow remote debugging?" dialog entirely. It runs on port 9222 so it doesn't
6
6
  // conflict with your main Chrome session (which may use port 9223).
7
7
  //
8
- // On launch, it overwrites the DevToolsActivePort file that cdp.mjs reads so all
9
- // extractors automatically target the GreedySearch Chrome, with no code changes.
10
- // The original file is restored on --kill.
8
+ // search.mjs passes CDP_PROFILE_DIR so cdp.mjs targets this dedicated Chrome
9
+ // without ever touching the user's main Chrome DevToolsActivePort file.
11
10
  //
12
11
  // Usage:
13
12
  // node launch.mjs — launch (or report if already running)
@@ -15,8 +14,8 @@
15
14
  // node launch.mjs --status — check if running
16
15
 
17
16
  import { spawn } from 'child_process';
18
- import { existsSync, writeFileSync, readFileSync, copyFileSync, mkdirSync, unlinkSync } from 'fs';
19
- import { tmpdir, homedir, platform } from 'os';
17
+ import { existsSync, writeFileSync, readFileSync, mkdirSync, unlinkSync } from 'fs';
18
+ import { tmpdir, platform } from 'os';
20
19
  import { join } from 'path';
21
20
  import http from 'http';
22
21
 
@@ -43,18 +42,8 @@ function findChrome() {
43
42
  return candidates.find(existsSync) || null;
44
43
  }
45
44
 
46
- function systemPortPath() {
47
- const os = platform();
48
- if (os === 'win32') return join(homedir(), 'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'DevToolsActivePort');
49
- if (os === 'darwin') return join(homedir(), 'Library', 'Application Support', 'Google', 'Chrome', 'DevToolsActivePort');
50
- return join(homedir(), '.config', 'google-chrome', 'DevToolsActivePort');
51
- }
52
-
53
- const SYSTEM_PORT = systemPortPath();
54
- const SYSTEM_BACKUP = SYSTEM_PORT + '.bak';
55
-
56
- const CHROME_FLAGS = [
57
- `--remote-debugging-port=${PORT}`,
45
+ const CHROME_FLAGS = [
46
+ `--remote-debugging-port=${PORT}`,
58
47
  '--disable-features=DevToolsPrivacyUI', // suppresses "Allow remote debugging?" dialog
59
48
  '--no-first-run',
60
49
  '--no-default-browser-check',
@@ -108,52 +97,21 @@ async function writePortFile(timeoutMs = 15000) {
108
97
  return false;
109
98
  }
110
99
 
111
- function redirectCdpToGreedySearch() {
112
- // Back up system DevToolsActivePort (user's main Chrome)
113
- if (existsSync(SYSTEM_PORT) && !existsSync(SYSTEM_BACKUP)) {
114
- copyFileSync(SYSTEM_PORT, SYSTEM_BACKUP);
115
- }
116
- // Point cdp.mjs to our dedicated Chrome's port
117
- // On Windows, main Chrome may hold a lock on SYSTEM_PORT (EBUSY).
118
- // Fall back to writeFileSync which uses CreateFile/WriteFile instead of CopyFile.
119
- try {
120
- copyFileSync(ACTIVE_PORT, SYSTEM_PORT);
121
- } catch (e) {
122
- if (e.code !== 'EBUSY') throw e;
123
- try {
124
- writeFileSync(SYSTEM_PORT, readFileSync(ACTIVE_PORT, 'utf8'), 'utf8');
125
- } catch {
126
- console.warn('Warning: could not redirect DevToolsActivePort (file busy) — cdp.mjs will use existing port.');
127
- }
128
- }
129
- }
130
-
131
- function restoreCdpToMainChrome() {
132
- if (existsSync(SYSTEM_BACKUP)) {
133
- copyFileSync(SYSTEM_BACKUP, SYSTEM_PORT);
134
- console.log('Restored DevToolsActivePort to main Chrome.');
135
- } else if (existsSync(SYSTEM_PORT)) {
136
- // No backup means main Chrome wasn't using CDP — remove our file
137
- try { unlinkSync(SYSTEM_PORT); } catch {}
138
- }
139
- }
140
-
141
- // ---------------------------------------------------------------------------
100
+ // ---------------------------------------------------------------------------
142
101
 
143
102
  async function main() {
144
103
  const arg = process.argv[2];
145
104
 
146
- if (arg === '--kill') {
147
- const pid = isRunning();
148
- if (pid) {
149
- try { process.kill(pid, 'SIGTERM'); console.log(`Stopped Chrome (pid ${pid}).`); }
150
- catch (e) { console.error(`Failed: ${e.message}`); }
151
- } else {
152
- console.log('GreedySearch Chrome is not running.');
153
- }
154
- restoreCdpToMainChrome();
155
- return;
156
- }
105
+ if (arg === '--kill') {
106
+ const pid = isRunning();
107
+ if (pid) {
108
+ try { process.kill(pid, 'SIGTERM'); console.log(`Stopped Chrome (pid ${pid}).`); }
109
+ catch (e) { console.error(`Failed: ${e.message}`); }
110
+ } else {
111
+ console.log('GreedySearch Chrome is not running.');
112
+ }
113
+ return;
114
+ }
157
115
 
158
116
  if (arg === '--status') {
159
117
  const pid = isRunning();
@@ -165,13 +123,12 @@ async function main() {
165
123
  // Already running?
166
124
  const existing = isRunning();
167
125
  if (existing) {
168
- const ready = await writePortFile(5000);
169
- if (ready) {
170
- console.log(`GreedySearch Chrome already running (pid ${existing}, port ${PORT}).`);
171
- redirectCdpToGreedySearch();
172
- console.log('DevToolsActivePort redirected.');
173
- return;
174
- }
126
+ const ready = await writePortFile(5000);
127
+ if (ready) {
128
+ console.log(`GreedySearch Chrome already running (pid ${existing}, port ${PORT}).`);
129
+ console.log('Dedicated GreedySearch DevToolsActivePort is ready.');
130
+ return;
131
+ }
175
132
  // Stale PID — process alive but not Chrome on PORT. Fall through to fresh launch.
176
133
  console.log(`Stale PID ${existing} detected (not Chrome on port ${PORT}) — launching fresh.`);
177
134
  try { unlinkSync(PID_FILE); } catch {}
@@ -195,16 +152,15 @@ async function main() {
195
152
  proc.unref();
196
153
  writeFileSync(PID_FILE, String(proc.pid));
197
154
 
198
- // Wait for Chrome HTTP endpoint, build DevToolsActivePort file, redirect cdp.mjs
199
- const portFileReady = await writePortFile();
200
- if (!portFileReady) {
201
- console.error('Chrome did not become ready within 15s.');
202
- process.exit(1);
203
- }
204
- redirectCdpToGreedySearch();
205
-
206
- console.log(`Ready. No more "Allow remote debugging?" dialogs.`);
207
- console.log(`Run "node launch.mjs --kill" when done to restore your main Chrome's CDP.`);
208
- }
155
+ // Wait for Chrome HTTP endpoint and build the dedicated DevToolsActivePort file
156
+ const portFileReady = await writePortFile();
157
+ if (!portFileReady) {
158
+ console.error('Chrome did not become ready within 15s.');
159
+ process.exit(1);
160
+ }
161
+
162
+ console.log(`Ready. No more "Allow remote debugging?" dialogs.`);
163
+ console.log('GreedySearch now uses its own isolated DevToolsActivePort file.');
164
+ }
209
165
 
210
166
  main();
@@ -0,0 +1,105 @@
1
+ # New Feature Ideas
2
+
3
+ Ideas for future features — thinking from the perspective of an AI assistant using these tools.
4
+
5
+ ---
6
+
7
+ ## 1. Source Verification
8
+
9
+ **Problem:** I get sources but can't verify if they're live, updated, or actually support the claimed content.
10
+
11
+ ```
12
+ verify_sources({ urls: ["https://...", "https://..."] })
13
+ → [{ url, status: 200, title, snippet, lastModified, claim: "supports X" }]
14
+ ```
15
+
16
+ **Use cases:**
17
+ - Before citing a source, verify it's not 404
18
+ - Check if a page actually contains the claimed information
19
+ - Get last-modified dates to assess freshness
20
+
21
+ ---
22
+
23
+ ## 2. Incremental / Continuation Research
24
+
25
+ **Problem:** After deep_research on "RAG vs fine-tuning", going deeper on just RAG means re-running everything with a new query and losing original context.
26
+
27
+ ```
28
+ deep_research({ query: "RAG vs fine-tuning", ... }) // initial
29
+ continue_research({ previousId: "...", query: "production RAG architectures" }) // goes deeper on RAG
30
+ ```
31
+
32
+ **Use cases:**
33
+ - Drill into a specific aspect after initial broad research
34
+ - Build on previous results without re-fetching everything
35
+ - Progressive disclosure of complex topics
36
+
37
+ ---
38
+
39
+ ## 3. Multi-Query Synthesis
40
+
41
+ **Problem:** One query isn't enough for complex research. I chain multiple greedy_search calls manually.
42
+
43
+ ```
44
+ multi_research({
45
+ queries: ["auth best practices", "NextAuth vs Clerk vs Lucia", "Next.js auth security"],
46
+ synthesize: true
47
+ })
48
+ ```
49
+
50
+ **Use cases:**
51
+ - "Best auth for Next.js" needs multiple angles
52
+ - Research with different facets (comparison, security, performance)
53
+ - Casting a wider net when single query returns narrow results
54
+
55
+ ---
56
+
57
+ ## 4. Structured Extraction
58
+
59
+ **Problem:** When researching "which libraries are maintained", I want tables (name, stars, last commit, license), not prose.
60
+
61
+ ```
62
+ extract_structured({
63
+ query: "Python HTTP client libraries 2026",
64
+ schema: { name: "string", stars: "number", lastUpdated: "date", async: "boolean" }
65
+ })
66
+ ```
67
+
68
+ **Use cases:**
69
+ - Library comparisons as structured data
70
+ - Dependency audits
71
+ - Feature matrices for tools/frameworks
72
+
73
+ ---
74
+
75
+ ## 5. Confidence Scoring on Specific Claims
76
+
77
+ **Problem:** I say "high confidence" but it's hand-wavy. What if I could ask: "how confident are we that library X is actively maintained?"
78
+
79
+ ```
80
+ verify_claim({
81
+ claim: "Prisma is actively maintained",
82
+ evidence: ["last commit: 2 weeks ago", "open issues: 45", "npm downloads: 2M/week"]
83
+ })
84
+ → { confidence: 0.95, reasoning: "..." }
85
+ ```
86
+
87
+ ---
88
+
89
+ ## 6. Research Cache / History
90
+
91
+ **Problem:** I do expensive deep_research, then the user asks a follow-up. I have to re-run everything.
92
+
93
+ ```
94
+ get_research({ id: "..." }) // retrieve previous results
95
+ list_research({ query: "RAG" }) // find related previous research
96
+ ```
97
+
98
+ ---
99
+
100
+ ## Priority
101
+
102
+ 1. **Source verification** — high value, relatively simple, fixes trust gap
103
+ 2. **Multi-query synthesis** — high value, complex but powerful
104
+ 3. **Incremental research** — medium value, nice UX improvement
105
+ 4. **Structured extraction** — medium value, specialized use cases
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@apmantza/greedysearch-pi",
3
- "version": "1.3.0",
3
+ "version": "1.4.0",
4
4
  "description": "Pi extension: browser-automation tool that searches Perplexity, Bing Copilot, and Google AI in parallel, extracts answers and sources via CDP, with optional Gemini synthesis — grounded AI answers from real browser interactions.",
5
5
  "type": "module",
6
6
  "keywords": [
package/search.mjs CHANGED
@@ -54,12 +54,400 @@ const ENGINE_DOMAINS = {
54
54
  gemini: 'gemini.google.com',
55
55
  };
56
56
 
57
- function getTabFromCache(engine) {
57
/**
 * Query-string keys treated as tracking noise. normalizeUrl() deletes any
 * parameter whose lower-cased key appears here (it additionally drops every
 * key starting with `utm_`).
 */
const TRACKING_PARAMS = [
  'fbclid', 'gclid',
  'ref', 'ref_src', 'ref_url', 'source',
  'utm_campaign', 'utm_content', 'utm_medium', 'utm_source', 'utm_term',
];

/**
 * Hosts classified as `community` sources. Matching is done by
 * matchesDomain(), so subdomains of these hosts count as well.
 */
const COMMUNITY_HOSTS = [
  'dev.to', 'hashnode.com', 'medium.com', 'reddit.com',
  'stackoverflow.com', 'stackexchange.com', 'substack.com',
];

/**
 * Hosts classified as `news` sources (exact host or any subdomain).
 */
const NEWS_HOSTS = [
  'arstechnica.com', 'techcrunch.com', 'theverge.com',
  'venturebeat.com', 'wired.com', 'zdnet.com',
];
89
+
90
+ function trimText(text = '', maxChars = 240) {
91
+ const clean = String(text).replace(/\s+/g, ' ').trim();
92
+ if (clean.length <= maxChars) return clean;
93
+ return clean.slice(0, maxChars).replace(/\s+\S*$/, '') + '...';
94
+ }
95
+
96
/**
 * Clean a source title and reject values that are not useful as titles.
 *
 * The title is whitespace-normalized and capped at 180 characters via
 * trimText(). An empty string is returned when the cleaned title is empty,
 * starts with http:// or https://, or looks like a text fragment: entirely
 * lower-case, at most four words, and containing no digit.
 *
 * @param {string} title Raw title as reported by an engine or fetch.
 * @returns {string} The cleaned title, or `''` when it should be ignored.
 */
function normalizeSourceTitle(title = '') {
  const candidate = trimText(title, 180);
  if (candidate === '' || /^https?:\/\//i.test(candidate)) return '';

  // Anything with mixed case, an ASCII capital, or a digit is kept as-is.
  const isAllLowercase = candidate === candidate.toLowerCase();
  if (!isAllLowercase) return candidate;
  if (/[A-Z]/.test(candidate) || /\d/.test(candidate)) return candidate;

  // Short all-lowercase strings are treated as fragments, not titles.
  const words = candidate.split(/\s+/).filter(Boolean);
  return words.length > 4 ? candidate : '';
}
107
+
108
/**
 * Choose the better of two candidate titles for the same source.
 *
 * Both candidates are first passed through normalizeSourceTitle(), which
 * already turns URL-like or fragment-like titles into `''`. Because of that,
 * no separate "looks like a URL" comparison is needed here: a normalized
 * title can never start with http(s)://. If only one candidate survives
 * normalization it wins; otherwise the longer one is preferred, and the
 * current title is kept on a tie.
 *
 * @param {string} currentTitle Title already stored for the source.
 * @param {string} nextTitle Newly observed title.
 * @returns {string} The preferred normalized title, possibly `''`.
 */
function pickPreferredTitle(currentTitle = '', nextTitle = '') {
  const current = normalizeSourceTitle(currentTitle);
  const next = normalizeSourceTitle(nextTitle);
  if (!next) return current;
  if (!current) return next;
  return next.length > current.length ? next : current;
}
119
+
120
/**
 * Produce a canonical form of an http(s) URL for de-duplication.
 *
 * Steps: drop the fragment, lower-case the host, clear an explicit default
 * port, delete tracking parameters (keys listed in TRACKING_PARAMS or
 * starting with `utm_`), sort the remaining query parameters, and strip
 * trailing slashes from the path. When the path is just `/`, a trailing slash
 * at the very end of the serialized URL is removed too.
 *
 * @param {string} rawUrl URL as reported by an engine.
 * @returns {string|null} Canonical URL, or null for empty, unparseable, or
 *   non-http(s) input.
 */
function normalizeUrl(rawUrl) {
  if (!rawUrl) return null;

  try {
    const parsed = new URL(rawUrl);
    const isHttp = parsed.protocol === 'http:';
    const isHttps = parsed.protocol === 'https:';
    if (!isHttp && !isHttps) return null;

    parsed.hash = '';
    parsed.hostname = parsed.hostname.toLowerCase();
    if ((isHttps && parsed.port === '443') || (isHttp && parsed.port === '80')) {
      parsed.port = '';
    }

    const isTrackingKey = key => {
      const lower = key.toLowerCase();
      return TRACKING_PARAMS.includes(lower) || lower.startsWith('utm_');
    };
    // Snapshot the keys first so deleting does not disturb the iteration.
    Array.from(parsed.searchParams.keys())
      .filter(isTrackingKey)
      .forEach(key => parsed.searchParams.delete(key));
    parsed.searchParams.sort();

    const path = parsed.pathname.replace(/\/+$/, '') || '/';
    parsed.pathname = path;

    const href = parsed.toString();
    if (path !== '/') return href;
    return href.replace(/\/$/, '');
  } catch {
    return null;
  }
}
145
+
146
/**
 * Extract the lower-cased host name of a URL without a leading `www.`.
 *
 * @param {string} rawUrl Absolute URL.
 * @returns {string} Host name, or `''` when the URL cannot be parsed.
 */
function getDomain(rawUrl) {
  let hostname;
  try {
    hostname = new URL(rawUrl).hostname;
  } catch {
    return '';
  }
  return hostname.toLowerCase().replace(/^www\./, '');
}
154
+
155
/**
 * Check whether a domain equals one of the given hosts or is a subdomain of
 * one of them.
 *
 * @param {string} domain Domain to test.
 * @param {string[]} hosts Candidate parent hosts.
 * @returns {boolean} True on an exact or dot-separated suffix match.
 */
function matchesDomain(domain, hosts) {
  for (const host of hosts) {
    if (domain === host) return true;
    if (domain.endsWith(`.${host}`)) return true;
  }
  return false;
}
158
+
159
/**
 * Heuristically label a source by domain, title, and URL.
 *
 * Checks run in this order and the first match wins:
 *   1. `repo`            - domain is exactly github.com or gitlab.com
 *   2. `community`       - domain matches COMMUNITY_HOSTS
 *   3. `news`            - domain matches NEWS_HOSTS
 *   4. `official-docs`   - docs-like domain prefix, title keyword, or URL path
 *   5. `maintainer-blog` - `blog.` domain prefix or `/blog/` in the URL
 *   6. `website`         - everything else
 *
 * Title and URL comparisons are case-insensitive.
 *
 * @param {string} domain Host name of the source.
 * @param {string} title Source title.
 * @param {string} rawUrl Source URL.
 * @returns {string} One of the labels listed above.
 */
function classifySourceType(domain, title = '', rawUrl = '') {
  const titleLc = title.toLowerCase();
  const urlLc = rawUrl.toLowerCase();

  if (['github.com', 'gitlab.com'].includes(domain)) return 'repo';
  if (matchesDomain(domain, COMMUNITY_HOSTS)) return 'community';
  if (matchesDomain(domain, NEWS_HOSTS)) return 'news';

  const hasDocsDomain = ['docs.', 'developer.', 'developers.', 'api.']
    .some(prefix => domain.startsWith(prefix));
  const hasDocsTitle = ['documentation', 'docs', 'reference']
    .some(keyword => titleLc.includes(keyword));
  const hasDocsPath = ['/docs/', '/reference/', '/api/']
    .some(segment => urlLc.includes(segment));
  if (hasDocsDomain || hasDocsTitle || hasDocsPath) return 'official-docs';

  const isBlog = domain.startsWith('blog.') || urlLc.includes('/blog/');
  return isBlog ? 'maintainer-blog' : 'website';
}
183
+
184
/**
 * Map a source type label to its sort weight (higher sorts first).
 *
 * @param {string} sourceType Label such as `official-docs` or `repo`.
 * @returns {number} 5 for official docs down to 0 for news; unknown labels
 *   also yield 0.
 */
function sourceTypePriority(sourceType) {
  // Listed from lowest to highest weight: the index is the priority.
  const ascending = ['news', 'community', 'website', 'maintainer-blog', 'repo', 'official-docs'];
  const position = ascending.indexOf(sourceType);
  return position === -1 ? 0 : position;
}
195
+
196
/**
 * Return the best (lowest) rank any engine assigned to a source.
 *
 * @param {object} source Registry entry; `perEngine` maps engine name to an
 *   object with a `rank` field.
 * @returns {number} Lowest rank found; a missing or falsy rank counts as 99,
 *   and 99 is returned when there is no per-engine data at all.
 */
function bestRank(source) {
  const entries = Object.values(source.perEngine || {});
  if (entries.length === 0) return 99;

  let lowest = Infinity;
  for (const entry of entries) {
    lowest = Math.min(lowest, entry?.rank || 99);
  }
  return lowest;
}
200
+
201
/**
 * Merge the per-engine source lists into one de-duplicated, ranked registry.
 *
 * Sources from perplexity, bing, and google (in that order) are keyed by
 * their canonical URL from normalizeUrl(). Each registry entry records which
 * engines cited it, the rank and title each engine gave it, a heuristic
 * source type, and the best title seen so far.
 *
 * Entries are sorted by: number of citing engines (desc), source type
 * priority (desc), best per-engine rank (asc), then domain (alphabetical).
 * The top 12 are kept and given stable ids `S1`..`S12`.
 *
 * @param {object} out Result object keyed by engine name; each engine result
 *   may carry a `sources` array of `{ url, title }` objects.
 * @returns {object[]} Ranked registry entries.
 */
function buildSourceRegistry(out) {
  const seen = new Map();
  const engineOrder = ['perplexity', 'bing', 'google'];

  for (const engine of engineOrder) {
    const result = out[engine];
    if (!result?.sources) continue;

    for (let i = 0; i < result.sources.length; i++) {
      const source = result.sources[i];
      // Skip null/undefined entries instead of throwing on `source.url`.
      if (!source) continue;

      const canonicalUrl = normalizeUrl(source.url);
      if (!canonicalUrl || canonicalUrl.length < 10) continue;

      const title = normalizeSourceTitle(source.title || '');
      const domain = getDomain(canonicalUrl);
      const sourceType = classifySourceType(domain, title, canonicalUrl);
      const existing = seen.get(canonicalUrl) || {
        id: '',
        canonicalUrl,
        displayUrl: source.url || canonicalUrl,
        domain,
        title: '',
        engines: [],
        engineCount: 0,
        perEngine: {},
        sourceType,
        isOfficial: sourceType === 'official-docs',
      };

      existing.title = pickPreferredTitle(existing.title, title);
      existing.displayUrl = existing.displayUrl || source.url || canonicalUrl;
      existing.sourceType = existing.sourceType || sourceType;
      existing.isOfficial = existing.isOfficial || sourceType === 'official-docs';

      if (!existing.engines.includes(engine)) {
        existing.engines.push(engine);
      }

      // An engine can list the same page more than once (e.g. URLs that only
      // differ by fragment or tracking parameters). Keep the first, i.e. best,
      // rank rather than letting a later duplicate overwrite it.
      const previous = existing.perEngine[engine];
      existing.perEngine[engine] = {
        rank: previous?.rank || i + 1,
        title: pickPreferredTitle(previous?.title || '', title),
      };

      seen.set(canonicalUrl, existing);
    }
  }

  const sources = Array.from(seen.values())
    .map(source => ({
      ...source,
      engineCount: source.engines.length,
    }))
    .sort((a, b) => {
      const byConsensus = b.engineCount - a.engineCount;
      if (byConsensus !== 0) return byConsensus;
      const byType = sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType);
      if (byType !== 0) return byType;
      const byRank = bestRank(a) - bestRank(b);
      if (byRank !== 0) return byRank;
      return a.domain.localeCompare(b.domain);
    })
    .slice(0, 12)
    .map((source, index) => ({
      ...source,
      id: `S${index + 1}`,
      // Fall back to the domain, then the URL, when no usable title was seen.
      title: source.title || source.domain || source.canonicalUrl,
    }));

  return sources;
}
269
+
270
/**
 * Attach fetch results to the registry entries they belong to.
 *
 * Fetch results are matched to sources by `id`. Sources without a fetch
 * result are returned unchanged. Matched sources get a `fetch` summary object
 * and may have their title upgraded through pickPreferredTitle().
 *
 * @param {object[]} sources Registry entries.
 * @param {object[]} fetchedSources Fetch results, each carrying the `id` of
 *   the source it was fetched for.
 * @returns {object[]} New array of sources; inputs are not mutated.
 */
function mergeFetchDataIntoSources(sources, fetchedSources) {
  const fetchedById = new Map();
  for (const item of fetchedSources) {
    fetchedById.set(item.id, item);
  }

  return sources.map(source => {
    const hit = fetchedById.get(source.id);
    if (!hit) return source;

    const preferredTitle = pickPreferredTitle(source.title, hit.title || '');
    const fetchSummary = {
      attempted: true,
      // A fetch counts as ok when the result carries no error message.
      ok: !hit.error,
      status: hit.status || null,
      finalUrl: hit.finalUrl || hit.url || source.canonicalUrl,
      contentType: hit.contentType || '',
      lastModified: hit.lastModified || '',
      title: hit.title || '',
      snippet: hit.snippet || '',
      contentChars: hit.contentChars || 0,
      error: hit.error || '',
    };

    return {
      ...source,
      title: preferredTitle || source.title,
      fetch: fetchSummary,
    };
  });
}
295
+
296
/**
 * Best-effort extraction of a JSON value from model output.
 *
 * Three candidates are tried in order and the first one that parses wins:
 *   1. the trimmed text as-is,
 *   2. the text with a leading ```json / ``` fence and a trailing ``` removed,
 *   3. the span from the first `{` to the last `}` in the text.
 *
 * @param {*} text Raw answer text.
 * @returns {*} The parsed JSON value, or null when the input is empty or no
 *   candidate parses.
 */
function parseStructuredJson(text) {
  if (!text) return null;

  const raw = String(text).trim();
  const unfenced = raw
    .replace(/^```json\s*/i, '')
    .replace(/^```\s*/i, '')
    .replace(/```$/i, '')
    .trim();

  const attempts = [raw, unfenced];
  const braced = /\{[\s\S]*\}/.exec(raw);
  if (braced !== null) attempts.push(braced[0]);

  // Sentinel distinguishes "failed to parse" from a legitimately parsed value.
  const FAILED = Symbol('unparseable');
  const attempt = candidate => {
    try {
      return JSON.parse(candidate);
    } catch {
      return FAILED;
    }
  };

  for (const candidate of attempts) {
    const parsed = attempt(candidate);
    if (parsed !== FAILED) return parsed;
  }
  return null;
}
316
+
317
/**
 * Coerce a parsed synthesis payload into a fixed, bounded shape.
 *
 * The payload comes from model output, so every field is validated:
 * unknown agreement/support levels fall back to `mixed` / `moderate`, text
 * fields are whitespace-normalized and length-capped via trimText(), list
 * fields are capped in size, and source ids that are not present in
 * `sources` are dropped. Values that are not strings are treated as missing
 * text so they do not surface as "null" or "[object Object]".
 *
 * @param {*} payload Parsed JSON from the model; may be null or malformed.
 * @param {object[]} sources Registry entries whose `id`s are valid citations.
 * @param {string} fallbackAnswer Answer text to use when the payload has none.
 * @returns {object} `{ answer, agreement: { level, summary }, differences,
 *   caveats, claims, recommendedSources }`.
 */
function normalizeSynthesisPayload(payload, sources, fallbackAnswer = '') {
  const AGREEMENT_LEVELS = ['high', 'medium', 'low', 'mixed', 'conflicting'];
  const SUPPORT_LEVELS = ['strong', 'moderate', 'weak', 'conflicting'];

  // Only real strings count as text; anything else becomes ''.
  const asText = value => (typeof value === 'string' ? value : '');

  // Clean, drop empties, and cap a list of free-text items.
  const textList = (value, maxChars, maxItems) =>
    Array.isArray(value)
      ? value.map(item => trimText(asText(item), maxChars)).filter(Boolean).slice(0, maxItems)
      : [];

  const sourceIds = new Set(sources.map(source => source.id));
  // Keep only ids that exist in the registry.
  const knownIds = value => (Array.isArray(value) ? value.filter(id => sourceIds.has(id)) : []);

  const agreementLevel = AGREEMENT_LEVELS.includes(payload?.agreement?.level)
    ? payload.agreement.level
    : 'mixed';

  const claims = Array.isArray(payload?.claims)
    ? payload.claims
        .map(claim => ({
          claim: trimText(asText(claim?.claim), 260),
          support: SUPPORT_LEVELS.includes(claim?.support) ? claim.support : 'moderate',
          sourceIds: knownIds(claim?.sourceIds),
        }))
        .filter(claim => claim.claim)
    : [];

  return {
    answer: trimText(asText(payload?.answer) || fallbackAnswer, 4000),
    agreement: {
      level: agreementLevel,
      summary: trimText(asText(payload?.agreement?.summary), 280),
    },
    differences: textList(payload?.differences, 220, 5),
    caveats: textList(payload?.caveats, 220, 5),
    claims,
    recommendedSources: knownIds(payload?.recommendedSources).slice(0, 6),
  };
}
349
+
350
/**
 * Build the text prompt sent to Gemini for synthesis.
 *
 * The prompt contains fixed instructions, the expected JSON schema, the user
 * query, a per-engine summary (status, trimmed answer, up to six cited source
 * ids ordered by that engine's rank), and a source registry.
 *
 * In grounded mode the answer cap is 4500 characters instead of 2200, the
 * registry holds 10 sources instead of 8, and each registry entry that had a
 * fetch attempt includes its fetch outcome and a snippet (up to 700 chars).
 *
 * Per-engine source ids are taken only from the sources that are actually
 * included in the registry, so the prompt never cites an id it does not
 * describe.
 *
 * @param {string} query Original user query.
 * @param {object} results Result object keyed by engine name.
 * @param {object[]} sources Ranked registry entries.
 * @param {{ grounded?: boolean }} [options] Whether fetched evidence is used.
 * @returns {string} Prompt text.
 */
function buildSynthesisPrompt(query, results, sources, { grounded = false } = {}) {
  // The registry shown to the model is a prefix of the ranked sources.
  const registrySources = sources.slice(0, grounded ? 10 : 8);

  const engineSummaries = {};
  for (const engine of ['perplexity', 'bing', 'google']) {
    const result = results[engine];
    if (!result) continue;
    if (result.error) {
      engineSummaries[engine] = { status: 'error', error: String(result.error) };
      continue;
    }

    engineSummaries[engine] = {
      status: 'ok',
      answer: trimText(result.answer || '', grounded ? 4500 : 2200),
      sourceIds: registrySources
        .filter(source => source.engines.includes(engine))
        .sort((a, b) => (a.perEngine[engine]?.rank || 99) - (b.perEngine[engine]?.rank || 99))
        .map(source => source.id)
        .slice(0, 6),
    };
  }

  const sourceRegistry = registrySources.map(source => ({
    id: source.id,
    title: source.title,
    domain: source.domain,
    canonicalUrl: source.canonicalUrl,
    sourceType: source.sourceType,
    isOfficial: source.isOfficial,
    engines: source.engines,
    engineCount: source.engineCount,
    perEngine: source.perEngine,
    // `undefined` is omitted by JSON.stringify, so ungrounded prompts carry no fetch key.
    fetch: grounded && source.fetch?.attempted ? {
      ok: source.fetch.ok,
      status: source.fetch.status,
      lastModified: source.fetch.lastModified,
      snippet: trimText(source.fetch.snippet || '', 700),
    } : undefined,
  }));

  return [
    'You are synthesizing results from Perplexity, Bing Copilot, and Google AI.',
    grounded
      ? 'Use the fetched source snippets as the strongest evidence. Use engine answers for perspective and conflict detection.'
      : 'Use the engine answers for perspective. Use the source registry for provenance and citations.',
    'Prefer official docs, release notes, repositories, and maintainer-authored sources when available.',
    'If the engines disagree, say so explicitly.',
    'Do not invent sources. Only reference source IDs from the source registry.',
    'Return valid JSON only. No markdown fences, no prose outside the JSON object.',
    '',
    'JSON schema:',
    '{',
    '  "answer": "short direct answer",',
    '  "agreement": { "level": "high|medium|low|mixed|conflicting", "summary": "..." },',
    '  "differences": ["..."],',
    '  "caveats": ["..."],',
    '  "claims": [',
    '    { "claim": "...", "support": "strong|moderate|weak|conflicting", "sourceIds": ["S1"] }',
    '  ],',
    '  "recommendedSources": ["S1", "S2"]',
    '}',
    '',
    `User query: ${query}`,
    '',
    `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
    '',
    `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
  ].join('\n');
}
418
+
419
/**
 * Summarize how well-supported a result set is.
 *
 * Counts are derived from `out._sources`; the agreement level comes from the
 * synthesis when present, otherwise from how many engines cited the first
 * source (3+ => high, 2 => medium, else low).
 *
 * @param {object} out Result object with optional `_sources` and `_synthesis`.
 * @returns {object} Confidence metadata (source counts, engine status lists,
 *   fetch success rate rounded to two decimals, and a per-type breakdown).
 */
function buildConfidence(out) {
  const sources = Array.isArray(out._sources) ? out._sources : [];

  let officialSourceCount = 0;
  let firstPartySourceCount = 0;
  let fetchedAttempted = 0;
  let fetchedSucceeded = 0;
  const sourceTypeBreakdown = {};

  for (const source of sources) {
    if (source.isOfficial) officialSourceCount += 1;
    if (source.isOfficial || source.sourceType === 'maintainer-blog') firstPartySourceCount += 1;
    if (source.fetch?.attempted) fetchedAttempted += 1;
    if (source.fetch?.ok) fetchedSucceeded += 1;
    sourceTypeBreakdown[source.sourceType] = (sourceTypeBreakdown[source.sourceType] || 0) + 1;
  }

  const topConsensus = sources[0]?.engineCount || 0;

  let agreementLevel = out._synthesis?.agreement?.level;
  if (!agreementLevel) {
    if (topConsensus >= 3) agreementLevel = 'high';
    else if (topConsensus >= 2) agreementLevel = 'medium';
    else agreementLevel = 'low';
  }

  let fetchedSourceSuccessRate = 0;
  if (fetchedAttempted > 0) {
    fetchedSourceSuccessRate = Number((fetchedSucceeded / fetchedAttempted).toFixed(2));
  }

  return {
    sourcesCount: sources.length,
    topSourceConsensus: topConsensus,
    agreementLevel,
    enginesResponded: ALL_ENGINES.filter(engine => out[engine]?.answer && !out[engine]?.error),
    enginesFailed: ALL_ENGINES.filter(engine => out[engine]?.error),
    officialSourceCount,
    firstPartySourceCount,
    fetchedSourceSuccessRate,
    sourceTypeBreakdown,
  };
}
444
+
445
/**
 * Look up the full target id of an already-open tab for an engine.
 *
 * Reads the JSON page list stored at PAGES_CACHE and returns the `targetId`
 * of the first page whose URL contains the engine's domain from
 * ENGINE_DOMAINS.
 *
 * @param {string} engine Engine key.
 * @returns {string|null} Full target id, or null when the cache file is
 *   missing, unreadable, or has no matching page.
 */
function getFullTabFromCache(engine) {
  try {
    if (!existsSync(PAGES_CACHE)) return null;

    const cachedPages = JSON.parse(readFileSync(PAGES_CACHE, 'utf8'));
    const match = cachedPages.find(page => page.url.includes(ENGINE_DOMAINS[engine]));
    if (!match) return null;
    return match.targetId;
  } catch {
    // Any read/parse/shape problem is treated as "no cached tab".
    return null;
  }
}
65
453
 
@@ -108,6 +496,31 @@ async function openNewTab() {
108
496
  return targetId;
109
497
  }
110
498
 
499
/**
 * Return a tab for the given engine, reusing a cached one when available.
 *
 * Runs the `list` CDP command first (presumably this refreshes the page cache
 * that getFullTabFromCache() reads - confirm against the cdp helper), then
 * falls back to opening a new tab when no cached tab matches.
 *
 * @param {string} engine Engine key.
 * @returns {Promise<string>} Target id of the tab to use.
 */
async function getOrOpenEngineTab(engine) {
  await cdp(['list']);

  const cachedTargetId = getFullTabFromCache(engine);
  if (cachedTargetId) return cachedTargetId;
  return openNewTab();
}
503
+
504
/**
 * Bring a tab to the foreground by sending `Target.activateTarget` through
 * another tab obtained from getAnyTab().
 *
 * This is best-effort: any failure is deliberately swallowed so the caller's
 * flow continues.
 *
 * @param {string} targetId Target id of the tab to activate.
 * @returns {Promise<void>}
 */
async function activateTab(targetId) {
  try {
    const anchorTab = await getAnyTab();
    const params = JSON.stringify({ targetId });
    await cdp(['evalraw', anchorTab, 'Target.activateTarget', params]);
  } catch {
    // best-effort
  }
}
512
+
513
/**
 * Close several tabs one after another, then re-run the `list` CDP command.
 *
 * Falsy ids are skipped. When at least one id was supplied (even a falsy
 * one), the function waits 300 ms and then runs `list`, ignoring any error
 * from that call.
 *
 * @param {string[]} targetIds Target ids of the tabs to close.
 * @returns {Promise<void>}
 */
async function closeTabs(targetIds = []) {
  for (const targetId of targetIds.filter(Boolean)) {
    await closeTab(targetId);
  }

  if (targetIds.length === 0) return;

  await new Promise(resolve => setTimeout(resolve, 300));
  await cdp(['list']).catch(() => null);
}
523
+
111
524
  async function closeTab(targetId) {
112
525
  try {
113
526
  const anchor = await getAnyTab();
@@ -200,10 +613,22 @@ async function fetchSourceContent(url, maxChars = 5000) {
200
613
  // Extract title
201
614
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
202
615
  const title = titleMatch ? titleMatch[1].trim() : '';
616
+ const finalUrl = res.url || url;
617
+ const snippet = trimText(content, 320);
203
618
 
204
- return { url, title, content };
619
+ return {
620
+ url,
621
+ finalUrl,
622
+ status: res.status,
623
+ contentType: res.headers.get('content-type') || '',
624
+ lastModified: res.headers.get('last-modified') || '',
625
+ title,
626
+ snippet,
627
+ content,
628
+ contentChars: content.length,
629
+ };
205
630
  } catch (e) {
206
- return { url, title: '', content: null, error: e.message };
631
+ return { url, title: '', content: null, snippet: '', contentChars: 0, error: e.message };
207
632
  }
208
633
  }
209
634
 
@@ -216,16 +641,17 @@ async function fetchMultipleSources(sources, maxSources = 5, maxChars = 5000) {
216
641
 
217
642
  for (let i = 0; i < toFetch.length; i++) {
218
643
  const s = toFetch[i];
219
- process.stderr.write(`[greedysearch] Fetching ${i + 1}/${toFetch.length}: ${s.url.slice(0, 60)}...\n`);
644
+ process.stderr.write(`[greedysearch] Fetching ${i + 1}/${toFetch.length}: ${(s.canonicalUrl || s.url).slice(0, 60)}...\n`);
220
645
  try {
221
- const result = await fetchSourceContent(s.url, maxChars);
646
+ const result = await fetchSourceContent(s.canonicalUrl || s.url, maxChars);
647
+ fetched.push({ id: s.id, ...result });
222
648
  if (result.content && result.content.length > 100) {
223
- fetched.push(result);
224
649
  process.stderr.write(`[greedysearch] ✓ Got ${result.content.length} chars\n`);
225
650
  } else {
226
651
  process.stderr.write(`[greedysearch] ✗ Empty or too short\n`);
227
652
  }
228
653
  } catch (e) {
654
+ fetched.push({ id: s.id, url: s.canonicalUrl || s.url, error: e.message });
229
655
  process.stderr.write(`[greedysearch] ✗ Failed: ${e.message.slice(0, 80)}\n`);
230
656
  }
231
657
  process.stderr.write(`PROGRESS:fetch:${i + 1}/${toFetch.length}\n`);
@@ -235,6 +661,7 @@ async function fetchMultipleSources(sources, maxSources = 5, maxChars = 5000) {
235
661
  }
236
662
 
237
663
  function pickTopSource(out) {
664
+ if (Array.isArray(out._sources) && out._sources.length > 0) return out._sources[0];
238
665
  for (const engine of ['perplexity', 'google', 'bing']) {
239
666
  const r = out[engine];
240
667
  if (r?.sources?.length > 0) return r.sources[0];
@@ -242,59 +669,13 @@ function pickTopSource(out) {
242
669
  return null;
243
670
  }
244
671
 
245
- function deduplicateSources(out) {
246
- const seen = new Map(); // url -> { title, engines }
247
- const engineOrder = ['perplexity', 'bing', 'google'];
248
-
249
- for (const engine of engineOrder) {
250
- const r = out[engine];
251
- if (!r?.sources) continue;
252
- for (const s of r.sources) {
253
- const url = s.url?.split('#')[0]?.replace(/\/$/, '');
254
- if (!url || url.length < 10) continue;
255
- if (!seen.has(url)) {
256
- seen.set(url, { url: s.url, title: s.title || '', engines: [engine] });
257
- } else {
258
- const existing = seen.get(url);
259
- if (!existing.engines.includes(engine)) {
260
- existing.engines.push(engine);
261
- }
262
- if (!existing.title && s.title) existing.title = s.title;
263
- }
264
- }
265
- }
266
-
267
- // Sort by consensus (most engines = highest confidence)
268
- return Array.from(seen.values())
269
- .sort((a, b) => b.engines.length - a.engines.length)
270
- .slice(0, 10);
271
- }
672
+ async function synthesizeWithGemini(query, results, { grounded = false, tabPrefix = null } = {}) {
673
+ const sources = Array.isArray(results._sources) ? results._sources : buildSourceRegistry(results);
674
+ const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
272
675
 
273
- async function synthesizeWithGemini(query, results) {
274
- // Build a prompt that includes all engine results
275
- const sources = deduplicateSources(results);
276
-
277
- let prompt = `Based on the following search results from multiple AI engines, provide a single, synthesized answer to the user's question. Combine the information, resolve any conflicts, and present the most accurate and complete answer.\n\n`;
278
- prompt += `User's question: "${query}"\n\n`;
279
-
280
- for (const engine of ['perplexity', 'bing', 'google']) {
281
- const r = results[engine];
282
- if (r?.error) {
283
- prompt += `## ${engine} (failed)\nError: ${r.error}\n\n`;
284
- } else if (r?.answer) {
285
- prompt += `## ${engine}\n${r.answer}\n\n`;
286
- }
287
- }
288
-
289
- prompt += `Provide a synthesized answer that:\n`;
290
- prompt += `1. Combines the best information from all sources\n`;
291
- prompt += `2. Notes where sources agree or disagree\n`;
292
- prompt += `3. Is clear and well-structured\n`;
293
- prompt += `4. Includes key sources at the end\n`;
294
-
295
- // Run the query through Gemini extractor
296
676
  return new Promise((resolve, reject) => {
297
- const proc = spawn('node', [join(__dir, 'extractors', 'gemini.mjs'), prompt, '--short'], {
677
+ const extraArgs = tabPrefix ? ['--tab', String(tabPrefix)] : [];
678
+ const proc = spawn('node', [join(__dir, 'extractors', 'gemini.mjs'), prompt, ...extraArgs], {
298
679
  stdio: ['ignore', 'pipe', 'pipe'],
299
680
  });
300
681
  let out = '';
@@ -309,8 +690,18 @@ async function synthesizeWithGemini(query, results) {
309
690
  clearTimeout(t);
310
691
  if (code !== 0) reject(new Error(err.trim() || 'gemini extractor failed'));
311
692
  else {
312
- try { resolve(JSON.parse(out.trim())); }
313
- catch { reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`)); }
693
+ try {
694
+ const raw = JSON.parse(out.trim());
695
+ const structured = parseStructuredJson(raw.answer || '');
696
+ resolve({
697
+ ...normalizeSynthesisPayload(structured, sources, raw.answer || ''),
698
+ rawAnswer: raw.answer || '',
699
+ geminiSources: raw.sources || [],
700
+ });
701
+ }
702
+ catch {
703
+ reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
704
+ }
314
705
  }
315
706
  });
316
707
  });
@@ -509,83 +900,79 @@ async function main() {
509
900
  }
510
901
 
511
902
  // All tabs assigned — run extractors in parallel
512
- const results = await Promise.allSettled(
513
- ALL_ENGINES.map((e, i) =>
514
- runExtractor(ENGINES[e], query, tabs[i], short)
515
- .then(r => {
516
- process.stderr.write(`PROGRESS:${e}:done\n`);
517
- return { engine: e, ...r };
518
- })
519
- .catch(err => {
520
- process.stderr.write(`PROGRESS:${e}:error\n`);
521
- throw err;
522
- })
523
- )
524
- );
525
-
526
- const out = {};
527
- for (let i = 0; i < results.length; i++) {
528
- const r = results[i];
529
- if (r.status === 'fulfilled') {
530
- out[r.value.engine] = r.value;
531
- } else {
532
- out[ALL_ENGINES[i]] = { error: r.reason?.message || 'unknown error' };
903
+ try {
904
+ const results = await Promise.allSettled(
905
+ ALL_ENGINES.map((e, i) =>
906
+ runExtractor(ENGINES[e], query, tabs[i], short)
907
+ .then(r => {
908
+ process.stderr.write(`PROGRESS:${e}:done\n`);
909
+ return { engine: e, ...r };
910
+ })
911
+ .catch(err => {
912
+ process.stderr.write(`PROGRESS:${e}:error\n`);
913
+ throw err;
914
+ })
915
+ )
916
+ );
917
+
918
+ const out = {};
919
+ for (let i = 0; i < results.length; i++) {
920
+ const r = results[i];
921
+ if (r.status === 'fulfilled') {
922
+ out[r.value.engine] = r.value;
923
+ } else {
924
+ out[ALL_ENGINES[i]] = { error: r.reason?.message || 'unknown error' };
925
+ }
533
926
  }
534
- }
535
927
 
536
- // Deduplicate sources across all engines
537
- out._sources = deduplicateSources(out);
928
+ await closeTabs(tabs);
538
929
 
539
- // Synthesize with Gemini if requested
540
- if (synthesize) {
541
- process.stderr.write('PROGRESS:synthesis:start\n');
542
- process.stderr.write('[greedysearch] Synthesizing results with Gemini...\n');
543
- try {
544
- const synthesis = await synthesizeWithGemini(query, out);
545
- out._synthesis = {
546
- answer: synthesis.answer || '',
547
- sources: synthesis.sources || [],
548
- synthesized: true,
549
- };
550
- process.stderr.write('PROGRESS:synthesis:done\n');
551
- } catch (e) {
552
- process.stderr.write(`[greedysearch] Synthesis failed: ${e.message}\n`);
553
- out._synthesis = { error: e.message, synthesized: false };
930
+ // Build a canonical source registry across all engines
931
+ out._sources = buildSourceRegistry(out);
932
+
933
+ if (deepResearch) {
934
+ process.stderr.write('PROGRESS:deep-research:start\n');
935
+ const fetchedSources = out._sources.length > 0
936
+ ? await fetchMultipleSources(out._sources, 5, 8000)
937
+ : [];
938
+
939
+ out._sources = mergeFetchDataIntoSources(out._sources, fetchedSources);
940
+ out._fetchedSources = fetchedSources;
941
+ process.stderr.write(out._sources.length > 0 ? 'PROGRESS:deep-research:done\n' : 'PROGRESS:deep-research:no-sources\n');
554
942
  }
555
- }
556
943
 
557
- if (fetchSource) {
558
- const top = pickTopSource(out);
559
- if (top) out._topSource = await fetchTopSource(top.url);
560
- }
944
+ // Synthesize with Gemini if requested
945
+ if (synthesize) {
946
+ process.stderr.write('PROGRESS:synthesis:start\n');
947
+ process.stderr.write('[greedysearch] Synthesizing results with Gemini...\n');
948
+ try {
949
+ const geminiTab = await getOrOpenEngineTab('gemini');
950
+ await activateTab(geminiTab);
951
+ const synthesis = await synthesizeWithGemini(query, out, { grounded: deepResearch, tabPrefix: geminiTab });
952
+ await activateTab(geminiTab);
953
+ out._synthesis = {
954
+ ...synthesis,
955
+ synthesized: true,
956
+ };
957
+ process.stderr.write('PROGRESS:synthesis:done\n');
958
+ } catch (e) {
959
+ process.stderr.write(`[greedysearch] Synthesis failed: ${e.message}\n`);
960
+ out._synthesis = { error: e.message, synthesized: false };
961
+ }
962
+ }
561
963
 
562
- // Deep research mode: fetch top sources and return structured document
563
- if (deepResearch) {
564
- process.stderr.write('PROGRESS:deep-research:start\n');
565
-
566
- // Get top sources by consensus
567
- const topSources = out._sources || [];
568
-
569
- if (topSources.length > 0) {
570
- // Fetch content from top sources
571
- out._fetchedSources = await fetchMultipleSources(topSources, 5, 8000);
572
- process.stderr.write('PROGRESS:deep-research:done\n');
573
- } else {
574
- out._fetchedSources = [];
575
- process.stderr.write('PROGRESS:deep-research:no-sources\n');
964
+ if (fetchSource) {
965
+ const top = pickTopSource(out);
966
+ if (top) out._topSource = await fetchTopSource(top.canonicalUrl || top.url);
576
967
  }
577
-
578
- // Build confidence scores
579
- out._confidence = {
580
- sourcesCount: topSources.length,
581
- consensusScore: topSources.length > 0 ? topSources[0]?.engines?.length || 0 : 0,
582
- enginesResponded: ALL_ENGINES.filter(e => out[e]?.answer && !out[e]?.error),
583
- enginesFailed: ALL_ENGINES.filter(e => out[e]?.error),
584
- };
585
- }
586
968
 
587
- writeOutput(out, outFile, { inline, synthesize, query });
588
- return;
969
+ if (deepResearch) out._confidence = buildConfidence(out);
970
+
971
+ writeOutput(out, outFile, { inline, synthesize, query });
972
+ return;
973
+ } finally {
974
+ await closeTabs(tabs);
975
+ }
589
976
  }
590
977
 
591
978
  const script = ENGINES[engine];