@apmantza/greedysearch-pi 1.9.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,6 +37,16 @@ export const NEWS_HOSTS = [
37
37
  "zdnet.com",
38
38
  ];
39
39
 
40
+ export const SOCIAL_HOSTS = [
41
+ "facebook.com",
42
+ "instagram.com",
43
+ "linkedin.com",
44
+ "pinterest.com",
45
+ "tiktok.com",
46
+ "twitter.com",
47
+ "x.com",
48
+ ];
49
+
40
50
  export function trimText(text = "", maxChars = 240) {
41
51
  const clean = String(text).replaceAll(/\s+/g, " ").trim();
42
52
  if (clean.length <= maxChars) return clean;
@@ -122,6 +132,17 @@ export function classifySourceType(domain, title = "", rawUrl = "") {
122
132
  const lowerUrl = rawUrl.toLowerCase();
123
133
 
124
134
  if (domain === "github.com" || domain === "gitlab.com") return "repo";
135
+ if (
136
+ domain === "arxiv.org" ||
137
+ domain === "doi.org" ||
138
+ domain === "semanticscholar.org" ||
139
+ domain.endsWith(".semanticscholar.org") ||
140
+ lowerUrl.includes("/paper/") ||
141
+ lowerUrl.includes("/pdf/")
142
+ ) {
143
+ return "academic";
144
+ }
145
+ if (matchesDomain(domain, SOCIAL_HOSTS)) return "social";
125
146
  if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
126
147
  if (matchesDomain(domain, NEWS_HOSTS)) return "news";
127
148
  if (
@@ -149,6 +170,8 @@ export function sourceTypePriority(sourceType) {
149
170
  return 5;
150
171
  case "repo":
151
172
  return 4;
173
+ case "academic":
174
+ return 4;
152
175
  case "maintainer-blog":
153
176
  return 3;
154
177
  case "website":
@@ -157,6 +180,8 @@ export function sourceTypePriority(sourceType) {
157
180
  return 1;
158
181
  case "news":
159
182
  return 0;
183
+ case "social":
184
+ return -6;
160
185
  default:
161
186
  return 0;
162
187
  }
@@ -308,6 +333,10 @@ export function inferPreferredDomains(query) {
308
333
  if (normalized.includes("gemini") || normalized.includes("google ai")) {
309
334
  matches.push("ai.google.dev", "developers.google.com");
310
335
  }
336
+ for (const socialHost of SOCIAL_HOSTS) {
337
+ const bareName = socialHost.replace(/\.com$/, "");
338
+ if (normalized.includes(bareName)) matches.push(socialHost);
339
+ }
311
340
 
312
341
  return [...new Set(matches)];
313
342
  }
@@ -318,7 +347,9 @@ export function domainMatches(hostname, candidate) {
318
347
 
319
348
  export function buildSourceRegistry(out, query = "") {
320
349
  const seen = new Map();
321
- const engineOrder = ["perplexity", "bing", "google"];
350
+ const engineOrder = Object.keys(out || {}).filter(
351
+ (key) => !key.startsWith("_"),
352
+ );
322
353
 
323
354
  // Get preferred domains for this query
324
355
  const preferredDomains = inferPreferredDomains(query);
@@ -359,10 +390,20 @@ export function buildSourceRegistry(out, query = "") {
359
390
  smartScore += 2;
360
391
  }
361
392
 
362
- // Penalize discussion forums for technical queries — high noise, rarely canonical.
363
- // Q&A sites (stackoverflow, stackexchange) are excluded: they often have the
364
- // best practical answer and shouldn't be penalised just because an official
365
- // domain also exists.
393
+ // Penalize discussion/social sites for technical queries — high noise,
394
+ // hard to fetch cleanly, and rarely canonical. Q&A sites (StackOverflow,
395
+ // StackExchange) are excluded from the community penalty.
396
+ //
397
+ // Social penalty is now −20 (was −12). The original −12 wasn't enough
398
+ // to overcome the +10 preferred-domain boost + clean rank, so a single
399
+ // social citation could land as S1. The post-sort demotion below
400
+ // is the hard guardrail on top.
401
+ const queryTargetsSocialHost = preferredDomains.some((pd) =>
402
+ domainMatches(domain, pd),
403
+ );
404
+ if (sourceType === "social" && !queryTargetsSocialHost) {
405
+ smartScore -= 20;
406
+ }
366
407
  if (preferredDomains.length > 0) {
367
408
  if (matchesDomain(domain, DISCUSSION_HOSTS)) {
368
409
  smartScore -= 3;
@@ -410,27 +451,38 @@ export function buildSourceRegistry(out, query = "") {
410
451
  }
411
452
  }
412
453
 
413
- const sources = Array.from(seen.values())
414
- .map((source) => ({
415
- ...source,
416
- engineCount: source.engines.length,
417
- }))
418
- .sort((a, b) => {
419
- // Single composite score so all signals contribute simultaneously.
420
- // Avoids rank being ignored when engineCount differs, and smartScore
421
- // dominating even when rank/type signal would break the tie better.
422
- const diff = computeCompositeScore(b) - computeCompositeScore(a);
423
- if (diff !== 0) return diff;
424
- return a.domain.localeCompare(b.domain);
425
- })
426
- .slice(0, 12)
427
- .map((source, index) => ({
428
- ...source,
429
- id: `S${index + 1}`,
430
- title: source.title || source.domain || source.canonicalUrl,
431
- }));
454
+ const sources = Array.from(seen.values()).map((source) => ({
455
+ ...source,
456
+ engineCount: source.engines.length,
457
+ }));
458
+
459
+ // Social hard guardrail: when the query doesn't explicitly target a
460
+ // social host (rare only happens for queries like "latest twitter
461
+ // announcement"), keep social sources OUT of the composite sort and
462
+ // pin them to the end of the registry. The smartScore −20 penalty
463
+ // above handles the "bare social gets a +10 boost" case, but a
464
+ // clean multi-engine social citation can still occasionally outscore
465
+ // a noisy single-engine academic source. This sort is the final say
466
+ // on what becomes S1, S2, etc.
467
+ const nonSocial = sources.filter((s) => s.sourceType !== "social");
468
+ const socialSources = sources.filter((s) => s.sourceType === "social");
469
+ nonSocial.sort((a, b) => {
470
+ const diff = computeCompositeScore(b) - computeCompositeScore(a);
471
+ if (diff !== 0) return diff;
472
+ return a.domain.localeCompare(b.domain);
473
+ });
474
+ socialSources.sort((a, b) => {
475
+ const diff = computeCompositeScore(b) - computeCompositeScore(a);
476
+ if (diff !== 0) return diff;
477
+ return a.domain.localeCompare(b.domain);
478
+ });
479
+ const ordered = [...nonSocial, ...socialSources];
432
480
 
433
- return sources;
481
+ return ordered.slice(0, 12).map((source, index) => ({
482
+ ...source,
483
+ id: `S${index + 1}`,
484
+ title: source.title || source.domain || source.canonicalUrl,
485
+ }));
434
486
  }
435
487
 
436
488
  export function mergeFetchDataIntoSources(sources, fetchedSources) {
@@ -1,45 +1,84 @@
1
- // src/search/synthesis-runner.mjs — Run Gemini synthesis via CDP
1
+ // src/search/synthesis-runner.mjs — Engine-agnostic synthesis via CDP extractors
2
2
  //
3
- // Extracted from search.mjs.
3
+ // The all-search synthesis layer builds a neutral prompt and can route it to a
4
+ // configured browser engine. Gemini remains the default synthesizer; ChatGPT is
5
+ // supported for users who opt in via ~/.pi/greedyconfig or --synthesizer.
4
6
 
5
7
  import { spawn } from "node:child_process";
6
8
  import { join } from "node:path";
7
- import { GREEDY_PROFILE_DIR } from "./constants.mjs";
9
+ import { GREEDY_PROFILE_DIR, SUPPORTED_SYNTHESIZERS } from "./constants.mjs";
8
10
  import {
9
11
  buildSynthesisPrompt,
10
12
  normalizeSynthesisPayload,
11
13
  parseStructuredJson,
12
14
  } from "./synthesis.mjs";
15
+ import { buildSourceRegistry } from "./sources.mjs";
13
16
 
14
17
  const __dir =
15
18
  import.meta.dirname ||
16
19
  new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
17
20
 
18
- export async function synthesizeWithGemini(
19
- query,
20
- results,
21
- { grounded = false, tabPrefix = null } = {},
21
+ const SYNTHESIS_EXTRACTORS = {
22
+ gemini: "gemini.mjs",
23
+ chatgpt: "chatgpt.mjs",
24
+ };
25
+
26
+ const SYNTHESIS_START_URLS = {
27
+ gemini: "https://gemini.google.com/app",
28
+ chatgpt: "https://chatgpt.com/",
29
+ };
30
+
31
+ export function normalizeSynthesizer(synthesizer = "gemini") {
32
+ const normalized = String(synthesizer || "gemini").toLowerCase();
33
+ if (normalized === "gem") return "gemini";
34
+ if (normalized === "gpt") return "chatgpt";
35
+ return normalized;
36
+ }
37
+
38
+ export function getSynthesisStartUrl(synthesizer = "gemini") {
39
+ return (
40
+ SYNTHESIS_START_URLS[normalizeSynthesizer(synthesizer)] || "about:blank"
41
+ );
42
+ }
43
+
44
+ export async function runSynthesisPrompt(
45
+ synthesizer,
46
+ prompt,
47
+ { tabPrefix = null, timeoutMs = 180000, visible = null } = {},
22
48
  ) {
23
- const sources = Array.isArray(results._sources)
24
- ? results._sources
25
- : buildSourceRegistry(results);
26
- const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
49
+ const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
50
+ const script = SYNTHESIS_EXTRACTORS[normalizedSynthesizer];
51
+ if (!script || !SUPPORTED_SYNTHESIZERS.includes(normalizedSynthesizer)) {
52
+ throw new Error(
53
+ `Unsupported synthesizer "${synthesizer}". Supported: ${SUPPORTED_SYNTHESIZERS.join(", ")}`,
54
+ );
55
+ }
27
56
 
28
57
  return new Promise((resolve, reject) => {
29
58
  const extraArgs = tabPrefix ? ["--tab", String(tabPrefix)] : [];
59
+ // Strip inherited visible-mode flags so a stale GREEDY_SEARCH_VISIBLE=1
60
+ // in the parent process doesn't force visible Chrome. Callers that
61
+ // genuinely want visible synthesis should pass visible: true explicitly.
62
+ const childEnv = {
63
+ ...process.env,
64
+ CDP_PROFILE_DIR: GREEDY_PROFILE_DIR,
65
+ };
66
+ if (visible !== true) {
67
+ delete childEnv.GREEDY_SEARCH_VISIBLE;
68
+ delete childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE;
69
+ } else {
70
+ childEnv.GREEDY_SEARCH_VISIBLE = "1";
71
+ childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
72
+ }
30
73
  const proc = spawn(
31
74
  process.execPath,
32
- [
33
- join(__dir, "..", "..", "extractors", "gemini.mjs"),
34
- "--stdin",
35
- ...extraArgs,
36
- ],
75
+ [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
37
76
  {
38
77
  stdio: ["pipe", "pipe", "pipe"],
39
- env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
78
+ env: childEnv,
40
79
  },
41
80
  );
42
- // Pipe synthesis prompt via stdin to avoid leaking the full prompt in process table
81
+ // Pipe prompts via stdin to avoid leaking them in process tables.
43
82
  proc.stdin.write(prompt);
44
83
  proc.stdin.end();
45
84
  let out = "";
@@ -48,49 +87,95 @@ export async function synthesizeWithGemini(
48
87
  proc.stderr.on("data", (d) => (err += d));
49
88
  const t = setTimeout(() => {
50
89
  proc.kill();
51
- reject(new Error("Gemini synthesis timed out after 180s"));
52
- }, 180000);
90
+ reject(
91
+ new Error(
92
+ `${normalizedSynthesizer} prompt timed out after ${timeoutMs / 1000}s`,
93
+ ),
94
+ );
95
+ }, timeoutMs);
53
96
  proc.on("close", (code) => {
54
97
  clearTimeout(t);
55
- if (code !== 0)
56
- reject(new Error(err.trim() || "gemini extractor failed"));
57
- else {
58
- try {
59
- const raw = JSON.parse(out.trim());
60
- let structured = parseStructuredJson(raw.answer || "");
61
-
62
- // Detect if Gemini echoed back the engine summaries instead of a synthesis.
63
- // Happens when Gemini can't synthesize (e.g. only 1 engine responded) and
64
- // echoes the prompt JSON. The engine summary JSON has per-engine keys
65
- // (perplexity/bing/google) but no synthesis fields (answer/agreement).
66
- const SYNTHESIS_FIELDS = [
67
- "answer",
68
- "agreement",
69
- "claims",
70
- "differences",
71
- "caveats",
72
- ];
73
- const hasSynthesisFields =
74
- structured && SYNTHESIS_FIELDS.some((f) => f in structured);
75
- const hasEngineKeys =
76
- structured &&
77
- ["perplexity", "bing", "google"].some((e) => e in structured);
78
- if (hasEngineKeys && !hasSynthesisFields) {
79
- structured = null; // Treat as parse failure — Gemini echoed input
80
- }
81
-
82
- resolve({
83
- ...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
84
- rawAnswer: raw.answer || "",
85
- geminiSources: raw.sources || [],
86
- });
87
- } catch {
88
- reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
89
- }
98
+ if (code !== 0) {
99
+ reject(
100
+ new Error(err.trim() || `${normalizedSynthesizer} extractor failed`),
101
+ );
102
+ return;
103
+ }
104
+ try {
105
+ resolve(JSON.parse(out.trim()));
106
+ } catch {
107
+ reject(
108
+ new Error(
109
+ `bad JSON from ${normalizedSynthesizer}: ${out.slice(0, 100)}`,
110
+ ),
111
+ );
90
112
  }
91
113
  });
92
114
  });
93
115
  }
94
116
 
95
- // Need to import buildSourceRegistry for fallback
96
- import { buildSourceRegistry } from "./sources.mjs";
117
+ // Backward-compatible Gemini helper used by research mode internals.
118
+ export async function runGeminiPrompt(prompt, options = {}) {
119
+ return runSynthesisPrompt("gemini", prompt, options);
120
+ }
121
+
122
+ export async function synthesizeResults(
123
+ query,
124
+ results,
125
+ {
126
+ grounded = false,
127
+ tabPrefix = null,
128
+ visible = null,
129
+ synthesizer = "gemini",
130
+ } = {},
131
+ ) {
132
+ const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
133
+ const sources = Array.isArray(results._sources)
134
+ ? results._sources
135
+ : buildSourceRegistry(results);
136
+ const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
137
+
138
+ const raw = await runSynthesisPrompt(normalizedSynthesizer, prompt, {
139
+ tabPrefix,
140
+ timeoutMs: 180000,
141
+ visible,
142
+ });
143
+ let structured = parseStructuredJson(raw.answer || "");
144
+
145
+ // Detect if the synthesizer echoed back the engine summaries instead of a
146
+ // synthesis. This can happen when it can't synthesize and mirrors prompt JSON.
147
+ const SYNTHESIS_FIELDS = [
148
+ "answer",
149
+ "agreement",
150
+ "claims",
151
+ "differences",
152
+ "caveats",
153
+ ];
154
+ const hasSynthesisFields =
155
+ structured && SYNTHESIS_FIELDS.some((f) => f in structured);
156
+ const hasEngineKeys =
157
+ structured &&
158
+ ["perplexity", "bing", "google", "chatgpt", "gemini"].some(
159
+ (e) => e in structured,
160
+ );
161
+ if (hasEngineKeys && !hasSynthesisFields) {
162
+ structured = null; // Treat as parse failure — synthesizer echoed input
163
+ }
164
+
165
+ return {
166
+ ...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
167
+ rawAnswer: raw.answer || "",
168
+ synthesizedBy: normalizedSynthesizer,
169
+ synthesizerSources: raw.sources || [],
170
+ // Backward-compatible field for existing consumers.
171
+ geminiSources: normalizedSynthesizer === "gemini" ? raw.sources || [] : [],
172
+ };
173
+ }
174
+
175
+ // Backward-compatible all-search synthesis helper.
176
+ export async function synthesizeWithGemini(query, results, options = {}) {
177
+ return synthesizeResults(query, results, {
178
+ ...options,
179
+ synthesizer: "gemini",
180
+ });
181
+ }