@apmantza/greedysearch-pi 1.9.2 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/CHANGELOG.md +132 -2
  2. package/README.md +82 -47
  3. package/bin/cdp.mjs +1153 -1108
  4. package/bin/launch.mjs +9 -0
  5. package/bin/search.mjs +318 -81
  6. package/extractors/bing-copilot.mjs +48 -18
  7. package/extractors/chatgpt.mjs +553 -0
  8. package/extractors/common.mjs +213 -22
  9. package/extractors/consensus.mjs +655 -0
  10. package/extractors/consent.mjs +182 -18
  11. package/extractors/gemini.mjs +350 -217
  12. package/extractors/google-ai.mjs +129 -128
  13. package/extractors/logically.mjs +629 -0
  14. package/extractors/perplexity.mjs +547 -217
  15. package/extractors/selectors.mjs +3 -2
  16. package/extractors/semantic-scholar.mjs +219 -0
  17. package/package.json +8 -4
  18. package/skills/greedy-search/skill.md +20 -12
  19. package/src/fetcher.mjs +23 -1
  20. package/src/formatters/results.ts +185 -128
  21. package/src/search/browser-lifecycle.mjs +27 -5
  22. package/src/search/challenge-detect.mjs +205 -0
  23. package/src/search/chrome.mjs +653 -590
  24. package/src/search/constants.mjs +155 -39
  25. package/src/search/engines.mjs +114 -76
  26. package/src/search/fetch-source.mjs +566 -451
  27. package/src/search/pdf.mjs +68 -0
  28. package/src/search/progress.mjs +145 -0
  29. package/src/search/recovery.mjs +73 -45
  30. package/src/search/research.mjs +1419 -62
  31. package/src/search/scale-aware.mjs +93 -0
  32. package/src/search/simple-research.mjs +520 -0
  33. package/src/search/sources.mjs +52 -22
  34. package/src/search/synthesis-runner.mjs +105 -26
  35. package/src/search/synthesis.mjs +286 -246
  36. package/src/tools/greedy-search-handler.ts +129 -59
  37. package/src/tools/shared.ts +312 -186
  38. package/src/types.ts +110 -104
  39. package/test.mjs +537 -18
@@ -132,6 +132,16 @@ export function classifySourceType(domain, title = "", rawUrl = "") {
132
132
  const lowerUrl = rawUrl.toLowerCase();
133
133
 
134
134
  if (domain === "github.com" || domain === "gitlab.com") return "repo";
135
+ if (
136
+ domain === "arxiv.org" ||
137
+ domain === "doi.org" ||
138
+ domain === "semanticscholar.org" ||
139
+ domain.endsWith(".semanticscholar.org") ||
140
+ lowerUrl.includes("/paper/") ||
141
+ lowerUrl.includes("/pdf/")
142
+ ) {
143
+ return "academic";
144
+ }
135
145
  if (matchesDomain(domain, SOCIAL_HOSTS)) return "social";
136
146
  if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
137
147
  if (matchesDomain(domain, NEWS_HOSTS)) return "news";
@@ -160,6 +170,8 @@ export function sourceTypePriority(sourceType) {
160
170
  return 5;
161
171
  case "repo":
162
172
  return 4;
173
+ case "academic":
174
+ return 4;
163
175
  case "maintainer-blog":
164
176
  return 3;
165
177
  case "website":
@@ -335,7 +347,9 @@ export function domainMatches(hostname, candidate) {
335
347
 
336
348
  export function buildSourceRegistry(out, query = "") {
337
349
  const seen = new Map();
338
- const engineOrder = ["perplexity", "bing", "google"];
350
+ const engineOrder = Object.keys(out || {}).filter(
351
+ (key) => !key.startsWith("_"),
352
+ );
339
353
 
340
354
  // Get preferred domains for this query
341
355
  const preferredDomains = inferPreferredDomains(query);
@@ -379,11 +393,16 @@ export function buildSourceRegistry(out, query = "") {
379
393
  // Penalize discussion/social sites for technical queries — high noise,
380
394
  // hard to fetch cleanly, and rarely canonical. Q&A sites (StackOverflow,
381
395
  // StackExchange) are excluded from the community penalty.
396
+ //
397
+ // Social penalty is now −20 (was −12). The original −12 wasn't enough
398
+ // to overcome the +10 preferred-domain boost + clean rank, so a single
399
+ // social citation could land as S1. The post-sort demotion below
400
+ // is the hard guardrail on top.
382
401
  const queryTargetsSocialHost = preferredDomains.some((pd) =>
383
402
  domainMatches(domain, pd),
384
403
  );
385
404
  if (sourceType === "social" && !queryTargetsSocialHost) {
386
- smartScore -= 12;
405
+ smartScore -= 20;
387
406
  }
388
407
  if (preferredDomains.length > 0) {
389
408
  if (matchesDomain(domain, DISCUSSION_HOSTS)) {
@@ -432,27 +451,38 @@ export function buildSourceRegistry(out, query = "") {
432
451
  }
433
452
  }
434
453
 
435
- const sources = Array.from(seen.values())
436
- .map((source) => ({
437
- ...source,
438
- engineCount: source.engines.length,
439
- }))
440
- .sort((a, b) => {
441
- // Single composite score so all signals contribute simultaneously.
442
- // Avoids rank being ignored when engineCount differs, and smartScore
443
- // dominating even when rank/type signal would break the tie better.
444
- const diff = computeCompositeScore(b) - computeCompositeScore(a);
445
- if (diff !== 0) return diff;
446
- return a.domain.localeCompare(b.domain);
447
- })
448
- .slice(0, 12)
449
- .map((source, index) => ({
450
- ...source,
451
- id: `S${index + 1}`,
452
- title: source.title || source.domain || source.canonicalUrl,
453
- }));
454
+ const sources = Array.from(seen.values()).map((source) => ({
455
+ ...source,
456
+ engineCount: source.engines.length,
457
+ }));
458
+
459
+ // Social hard guardrail: when the query doesn't explicitly target a
460
+ // social host (rare — only happens for queries like "latest twitter
461
+ // announcement"), keep social sources OUT of the composite sort and
462
+ // pin them to the end of the registry. The smartScore −20 penalty
463
+ // above handles the "bare social gets a +10 boost" case, but a
464
+ // clean multi-engine social citation can still occasionally outscore
465
+ // a noisy single-engine academic source. This sort is the final say
466
+ // on what becomes S1, S2, etc.
467
+ const nonSocial = sources.filter((s) => s.sourceType !== "social");
468
+ const socialSources = sources.filter((s) => s.sourceType === "social");
469
+ nonSocial.sort((a, b) => {
470
+ const diff = computeCompositeScore(b) - computeCompositeScore(a);
471
+ if (diff !== 0) return diff;
472
+ return a.domain.localeCompare(b.domain);
473
+ });
474
+ socialSources.sort((a, b) => {
475
+ const diff = computeCompositeScore(b) - computeCompositeScore(a);
476
+ if (diff !== 0) return diff;
477
+ return a.domain.localeCompare(b.domain);
478
+ });
479
+ const ordered = [...nonSocial, ...socialSources];
454
480
 
455
- return sources;
481
+ return ordered.slice(0, 12).map((source, index) => ({
482
+ ...source,
483
+ id: `S${index + 1}`,
484
+ title: source.title || source.domain || source.canonicalUrl,
485
+ }));
456
486
  }
457
487
 
458
488
  export function mergeFetchDataIntoSources(sources, fetchedSources) {
@@ -1,36 +1,81 @@
1
- // src/search/synthesis-runner.mjs — Run Gemini synthesis via CDP
1
+ // src/search/synthesis-runner.mjs — Engine-agnostic synthesis via CDP extractors
2
2
  //
3
- // Extracted from search.mjs.
3
+ // The all-search synthesis layer builds a neutral prompt and can route it to a
4
+ // configured browser engine. Gemini remains the default synthesizer; ChatGPT is
5
+ // supported for users who opt in via ~/.pi/greedyconfig or --synthesizer.
4
6
 
5
7
  import { spawn } from "node:child_process";
6
8
  import { join } from "node:path";
7
- import { GREEDY_PROFILE_DIR } from "./constants.mjs";
9
+ import { GREEDY_PROFILE_DIR, SUPPORTED_SYNTHESIZERS } from "./constants.mjs";
8
10
  import {
9
11
  buildSynthesisPrompt,
10
12
  normalizeSynthesisPayload,
11
13
  parseStructuredJson,
12
14
  } from "./synthesis.mjs";
15
+ import { buildSourceRegistry } from "./sources.mjs";
13
16
 
14
17
  const __dir =
15
18
  import.meta.dirname ||
16
19
  new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
17
20
 
18
- export async function runGeminiPrompt(
21
+ const SYNTHESIS_EXTRACTORS = {
22
+ gemini: "gemini.mjs",
23
+ chatgpt: "chatgpt.mjs",
24
+ };
25
+
26
+ const SYNTHESIS_START_URLS = {
27
+ gemini: "https://gemini.google.com/app",
28
+ chatgpt: "https://chatgpt.com/",
29
+ };
30
+
31
+ export function normalizeSynthesizer(synthesizer = "gemini") {
32
+ const normalized = String(synthesizer || "gemini").toLowerCase();
33
+ if (normalized === "gem") return "gemini";
34
+ if (normalized === "gpt") return "chatgpt";
35
+ return normalized;
36
+ }
37
+
38
+ export function getSynthesisStartUrl(synthesizer = "gemini") {
39
+ return (
40
+ SYNTHESIS_START_URLS[normalizeSynthesizer(synthesizer)] || "about:blank"
41
+ );
42
+ }
43
+
44
+ export async function runSynthesisPrompt(
45
+ synthesizer,
19
46
  prompt,
20
- { tabPrefix = null, timeoutMs = 180000 } = {},
47
+ { tabPrefix = null, timeoutMs = 180000, visible = null } = {},
21
48
  ) {
49
+ const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
50
+ const script = SYNTHESIS_EXTRACTORS[normalizedSynthesizer];
51
+ if (!script || !SUPPORTED_SYNTHESIZERS.includes(normalizedSynthesizer)) {
52
+ throw new Error(
53
+ `Unsupported synthesizer "${synthesizer}". Supported: ${SUPPORTED_SYNTHESIZERS.join(", ")}`,
54
+ );
55
+ }
56
+
22
57
  return new Promise((resolve, reject) => {
23
58
  const extraArgs = tabPrefix ? ["--tab", String(tabPrefix)] : [];
59
+ // Strip inherited visible-mode flags so a stale GREEDY_SEARCH_VISIBLE=1
60
+ // in the parent process doesn't force visible Chrome. Callers that
61
+ // genuinely want visible synthesis should pass visible: true explicitly.
62
+ const childEnv = {
63
+ ...process.env,
64
+ CDP_PROFILE_DIR: GREEDY_PROFILE_DIR,
65
+ };
66
+ if (visible !== true) {
67
+ delete childEnv.GREEDY_SEARCH_VISIBLE;
68
+ delete childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE;
69
+ } else {
70
+ childEnv.GREEDY_SEARCH_VISIBLE = "1";
71
+ childEnv.GREEDY_SEARCH_ALWAYS_VISIBLE = "1";
72
+ }
24
73
  const proc = spawn(
25
74
  process.execPath,
26
- [
27
- join(__dir, "..", "..", "extractors", "gemini.mjs"),
28
- "--stdin",
29
- ...extraArgs,
30
- ],
75
+ [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
31
76
  {
32
77
  stdio: ["pipe", "pipe", "pipe"],
33
- env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
78
+ env: childEnv,
34
79
  },
35
80
  );
36
81
  // Pipe prompts via stdin to avoid leaking them in process tables.
@@ -42,40 +87,63 @@ export async function runGeminiPrompt(
42
87
  proc.stderr.on("data", (d) => (err += d));
43
88
  const t = setTimeout(() => {
44
89
  proc.kill();
45
- reject(new Error(`Gemini prompt timed out after ${timeoutMs / 1000}s`));
90
+ reject(
91
+ new Error(
92
+ `${normalizedSynthesizer} prompt timed out after ${timeoutMs / 1000}s`,
93
+ ),
94
+ );
46
95
  }, timeoutMs);
47
96
  proc.on("close", (code) => {
48
97
  clearTimeout(t);
49
98
  if (code !== 0) {
50
- reject(new Error(err.trim() || "gemini extractor failed"));
99
+ reject(
100
+ new Error(err.trim() || `${normalizedSynthesizer} extractor failed`),
101
+ );
51
102
  return;
52
103
  }
53
104
  try {
54
105
  resolve(JSON.parse(out.trim()));
55
106
  } catch {
56
- reject(new Error(`bad JSON from gemini: ${out.slice(0, 100)}`));
107
+ reject(
108
+ new Error(
109
+ `bad JSON from ${normalizedSynthesizer}: ${out.slice(0, 100)}`,
110
+ ),
111
+ );
57
112
  }
58
113
  });
59
114
  });
60
115
  }
61
116
 
62
- export async function synthesizeWithGemini(
117
+ // Backward-compatible Gemini helper used by research mode internals.
118
+ export async function runGeminiPrompt(prompt, options = {}) {
119
+ return runSynthesisPrompt("gemini", prompt, options);
120
+ }
121
+
122
+ export async function synthesizeResults(
63
123
  query,
64
124
  results,
65
- { grounded = false, tabPrefix = null } = {},
125
+ {
126
+ grounded = false,
127
+ tabPrefix = null,
128
+ visible = null,
129
+ synthesizer = "gemini",
130
+ } = {},
66
131
  ) {
132
+ const normalizedSynthesizer = normalizeSynthesizer(synthesizer);
67
133
  const sources = Array.isArray(results._sources)
68
134
  ? results._sources
69
135
  : buildSourceRegistry(results);
70
136
  const prompt = buildSynthesisPrompt(query, results, sources, { grounded });
71
137
 
72
- const raw = await runGeminiPrompt(prompt, { tabPrefix, timeoutMs: 180000 });
138
+ const raw = await runSynthesisPrompt(normalizedSynthesizer, prompt, {
139
+ tabPrefix,
140
+ timeoutMs: 180000,
141
+ visible,
142
+ });
73
143
  let structured = parseStructuredJson(raw.answer || "");
74
144
 
75
- // Detect if Gemini echoed back the engine summaries instead of a synthesis.
76
- // Happens when Gemini can't synthesize (e.g. only 1 engine responded) and
77
- // echoes the prompt JSON. The engine summary JSON has per-engine keys
78
- // (perplexity/bing/google) but no synthesis fields (answer/agreement).
145
+ // Detect if the synthesizer echoed back the engine summaries instead of a
146
+ // synthesis. This can happen when it can't synthesize and mirrors prompt JSON.
79
147
  const SYNTHESIS_FIELDS = [
80
148
  "answer",
81
149
  "agreement",
@@ -86,17 +154,28 @@ export async function synthesizeWithGemini(
86
154
  const hasSynthesisFields =
87
155
  structured && SYNTHESIS_FIELDS.some((f) => f in structured);
88
156
  const hasEngineKeys =
89
- structured && ["perplexity", "bing", "google"].some((e) => e in structured);
157
+ structured &&
158
+ ["perplexity", "bing", "google", "chatgpt", "gemini"].some(
159
+ (e) => e in structured,
160
+ );
90
161
  if (hasEngineKeys && !hasSynthesisFields) {
91
- structured = null; // Treat as parse failure — Gemini echoed input
162
+ structured = null; // Treat as parse failure — synthesizer echoed input
92
163
  }
93
164
 
94
165
  return {
95
166
  ...normalizeSynthesisPayload(structured, sources, raw.answer || ""),
96
167
  rawAnswer: raw.answer || "",
97
- geminiSources: raw.sources || [],
168
+ synthesizedBy: normalizedSynthesizer,
169
+ synthesizerSources: raw.sources || [],
170
+ // Backward-compatible field for existing consumers.
171
+ geminiSources: normalizedSynthesizer === "gemini" ? raw.sources || [] : [],
98
172
  };
99
173
  }
100
174
 
101
- // Need to import buildSourceRegistry for fallback
102
- import { buildSourceRegistry } from "./sources.mjs";
175
+ // Backward-compatible all-search synthesis helper.
176
+ export async function synthesizeWithGemini(query, results, options = {}) {
177
+ return synthesizeResults(query, results, {
178
+ ...options,
179
+ synthesizer: "gemini",
180
+ });
181
+ }