@apmantza/greedysearch-pi 1.8.9 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +1,76 @@
1
- // src/search/engines.mjs — Extractor runner
2
- //
3
- // Engine map lives in constants.mjs; this module re-exports it for
4
- // backward compatibility and provides the runExtractor() function.
5
-
6
- import { spawn } from "node:child_process";
7
- import { join } from "node:path";
8
- import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
-
10
- export { ENGINES };
11
-
12
- const __dir =
13
- import.meta.dirname ||
14
- new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
-
16
- export function runExtractor(
17
- script,
18
- query,
19
- tabPrefix = null,
20
- short = false,
21
- timeoutMs = null,
22
- locale = null,
23
- ) {
24
- // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
25
- // Other engines: 60s budget
26
- if (timeoutMs === null) {
27
- timeoutMs = script.includes("gemini") ? 70000 : 60000;
28
- }
29
- const extraArgs = [
30
- ...(tabPrefix ? ["--tab", tabPrefix] : []),
31
- ...(short ? ["--short"] : []),
32
- ...(locale ? ["--locale", locale] : []),
33
- ];
34
- return new Promise((resolve, reject) => {
35
- const proc = spawn(
36
- process.execPath,
37
- [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
38
- {
39
- stdio: ["pipe", "pipe", "pipe"],
40
- env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
41
- },
42
- );
43
- // Pipe query via stdin to avoid leaking it in process table command-line
44
- proc.stdin.write(query);
45
- proc.stdin.end();
46
- let out = "";
47
- let err = "";
48
- proc.stdout.on("data", (d) => (out += d));
49
- proc.stderr.on("data", (d) => (err += d));
50
- const t = setTimeout(() => {
51
- proc.kill();
52
- reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
53
- }, timeoutMs);
54
- proc.on("close", (code) => {
55
- clearTimeout(t);
56
- if (code === 0) {
57
- try {
58
- resolve(JSON.parse(out.trim()));
59
- } catch {
60
- reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
61
- }
62
- } else {
63
- reject(new Error(err.trim() || `extractor exit ${code}`));
64
- }
65
- });
66
- });
67
- }
1
+ // src/search/engines.mjs — Extractor runner
2
+ //
3
+ // Engine map lives in constants.mjs; this module re-exports it for
4
+ // backward compatibility and provides the runExtractor() function.
5
+
6
+ import { spawn } from "node:child_process";
7
+ import { join } from "node:path";
8
+ import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
9
+
10
+ export { ENGINES };
11
+
12
+ const __dir =
13
+ import.meta.dirname ||
14
+ new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
15
+
16
/**
 * Run an extractor script in a child Node process and resolve with the JSON
 * it prints on stdout.
 *
 * The query is written to the child's stdin (the child is started with
 * `--stdin`) so it never appears on the command line.
 *
 * @param {string} script - Extractor file name, resolved under `../../extractors`
 *   relative to this module's directory.
 * @param {string} query - Text written to the child's stdin.
 * @param {string|null} [tabPrefix=null] - When truthy, forwarded as `--tab <tabPrefix>`.
 * @param {boolean} [short=false] - When truthy, adds `--short`.
 * @param {number|null} [timeoutMs=null] - Time budget in ms; `null` selects
 *   70000 for scripts whose name contains "gemini" and 60000 otherwise.
 * @param {string|null} [locale=null] - When truthy, forwarded as `--locale <locale>`.
 * @returns {Promise<any>} Resolves with the parsed stdout JSON when the child
 *   exits with code 0. Rejects with an `Error` when the child cannot be
 *   started, times out, prints unparsable JSON, or exits non-zero. On a
 *   non-zero exit the error carries an `envelope` property when stdout held a
 *   JSON object with an `_envelope` field.
 */
export function runExtractor(
  script,
  query,
  tabPrefix = null,
  short = false,
  timeoutMs = null,
  locale = null,
) {
  // Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
  // Other engines: 60s budget
  const budgetMs = timeoutMs ?? (script.includes("gemini") ? 70000 : 60000);
  const extraArgs = [
    ...(tabPrefix ? ["--tab", tabPrefix] : []),
    ...(short ? ["--short"] : []),
    ...(locale ? ["--locale", locale] : []),
  ];

  return new Promise((resolve, reject) => {
    const proc = spawn(
      process.execPath,
      [join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
      {
        stdio: ["pipe", "pipe", "pipe"],
        env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
      },
    );

    // Settle exactly once: timeout, spawn error and close can all fire for
    // the same child, and only the first outcome should be reported.
    let settled = false;
    let timer = null;
    const settle = (fn, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      fn(value);
    };

    // Without an 'error' listener a failed spawn is thrown as an uncaught
    // exception and this promise would never settle.
    proc.on("error", (cause) => {
      settle(
        reject,
        new Error(`failed to run ${script}: ${cause.message}`, { cause }),
      );
    });

    // The child may exit before reading stdin (EPIPE). That failure is
    // reported through the exit code / stderr in the 'close' handler, so the
    // stream error itself only needs to be kept from crashing the process.
    proc.stdin.on("error", () => {});

    // Pipe query via stdin to avoid leaking it in process table command-line
    proc.stdin.write(query);
    proc.stdin.end();

    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk Buffer-to-string coercion.
    proc.stdout.setEncoding("utf8");
    proc.stderr.setEncoding("utf8");
    let out = "";
    let err = "";
    proc.stdout.on("data", (chunk) => {
      out += chunk;
    });
    proc.stderr.on("data", (chunk) => {
      err += chunk;
    });

    timer = setTimeout(() => {
      proc.kill();
      settle(
        reject,
        new Error(`${script} timed out after ${budgetMs / 1000}s`),
      );
    }, budgetMs);

    proc.on("close", (code) => {
      if (settled) return; // already reported (timeout or spawn error)

      if (code === 0) {
        let parsed;
        try {
          parsed = JSON.parse(out.trim());
        } catch {
          settle(
            reject,
            new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`),
          );
          return;
        }
        settle(resolve, parsed);
        return;
      }

      // Try to parse structured error envelope from stdout before falling back
      let envelope = null;
      try {
        const parsed = JSON.parse(out.trim());
        if (parsed?._envelope) envelope = parsed._envelope;
      } catch {
        // Best effort: stdout of a failed run is often empty or not JSON, in
        // which case the error is reported without an envelope.
      }
      const failure = new Error(err.trim() || `extractor exit ${code}`);
      if (envelope) failure.envelope = envelope;
      settle(reject, failure);
    });
  });
}
@@ -0,0 +1,46 @@
1
+ // src/search/file-sources.mjs — Write fetched source content to disk,
2
+ // return file paths instead of inline content. Token-efficient output.
3
+
4
+ import { mkdirSync, writeFileSync } from "node:fs";
5
+ import { join } from "node:path";
6
+
7
const DEFAULT_DIR = join(process.cwd(), ".pi", "greedysearch-sources");

/**
 * Collapse a value onto a single line so it cannot break out of its
 * front-matter field (an embedded newline could otherwise start a new
 * key or close the `---` header early).
 *
 * @param {*} value - Value to render; converted with String().
 * @returns {string} The value with every line break (and the whitespace
 *   around it) replaced by one space, trimmed.
 */
function toFrontMatterLine(value) {
  return String(value)
    .replace(/\s*[\r\n\u2028\u2029]+\s*/g, " ")
    .trim();
}

/**
 * Write fetched source content to files and replace inline content with paths.
 * Keeps all other fields (metadata, snippets) inline for quick reference.
 *
 * Each written file is named `<id>-<url slug>.md` and starts with a
 * `---`-delimited header holding url, title, source, status and chars,
 * followed by the full content.
 *
 * @param {Array} fetchedSources — source records; each may carry `id`, `url`,
 *   `canonicalUrl`, `finalUrl`, `title`, `source`, `status`, `content`,
 *   `contentChars`
 * @param {string} [dir] — directory to write files (default: .pi/greedysearch-sources
 *   under the working directory at module load); created if missing
 * @returns {Array} one entry per input: records with missing or very short
 *   (< 10 chars) content are returned as-is; the rest have `content` removed
 *   and `contentPath` / `contentChars` added
 */
export function writeSourcesToFiles(fetchedSources, dir = DEFAULT_DIR) {
  mkdirSync(dir, { recursive: true });

  return fetchedSources.map((source) => {
    if (!source.content || source.content.length < 10) {
      // No content to write — keep as-is
      return source;
    }

    // Restrict both filename parts to a safe character set so the id and URL
    // cannot introduce path separators.
    const safeId = String(source.id || "unknown").replace(/[^a-zA-Z0-9_-]/g, "");
    const urlSlug = (source.canonicalUrl || source.url || "")
      .replace(/^https?:\/\//, "")
      .replace(/[^a-zA-Z0-9]/g, "-")
      .slice(0, 40);
    const filepath = join(dir, `${safeId}-${urlSlug}.md`);

    const { content, ...rest } = source;
    const contentChars = source.contentChars || content.length;

    // Text fields come from fetched pages, so each one is flattened to a
    // single line before being placed in the header.
    const header = [
      "---",
      `url: ${toFrontMatterLine(source.finalUrl || source.url)}`,
      `title: ${toFrontMatterLine(source.title || "")}`,
      `source: ${toFrontMatterLine(source.source || "unknown")}`,
      `status: ${toFrontMatterLine(source.status || "")}`,
      `chars: ${contentChars}`,
      "---",
      "",
      "",
    ].join("\n");

    // Write full content to file
    writeFileSync(filepath, header + content, "utf8");

    // Return stripped object — content replaced by path
    return {
      ...rest,
      contentPath: filepath,
      contentChars,
    };
  });
}
@@ -0,0 +1,49 @@
1
+ // src/search/query.mjs — Query normalization for search engine input
2
+ //
3
+ // Two universal transforms applied to all engines:
4
+ // 1. stripPreamble — remove agent-generated conversational openers
5
+ // 2. addRecencyHint — append current year for temporally-sensitive queries
6
+ //
7
+ // Note: Google udm=50 is an AI mode with the same query understanding as
8
+ // natural-language question form — keyword conversion adds no benefit there.
9
+
10
// Agent preambles that add no search signal.
// The trailing filler pronoun is anchored with \b so that ordinary words which
// merely begin with "it" or "this" ("items", "iterator", "thistle") are not
// truncated.
const PREAMBLE_RX = /^(can you |could you |please |would you mind |i need to (know|understand) |i want to (know|understand) |i('m| am) (looking for|wondering about|curious about) |i need (information|info) (about|on) |tell me )?(about |explain |describe |give me |help me understand |search for |look up |find |research )?(about |regarding |on |for )?((it|this|the following)\b)?\s*/i;

// Temporal keywords that indicate recency sensitivity
const TEMPORAL_RX = /\b(latest|newest|current|recent|up-to-date|up to date)\b/i;

// Version numbers and years — if already present, don't add year
const VERSION_RX = /\b\d+\.\d+|\bv\d+\b|\b20(2[0-9]|[3-9]\d)\b/i;

/**
 * Strip common agent-generated preambles that add no search signal.
 * "Can you explain how React hooks work?" → "how React hooks work?"
 *
 * If stripping leaves 4 characters or fewer, the trimmed original is returned
 * instead so a query is never reduced to nothing.
 *
 * @param {string} query - Raw query text.
 * @returns {string} The trimmed query with any leading preamble removed.
 */
export function stripPreamble(query) {
  const trimmed = query.trim();
  const stripped = trimmed.replace(PREAMBLE_RX, "").trim();
  return stripped.length > 4 ? stripped : trimmed;
}
27
+
28
/**
 * Add a year to queries that ask for something recent but do not already pin
 * a version number or year, so engines are less likely to blend old and new
 * results.
 * "latest FastAPI best practices" → "latest FastAPI best practices 2026"
 *
 * @param {string} query - Query text to inspect.
 * @param {number} [year] - Year to append; defaults to the current year.
 * @returns {string} The query with trailing whitespace removed and the year
 *   appended, or the query unchanged when no recency keyword is present or a
 *   version/year is already there.
 */
export function addRecencyHint(query, year = new Date().getFullYear()) {
  const needsYear = TEMPORAL_RX.test(query) && !VERSION_RX.test(query);
  if (needsYear) {
    const base = query.trimEnd();
    return `${base} ${year}`;
  }
  return query;
}
38
+
39
/**
 * Complete normalization pipeline: strip the preamble, then add a recency
 * hint. The same transforms are applied regardless of engine.
 *
 * @param {string} query - Raw query text.
 * @returns {string} The normalized query. A missing or whitespace-only query
 *   is returned untouched, as is the original if the transforms yield an
 *   empty string.
 */
export function normalizeQuery(query) {
  const hasText = Boolean(query?.trim());
  if (!hasText) {
    return query;
  }
  const normalized = addRecencyHint(stripPreamble(query));
  return normalized ? normalized : query;
}
@@ -20,7 +20,26 @@ export function isManualVerificationError(error) {
20
20
 
21
21
/**
 * List the recovery-eligible engines whose result indicates headless blocking.
 *
 * An engine is reported when its result carries an `_envelope` with a truthy
 * `blockedBy` or a `verificationResult` of "needs-human", or, failing that,
 * when its `error` value is recognised by isHeadlessBlockedError().
 *
 * @param {object} resultsByEngine - Results keyed by engine name; may be nullish.
 * @returns {Array} The matching entries of HEADLESS_RECOVERY_ENGINES.
 */
export function findHeadlessBlockedEngines(resultsByEngine) {
  return HEADLESS_RECOVERY_ENGINES.filter((engine) => {
    const record = resultsByEngine?.[engine];
    if (!record) {
      return false;
    }

    // Structured signal first: no string matching needed when it is present.
    const envelope = record._envelope;
    if (envelope?.blockedBy || envelope?.verificationResult === "needs-human") {
      return true;
    }

    // Legacy path: errors reported as plain strings.
    const failure = record.error;
    return Boolean(failure) && isHeadlessBlockedError(failure);
  });
}
33
+
34
/**
 * Decide whether a caught extractor Error indicates headless blocking.
 * Meant for single-engine recovery paths where the Error object itself is
 * available rather than a parsed result record.
 *
 * @param {Error} error - The caught error; may be nullish.
 * @returns {boolean|*} `false` for a missing error, `true` when its
 *   `envelope` has a truthy `blockedBy` or a `verificationResult` of
 *   "needs-human", otherwise whatever isHeadlessBlockedError() returns for
 *   the error message.
 */
export function isHeadlessBlockedResult(error) {
  if (!error) {
    return false;
  }
  const envelope = error.envelope;
  const flaggedByEnvelope =
    Boolean(envelope?.blockedBy) ||
    envelope?.verificationResult === "needs-human";
  return flaggedByEnvelope ? true : isHeadlessBlockedError(error.message);
}
@@ -167,6 +167,25 @@ export function bestRank(source) {
167
167
  return ranks.length ? Math.min(...ranks) : 99;
168
168
  }
169
169
 
170
+ // Discussion-only hosts that get a stronger penalty vs. general community hosts.
171
+ // Q&A sites (stackoverflow, stackexchange) are intentionally excluded.
172
+ const DISCUSSION_HOSTS = ["reddit.com", "news.ycombinator.com", "lobste.rs"];
173
+
174
/**
 * Fold every ranking signal into one number so they all contribute at once
 * rather than acting as successive tiebreakers.
 *
 * score = smartScore × 3 + engineCount × 5 + sourceTypePriority × 2
 *         + max(0, 7 − bestRank)
 *
 * The weights are intended to let a query-relevant official source ranked
 * first by a single engine outrank multi-engine consensus on generic sites,
 * while consensus still outranks a single-engine community post.
 *
 * @param {object} source - Record with `smartScore`, `engineCount` and
 *   `sourceType`; also passed to bestRank().
 * @returns {number} The combined score (higher is better).
 */
export function computeCompositeScore(source) {
  const relevance = source.smartScore * 3;
  const consensus = source.engineCount * 5;
  const typeBonus = sourceTypePriority(source.sourceType) * 2;
  // Rank contributes only while it is better than 7; worse ranks add nothing.
  const rankBonus = Math.max(0, 7 - bestRank(source));
  return relevance + consensus + typeBonus + rankBonus;
}
188
+
170
189
  export function inferPreferredDomains(query) {
171
190
  const normalized = query.toLowerCase();
172
191
  const matches = [];
@@ -340,9 +359,19 @@ export function buildSourceRegistry(out, query = "") {
340
359
  smartScore += 2;
341
360
  }
342
361
 
343
- // Penalize community/discussion sites for technical queries
344
- if (sourceType === "community" && preferredDomains.length > 0) {
345
- smartScore -= 2;
362
+ // Penalize discussion forums for technical queries — high noise, rarely canonical.
363
+ // Q&A sites (stackoverflow, stackexchange) are excluded: they often have the
364
+ // best practical answer and shouldn't be penalised just because an official
365
+ // domain also exists.
366
+ if (preferredDomains.length > 0) {
367
+ if (matchesDomain(domain, DISCUSSION_HOSTS)) {
368
+ smartScore -= 3;
369
+ } else if (
370
+ sourceType === "community" &&
371
+ !matchesDomain(domain, ["stackoverflow.com", "stackexchange.com"])
372
+ ) {
373
+ smartScore -= 1;
374
+ }
346
375
  }
347
376
 
348
377
  const existing = seen.get(canonicalUrl) || {
@@ -387,24 +416,11 @@ export function buildSourceRegistry(out, query = "") {
387
416
  engineCount: source.engines.length,
388
417
  }))
389
418
  .sort((a, b) => {
390
- // Primary: smart score (query-aware domain boosting)
391
- if (b.smartScore !== a.smartScore) return b.smartScore - a.smartScore;
392
-
393
- // Secondary: consensus (sources found by more engines)
394
- if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;
395
-
396
- // Tertiary: source type priority
397
- if (
398
- sourceTypePriority(b.sourceType) !== sourceTypePriority(a.sourceType)
399
- ) {
400
- return (
401
- sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType)
402
- );
403
- }
404
-
405
- // Quaternary: best rank across engines
406
- if (bestRank(a) !== bestRank(b)) return bestRank(a) - bestRank(b);
407
-
419
+ // Single composite score so all signals contribute simultaneously.
420
+ // Avoids rank being ignored when engineCount differs, and smartScore
421
+ // dominating even when rank/type signal would break the tie better.
422
+ const diff = computeCompositeScore(b) - computeCompositeScore(a);
423
+ if (diff !== 0) return diff;
408
424
  return a.domain.localeCompare(b.domain);
409
425
  })
410
426
  .slice(0, 12)
@@ -152,6 +152,10 @@ export function buildSynthesisPrompt(
152
152
  };
153
153
  }
154
154
 
155
+ // Snippet budget: always include content for fetched sources so Gemini can
156
+ // make citation decisions based on what the sources actually say, not just
157
+ // their metadata. Grounded mode gets a larger budget per source.
158
+ const snippetChars = grounded ? 700 : 300;
155
159
  const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
156
160
  id: source.id,
157
161
  title: source.title,
@@ -161,37 +165,44 @@ export function buildSynthesisPrompt(
161
165
  isOfficial: source.isOfficial,
162
166
  engines: source.engines,
163
167
  engineCount: source.engineCount,
164
- perEngine: source.perEngine,
165
168
  fetch: source.fetch?.attempted
166
169
  ? {
167
170
  ok: source.fetch.ok,
168
- status: source.fetch.status,
169
171
  publishedTime: source.fetch.publishedTime || "",
170
- lastModified: source.fetch.lastModified || "",
171
172
  byline: source.fetch.byline || "",
172
- siteName: source.fetch.siteName || "",
173
- ...(grounded
174
- ? { snippet: trimText(source.fetch.snippet || "", 700) }
175
- : {}),
173
+ snippet: trimText(source.fetch.snippet || "", snippetChars),
176
174
  }
177
175
  : undefined,
178
176
  }));
179
177
 
180
178
  return [
181
- "Synthesize the following search results into a concise answer.",
182
- "Compare the three engine responses (Perplexity, Bing, Google) and identify:",
183
- "1. The main answer to the query",
184
- "2. Where the engines agree",
185
- "3. Where they disagree (if anywhere)",
186
- "4. Any caveats or limitations",
187
- "Use source IDs like S1, S2 when citing sources.",
188
- "Format: Start with a brief answer, then list key points.",
179
+ "You are a research synthesizer. Combine these search engine results into a single authoritative answer.",
189
180
  "",
190
181
  `Query: ${query}`,
191
182
  "",
192
- `Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
183
+ `Engine summaries:\n${JSON.stringify(engineSummaries, null, 2)}`,
193
184
  "",
194
185
  `Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
186
+ "",
187
+ "Instructions:",
188
+ "- Write a clear, direct answer in markdown (use headers/bullets where they help readability)",
189
+ "- Cite sources inline as [S1], [S2] etc. when making specific claims",
190
+ "- Prefer sources with content (fetch.ok=true and non-empty snippet) for citations",
191
+ "- Note where the engines agree or meaningfully disagree",
192
+ "- List any important caveats or limitations",
193
+ "- recommendedSources: the 2-4 source IDs most worth reading for this query",
194
+ "",
195
+ "Respond ONLY with a JSON object wrapped in BEGIN_JSON / END_JSON markers:",
196
+ "",
197
+ "BEGIN_JSON",
198
+ JSON.stringify({
199
+ answer: "<your markdown answer here>",
200
+ agreement: { level: "high|medium|mixed|conflicting", summary: "<one sentence>" },
201
+ differences: ["<notable difference between engines, if any>"],
202
+ caveats: ["<important caveat or limitation>"],
203
+ recommendedSources: ["S1", "S2"],
204
+ }, null, 2),
205
+ "END_JSON",
195
206
  ].join("\n");
196
207
  }
197
208