npm - @apmantza/greedysearch-pi - Versions diffs - 1.8.9 → 1.9.0 - Mend

@apmantza/greedysearch-pi 1.8.9 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/CHANGELOG.md +503 -446
package/bin/cdp.mjs +15 -2
package/bin/search.mjs +679 -668
package/extractors/bing-copilot.mjs +68 -11
package/extractors/common.mjs +37 -2
package/extractors/consent.mjs +388 -294
package/extractors/gemini.mjs +217 -150
package/extractors/perplexity.mjs +56 -7
package/package.json +1 -1
package/src/search/chrome.mjs +62 -1
package/src/search/constants.mjs +1 -6
package/src/search/engines.mjs +76 -67
package/src/search/file-sources.mjs +46 -0
package/src/search/query.mjs +49 -0
package/src/search/recovery.mjs +20 -1
package/src/search/sources.mjs +37 -21
package/src/search/synthesis.mjs +27 -16
package/extractors/bing-aria.mjs +0 -539
package/extractors/google-search.mjs +0 -234

package/src/search/engines.mjs CHANGED Viewed

@@ -1,67 +1,76 @@
-// src/search/engines.mjs — Extractor runner
-//
-// Engine map lives in constants.mjs; this module re-exports it for
-// backward compatibility and provides the runExtractor() function.
-import { spawn } from "node:child_process";
-import { join } from "node:path";
-import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
-export { ENGINES };
-const __dir =
-	import.meta.dirname ||
-	new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
-export function runExtractor(
-	script,
-	query,
-	tabPrefix = null,
-	short = false,
-	timeoutMs = null,
-	locale = null,
-) {
-	// Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
-	// Other engines: 60s budget
-	if (timeoutMs === null) {
-		timeoutMs = script.includes("gemini") ? 70000 : 60000;
-	}
-	const extraArgs = [
-		...(tabPrefix ? ["--tab", tabPrefix] : []),
-		...(short ? ["--short"] : []),
-		...(locale ? ["--locale", locale] : []),
-	];
-	return new Promise((resolve, reject) => {
-		const proc = spawn(
-			process.execPath,
-			[join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
-			{
-				stdio: ["pipe", "pipe", "pipe"],
-				env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
-			},
-		);
-		// Pipe query via stdin to avoid leaking it in process table command-line
-		proc.stdin.write(query);
-		proc.stdin.end();
-		let out = "";
-		let err = "";
-		proc.stdout.on("data", (d) => (out += d));
-		proc.stderr.on("data", (d) => (err += d));
-		const t = setTimeout(() => {
-			proc.kill();
-			reject(new Error(`${script} timed out after ${timeoutMs / 1000}s`));
-		}, timeoutMs);
-		proc.on("close", (code) => {
-			clearTimeout(t);
-			if (code === 0) {
-				try {
-					resolve(JSON.parse(out.trim()));
-				} catch {
-					reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
-				}
-			} else {
-				reject(new Error(err.trim() || `extractor exit ${code}`));
-			}
-		});
-	});
-}
+// src/search/engines.mjs — Extractor runner
+//
+// Engine map lives in constants.mjs; this module re-exports it for
+// backward compatibility and provides the runExtractor() function.
+import { spawn } from "node:child_process";
+import { join } from "node:path";
+import { ENGINES, GREEDY_PROFILE_DIR } from "./constants.mjs";
+export { ENGINES };
+const __dir =
+	import.meta.dirname ||
+	new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
+/**
+ * Run an extractor script in a child Node.js process and resolve with the JSON
+ * it prints on stdout.
+ *
+ * @param {string} script — extractor file name inside the extractors/ directory
+ * @param {string} query — search query, delivered on the child's stdin
+ * @param {string|null} [tabPrefix] — forwarded as `--tab <value>` when truthy
+ * @param {boolean} [short] — forwarded as `--short` when truthy
+ * @param {number|null} [timeoutMs] — kill budget in ms; null picks 70s for
+ *   scripts whose name contains "gemini" and 60s otherwise
+ * @param {string|null} [locale] — forwarded as `--locale <value>` when truthy
+ * @returns {Promise<any>} parsed stdout on exit code 0. Rejects with an Error on
+ *   timeout, spawn failure, unparseable stdout, or non-zero exit; in the last
+ *   case `error.envelope` is set when stdout carried a JSON `_envelope`.
+ */
+export function runExtractor(
+	script,
+	query,
+	tabPrefix = null,
+	short = false,
+	timeoutMs = null,
+	locale = null,
+) {
+	// Gemini synthesis: 70s budget (45s stream + ~25s nav/settle overhead)
+	// Other engines: 60s budget
+	const budgetMs =
+		timeoutMs === null ? (script.includes("gemini") ? 70000 : 60000) : timeoutMs;
+	const extraArgs = [
+		...(tabPrefix ? ["--tab", tabPrefix] : []),
+		...(short ? ["--short"] : []),
+		...(locale ? ["--locale", locale] : []),
+	];
+	return new Promise((resolve, reject) => {
+		const proc = spawn(
+			process.execPath,
+			[join(__dir, "..", "..", "extractors", script), "--stdin", ...extraArgs],
+			{
+				stdio: ["pipe", "pipe", "pipe"],
+				env: { ...process.env, CDP_PROFILE_DIR: GREEDY_PROFILE_DIR },
+			},
+		);
+		let out = "";
+		let err = "";
+		proc.stdout.on("data", (d) => (out += d));
+		proc.stderr.on("data", (d) => (err += d));
+		const t = setTimeout(() => {
+			proc.kill();
+			reject(new Error(`${script} timed out after ${budgetMs / 1000}s`));
+		}, budgetMs);
+		// Without an "error" listener a failed spawn surfaces as an uncaught
+		// exception instead of a rejection the caller can handle.
+		proc.on("error", (spawnErr) => {
+			clearTimeout(t);
+			reject(spawnErr);
+		});
+		// The child may exit before draining stdin (EPIPE). Ignore that here;
+		// the "close" handler reports the failure via exit code and stderr.
+		proc.stdin.on("error", () => {});
+		// Pipe query via stdin to avoid leaking it in process table command-line
+		proc.stdin.end(query);
+		proc.on("close", (code) => {
+			clearTimeout(t);
+			if (code === 0) {
+				try {
+					resolve(JSON.parse(out.trim()));
+				} catch {
+					reject(new Error(`bad JSON from ${script}: ${out.slice(0, 100)}`));
+				}
+				return;
+			}
+			// Try to parse structured error envelope from stdout before falling back
+			let envelope = null;
+			try {
+				envelope = JSON.parse(out.trim())?._envelope ?? null;
+			} catch {
+				// stdout was not JSON — report the stderr text alone
+			}
+			const errObj = new Error(err.trim() || `extractor exit ${code}`);
+			if (envelope) errObj.envelope = envelope;
+			reject(errObj);
+		});
+	});
+}

package/src/search/file-sources.mjs ADDED Viewed

@@ -0,0 +1,46 @@
+// src/search/file-sources.mjs — Write fetched source content to disk,
+// return file paths instead of inline content. Token-efficient output.
+import { mkdirSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+const DEFAULT_DIR = join(process.cwd(), ".pi", "greedysearch-sources");
+/**
+ * Write fetched source content to files and replace inline content with paths.
+ * Keeps metadata and snippets inline for quick reference.
+ *
+ * Each file starts with a `---` delimited header (url, title, source, status,
+ * chars) followed by the full content. Sources whose content is missing or
+ * shorter than 10 characters are returned untouched and no file is written.
+ *
+ * @param {Array} fetchedSources — output from fetchMultipleSources
+ * @param {string} [dir] — directory to write files (default: .pi/greedysearch-sources)
+ * @returns {Array} sources with content stripped, contentPath added
+ */
+export function writeSourcesToFiles(fetchedSources, dir = DEFAULT_DIR) {
+	mkdirSync(dir, { recursive: true });
+	// The header is line-oriented: collapse line breaks inside a value so a
+	// multi-line title cannot add fields or close the block early.
+	const oneLine = (value) => String(value).replace(/[\r\n]+/g, " ");
+	return fetchedSources.map((source) => {
+		if (!source.content || source.content.length < 10) {
+			// No content to write — keep as-is
+			return source;
+		}
+		// Fall back to "unknown" when the id is missing or sanitizes to nothing
+		const safeId =
+			String(source.id || "unknown").replace(/[^a-zA-Z0-9_-]/g, "") || "unknown";
+		const urlSlug = (source.canonicalUrl || source.url || "")
+			.replace(/^https?:\/\//, "")
+			.replace(/[^a-zA-Z0-9]/g, "-")
+			.slice(0, 40);
+		const filepath = join(dir, `${safeId}-${urlSlug}.md`);
+		const { content, ...rest } = source;
+		const contentChars = source.contentChars || content.length;
+		const header = [
+			"---",
+			`url: ${oneLine(source.finalUrl || source.url)}`,
+			`title: ${oneLine(source.title || "")}`,
+			`source: ${oneLine(source.source || "unknown")}`,
+			`status: ${oneLine(source.status || "")}`,
+			`chars: ${contentChars}`,
+			"---",
+			"",
+			"",
+		].join("\n");
+		// Write full content to file
+		writeFileSync(filepath, header + content, "utf8");
+		// Return stripped object — content replaced by path
+		return {
+			...rest,
+			contentPath: filepath,
+			contentChars,
+		};
+	});
+}

package/src/search/query.mjs ADDED Viewed

@@ -0,0 +1,49 @@
+// src/search/query.mjs — Query normalization for search engine input
+//
+// Two universal transforms applied to all engines:
+//   1. stripPreamble  — remove agent-generated conversational openers
+//   2. addRecencyHint — append current year for temporally-sensitive queries
+//
+// Note: Google udm=50 is an AI mode with the same query understanding as
+// natural-language question form — keyword conversion adds no benefit there.
+// Agent preambles that add no search signal.
+// Every opener alternative ends in a space, so it can only match whole words.
+// The trailing pronoun group has no such space, so it carries a negative
+// lookahead instead: "iterators in Rust" must keep its leading "it" and
+// "it's slow" must not be reduced to "'s slow".
+const PREAMBLE_RX = /^(can you |could you |please |would you mind |i need to (know|understand) |i want to (know|understand) |i('m| am) (looking for|wondering about|curious about) |i need (information|info) (about|on) |tell me )?(about |explain |describe |give me |help me understand |search for |look up |find |research )?(about |regarding |on |for )?((it|this|the following)(?![\w'-]))?\s*/i;
+// Temporal keywords that indicate recency sensitivity
+const TEMPORAL_RX = /\b(latest|newest|current|recent|up-to-date|up to date)\b/i;
+// Version numbers and years — if already present, don't add year
+const VERSION_RX = /\b\d+\.\d+|\bv\d+\b|\b20(2[0-9]|[3-9]\d)\b/i;
+/**
+ * Remove a leading conversational opener from a query.
+ * The stripped form is used only when more than 4 characters remain;
+ * otherwise the trimmed original is returned.
+ * "Can you explain how React hooks work?" → "how React hooks work?"
+ */
+export function stripPreamble(query) {
+	const trimmed = query.trim();
+	const candidate = trimmed.replace(PREAMBLE_RX, "").trim();
+	if (candidate.length <= 4) return trimmed;
+	return candidate;
+}
+/**
+ * Append a year to queries that use recency language (TEMPORAL_RX) and do not
+ * already contain a version number or year (VERSION_RX). All other queries are
+ * returned unchanged. Prevents engines blending old/new results.
+ * "latest FastAPI best practices" → "latest FastAPI best practices 2026"
+ *
+ * @param {string} query
+ * @param {number} [year] — defaults to the current calendar year
+ */
+export function addRecencyHint(query, year = new Date().getFullYear()) {
+	const needsYear = TEMPORAL_RX.test(query) && !VERSION_RX.test(query);
+	return needsYear ? `${query.trimEnd()} ${year}` : query;
+}
+/**
+ * Full normalization pipeline: stripPreamble, then addRecencyHint.
+ * Engine-agnostic — the same transforms are applied whatever engine will
+ * receive the query. A missing or whitespace-only query is returned as given,
+ * and the original query is returned if the transforms produce an empty string.
+ */
+export function normalizeQuery(query) {
+	const isBlank = !query?.trim();
+	if (isBlank) return query;
+	const normalized = addRecencyHint(stripPreamble(query));
+	return normalized || query;
+}

package/src/search/recovery.mjs CHANGED Viewed

@@ -20,7 +20,26 @@ export function isManualVerificationError(error) {
 export function findHeadlessBlockedEngines(resultsByEngine) {
 	return HEADLESS_RECOVERY_ENGINES.filter((engine) => {
-		const error = resultsByEngine?.[engine]?.error;
+		const result = resultsByEngine?.[engine];
+		if (!result) return false;
+		// Data-driven: check envelope first (zero regex cost)
+		if (result._envelope?.blockedBy) return true;
+		if (result._envelope?.verificationResult === "needs-human") return true;
+		// Fallback: legacy string matching for errors passed as plain strings
+		const error = result.error;
 		return error && isHeadlessBlockedError(error);
 	});
 }
+/**
+ * Check if an extractor error indicates headless blocking. Used in
+ * single-engine recovery paths where the Error object is caught directly
+ * rather than parsed from a result record.
+ *
+ * A structured envelope on the error is checked first (`blockedBy` set, or
+ * `verificationResult === "needs-human"`); otherwise the message text is
+ * handed to isHeadlessBlockedError.
+ *
+ * @param {Error|string|null|undefined} error — caught Error, or a plain
+ *   message string
+ */
+export function isHeadlessBlockedResult(error) {
+	if (!error) return false;
+	// A plain string has no envelope and no .message — match on the text itself
+	if (typeof error === "string") return isHeadlessBlockedError(error);
+	const env = error.envelope;
+	if (env?.blockedBy) return true;
+	if (env?.verificationResult === "needs-human") return true;
+	return isHeadlessBlockedError(error.message);
+}

package/src/search/sources.mjs CHANGED Viewed

@@ -167,6 +167,25 @@ export function bestRank(source) {
 	return ranks.length ? Math.min(...ranks) : 99;
 }
+// Discussion-only hosts that get a stronger penalty vs. general community hosts.
+// Q&A sites (stackoverflow, stackexchange) are intentionally excluded.
+const DISCUSSION_HOSTS = ["reddit.com", "news.ycombinator.com", "lobste.rs"];
+/**
+ * Composite relevance score: one weighted sum of every ranking signal, so the
+ * signals contribute together instead of acting as cascading tiebreakers.
+ *
+ *   smartScore × 3 + engineCount × 5 + sourceTypePriority × 2 + rank bonus
+ *
+ * The rank bonus is 7 − bestRank(source), floored at 0. Design intent of the
+ * weights: a query-relevant official source ranked #1 by one engine should
+ * beat multi-engine consensus from generic sites, while multi-engine
+ * consensus should beat a single-engine community post.
+ */
+export function computeCompositeScore(source) {
+	const relevance = source.smartScore * 3;
+	const consensus = source.engineCount * 5;
+	const typeBonus = sourceTypePriority(source.sourceType) * 2;
+	const rankBonus = Math.max(0, 7 - bestRank(source));
+	return relevance + consensus + typeBonus + rankBonus;
+}
 export function inferPreferredDomains(query) {
 	const normalized = query.toLowerCase();
 	const matches = [];
@@ -340,9 +359,19 @@ export function buildSourceRegistry(out, query = "") {
 				smartScore += 2;
 			}
-			// Penalize community/discussion sites for technical queries
-			if (sourceType === "community" && preferredDomains.length > 0) {
-				smartScore -= 2;
+			// Penalize discussion forums for technical queries — high noise, rarely canonical.
+			// Q&A sites (stackoverflow, stackexchange) are excluded: they often have the
+			// best practical answer and shouldn't be penalised just because an official
+			// domain also exists.
+			if (preferredDomains.length > 0) {
+				if (matchesDomain(domain, DISCUSSION_HOSTS)) {
+					smartScore -= 3;
+				} else if (
+					sourceType === "community" &&
+					!matchesDomain(domain, ["stackoverflow.com", "stackexchange.com"])
+				) {
+					smartScore -= 1;
+				}
 			}
 			const existing = seen.get(canonicalUrl) || {
@@ -387,24 +416,11 @@ export function buildSourceRegistry(out, query = "") {
 			engineCount: source.engines.length,
 		}))
 		.sort((a, b) => {
-			// Primary: smart score (query-aware domain boosting)
-			if (b.smartScore !== a.smartScore) return b.smartScore - a.smartScore;
-			// Secondary: consensus (sources found by more engines)
-			if (b.engineCount !== a.engineCount) return b.engineCount - a.engineCount;
-			// Tertiary: source type priority
-			if (
-				sourceTypePriority(b.sourceType) !== sourceTypePriority(a.sourceType)
-			) {
-				return (
-					sourceTypePriority(b.sourceType) - sourceTypePriority(a.sourceType)
-				);
-			}
-			// Quaternary: best rank across engines
-			if (bestRank(a) !== bestRank(b)) return bestRank(a) - bestRank(b);
+			// Single composite score so all signals contribute simultaneously.
+			// Avoids rank being ignored when engineCount differs, and smartScore
+			// dominating even when rank/type signal would break the tie better.
+			const diff = computeCompositeScore(b) - computeCompositeScore(a);
+			if (diff !== 0) return diff;
 			return a.domain.localeCompare(b.domain);
 		})
 		.slice(0, 12)

package/src/search/synthesis.mjs CHANGED Viewed

@@ -152,6 +152,10 @@ export function buildSynthesisPrompt(
 		};
 	}
+	// Snippet budget: always include content for fetched sources so Gemini can
+	// make citation decisions based on what the sources actually say, not just
+	// their metadata. Grounded mode gets a larger budget per source.
+	const snippetChars = grounded ? 700 : 300;
 	const sourceRegistry = sources.slice(0, grounded ? 10 : 8).map((source) => ({
 		id: source.id,
 		title: source.title,
@@ -161,37 +165,44 @@ export function buildSynthesisPrompt(
 		isOfficial: source.isOfficial,
 		engines: source.engines,
 		engineCount: source.engineCount,
-		perEngine: source.perEngine,
 		fetch: source.fetch?.attempted
 			? {
 					ok: source.fetch.ok,
-					status: source.fetch.status,
 					publishedTime: source.fetch.publishedTime || "",
-					lastModified: source.fetch.lastModified || "",
 					byline: source.fetch.byline || "",
-					siteName: source.fetch.siteName || "",
-					...(grounded
-						? { snippet: trimText(source.fetch.snippet || "", 700) }
-						: {}),
+					snippet: trimText(source.fetch.snippet || "", snippetChars),
 				}
 			: undefined,
 	}));
 	return [
-		"Synthesize the following search results into a concise answer.",
-		"Compare the three engine responses (Perplexity, Bing, Google) and identify:",
-		"1. The main answer to the query",
-		"2. Where the engines agree",
-		"3. Where they disagree (if anywhere)",
-		"4. Any caveats or limitations",
-		"Use source IDs like S1, S2 when citing sources.",
-		"Format: Start with a brief answer, then list key points.",
+		"You are a research synthesizer. Combine these search engine results into a single authoritative answer.",
 		"",
 		`Query: ${query}`,
 		"",
-		`Engine results:\n${JSON.stringify(engineSummaries, null, 2)}`,
+		`Engine summaries:\n${JSON.stringify(engineSummaries, null, 2)}`,
 		"",
 		`Source registry:\n${JSON.stringify(sourceRegistry, null, 2)}`,
+		"",
+		"Instructions:",
+		"- Write a clear, direct answer in markdown (use headers/bullets where they help readability)",
+		"- Cite sources inline as [S1], [S2] etc. when making specific claims",
+		"- Prefer sources with content (fetch.ok=true and non-empty snippet) for citations",
+		"- Note where the engines agree or meaningfully disagree",
+		"- List any important caveats or limitations",
+		"- recommendedSources: the 2-4 source IDs most worth reading for this query",
+		"",
+		"Respond ONLY with a JSON object wrapped in BEGIN_JSON / END_JSON markers:",
+		"",
+		"BEGIN_JSON",
+		JSON.stringify({
+			answer: "<your markdown answer here>",
+			agreement: { level: "high|medium|mixed|conflicting", summary: "<one sentence>" },
+			differences: ["<notable difference between engines, if any>"],
+			caveats: ["<important caveat or limitation>"],
+			recommendedSources: ["S1", "S2"],
+		}, null, 2),
+		"END_JSON",
 	].join("\n");
 }