@apmantza/greedysearch-pi 1.8.2 → 1.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/README.md +10 -1
- package/bin/launch.mjs +366 -366
- package/bin/search.mjs +388 -388
- package/extractors/common.mjs +291 -291
- package/extractors/gemini.mjs +146 -146
- package/extractors/google-ai.mjs +125 -125
- package/extractors/perplexity.mjs +147 -145
- package/extractors/selectors.mjs +54 -54
- package/index.ts +256 -278
- package/package.json +1 -1
- package/src/github.mjs +237 -237
- package/src/reddit.mjs +210 -0
- package/src/search/chrome.mjs +222 -222
- package/src/search/constants.mjs +37 -37
- package/src/search/defaults.mjs +14 -14
- package/src/search/engines.mjs +62 -62
- package/src/search/fetch-source.mjs +35 -3
- package/src/search/output.mjs +58 -58
- package/src/search/sources.mjs +445 -445
- package/src/search/synthesis-runner.mjs +63 -63
- package/src/search/synthesis.mjs +223 -223
- package/src/tools/deep-research-handler.ts +36 -36
- package/src/tools/greedy-search-handler.ts +53 -57
- package/src/tools/shared.ts +135 -130
- package/src/types.ts +103 -103
- package/test.mjs +423 -377
package/src/github.mjs
CHANGED
@@ -1,237 +1,237 @@

```js
// src/github.mjs - GitHub content fetching via REST API

const GITHUB_API = "https://api.github.com";
const DEFAULT_HEADERS = {
  "user-agent": "GreedySearch/1.0",
  accept: "application/vnd.github+json",
  "x-github-api-version": "2022-11-28",
};

/**
 * Parse a GitHub URL into components
 * @param {string} url
 * @returns {{owner: string, repo: string, type: 'blob'|'tree'|'root', ref?: string, path?: string} | null}
 */
export function parseGitHubUrl(url) {
  try {
    const parsed = new URL(url);
    if (!parsed.hostname.endsWith("github.com")) {
      return null;
    }

    const parts = parsed.pathname.split("/").filter(Boolean);
    if (parts.length < 2) {
      return null;
    }

    const [owner, repo] = parts;

    // Root: github.com/owner/repo
    if (parts.length === 2) {
      return { owner, repo, type: "root" };
    }

    // With type: github.com/owner/repo/blob|tree/ref/path
    if (parts.length >= 4 && (parts[2] === "blob" || parts[2] === "tree")) {
      const type = parts[2];
      const ref = parts[3];
      const path = parts.slice(4).join("/");
      return { owner, repo, type, ref, path };
    }

    return null;
  } catch {
    return null;
  }
}

/**
 * Fetch JSON from GitHub API with timeout
 */
async function apiGet(path, timeoutMs = 10000) {
  const controller = new AbortController();
  const tid = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(`${GITHUB_API}${path}`, {
      headers: DEFAULT_HEADERS,
      signal: controller.signal,
    });
    clearTimeout(tid);
    if (!res.ok) {
      throw new Error(`GitHub API ${res.status}: ${path}`);
    }
    return await res.json();
  } catch (err) {
    clearTimeout(tid);
    throw err;
  }
}

/**
 * Fetch the default branch README as plain text
 */
async function fetchReadme(owner, repo) {
  try {
    const data = await apiGet(`/repos/${owner}/${repo}/readme`);
    if (data.content && data.encoding === "base64") {
      return Buffer.from(data.content, "base64").toString("utf8");
    }
    return "";
  } catch {
    return "";
  }
}

/**
 * Fetch top-level file tree (non-recursive)
 */
async function fetchTree(owner, repo, ref = "HEAD", subPath = "") {
  try {
    // Resolve ref to a tree SHA first when using HEAD or a branch name
    const refData = await apiGet(`/repos/${owner}/${repo}/git/ref/heads/${ref === "HEAD" ? "main" : ref}`).catch(() =>
      apiGet(`/repos/${owner}/${repo}/git/ref/heads/master`).catch(() => null)
    );

    let treeSha;
    if (refData?.object?.sha) {
      // Get commit to get tree SHA
      const commit = await apiGet(`/repos/${owner}/${repo}/git/commits/${refData.object.sha}`);
      treeSha = commit.tree.sha;
    } else {
      // Fall back to repo default branch info
      const repoInfo = await apiGet(`/repos/${owner}/${repo}`);
      const branch = await apiGet(`/repos/${owner}/${repo}/branches/${repoInfo.default_branch}`);
      treeSha = branch.commit.commit.tree.sha;
    }

    const treeData = await apiGet(`/repos/${owner}/${repo}/git/trees/${treeSha}`);
    let items = treeData.tree || [];

    // Filter to subPath if requested
    if (subPath) {
      items = items.filter((item) => item.path.startsWith(subPath));
    }

    return items.slice(0, 50).map((item) => ({
      path: item.path,
      type: item.type === "tree" ? "dir" : "file",
      size: item.size,
    }));
  } catch {
    return [];
  }
}

/**
 * Fetch a specific file via raw.githubusercontent.com
 */
async function fetchRawFile(owner, repo, ref, filePath, timeoutMs = 10000) {
  const ref_ = ref && ref !== "HEAD" ? ref : "main";
  const urls = [
    `https://raw.githubusercontent.com/${owner}/${repo}/${ref_}/${filePath}`,
    `https://raw.githubusercontent.com/${owner}/${repo}/master/${filePath}`,
  ];

  for (const url of urls) {
    const controller = new AbortController();
    const tid = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const res = await fetch(url, {
        headers: { "user-agent": DEFAULT_HEADERS["user-agent"] },
        signal: controller.signal,
      });
      clearTimeout(tid);
      if (res.ok) {
        return await res.text();
      }
    } catch {
      clearTimeout(tid);
    }
  }
  return null;
}

/**
 * Fetch GitHub content via API
 * @param {string} url - GitHub URL (blob, tree, or root)
 * @returns {Promise<{ok: boolean, content?: string, title?: string, error?: string, tree?: Array}>}
 */
export async function fetchGitHubContent(url) {
  const parsed = parseGitHubUrl(url);
  if (!parsed) {
    return { ok: false, error: "Not a valid GitHub URL" };
  }

  const { owner, repo, type, ref, path } = parsed;

  try {
    if (type === "root" || (type === "tree" && !path)) {
      // Fetch repo info + README + top-level tree in parallel
      const [repoInfo, readme, tree] = await Promise.allSettled([
        apiGet(`/repos/${owner}/${repo}`),
        fetchReadme(owner, repo),
        fetchTree(owner, repo, ref || "HEAD"),
      ]);

      // If repo info failed (e.g. 404 — repo doesn't exist), bail out
      if (repoInfo.status === "rejected") {
        return { ok: false, error: repoInfo.reason?.message || "Repo not found" };
      }

      const info = repoInfo.value;
      const readmeText = readme.status === "fulfilled" ? readme.value : "";
      const treeItems = tree.status === "fulfilled" ? tree.value : [];

      const description = info?.description ? `\n\n> ${info.description}` : "";
      const stars = info?.stargazers_count != null ? ` ⭐ ${info.stargazers_count}` : "";
      const language = info?.language ? ` · ${info.language}` : "";

      let content = `# ${owner}/${repo}${stars}${language}${description}\n\n`;

      if (readmeText) {
        content += readmeText.slice(0, 6000);
      } else {
        content += `[No README found]\n\nFiles:\n${treeItems.map((t) => `  ${t.type === "dir" ? "📁" : "📄"} ${t.path}`).join("\n")}`;
      }

      return {
        ok: true,
        title: `${owner}/${repo}`,
        content,
        tree: treeItems.slice(0, 30),
      };
    }

    if (type === "blob" && path) {
      // Fetch specific file via raw URL
      const content = await fetchRawFile(owner, repo, ref, path);
      if (content === null) {
        return { ok: false, error: `File not found: ${path}` };
      }
      return {
        ok: true,
        title: `${owner}/${repo}: ${path}`,
        content,
      };
    }

    if (type === "tree" && path) {
      // Directory listing via API tree
      const treeItems = await fetchTree(owner, repo, ref || "HEAD", path);
      const listing = treeItems
        .map((t) => `  ${t.type === "dir" ? "📁" : "📄"} ${t.path}`)
        .join("\n");

      return {
        ok: true,
        title: `${owner}/${repo}/${path}`,
        content: `[Directory: ${path}]\n\nFiles:\n${listing}`,
        tree: treeItems,
      };
    }

    return { ok: false, error: "Unsupported GitHub URL type" };
  } catch (err) {
    return { ok: false, error: err.message };
  }
}
```
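For context, here is a minimal usage sketch of the two exports above. The repository URL, the relative import path, and top-level `await` (Node ESM) are illustrative assumptions, not part of the package:

```js
// usage-sketch.mjs - hypothetical example, not shipped with the package
import { parseGitHubUrl, fetchGitHubContent } from "./src/github.mjs";

// Blob URLs decompose into owner/repo/type/ref/path
console.log(parseGitHubUrl("https://github.com/nodejs/node/blob/main/README.md"));
// -> { owner: "nodejs", repo: "node", type: "blob", ref: "main", path: "README.md" }

// Root URLs yield repo metadata, a truncated README, and a top-level tree
const result = await fetchGitHubContent("https://github.com/nodejs/node"); // network call; may fail offline
if (result.ok) {
  console.log(result.title);          // "nodejs/node"
  console.log(result.tree.length);    // up to 30 top-level entries
  console.log(result.content.length); // header line + up to 6000 chars of README
} else {
  console.error(result.error);
}
```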
package/src/reddit.mjs
ADDED
@@ -0,0 +1,210 @@

```js
// src/reddit.mjs - Reddit content fetching via public JSON API
// Reddit exposes structured data by appending .json to any URL

const REDDIT_HEADERS = {
  "user-agent": "GreedySearch/1.0 (Research Bot)",
  accept: "application/json",
};

/**
 * Parse a Reddit URL to check if it's a post, comment, or user profile
 * @param {string} url
 * @returns {{type: 'post'|'user'|'other', cleanUrl: string} | null}
 */
export function parseRedditUrl(url) {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase();

    // Support reddit.com, old.reddit.com, www.reddit.com
    if (!hostname.endsWith("reddit.com")) {
      return null;
    }

    const pathname = parsed.pathname;

    // User profile: /u/username or /user/username
    if (pathname.match(/^\/(u|user)\/[^/]+\/?$/i)) {
      return { type: "user", cleanUrl: normalizeRedditUrl(url) };
    }

    // Post: /r/subreddit/comments/xxxx/...
    if (pathname.match(/^\/r\/[^/]+\/comments\/[^/]+/i)) {
      return { type: "post", cleanUrl: normalizeRedditUrl(url) };
    }

    return null;
  } catch {
    return null;
  }
}

/**
 * Normalize Reddit URL (remove query params, fragments)
 * @param {string} url
 * @returns {string}
 */
function normalizeRedditUrl(url) {
  try {
    const parsed = new URL(url);
    // Reconstruct without query/fragment
    return `${parsed.protocol}//${parsed.hostname}${parsed.pathname}`;
  } catch {
    return url;
  }
}

/**
 * Fetch Reddit content via the .json API
 * @param {string} url - Reddit URL (will have .json appended)
 * @param {number} maxChars - Max characters for content
 * @returns {Promise<FetchResult>}
 */
export async function fetchRedditContent(url, maxChars = 8000) {
  const start = Date.now();

  try {
    // Append .json to get API response
    const jsonUrl = url.replace(/\/?$/, ".json");

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 15000);

    const response = await fetch(jsonUrl, {
      headers: REDDIT_HEADERS,
      signal: controller.signal,
    });

    clearTimeout(timeoutId);

    if (!response.ok) {
      throw new Error(`Reddit API ${response.status}`);
    }

    const data = await response.json();

    // data[0] = post listing, data[1] = comments listing
    if (!Array.isArray(data) || data.length < 1) {
      throw new Error("Invalid Reddit API response structure");
    }

    const postListing = data[0];
    const commentsListing = data[1];

    // Extract post data
    const post = postListing?.data?.children?.[0]?.data;
    if (!post) {
      throw new Error("No post data in Reddit response");
    }

    // Format as markdown
    const markdown = formatRedditPost(post, commentsListing, maxChars);

    return {
      ok: true,
      url,
      finalUrl: url,
      status: 200,
      contentType: "text/markdown",
      lastModified: "",
      title: post.title || "Reddit Post",
      byline: `u/${post.author}`,
      siteName: `r/${post.subreddit}`,
      lang: "en",
      publishedTime: new Date(post.created_utc * 1000).toISOString(),
      excerpt: post.selftext?.slice(0, 300).replace(/\n/g, " ") || "",
      markdown,
      contentLength: markdown.length,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  } catch (error) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 0,
      error: `Reddit fetch failed: ${error.message}`,
      needsBrowser: false,
      duration: Date.now() - start,
    };
  }
}

/**
 * Format Reddit post and comments as clean markdown
 * @param {object} post - Reddit post data
 * @param {object|null} commentsListing - Comments listing data
 * @param {number} maxChars - Max characters
 * @returns {string}
 */
function formatRedditPost(post, commentsListing, maxChars) {
  let md = "";

  // Post header
  md += `# ${post.title}\n\n`;
  md += `**Subreddit:** r/${post.subreddit} | **Author:** u/${post.author} | **Score:** ${post.score}\n\n`;

  // Post body (selftext) or link
  if (post.selftext) {
    md += post.selftext;
    md += "\n\n";
  } else if (post.url && !post.url.includes("reddit.com")) {
    // External link post
    md += `**Link:** ${post.url}\n\n`;
  }

  // Comments section
  if (commentsListing?.data?.children?.length > 0) {
    md += "---\n\n## Comments\n\n";
    const comments = commentsListing.data.children
      .filter((c) => c.kind === "t1") // t1 = comment
      .slice(0, 10); // Top 10 comments

    for (const comment of comments) {
      md += formatComment(comment.data, 0);
      md += "\n";
    }
  }

  // Trim to maxChars while keeping structure
  if (md.length > maxChars) {
    md = md.slice(0, maxChars).trim() + "\n\n... (truncated)";
  }

  return md;
}

/**
 * Format a single comment with nesting
 * @param {object} comment - Reddit comment data
 * @param {number} depth - Nesting depth
 * @returns {string}
 */
function formatComment(comment, depth) {
  if (
    !comment ||
    comment.body === "[deleted]" ||
    comment.body === "[removed]"
  ) {
    return "";
  }

  const indent = "> ".repeat(depth);
  let md = "";

  md += `${indent}**u/${comment.author}** (${comment.score} pts)\n`;
  md += `${indent}${comment.body.replace(/\n/g, "\n" + indent)}\n`;

  // Handle nested replies (limit depth to 3)
  if (depth < 3 && comment.replies?.data?.children) {
    const replies = comment.replies.data.children.filter(
      (r) => r.kind === "t1",
    );
    for (const reply of replies.slice(0, 5)) {
      md += "\n" + formatComment(reply.data, depth + 1);
    }
  }

  return md;
}
```
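And a minimal sketch of the new Reddit helpers. The post URL is hypothetical and the import path is illustrative; the logged fields mirror the result object built in `fetchRedditContent`:

```js
// usage-sketch.mjs - hypothetical example, not shipped with the package
import { parseRedditUrl, fetchRedditContent } from "./src/reddit.mjs";

const url = "https://www.reddit.com/r/node/comments/abc123/example_post/"; // made-up post

// parseRedditUrl classifies the URL and strips query/fragment
const parsed = parseRedditUrl(url);
if (parsed?.type === "post") {
  const result = await fetchRedditContent(parsed.cleanUrl, 4000); // network call
  if (result.ok) {
    console.log(result.title);    // post title
    console.log(result.siteName); // "r/node"
    console.log(result.markdown); // post body + top comments, capped at 4000 chars
  } else {
    console.error(result.error);  // "Reddit fetch failed: ..."
  }
}
```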