npm - @alexion42/pi-web-search - Versions diffs - 0.1.0 - Mend

@alexion42/pi-web-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/.pi/tasks/tasks-019e595f-0b95-7b09-9237-a0c6fbbda360.json +4 -0
package/CHANGELOG.md +18 -0
package/LICENSE +21 -0
package/README.md +88 -0
package/TOOLS.md +103 -0
package/activity.ts +101 -0
package/banner.png +0 -0
package/code-search.ts +107 -0
package/exa.ts +520 -0
package/extract.ts +342 -0
package/github-api.ts +196 -0
package/github-extract.ts +634 -0
package/index.ts +885 -0
package/package.json +46 -0
package/pdf-extract.ts +192 -0
package/pi-web-fetch-demo.mp4 +0 -0
package/rsc-extract.ts +338 -0
package/search.ts +49 -0
package/storage.ts +71 -0
package/test/pdf-extract.test.mjs +95 -0
package/types.ts +20 -0
package/utils.ts +44 -0

package/extract.ts ADDED Viewed

@@ -0,0 +1,342 @@
+import { Readability } from "@mozilla/readability";
+import { parseHTML } from "linkedom";
+import TurndownService from "turndown";
+import pLimit from "p-limit";
+import { activityMonitor } from "./activity.js";
+import { extractRSCContent } from "./rsc-extract.js";
+import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
+import { extractGitHub } from "./github-extract.js";
+const DEFAULT_TIMEOUT_MS = 30000; // Fallback timeout (ms) for the direct HTTP fetch when ExtractOptions.timeoutMs is not set.
+const CONCURRENT_LIMIT = 3; // Concurrency cap handed to pLimit for the shared fetchLimit limiter.
+const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"]; // Error prefixes for which extractContent skips the Jina Reader fallback.
+const MIN_USEFUL_CONTENT = 500; // Converted Markdown shorter than this many characters is returned with an error set.
+/**
+ * Produces a printable message for any thrown value.
+ *
+ * @param err - The value received in a `catch` clause.
+ * @returns `err.message` for `Error` instances, otherwise `String(err)`.
+ */
+function errorMessage(err: unknown): string {
+	if (err instanceof Error) {
+		return err.message;
+	}
+	return String(err);
+}
+/**
+ * Tells whether a thrown value's message begins with "Failed to parse ".
+ *
+ * @param err - The value received in a `catch` clause.
+ * @returns `true` when the message starts with that exact prefix.
+ */
+function isConfigParseError(err: unknown): boolean {
+	const text = errorMessage(err);
+	return text.indexOf("Failed to parse ") === 0;
+}
+/**
+ * Tells whether a thrown value looks like an abort, judged purely by its message.
+ *
+ * @param err - The value received in a `catch` clause.
+ * @returns `true` when the message contains "abort" in any letter case.
+ */
+function isAbortError(err: unknown): boolean {
+	const lowered = errorMessage(err).toLowerCase();
+	return lowered.indexOf("abort") !== -1;
+}
+/**
+ * Builds the result returned when an extraction was cancelled.
+ *
+ * @param url - The URL whose extraction was cancelled.
+ * @returns A result with empty title and content and `error` set to "Aborted".
+ */
+function abortedResult(url: string): ExtractedContent {
+	const cancelled: ExtractedContent = {
+		url,
+		title: "",
+		content: "",
+		error: "Aborted",
+	};
+	return cancelled;
+}
+const turndown = new TurndownService({ // Shared HTML-to-Markdown converter; extractViaHttp feeds it the Readability article HTML.
+	headingStyle: "atx",
+	codeBlockStyle: "fenced",
+});
+const fetchLimit = pLimit(CONCURRENT_LIMIT); // Shared limiter; fetchAllContent schedules every extraction through it.
+export interface ExtractedContent { // Outcome of one extraction attempt for a single URL.
+	url: string; // The URL that was requested.
+	title: string; // Title derived from the page; "" in the failure results built in this file.
+	content: string; // Extracted text (Markdown when converted from HTML); may be "" when `error` is set.
+	error: string | null; // null on success, otherwise a human-readable failure reason.
+}
+export interface ExtractOptions { // Optional settings accepted by extractContent and fetchAllContent.
+	timeoutMs?: number; // Timeout (ms) for the direct HTTP fetch; DEFAULT_TIMEOUT_MS is used when omitted.
+	forceClone?: boolean; // Forwarded unchanged to extractGitHub as its third argument.
+}
+const JINA_READER_BASE = "https://r.jina.ai/"; // Prefix prepended to the target URL to form the Jina Reader request URL.
+const JINA_TIMEOUT_MS = 30000; // Timeout (ms) applied to each Jina Reader request.
+/**
+ * Requests a Markdown rendering of `url` from the Jina Reader endpoint.
+ *
+ * The response is only accepted when it contains the "Markdown Content:" marker
+ * and the text after it is at least 100 characters long and does not start with
+ * a "Loading..." / "Please enable JavaScript" placeholder.
+ *
+ * @param url - Target page; appended verbatim to `JINA_READER_BASE`.
+ * @param signal - Optional caller signal; combined with a `JINA_TIMEOUT_MS` timeout.
+ * @returns The extracted content, or `null` on any failure (non-OK status,
+ *          missing marker, unusable content, network error, abort).
+ */
+async function extractWithJinaReader(
+	url: string,
+	signal?: AbortSignal,
+): Promise<ExtractedContent | null> {
+	// The text after this marker is treated as the page's Markdown body.
+	const contentMarker = "Markdown Content:";
+	const jinaUrl = JINA_READER_BASE + url;
+	const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });
+	try {
+		const res = await fetch(jinaUrl, {
+			headers: {
+				"Accept": "text/markdown",
+				"X-No-Cache": "true",
+			},
+			// Abort on whichever fires first: our own timeout or the caller's signal.
+			signal: AbortSignal.any([
+				AbortSignal.timeout(JINA_TIMEOUT_MS),
+				...(signal ? [signal] : []),
+			]),
+		});
+		if (!res.ok) {
+			activityMonitor.logComplete(activityId, res.status);
+			return null;
+		}
+		const content = await res.text();
+		activityMonitor.logComplete(activityId, res.status);
+		const contentStart = content.indexOf(contentMarker);
+		if (contentStart < 0) {
+			return null;
+		}
+		// Skip the marker itself by its real length instead of a hard-coded offset.
+		const markdownPart = content.slice(contentStart + contentMarker.length).trim();
+		// Check for failed JS rendering or minimal content
+		if (markdownPart.length < 100 ||
+			markdownPart.startsWith("Loading...") ||
+			markdownPart.startsWith("Please enable JavaScript")) {
+			return null;
+		}
+		// Same title rule as plain-text responses: first #/## heading, else last path segment, else the URL.
+		const title = extractTextTitle(markdownPart, url);
+		return { url, title, content: markdownPart, error: null };
+	} catch (err) {
+		// Aborts are logged as a completed activity with status 0, not as errors.
+		if (isAbortError(err)) {
+			activityMonitor.logComplete(activityId, 0);
+		} else {
+			activityMonitor.logError(activityId, errorMessage(err));
+		}
+		return null;
+	}
+}
+/**
+ * Extracts readable content for a single URL, trying sources in order:
+ * GitHub-specific extraction, a direct HTTP fetch, then the Jina Reader.
+ *
+ * @param url - The URL to extract; an unparsable URL yields an "Invalid URL" result.
+ * @param signal - Optional abort signal; when aborted the result's error is "Aborted".
+ * @param options - Optional settings (`timeoutMs` for the HTTP fetch, `forceClone` for GitHub).
+ * @returns The first successful result, or the HTTP result carrying its error
+ *          when every fallback failed.
+ */
+export async function extractContent(
+	url: string,
+	signal?: AbortSignal,
+	options?: ExtractOptions,
+): Promise<ExtractedContent> {
+	if (signal?.aborted) return abortedResult(url);
+	// Validate URL format
+	try {
+		new URL(url);
+	} catch {
+		return { url, title: "", content: "", error: "Invalid URL" };
+	}
+	// Try GitHub extraction first
+	try {
+		const ghResult = await extractGitHub(url, signal, options?.forceClone);
+		if (ghResult) return ghResult;
+		if (signal?.aborted) return abortedResult(url);
+	} catch (err) {
+		if (isAbortError(err)) return abortedResult(url);
+		if (isConfigParseError(err)) {
+			return { url, title: "", content: "", error: errorMessage(err) };
+		}
+		// Any other GitHub failure is intentionally ignored so the HTTP path below can still try.
+	}
+	// HTTP extraction with fallback chain
+	const httpResult = await extractViaHttp(url, signal, options);
+	if (signal?.aborted) return abortedResult(url);
+	const httpError = httpResult.error;
+	if (!httpError) return httpResult;
+	// These failures would not be helped by another fetcher, so report them as-is.
+	if (NON_RECOVERABLE_ERRORS.some((prefix) => httpError.startsWith(prefix))) return httpResult;
+	// Try Jina Reader for JS-rendered pages
+	const jinaResult = await extractWithJinaReader(url, signal);
+	if (jinaResult) return jinaResult;
+	if (signal?.aborted) return abortedResult(url);
+	// Final error state: surface the HTTP attempt's error.
+	return httpResult;
+}
+/**
+ * Heuristic: does this HTML look like an empty shell that is filled in by scripts?
+ *
+ * @param html - Raw HTML of the page.
+ * @returns `true` when the `<body>` holds fewer than 500 characters of text once
+ *          scripts, styles and tags are removed, and the document contains more
+ *          than three `<script` occurrences. `false` when no `<body>…</body>` pair is found.
+ */
+function isLikelyJSRendered(html: string): boolean {
+	const body = /<body[^>]*>([\s\S]*?)<\/body>/i.exec(html);
+	if (body === null) return false;
+	let visibleText = body[1];
+	visibleText = visibleText.replace(/<script[\s\S]*?<\/script>/gi, "");
+	visibleText = visibleText.replace(/<style[\s\S]*?<\/style>/gi, "");
+	visibleText = visibleText.replace(/<[^>]+>/g, "");
+	visibleText = visibleText.replace(/\s+/g, " ").trim();
+	// Script tags are counted over the whole document, not just the body.
+	const scriptTagCount = html.match(/<script/gi)?.length ?? 0;
+	return scriptTagCount > 3 && visibleText.length < 500;
+}
+/**
+ * Fetches `url` directly and turns the response into an `ExtractedContent`.
+ *
+ * Handling by response kind:
+ * - non-OK status: error `HTTP <status>: <statusText>`;
+ * - declared Content-Length above the cap (20 MB for PDFs, 5 MB otherwise): "Response too large";
+ * - PDF: delegated to `extractPDFToMarkdown`; the content is a summary that names the output path;
+ * - octet-stream / image / audio / video / zip: "Unsupported content type";
+ * - other non-HTML text: returned as-is with a title derived from the text or URL;
+ * - HTML: Readability, then Turndown; falls back to `extractRSCContent` when Readability finds no article.
+ *
+ * @param url - The URL to fetch.
+ * @param signal - Optional caller signal; aborting it aborts the fetch.
+ * @param options - `timeoutMs` overrides `DEFAULT_TIMEOUT_MS`.
+ * @returns The extraction result; failures are reported through `error`
+ *          (errors thrown inside the body are caught and converted).
+ */
+async function extractViaHttp(
+	url: string,
+	signal?: AbortSignal,
+	options?: ExtractOptions,
+): Promise<ExtractedContent> {
+	const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+	const activityId = activityMonitor.logStart({ type: "fetch", url });
+	const controller = new AbortController();
+	const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+	const onAbort = () => controller.abort();
+	if (signal?.aborted) {
+		// An already-aborted signal never dispatches "abort" again, so cancel right away
+		// instead of letting the request run until the timeout.
+		controller.abort();
+	} else {
+		signal?.addEventListener("abort", onAbort, { once: true });
+	}
+	try {
+		const response = await fetch(url, {
+			signal: controller.signal,
+			headers: {
+				"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
+				"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+				"Accept-Language": "en-US,en;q=0.9",
+				"Cache-Control": "no-cache",
+				"Sec-Fetch-Dest": "document",
+				"Sec-Fetch-Mode": "navigate",
+				"Sec-Fetch-Site": "none",
+				"Sec-Fetch-User": "?1",
+				"Upgrade-Insecure-Requests": "1",
+			},
+		});
+		if (!response.ok) {
+			activityMonitor.logComplete(activityId, response.status);
+			return {
+				url,
+				title: "",
+				content: "",
+				error: `HTTP ${response.status}: ${response.statusText}`,
+			};
+		}
+		const contentLengthHeader = response.headers.get("content-length");
+		const contentType = response.headers.get("content-type") || "";
+		const isPDFContent = isPDF(url, contentType);
+		const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024;
+		// Size is only enforced when the server declares it; a missing or
+		// unparsable header (NaN) falls through without a limit.
+		if (contentLengthHeader) {
+			const contentLength = parseInt(contentLengthHeader, 10);
+			if (contentLength > maxResponseSize) {
+				activityMonitor.logComplete(activityId, response.status);
+				return {
+					url,
+					title: "",
+					content: "",
+					error: `Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`,
+				};
+			}
+		}
+		if (isPDFContent) {
+			try {
+				const buffer = await response.arrayBuffer();
+				const result = await extractPDFToMarkdown(buffer, url);
+				activityMonitor.logComplete(activityId, response.status);
+				return {
+					url,
+					title: result.title,
+					content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
+					error: null,
+				};
+			} catch (err) {
+				const message = errorMessage(err);
+				activityMonitor.logError(activityId, message);
+				return { url, title: "", content: "", error: `PDF extraction failed: ${message}` };
+			}
+		}
+		if (contentType.includes("application/octet-stream") ||
+			contentType.includes("image/") ||
+			contentType.includes("audio/") ||
+			contentType.includes("video/") ||
+			contentType.includes("application/zip")) {
+			activityMonitor.logComplete(activityId, response.status);
+			return {
+				url,
+				title: "",
+				content: "",
+				error: `Unsupported content type: ${contentType.split(";")[0]}`,
+			};
+		}
+		const text = await response.text();
+		const isHTML = contentType.includes("text/html") || contentType.includes("application/xhtml+xml");
+		if (!isHTML) {
+			// Plain text, Markdown, JSON, etc. are passed through untouched.
+			activityMonitor.logComplete(activityId, response.status);
+			const title = extractTextTitle(text, url);
+			return { url, title, content: text, error: null };
+		}
+		const { document } = parseHTML(text);
+		const reader = new Readability(document as unknown as Document);
+		const article = reader.parse();
+		if (!article) {
+			// Readability found nothing; try the RSC extractor before giving up.
+			const rscResult = extractRSCContent(text);
+			if (rscResult) {
+				activityMonitor.logComplete(activityId, response.status);
+				return { url, title: rscResult.title, content: rscResult.content, error: null };
+			}
+			activityMonitor.logComplete(activityId, response.status);
+			const jsRendered = isLikelyJSRendered(text);
+			const errorMsg = jsRendered
+				? "Page appears to be JavaScript-rendered (content loads dynamically)"
+				: "Could not extract readable content from HTML structure";
+			return {
+				url,
+				title: "",
+				content: "",
+				error: errorMsg,
+			};
+		}
+		const markdown = turndown.turndown(article.content);
+		activityMonitor.logComplete(activityId, response.status);
+		if (markdown.length < MIN_USEFUL_CONTENT) {
+			// Keep the short content but flag it with an error.
+			return {
+				url,
+				title: article.title || "",
+				content: markdown,
+				error: isLikelyJSRendered(text)
+					? "Page appears to be JavaScript-rendered (content loads dynamically)"
+					: "Extracted content appears incomplete",
+			};
+		}
+		return { url, title: article.title || "", content: markdown, error: null };
+	} catch (err) {
+		const message = errorMessage(err);
+		// Aborts (timeout or caller) are logged as a completed activity with status 0, not as errors.
+		if (isAbortError(err)) {
+			activityMonitor.logComplete(activityId, 0);
+		} else {
+			activityMonitor.logError(activityId, message);
+		}
+		return { url, title: "", content: "", error: message };
+	} finally {
+		clearTimeout(timeoutId);
+		signal?.removeEventListener("abort", onAbort);
+	}
+}
+/**
+ * Finds the first level-1 or level-2 Markdown heading and returns its text.
+ *
+ * @param text - Markdown (or plain text) to scan line by line.
+ * @returns The heading text with `*` emphasis markers removed and whitespace
+ *          trimmed, or `null` when there is no such heading or it is empty.
+ */
+export function extractHeadingTitle(text: string): string | null {
+	// `[^\S\r\n]+` is "whitespace except line breaks": the heading text must sit on
+	// the same line as the hashes, so an empty "#" line cannot take the next line as its title.
+	const match = text.match(/^#{1,2}[^\S\r\n]+(.+)/m);
+	if (!match) return null;
+	const cleaned = match[1].replace(/\*+/g, "").trim();
+	return cleaned || null;
+}
+/**
+ * Picks a title for a text document.
+ *
+ * @param text - Document text, searched for a heading via `extractHeadingTitle`.
+ * @param url - Source URL, used as the fallback.
+ * @returns The heading if one exists, otherwise the last path segment of `url`,
+ *          otherwise `url` itself when that segment is empty.
+ */
+function extractTextTitle(text: string, url: string): string {
+	const heading = extractHeadingTitle(text);
+	if (heading !== null) {
+		return heading;
+	}
+	const lastSegment = new URL(url).pathname.split("/").pop();
+	return lastSegment || url;
+}
+/**
+ * Extracts content for several URLs, scheduling each through the shared `fetchLimit` limiter.
+ *
+ * @param urls - URLs to extract.
+ * @param signal - Optional abort signal forwarded to every extraction.
+ * @param options - Optional settings forwarded to every extraction.
+ * @returns One result per URL, in the same order as `urls`.
+ */
+export async function fetchAllContent(
+	urls: string[],
+	signal?: AbortSignal,
+	options?: ExtractOptions,
+): Promise<ExtractedContent[]> {
+	const pending = urls.map((target) => {
+		const job = () => extractContent(target, signal, options);
+		return fetchLimit(job);
+	});
+	return Promise.all(pending);
+}

package/github-api.ts ADDED Viewed

@@ -0,0 +1,196 @@
+import { execFile } from "node:child_process";
+import type { ExtractedContent } from "./extract.js";
+import type { GitHubUrlInfo } from "./github-extract.js";
+const MAX_TREE_ENTRIES = 200; // Maximum number of tree paths listed by fetchTreeViaApi before truncating.
+const MAX_INLINE_FILE_CHARS = 100_000; // File content longer than this is cut off by fetchViaApi.
+let ghAvailable: boolean | null = null; // Cached result of the `gh --version` probe; null until a probe has completed.
+let ghHintShown = false; // Set once showGhHint has printed, so the hint appears only once.
+/**
+ * Reports whether the `gh` CLI can be executed, caching the answer in `ghAvailable`.
+ *
+ * @returns `true` when `gh --version` ran without error (5 s timeout), else `false`.
+ */
+export async function checkGhAvailable(): Promise<boolean> {
+	if (ghAvailable !== null) {
+		return ghAvailable;
+	}
+	const probe = new Promise<boolean>((done) => {
+		execFile("gh", ["--version"], { timeout: 5000 }, (failure) => {
+			const found = !failure;
+			ghAvailable = found;
+			done(found);
+		});
+	});
+	return probe;
+}
+/**
+ * Prints a hint to stderr recommending the `gh` CLI; later calls do nothing.
+ */
+export function showGhHint(): void {
+	if (ghHintShown) {
+		return;
+	}
+	ghHintShown = true;
+	console.error("[pi-web-search] Install `gh` CLI for better GitHub repo access including private repos.");
+}
+/**
+ * Reads the `size` field of a repository through `gh api`.
+ *
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @returns The parsed integer (presumably kilobytes, going by the original
+ *          variable name; confirm against the GitHub API), or `null` when `gh`
+ *          is unavailable, the call fails, or the output is not a number.
+ */
+export async function checkRepoSize(owner: string, repo: string): Promise<number | null> {
+	const hasGh = await checkGhAvailable();
+	if (!hasGh) {
+		return null;
+	}
+	const args = ["api", `repos/${owner}/${repo}`, "--jq", ".size"];
+	return new Promise((resolve) => {
+		execFile("gh", args, { timeout: 10000 }, (err, stdout) => {
+			const size = err ? Number.NaN : parseInt(stdout.trim(), 10);
+			resolve(Number.isNaN(size) ? null : size);
+		});
+	});
+}
+/**
+ * Reads the `default_branch` field of a repository through `gh api`.
+ *
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @returns The branch name, or `null` when `gh` is unavailable, the call fails,
+ *          or the output is empty.
+ */
+async function getDefaultBranch(owner: string, repo: string): Promise<string | null> {
+	const hasGh = await checkGhAvailable();
+	if (!hasGh) {
+		return null;
+	}
+	const args = ["api", `repos/${owner}/${repo}`, "--jq", ".default_branch"];
+	return new Promise((resolve) => {
+		execFile("gh", args, { timeout: 10000 }, (err, stdout) => {
+			const name = err ? "" : stdout.trim();
+			resolve(name === "" ? null : name);
+		});
+	});
+}
+/**
+ * Lists the paths of a repository's recursive git tree through `gh api`.
+ *
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @param ref - Tree reference interpolated into the API path.
+ * @returns Newline-joined paths, limited to `MAX_TREE_ENTRIES` with a trailing
+ *          "... (N total entries)" line when truncated; `null` when `gh` is
+ *          unavailable, the call fails, or no paths come back.
+ */
+async function fetchTreeViaApi(owner: string, repo: string, ref: string): Promise<string | null> {
+	const hasGh = await checkGhAvailable();
+	if (!hasGh) {
+		return null;
+	}
+	const endpoint = `repos/${owner}/${repo}/git/trees/${ref}?recursive=1`;
+	return new Promise((resolve) => {
+		execFile(
+			"gh",
+			["api", endpoint, "--jq", ".tree[].path"],
+			{ timeout: 15000, maxBuffer: 5 * 1024 * 1024 },
+			(err, stdout) => {
+				if (err) {
+					resolve(null);
+					return;
+				}
+				const entries = stdout.trim().split("\n").filter((line) => line.length > 0);
+				if (entries.length === 0) {
+					resolve(null);
+					return;
+				}
+				const listing = entries.slice(0, MAX_TREE_ENTRIES).join("\n");
+				if (entries.length > MAX_TREE_ENTRIES) {
+					resolve(listing + `\n... (${entries.length} total entries)`);
+				} else {
+					resolve(listing);
+				}
+			},
+		);
+	});
+}
+/**
+ * Fetches a repository's README through `gh api` and decodes it from base64.
+ *
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @param ref - Value placed in the `ref` query parameter as-is.
+ *              NOTE(review): it is not URL-encoded here; confirm callers never
+ *              pass refs containing characters such as `&` or `#`.
+ * @returns The README text, cut to 8192 characters with a truncation note when
+ *          longer; `null` when `gh` is unavailable or the call fails.
+ */
+async function fetchReadmeViaApi(owner: string, repo: string, ref: string): Promise<string | null> {
+	const hasGh = await checkGhAvailable();
+	if (!hasGh) {
+		return null;
+	}
+	const endpoint = `repos/${owner}/${repo}/readme?ref=${ref}`;
+	return new Promise((resolve) => {
+		execFile("gh", ["api", endpoint, "--jq", ".content"], { timeout: 10000 }, (err, stdout) => {
+			if (err) {
+				resolve(null);
+				return;
+			}
+			let readme: string;
+			try {
+				readme = Buffer.from(stdout.trim(), "base64").toString("utf-8");
+			} catch {
+				resolve(null);
+				return;
+			}
+			if (readme.length > 8192) {
+				resolve(readme.slice(0, 8192) + "\n\n[README truncated at 8K chars]");
+			} else {
+				resolve(readme);
+			}
+		});
+	});
+}
+/**
+ * Fetches one file's content through `gh api` and decodes it from base64.
+ *
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @param path - File path interpolated into the API path as-is.
+ * @param ref - Value placed in the `ref` query parameter as-is.
+ *              NOTE(review): neither `path` nor `ref` is URL-encoded here;
+ *              confirm how callers produce them.
+ * @returns The decoded file text, or `null` when `gh` is unavailable or the call fails.
+ */
+async function fetchFileViaApi(owner: string, repo: string, path: string, ref: string): Promise<string | null> {
+	const hasGh = await checkGhAvailable();
+	if (!hasGh) {
+		return null;
+	}
+	const endpoint = `repos/${owner}/${repo}/contents/${path}?ref=${ref}`;
+	return new Promise((resolve) => {
+		execFile(
+			"gh",
+			["api", endpoint, "--jq", ".content"],
+			{ timeout: 10000, maxBuffer: 2 * 1024 * 1024 },
+			(err, stdout) => {
+				if (err) {
+					resolve(null);
+					return;
+				}
+				let decoded: string | null;
+				try {
+					decoded = Buffer.from(stdout.trim(), "base64").toString("utf-8");
+				} catch {
+					decoded = null;
+				}
+				resolve(decoded);
+			},
+		);
+	});
+}
+/**
+ * Builds a Markdown view of a GitHub URL using only `gh api` calls (no clone).
+ *
+ * For a `blob` URL with a path, the view is that file's content, cut at
+ * `MAX_INLINE_FILE_CHARS`. Otherwise it is the repository's file listing and/or
+ * README followed by a note that this is an API-only view.
+ *
+ * @param url - Original URL, echoed back in the result.
+ * @param owner - Repository owner.
+ * @param repo - Repository name.
+ * @param info - Parsed URL details; `ref`, `type` and `path` are read here.
+ * @param sizeNote - Optional note placed at the top of the content, followed by a blank line.
+ * @returns The assembled content, or `null` when no ref can be determined, the
+ *          file cannot be fetched, or neither tree nor README is available.
+ */
+export async function fetchViaApi(
+	url: string,
+	owner: string,
+	repo: string,
+	info: GitHubUrlInfo,
+	sizeNote?: string,
+): Promise<ExtractedContent | null> {
+	// Fall back to the repository's default branch when the URL names no ref.
+	const ref = info.ref || (await getDefaultBranch(owner, repo));
+	if (!ref) return null;
+	const header: string[] = sizeNote ? [sizeNote, ""] : [];
+	if (info.type === "blob" && info.path) {
+		const fileText = await fetchFileViaApi(owner, repo, info.path, ref);
+		if (!fileText) return null;
+		const bodyLines =
+			fileText.length > MAX_INLINE_FILE_CHARS
+				? [fileText.slice(0, MAX_INLINE_FILE_CHARS), `\n[File truncated at 100K chars]`]
+				: [fileText];
+		return {
+			url,
+			title: `${owner}/${repo} - ${info.path}`,
+			content: [...header, `## ${info.path}`, ...bodyLines].join("\n"),
+			error: null,
+		};
+	}
+	// Tree and README are independent requests, so run them together.
+	const [tree, readme] = await Promise.all([
+		fetchTreeViaApi(owner, repo, ref),
+		fetchReadmeViaApi(owner, repo, ref),
+	]);
+	if (!tree && !readme) return null;
+	const sections: string[] = [
+		...header,
+		...(tree ? ["## Structure", tree, ""] : []),
+		...(readme ? ["## README.md", readme, ""] : []),
+		"This is an API-only view. Clone the repo or use `read`/`bash` for deeper exploration.",
+	];
+	return {
+		url,
+		title: info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`,
+		content: sections.join("\n"),
+		error: null,
+	};
+}