npm - @alexion42/pi-web-search - Versions diffs - 0.1.0 - Mend

@alexion42/pi-web-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/.pi/tasks/tasks-019e595f-0b95-7b09-9237-a0c6fbbda360.json +4 -0
package/CHANGELOG.md +18 -0
package/LICENSE +21 -0
package/README.md +88 -0
package/TOOLS.md +103 -0
package/activity.ts +101 -0
package/banner.png +0 -0
package/code-search.ts +107 -0
package/exa.ts +520 -0
package/extract.ts +342 -0
package/github-api.ts +196 -0
package/github-extract.ts +634 -0
package/index.ts +885 -0
package/package.json +46 -0
package/pdf-extract.ts +192 -0
package/pi-web-fetch-demo.mp4 +0 -0
package/rsc-extract.ts +338 -0
package/search.ts +49 -0
package/storage.ts +71 -0
package/test/pdf-extract.test.mjs +95 -0
package/types.ts +20 -0
package/utils.ts +44 -0

package/package.json ADDED Viewed

@@ -0,0 +1,46 @@
+{
+  "name": "@alexion42/pi-web-search",
+  "version": "0.1.0",
+  "description": "Exa-powered web search and content extraction for Pi coding agent",
+  "type": "module",
+  "scripts": {
+    "test": "node --test"
+  },
+  "keywords": [
+    "pi-package",
+    "pi",
+    "pi-coding-agent",
+    "extension",
+    "web-search",
+    "exa",
+    "fetch",
+    "scraping"
+  ],
+  "author": "Alexion Fortytwo <alexion@lexiupon.com>",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/nicobailon/pi-web-access.git"
+  },
+  "bugs": {
+    "url": "https://github.com/nicobailon/pi-web-access/issues"
+  },
+  "homepage": "https://github.com/nicobailon/pi-web-access#readme",
+  "dependencies": {
+    "@mozilla/readability": "^0.5.0",
+    "linkedom": "^0.16.0",
+    "p-limit": "^6.1.0",
+    "turndown": "^7.2.0",
+    "unpdf": "^1.6.2"
+  },
+  "peerDependencies": {
+    "@mariozechner/pi-ai": ">=0.6.0",
+    "@mariozechner/pi-coding-agent": ">=0.37.3",
+    "@mariozechner/pi-tui": ">=0.37.3"
+  },
+  "pi": {
+    "extensions": [
+      "./index.ts"
+    ]
+  }
+}

package/pdf-extract.ts ADDED Viewed

@@ -0,0 +1,192 @@
+/**
+ * PDF Content Extractor
+ *
+ * Extracts text from PDF files and saves to markdown.
+ * Uses unpdf (pdfjs-dist wrapper) for text extraction.
+ */
+import { getDocumentProxy } from "unpdf";
+import { writeFile, mkdir } from "node:fs/promises";
+import { join, basename } from "node:path";
+import { homedir } from "node:os";
+/** Summary returned after a PDF has been converted and written to disk. */
+export interface PDFExtractResult {
+  /** Title from the PDF metadata, or a title derived from the URL when the metadata has none. */
+  title: string;
+  /** Total number of pages in the PDF (not the number of pages actually extracted). */
+  pages: number;
+  /** Length of the generated markdown string (`String.length`, i.e. UTF-16 code units). */
+  chars: number;
+  /** Path of the markdown file that was written. */
+  outputPath: string;
+}
+/** Optional settings accepted by extractPDFToMarkdown. */
+export interface PDFExtractOptions {
+  /** Upper bound on pages to extract; floored and clamped to at least 1. Defaults to 100. */
+  maxPages?: number;
+  /** Directory the markdown file is written to (created if missing). Defaults to ~/Downloads. */
+  outputDir?: string;
+  /** Output file name, used verbatim (not sanitized, no ".md" appended). Defaults to the sanitized title plus ".md". */
+  filename?: string;
+}
+// Page limit applied when the caller gives none, or gives a non-finite value.
+const DEFAULT_MAX_PAGES = 100;
+// Output directory used when the caller gives none: the current user's Downloads folder.
+const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");
+/**
+ * Convert an in-memory PDF into a markdown file on disk.
+ *
+ * The markdown starts with a header (title, source URL, page count, optional
+ * author), followed by the text of each non-empty page. Every page after the
+ * first emitted one is preceded by an HTML comment carrying its page number.
+ *
+ * @param buffer  Raw bytes of the PDF.
+ * @param url     Origin of the PDF; written into the header and used as the title fallback.
+ * @param options Page limit, output directory and output file name overrides.
+ * @returns Title, total page count, markdown length and the path that was written.
+ */
+export async function extractPDFToMarkdown(
+  buffer: ArrayBuffer,
+  url: string,
+  options: PDFExtractOptions = {}
+): Promise<PDFExtractResult> {
+  const {
+    maxPages: requestedMax = DEFAULT_MAX_PAGES,
+    outputDir: targetDir = DEFAULT_OUTPUT_DIR,
+    filename: explicitName,
+  } = options;
+  // Non-finite limits fall back to the default; finite ones are floored and kept >= 1.
+  let pageLimit = DEFAULT_MAX_PAGES;
+  if (Number.isFinite(requestedMax)) {
+    pageLimit = Math.max(1, Math.floor(requestedMax));
+  }
+  const doc = await getDocumentProxy(new Uint8Array(buffer));
+  const { info } = await doc.getMetadata();
+  const infoRecord = info && typeof info === "object"
+    ? (info as Record<string, unknown>)
+    : null;
+  // Only string-valued metadata fields are trusted.
+  const rawTitle = infoRecord?.Title;
+  const rawAuthor = infoRecord?.Author;
+  const author = typeof rawAuthor === "string" ? rawAuthor : undefined;
+  const trimmedMetaTitle = typeof rawTitle === "string" ? rawTitle.trim() : "";
+  const title = trimmedMetaTitle || extractTitleFromURL(url);
+  const totalPages = doc.numPages;
+  const extractCount = Math.min(totalPages, pageLimit);
+  const isTruncated = totalPages > pageLimit;
+  // Collect text one page at a time; pages without any text are skipped.
+  const extracted: { pageNum: number; text: string }[] = [];
+  for (let pageNum = 1; pageNum <= extractCount; pageNum += 1) {
+    const page = await doc.getPage(pageNum);
+    const { items } = await page.getTextContent();
+    const fragments = items.map((item: unknown) => (item as { str?: string }).str || "");
+    const text = fragments.join(" ").replace(/\s+/g, " ").trim();
+    if (text) {
+      extracted.push({ pageNum, text });
+    }
+  }
+  // Header block.
+  const markdown: string[] = [
+    `# ${title}`,
+    "",
+    `> Source: ${url}`,
+    `> Pages: ${totalPages}${isTruncated ? ` (extracted first ${extractCount})` : ""}`,
+  ];
+  if (author) {
+    markdown.push(`> Author: ${author}`);
+  }
+  markdown.push("", "---", "");
+  // Body: the first emitted page has no marker, later ones are introduced by one.
+  extracted.forEach((entry, index) => {
+    if (index > 0) {
+      markdown.push("", `<!-- Page ${entry.pageNum} -->`, "");
+    }
+    markdown.push(entry.text);
+  });
+  if (isTruncated) {
+    markdown.push(
+      "",
+      "---",
+      "",
+      `*[Truncated: Only first ${extractCount} of ${totalPages} pages extracted]*`
+    );
+  }
+  const content = markdown.join("\n");
+  // An explicit file name wins; otherwise derive one from the title.
+  const outputPath = join(targetDir, explicitName || sanitizeFilename(title) + ".md");
+  await mkdir(targetDir, { recursive: true });
+  await writeFile(outputPath, content, "utf-8");
+  return { title, pages: totalPages, chars: content.length, outputPath };
+}
+/**
+ * Derive a human-readable title from a PDF URL.
+ *
+ * Takes the last path segment, drops a trailing ".pdf" in any letter case,
+ * percent-decodes it, and turns runs of "_" / "-" into single spaces.
+ * arXiv URLs (/pdf/<id> or /abs/<id>) become "arxiv <id>": the hyphen in the
+ * intermediate "arxiv-<id>" is also replaced by a space during clean-up.
+ *
+ * @param url Absolute URL of the PDF.
+ * @returns The derived title, or "document" when the URL cannot be parsed or
+ *          yields an empty name.
+ */
+function extractTitleFromURL(url: string): string {
+  try {
+    const urlObj = new URL(url);
+    const pathname = urlObj.pathname;
+    // Last path segment without its extension; isPDF accepts ".PDF" too, so match any case.
+    let filename = basename(pathname).replace(/\.pdf$/i, "");
+    // URL.pathname stays percent-encoded; decode so "my%20paper" reads "my paper".
+    try {
+      filename = decodeURIComponent(filename);
+    } catch {
+      // Malformed escape sequence: keep the raw segment rather than losing the title.
+    }
+    // Handle arxiv URLs: /pdf/1706.03762 → "arxiv-1706.03762" (hyphen becomes a space below)
+    if (urlObj.hostname.includes("arxiv.org")) {
+      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
+      if (match) {
+        filename = `arxiv-${match[1]}`;
+      }
+    }
+    // Clean up filename
+    filename = filename
+      .replace(/[_-]+/g, " ")
+      .replace(/\s+/g, " ")
+      .trim();
+    return filename || "document";
+  } catch {
+    return "document";
+  }
+}
+/**
+ * Turn an arbitrary title into a safe, lowercase, dash-separated file stem.
+ *
+ * Keeps only a-z, 0-9, whitespace and "-", converts whitespace runs to a
+ * single dash, collapses repeated dashes, caps the result at 100 characters
+ * and strips a leading/trailing dash. Falls back to "document" when nothing
+ * is left.
+ */
+function sanitizeFilename(name: string): string {
+  const lowered = name.toLowerCase();
+  const allowedOnly = lowered.replace(/[^a-z0-9\s-]/g, "");
+  const dashed = allowedOnly.replace(/\s+/g, "-").replace(/-+/g, "-");
+  // Cap first, then trim, so a dash exposed by the cut is removed as well.
+  const clipped = dashed.slice(0, 100);
+  const trimmed = clipped.replace(/^-|-$/g, "");
+  return trimmed === "" ? "document" : trimmed;
+}
+/**
+ * Decide whether a response should be treated as a PDF.
+ *
+ * @param url         URL of the resource; its path is checked for a ".pdf" suffix (any letter case).
+ * @param contentType Optional Content-Type header value; matched case-insensitively.
+ * @returns true when the content type mentions "application/pdf" or the URL path
+ *          ends in ".pdf"; false otherwise, including when the URL cannot be parsed.
+ */
+export function isPDF(url: string, contentType?: string): boolean {
+  // Media types are case-insensitive, so normalise before matching.
+  if (contentType?.toLowerCase().includes("application/pdf")) {
+    return true;
+  }
+  try {
+    const urlObj = new URL(url);
+    return urlObj.pathname.toLowerCase().endsWith(".pdf");
+  } catch {
+    return false;
+  }
+}

package/pi-web-fetch-demo.mp4 ADDED Viewed

Binary file

package/rsc-extract.ts ADDED Viewed

@@ -0,0 +1,338 @@
+/**
+ * RSC Content Extractor
+ *
+ * Extracts readable content from Next.js React Server Components (RSC) flight payloads.
+ * RSC pages embed content as JSON in <script>self.__next_f.push([...])</script> tags.
+ */
+/** Readable content recovered from an RSC flight payload. */
+export interface RSCExtractResult {
+  /** Text of the page's <title> element up to the first "|", trimmed; "" when there is none. */
+  title: string;
+  /** Markdown rendering of the extracted content. */
+  content: string;
+}
+/**
+ * Extract readable markdown from the RSC flight payload embedded in a page.
+ *
+ * Steps:
+ *  1. Collect every `self.__next_f.push([1,"..."])` script payload and split it
+ *     into "id:payload" lines, keeping the longest payload seen for each id.
+ *  2. Render chunk "23" to markdown and return it if it has more than 100
+ *     characters of content.
+ *  3. Otherwise render every other chunk, keep those longer than 50 characters
+ *     that do not look like a not-found page, order them by hex id, drop
+ *     near-duplicates and join them.
+ *
+ * @param html Full HTML source of the page.
+ * @returns The title and markdown content, or null when the page has no RSC
+ *          payload or too little content could be recovered.
+ */
+export function extractRSCContent(html: string): RSCExtractResult | null {
+  if (!html.includes("self.__next_f.push")) {
+    return null;
+  }
+  // Parse all RSC chunks into a map
+  const chunkMap = new Map<string, string>();
+  // Only attribute-less <script> tags whose push argument is [1,"..."] are matched.
+  const scriptRegex = /<script>self\.__next_f\.push\(\[1,"([\s\S]*?)"\]\)<\/script>/g;
+  for (const match of html.matchAll(scriptRegex)) {
+    let content: string;
+    try {
+      // The capture is the body of a string literal; re-quote it so JSON.parse un-escapes it.
+      content = JSON.parse('"' + match[1] + '"');
+    } catch {
+      continue;
+    }
+    // Parse each line as "id:payload"
+    // Lines are separated by \n, each line is one chunk
+    // Chunk IDs are hex strings, typically 1-4 chars (supports up to 65535 chunks)
+    for (const line of content.split("\n")) {
+      if (!line.trim()) continue;
+      const colonIdx = line.indexOf(":");
+      if (colonIdx <= 0 || colonIdx > 4) continue;
+      const id = line.slice(0, colonIdx);
+      if (!/^[0-9a-f]+$/i.test(id)) continue;
+      const payload = line.slice(colonIdx + 1);
+      if (!payload) continue;
+      // When an id shows up more than once, keep the longest payload.
+      const existing = chunkMap.get(id);
+      if (!existing || payload.length > existing.length) {
+        chunkMap.set(id, payload);
+      }
+    }
+  }
+  if (chunkMap.size === 0) return null;
+  // Extract title
+  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/);
+  const title = titleMatch?.[1]?.split("|")[0]?.trim() || "";
+  // Parse and cache parsed chunks
+  const parsedCache = new Map<string, unknown>();
+  // Returns the JSON-parsed chunk, or null when the chunk is missing, does not
+  // start with "[" or fails to parse. Failures are cached as null too.
+  function getParsedChunk(id: string): unknown | null {
+    if (parsedCache.has(id)) return parsedCache.get(id);
+    const chunk = chunkMap.get(id);
+    if (!chunk || !chunk.startsWith("[")) {
+      parsedCache.set(id, null);
+      return null;
+    }
+    try {
+      const parsed = JSON.parse(chunk);
+      parsedCache.set(id, parsed);
+      return parsed;
+    } catch {
+      parsedCache.set(id, null);
+      return null;
+    }
+  }
+  // Extract markdown from nodes, resolving refs on the fly
+  type Node = unknown;
+  // Ids of the refs currently being resolved; an id is added before descending
+  // into it and removed afterwards, so a ref that points back at itself is skipped.
+  const visitedRefs = new Set<string>();
+  // Renders one node (string, number, element tuple or array of nodes) to markdown.
+  // ctx.inTable suppresses paragraph breaks; ctx.inCode keeps "$"-prefixed strings.
+  function extractNode(node: Node, ctx = { inTable: false, inCode: false }): string {
+    if (node === null || node === undefined) return "";
+    if (typeof node === "string") {
+      // Check if it's a reference like "$L30"
+      const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
+      if (refMatch) {
+        const refId = refMatch[1];
+        if (visitedRefs.has(refId)) return ""; // Prevent cycles
+        visitedRefs.add(refId);
+        const refNode = getParsedChunk(refId);
+        const result = refNode ? extractNode(refNode, ctx) : "";
+        visitedRefs.delete(refId);
+        return result;
+      }
+      // Filter out RSC-specific artifacts, but preserve content inside code blocks
+      if (!ctx.inCode && (node === "$undefined" || node === "$" || /^\$[A-Z]/.test(node))) return "";
+      // Whitespace-only strings are dropped; anything else is kept untrimmed.
+      return node.trim() ? node : "";
+    }
+    if (typeof node === "number") return String(node);
+    if (typeof node === "boolean") return "";
+    if (!Array.isArray(node)) return "";
+    // RSC element: ["$", "tag", key, props]
+    if (node[0] === "$" && typeof node[1] === "string") {
+      const tag = node[1] as string;
+      const props = (node[3] || {}) as Record<string, unknown>;
+      // Skip non-content
+      const skipTags = ["script", "style", "svg", "path", "circle", "link", "meta",
+                        "template", "button", "input", "nav", "footer", "aside"];
+      if (skipTags.includes(tag)) return "";
+      // Component ref like $L25
+      if (tag.startsWith("$L")) {
+        const refId = tag.slice(2);
+        if (visitedRefs.has(refId)) return "";
+        // Check for heading components with baseId
+        // (rendered as a level-2 heading from props.children, without resolving the ref)
+        if (props.baseId && props.children) {
+          return `## ${String(props.children)}\n\n`;
+        }
+        visitedRefs.add(refId);
+        const refNode = getParsedChunk(refId);
+        let result = "";
+        if (refNode) {
+          result = extractNode(refNode, ctx);
+        } else if (props.children) {
+          // The referenced chunk is not renderable; fall back to the element's own children.
+          result = extractNode(props.children as Node, ctx);
+        }
+        visitedRefs.delete(refId);
+        return result;
+      }
+      const children = props.children;
+      const content = children ? extractNode(children as Node, ctx) : "";
+      switch (tag) {
+        case "h1": return `# ${content.trim()}\n\n`;
+        case "h2": return `## ${content.trim()}\n\n`;
+        case "h3": return `### ${content.trim()}\n\n`;
+        case "h4": return `#### ${content.trim()}\n\n`;
+        case "h5": return `##### ${content.trim()}\n\n`;
+        case "h6": return `###### ${content.trim()}\n\n`;
+        case "p": return ctx.inTable ? content : `${content.trim()}\n\n`;
+        case "code": {
+          // Children are rendered again with inCode set so "$"-prefixed text survives.
+          const codeContent = children ? extractNode(children as Node, { ...ctx, inCode: true }) : "";
+          return ctx.inCode ? codeContent : `\`${codeContent}\``;
+        }
+        case "pre": {
+          const preContent = children ? extractNode(children as Node, { ...ctx, inCode: true }) : "";
+          return "```\n" + preContent + "\n```\n\n";
+        }
+        case "strong": case "b": return `**${content}**`;
+        case "em": case "i": return `*${content}*`;
+        // List items always get a "- " bullet, for both <ul> and <ol>.
+        case "li": return `- ${content.trim()}\n`;
+        case "ul": case "ol": return content + "\n";
+        case "blockquote": return `> ${content.trim()}\n\n`;
+        case "table": return extractTable(node as unknown[]) + "\n";
+        case "thead": case "tbody": case "tr": case "th": case "td":
+          return content;
+        case "div":
+          // Alert boxes are rendered as block quotes.
+          if (props.role === "alert" || props["data-slot"] === "alert") {
+            return `> ${content.trim()}\n\n`;
+          }
+          return content;
+        case "a": {
+          // In-page anchors ("#...") and links without href are rendered as plain text.
+          const href = props.href as string | undefined;
+          return href && !href.startsWith("#") ? `[${content}](${href})` : content;
+        }
+        default: return content;
+      }
+    }
+    // Array of child nodes
+    return (node as Node[]).map(n => extractNode(n, ctx)).join("");
+  }
+  // Renders a ["$", "table", key, props] element as a markdown pipe table.
+  function extractTable(tableNode: unknown[]): string {
+    const props = (tableNode[3] || {}) as Record<string, unknown>;
+    const rows: string[][] = [];
+    let headerRowCount = 0;
+    // Walks the table structure, collecting one string[] per <tr>; isHeader is
+    // true while inside a <thead>.
+    function walkTable(node: unknown, isHeader = false): void {
+      if (node === null || node === undefined) return;
+      // Handle string refs
+      if (typeof node === "string") {
+        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
+        if (refMatch && !visitedRefs.has(refMatch[1])) {
+          visitedRefs.add(refMatch[1]);
+          const refNode = getParsedChunk(refMatch[1]);
+          if (refNode) walkTable(refNode, isHeader);
+          visitedRefs.delete(refMatch[1]);
+        }
+        return;
+      }
+      if (!Array.isArray(node)) return;
+      if (node[0] === "$") {
+        const tag = node[1] as string;
+        const nodeProps = (node[3] || {}) as Record<string, unknown>;
+        // Handle component refs
+        if (tag.startsWith("$L")) {
+          const refId = tag.slice(2);
+          if (!visitedRefs.has(refId)) {
+            visitedRefs.add(refId);
+            const refNode = getParsedChunk(refId);
+            if (refNode) walkTable(refNode, isHeader);
+            visitedRefs.delete(refId);
+          }
+          return;
+        }
+        if (tag === "thead") walkTable(nodeProps.children, true);
+        else if (tag === "tbody") walkTable(nodeProps.children, false);
+        else if (tag === "tr") {
+          const cells: string[] = [];
+          walkCells(nodeProps.children, cells);
+          // Rows without any cells are dropped.
+          if (cells.length > 0) {
+            rows.push(cells);
+            if (isHeader) headerRowCount++;
+          }
+        } else walkTable(nodeProps.children, isHeader);
+      } else {
+        for (const child of node) walkTable(child, isHeader);
+      }
+    }
+    // Collects the text of every <td>/<th> found under node into cells.
+    function walkCells(node: unknown, cells: string[]): void {
+      if (node === null || node === undefined) return;
+      // Handle string refs
+      if (typeof node === "string") {
+        const refMatch = node.match(/^\$L([0-9a-f]+)$/i);
+        if (refMatch && !visitedRefs.has(refMatch[1])) {
+          visitedRefs.add(refMatch[1]);
+          const refNode = getParsedChunk(refMatch[1]);
+          if (refNode) walkCells(refNode, cells);
+          visitedRefs.delete(refMatch[1]);
+        }
+        return;
+      }
+      if (!Array.isArray(node)) return;
+      if (node[0] === "$" && (node[1] === "td" || node[1] === "th")) {
+        const cellProps = (node[3] || {}) as Record<string, unknown>;
+        // Cell text is flattened to one line and made safe for a pipe table.
+        const text = extractNode(cellProps.children, { inTable: true, inCode: false })
+          .trim()
+          .replace(/\n/g, " ")
+          .replace(/\\/g, "\\\\")  // Escape backslashes first
+          .replace(/\|/g, "\\|");  // Then escape pipes
+        cells.push(text);
+      } else if (node[0] === "$" && typeof node[1] === "string" && (node[1] as string).startsWith("$L")) {
+        // Component ref for a cell
+        const refId = (node[1] as string).slice(2);
+        if (!visitedRefs.has(refId)) {
+          visitedRefs.add(refId);
+          const refNode = getParsedChunk(refId);
+          if (refNode) walkCells(refNode, cells);
+          visitedRefs.delete(refId);
+        }
+      } else {
+        for (const child of node) walkCells(child, cells);
+      }
+    }
+    walkTable(props.children);
+    if (rows.length === 0) return "";
+    // Short rows are padded with empty cells up to the widest row.
+    const colCount = Math.max(...rows.map(r => r.length));
+    let md = "";
+    for (let i = 0; i < rows.length; i++) {
+      const row = rows[i].concat(Array(colCount - rows[i].length).fill(""));
+      md += "| " + row.join(" | ") + " |\n";
+      // The separator goes after the last header row, or after the first row
+      // when no header rows were counted.
+      if (i === headerRowCount - 1 || (headerRowCount === 0 && i === 0)) {
+        md += "| " + Array(colCount).fill("---").join(" | ") + " |\n";
+      }
+    }
+    return md;
+  }
+  // Process main content chunk (usually "23")
+  // NOTE(review): the id "23" is hard-coded; pages that place their content
+  // under another id only reach it through the fallback below.
+  const mainChunk = getParsedChunk("23");
+  if (mainChunk) {
+    const content = extractNode(mainChunk);
+    if (content.trim().length > 100) {
+      const cleaned = content
+        .replace(/\n{3,}/g, "\n\n")
+        .trim();
+      return { title, content: cleaned };
+    }
+  }
+  // Fallback: try other chunks
+  const contentParts: { order: number; text: string }[] = [];
+  for (const [id] of chunkMap) {
+    if (id === "23") continue;
+    const parsed = getParsedChunk(id);
+    if (!parsed) continue;
+    // Each chunk is rendered with a fresh cycle-guard set.
+    visitedRefs.clear();
+    const text = extractNode(parsed);
+    // Crude not-found filter: any chunk whose text contains "404" is discarded.
+    if (text.trim().length > 50 &&
+        !text.includes("page was not found") &&
+        !text.includes("404")) {
+      contentParts.push({ order: parseInt(id, 16), text: text.trim() });
+    }
+  }
+  if (contentParts.length === 0) return null;
+  // Order by numeric value of the hex chunk id.
+  contentParts.sort((a, b) => a.order - b.order);
+  // Parts sharing the same first 150 characters count as duplicates; the first one wins.
+  const seen = new Set<string>();
+  const uniqueParts: string[] = [];
+  for (const part of contentParts) {
+    const key = part.text.slice(0, 150);
+    if (!seen.has(key)) {
+      seen.add(key);
+      uniqueParts.push(part.text);
+    }
+  }
+  const content = uniqueParts.join("\n\n").replace(/\n{3,}/g, "\n\n").trim();
+  return content.length > 100 ? { title, content } : null;
+}

package/search.ts ADDED Viewed

@@ -0,0 +1,49 @@
+import { activityMonitor } from "./activity.js";
+import { hasExaApiKey, searchWithExa } from "./exa.js";
+import type { SearchResponse, SearchOptions } from "./types.js";
+const MAX_NUM_RESULTS = 20;
+/** Options accepted by search(): the shared SearchOptions plus a content toggle. */
+export interface FullSearchOptions extends SearchOptions {
+	// NOTE(review): presumably asks the provider to return page content along with
+	// the results; this function only passes it through to searchWithExa. Confirm there.
+	includeContent?: boolean;
+}
+/** Best-effort text for any thrown value: Error.message, otherwise String(value). */
+function errorMessage(err: unknown): string {
+	return err instanceof Error ? err.message : String(err);
+}
+/**
+ * Report whether a thrown value represents a cancelled request.
+ *
+ * Cancellation errors are identified by their name ("AbortError"); their
+ * message is not guaranteed to mention "abort". The message check is kept as a
+ * fallback for plain errors and strings.
+ */
+function isAbortError(err: unknown): boolean {
+	if (typeof err === "object" && err !== null && (err as { name?: unknown }).name === "AbortError") {
+		return true;
+	}
+	return errorMessage(err).toLowerCase().includes("abort");
+}
+/**
+ * Run a web search through Exa and record the outcome with the activity monitor.
+ *
+ * A response carrying an "answer" is logged as completed with status 200 and
+ * returned. Every failure is rethrown after being logged: cancellations as
+ * completed with status 0, anything else as an error with its message.
+ *
+ * @param query   Search query text.
+ * @param options Search options, forwarded unchanged to searchWithExa.
+ * @returns The search response.
+ * @throws Error when the Exa free tier is exhausted, when no search provider
+ *         produced a response, or whatever searchWithExa itself throws.
+ */
+export async function search(query: string, options: FullSearchOptions = {}): Promise<SearchResponse> {
+	const activityId = activityMonitor.logStart({ type: "api", query });
+	try {
+		const result = await searchWithExa(query, options);
+		if (result && "exhausted" in result) {
+			throw new Error(
+				"Exa monthly free tier exhausted (1,000 requests). Resets next month.\n" +
+				"  Upgrade at exa.ai/pricing"
+			);
+		}
+		if (result && "answer" in result) {
+			activityMonitor.logComplete(activityId, 200);
+			return result;
+		}
+		// null result from MCP with no API key
+		throw new Error(
+			"No search provider available. Either:\n" +
+			"  1. Set EXA_API_KEY (or exaApiKey in ~/.pi/web-search.json)\n" +
+			"  2. Use Exa MCP (no API key needed)"
+		);
+	} catch (err) {
+		if (isAbortError(err)) {
+			activityMonitor.logComplete(activityId, 0);
+		} else {
+			// Same helper isAbortError relies on, so both see the same message text.
+			activityMonitor.logError(activityId, errorMessage(err));
+		}
+		throw err;
+	}
+}

package/storage.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
+import type { ExtractedContent } from "./extract.js";
+import type { SearchResult } from "./types.js";
+const CACHE_TTL_MS = 60 * 60 * 1000;
+/** Outcome of a single search query, as kept inside a stored "search" entry. */
+export interface QueryResultData {
+	query: string;
+	answer: string;
+	results: SearchResult[];
+	// NOTE(review): presumably null when the query succeeded and an error message otherwise — confirm with the producer.
+	error: string | null;
+}
+/** One cached tool result, keyed by id in the in-memory store. */
+export interface StoredSearchData {
+	id: string;
+	type: "search" | "fetch";
+	// Compared against Date.now() when restoring a session, i.e. epoch milliseconds.
+	timestamp: number;
+	// Must be an array when type is "search" for the entry to be restored from a session.
+	queries?: QueryResultData[];
+	// Must be an array when type is "fetch" for the entry to be restored from a session.
+	urls?: ExtractedContent[];
+}
+const storedResults = new Map<string, StoredSearchData>();
+/**
+ * Create an id for a stored result: the current time in base 36 followed by a
+ * 6-character base-36 random suffix.
+ *
+ * The suffix is padded with "0" because Math.random().toString(36) can have
+ * fewer than six fractional digits (0.5 is "0.i"), which would otherwise give
+ * ids of varying length.
+ */
+export function generateId(): string {
+	const timePart = Date.now().toString(36);
+	const randomPart = Math.random().toString(36).slice(2, 8).padEnd(6, "0");
+	return timePart + randomPart;
+}
+/** Save (or overwrite) the entry stored under the given id. */
+export function storeResult(id: string, data: StoredSearchData): void {
+	storedResults.set(id, data);
+}
+/** Look up an entry by id; returns null when nothing is stored under it. */
+export function getResult(id: string): StoredSearchData | null {
+	const hit = storedResults.get(id);
+	return hit === undefined ? null : hit;
+}
+/** Snapshot of every stored entry, in insertion order. */
+export function getAllResults(): StoredSearchData[] {
+	return [...storedResults.values()];
+}
+/** Remove one entry; returns whether an entry with that id existed. */
+export function deleteResult(id: string): boolean {
+	const removed = storedResults.delete(id);
+	return removed;
+}
+/** Drop every stored entry. */
+export function clearResults(): void {
+	storedResults.clear();
+}
+/**
+ * Type guard for entries read back from a session.
+ *
+ * Accepts an object with a non-empty string id, a numeric timestamp and a type
+ * of "search" (which needs a queries array) or "fetch" (which needs a urls array).
+ */
+function isValidStoredData(data: unknown): data is StoredSearchData {
+	if (typeof data !== "object" || data === null) return false;
+	const record = data as Record<string, unknown>;
+	const hasId = typeof record.id === "string" && record.id.length > 0;
+	const hasTimestamp = typeof record.timestamp === "number";
+	if (!hasId || !hasTimestamp) return false;
+	switch (record.type) {
+		case "search":
+			return Array.isArray(record.queries);
+		case "fetch":
+			return Array.isArray(record.urls);
+		default:
+			return false;
+	}
+}
+/**
+ * Rebuild the in-memory store from the current session branch.
+ *
+ * The store is emptied first. Custom entries of type "web-search-results" are
+ * then re-added when their data passes validation and their timestamp is less
+ * than CACHE_TTL_MS old.
+ */
+export function restoreFromSession(ctx: ExtensionContext): void {
+	storedResults.clear();
+	const restoredAt = Date.now();
+	for (const entry of ctx.sessionManager.getBranch()) {
+		if (entry.type !== "custom" || entry.customType !== "web-search-results") continue;
+		const candidate = entry.data;
+		if (!isValidStoredData(candidate)) continue;
+		const age = restoredAt - candidate.timestamp;
+		if (age < CACHE_TTL_MS) {
+			storedResults.set(candidate.id, candidate);
+		}
+	}
+}