npm - @pi-unipi/web-api - Versions diffs - 0.1.14 → 0.1.16 - Mend

@pi-unipi/web-api 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +81 -114
package/package.json +9 -2
package/skills/web/SKILL.md +54 -11
package/src/engine/constants.ts +36 -0
package/src/engine/dependencies.ts +145 -0
package/src/engine/dom.ts +266 -0
package/src/engine/extract.ts +642 -0
package/src/engine/format.ts +306 -0
package/src/engine/profiles.ts +102 -0
package/src/engine/types.ts +169 -0
package/src/index.ts +9 -2
package/src/providers/base.ts +9 -1
package/src/settings.ts +70 -4
package/src/tools.ts +281 -24
package/src/tui/progress.ts +168 -0
package/src/tui/result.ts +173 -0
package/src/tui/settings-dialog.ts +168 -0

package/src/engine/format.ts ADDED Viewed

@@ -0,0 +1,306 @@
+/**
+ * @unipi/web-api — Format & Error Builders
+ *
+ * Output formatting, content truncation, and error text builders.
+ */
+import type {
+  FetchResult,
+  FetchError,
+  BatchFetchResult,
+} from "./types.js";
+import { DEFAULT_MAX_CHARS } from "./constants.js";
+/** Truncation marker appended to truncated content */
+const TRUNCATION_MARKER = "\n\n... [truncated]";
+/**
+ * Truncate content to a maximum character count.
+ *
+ * Content that already fits is returned unchanged. Otherwise the content is
+ * cut so that content plus the truncation marker fits within `maxChars`,
+ * preferring to cut at a space when one lies in the last 20% of the budget.
+ * When `maxChars` is too small to hold the marker at all, the content is
+ * hard-cut to `maxChars` characters without a marker, so the result is never
+ * longer than the requested limit.
+ *
+ * @param content - Content to truncate
+ * @param maxChars - Maximum characters in the returned string
+ * @returns The original content, or a shortened version (with marker when it fits)
+ */
+export function truncateContent(
+  content: string,
+  maxChars: number = DEFAULT_MAX_CHARS
+): string {
+  if (!content || content.length <= maxChars) {
+    return content;
+  }
+  // Characters available for real content once the marker is accounted for
+  const budget = maxChars - TRUNCATION_MARKER.length;
+  if (budget <= 0) {
+    // A negative end index would make slice() count from the END of the
+    // string and return far more than maxChars; hard-cut instead.
+    return content.slice(0, Math.max(0, maxChars));
+  }
+  let truncated = content.slice(0, budget);
+  // Back up to the last space to avoid cutting mid-word, but only if that
+  // does not throw away more than 20% of the budget
+  const lastSpace = truncated.lastIndexOf(" ");
+  if (lastSpace > budget * 0.8) {
+    truncated = truncated.slice(0, lastSpace);
+  }
+  return truncated.trim() + TRUNCATION_MARKER;
+}
+/**
+ * Render a FetchResult in the requested output format and apply truncation.
+ *
+ * - "json" serializes the whole result object with 2-space indentation.
+ * - "text" runs the content through stripMarkdown().
+ * - "markdown", "html" and any other value return the content as-is
+ *   (no markdown-to-HTML conversion is performed for "html").
+ *
+ * NOTE(review): truncation is applied to the final string, so a truncated
+ * "json" output is no longer parseable JSON.
+ *
+ * @param result - Fetch result
+ * @param format - Output format (defaults to "markdown")
+ * @param maxChars - Maximum characters (optional; truncateContent's default applies when omitted)
+ * @returns Formatted, possibly truncated content string
+ */
+export function formatContent(
+  result: FetchResult,
+  format: "markdown" | "html" | "text" | "json" = "markdown",
+  maxChars?: number
+): string {
+  const rendered =
+    format === "json"
+      ? JSON.stringify(result, null, 2)
+      : format === "text"
+        ? stripMarkdown(result.content)
+        : result.content;
+  return truncateContent(rendered, maxChars);
+}
+/**
+ * Strip markdown formatting for plain text output.
+ *
+ * The order of the passes matters:
+ * 1. Fenced code blocks are dropped and inline code is unwrapped first, so
+ *    later passes never see the backticks.
+ * 2. Images are removed before links; otherwise the link pattern matches the
+ *    `[alt](url)` part of `![alt](url)` and leaves a stray `!alt` behind.
+ * 3. Line-level markers (blockquotes, horizontal rules, headers, list
+ *    markers) are removed before emphasis, so a `* ` bullet or a `***` rule
+ *    is not mistaken for an emphasis delimiter.
+ * 4. Emphasis delimiters must hug their text (`*x*`, not `* x *`), and
+ *    underscore emphasis is not recognized inside words, so arithmetic such
+ *    as `2 * 3 * 4` and identifiers such as `snake_case_name` survive.
+ *
+ * @param markdown - Markdown content
+ * @returns Plain text (empty string for empty input)
+ */
+function stripMarkdown(markdown: string): string {
+  if (!markdown) {
+    return "";
+  }
+  let text = markdown;
+  // Remove fenced code blocks, then unwrap inline code
+  text = text.replace(/```[\s\S]*?```/g, "");
+  text = text.replace(/`([^`]+)`/g, "$1");
+  // Remove images (must precede links — see header comment)
+  text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, "");
+  // Remove links, keep text
+  text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1");
+  // Remove blockquote markers (possibly nested)
+  text = text.replace(/^(?:>[ \t]*)+/gm, "");
+  // Remove horizontal rules: three or more of the same -, * or _ on a line
+  text = text.replace(/^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*$/gm, "");
+  // Remove headers
+  text = text.replace(/^#{1,6}[ \t]+/gm, "");
+  // Remove list markers; only horizontal whitespace is consumed so blank
+  // lines separating paragraphs from lists are preserved
+  text = text.replace(/^[ \t]*[-*+][ \t]+/gm, "");
+  text = text.replace(/^[ \t]*\d+\.[ \t]+/gm, "");
+  // Remove bold/italic written with asterisks
+  text = text.replace(/\*\*\*(?=\S)(.+?)(?<=\S)\*\*\*/g, "$1");
+  text = text.replace(/\*\*(?=\S)(.+?)(?<=\S)\*\*/g, "$1");
+  text = text.replace(/\*(?=\S)(.+?)(?<=\S)\*/g, "$1");
+  // Remove bold/italic written with underscores (never inside a word)
+  text = text.replace(/(?<!\w)___(?=\S)(.+?)(?<=\S)___(?!\w)/g, "$1");
+  text = text.replace(/(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)/g, "$1");
+  text = text.replace(/(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)/g, "$1");
+  // Clean up extra whitespace
+  text = text.replace(/\n{3,}/g, "\n\n");
+  return text.trim();
+}
+/**
+ * Build a human-readable, newline-separated error message from a FetchError.
+ *
+ * Lines are emitted in a fixed order: message, "(code during phase)", URL
+ * (with redirect arrow when the final URL differs), HTTP status, content
+ * type, expected size in KB, download percentage, and a retry hint. Optional
+ * lines are omitted when the corresponding field is absent.
+ *
+ * @param error - Fetch error
+ * @returns Human-readable error string
+ */
+export function buildErrorText(error: FetchError): string {
+  const {
+    url,
+    finalUrl,
+    statusCode,
+    statusText,
+    mimeType,
+    contentLength,
+    downloadedBytes,
+  } = error;
+  // The message and its code/phase context are always present
+  const segments: string[] = [
+    error.error,
+    `(${error.code} during ${error.phase})`,
+  ];
+  if (url) {
+    const redirected = Boolean(finalUrl) && finalUrl !== url;
+    segments.push(redirected ? `URL: ${url} → ${finalUrl}` : `URL: ${url}`);
+  }
+  if (statusCode) {
+    const reason = statusText ? ` ${statusText}` : "";
+    segments.push(`Status: ${statusCode}${reason}`);
+  }
+  if (mimeType) {
+    segments.push(`Content-Type: ${mimeType}`);
+  }
+  if (contentLength !== undefined) {
+    segments.push(`Size: ${Math.round(contentLength / 1024)} KB`);
+  }
+  // A percentage needs a non-zero expected size to divide by
+  if (downloadedBytes !== undefined && contentLength) {
+    segments.push(
+      `Downloaded: ${Math.round((downloadedBytes / contentLength) * 100)}%`
+    );
+  }
+  segments.push(
+    error.retryable
+      ? "This error may be retried."
+      : "This error is not retryable."
+  );
+  return segments.join("\n");
+}
+/**
+ * Format a single FetchResult for display.
+ *
+ * In verbose mode the content is preceded by a metadata header: a title
+ * heading ("Untitled" when the title is empty), the URL, the final URL when
+ * it differs, any non-empty author/published/site/language values, the word
+ * count, and a "---" separator. In non-verbose mode only the content is
+ * returned.
+ *
+ * @param result - Fetch result
+ * @param verbose - Include metadata header (default: true)
+ * @returns Formatted string
+ */
+export function formatSingleResult(
+  result: FetchResult,
+  verbose: boolean = true
+): string {
+  const header: string[] = [];
+  if (verbose) {
+    header.push(`# ${result.title || "Untitled"}`, "", `URL: ${result.url}`);
+    if (result.finalUrl !== result.url) {
+      header.push(`Final URL: ${result.finalUrl}`);
+    }
+    // Optional metadata, listed in display order; empty values are skipped
+    const optionalFields: Array<[string, string]> = [
+      ["Author", result.author],
+      ["Published", result.published],
+      ["Site", result.site],
+      ["Language", result.language],
+    ];
+    for (const [label, value] of optionalFields) {
+      if (value) {
+        header.push(`${label}: ${value}`);
+      }
+    }
+    header.push(`Word count: ${result.wordCount}`, "", "---", "");
+  }
+  return [...header, result.content].join("\n");
+}
+/**
+ * Format a BatchFetchResult for display.
+ *
+ * Emits a summary header followed by one section per item, in input order.
+ * Successful items show title, URL, word count and a preview of at most 500
+ * characters of content (suffixed with "..." when cut). Failed items show
+ * the URL ("unknown" when missing) and the error message.
+ *
+ * @param result - Batch fetch result
+ * @returns Formatted string
+ */
+export function formatBatchResult(result: BatchFetchResult): string {
+  const previewChars = 500;
+  const out: string[] = [
+    `# Batch Read Results`,
+    "",
+    `Total: ${result.total} · Succeeded: ${result.succeeded} · Failed: ${result.failed}`,
+    "",
+  ];
+  for (const [index, item] of result.items.entries()) {
+    const mark = item.status === "done" ? "✓" : "✗";
+    out.push(`## [${index + 1}/${result.total}] ${mark}`);
+    if (item.status === "done") {
+      const { title, url, wordCount, content } = item.result;
+      const ellipsis = content.length > previewChars ? "..." : "";
+      out.push(
+        `Title: ${title}`,
+        `URL: ${url}`,
+        `Words: ${wordCount}`,
+        "",
+        content.slice(0, previewChars) + ellipsis
+      );
+    } else {
+      out.push(
+        `URL: ${item.error.url || "unknown"}`,
+        `Error: ${item.error.error}`
+      );
+    }
+    // Blank line after every section
+    out.push("");
+  }
+  return out.join("\n");
+}
+/**
+ * Format a FetchError as a small markdown document for display.
+ *
+ * Always includes the heading, the bolded message, and the code and phase.
+ * URL (plus final URL when it differs), HTTP status, and the retry hint are
+ * each added as their own blank-line-separated paragraph when present.
+ * Unlike buildErrorText(), nothing is printed for non-retryable errors.
+ *
+ * @param error - Fetch error
+ * @returns Formatted error string
+ */
+export function formatErrorResult(error: FetchError): string {
+  const out: string[] = [
+    `# Fetch Error`,
+    "",
+    `**${error.error}**`,
+    "",
+    `Code: \`${error.code}\``,
+    `Phase: \`${error.phase}\``,
+  ];
+  if (error.url) {
+    out.push("", `URL: ${error.url}`);
+    const redirected = error.finalUrl && error.finalUrl !== error.url;
+    if (redirected) {
+      out.push(`Final URL: ${error.finalUrl}`);
+    }
+  }
+  if (error.statusCode) {
+    const reason = error.statusText ? ` ${error.statusText}` : "";
+    out.push("", `HTTP Status: ${error.statusCode}${reason}`);
+  }
+  if (error.retryable) {
+    out.push("", `*This error may be retried.*`);
+  }
+  return out.join("\n");
+}

package/src/engine/profiles.ts ADDED Viewed

@@ -0,0 +1,102 @@
+/**
+ * @unipi/web-api — Browser Profile Resolution
+ *
+ * Resolves browser TLS fingerprint profiles for wreq-js.
+ */
+import { DEFAULT_BROWSER, DEFAULT_OS } from "./constants.js";
+/**
+ * Known browser profiles for TLS fingerprinting.
+ *
+ * Order matters: within each browser family the entries are listed with
+ * ascending version numbers, and resolveBrowserProfile() returns the LAST
+ * entry that matches a prefix. Append new versions at the end of their
+ * family so that a bare family name keeps resolving to the newest profile.
+ */
+export const BROWSER_PROFILES = [
+  // Chrome
+  "chrome_100",
+  "chrome_101",
+  "chrome_104",
+  "chrome_107",
+  "chrome_110",
+  "chrome_116",
+  "chrome_119",
+  "chrome_120",
+  "chrome_123",
+  "chrome_124",
+  "chrome_131",
+  "chrome_133",
+  "chrome_145",
+  // Firefox
+  "firefox_120",
+  "firefox_133",
+  // Safari
+  "safari_15_6_1",
+  "safari_16_0",
+  "safari_17_0",
+  // Edge
+  "edge_101",
+] as const;
+/**
+ * OS fingerprint options.
+ *
+ * Declared `as const` so the OSProfile type can be derived from the list.
+ * resolveOSProfile() passes values outside this list through unchanged.
+ */
+export const OS_PROFILES = [
+  "windows",
+  "macos",
+  "linux",
+  "android",
+  "ios",
+] as const;
+/** Union of the string literals listed in BROWSER_PROFILES */
+export type BrowserProfile = (typeof BROWSER_PROFILES)[number];
+/** Union of the string literals listed in OS_PROFILES */
+export type OSProfile = (typeof OS_PROFILES)[number];
+/**
+ * Resolve a browser profile string.
+ *
+ * Resolution order:
+ * 1. Empty/omitted input yields DEFAULT_BROWSER.
+ * 2. An exact (case-sensitive) entry of BROWSER_PROFILES is returned as-is.
+ * 3. Otherwise the input is treated as a case-insensitive prefix and the
+ *    last matching entry of BROWSER_PROFILES wins (e.g. "chrome" resolves to
+ *    the final Chrome entry in the list).
+ * 4. Anything else is passed through unchanged, since wreq-js may support
+ *    profiles this list does not know about.
+ *
+ * @param browser - Browser profile string or undefined
+ * @returns Resolved browser profile (not necessarily a known one)
+ */
+export function resolveBrowserProfile(browser?: string): string {
+  if (!browser) {
+    return DEFAULT_BROWSER;
+  }
+  const known: readonly string[] = BROWSER_PROFILES;
+  if (known.includes(browser)) {
+    return browser;
+  }
+  // Search from the end of the list so the last matching entry is found first
+  const wanted = browser.toLowerCase();
+  const newest = [...known]
+    .reverse()
+    .find((profile) => profile.toLowerCase().startsWith(wanted));
+  return newest ?? browser;
+}
+/**
+ * Resolve an OS fingerprint string.
+ *
+ * Empty/omitted input yields DEFAULT_OS. Any other value is returned
+ * unchanged whether or not it appears in OS_PROFILES: no validation or
+ * normalization is performed.
+ *
+ * @param os - OS string or undefined
+ * @returns The given OS string, or DEFAULT_OS when none was given
+ */
+export function resolveOSProfile(os?: string): string {
+  // Known and unknown values are both passed through as-is
+  return os ? os : DEFAULT_OS;
+}

package/src/engine/types.ts ADDED Viewed

@@ -0,0 +1,169 @@
+/**
+ * @unipi/web-api — Smart-Fetch Engine Types
+ *
+ * Type definitions for the local smart-fetch engine:
+ * wreq-js (TLS fingerprinting) + defuddle (content extraction) + linkedom (server-side DOM).
+ */
+// ─── Error Types ───────────────────────────────────────────────
+/**
+ * Structured error codes for fetch failures.
+ * One of these values is carried in FetchError.code.
+ */
+export type FetchErrorCode =
+  | "invalid_url"
+  | "unsupported_protocol"
+  | "http_error"
+  | "unexpected_response"
+  | "timeout"
+  | "network_error"
+  | "processing_error"
+  | "download_error"
+  | "no_content"
+  | "too_many_redirects";
+/**
+ * Phase in the fetch pipeline where an error occurred.
+ * Carried in FetchError.phase. The "connecting", "waiting", "loading" and
+ * "processing" values share their names with FetchProgressStatus values.
+ */
+export type FetchErrorPhase =
+  | "validation"
+  | "connecting"
+  | "waiting"
+  | "loading"
+  | "processing"
+  | "unknown";
+/**
+ * Rich error with structured context for agent retry decisions.
+ *
+ * Only `error`, `code`, `phase` and `retryable` are required; every other
+ * field is optional context that is present only when it applies.
+ */
+export interface FetchError {
+  /** Human-readable error message */
+  error: string;
+  /** Structured error category */
+  code: FetchErrorCode;
+  /** Where in the pipeline it failed */
+  phase: FetchErrorPhase;
+  /** Whether the agent can retry this request */
+  retryable: boolean;
+  /** Configured timeout in ms */
+  timeoutMs?: number;
+  /** Original URL requested */
+  url?: string;
+  /** Final URL after redirects */
+  finalUrl?: string;
+  /** HTTP status code (if applicable) */
+  statusCode?: number;
+  /** HTTP status text (if applicable) */
+  statusText?: string;
+  /** Response content type */
+  mimeType?: string;
+  /** Expected response size in bytes */
+  contentLength?: number;
+  /** Bytes downloaded before failure */
+  downloadedBytes?: number;
+}
+// ─── Result Types ──────────────────────────────────────────────
+/**
+ * Successful fetch result with full metadata.
+ *
+ * Every field is required by the type.
+ * NOTE(review): metadata that could not be extracted (author, published,
+ * site, language) is presumably an empty string — confirm against the
+ * extractor.
+ */
+export interface FetchResult {
+  /** Original URL requested */
+  url: string;
+  /** Final URL after redirects */
+  finalUrl: string;
+  /** Page title */
+  title: string;
+  /** Article author (if extractable) */
+  author: string;
+  /** Publication date (ISO 8601 if available) */
+  published: string;
+  /** Site name */
+  site: string;
+  /** Content language (BCP 47) */
+  language: string;
+  /** Word count of extracted content */
+  wordCount: number;
+  /** Extracted and formatted content */
+  content: string;
+  /** Output format */
+  format: "markdown" | "html" | "text" | "json";
+  /** Response MIME type */
+  mimeType: string;
+}
+// ─── Options ───────────────────────────────────────────────────
+/**
+ * Options for the smart-fetch engine.
+ * Every field is optional.
+ */
+export interface FetchOptions {
+  /** TLS fingerprint profile (e.g. "chrome_145") */
+  browser?: string;
+  /** OS fingerprint (e.g. "windows") */
+  os?: string;
+  /** Output format */
+  format?: "markdown" | "html" | "text" | "json";
+  /** Maximum characters in output content */
+  maxChars?: number;
+  /** Request timeout in milliseconds */
+  timeoutMs?: number;
+  /** Strip image references from content */
+  removeImages?: boolean;
+  /** Include replies/comments: true, false, or "extractors" */
+  includeReplies?: boolean | "extractors";
+  /** Proxy URL */
+  proxy?: string;
+  /** Additional HTTP headers */
+  headers?: Record<string, string>;
+}
+// ─── Batch Types ───────────────────────────────────────────────
+/**
+ * Result for a single item in a batch fetch.
+ * Discriminated union on `status`: "done" carries `result`, "error" carries
+ * `error`.
+ */
+export type BatchFetchItemResult =
+  | { status: "done"; result: FetchResult }
+  | { status: "error"; error: FetchError };
+/**
+ * Result of a batch fetch operation.
+ * NOTE(review): presumably total === succeeded + failed === items.length —
+ * confirm against the code that builds this object.
+ */
+export interface BatchFetchResult {
+  /** Total URLs requested */
+  total: number;
+  /** Successfully fetched */
+  succeeded: number;
+  /** Failed to fetch */
+  failed: number;
+  /** Per-item results in input order */
+  items: BatchFetchItemResult[];
+}
+// ─── Progress Types ────────────────────────────────────────────
+/**
+ * Status of a single URL in the fetch pipeline.
+ * Carried in FetchProgress.status.
+ */
+export type FetchProgressStatus =
+  | "queued"
+  | "connecting"
+  | "waiting"
+  | "loading"
+  | "processing"
+  | "done"
+  | "error";
+/**
+ * Progress update for a single URL.
+ * Delivered to the FetchExecutionHooks callbacks.
+ */
+export interface FetchProgress {
+  /** URL being fetched */
+  url: string;
+  /** Current pipeline status */
+  status: FetchProgressStatus;
+  /** Progress percentage (0-100) */
+  percent: number;
+  /** Bytes loaded so far */
+  bytesLoaded: number;
+  /** Total bytes expected */
+  bytesTotal: number;
+  /** Current phase label */
+  phase: string;
+  /** Error details (if status is "error") */
+  error?: FetchError;
+}
+// ─── Execution Hooks ───────────────────────────────────────────
+/**
+ * Hooks for observing fetch execution progress.
+ * Both callbacks are optional.
+ */
+export interface FetchExecutionHooks {
+  /** Called with progress updates for a single URL fetch */
+  onProgress?: (progress: FetchProgress) => void;
+  /** Called with full progress snapshot for batch fetches */
+  onUpdate?: (progress: FetchProgress[]) => void;
+}

package/src/index.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * @unipi/web-api — Extension entry
  *
  * Web search, read, and summarize tools with provider-based backend selection.
- * Provides agent tools: web-search, web-read, web-llm-summarize
+ * Provides agent tools: web-search, multi-web-content-read, web-llm-summarize
  */
 import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
@@ -15,7 +15,8 @@ import {
 import { registerWebTools, WEB_TOOLS } from "./tools.js";
 import { registerWebCommands, WEB_COMMANDS } from "./commands.js";
 import { webCache } from "./cache.js";
-import { loadConfig } from "./settings.js";
+import { loadConfig, loadSmartFetchSettings } from "./settings.js";
+import { checkDependencies } from "./engine/dependencies.js";
 import "./providers/duckduckgo.js";
 import "./providers/jina-search.js";
 import "./providers/jina-reader.js";
@@ -79,6 +80,7 @@ export default function (pi: ExtensionAPI) {
           showByDefault: true,
           stats: [
             { id: "providers", label: "Enabled Providers", show: true },
+            { id: "smartFetch", label: "Smart-Fetch", show: true },
             { id: "cacheEntries", label: "Cache Entries", show: true },
             { id: "cacheSize", label: "Cache Size", show: true },
             { id: "expired", label: "Expired Entries", show: true },
@@ -91,8 +93,13 @@ export default function (pi: ExtensionAPI) {
             (p) => p.enabled
           ).length;
+          // Check smart-fetch engine availability
+          const deps = await checkDependencies();
+          const smartFetchStatus = deps.available ? "✓ Ready" : `Missing: ${deps.missing.join(", ")}`;
           return {
             providers: { value: String(enabledCount) },
+            smartFetch: { value: smartFetchStatus },
             cacheEntries: { value: String(stats.totalEntries) },
             cacheSize: { value: `${(stats.totalSizeBytes / 1024).toFixed(1)} KB` },
             expired: { value: String(stats.expiredEntries) },

package/src/providers/base.ts CHANGED Viewed

@@ -7,7 +7,15 @@
 /** Supported capabilities for web providers */
 export type WebCapability = "search" | "read" | "summarize";
-/** Ranking structure for provider selection */
+/**
+ * Ranking for capability selection.
+ * Lower number = simpler/cheaper provider (preferred for auto-selection).
+ * 0 means provider doesn't support that capability.
+ *
+ * Note: For "read" capability, rank 0 is reserved for the smart-fetch engine.
+ * It is not a registered provider, but the default when source=0 or omitted.
+ * Registered read providers use ranks 1+ (Jina Reader=1, Firecrawl=2, Perplexity=3).
+ */
 export interface ProviderRanking {
   search: number;
   read: number;