npm - @pi-unipi/web-api - Versions diffs - 0.1.13 → 0.1.15 - Mend

@pi-unipi/web-api 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +76 -15
package/package.json +9 -2
package/skills/web/SKILL.md +54 -11
package/src/engine/constants.ts +36 -0
package/src/engine/dependencies.ts +145 -0
package/src/engine/dom.ts +266 -0
package/src/engine/extract.ts +642 -0
package/src/engine/format.ts +306 -0
package/src/engine/profiles.ts +102 -0
package/src/engine/types.ts +169 -0
package/src/index.ts +9 -2
package/src/providers/base.ts +9 -1
package/src/settings.ts +70 -4
package/src/tools.ts +281 -24
package/src/tui/progress.ts +168 -0
package/src/tui/result.ts +173 -0
package/src/tui/settings-dialog.ts +168 -0

package/src/settings.ts CHANGED Viewed

@@ -27,12 +27,42 @@ export interface CacheSettings {
   ttlMs: number;
 }
+/**
+ * Smart-fetch default settings.
+ *
+ * Full shape of the smart-fetch options that have a built-in default
+ * (see DEFAULT_SMART_FETCH_SETTINGS). The saved config only stores a partial
+ * override of this shape under WebApiConfig.smartFetch.
+ */
+export interface SmartFetchSettings {
+  /** TLS fingerprint browser profile (built-in default: "chrome_145") */
+  browser: string;
+  /** OS fingerprint (built-in default: "windows") */
+  os: string;
+  /** Maximum content characters (built-in default: 50000) */
+  maxChars: number;
+  /** Request timeout in ms (built-in default: 15000) */
+  timeoutMs: number;
+  /** Batch concurrency (built-in default: 8) */
+  batchConcurrency: number;
+  /** Strip image references (built-in default: false) */
+  removeImages: boolean;
+  /**
+   * Include replies/comments (built-in default: "extractors").
+   * NOTE(review): the exact meaning of "extractors" is defined by the fetch
+   * engine, not in this file — confirm before documenting it further.
+   */
+  includeReplies: boolean | "extractors";
+}
 /** Config storage structure */
 export interface WebApiConfig {
   providers: Record<string, ProviderSettings>;
   cache: CacheSettings;
+  /** Optional smart-fetch overrides; loadSmartFetchSettings() fills omitted fields from the built-in defaults. */
+  smartFetch?: Partial<SmartFetchSettings>;
 }
+/**
+ * Default smart-fetch settings.
+ *
+ * Base values that loadSmartFetchSettings() spreads the saved
+ * `smartFetch` overrides on top of.
+ */
+const DEFAULT_SMART_FETCH_SETTINGS: SmartFetchSettings = {
+  browser: "chrome_145",
+  os: "windows",
+  maxChars: 50000,
+  timeoutMs: 15000, // 15 seconds
+  batchConcurrency: 8,
+  removeImages: false,
+  includeReplies: "extractors", // NOTE(review): semantics live in the fetch engine — confirm
+};
 /** Default configuration */
 const DEFAULT_CONFIG: WebApiConfig = {
   providers: {
@@ -49,6 +79,7 @@ const DEFAULT_CONFIG: WebApiConfig = {
     enabled: true,
     ttlMs: 3600000, // 1 hour
   },
+  smartFetch: {},
 };
 /**
@@ -94,8 +125,8 @@ export function loadAuth(): WebApiAuth {
       const content = fs.readFileSync(authPath, "utf-8");
       return JSON.parse(content);
     }
-  } catch (error) {
-    console.error("[web-api] Failed to load auth:", error);
+  } catch {
+    // Silently ignore — auth load failure returns empty.
   }
   return {};
 }
@@ -133,8 +164,8 @@ export function loadConfig(): WebApiConfig {
         },
       };
     }
-  } catch (error) {
-    console.error("[web-api] Failed to load config:", error);
+  } catch {
+    // Silently ignore — config load failure falls back to defaults.
   }
   return DEFAULT_CONFIG;
 }
@@ -261,3 +292,38 @@ export function validateApiKeyFormat(providerId: string, apiKey: string): boolea
       return apiKey.length >= 8;
   }
 }
+/**
+ * Resolve the effective smart-fetch settings.
+ * The built-in defaults form the base; any values stored under the saved
+ * config's `smartFetch` key are layered on top of them.
+ * @returns Fully populated smart-fetch settings
+ */
+export function loadSmartFetchSettings(): SmartFetchSettings {
+  const { smartFetch: savedOverrides } = loadConfig();
+  const effective: SmartFetchSettings = {
+    ...DEFAULT_SMART_FETCH_SETTINGS,
+    ...savedOverrides,
+  };
+  return effective;
+}
+/**
+ * Save smart-fetch settings.
+ *
+ * Merges the given values over the currently saved smart-fetch overrides and
+ * persists the result. A fresh config object is built instead of assigning to
+ * the one returned by loadConfig(): loadConfig() can return the shared
+ * DEFAULT_CONFIG object, and writing to it would leak the saved values into
+ * the in-memory defaults for the rest of the process.
+ * @param settings - Partial settings to save
+ */
+export function saveSmartFetchSettings(settings: Partial<SmartFetchSettings>): void {
+  const config = loadConfig();
+  const updated: WebApiConfig = {
+    ...config,
+    smartFetch: {
+      ...config.smartFetch,
+      ...settings,
+    },
+  };
+  saveConfig(updated);
+}
+/**
+ * Reset smart-fetch settings to defaults.
+ *
+ * Persists the config with an empty `smartFetch` override object, so that
+ * loadSmartFetchSettings() yields the built-in defaults again. The config
+ * returned by loadConfig() is copied rather than mutated because it can be
+ * the shared DEFAULT_CONFIG object.
+ */
+export function resetSmartFetchSettings(): void {
+  const config = loadConfig();
+  const updated: WebApiConfig = { ...config, smartFetch: {} };
+  saveConfig(updated);
+}

package/src/tools.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * @unipi/web-api — Agent tools registration
  *
- * Registers web-search, web-read, and web-llm-summarize tools.
+ * Registers web-search, multi-web-content-read, and web-llm-summarize tools.
  * Implements smart provider selection based on ranking.
  */
@@ -19,12 +19,20 @@ import {
   getApiKey,
   isProviderEnabled,
   loadConfig,
+  loadSmartFetchSettings,
 } from "./settings.js";
+import { webCache } from "./cache.js";
+import {
+  defuddleFetch,
+  defuddleFetchMultiple,
+} from "./engine/extract.js";
+import type { FetchOptions, FetchResult, BatchFetchResult } from "./engine/types.js";
+import { formatSingleResult, formatBatchResult, formatErrorResult } from "./engine/format.js";
 /** Tool names */
 export const WEB_TOOLS = {
   SEARCH: "web_search",
-  READ: "web_read",
+  READ: "multi_web_content_read",
   SUMMARIZE: "web_llm_summarize",
 } as const;
@@ -113,9 +121,9 @@ async function executeSearch(
 }
 /**
- * Execute web read.
+ * Execute web read via provider.
  */
-async function executeRead(
+async function executeProviderRead(
   url: string,
   sourceRank?: number
 ): Promise<ReadResult> {
@@ -151,6 +159,85 @@ async function executeSummarize(
   return provider.summarize(url, prompt, config);
 }
+/**
+ * Generate cache key for smart-fetch results.
+ *
+ * The key is the URL followed by every scalar option that is forwarded to the
+ * fetch engine and can change what comes back (browser, os, format, maxChars,
+ * removeImages, includeReplies, proxy). Leaving any of these out would let two
+ * different requests share one cache entry.
+ * NOTE(review): `headers` is not part of the key because its shape is not
+ * visible here; the tool handler in this file never passes it.
+ *
+ * @param url - URL being fetched
+ * @param options - Fetch options (raw or already resolved)
+ * @returns Colon-joined cache key
+ */
+function generateSmartFetchKey(
+  url: string,
+  options: Partial<FetchOptions>
+): string {
+  const parts = [
+    url,
+    options.browser || "",
+    options.os || "",
+    options.format || "",
+    String(options.maxChars || ""),
+    // `??` keeps an explicit `false` distinguishable from "not set".
+    String(options.removeImages ?? ""),
+    String(options.includeReplies ?? ""),
+    options.proxy || "",
+  ];
+  return parts.join(":");
+}
+/**
+ * Execute smart-fetch read (single URL).
+ *
+ * Resolves the caller's options against the saved smart-fetch settings,
+ * returns a cached result for that exact resolved request when one exists,
+ * and otherwise fetches through the engine and caches the result.
+ *
+ * The cache key is derived from the resolved options rather than the raw
+ * ones, so a call that omits an option and a call that passes the default
+ * explicitly share one entry, and changing a saved default does not keep
+ * serving content fetched under the old default.
+ *
+ * @param url - URL to read
+ * @param options - Per-call overrides; omitted fields use saved settings
+ * @returns Fetch result from cache or from the engine
+ */
+async function executeSmartFetchRead(
+  url: string,
+  options: Partial<FetchOptions> = {}
+): Promise<FetchResult> {
+  // Load defaults
+  const defaults = loadSmartFetchSettings();
+  // `||` is kept for strings and numbers so empty strings and 0 fall back to
+  // the defaults; `??` is used for the two fields where `false` is meaningful.
+  const fetchOptions: FetchOptions = {
+    browser: options.browser || defaults.browser,
+    os: options.os || defaults.os,
+    format: options.format || "markdown",
+    maxChars: options.maxChars || defaults.maxChars,
+    timeoutMs: options.timeoutMs || defaults.timeoutMs,
+    removeImages: options.removeImages ?? defaults.removeImages,
+    includeReplies: options.includeReplies ?? defaults.includeReplies,
+    proxy: options.proxy,
+    headers: options.headers,
+  };
+  // Check cache using the fully resolved request
+  const cacheKey = generateSmartFetchKey(url, fetchOptions);
+  const cached = webCache.get(cacheKey, "smart-fetch");
+  if (cached) {
+    return cached as FetchResult;
+  }
+  // Execute fetch
+  const result = await defuddleFetch(url, fetchOptions);
+  // Cache result
+  webCache.set(cacheKey, "smart-fetch", result);
+  return result;
+}
+/**
+ * Run a smart-fetch read over several URLs in one batch.
+ *
+ * Per-call options are resolved against the saved smart-fetch settings
+ * (including the batch concurrency) and the whole list is handed to the
+ * engine's multi-URL fetch. No caching is done on this path.
+ *
+ * @param urls - URLs to read
+ * @param options - Per-call overrides; omitted fields use saved settings
+ * @returns Batch result as returned by the engine
+ */
+async function executeSmartFetchBatch(
+  urls: string[],
+  options: Partial<FetchOptions> & { batchConcurrency?: number } = {}
+): Promise<BatchFetchResult> {
+  const saved = loadSmartFetchSettings();
+  const {
+    browser,
+    os,
+    format,
+    maxChars,
+    timeoutMs,
+    removeImages,
+    includeReplies,
+    proxy,
+    headers,
+    batchConcurrency,
+  } = options;
+  const resolved: FetchOptions & { batchConcurrency?: number } = {
+    browser: browser || saved.browser,
+    os: os || saved.os,
+    format: format || "markdown",
+    maxChars: maxChars || saved.maxChars,
+    timeoutMs: timeoutMs || saved.timeoutMs,
+    removeImages: removeImages ?? saved.removeImages,
+    includeReplies: includeReplies ?? saved.includeReplies,
+    proxy,
+    headers,
+    batchConcurrency: batchConcurrency || saved.batchConcurrency,
+  };
+  return defuddleFetchMultiple(urls, resolved);
+}
 /**
  * Register web tools with pi.
  */
@@ -221,43 +308,213 @@ export function registerWebTools(pi: ExtensionAPI): void {
     },
   });
-  // --- web_read tool ---
+  // --- multi_web_content_read tool ---
   pi.registerTool({
     name: WEB_TOOLS.READ,
-    label: "Web Read",
+    label: "Multi Web Content Read",
     description:
-      "Read and extract content from a URL. " +
-      "Extracts main content, strips navigation/ads. Returns markdown.",
-    promptSnippet: "Read content from a URL.",
+      "Read and extract content from URLs using the smart-fetch engine (default) or provider fallbacks. " +
+      "Supports single URL or batch URLs. " +
+      "Returns clean markdown with metadata (title, author, site, word count).",
+    promptSnippet: "Read content from one or more URLs.",
     promptGuidelines: [
-      "Use web_read to extract content from a web page.",
-      "Returns main content as markdown.",
-      "Lower source = simpler providers (Jina Reader).",
-      "Higher source = more capable providers (Firecrawl, Perplexity).",
+      "Use multi_web_content_read to extract content from web pages.",
+      "Pass a single URL string or an array of URLs for batch reading.",
+      "Default source (0 or omitted) uses the local smart-fetch engine — free, no API key.",
+      "source 1-3 uses provider fallbacks: Jina Reader, Firecrawl, Perplexity.",
+      "Batch mode: pass an array of URLs, returns results for each.",
     ],
     parameters: Type.Object({
-      url: Type.String({ description: "URL to read" }),
+      url: Type.Union([
+        Type.String({ description: "Single URL to read" }),
+        Type.Array(Type.String(), { description: "Array of URLs to read in batch" }),
+      ], { description: "URL or array of URLs to read" }),
       source: Type.Optional(
         Type.Number({
           description:
-            "Provider selection (1=Jina Reader, 2=Firecrawl, 3=Perplexity). " +
-            "Omit for auto-selection.",
-          minimum: 1,
+            "Provider selection (0=smart-fetch engine, 1=Jina Reader, 2=Firecrawl, 3=Perplexity). " +
+            "Default is 0 (smart-fetch).",
+          minimum: 0,
           maximum: 3,
         })
       ),
+      browser: Type.Optional(
+        Type.String({
+          description: "TLS fingerprint browser profile (e.g., chrome_145). Default: chrome_145.",
+        })
+      ),
+      os: Type.Optional(
+        Type.String({
+          description: "OS fingerprint (windows, macos, linux). Default: windows.",
+        })
+      ),
+      format: Type.Optional(
+        Type.Union([
+          Type.Literal("markdown"),
+          Type.Literal("html"),
+          Type.Literal("text"),
+          Type.Literal("json"),
+        ], { description: "Output format. Default: markdown." })
+      ),
+      maxChars: Type.Optional(
+        Type.Number({
+          description: "Maximum characters in output. Default: 50000.",
+        })
+      ),
+      timeoutMs: Type.Optional(
+        Type.Number({
+          description: "Request timeout in milliseconds. Default: 15000.",
+        })
+      ),
+      removeImages: Type.Optional(
+        Type.Boolean({
+          description: "Strip image references from content. Default: false.",
+        })
+      ),
+      includeReplies: Type.Optional(
+        Type.Union([
+          Type.Boolean(),
+          Type.Literal("extractors"),
+        ], { description: "Include replies/comments. Default: extractors." })
+      ),
+      proxy: Type.Optional(
+        Type.String({
+          description: "Proxy URL for requests.",
+        })
+      ),
+      batchConcurrency: Type.Optional(
+        Type.Number({
+          description: "Concurrent requests for batch mode. Default: 8.",
+        })
+      ),
+      verbose: Type.Optional(
+        Type.Boolean({
+          description: "Include metadata header in output. Default: true.",
+        })
+      ),
     }),
     async execute(_toolCallId, params, _signal, _onUpdate, _ctx) {
+      const source = params.source ?? 0;
+      const verbose = params.verbose ?? true;
       try {
-        const result = await executeRead(params.url, params.source);
+        // Single URL
+        if (typeof params.url === "string") {
+          // Provider fallback
+          if (source >= 1) {
+            const result = await executeProviderRead(params.url, source);
+            return {
+              content: [
+                {
+                  type: "text",
+                  text: `Content from ${result.url}:\n\n${result.content}`,
+                },
+              ],
+              details: {},
+            };
+          }
-        return {
-          content: [
-            {
-              type: "text",
-              text: `Content from ${result.url}:\n\n${result.content}`,
+          // Smart-fetch engine
+          const result = await executeSmartFetchRead(params.url, {
+            browser: params.browser,
+            os: params.os,
+            format: params.format as FetchOptions["format"],
+            maxChars: params.maxChars,
+            timeoutMs: params.timeoutMs,
+            removeImages: params.removeImages,
+            includeReplies: params.includeReplies as FetchOptions["includeReplies"],
+            proxy: params.proxy,
+          });
+          return {
+            content: [
+              {
+                type: "text",
+                text: formatSingleResult(result, verbose),
+              },
+            ],
+            details: {
+              url: result.url,
+              finalUrl: result.finalUrl,
+              title: result.title,
+              wordCount: result.wordCount,
             },
-          ],
+          };
+        }
+        // Batch URLs
+        if (Array.isArray(params.url)) {
+          if (params.url.length === 0) {
+            return {
+              content: [{ type: "text", text: "No URLs provided." }],
+              details: {},
+            };
+          }
+          // Provider fallback for batch (fetch each individually)
+          if (source >= 1) {
+            const results = await Promise.all(
+              params.url.map(async (url) => {
+                try {
+                  const result = await executeProviderRead(url, source);
+                  return { url, status: "done", content: result.content };
+                } catch (error) {
+                  return {
+                    url,
+                    status: "error",
+                    error: error instanceof Error ? error.message : String(error),
+                  };
+                }
+              })
+            );
+            const text = results
+              .map((r, i) => {
+                if (r.status === "done") {
+                  return `[${i + 1}] ${r.url}\n${r.content}`;
+                }
+                return `[${i + 1}] ${r.url}\nError: ${r.error}`;
+              })
+              .join("\n\n---\n\n");
+            return {
+              content: [{ type: "text", text }],
+              details: { total: results.length },
+            };
+          }
+          // Smart-fetch batch
+          const result = await executeSmartFetchBatch(params.url, {
+            browser: params.browser,
+            os: params.os,
+            format: params.format as FetchOptions["format"],
+            maxChars: params.maxChars,
+            timeoutMs: params.timeoutMs,
+            removeImages: params.removeImages,
+            includeReplies: params.includeReplies as FetchOptions["includeReplies"],
+            proxy: params.proxy,
+            batchConcurrency: params.batchConcurrency,
+          });
+          return {
+            content: [
+              {
+                type: "text",
+                text: formatBatchResult(result),
+              },
+            ],
+            details: {
+              total: result.total,
+              succeeded: result.succeeded,
+              failed: result.failed,
+            },
+          };
+        }
+        // Should never reach here
+        return {
+          content: [{ type: "text", text: "Invalid url parameter." }],
+          isError: true,
           details: {},
         };
       } catch (error) {

package/src/tui/progress.ts ADDED Viewed

@@ -0,0 +1,168 @@
+/**
+ * @unipi/web-api — TUI Progress Renderer
+ *
+ * Renders batch fetch progress for TUI display.
+ */
+import type { FetchProgress, FetchProgressStatus } from "../engine/types.js";
+/** Spinner frames for animation (ten frames, cycled by getSpinnerFrame) */
+const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
+/**
+ * Status glyphs.
+ *
+ * One glyph per progress status. The in-flight statuses point at the first
+ * spinner frame only as a static fallback: renderProgressLine() replaces it
+ * with the frame for the current animation index.
+ */
+const STATUS_GLYPHS: Record<FetchProgressStatus, string> = {
+  queued: "○",
+  connecting: SPINNER_FRAMES[0],
+  waiting: SPINNER_FRAMES[0],
+  loading: SPINNER_FRAMES[0],
+  processing: SPINNER_FRAMES[0],
+  done: "✓",
+  error: "✗",
+};
+/**
+ * Get a spinner frame for the given index.
+ * Cycles through spinner frames for animation.
+ *
+ * The index is truncated to an integer and wrapped into the frame range, so
+ * negative and fractional values still yield a frame; a non-finite index
+ * (NaN, ±Infinity) falls back to the first frame instead of `undefined`.
+ *
+ * @param index - Animation frame index
+ * @returns Spinner character
+ */
+export function getSpinnerFrame(index: number): string {
+  const frameCount = SPINNER_FRAMES.length;
+  const step = Number.isFinite(index) ? Math.trunc(index) : 0;
+  // `%` keeps the sign of the dividend, so wrap twice to land in [0, frameCount).
+  return SPINNER_FRAMES[((step % frameCount) + frameCount) % frameCount];
+}
+/**
+ * Render a progress bar.
+ *
+ * The percentage is clamped to 0-100 (NaN counts as 0) and the width is
+ * floored to a non-negative whole number of cells, so out-of-range input can
+ * no longer make String.prototype.repeat throw a RangeError.
+ *
+ * @param percent - Progress percentage (0-100)
+ * @param width - Bar width in characters
+ * @returns Progress bar string
+ */
+export function renderProgressBar(percent: number, width: number = 10): string {
+  const cells = Number.isFinite(width) ? Math.max(0, Math.floor(width)) : 0;
+  const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
+  const filled = Math.round((clamped / 100) * cells);
+  return "█".repeat(filled) + "░".repeat(cells - filled);
+}
+/**
+ * Truncate a URL for display.
+ *
+ * URLs that already fit are returned unchanged. Longer ones are shortened to
+ * host + path/query (scheme dropped) with a trailing "…" where text was cut;
+ * strings that do not parse as a URL are cut from the left side only.
+ * A maxLength below 1 (or NaN) yields an empty string: without that guard
+ * `slice(0, maxLength - 1)` would count from the end and return almost the
+ * whole URL.
+ *
+ * @param url - URL to truncate
+ * @param maxLength - Maximum length
+ * @returns Truncated URL
+ */
+function truncateUrl(url: string, maxLength: number): string {
+  if (url.length <= maxLength) {
+    return url;
+  }
+  // No room for even the ellipsis.
+  if (!(maxLength >= 1)) {
+    return "";
+  }
+  // Try to keep the domain
+  try {
+    const parsed = new URL(url);
+    const domain = parsed.host;
+    const path = parsed.pathname + parsed.search;
+    // Host alone (nearly) fills the budget: fall back to a plain prefix cut.
+    if (domain.length + 3 >= maxLength) {
+      return url.slice(0, maxLength - 1) + "…";
+    }
+    const remaining = maxLength - domain.length - 3;
+    if (path.length <= remaining) {
+      return domain + path;
+    }
+    return domain + path.slice(0, remaining - 1) + "…";
+  } catch {
+    // Not a parseable URL: treat it as an opaque string.
+    return url.slice(0, maxLength - 1) + "…";
+  }
+}
+/**
+ * Render a single progress item line.
+ *
+ * Layout: glyph, URL column, status text padded to 12 characters, and an
+ * 8-cell progress bar. The URL column is `width - 30` characters, capped at
+ * 40 and never smaller than 1, so a narrow terminal (width <= 30) no longer
+ * passes a zero or negative length to the URL truncation.
+ *
+ * @param progress - Progress object
+ * @param width - Available width
+ * @param spinnerIndex - Animation frame index
+ * @returns Formatted line
+ */
+export function renderProgressLine(
+  progress: FetchProgress,
+  width: number = 80,
+  spinnerIndex: number = 0
+): string {
+  // Status glyph; in-flight statuses animate with the current spinner frame
+  let glyph = STATUS_GLYPHS[progress.status];
+  if (["connecting", "waiting", "loading", "processing"].includes(progress.status)) {
+    glyph = getSpinnerFrame(spinnerIndex);
+  }
+  // Truncate URL (column width clamped to at least one character)
+  const urlMax = Math.max(1, Math.min(40, width - 30));
+  const url = truncateUrl(progress.url, urlMax);
+  // Progress bar
+  const bar = renderProgressBar(progress.percent, 8);
+  // Status text: the phase label wins over the raw status when present
+  const statusText = progress.phase || progress.status;
+  // Format line
+  return `${glyph} ${url.padEnd(urlMax)}  ${statusText.padEnd(12)} [${bar}]`;
+}
+/**
+ * Render batch progress header.
+ *
+ * Reports how many items have finished (succeeded or failed) out of the
+ * total, then the success and error counts separately. Items with status
+ * "done" are the successes; they are not reduced by the error count, since
+ * failed items carry status "error" and were never part of that number.
+ *
+ * NOTE(review): the "batch_web_content_read" label does not match the tool
+ * name registered elsewhere in the package ("multi_web_content_read") —
+ * confirm which one the header is meant to show.
+ *
+ * @param progress - All progress items
+ * @param concurrency - Current concurrency
+ * @returns Header line
+ */
+export function renderBatchProgressHeader(
+  progress: FetchProgress[],
+  concurrency: number
+): string {
+  const total = progress.length;
+  let ok = 0;
+  let failed = 0;
+  for (const item of progress) {
+    if (item.status === "done") {
+      ok += 1;
+    } else if (item.status === "error") {
+      failed += 1;
+    }
+  }
+  const finished = ok + failed;
+  return `batch_web_content_read ${finished}/${total} done · ok ${ok} · err ${failed} · concurrency ${concurrency}`;
+}
+/**
+ * Build the complete multi-line batch progress view.
+ *
+ * Output is the header line, a blank line, one line per progress item for
+ * the first ten items, and a trailing "... and N more" line when the batch
+ * holds more than ten.
+ *
+ * @param progress - All progress items
+ * @param concurrency - Current concurrency
+ * @param width - Available width
+ * @param spinnerIndex - Animation frame index
+ * @returns Formatted string
+ */
+export function renderBatchProgress(
+  progress: FetchProgress[],
+  concurrency: number = 8,
+  width: number = 80,
+  spinnerIndex: number = 0
+): string {
+  const visibleLimit = 10;
+  // Header followed by a spacer line
+  const output: string[] = [renderBatchProgressHeader(progress, concurrency), ""];
+  // Only the leading items are rendered individually
+  progress.slice(0, visibleLimit).forEach((entry) => {
+    output.push(renderProgressLine(entry, width, spinnerIndex));
+  });
+  const hiddenCount = progress.length - visibleLimit;
+  if (hiddenCount > 0) {
+    output.push(`  ... and ${hiddenCount} more`);
+  }
+  return output.join("\n");
+}