@sisu-ai/tool-web-fetch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # @sisu-ai/tool-web-fetch
2
+
3
+ Fetch a web page by URL and return text, HTML, or JSON for LLM consumption.
4
+
5
+ ## Install
6
+ ```bash
7
+ npm i @sisu-ai/tool-web-fetch
8
+ ```
9
+
10
+ ## Environment / Flags
11
+ - `WEB_FETCH_USER_AGENT` or `HTTP_USER_AGENT` (flag: `--web-fetch-user-agent`)
12
+ - `WEB_FETCH_MAX_BYTES` (flag: `--web-fetch-max-bytes`) — default 500kB
13
+
14
+ ## Tool
15
+ - Name: `webFetch`
16
+ - Args: `{ url: string; format?: 'text'|'html'|'json'; maxBytes?: number }`
17
+ - Returns: `{ url, finalUrl?, status, contentType?, title?, text?, html?, json? }`
18
+
19
+ ## Behavior
20
+ - Follows redirects and reads up to `maxBytes` to avoid huge pages.
21
+ - If `format: 'text'` (default) and page is HTML, strips tags (removes script/style) and decodes basic entities; includes `title`.
22
+ - If `format: 'html'`, returns raw HTML and `title`.
23
+ - If server returns JSON or `format: 'json'`, parses into `json`.
24
+ - Non-OK responses return status and a short text body snippet for debugging.
25
+
26
+ ## Notes
27
+ - This is a minimal fetcher to empower summarization / extraction workflows. For deeper crawling, add queueing, URL normalization, and robots.txt handling in upstream middleware.
28
+
@@ -0,0 +1,19 @@
1
import type { Tool } from '@sisu-ai/core';
/** Output formats the webFetch tool can produce. */
export type WebFetchFormat = 'text' | 'html' | 'json';
/** Arguments accepted by the webFetch tool. */
export interface WebFetchArgs {
    /** Absolute URL of the page to fetch. */
    url: string;
    /** Desired output format; defaults to 'text' (tag-stripped extraction for HTML). */
    format?: WebFetchFormat;
    /** Maximum number of response-body bytes to read (default 500kB). */
    maxBytes?: number;
}
/** Result returned by the webFetch tool handler. */
export interface WebFetchResult {
    /** The URL that was requested. */
    url: string;
    /** Final URL after redirects, when reported by fetch. */
    finalUrl?: string;
    /** HTTP status code of the response. */
    status: number;
    /** Response Content-Type header value, if present. */
    contentType?: string;
    /** Decoded, trimmed <title> of an HTML page, when available. */
    title?: string;
    /** Plain-text body (extracted from HTML, or raw for text/* responses). */
    text?: string;
    /** Raw HTML body, present when format 'html' is requested. */
    html?: string;
    /** Parsed JSON body, present when JSON is requested or served. */
    json?: unknown;
}
/** Tool definition: fetch a web page by URL and return text, HTML, or JSON. */
export declare const webFetch: Tool<WebFetchArgs>;
export default webFetch;
package/dist/index.js ADDED
@@ -0,0 +1,121 @@
1
+ import { firstConfigValue } from '@sisu-ai/core';
2
+ import { z } from 'zod';
3
/**
 * webFetch tool: fetch a web page by URL and return text, HTML, or JSON.
 * Defaults to text extraction for HTML. Reads at most `maxBytes` of the body
 * (arg > WEB_FETCH_MAX_BYTES env > 500kB default) to avoid huge pages.
 */
export const webFetch = {
    name: 'webFetch',
    description: 'Fetch a web page by URL and return text, HTML, or JSON. Defaults to text extraction for HTML.',
    schema: z.object({
        url: z.string().url(),
        format: z.enum(['text', 'html', 'json']).optional(),
        maxBytes: z.number().int().positive().max(5_000_000).optional(),
    }),
    handler: async ({ url, format = 'text', maxBytes }, _ctx) => {
        const ua = firstConfigValue(['WEB_FETCH_USER_AGENT', 'HTTP_USER_AGENT'])
            || 'SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)';
        // Resolve the byte cap: explicit argument > env var > 500kB default.
        // Guard against a non-numeric env value, which would otherwise produce
        // NaN and silently disable the cap in readWithCap.
        const envCap = Number(firstConfigValue(['WEB_FETCH_MAX_BYTES']));
        const fallback = Number.isFinite(envCap) && envCap > 0 ? envCap : 500_000;
        const cap = Number.isFinite(maxBytes) && maxBytes > 0 ? maxBytes : fallback;
        const res = await fetch(url, {
            redirect: 'follow',
            headers: { 'User-Agent': ua, 'Accept': '*/*' },
        });
        const contentType = res.headers?.get?.('content-type') || '';
        // Stream read with cap to avoid massive bodies
        const buf = await readWithCap(res, cap);
        const finalUrl = res.url || undefined;
        if (!res.ok) {
            // Non-OK responses return the status plus a short body snippet for debugging.
            return { url, finalUrl, status: res.status, contentType, text: truncateText(buf.toString('utf8'), 2_000) };
        }
        // Handle by requested format and content-type
        const ctLower = contentType.toLowerCase();
        if (format === 'json' || ctLower.includes('application/json')) {
            try {
                const json = JSON.parse(buf.toString('utf8'));
                return { url, finalUrl, status: res.status, contentType, json };
            }
            catch {
                // Body was not valid JSON — fall through to text handling.
            }
        }
        if (format === 'html' || ctLower.includes('text/html') || ctLower.includes('application/xhtml')) {
            const html = buf.toString('utf8');
            if (format === 'html') {
                return { url, finalUrl, status: res.status, contentType, html, title: extractTitle(html) };
            }
            // format === 'text': strip tags and decode entities for LLM consumption.
            const text = htmlToText(html);
            return { url, finalUrl, status: res.status, contentType, text, title: extractTitle(html) };
        }
        // Fallback: treat as text/*
        const text = buf.toString('utf8');
        return { url, finalUrl, status: res.status, contentType, text: truncateText(text) };
    },
};
export default webFetch;
53
/**
 * Read a fetch Response body, stopping once `cap` bytes have been collected.
 *
 * Fixes two defects in the previous version:
 * - When a chunk pushed `received` past the cap, the chunk was dropped but its
 *   size had already been added to `received`, so the output Uint8Array was
 *   oversized and the returned Buffer was padded with trailing zero bytes.
 *   Now the overflowing chunk is sliced so exactly `cap` bytes are returned.
 * - The reader is cancelled after an early stop to release the connection.
 *
 * A non-finite or non-positive `cap` is treated as "no limit".
 */
async function readWithCap(res, cap) {
    // If body is not a stream (older fetch mocks), try res.text()
    const anyRes = res;
    if (!anyRes.body || typeof anyRes.body.getReader !== 'function') {
        const t = typeof anyRes.text === 'function' ? await anyRes.text() : '';
        return Buffer.from(String(t), 'utf8');
    }
    const limit = Number.isFinite(cap) && cap > 0 ? cap : Infinity;
    const reader = anyRes.body.getReader();
    const chunks = [];
    let received = 0;
    while (received < limit) {
        const { done, value } = await reader.read();
        if (done)
            break;
        if (!value)
            continue;
        const remaining = limit - received;
        // Slice the final chunk so we never collect more than `limit` bytes.
        const part = value.byteLength > remaining ? value.subarray(0, remaining) : value;
        chunks.push(part);
        received += part.byteLength;
    }
    // Best-effort cancel: releases the connection when we stopped early; a
    // cancel on an already-finished stream is a harmless no-op.
    try {
        await reader.cancel();
    }
    catch {
        // ignore — stream may already be closed
    }
    const out = new Uint8Array(received);
    let offset = 0;
    for (const c of chunks) {
        out.set(c, offset);
        offset += c.byteLength;
    }
    return Buffer.from(out);
}
82
/**
 * Pull the document title out of raw HTML.
 * Returns the entity-decoded, trimmed contents of the first <title> element,
 * or undefined when no title tag is present.
 */
function extractTitle(html) {
    const match = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    return match ? decodeHTMLEntities(match[1]).trim() : undefined;
}
88
/**
 * Convert an HTML document to readable plain text: drops script/style blocks
 * and comments, turns <br> and closing block tags into newlines, strips the
 * remaining tags, decodes common entities, and collapses whitespace.
 * The result is truncated via truncateText.
 */
function htmlToText(html) {
    // Remove script/style robustly: allow attributes and sloppy closing tags like </script foo="bar"> or </script >
    let s = html
        .replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, ' ')
        .replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, ' ');
    // Remove HTML comments, including non-standard end '--!>' browsers tolerate
    s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g, ' ');
    // Replace <br> with newlines. Previous pattern only matched 'br'/'BR';
    // the 'i' flag also covers mixed-case tags like <Br> and <bR>.
    s = s.replace(/<br\s*\/?>/gi, '\n');
    // Closing block-level tags also imply a line break.
    s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi, '\n');
    // Strip remaining tags
    s = s.replace(/<[^>]+>/g, ' ');
    // Decode entities
    s = decodeHTMLEntities(s);
    // Collapse whitespace
    s = s.replace(/\s+/g, ' ').trim();
    return truncateText(s);
}
106
/**
 * Clamp a string to at most `max` characters, appending a single ellipsis
 * character when content was dropped. Strings within the limit pass through.
 */
function truncateText(text, max = 200_000) {
    if (text.length <= max)
        return text;
    return `${text.slice(0, max)}…`;
}
109
// Minimal HTML entity decoder: the common named entities, plus numeric
// character references in decimal (&#65;) and hex (&#x41;) form.
// Unknown named entities are left untouched, as before.
function decodeHTMLEntities(s) {
    const named = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&apos;': "'",
        '&nbsp;': ' ',
    };
    return s.replace(/&(?:[a-z]+|#\d+|#x[0-9a-f]+);/gi, (m) => {
        if (Object.prototype.hasOwnProperty.call(named, m))
            return named[m];
        const hex = /^&#x([0-9a-f]+);$/i.exec(m);
        const dec = /^&#(\d+);$/.exec(m);
        let code;
        if (hex)
            code = parseInt(hex[1], 16);
        else if (dec)
            code = parseInt(dec[1], 10);
        else
            return m; // unrecognized named entity: leave as-is
        try {
            return String.fromCodePoint(code);
        }
        catch {
            return m; // out-of-range code point: leave as-is
        }
    });
}
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "@sisu-ai/tool-web-fetch",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "files": [
8
+ "dist"
9
+ ],
10
+ "scripts": {
11
+ "build": "tsc -b"
12
+ },
13
+ "dependencies": {
14
+ "zod": "^3.23.8"
15
+ },
16
+ "peerDependencies": {
17
+ "@sisu-ai/core": "0.3.0"
18
+ },
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "https://github.com/finger-gun/sisu",
22
+ "directory": "packages/tools/web-fetch"
23
+ },
24
+ "homepage": "https://github.com/finger-gun/sisu#readme",
25
+ "bugs": {
26
+ "url": "https://github.com/finger-gun/sisu/issues"
27
+ }
28
+ }