npm - @sisu-ai/tool-web-fetch - Versions diffs - 8.0.1 → 8.0.2 - Mend

@sisu-ai/tool-web-fetch 8.0.1 → 8.0.2

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts CHANGED

@@ -1,5 +1,5 @@
-import type { Tool } from '@sisu-ai/core';
-export type WebFetchFormat = 'text' | 'html' | 'json';
+import type { Tool } from "@sisu-ai/core";
+export type WebFetchFormat = "text" | "html" | "json";
 export interface WebFetchArgs {
     url: string;
     format?: WebFetchFormat;

package/dist/index.js CHANGED

@@ -1,32 +1,35 @@
-import { firstConfigValue } from '@sisu-ai/core';
-import { z } from 'zod';
+import { firstConfigValue } from "@sisu-ai/core";
+import { z } from "zod";
 export const webFetch = {
-    name: 'webFetch',
-    description: 'Fetch a web page by URL and return text, HTML, or JSON. Defaults to text extraction for HTML.',
+    name: "webFetch",
+    description: "Fetch a web page by URL and return text, HTML, or JSON. Defaults to text extraction for HTML.",
     schema: z.object({
         url: z.string().url(),
-        format: z.enum(['text', 'html', 'json']).optional(),
+        format: z.enum(["text", "html", "json"]).optional(),
         maxBytes: z.number().int().positive().max(5_000_000).optional(),
         respectRobots: z.boolean().optional(),
     }),
-    handler: async ({ url, format = 'text', maxBytes, respectRobots }, ctx) => {
-        const ua = firstConfigValue(['WEB_FETCH_USER_AGENT', 'HTTP_USER_AGENT'])
-            || 'SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)';
-        const capEnv = firstConfigValue(['WEB_FETCH_MAX_BYTES']);
+    handler: async ({ url, format = "text", maxBytes, respectRobots }, ctx) => {
+        const ua = firstConfigValue(["WEB_FETCH_USER_AGENT", "HTTP_USER_AGENT"]) ||
+            "SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)";
+        const capEnv = firstConfigValue(["WEB_FETCH_MAX_BYTES"]);
         const cap = Number(maxBytes ?? (capEnv !== undefined ? Number(capEnv) : 500_000));
         // robots.txt compliance (default on; disable with arg or env WEB_FETCH_RESPECT_ROBOTS=0)
         const respect = (() => {
-            if (typeof respectRobots === 'boolean')
+            if (typeof respectRobots === "boolean")
                 return respectRobots;
-            const env = firstConfigValue(['WEB_FETCH_RESPECT_ROBOTS', 'RESPECT_ROBOTS']);
+            const env = firstConfigValue([
+                "WEB_FETCH_RESPECT_ROBOTS",
+                "RESPECT_ROBOTS",
+            ]);
             if (env === undefined)
                 return true; // default on
-            return !(env === '0' || /^false$/i.test(env));
+            return !(env === "0" || /^false$/i.test(env));
         })();
         if (respect) {
             const decision = await robotsDecision(url, ua).catch(() => ({ allowed: true }));
             if (!decision.allowed) {
-                ctx?.log?.info?.('[webFetch] blocked by robots.txt', {
+                ctx?.log?.info?.("[webFetch] blocked by robots.txt", {
                     url,
                     userAgent: ua,
                     matchedAgent: decision.matchedAgent,
@@ -36,48 +39,76 @@ export const webFetch = {
                 return {
                     url,
                     status: 403,
-                    contentType: 'text/plain',
-                    text: `Blocked by robots.txt (agent: ${decision.matchedAgent ?? 'unknown'}, rule: ${decision.ruleType ?? 'disallow'} ${decision.rulePattern ?? ''})`.
-                        trim(),
+                    contentType: "text/plain",
+                    text: `Blocked by robots.txt (agent: ${decision.matchedAgent ?? "unknown"}, rule: ${decision.ruleType ?? "disallow"} ${decision.rulePattern ?? ""})`.trim(),
                     robotsBlocked: true,
-                    robotsAgent: ua
+                    robotsAgent: ua,
                 };
             }
         }
         const res = await fetch(url, {
-            redirect: 'follow',
-            headers: { 'User-Agent': ua, 'Accept': '*/*' },
+            redirect: "follow",
+            headers: { "User-Agent": ua, Accept: "*/*" },
         });
-        const contentType = res.headers?.get?.('content-type') || '';
+        const contentType = res.headers?.get?.("content-type") || "";
         // Stream read with cap to avoid massive bodies
         const buf = await readWithCap(res, cap);
         const finalUrl = res.url || undefined;
         if (!res.ok) {
-            return { url, finalUrl, status: res.status, contentType, text: truncateText(buf.toString('utf8')) };
+            return {
+                url,
+                finalUrl,
+                status: res.status,
+                contentType,
+                text: truncateText(buf.toString("utf8")),
+            };
         }
         // Handle by requested format and content-type
         const ctLower = contentType.toLowerCase();
-        if (format === 'json' || ctLower.includes('application/json')) {
+        if (format === "json" || ctLower.includes("application/json")) {
             try {
-                const json = JSON.parse(buf.toString('utf8'));
+                const json = JSON.parse(buf.toString("utf8"));
                 return { url, finalUrl, status: res.status, contentType, json };
             }
             catch {
                 // Fall through to text
             }
         }
-        if (format === 'html' || ctLower.includes('text/html') || ctLower.includes('application/xhtml')) {
-            const html = buf.toString('utf8');
-            if (format === 'html') {
-                return { url, finalUrl, status: res.status, contentType, html, title: extractTitle(html) };
+        if (format === "html" ||
+            ctLower.includes("text/html") ||
+            ctLower.includes("application/xhtml")) {
+            const html = buf.toString("utf8");
+            if (format === "html") {
+                return {
+                    url,
+                    finalUrl,
+                    status: res.status,
+                    contentType,
+                    html,
+                    title: extractTitle(html),
+                };
             }
             // format === 'text'
             const text = htmlToText(html);
-            return { url, finalUrl, status: res.status, contentType, text, title: extractTitle(html), html: undefined };
+            return {
+                url,
+                finalUrl,
+                status: res.status,
+                contentType,
+                text,
+                title: extractTitle(html),
+                html: undefined,
+            };
         }
         // Fallback: treat as text/*
-        const text = buf.toString('utf8');
-        return { url, finalUrl, status: res.status, contentType, text: truncateText(text) };
+        const text = buf.toString("utf8");
+        return {
+            url,
+            finalUrl,
+            status: res.status,
+            contentType,
+            text: truncateText(text),
+        };
     },
 };
 export default webFetch;
@@ -87,10 +118,13 @@ async function robotsDecision(targetUrl, userAgent) {
     const origin = `${u.protocol}//${u.host}`;
     const cache = robotsCache.get(origin);
     const now = Date.now();
-    if (!cache || (now - cache.ts) > 60 * 60 * 1000) { // 1h TTL
+    if (!cache || now - cache.ts > 60 * 60 * 1000) {
+        // 1h TTL
         const robotsUrl = `${origin}/robots.txt`;
         try {
-            const res = await fetch(robotsUrl, { headers: { 'User-Agent': userAgent, 'Accept': 'text/plain' } });
+            const res = await fetch(robotsUrl, {
+                headers: { "User-Agent": userAgent, Accept: "text/plain" },
+            });
             const txt = await res.text();
             const rules = res.ok ? parseRobots(txt) : null;
             robotsCache.set(origin, { ts: now, rules });
@@ -102,7 +136,7 @@ async function robotsDecision(targetUrl, userAgent) {
     const rules = robotsCache.get(origin)?.rules;
     if (!rules)
         return { allowed: true };
-    return evaluateRobotsDetailed(rules, userAgent, u.pathname + (u.search || ''));
+    return evaluateRobotsDetailed(rules, userAgent, u.pathname + (u.search || ""));
 }
 function parseRobots(text) {
     const lines = text.split(/\r?\n/);
@@ -110,31 +144,31 @@ function parseRobots(text) {
     let current = null;
     for (const raw of lines) {
         const line = raw.trim();
-        if (!line || line.startsWith('#'))
+        if (!line || line.startsWith("#"))
             continue;
         const m = line.match(/^(user-agent|allow|disallow)\s*:\s*(.*)$/i);
         if (!m)
             continue;
         const key = m[1].toLowerCase();
         const val = m[2].trim();
-        if (key === 'user-agent') {
+        if (key === "user-agent") {
             // Start a new group if we already had one and it contains rules
-            if (!current || (current.allows.length + current.disallows.length) > 0) {
+            if (!current || current.allows.length + current.disallows.length > 0) {
                 current = { agents: [], allows: [], disallows: [] };
                 groups.push(current);
             }
             current.agents.push(val.toLowerCase());
         }
-        else if (key === 'allow') {
+        else if (key === "allow") {
             if (!current) {
-                current = { agents: ['*'], allows: [], disallows: [] };
+                current = { agents: ["*"], allows: [], disallows: [] };
                 groups.push(current);
             }
             current.allows.push(val);
         }
-        else if (key === 'disallow') {
+        else if (key === "disallow") {
             if (!current) {
-                current = { agents: ['*'], allows: [], disallows: [] };
+                current = { agents: ["*"], allows: [], disallows: [] };
                 groups.push(current);
             }
             current.disallows.push(val);
@@ -145,18 +179,20 @@ function parseRobots(text) {
 function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
     // Match exact agent token (product) ignoring case, or '*'.
     // Example: 'SisuWebFetch/0.1 (+...)' -> baseAgent 'sisuwebfetch'
-    const baseAgent = (userAgent.split(/[\/\s]/)[0] || '').toLowerCase();
+    const baseAgent = (userAgent.split(/[\s/]/)[0] || "").toLowerCase();
     const agentMatches = (agent) => {
-        if (agent === '*')
+        if (agent === "*")
             return true;
         return agent.toLowerCase() === baseAgent;
     };
     const matching = rules.groups
-        .map(g => ({ g, matchedAgent: g.agents.find(agentMatches) }))
-        .filter(x => !!x.matchedAgent);
+        .map((g) => ({ g, matchedAgent: g.agents.find(agentMatches) }))
+        .filter((x) => !!x.matchedAgent);
     const selected = matching.length
         ? matching
-        : rules.groups.filter(g => g.agents.includes('*')).map(g => ({ g, matchedAgent: '*' }));
+        : rules.groups
+            .filter((g) => g.agents.includes("*"))
+            .map((g) => ({ g, matchedAgent: "*" }));
     if (!selected.length)
         return { allowed: true };
     // longest match wins between allow and disallow
@@ -172,7 +208,7 @@ function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
                 const L = pat.length;
                 if (L > bestLen) {
                     bestLen = L;
-                    bestType = 'allow';
+                    bestType = "allow";
                     bestPat = pat;
                     bestAgent = matchedAgent;
                 }
@@ -185,28 +221,38 @@ function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
                 const L = pat.length;
                 if (L > bestLen) {
                     bestLen = L;
-                    bestType = 'disallow';
+                    bestType = "disallow";
                     bestPat = pat;
                     bestAgent = matchedAgent;
                 }
             }
         }
     }
-    if (bestType === 'disallow')
-        return { allowed: false, matchedAgent: bestAgent, ruleType: 'disallow', rulePattern: bestPat };
-    return { allowed: true, matchedAgent: bestAgent, ruleType: bestType, rulePattern: bestPat };
+    if (bestType === "disallow")
+        return {
+            allowed: false,
+            matchedAgent: bestAgent,
+            ruleType: "disallow",
+            rulePattern: bestPat,
+        };
+    return {
+        allowed: true,
+        matchedAgent: bestAgent,
+        ruleType: bestType,
+        rulePattern: bestPat,
+    };
 }
 function patternMatches(pat, path) {
     // Support '*' wildcard and '$' end anchor; treat path as starting with '/'
     const p = pat.trim();
-    if (p === '')
+    if (p === "")
         return false;
     // Empty disallow means allow all; already handled by return false above
     // Convert to regex
-    const escaped = p.replace(/[.+?^${}()|\[\]\\]/g, r => '\\' + r);
-    let reStr = '^' + escaped.replace(/\*/g, '.*');
-    if (reStr.endsWith('\$')) {
-        reStr = reStr.slice(0, -2) + '$';
+    const escaped = p.replace(/[.+?^${}()|[\]\\]/g, (r) => "\\" + r);
+    let reStr = "^" + escaped.replace(/\*/g, ".*");
+    if (reStr.endsWith("$")) {
+        reStr = reStr.slice(0, -1) + "$";
     }
     const re = new RegExp(reStr);
     return re.test(path);
@@ -214,9 +260,9 @@ function patternMatches(pat, path) {
 async function readWithCap(res, cap) {
     // If body is not a stream (older fetch mocks), try res.text()
     const anyRes = res;
-    if (!anyRes.body || typeof anyRes.body.getReader !== 'function') {
-        const t = typeof anyRes.text === 'function' ? await anyRes.text() : '';
-        return Buffer.from(String(t), 'utf8');
+    if (!anyRes.body || typeof anyRes.body.getReader !== "function") {
+        const t = typeof anyRes.text === "function" ? await anyRes.text() : "";
+        return Buffer.from(String(t), "utf8");
     }
     const reader = anyRes.body.getReader();
     const chunks = [];
@@ -249,34 +295,34 @@ function extractTitle(html) {
 function htmlToText(html) {
     // Remove script/style robustly: allow attributes and sloppy closing tags like </script foo="bar"> or </script >
     let s = html
-        .replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, ' ')
-        .replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, ' ');
+        .replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, " ")
+        .replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, " ");
     // Remove HTML comments, including non-standard end '--!>' browsers tolerate
-    s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g, ' ');
+    s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g, " ");
     // Replace <br> and block tags with newlines
-    s = s.replace(/<(br|BR)\s*\/?>(\n)?/g, '\n');
-    s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi, '\n');
+    s = s.replace(/<(br|BR)\s*\/?>(\n)?/g, "\n");
+    s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi, "\n");
     // Strip remaining tags
-    s = s.replace(/<[^>]+>/g, ' ');
+    s = s.replace(/<[^>]+>/g, " ");
     // Decode entities
     s = decodeHTMLEntities(s);
     // Collapse whitespace
-    s = s.replace(/\s+/g, ' ').trim();
+    s = s.replace(/\s+/g, " ").trim();
     return truncateText(s);
 }
 function truncateText(text, max = 200_000) {
-    return text.length > max ? text.slice(0, max) + '…' : text;
+    return text.length > max ? text.slice(0, max) + "…" : text;
 }
 // Minimal HTML entity decoder for common entities
 function decodeHTMLEntities(s) {
     const map = {
-        '&amp;': '&',
-        '&lt;': '<',
-        '&gt;': '>',
-        '&quot;': '"',
-        '&#39;': "'",
-        '&apos;': "'",
-        '&nbsp;': ' ',
+        "&amp;": "&",
+        "&lt;": "<",
+        "&gt;": ">",
+        "&quot;": '"',
+        "&#39;": "'",
+        "&apos;": "'",
+        "&nbsp;": " ",
     };
     return s.replace(/&(amp|lt|gt|quot|#39|apos|nbsp);/g, (m) => map[m] || m);
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@sisu-ai/tool-web-fetch",
-  "version": "8.0.1",
+  "version": "8.0.2",
   "license": "Apache-2.0",
   "type": "module",
   "main": "dist/index.js",