@sisu-ai/tool-web-fetch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # @sisu-ai/tool-web-fetch
2
+
3
+ Fetch a web page by URL and return text, HTML, or JSON for LLM consumption.
4
+
5
+ ## Install
6
+ ```bash
7
+ npm i @sisu-ai/tool-web-fetch
8
+ ```
9
+
10
+ ## Environment / Flags
11
+ - `WEB_FETCH_USER_AGENT` or `HTTP_USER_AGENT` (flag: `--web-fetch-user-agent`)
12
+ - `WEB_FETCH_MAX_BYTES` (flag: `--web-fetch-max-bytes`) — default 500kB
13
+
14
+ ## Tool
15
+ - Name: `webFetch`
16
+ - Args: `{ url: string; format?: 'text'|'html'|'json'; maxBytes?: number }`
17
+ - Returns: `{ url, finalUrl?, status, contentType?, title?, text?, html?, json? }`
18
+
19
+ ## Behavior
20
+ - Follows redirects and reads up to `maxBytes` to avoid huge pages.
21
+ - If `format: 'text'` (default) and page is HTML, strips tags (removes script/style) and decodes basic entities; includes `title`.
22
+ - If `format: 'html'`, returns raw HTML and `title`.
23
+ - If server returns JSON or `format: 'json'`, parses into `json`.
24
+ - Non-OK responses return status and a short text body snippet for debugging.
25
+
26
+ ## Notes
27
+ - This is a minimal fetcher to empower summarization / extraction workflows. For deeper crawling, add queueing, URL normalization, and robots.txt handling in upstream middleware.
28
+
@@ -0,0 +1,19 @@
1
import type { Tool } from '@sisu-ai/core';
/** Output formats the webFetch tool can produce. */
export type WebFetchFormat = 'text' | 'html' | 'json';
/** Arguments accepted by the webFetch tool. */
export interface WebFetchArgs {
    /** Absolute URL of the page to fetch. */
    url: string;
    /** Desired output format; defaults to 'text' (tag-stripped extraction for HTML). */
    format?: WebFetchFormat;
    /** Maximum number of response-body bytes to read (default 500kB). */
    maxBytes?: number;
}
/** Result returned by the webFetch tool handler. */
export interface WebFetchResult {
    /** The URL that was requested. */
    url: string;
    /** Final URL after redirects, when reported by fetch. */
    finalUrl?: string;
    /** HTTP status code of the response. */
    status: number;
    /** Response Content-Type header value, if present. */
    contentType?: string;
    /** Decoded, trimmed <title> of an HTML page, when available. */
    title?: string;
    /** Plain-text body (extracted from HTML, or raw for text/* responses). */
    text?: string;
    /** Raw HTML body, present when format 'html' is requested. */
    html?: string;
    /** Parsed JSON body, present when JSON is requested or served. */
    json?: unknown;
}
/** Tool definition: fetch a web page by URL and return text, HTML, or JSON. */
export declare const webFetch: Tool<WebFetchArgs>;
export default webFetch;
package/dist/index.js ADDED
@@ -0,0 +1,121 @@
1
+ import { firstConfigValue } from '@sisu-ai/core';
2
+ import { z } from 'zod';
3
/**
 * webFetch tool: fetch a web page by URL and return text, HTML, or JSON.
 * Defaults to text extraction for HTML. Reads at most `maxBytes` of the body
 * (arg > WEB_FETCH_MAX_BYTES env > 500kB default) to avoid huge pages.
 */
export const webFetch = {
    name: 'webFetch',
    description: 'Fetch a web page by URL and return text, HTML, or JSON. Defaults to text extraction for HTML.',
    schema: z.object({
        url: z.string().url(),
        format: z.enum(['text', 'html', 'json']).optional(),
        maxBytes: z.number().int().positive().max(5_000_000).optional(),
    }),
    handler: async ({ url, format = 'text', maxBytes }, _ctx) => {
        const ua = firstConfigValue(['WEB_FETCH_USER_AGENT', 'HTTP_USER_AGENT'])
            || 'SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)';
        // Resolve the byte cap: explicit argument > env var > 500kB default.
        // Guard against a non-numeric env value, which would otherwise produce
        // NaN and silently disable the cap in readWithCap.
        const envCap = Number(firstConfigValue(['WEB_FETCH_MAX_BYTES']));
        const fallback = Number.isFinite(envCap) && envCap > 0 ? envCap : 500_000;
        const cap = Number.isFinite(maxBytes) && maxBytes > 0 ? maxBytes : fallback;
        const res = await fetch(url, {
            redirect: 'follow',
            headers: { 'User-Agent': ua, 'Accept': '*/*' },
        });
        const contentType = res.headers?.get?.('content-type') || '';
        // Stream read with cap to avoid massive bodies
        const buf = await readWithCap(res, cap);
        const finalUrl = res.url || undefined;
        if (!res.ok) {
            // Non-OK responses return the status plus a short body snippet for debugging.
            return { url, finalUrl, status: res.status, contentType, text: truncateText(buf.toString('utf8'), 2_000) };
        }
        // Handle by requested format and content-type
        const ctLower = contentType.toLowerCase();
        if (format === 'json' || ctLower.includes('application/json')) {
            try {
                const json = JSON.parse(buf.toString('utf8'));
                return { url, finalUrl, status: res.status, contentType, json };
            }
            catch {
                // Body was not valid JSON — fall through to text handling.
            }
        }
        if (format === 'html' || ctLower.includes('text/html') || ctLower.includes('application/xhtml')) {
            const html = buf.toString('utf8');
            if (format === 'html') {
                return { url, finalUrl, status: res.status, contentType, html, title: extractTitle(html) };
            }
            // format === 'text': strip tags and decode entities for LLM consumption.
            const text = htmlToText(html);
            return { url, finalUrl, status: res.status, contentType, text, title: extractTitle(html) };
        }
        // Fallback: treat as text/*
        const text = buf.toString('utf8');
        return { url, finalUrl, status: res.status, contentType, text: truncateText(text) };
    },
};
export default webFetch;
53
/**
 * Read a fetch Response body, stopping once `cap` bytes have been collected.
 *
 * Fixes two defects in the previous version:
 * - When a chunk pushed `received` past the cap, the chunk was dropped but its
 *   size had already been added to `received`, so the output Uint8Array was
 *   oversized and the returned Buffer was padded with trailing zero bytes.
 *   Now the overflowing chunk is sliced so exactly `cap` bytes are returned.
 * - The reader is cancelled after an early stop to release the connection.
 *
 * A non-finite or non-positive `cap` is treated as "no limit".
 */
async function readWithCap(res, cap) {
    // If body is not a stream (older fetch mocks), try res.text()
    const anyRes = res;
    if (!anyRes.body || typeof anyRes.body.getReader !== 'function') {
        const t = typeof anyRes.text === 'function' ? await anyRes.text() : '';
        return Buffer.from(String(t), 'utf8');
    }
    const limit = Number.isFinite(cap) && cap > 0 ? cap : Infinity;
    const reader = anyRes.body.getReader();
    const chunks = [];
    let received = 0;
    while (received < limit) {
        const { done, value } = await reader.read();
        if (done)
            break;
        if (!value)
            continue;
        const remaining = limit - received;
        // Slice the final chunk so we never collect more than `limit` bytes.
        const part = value.byteLength > remaining ? value.subarray(0, remaining) : value;
        chunks.push(part);
        received += part.byteLength;
    }
    // Best-effort cancel: releases the connection when we stopped early; a
    // cancel on an already-finished stream is a harmless no-op.
    try {
        await reader.cancel();
    }
    catch {
        // ignore — stream may already be closed
    }
    const out = new Uint8Array(received);
    let offset = 0;
    for (const c of chunks) {
        out.set(c, offset);
        offset += c.byteLength;
    }
    return Buffer.from(out);
}
82
/**
 * Pull the document title out of raw HTML.
 * Returns the entity-decoded, trimmed contents of the first <title> element,
 * or undefined when no title tag is present.
 */
function extractTitle(html) {
    const match = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    return match ? decodeHTMLEntities(match[1]).trim() : undefined;
}
88
/**
 * Convert an HTML document to readable plain text: drops script/style blocks
 * and comments, turns <br> and closing block tags into newlines, strips the
 * remaining tags, decodes common entities, and collapses whitespace.
 * The result is truncated via truncateText.
 */
function htmlToText(html) {
    // Remove script/style robustly: allow attributes and sloppy closing tags like </script foo="bar"> or </script >
    let s = html
        .replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, ' ')
        .replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, ' ');
    // Remove HTML comments, including non-standard end '--!>' browsers tolerate
    s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g, ' ');
    // Replace <br> with newlines. Previous pattern only matched 'br'/'BR';
    // the 'i' flag also covers mixed-case tags like <Br> and <bR>.
    s = s.replace(/<br\s*\/?>/gi, '\n');
    // Closing block-level tags also imply a line break.
    s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi, '\n');
    // Strip remaining tags
    s = s.replace(/<[^>]+>/g, ' ');
    // Decode entities
    s = decodeHTMLEntities(s);
    // Collapse whitespace
    s = s.replace(/\s+/g, ' ').trim();
    return truncateText(s);
}
106
/**
 * Clamp a string to at most `max` characters, appending a single ellipsis
 * character when content was dropped. Strings within the limit pass through.
 */
function truncateText(text, max = 200_000) {
    if (text.length <= max)
        return text;
    return `${text.slice(0, max)}…`;
}
109
// Minimal HTML entity decoder: the common named entities, plus numeric
// character references in decimal (&#65;) and hex (&#x41;) form.
// Unknown named entities are left untouched, as before.
function decodeHTMLEntities(s) {
    const named = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
        '&apos;': "'",
        '&nbsp;': ' ',
    };
    return s.replace(/&(?:[a-z]+|#\d+|#x[0-9a-f]+);/gi, (m) => {
        if (Object.prototype.hasOwnProperty.call(named, m))
            return named[m];
        const hex = /^&#x([0-9a-f]+);$/i.exec(m);
        const dec = /^&#(\d+);$/.exec(m);
        let code;
        if (hex)
            code = parseInt(hex[1], 16);
        else if (dec)
            code = parseInt(dec[1], 10);
        else
            return m; // unrecognized named entity: leave as-is
        try {
            return String.fromCodePoint(code);
        }
        catch {
            return m; // out-of-range code point: leave as-is
        }
    });
}
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "@sisu-ai/tool-web-fetch",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "files": [
8
+ "dist"
9
+ ],
10
+ "scripts": {
11
+ "build": "tsc -b"
12
+ },
13
+ "dependencies": {
14
+ "zod": "^3.23.8"
15
+ },
16
+ "peerDependencies": {
17
+ "@sisu-ai/core": "0.3.0"
18
+ },
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "https://github.com/finger-gun/sisu",
22
+ "directory": "packages/tools/web-fetch"
23
+ },
24
+ "homepage": "https://github.com/finger-gun/sisu#readme",
25
+ "bugs": {
26
+ "url": "https://github.com/finger-gun/sisu/issues"
27
+ }
28
+ }