@sisu-ai/tool-web-fetch 8.0.1 → 8.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -2
- package/dist/index.js +119 -73
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import type { Tool } from
|
|
2
|
-
export type WebFetchFormat =
|
|
1
|
+
import type { Tool } from "@sisu-ai/core";
|
|
2
|
+
export type WebFetchFormat = "text" | "html" | "json";
|
|
3
3
|
export interface WebFetchArgs {
|
|
4
4
|
url: string;
|
|
5
5
|
format?: WebFetchFormat;
|
package/dist/index.js
CHANGED
|
@@ -1,32 +1,35 @@
|
|
|
1
|
-
import { firstConfigValue } from
|
|
2
|
-
import { z } from
|
|
1
|
+
import { firstConfigValue } from "@sisu-ai/core";
|
|
2
|
+
import { z } from "zod";
|
|
3
3
|
export const webFetch = {
|
|
4
|
-
name:
|
|
5
|
-
description:
|
|
4
|
+
name: "webFetch",
|
|
5
|
+
description: "Fetch a web page by URL and return text, HTML, or JSON. Defaults to text extraction for HTML.",
|
|
6
6
|
schema: z.object({
|
|
7
7
|
url: z.string().url(),
|
|
8
|
-
format: z.enum([
|
|
8
|
+
format: z.enum(["text", "html", "json"]).optional(),
|
|
9
9
|
maxBytes: z.number().int().positive().max(5_000_000).optional(),
|
|
10
10
|
respectRobots: z.boolean().optional(),
|
|
11
11
|
}),
|
|
12
|
-
handler: async ({ url, format =
|
|
13
|
-
const ua = firstConfigValue([
|
|
14
|
-
|
|
15
|
-
const capEnv = firstConfigValue([
|
|
12
|
+
handler: async ({ url, format = "text", maxBytes, respectRobots }, ctx) => {
|
|
13
|
+
const ua = firstConfigValue(["WEB_FETCH_USER_AGENT", "HTTP_USER_AGENT"]) ||
|
|
14
|
+
"SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)";
|
|
15
|
+
const capEnv = firstConfigValue(["WEB_FETCH_MAX_BYTES"]);
|
|
16
16
|
const cap = Number(maxBytes ?? (capEnv !== undefined ? Number(capEnv) : 500_000));
|
|
17
17
|
// robots.txt compliance (default on; disable with arg or env WEB_FETCH_RESPECT_ROBOTS=0)
|
|
18
18
|
const respect = (() => {
|
|
19
|
-
if (typeof respectRobots ===
|
|
19
|
+
if (typeof respectRobots === "boolean")
|
|
20
20
|
return respectRobots;
|
|
21
|
-
const env = firstConfigValue([
|
|
21
|
+
const env = firstConfigValue([
|
|
22
|
+
"WEB_FETCH_RESPECT_ROBOTS",
|
|
23
|
+
"RESPECT_ROBOTS",
|
|
24
|
+
]);
|
|
22
25
|
if (env === undefined)
|
|
23
26
|
return true; // default on
|
|
24
|
-
return !(env ===
|
|
27
|
+
return !(env === "0" || /^false$/i.test(env));
|
|
25
28
|
})();
|
|
26
29
|
if (respect) {
|
|
27
30
|
const decision = await robotsDecision(url, ua).catch(() => ({ allowed: true }));
|
|
28
31
|
if (!decision.allowed) {
|
|
29
|
-
ctx?.log?.info?.(
|
|
32
|
+
ctx?.log?.info?.("[webFetch] blocked by robots.txt", {
|
|
30
33
|
url,
|
|
31
34
|
userAgent: ua,
|
|
32
35
|
matchedAgent: decision.matchedAgent,
|
|
@@ -36,48 +39,76 @@ export const webFetch = {
|
|
|
36
39
|
return {
|
|
37
40
|
url,
|
|
38
41
|
status: 403,
|
|
39
|
-
contentType:
|
|
40
|
-
text: `Blocked by robots.txt (agent: ${decision.matchedAgent ??
|
|
41
|
-
trim(),
|
|
42
|
+
contentType: "text/plain",
|
|
43
|
+
text: `Blocked by robots.txt (agent: ${decision.matchedAgent ?? "unknown"}, rule: ${decision.ruleType ?? "disallow"} ${decision.rulePattern ?? ""})`.trim(),
|
|
42
44
|
robotsBlocked: true,
|
|
43
|
-
robotsAgent: ua
|
|
45
|
+
robotsAgent: ua,
|
|
44
46
|
};
|
|
45
47
|
}
|
|
46
48
|
}
|
|
47
49
|
const res = await fetch(url, {
|
|
48
|
-
redirect:
|
|
49
|
-
headers: {
|
|
50
|
+
redirect: "follow",
|
|
51
|
+
headers: { "User-Agent": ua, Accept: "*/*" },
|
|
50
52
|
});
|
|
51
|
-
const contentType = res.headers?.get?.(
|
|
53
|
+
const contentType = res.headers?.get?.("content-type") || "";
|
|
52
54
|
// Stream read with cap to avoid massive bodies
|
|
53
55
|
const buf = await readWithCap(res, cap);
|
|
54
56
|
const finalUrl = res.url || undefined;
|
|
55
57
|
if (!res.ok) {
|
|
56
|
-
return {
|
|
58
|
+
return {
|
|
59
|
+
url,
|
|
60
|
+
finalUrl,
|
|
61
|
+
status: res.status,
|
|
62
|
+
contentType,
|
|
63
|
+
text: truncateText(buf.toString("utf8")),
|
|
64
|
+
};
|
|
57
65
|
}
|
|
58
66
|
// Handle by requested format and content-type
|
|
59
67
|
const ctLower = contentType.toLowerCase();
|
|
60
|
-
if (format ===
|
|
68
|
+
if (format === "json" || ctLower.includes("application/json")) {
|
|
61
69
|
try {
|
|
62
|
-
const json = JSON.parse(buf.toString(
|
|
70
|
+
const json = JSON.parse(buf.toString("utf8"));
|
|
63
71
|
return { url, finalUrl, status: res.status, contentType, json };
|
|
64
72
|
}
|
|
65
73
|
catch {
|
|
66
74
|
// Fall through to text
|
|
67
75
|
}
|
|
68
76
|
}
|
|
69
|
-
if (format ===
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
77
|
+
if (format === "html" ||
|
|
78
|
+
ctLower.includes("text/html") ||
|
|
79
|
+
ctLower.includes("application/xhtml")) {
|
|
80
|
+
const html = buf.toString("utf8");
|
|
81
|
+
if (format === "html") {
|
|
82
|
+
return {
|
|
83
|
+
url,
|
|
84
|
+
finalUrl,
|
|
85
|
+
status: res.status,
|
|
86
|
+
contentType,
|
|
87
|
+
html,
|
|
88
|
+
title: extractTitle(html),
|
|
89
|
+
};
|
|
73
90
|
}
|
|
74
91
|
// format === 'text'
|
|
75
92
|
const text = htmlToText(html);
|
|
76
|
-
return {
|
|
93
|
+
return {
|
|
94
|
+
url,
|
|
95
|
+
finalUrl,
|
|
96
|
+
status: res.status,
|
|
97
|
+
contentType,
|
|
98
|
+
text,
|
|
99
|
+
title: extractTitle(html),
|
|
100
|
+
html: undefined,
|
|
101
|
+
};
|
|
77
102
|
}
|
|
78
103
|
// Fallback: treat as text/*
|
|
79
|
-
const text = buf.toString(
|
|
80
|
-
return {
|
|
104
|
+
const text = buf.toString("utf8");
|
|
105
|
+
return {
|
|
106
|
+
url,
|
|
107
|
+
finalUrl,
|
|
108
|
+
status: res.status,
|
|
109
|
+
contentType,
|
|
110
|
+
text: truncateText(text),
|
|
111
|
+
};
|
|
81
112
|
},
|
|
82
113
|
};
|
|
83
114
|
export default webFetch;
|
|
@@ -87,10 +118,13 @@ async function robotsDecision(targetUrl, userAgent) {
|
|
|
87
118
|
const origin = `${u.protocol}//${u.host}`;
|
|
88
119
|
const cache = robotsCache.get(origin);
|
|
89
120
|
const now = Date.now();
|
|
90
|
-
if (!cache ||
|
|
121
|
+
if (!cache || now - cache.ts > 60 * 60 * 1000) {
|
|
122
|
+
// 1h TTL
|
|
91
123
|
const robotsUrl = `${origin}/robots.txt`;
|
|
92
124
|
try {
|
|
93
|
-
const res = await fetch(robotsUrl, {
|
|
125
|
+
const res = await fetch(robotsUrl, {
|
|
126
|
+
headers: { "User-Agent": userAgent, Accept: "text/plain" },
|
|
127
|
+
});
|
|
94
128
|
const txt = await res.text();
|
|
95
129
|
const rules = res.ok ? parseRobots(txt) : null;
|
|
96
130
|
robotsCache.set(origin, { ts: now, rules });
|
|
@@ -102,7 +136,7 @@ async function robotsDecision(targetUrl, userAgent) {
|
|
|
102
136
|
const rules = robotsCache.get(origin)?.rules;
|
|
103
137
|
if (!rules)
|
|
104
138
|
return { allowed: true };
|
|
105
|
-
return evaluateRobotsDetailed(rules, userAgent, u.pathname + (u.search ||
|
|
139
|
+
return evaluateRobotsDetailed(rules, userAgent, u.pathname + (u.search || ""));
|
|
106
140
|
}
|
|
107
141
|
function parseRobots(text) {
|
|
108
142
|
const lines = text.split(/\r?\n/);
|
|
@@ -110,31 +144,31 @@ function parseRobots(text) {
|
|
|
110
144
|
let current = null;
|
|
111
145
|
for (const raw of lines) {
|
|
112
146
|
const line = raw.trim();
|
|
113
|
-
if (!line || line.startsWith(
|
|
147
|
+
if (!line || line.startsWith("#"))
|
|
114
148
|
continue;
|
|
115
149
|
const m = line.match(/^(user-agent|allow|disallow)\s*:\s*(.*)$/i);
|
|
116
150
|
if (!m)
|
|
117
151
|
continue;
|
|
118
152
|
const key = m[1].toLowerCase();
|
|
119
153
|
const val = m[2].trim();
|
|
120
|
-
if (key ===
|
|
154
|
+
if (key === "user-agent") {
|
|
121
155
|
// Start a new group if we already had one and it contains rules
|
|
122
|
-
if (!current ||
|
|
156
|
+
if (!current || current.allows.length + current.disallows.length > 0) {
|
|
123
157
|
current = { agents: [], allows: [], disallows: [] };
|
|
124
158
|
groups.push(current);
|
|
125
159
|
}
|
|
126
160
|
current.agents.push(val.toLowerCase());
|
|
127
161
|
}
|
|
128
|
-
else if (key ===
|
|
162
|
+
else if (key === "allow") {
|
|
129
163
|
if (!current) {
|
|
130
|
-
current = { agents: [
|
|
164
|
+
current = { agents: ["*"], allows: [], disallows: [] };
|
|
131
165
|
groups.push(current);
|
|
132
166
|
}
|
|
133
167
|
current.allows.push(val);
|
|
134
168
|
}
|
|
135
|
-
else if (key ===
|
|
169
|
+
else if (key === "disallow") {
|
|
136
170
|
if (!current) {
|
|
137
|
-
current = { agents: [
|
|
171
|
+
current = { agents: ["*"], allows: [], disallows: [] };
|
|
138
172
|
groups.push(current);
|
|
139
173
|
}
|
|
140
174
|
current.disallows.push(val);
|
|
@@ -145,18 +179,20 @@ function parseRobots(text) {
|
|
|
145
179
|
function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
|
|
146
180
|
// Match exact agent token (product) ignoring case, or '*'.
|
|
147
181
|
// Example: 'SisuWebFetch/0.1 (+...)' -> baseAgent 'sisuwebfetch'
|
|
148
|
-
const baseAgent = (userAgent.split(/[
|
|
182
|
+
const baseAgent = (userAgent.split(/[\s/]/)[0] || "").toLowerCase();
|
|
149
183
|
const agentMatches = (agent) => {
|
|
150
|
-
if (agent ===
|
|
184
|
+
if (agent === "*")
|
|
151
185
|
return true;
|
|
152
186
|
return agent.toLowerCase() === baseAgent;
|
|
153
187
|
};
|
|
154
188
|
const matching = rules.groups
|
|
155
|
-
.map(g => ({ g, matchedAgent: g.agents.find(agentMatches) }))
|
|
156
|
-
.filter(x => !!x.matchedAgent);
|
|
189
|
+
.map((g) => ({ g, matchedAgent: g.agents.find(agentMatches) }))
|
|
190
|
+
.filter((x) => !!x.matchedAgent);
|
|
157
191
|
const selected = matching.length
|
|
158
192
|
? matching
|
|
159
|
-
: rules.groups
|
|
193
|
+
: rules.groups
|
|
194
|
+
.filter((g) => g.agents.includes("*"))
|
|
195
|
+
.map((g) => ({ g, matchedAgent: "*" }));
|
|
160
196
|
if (!selected.length)
|
|
161
197
|
return { allowed: true };
|
|
162
198
|
// longest match wins between allow and disallow
|
|
@@ -172,7 +208,7 @@ function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
|
|
|
172
208
|
const L = pat.length;
|
|
173
209
|
if (L > bestLen) {
|
|
174
210
|
bestLen = L;
|
|
175
|
-
bestType =
|
|
211
|
+
bestType = "allow";
|
|
176
212
|
bestPat = pat;
|
|
177
213
|
bestAgent = matchedAgent;
|
|
178
214
|
}
|
|
@@ -185,28 +221,38 @@ function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
|
|
|
185
221
|
const L = pat.length;
|
|
186
222
|
if (L > bestLen) {
|
|
187
223
|
bestLen = L;
|
|
188
|
-
bestType =
|
|
224
|
+
bestType = "disallow";
|
|
189
225
|
bestPat = pat;
|
|
190
226
|
bestAgent = matchedAgent;
|
|
191
227
|
}
|
|
192
228
|
}
|
|
193
229
|
}
|
|
194
230
|
}
|
|
195
|
-
if (bestType ===
|
|
196
|
-
return {
|
|
197
|
-
|
|
231
|
+
if (bestType === "disallow")
|
|
232
|
+
return {
|
|
233
|
+
allowed: false,
|
|
234
|
+
matchedAgent: bestAgent,
|
|
235
|
+
ruleType: "disallow",
|
|
236
|
+
rulePattern: bestPat,
|
|
237
|
+
};
|
|
238
|
+
return {
|
|
239
|
+
allowed: true,
|
|
240
|
+
matchedAgent: bestAgent,
|
|
241
|
+
ruleType: bestType,
|
|
242
|
+
rulePattern: bestPat,
|
|
243
|
+
};
|
|
198
244
|
}
|
|
199
245
|
function patternMatches(pat, path) {
|
|
200
246
|
// Support '*' wildcard and '$' end anchor; treat path as starting with '/'
|
|
201
247
|
const p = pat.trim();
|
|
202
|
-
if (p ===
|
|
248
|
+
if (p === "")
|
|
203
249
|
return false;
|
|
204
250
|
// Empty disallow means allow all; already handled by return false above
|
|
205
251
|
// Convert to regex
|
|
206
|
-
const escaped = p.replace(/[.+?^${}()
|
|
207
|
-
let reStr =
|
|
208
|
-
if (reStr.endsWith(
|
|
209
|
-
reStr = reStr.slice(0, -
|
|
252
|
+
const escaped = p.replace(/[.+?^${}()|[\]\\]/g, (r) => "\\" + r);
|
|
253
|
+
let reStr = "^" + escaped.replace(/\*/g, ".*");
|
|
254
|
+
if (reStr.endsWith("$")) {
|
|
255
|
+
reStr = reStr.slice(0, -1) + "$";
|
|
210
256
|
}
|
|
211
257
|
const re = new RegExp(reStr);
|
|
212
258
|
return re.test(path);
|
|
@@ -214,9 +260,9 @@ function patternMatches(pat, path) {
|
|
|
214
260
|
async function readWithCap(res, cap) {
|
|
215
261
|
// If body is not a stream (older fetch mocks), try res.text()
|
|
216
262
|
const anyRes = res;
|
|
217
|
-
if (!anyRes.body || typeof anyRes.body.getReader !==
|
|
218
|
-
const t = typeof anyRes.text ===
|
|
219
|
-
return Buffer.from(String(t),
|
|
263
|
+
if (!anyRes.body || typeof anyRes.body.getReader !== "function") {
|
|
264
|
+
const t = typeof anyRes.text === "function" ? await anyRes.text() : "";
|
|
265
|
+
return Buffer.from(String(t), "utf8");
|
|
220
266
|
}
|
|
221
267
|
const reader = anyRes.body.getReader();
|
|
222
268
|
const chunks = [];
|
|
@@ -249,34 +295,34 @@ function extractTitle(html) {
|
|
|
249
295
|
function htmlToText(html) {
|
|
250
296
|
// Remove script/style robustly: allow attributes and sloppy closing tags like </script foo="bar"> or </script >
|
|
251
297
|
let s = html
|
|
252
|
-
.replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi,
|
|
253
|
-
.replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi,
|
|
298
|
+
.replace(/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, " ")
|
|
299
|
+
.replace(/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, " ");
|
|
254
300
|
// Remove HTML comments, including non-standard end '--!>' browsers tolerate
|
|
255
|
-
s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g,
|
|
301
|
+
s = s.replace(/<!--[\s\S]*?--!?>(\n)?/g, " ");
|
|
256
302
|
// Replace <br> and block tags with newlines
|
|
257
|
-
s = s.replace(/<(br|BR)\s*\/?>(\n)?/g,
|
|
258
|
-
s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi,
|
|
303
|
+
s = s.replace(/<(br|BR)\s*\/?>(\n)?/g, "\n");
|
|
304
|
+
s = s.replace(/<\/(p|div|section|article|h[1-6]|li|ul|ol|header|footer|main)>/gi, "\n");
|
|
259
305
|
// Strip remaining tags
|
|
260
|
-
s = s.replace(/<[^>]+>/g,
|
|
306
|
+
s = s.replace(/<[^>]+>/g, " ");
|
|
261
307
|
// Decode entities
|
|
262
308
|
s = decodeHTMLEntities(s);
|
|
263
309
|
// Collapse whitespace
|
|
264
|
-
s = s.replace(/\s+/g,
|
|
310
|
+
s = s.replace(/\s+/g, " ").trim();
|
|
265
311
|
return truncateText(s);
|
|
266
312
|
}
|
|
267
313
|
function truncateText(text, max = 200_000) {
|
|
268
|
-
return text.length > max ? text.slice(0, max) +
|
|
314
|
+
return text.length > max ? text.slice(0, max) + "…" : text;
|
|
269
315
|
}
|
|
270
316
|
// Minimal HTML entity decoder for common entities
|
|
271
317
|
function decodeHTMLEntities(s) {
|
|
272
318
|
const map = {
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
319
|
+
"&amp;": "&",
|
|
320
|
+
"&lt;": "<",
|
|
321
|
+
"&gt;": ">",
|
|
322
|
+
"&quot;": '"',
|
|
323
|
+
"&#39;": "'",
|
|
324
|
+
"&apos;": "'",
|
|
325
|
+
"&nbsp;": " ",
|
|
280
326
|
};
|
|
281
327
|
return s.replace(/&(amp|lt|gt|quot|#39|apos|nbsp);/g, (m) => map[m] || m);
|
|
282
328
|
}
|