@sisu-ai/tool-web-fetch 1.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +162 -1
- package/package.json +11 -3
package/README.md
CHANGED
|
@@ -10,6 +10,7 @@ npm i @sisu-ai/tool-web-fetch
|
|
|
10
10
|
Environment / Flags
|
|
11
11
|
- `WEB_FETCH_USER_AGENT` or `HTTP_USER_AGENT` (flag: `--web-fetch-user-agent`)
|
|
12
12
|
- `WEB_FETCH_MAX_BYTES` (flag: `--web-fetch-max-bytes`) — default 500kB
|
|
13
|
+
- `WEB_FETCH_RESPECT_ROBOTS` (flag: `--web-fetch-respect-robots`) — `1`/`true` (default) to honor robots.txt; set `0`/`false` to disable
|
|
13
14
|
|
|
14
15
|
Tool
|
|
15
16
|
- Name: `webFetch`
|
|
@@ -17,6 +18,7 @@ Tool
|
|
|
17
18
|
- Returns: `{ url, finalUrl?, status, contentType?, title?, text?, html?, json? }`
|
|
18
19
|
|
|
19
20
|
Behavior
|
|
21
|
+
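- Respects robots.txt by default: before fetching, the target origin's `/robots.txt` is checked against the configured User-Agent, and a disallowed URL is not fetched — the tool returns `status: 403` with `robotsBlocked: true` instead.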
|
|
20
22
|
- Follows redirects and reads up to `maxBytes` to avoid huge pages.
|
|
21
23
|
- If `format: 'text'` (default) and page is HTML, strips tags (removes script/style) and decodes basic entities; includes `title`.
|
|
22
24
|
- If `format: 'html'`, returns raw HTML and `title`.
|
package/dist/index.d.ts
CHANGED
|
@@ -4,6 +4,7 @@ export interface WebFetchArgs {
|
|
|
4
4
|
url: string;
|
|
5
5
|
format?: WebFetchFormat;
|
|
6
6
|
maxBytes?: number;
|
|
7
|
+
respectRobots?: boolean;
|
|
7
8
|
}
|
|
8
9
|
export interface WebFetchResult {
|
|
9
10
|
url: string;
|
|
@@ -14,6 +15,8 @@ export interface WebFetchResult {
|
|
|
14
15
|
text?: string;
|
|
15
16
|
html?: string;
|
|
16
17
|
json?: unknown;
|
|
18
|
+
robotsBlocked?: boolean;
|
|
19
|
+
robotsAgent?: string;
|
|
17
20
|
}
|
|
18
21
|
export declare const webFetch: Tool<WebFetchArgs>;
|
|
19
22
|
export default webFetch;
|
package/dist/index.js
CHANGED
|
@@ -7,12 +7,43 @@ export const webFetch = {
|
|
|
7
7
|
url: z.string().url(),
|
|
8
8
|
format: z.enum(['text', 'html', 'json']).optional(),
|
|
9
9
|
maxBytes: z.number().int().positive().max(5_000_000).optional(),
|
|
10
|
+
respectRobots: z.boolean().optional(),
|
|
10
11
|
}),
|
|
11
|
-
handler: async ({ url, format = 'text', maxBytes },
|
|
12
|
+
handler: async ({ url, format = 'text', maxBytes, respectRobots }, ctx) => {
|
|
12
13
|
const ua = firstConfigValue(['WEB_FETCH_USER_AGENT', 'HTTP_USER_AGENT'])
|
|
13
14
|
|| 'SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)';
|
|
14
15
|
const capEnv = firstConfigValue(['WEB_FETCH_MAX_BYTES']);
|
|
15
16
|
const cap = Number(maxBytes ?? (capEnv !== undefined ? Number(capEnv) : 500_000));
|
|
17
|
+
// robots.txt compliance (default on; disable with arg or env WEB_FETCH_RESPECT_ROBOTS=0)
|
|
18
|
+
const respect = (() => {
|
|
19
|
+
if (typeof respectRobots === 'boolean')
|
|
20
|
+
return respectRobots;
|
|
21
|
+
const env = firstConfigValue(['WEB_FETCH_RESPECT_ROBOTS', 'RESPECT_ROBOTS']);
|
|
22
|
+
if (env === undefined)
|
|
23
|
+
return true; // default on
|
|
24
|
+
return !(env === '0' || /^false$/i.test(env));
|
|
25
|
+
})();
|
|
26
|
+
if (respect) {
|
|
27
|
+
const decision = await robotsDecision(url, ua).catch(() => ({ allowed: true }));
|
|
28
|
+
if (!decision.allowed) {
|
|
29
|
+
ctx?.log?.info?.('[webFetch] blocked by robots.txt', {
|
|
30
|
+
url,
|
|
31
|
+
userAgent: ua,
|
|
32
|
+
matchedAgent: decision.matchedAgent,
|
|
33
|
+
ruleType: decision.ruleType,
|
|
34
|
+
rulePattern: decision.rulePattern,
|
|
35
|
+
});
|
|
36
|
+
return {
|
|
37
|
+
url,
|
|
38
|
+
status: 403,
|
|
39
|
+
contentType: 'text/plain',
|
|
40
|
+
text: `Blocked by robots.txt (agent: ${decision.matchedAgent ?? 'unknown'}, rule: ${decision.ruleType ?? 'disallow'} ${decision.rulePattern ?? ''})`.
|
|
41
|
+
trim(),
|
|
42
|
+
robotsBlocked: true,
|
|
43
|
+
robotsAgent: ua
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
}
|
|
16
47
|
const res = await fetch(url, {
|
|
17
48
|
redirect: 'follow',
|
|
18
49
|
headers: { 'User-Agent': ua, 'Accept': '*/*' },
|
|
@@ -50,6 +81,136 @@ export const webFetch = {
|
|
|
50
81
|
},
|
|
51
82
|
};
|
|
52
83
|
export default webFetch;
|
|
84
|
+
/**
 * In-memory cache of robots.txt lookups, keyed by origin (`protocol//host`).
 * Each value is `{ ts, rules }`: `ts` is the `Date.now()` reading taken when the
 * lookup started and `rules` is the `parseRobots` result, or `null` when
 * robots.txt could not be retrieved or parsed.
 */
const robotsCache = new Map();

/**
 * Decide whether `targetUrl` may be fetched by `userAgent` according to the
 * robots.txt served at the target's origin.
 *
 * The robots.txt lookup is cached per origin for one hour. Any failure to obtain
 * usable rules (network error, non-OK status, parse error) is cached as "no
 * rules" and the URL is allowed, i.e. the check fails open.
 * NOTE(review): RFC 9309 asks crawlers to treat 5xx responses as a full
 * disallow; the fail-open behaviour is kept here to stay backward-compatible.
 *
 * @param {string} targetUrl Absolute URL that is about to be fetched.
 * @param {string} userAgent User-Agent string sent with the request; also used to pick the robots group.
 * @returns {Promise<object>} `{ allowed: true }` when no rules are available, otherwise
 *   whatever `evaluateRobotsDetailed` returns for the URL's path plus query string.
 * @throws {TypeError} If `targetUrl` is not a valid absolute URL (from `new URL`).
 */
async function robotsDecision(targetUrl, userAgent) {
    const ttlMs = 60 * 60 * 1000; // 1h TTL
    // robots.txt comes from an untrusted server; only parse a bounded prefix
    // (RFC 9309 requires parsers to handle at least 500 KiB).
    const maxRobotsChars = 500 * 1024;
    const u = new URL(targetUrl);
    const origin = `${u.protocol}//${u.host}`;
    const now = Date.now();
    let entry = robotsCache.get(origin);
    if (!entry || (now - entry.ts) > ttlMs) {
        let rules = null;
        try {
            const res = await fetch(`${origin}/robots.txt`, { headers: { 'User-Agent': userAgent, 'Accept': 'text/plain' } });
            if (res.ok) {
                let txt = await res.text();
                if (txt.length > maxRobotsChars) {
                    // Cut on a line boundary so a truncated rule cannot turn into a broader one.
                    const cut = txt.lastIndexOf('\n', maxRobotsChars);
                    txt = txt.slice(0, cut > 0 ? cut : maxRobotsChars);
                }
                rules = parseRobots(txt);
            }
            else {
                // The body of an error response is never used; release it instead of buffering it.
                await res.body?.cancel?.().catch(() => { });
            }
        }
        catch {
            rules = null;
        }
        entry = { ts: now, rules };
        robotsCache.set(origin, entry);
    }
    if (!entry.rules)
        return { allowed: true };
    return evaluateRobotsDetailed(entry.rules, userAgent, u.pathname + (u.search || ''));
}
|
|
107
|
+
/**
 * Parse the text of a robots.txt file into rule groups.
 *
 * Only `User-agent`, `Allow` and `Disallow` lines are recognised (keys are
 * case-insensitive); every other line is ignored. Consecutive `User-agent`
 * lines share one group; a `User-agent` line that follows at least one
 * Allow/Disallow line starts a new group. Rules that appear before any
 * `User-agent` line are attached to an implicit `*` group.
 *
 * @param {string} text Raw robots.txt content.
 * @returns {{ groups: Array<{ agents: string[], allows: string[], disallows: string[] }> }}
 *   Groups in file order. Agent names are lower-cased; patterns are kept as
 *   written (trimmed). Empty `Allow:`/`Disallow:` values are kept as `''`.
 */
function parseRobots(text) {
    const groups = [];
    let current = null;
    // A leading byte-order mark would otherwise hide the first directive from the key regex.
    const body = text.charCodeAt(0) === 0xFEFF ? text.slice(1) : text;
    // Accept CRLF, lone LF and lone CR line endings.
    for (const raw of body.split(/\r\n|\r|\n/)) {
        // '#' starts a comment anywhere on the line, not only in column 0.
        const hash = raw.indexOf('#');
        const line = (hash === -1 ? raw : raw.slice(0, hash)).trim();
        if (!line)
            continue;
        const m = line.match(/^(user-agent|allow|disallow)\s*:\s*(.*)$/i);
        if (!m)
            continue;
        const key = m[1].toLowerCase();
        const val = m[2].trim();
        if (key === 'user-agent') {
            // Start a new group unless we are still collecting agents for the current one.
            if (!current || (current.allows.length + current.disallows.length) > 0) {
                current = { agents: [], allows: [], disallows: [] };
                groups.push(current);
            }
            current.agents.push(val.toLowerCase());
            continue;
        }
        if (!current) {
            // Rule before any User-agent line: treat it as applying to everyone.
            current = { agents: ['*'], allows: [], disallows: [] };
            groups.push(current);
        }
        if (key === 'allow') {
            current.allows.push(val);
        }
        else {
            current.disallows.push(val);
        }
    }
    return { groups };
}
|
|
145
|
+
/**
 * Evaluate parsed robots.txt rules for one request.
 *
 * Group selection: the product token of `userAgent` (text before the first `/`
 * or whitespace, lower-cased; e.g. 'SisuWebFetch/0.1 (+...)' -> 'sisuwebfetch')
 * is compared case-insensitively with each group's agents. If any group names
 * that token, only those groups are used; `*` groups are consulted only when no
 * group names the token. With no applicable group the URL is allowed.
 *
 * Rule selection: among matching Allow/Disallow patterns in the selected groups
 * the longest pattern wins; on equal length Allow wins. Empty patterns are ignored.
 *
 * @param {{ groups: Array<{ agents: string[], allows: string[], disallows: string[] }> }} rules Parsed rule groups.
 * @param {string} userAgent Full User-Agent string of the crawler.
 * @param {string} pathWithQuery URL path including the query string, starting with '/'.
 * @returns {object} `{ allowed: true }` when no group applies; otherwise
 *   `{ allowed, matchedAgent, ruleType, rulePattern }`, where the last three are
 *   `undefined` if no pattern matched the path.
 */
function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
    const baseAgent = (userAgent.split(/[\/\s]/)[0] || '').toLowerCase();
    const pick = (matches) => rules.groups
        .map(g => ({ g, matchedAgent: g.agents.find(matches) }))
        .filter(x => x.matchedAgent !== undefined);
    // A group that names this crawler replaces the '*' groups instead of being merged with them.
    const specific = pick(agent => agent !== '*' && agent.toLowerCase() === baseAgent);
    const selected = specific.length ? specific : pick(agent => agent === '*');
    if (!selected.length)
        return { allowed: true };
    let bestType;
    let bestLen = -1;
    let bestPat;
    let bestAgent;
    for (const { g, matchedAgent } of selected) {
        for (const pat of g.allows) {
            if (!pat || !patternMatches(pat, pathWithQuery))
                continue;
            // Allow also replaces an equally long Disallow (least restrictive rule wins a tie).
            if (pat.length > bestLen || (pat.length === bestLen && bestType === 'disallow')) {
                bestLen = pat.length;
                bestType = 'allow';
                bestPat = pat;
                bestAgent = matchedAgent;
            }
        }
        for (const pat of g.disallows) {
            if (!pat || !patternMatches(pat, pathWithQuery))
                continue;
            if (pat.length > bestLen) {
                bestLen = pat.length;
                bestType = 'disallow';
                bestPat = pat;
                bestAgent = matchedAgent;
            }
        }
    }
    if (bestType === 'disallow')
        return { allowed: false, matchedAgent: bestAgent, ruleType: 'disallow', rulePattern: bestPat };
    return { allowed: true, matchedAgent: bestAgent, ruleType: bestType, rulePattern: bestPat };
}

/**
 * Test whether a robots.txt path pattern matches `path`.
 *
 * The pattern is matched from the start of `path`. `*` matches any run of
 * characters (including none) and a trailing `$` anchors the match to the end
 * of `path`; every other character, including a `$` elsewhere, is literal.
 * An empty pattern never matches.
 *
 * Patterns come from a remote robots.txt, so matching is done with plain
 * substring scans rather than a generated RegExp, which keeps the cost bounded
 * for patterns containing many wildcards.
 *
 * @param {string} pat Pattern from an Allow/Disallow line.
 * @param {string} path Request path (with query string) to test.
 * @returns {boolean} `true` if the pattern matches.
 */
function patternMatches(pat, path) {
    let p = pat.trim();
    if (p === '')
        return false;
    const anchored = p.endsWith('$');
    if (anchored)
        p = p.slice(0, -1);
    // Literal pieces between wildcards; consecutive '*' simply produce empty pieces.
    const parts = p.split('*');
    const first = parts[0];
    if (!path.startsWith(first))
        return false;
    if (parts.length === 1)
        return anchored ? path.length === first.length : true;
    let pos = first.length;
    const lastIdx = parts.length - 1;
    // Leftmost placement of each middle piece leaves the most room for the rest.
    for (let i = 1; i < lastIdx; i++) {
        const at = path.indexOf(parts[i], pos);
        if (at === -1)
            return false;
        pos = at + parts[i].length;
    }
    const last = parts[lastIdx];
    if (!anchored)
        return path.indexOf(last, pos) !== -1;
    // Anchored: the final piece must sit at the very end, without overlapping what was already consumed.
    return path.length - last.length >= pos && path.endsWith(last);
}
|
|
53
214
|
async function readWithCap(res, cap) {
|
|
54
215
|
// If body is not a stream (older fetch mocks), try res.text()
|
|
55
216
|
const anyRes = res;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sisu-ai/tool-web-fetch",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "2.0.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"zod": "^3.23.8"
|
|
15
15
|
},
|
|
16
16
|
"peerDependencies": {
|
|
17
|
-
"@sisu-ai/core": "0.
|
|
17
|
+
"@sisu-ai/core": "1.0.1"
|
|
18
18
|
},
|
|
19
19
|
"repository": {
|
|
20
20
|
"type": "git",
|
|
@@ -24,5 +24,13 @@
|
|
|
24
24
|
"homepage": "https://github.com/finger-gun/sisu#readme",
|
|
25
25
|
"bugs": {
|
|
26
26
|
"url": "https://github.com/finger-gun/sisu/issues"
|
|
27
|
-
}
|
|
27
|
+
},
|
|
28
|
+
"keywords": [
|
|
29
|
+
"sisu",
|
|
30
|
+
"ai",
|
|
31
|
+
"ai-agent",
|
|
32
|
+
"agentic",
|
|
33
|
+
"tool",
|
|
34
|
+
"web-fetch"
|
|
35
|
+
]
|
|
28
36
|
}
|