@sisu-ai/tool-web-fetch 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,6 +10,7 @@ npm i @sisu-ai/tool-web-fetch
10
10
  Environment / Flags
11
11
  - `WEB_FETCH_USER_AGENT` or `HTTP_USER_AGENT` (flag: `--web-fetch-user-agent`)
12
12
  - `WEB_FETCH_MAX_BYTES` (flag: `--web-fetch-max-bytes`) — default 500kB
13
+ - `WEB_FETCH_RESPECT_ROBOTS` or `RESPECT_ROBOTS` (flag: `--web-fetch-respect-robots`) — `1`/`true` (default) to honor robots.txt; set `0`/`false` to disable
13
14
 
14
15
  Tool
15
16
  - Name: `webFetch`
@@ -17,6 +18,7 @@ Tool
17
18
  - Returns: `{ url, finalUrl?, status, contentType?, title?, text?, html?, json? }`
18
19
 
19
20
  Behavior
21
+ - Respects robots.txt by default for the provided User-Agent; a blocked URL is not fetched and the tool returns a synthetic `status: 403` result with `robotsBlocked: true`.
20
22
  - Follows redirects and reads up to `maxBytes` to avoid huge pages.
21
23
  - If `format: 'text'` (default) and page is HTML, strips tags (removes script/style) and decodes basic entities; includes `title`.
22
24
  - If `format: 'html'`, returns raw HTML and `title`.
package/dist/index.d.ts CHANGED
@@ -4,6 +4,7 @@ export interface WebFetchArgs {
4
4
  url: string;
5
5
  format?: WebFetchFormat;
6
6
  maxBytes?: number;
7
+ respectRobots?: boolean;
7
8
  }
8
9
  export interface WebFetchResult {
9
10
  url: string;
@@ -14,6 +15,8 @@ export interface WebFetchResult {
14
15
  text?: string;
15
16
  html?: string;
16
17
  json?: unknown;
18
+ robotsBlocked?: boolean;
19
+ robotsAgent?: string;
17
20
  }
18
21
  export declare const webFetch: Tool<WebFetchArgs>;
19
22
  export default webFetch;
package/dist/index.js CHANGED
@@ -7,12 +7,43 @@ export const webFetch = {
7
7
  url: z.string().url(),
8
8
  format: z.enum(['text', 'html', 'json']).optional(),
9
9
  maxBytes: z.number().int().positive().max(5_000_000).optional(),
10
+ respectRobots: z.boolean().optional(),
10
11
  }),
11
- handler: async ({ url, format = 'text', maxBytes }, _ctx) => {
12
+ handler: async ({ url, format = 'text', maxBytes, respectRobots }, ctx) => {
12
13
  const ua = firstConfigValue(['WEB_FETCH_USER_AGENT', 'HTTP_USER_AGENT'])
13
14
  || 'SisuWebFetch/0.1 (+https://github.com/finger-gun/sisu)';
14
15
  const capEnv = firstConfigValue(['WEB_FETCH_MAX_BYTES']);
15
16
  const cap = Number(maxBytes ?? (capEnv !== undefined ? Number(capEnv) : 500_000));
17
+ // robots.txt compliance (default on; disable with arg or env WEB_FETCH_RESPECT_ROBOTS=0)
18
+ const respect = (() => {
19
+ if (typeof respectRobots === 'boolean')
20
+ return respectRobots;
21
+ const env = firstConfigValue(['WEB_FETCH_RESPECT_ROBOTS', 'RESPECT_ROBOTS']);
22
+ if (env === undefined)
23
+ return true; // default on
24
+ return !(env === '0' || /^false$/i.test(env));
25
+ })();
26
+ if (respect) {
27
+ const decision = await robotsDecision(url, ua).catch(() => ({ allowed: true }));
28
+ if (!decision.allowed) {
29
+ ctx?.log?.info?.('[webFetch] blocked by robots.txt', {
30
+ url,
31
+ userAgent: ua,
32
+ matchedAgent: decision.matchedAgent,
33
+ ruleType: decision.ruleType,
34
+ rulePattern: decision.rulePattern,
35
+ });
36
+ return {
37
+ url,
38
+ status: 403,
39
+ contentType: 'text/plain',
40
+ text: `Blocked by robots.txt (agent: ${decision.matchedAgent ?? 'unknown'}, rule: ${decision.ruleType ?? 'disallow'} ${decision.rulePattern ?? ''})`.
41
+ trim(),
42
+ robotsBlocked: true,
43
+ robotsAgent: ua
44
+ };
45
+ }
46
+ }
16
47
  const res = await fetch(url, {
17
48
  redirect: 'follow',
18
49
  headers: { 'User-Agent': ua, 'Accept': '*/*' },
@@ -50,6 +81,136 @@ export const webFetch = {
50
81
  },
51
82
  };
52
83
  export default webFetch;
84
+ const robotsCache = new Map();
85
+ async function robotsDecision(targetUrl, userAgent) {
86
+ const u = new URL(targetUrl);
87
+ const origin = `${u.protocol}//${u.host}`;
88
+ const cache = robotsCache.get(origin);
89
+ const now = Date.now();
90
+ if (!cache || (now - cache.ts) > 60 * 60 * 1000) { // 1h TTL
91
+ const robotsUrl = `${origin}/robots.txt`;
92
+ try {
93
+ const res = await fetch(robotsUrl, { headers: { 'User-Agent': userAgent, 'Accept': 'text/plain' } });
94
+ const txt = await res.text();
95
+ const rules = res.ok ? parseRobots(txt) : null;
96
+ robotsCache.set(origin, { ts: now, rules });
97
+ }
98
+ catch {
99
+ robotsCache.set(origin, { ts: now, rules: null });
100
+ }
101
+ }
102
+ const rules = robotsCache.get(origin)?.rules;
103
+ if (!rules)
104
+ return { allowed: true };
105
+ return evaluateRobotsDetailed(rules, userAgent, u.pathname + (u.search || ''));
106
+ }
107
+ function parseRobots(text) {
108
+ const lines = text.split(/\r?\n/);
109
+ const groups = [];
110
+ let current = null;
111
+ for (const raw of lines) {
112
+ const line = raw.trim();
113
+ if (!line || line.startsWith('#'))
114
+ continue;
115
+ const m = line.match(/^(user-agent|allow|disallow)\s*:\s*(.*)$/i);
116
+ if (!m)
117
+ continue;
118
+ const key = m[1].toLowerCase();
119
+ const val = m[2].trim();
120
+ if (key === 'user-agent') {
121
+ // Start a new group if we already had one and it contains rules
122
+ if (!current || (current.allows.length + current.disallows.length) > 0) {
123
+ current = { agents: [], allows: [], disallows: [] };
124
+ groups.push(current);
125
+ }
126
+ current.agents.push(val.toLowerCase());
127
+ }
128
+ else if (key === 'allow') {
129
+ if (!current) {
130
+ current = { agents: ['*'], allows: [], disallows: [] };
131
+ groups.push(current);
132
+ }
133
+ current.allows.push(val);
134
+ }
135
+ else if (key === 'disallow') {
136
+ if (!current) {
137
+ current = { agents: ['*'], allows: [], disallows: [] };
138
+ groups.push(current);
139
+ }
140
+ current.disallows.push(val);
141
+ }
142
+ }
143
+ return { groups };
144
+ }
145
+ function evaluateRobotsDetailed(rules, userAgent, pathWithQuery) {
146
+ // Match exact agent token (product) ignoring case, or '*'.
147
+ // Example: 'SisuWebFetch/0.1 (+...)' -> baseAgent 'sisuwebfetch'
148
+ const baseAgent = (userAgent.split(/[\/\s]/)[0] || '').toLowerCase();
149
+ const agentMatches = (agent) => {
150
+ if (agent === '*')
151
+ return true;
152
+ return agent.toLowerCase() === baseAgent;
153
+ };
154
+ const matching = rules.groups
155
+ .map(g => ({ g, matchedAgent: g.agents.find(agentMatches) }))
156
+ .filter(x => !!x.matchedAgent);
157
+ const selected = matching.length
158
+ ? matching
159
+ : rules.groups.filter(g => g.agents.includes('*')).map(g => ({ g, matchedAgent: '*' }));
160
+ if (!selected.length)
161
+ return { allowed: true };
162
+ // longest match wins between allow and disallow
163
+ let bestType;
164
+ let bestLen = -1;
165
+ let bestPat;
166
+ let bestAgent;
167
+ for (const { g, matchedAgent } of selected) {
168
+ for (const pat of g.allows) {
169
+ if (!pat)
170
+ continue;
171
+ if (patternMatches(pat, pathWithQuery)) {
172
+ const L = pat.length;
173
+ if (L > bestLen) {
174
+ bestLen = L;
175
+ bestType = 'allow';
176
+ bestPat = pat;
177
+ bestAgent = matchedAgent;
178
+ }
179
+ }
180
+ }
181
+ for (const pat of g.disallows) {
182
+ if (!pat)
183
+ continue;
184
+ if (patternMatches(pat, pathWithQuery)) {
185
+ const L = pat.length;
186
+ if (L > bestLen) {
187
+ bestLen = L;
188
+ bestType = 'disallow';
189
+ bestPat = pat;
190
+ bestAgent = matchedAgent;
191
+ }
192
+ }
193
+ }
194
+ }
195
+ if (bestType === 'disallow')
196
+ return { allowed: false, matchedAgent: bestAgent, ruleType: 'disallow', rulePattern: bestPat };
197
+ return { allowed: true, matchedAgent: bestAgent, ruleType: bestType, rulePattern: bestPat };
198
+ }
199
+ function patternMatches(pat, path) {
200
+ // Support '*' wildcard and '$' end anchor; treat path as starting with '/'
201
+ const p = pat.trim();
202
+ if (p === '')
203
+ return false;
204
+ // Empty disallow means allow all; already handled by return false above
205
+ // Convert to regex
206
+ const escaped = p.replace(/[.+?^${}()|\[\]\\]/g, r => '\\' + r);
207
+ let reStr = '^' + escaped.replace(/\*/g, '.*');
208
+ if (reStr.endsWith('\$')) {
209
+ reStr = reStr.slice(0, -2) + '$';
210
+ }
211
+ const re = new RegExp(reStr);
212
+ return re.test(path);
213
+ }
53
214
  async function readWithCap(res, cap) {
54
215
  // If body is not a stream (older fetch mocks), try res.text()
55
216
  const anyRes = res;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sisu-ai/tool-web-fetch",
3
- "version": "1.0.0",
3
+ "version": "2.0.0",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -14,7 +14,7 @@
14
14
  "zod": "^3.23.8"
15
15
  },
16
16
  "peerDependencies": {
17
- "@sisu-ai/core": "0.3.0"
17
+ "@sisu-ai/core": "1.0.0"
18
18
  },
19
19
  "repository": {
20
20
  "type": "git",