nothumanallowed 9.7.2 → 9.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ /**
2
+ * Web search + URL fetch tools for NHA CLI.
3
+ *
4
+ * - web_search: DuckDuckGo HTML scraping (zero API key, zero dependencies)
5
+ * - fetch_url: SSRF-protected HTML→text extraction
6
+ *
7
+ * Enterprise-grade security:
8
+ * - SSRF protection (private IP blocking, protocol validation, DNS pre-resolution)
9
+ * - Content-type allowlist (text/*, JSON, and XML only)
10
+ * - Size limits (100KB download, 8KB output)
11
+ * - Timeout protection (10s)
12
+ * - No binary/PDF/script content
13
+ *
14
+ * Zero npm dependencies — pure Node.js 22.
15
+ */
16
+
17
+ import { URL } from 'url';
18
+ import dns from 'dns/promises';
19
+ import net from 'net';
20
+
21
+ // ── Constants ────────────────────────────────────────────────────────────────
22
+
23
// Limits shared by the fetch and search helpers in this module.
const MAX_DOWNLOAD_BYTES = 1024 * 100; // 100KB read from a response body
const MAX_OUTPUT_CHARS = 8_000; // ~2K tokens
const FETCH_TIMEOUT_MS = 10_000; // 10s
const MAX_REDIRECTS = 5;
const MAX_RESULTS = 8; // default result count for webSearch

// Value sent in the User-Agent header of outbound requests.
const USER_AGENT = 'NHA-CLI/9.0 (NotHumanAllowed; +https://nothumanallowed.com)';
30
+
31
+ // ── SSRF Protection ──────────────────────────────────────────────────────────
32
+
33
/**
 * IPv4 ranges that MUST be blocked to prevent SSRF.
 *
 * Each entry is an inclusive { start, end } pair in dotted-quad notation.
 * Besides the RFC 1918 / loopback / link-local space, the list also covers
 * shared carrier-grade NAT space, the IETF protocol-assignment block,
 * multicast, and the reserved block (which includes the broadcast address):
 * none of these are legitimate public web destinations.
 */
const PRIVATE_RANGES = [
  { start: '0.0.0.0', end: '0.255.255.255' }, // "this network" (0.0.0.0/8)
  { start: '10.0.0.0', end: '10.255.255.255' }, // private (10.0.0.0/8)
  { start: '100.64.0.0', end: '100.127.255.255' }, // carrier-grade NAT (100.64.0.0/10)
  { start: '127.0.0.0', end: '127.255.255.255' }, // loopback (127.0.0.0/8)
  { start: '169.254.0.0', end: '169.254.255.255' }, // link-local (169.254.0.0/16)
  { start: '172.16.0.0', end: '172.31.255.255' }, // private (172.16.0.0/12)
  { start: '192.0.0.0', end: '192.0.0.255' }, // IETF protocol assignments (192.0.0.0/24)
  { start: '192.168.0.0', end: '192.168.255.255' }, // private (192.168.0.0/16)
  { start: '224.0.0.0', end: '239.255.255.255' }, // multicast (224.0.0.0/4)
  { start: '240.0.0.0', end: '255.255.255.255' }, // reserved + broadcast (240.0.0.0/4)
];
45
+
46
/**
 * Convert a dotted-quad IPv4 string into its unsigned 32-bit integer value.
 *
 * @param {string} ip - Address such as "192.168.1.1".
 * @returns {number} The address packed big-endian into one unsigned integer.
 */
function ipToLong(ip) {
  const [first, second, third, fourth] = ip.split('.').map(Number);
  const packed = (first << 24) | (second << 16) | (third << 8) | fourth;
  // Bitwise operators yield a signed 32-bit value; >>> 0 reinterprets it as unsigned.
  return packed >>> 0;
}
50
+
51
/**
 * Decide whether an address must be treated as private/internal.
 *
 * Fails closed: anything that is not a well-formed IPv4 address (IPv6,
 * hostnames, empty strings) is reported as private so that a caller which
 * blocks on `true` never lets an unclassified address through. The previous
 * implementation returned `false` here, which callers read as "allowed".
 *
 * @param {string} ip - Address to classify.
 * @returns {boolean} true when the address is non-IPv4 or inside PRIVATE_RANGES.
 */
function isPrivateIp(ip) {
  if (!net.isIPv4(ip)) return true; // not IPv4: block by default for safety

  const value = ipToLong(ip);
  return PRIVATE_RANGES.some(
    ({ start, end }) => value >= ipToLong(start) && value <= ipToLong(end),
  );
}
59
+
60
// IPv6 destinations that must never be fetched: unspecified, loopback,
// unique-local, link-local, deprecated site-local, and multicast.
const BLOCKED_IPV6 = new net.BlockList();
BLOCKED_IPV6.addAddress('::', 'ipv6');
BLOCKED_IPV6.addAddress('::1', 'ipv6');
BLOCKED_IPV6.addSubnet('fc00::', 7, 'ipv6');
BLOCKED_IPV6.addSubnet('fe80::', 10, 'ipv6');
BLOCKED_IPV6.addSubnet('fec0::', 10, 'ipv6');
BLOCKED_IPV6.addSubnet('ff00::', 8, 'ipv6');

/**
 * Return the dotted IPv4 address embedded in an IPv4-mapped IPv6 address
 * ("::ffff:127.0.0.1" or "::ffff:7f00:1"), or null when there is none.
 */
function extractMappedIpv4(address) {
  const dotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/i.exec(address);
  if (dotted) return dotted[1];

  const hex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i.exec(address);
  if (!hex) return null;

  const high = Number.parseInt(hex[1], 16);
  const low = Number.parseInt(hex[2], 16);
  return [high >> 8, high & 0xff, low >> 8, low & 0xff].join('.');
}

/**
 * Classify one IP address (either family) as blocked or allowed.
 * Anything that is not a recognisable IP address is blocked.
 */
function isBlockedAddress(address) {
  if (net.isIPv4(address)) return isPrivateIp(address);

  // Drop any zone id ("fe80::1%eth0") before classifying.
  const bare = address.split('%')[0].toLowerCase();
  if (!net.isIPv6(bare)) return true;

  // IPv4-mapped addresses reach the IPv4 host, so judge them by the IPv4 rules.
  const mapped = extractMappedIpv4(bare);
  if (mapped !== null) return isPrivateIp(mapped);

  return BLOCKED_IPV6.check(bare, 'ipv6');
}

/**
 * Validate a URL for SSRF safety.
 *
 * Checks, in order: the URL parses; the protocol is http(s); the host is not
 * a localhost name or numeric-encoded host; an IP-literal host is not in a
 * blocked range; and every address the hostname resolves to (IPv4 and IPv6)
 * is outside the blocked ranges.
 *
 * Resolution uses dns.lookup with { all: true } instead of dns.resolve4 so
 * that AAAA results are checked too. NOTE(review): this is assumed to match
 * the resolver fetch() uses when connecting; confirm. A DNS answer can still
 * change between this check and the connection, so this narrows but does not
 * eliminate DNS rebinding.
 *
 * @param {string} urlStr - URL to validate.
 * @returns {Promise<{ safe: true, hostname: string } | { safe: false, reason: string }>}
 */
async function validateUrl(urlStr) {
  let parsed;
  try {
    parsed = new URL(urlStr);
  } catch {
    return { safe: false, reason: 'Invalid URL' };
  }

  // Protocol check
  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
    return { safe: false, reason: `Blocked protocol: ${parsed.protocol}` };
  }

  const hostname = parsed.hostname.toLowerCase();
  // URL keeps brackets around IPv6 literals and may keep a trailing root dot;
  // strip both so comparisons and DNS see the bare host.
  const bareHost = hostname.replace(/^\[|\]$/g, '').replace(/\.$/, '');

  // Localhost detection (names, literals, and numeric encodings)
  if (
    bareHost === 'localhost' ||
    bareHost.endsWith('.localhost') ||
    bareHost === '0.0.0.0' ||
    bareHost === '::1' ||
    /^0x[0-9a-f]+$/i.test(bareHost) || // hex-encoded
    /^\d+$/.test(bareHost) // decimal-encoded
  ) {
    return { safe: false, reason: 'Blocked: localhost' };
  }

  // IP-literal hosts need no DNS: classify the address directly.
  if (net.isIP(bareHost) !== 0) {
    if (isBlockedAddress(bareHost)) {
      return { safe: false, reason: `Blocked: ${hostname} is a private or reserved IP address` };
    }
    return { safe: true, hostname };
  }

  // DNS pre-resolution to catch hostnames that point at internal addresses
  let records;
  try {
    records = await dns.lookup(bareHost, { all: true });
  } catch {
    // DNS resolution failed — hostname might not exist
    return { safe: false, reason: `DNS resolution failed for ${hostname}` };
  }
  if (!Array.isArray(records) || records.length === 0) {
    return { safe: false, reason: `DNS resolution failed for ${hostname}` };
  }

  for (const { address } of records) {
    if (isBlockedAddress(address)) {
      return { safe: false, reason: `Blocked: ${hostname} resolves to private IP ${address}` };
    }
  }

  return { safe: true, hostname };
}
105
+
106
+ // ── HTML → Text Extraction ───────────────────────────────────────────────────
107
+
108
// Named entities this module decodes; any other name is left untouched.
const HTML_NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['mdash', '—'],
  ['ndash', '–'],
  ['hellip', '...'],
  ['rsquo', "'"],
  ['lsquo', "'"],
  ['rdquo', '"'],
  ['ldquo', '"'],
]);

/**
 * Replacement callback for a single HTML entity match.
 * Unknown named entities are returned unchanged; numeric references that are
 * not valid Unicode scalar values become U+FFFD instead of throwing.
 */
function decodeHtmlEntity(match, hex, dec, name) {
  if (name !== undefined) {
    return HTML_NAMED_ENTITIES.get(name) ?? match;
  }

  const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  const isValid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
  // String.fromCodePoint throws RangeError above 0x10FFFF, so guard first.
  return isValid ? String.fromCodePoint(codePoint) : '\uFFFD';
}

/**
 * Extract readable text from HTML.
 *
 * Removes script/style/svg/noscript/nav/header/footer/aside/iframe elements
 * with their content, HTML comments, and all remaining tags; decodes numeric
 * and a small set of named entities; collapses whitespace.
 *
 * Entities are decoded in a single pass so that already-escaped text such as
 * "&amp;lt;" yields "&lt;" rather than being decoded twice into "<".
 *
 * @param {string} html - Raw HTML (or an HTML fragment).
 * @returns {string} Plain text with single spaces between words.
 */
function htmlToText(html) {
  return html
    // Drop whole elements whose content is never readable text
    .replace(/<(script|style|svg|noscript|nav|header|footer|aside|iframe)[^>]*>[\s\S]*?<\/\1>/gi, ' ')
    // Drop comments as a unit so commented-out markup and "-->" do not leak into the text
    .replace(/<!--[\s\S]*?-->/g, ' ')
    // Remove all remaining HTML tags
    .replace(/<[^>]+>/g, ' ')
    // Decode HTML entities (hex, decimal, named) in one pass
    .replace(/&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi, decodeHtmlEntity)
    // Collapse whitespace
    .replace(/\s+/g, ' ')
    .trim();
}
144
+
145
/**
 * Pull the text of the first <title> element out of an HTML document.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Title text run through htmlToText and capped at 200
 *   characters, or '' when the document has no <title>.
 */
function extractTitle(html) {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  return titleMatch ? htmlToText(titleMatch[1]).slice(0, 200) : '';
}
153
+
154
+ // ── Fetch with protection ────────────────────────────────────────────────────
155
+
156
/**
 * Fetch a URL with SSRF protection, size limits, and timeout.
 *
 * Redirects are followed manually: every hop (including the first URL) is
 * passed through validateUrl BEFORE a request is sent to it, and at most
 * MAX_REDIRECTS redirects are followed. Letting fetch follow redirects on its
 * own would contact an internal redirect target before it could be checked.
 *
 * One FETCH_TIMEOUT_MS timer covers the whole operation, including reading
 * the response body. At most MAX_DOWNLOAD_BYTES are read and at most
 * MAX_OUTPUT_CHARS are returned in `body`.
 *
 * @param {string} urlStr - URL to fetch.
 * @returns {Promise<
 *   { error: false, status: number, contentType: string, body: string,
 *     title: string, excerpt: string, truncated: boolean, url: string }
 *   | { error: true, message: string }>}
 *   Never rejects; failures are reported through `error: true`.
 */
export async function fetchUrl(urlStr) {
  const redirectStatuses = new Set([301, 302, 303, 307, 308]);
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

  try {
    let currentUrl = urlStr;
    let res;

    for (let redirects = 0; ; redirects++) {
      // Validate every hop before any request is sent to it.
      const validation = await validateUrl(currentUrl);
      if (!validation.safe) {
        return {
          error: true,
          message: redirects === 0 ? validation.reason : `Redirect blocked: ${validation.reason}`,
        };
      }

      res = await fetch(currentUrl, {
        headers: {
          'User-Agent': USER_AGENT,
          'Accept': 'text/html, text/plain, application/json, text/xml',
        },
        signal: controller.signal,
        redirect: 'manual',
      });

      const location = res.headers.get('location');
      if (!redirectStatuses.has(res.status) || !location) break;

      // Release the redirect response before moving on (best-effort).
      await res.body?.cancel().catch(() => {});

      if (redirects >= MAX_REDIRECTS) {
        return { error: true, message: `Too many redirects (limit ${MAX_REDIRECTS})` };
      }
      // Location may be relative; resolve it against the URL that produced it.
      currentUrl = new URL(location, currentUrl).href;
    }

    const contentType = (res.headers.get('content-type') || '').toLowerCase();

    // Content-type allowlist
    if (!contentType.startsWith('text/') && !contentType.includes('json') && !contentType.includes('xml')) {
      await res.body?.cancel().catch(() => {});
      return {
        error: true,
        message: `Blocked content-type: ${contentType}. Only text, JSON, and XML are allowed.`,
      };
    }

    // Read body with size limit (res.body is null for body-less responses)
    const chunks = [];
    let totalBytes = 0;
    let truncated = false;
    const reader = res.body?.getReader();

    while (reader) {
      const { done, value } = await reader.read();
      if (done) break;
      totalBytes += value.length;
      if (totalBytes > MAX_DOWNLOAD_BYTES) {
        truncated = true;
        // Take only the part that fits
        const overshoot = totalBytes - MAX_DOWNLOAD_BYTES;
        chunks.push(value.subarray(0, value.length - overshoot));
        // Stop the download instead of leaving the stream open (best-effort).
        await reader.cancel().catch(() => {});
        break;
      }
      chunks.push(value);
    }

    const decoder = new TextDecoder('utf-8', { fatal: false });
    const rawBody = decoder.decode(Buffer.concat(chunks));

    // Extract useful content
    let body;
    let title = '';

    if (contentType.includes('html')) {
      title = extractTitle(rawBody);
      body = htmlToText(rawBody);
    } else if (contentType.includes('json')) {
      try {
        body = JSON.stringify(JSON.parse(rawBody), null, 2);
      } catch {
        // Not valid JSON (possibly cut off by the size limit): return it as-is.
        body = rawBody;
      }
    } else {
      body = rawBody;
    }

    // Enforce output limit
    if (body.length > MAX_OUTPUT_CHARS) {
      body = body.slice(0, MAX_OUTPUT_CHARS) + `\n\n[... content truncated at ${MAX_OUTPUT_CHARS} chars]`;
      truncated = true;
    }

    const excerpt = body.slice(0, 200).replace(/\s+/g, ' ').trim();

    return {
      error: false,
      status: res.status,
      contentType,
      body,
      title,
      excerpt,
      truncated,
      url: res.url || currentUrl,
    };
  } catch (err) {
    if (err.name === 'AbortError') {
      return { error: true, message: `Request timed out (${FETCH_TIMEOUT_MS / 1000}s limit)` };
    }
    return { error: true, message: `Fetch failed: ${err.message}` };
  } finally {
    // Cleared here so the timer also bounds the body read, not just the headers.
    clearTimeout(timeout);
  }
}
269
+
270
+ // ── Web Search (DuckDuckGo HTML) ─────────────────────────────────────────────
271
+
272
/**
 * Search the web using DuckDuckGo HTML (no API key needed).
 * Parses the HTML results page to extract links, titles, and snippets.
 *
 * The FETCH_TIMEOUT_MS timer stays armed until the response body has been
 * read, so a slow body cannot hang the call. A query that is not a string,
 * or is shorter than two characters after trimming, yields the
 * "Query too short" error object rather than throwing.
 *
 * @param {string} query - Search query
 * @param {number} maxResults - Max results to return (default 8)
 * @returns {Promise<
 *   { error: false, query: string, resultCount: number,
 *     results: Array<{ title: string, url: string, snippet: string }> }
 *   | { error: true, message: string }>}
 */
export async function webSearch(query, maxResults = MAX_RESULTS) {
  const trimmedQuery = typeof query === 'string' ? query.trim() : '';
  if (trimmedQuery.length < 2) {
    return { error: true, message: 'Query too short' };
  }

  const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(trimmedQuery)}`;

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

  try {
    const res = await fetch(searchUrl, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html',
        'Accept-Language': 'en-US,en;q=0.9',
      },
      signal: controller.signal,
    });

    if (!res.ok) {
      return { error: true, message: `DuckDuckGo returned ${res.status}` };
    }

    const html = await res.text();
    const results = parseDuckDuckGoResults(html, maxResults);

    return {
      error: false,
      query: trimmedQuery,
      resultCount: results.length,
      results,
    };
  } catch (err) {
    if (err.name === 'AbortError') {
      return { error: true, message: `Search timed out (${FETCH_TIMEOUT_MS / 1000}s limit)` };
    }
    return { error: true, message: `Search failed: ${err.message}` };
  } finally {
    // Cleared here so the timer also bounds res.text(), not just the headers.
    clearTimeout(timeout);
  }
}
324
+
325
/**
 * Parse a DuckDuckGo HTML results page into result records.
 *
 * The page is split on `class="result ` markers; within each piece the
 * `result__a` anchor supplies the URL and title and the `result__snippet`
 * element supplies the snippet. Pieces lacking a URL or a title are skipped.
 *
 * @param {string} html - Results page markup.
 * @param {number} maxResults - Stop once this many results are collected.
 * @returns {Array<{ title: string, url: string, snippet: string }>}
 */
function parseDuckDuckGoResults(html, maxResults) {
  const collected = [];
  // The text before the first marker is page chrome, not a result.
  const pieces = html.split(/class="result\s/).slice(1);

  for (const piece of pieces) {
    if (!(collected.length < maxResults)) break;

    // URL: DuckDuckGo links through a redirect such as
    // //duckduckgo.com/l/?uddg=ENCODED_URL&... so unwrap the real destination.
    const hrefMatch = piece.match(/class="result__a"\s+href="([^"]+)"/);
    let url = hrefMatch ? hrefMatch[1] : '';
    if (hrefMatch) {
      const wrapped = url.match(/uddg=([^&]+)/);
      if (wrapped) {
        try {
          url = decodeURIComponent(wrapped[1]);
        } catch {
          // Malformed percent-encoding: keep the raw value.
          url = wrapped[1];
        }
      }
      // Protocol-relative URLs get an explicit scheme.
      if (url.startsWith('//')) url = `https:${url}`;
    }

    // Title: inner markup of the result anchor, flattened to text.
    const titleMatch = piece.match(/class="result__a"[^>]*>([\s\S]*?)<\/a>/);
    const title = titleMatch ? htmlToText(titleMatch[1]).trim() : '';

    // Snippet: prefer the anchor form, fall back to any closing tag.
    const snippetMatch =
      piece.match(/class="result__snippet"[^>]*>([\s\S]*?)<\/a>/) ||
      piece.match(/class="result__snippet"[^>]*>([\s\S]*?)<\//);
    const snippet = snippetMatch ? htmlToText(snippetMatch[1]).trim() : '';

    if (url && title) {
      collected.push({ title, url, snippet: snippet.slice(0, 300) });
    }
  }

  return collected;
}
384
+
385
/**
 * Deep search: run webSearch, then fetch the top results for full content.
 *
 * Pages that cannot be fetched are left out of `deepResults`; they still
 * appear in `results`. A failed search is returned unchanged.
 *
 * @param {string} query
 * @param {number} fetchCount - How many top results to fetch (default 3)
 * @returns {Promise<object>} The search error object, or
 *   { error: false, query, resultCount, results, deepFetched, deepResults }.
 */
export async function webSearchDeep(query, fetchCount = 3) {
  const searchResult = await webSearch(query);
  if (searchResult.error) return searchResult;

  // Resolves to the enriched record for one hit, or null when the page is unusable.
  const fetchHit = async (hit) => {
    try {
      const page = await fetchUrl(hit.url);
      if (page.error) return null;
      return {
        title: page.title || hit.title,
        url: hit.url,
        snippet: hit.snippet,
        content: page.body,
        excerpt: page.excerpt,
      };
    } catch {
      // Best-effort: one failing page must not fail the whole search.
      return null;
    }
  };

  const topHits = searchResult.results.slice(0, fetchCount);
  const outcomes = await Promise.allSettled(topHits.map((hit) => fetchHit(hit)));

  const deepResults = outcomes
    .filter((outcome) => outcome.status === 'fulfilled' && outcome.value)
    .map((outcome) => outcome.value);

  return {
    error: false,
    query: query.trim(),
    resultCount: searchResult.results.length,
    results: searchResult.results,
    deepFetched: deepResults.length,
    deepResults,
  };
}