webpeel 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +415 -0
  3. package/dist/cli.d.ts +16 -0
  4. package/dist/cli.d.ts.map +1 -0
  5. package/dist/cli.js +140 -0
  6. package/dist/cli.js.map +1 -0
  7. package/dist/core/fetcher.d.ts +32 -0
  8. package/dist/core/fetcher.d.ts.map +1 -0
  9. package/dist/core/fetcher.js +479 -0
  10. package/dist/core/fetcher.js.map +1 -0
  11. package/dist/core/markdown.d.ts +17 -0
  12. package/dist/core/markdown.d.ts.map +1 -0
  13. package/dist/core/markdown.js +143 -0
  14. package/dist/core/markdown.js.map +1 -0
  15. package/dist/core/metadata.d.ts +17 -0
  16. package/dist/core/metadata.d.ts.map +1 -0
  17. package/dist/core/metadata.js +159 -0
  18. package/dist/core/metadata.js.map +1 -0
  19. package/dist/core/strategies.d.ts +30 -0
  20. package/dist/core/strategies.d.ts.map +1 -0
  21. package/dist/core/strategies.js +67 -0
  22. package/dist/core/strategies.js.map +1 -0
  23. package/dist/index.d.ts +31 -0
  24. package/dist/index.d.ts.map +1 -0
  25. package/dist/index.js +81 -0
  26. package/dist/index.js.map +1 -0
  27. package/dist/mcp/server.d.ts +7 -0
  28. package/dist/mcp/server.d.ts.map +1 -0
  29. package/dist/mcp/server.js +248 -0
  30. package/dist/mcp/server.js.map +1 -0
  31. package/dist/server/app.d.ts +13 -0
  32. package/dist/server/app.d.ts.map +1 -0
  33. package/dist/server/app.js +89 -0
  34. package/dist/server/app.js.map +1 -0
  35. package/dist/server/auth-store.d.ts +28 -0
  36. package/dist/server/auth-store.d.ts.map +1 -0
  37. package/dist/server/auth-store.js +87 -0
  38. package/dist/server/auth-store.js.map +1 -0
  39. package/dist/server/middleware/auth.d.ts +18 -0
  40. package/dist/server/middleware/auth.d.ts.map +1 -0
  41. package/dist/server/middleware/auth.js +55 -0
  42. package/dist/server/middleware/auth.js.map +1 -0
  43. package/dist/server/middleware/rate-limit.d.ts +23 -0
  44. package/dist/server/middleware/rate-limit.d.ts.map +1 -0
  45. package/dist/server/middleware/rate-limit.js +85 -0
  46. package/dist/server/middleware/rate-limit.js.map +1 -0
  47. package/dist/server/routes/fetch.d.ts +7 -0
  48. package/dist/server/routes/fetch.d.ts.map +1 -0
  49. package/dist/server/routes/fetch.js +127 -0
  50. package/dist/server/routes/fetch.js.map +1 -0
  51. package/dist/server/routes/health.d.ts +6 -0
  52. package/dist/server/routes/health.d.ts.map +1 -0
  53. package/dist/server/routes/health.js +19 -0
  54. package/dist/server/routes/health.js.map +1 -0
  55. package/dist/server/routes/search.d.ts +7 -0
  56. package/dist/server/routes/search.d.ts.map +1 -0
  57. package/dist/server/routes/search.js +124 -0
  58. package/dist/server/routes/search.js.map +1 -0
  59. package/dist/types.d.ts +59 -0
  60. package/dist/types.d.ts.map +1 -0
  61. package/dist/types.js +30 -0
  62. package/dist/types.js.map +1 -0
  63. package/llms.txt +60 -0
  64. package/package.json +80 -0
@@ -0,0 +1,479 @@
1
+ /**
2
+ * Core fetching logic: simple HTTP and browser-based fetching
3
+ */
4
+ import { chromium } from 'playwright';
5
+ import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
6
/**
 * Desktop browser User-Agent strings used when the caller does not supply one.
 */
const USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
];
/**
 * Pick one entry of USER_AGENTS using Math.random().
 *
 * @returns {string} A User-Agent header value.
 */
function getRandomUserAgent() {
    const position = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[position];
}
16
/**
 * SECURITY: Validate URL to prevent SSRF attacks.
 * Blocks localhost, private IPs, link-local, and various bypass techniques.
 *
 * The hostname is normalised (lower-cased, trailing root dots removed) before
 * any check so that fully qualified spellings such as "localhost." cannot slip
 * past the localhost filter.
 *
 * @param {string} urlString - Absolute URL supplied by the caller or by a redirect.
 * @returns {void}
 * @throws {WebPeelError} When the URL is malformed, is not HTTP(S), or targets
 *   a local/private/reserved host.
 */
function validateUrl(urlString) {
    // Length check
    if (urlString.length > 2048) {
        throw new WebPeelError('URL too long (max 2048 characters)');
    }
    // Check for control characters and suspicious encoding
    if (/[\x00-\x1F\x7F]/.test(urlString)) {
        throw new WebPeelError('URL contains invalid control characters');
    }
    let url;
    try {
        url = new URL(urlString);
    }
    catch {
        throw new WebPeelError('Invalid URL format');
    }
    // Only allow HTTP(S)
    if (!['http:', 'https:'].includes(url.protocol)) {
        throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
    }
    // Validate hostname is not empty
    if (!url.hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // "localhost." resolves exactly like "localhost"; strip the root dot(s)
    // so the comparisons below see the canonical name.
    const hostname = url.hostname.toLowerCase().replace(/\.+$/, '');
    if (!hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // Block localhost patterns (exact match or any sub-domain of them)
    const localhostPatterns = ['localhost', '0.0.0.0'];
    if (localhostPatterns.some((pattern) => hostname === pattern || hostname.endsWith(`.${pattern}`))) {
        throw new WebPeelError('Access to localhost is not allowed');
    }
    // Parse and validate IP addresses (handles hex, octal, decimal, mixed)
    const ipv4Info = parseAndValidateIPv4(hostname);
    if (ipv4Info) {
        validateIPv4Address(ipv4Info);
    }
    // Only IPv6 literals can contain ':' in a URL hostname
    if (hostname.includes(':')) {
        validateIPv6Address(hostname);
    }
}
60
/**
 * Parse an IPv4 address written in any inet_aton-style notation.
 *
 * Accepted forms: one to four dot-separated parts, each part decimal,
 * octal (leading "0") or hexadecimal (leading "0x"). The last part fills all
 * remaining bytes, so "127.1", "0x7f000001", "017700000001", "0177.0.0.1",
 * "0x7f.0.0.1" and "2130706433" all yield [127, 0, 0, 1]. A part with a
 * leading zero that contains 8 or 9 is read as decimal.
 *
 * @param {string} hostname - Hostname, optionally wrapped in brackets.
 * @returns {number[] | null} The four octets, or null when the hostname is
 *   not a numeric IPv4 form (i.e. it is a DNS name or an IPv6 literal).
 * @throws {WebPeelError} When every part is numeric but a value is out of
 *   range for its position (e.g. "999.1.1.1" or "4294967296").
 */
function parseAndValidateIPv4(hostname) {
    // Remove brackets if present
    const cleaned = hostname.replace(/^\[|\]$/g, '');
    const parts = cleaned.split('.');
    // Tolerate a single trailing root dot ("127.0.0.1.")
    if (parts.length > 1 && parts[parts.length - 1] === '') {
        parts.pop();
    }
    if (parts.length > 4) {
        return null;
    }
    const values = [];
    for (const part of parts) {
        let parsed;
        if (/^0x[0-9a-f]+$/i.test(part)) {
            parsed = parseInt(part.slice(2), 16);
        }
        else if (/^0[0-7]+$/.test(part)) {
            parsed = parseInt(part, 8);
        }
        else if (/^\d+$/.test(part)) {
            parsed = parseInt(part, 10);
        }
        else {
            // Any non-numeric label means this is a name, not an address
            return null;
        }
        values.push(parsed);
    }
    // Every part except the last is exactly one byte
    const lastValue = values.pop();
    if (values.some((v) => v > 255)) {
        throw new WebPeelError('Invalid IPv4 address');
    }
    // The last part supplies all bytes not given by the leading parts.
    // Plain arithmetic is used instead of bit shifts so that values above
    // 2^32 are rejected rather than silently truncated to 32 bits.
    const remainingBytes = 4 - values.length;
    if (lastValue >= 256 ** remainingBytes) {
        throw new WebPeelError('Invalid IPv4 address');
    }
    const octets = [...values];
    for (let shift = remainingBytes - 1; shift >= 0; shift--) {
        octets.push(Math.floor(lastValue / 256 ** shift) % 256);
    }
    return octets;
}
124
/**
 * Validate an IPv4 address against private/reserved ranges.
 *
 * Rejects loopback, RFC 1918 private space, shared address space (CGNAT),
 * IETF protocol assignments, link-local, benchmarking, multicast, reserved
 * and broadcast addresses, and the "this network" block. Returns normally
 * for every other address.
 *
 * @param {number[]} octets - The four octets of the address, most significant first.
 * @returns {void}
 * @throws {WebPeelError} When the address falls in a non-public range.
 */
function validateIPv4Address(octets) {
    const [a, b, c, d] = octets;
    // Loopback: 127.0.0.0/8
    if (a === 127) {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // Private: 10.0.0.0/8
    if (a === 10) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 172.16.0.0/12
    if (a === 172 && b >= 16 && b <= 31) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 192.168.0.0/16
    if (a === 192 && b === 168) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Shared address space (carrier-grade NAT): 100.64.0.0/10
    if (a === 100 && b >= 64 && b <= 127) {
        throw new WebPeelError('Access to shared address space (CGNAT) is not allowed');
    }
    // IETF protocol assignments: 192.0.0.0/24
    if (a === 192 && b === 0 && c === 0) {
        throw new WebPeelError('Access to reserved IP addresses is not allowed');
    }
    // Link-local: 169.254.0.0/16
    if (a === 169 && b === 254) {
        throw new WebPeelError('Access to link-local addresses is not allowed');
    }
    // Benchmarking: 198.18.0.0/15
    if (a === 198 && (b === 18 || b === 19)) {
        throw new WebPeelError('Access to reserved IP addresses is not allowed');
    }
    // Broadcast: 255.255.255.255 (checked before 240/4 to keep its own message)
    if (a === 255 && b === 255 && c === 255 && d === 255) {
        throw new WebPeelError('Access to broadcast address is not allowed');
    }
    // Multicast: 224.0.0.0/4
    if (a >= 224 && a <= 239) {
        throw new WebPeelError('Access to multicast addresses is not allowed');
    }
    // Reserved for future use: 240.0.0.0/4
    if (a >= 240) {
        throw new WebPeelError('Access to reserved IP addresses is not allowed');
    }
    // This network: 0.0.0.0/8
    if (a === 0) {
        throw new WebPeelError('Access to "this network" addresses is not allowed');
    }
}
158
/**
 * Expand a lower-case textual IPv6 address into its eight 16-bit groups.
 * Handles "::" compression and a trailing dotted-quad IPv4 suffix.
 *
 * @param {string} addr - IPv6 address text without brackets.
 * @returns {number[] | null} Eight integers in 0..0xffff, or null when the
 *   text is not a well-formed IPv6 address.
 */
function expandIPv6Groups(addr) {
    let text = addr;
    // Rewrite an embedded IPv4 tail ("::ffff:192.168.1.1") as two hex groups
    const lastColon = text.lastIndexOf(':');
    const tail = text.slice(lastColon + 1);
    if (tail.includes('.')) {
        const quad = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(tail);
        if (!quad) {
            return null;
        }
        const bytes = quad.slice(1).map(Number);
        if (bytes.some((byte) => byte > 255)) {
            return null;
        }
        const high = ((bytes[0] << 8) | bytes[1]).toString(16);
        const low = ((bytes[2] << 8) | bytes[3]).toString(16);
        text = `${text.slice(0, lastColon + 1)}${high}:${low}`;
    }
    const halves = text.split('::');
    if (halves.length > 2) {
        return null;
    }
    const compressed = halves.length === 2;
    const head = halves[0] ? halves[0].split(':') : [];
    const rest = compressed && halves[1] ? halves[1].split(':') : [];
    const missing = 8 - head.length - rest.length;
    // "::" must stand for at least one group; without it all 8 must be present
    if (compressed ? missing < 1 : missing !== 0) {
        return null;
    }
    const groups = [...head, ...Array(compressed ? missing : 0).fill('0'), ...rest];
    if (!groups.every((group) => /^[0-9a-f]{1,4}$/.test(group))) {
        return null;
    }
    return groups.map((group) => parseInt(group, 16));
}
/**
 * Validate an IPv6 address against private/reserved ranges.
 *
 * The address is expanded to its eight groups first, so compressed and
 * uncompressed spellings are treated identically and prefix checks are done
 * on numeric values rather than on text.
 *
 * Rejected: unspecified (::), loopback (::1), all IPv4-mapped
 * (::ffff:0:0/96) and IPv4-compatible (::/96) addresses, unique local
 * (fc00::/7) and link-local (fe80::/10). For the two IPv4-embedding forms the
 * embedded IPv4 address is validated first so that a more specific error is
 * raised when it is itself private.
 *
 * @param {string} hostname - IPv6 literal, with or without brackets.
 * @returns {void}
 * @throws {WebPeelError} When the address is malformed or non-public.
 */
function validateIPv6Address(hostname) {
    // Remove brackets
    const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
    const groups = expandIPv6Groups(addr);
    if (!groups) {
        // Fail closed: an address we cannot parse cannot be proven public
        throw new WebPeelError('Invalid IPv6 address');
    }
    const [g0, , , , , g5, g6, g7] = groups;
    const firstFiveZero = groups.slice(0, 5).every((group) => group === 0);
    // The low 32 bits, read as an IPv4 address
    const embeddedIPv4 = [g6 >>> 8, g6 & 0xff, g7 >>> 8, g7 & 0xff];
    if (firstFiveZero && g5 === 0) {
        // Unspecified: ::
        if (g6 === 0 && g7 === 0) {
            throw new WebPeelError('Access to unspecified IPv6 address is not allowed');
        }
        // Loopback: ::1
        if (g6 === 0 && g7 === 1) {
            throw new WebPeelError('Access to loopback addresses is not allowed');
        }
        // IPv4-compatible (deprecated): ::a.b.c.d
        validateIPv4Address(embeddedIPv4);
        throw new WebPeelError('Access to IPv4-compatible IPv6 addresses is not allowed');
    }
    // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:0101
    if (firstFiveZero && g5 === 0xffff) {
        validateIPv4Address(embeddedIPv4);
        throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
    }
    // Unique local addresses: fc00::/7 (fc00:: to fdff::)
    if ((g0 & 0xfe00) === 0xfc00) {
        throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
    }
    // Link-local: fe80::/10
    if ((g0 & 0xffc0) === 0xfe80) {
        throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
    }
}
209
/**
 * Check that a caller-supplied User-Agent is safe to send as a header value.
 *
 * @param {string} userAgent - Candidate header value.
 * @returns {string} The same string, unchanged, when it passes both checks.
 * @throws {WebPeelError} When it is longer than 500 characters or contains
 *   anything other than printable ASCII.
 */
function validateUserAgent(userAgent) {
    const exceedsLimit = userAgent.length > 500;
    if (exceedsLimit) {
        throw new WebPeelError('User agent too long (max 500 characters)');
    }
    // Printable ASCII only (space through tilde)
    const printableOnly = /^[\x20-\x7E]*$/;
    if (printableOnly.test(userAgent)) {
        return userAgent;
    }
    throw new WebPeelError('User agent contains invalid characters');
}
222
/**
 * Simple HTTP fetch using the runtime's native fetch.
 * Fast and lightweight, but can be blocked by Cloudflare/bot detection.
 * SECURITY: Manual redirect handling with SSRF re-validation on every hop.
 *
 * The timeout applies to each hop as a whole: connecting, receiving headers
 * AND streaming the body. Bodies that are not going to be read (redirects,
 * error statuses, unsupported content types) are cancelled so the underlying
 * connection is released.
 *
 * @param {string} url - Absolute HTTP(S) URL to fetch.
 * @param {string} [userAgent] - Optional User-Agent; a random one is used when omitted.
 * @param {number} [timeoutMs=30000] - Per-hop timeout in milliseconds.
 * @returns {Promise<{html: string, url: string, statusCode: number}>} The
 *   decoded HTML, the final URL after redirects and the HTTP status.
 * @throws {WebPeelError} Invalid/unsafe URL, redirect loop, too many
 *   redirects, non-HTML content type, or body larger than 10MB.
 * @throws {BlockedError} HTTP 403/503, tiny response, or Cloudflare challenge.
 * @throws {TimeoutError} When a hop exceeds timeoutMs.
 * @throws {NetworkError} Any other HTTP or transport failure.
 */
export async function simpleFetch(url, userAgent, timeoutMs = 30000) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    // Validate user agent if provided
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    const MAX_REDIRECTS = 10;
    const MAX_SIZE = 10 * 1024 * 1024; // 10MB
    // Best-effort release of a body we will not read; failure here only means
    // the stream was already closed or errored, so it is safe to ignore.
    const discardBody = async (response) => {
        try {
            await response.body?.cancel();
        }
        catch {
            // nothing left to release
        }
    };
    let redirectCount = 0;
    let currentUrl = url;
    const seenUrls = new Set();
    while (redirectCount <= MAX_REDIRECTS) {
        // Detect redirect loops
        if (seenUrls.has(currentUrl)) {
            throw new WebPeelError('Redirect loop detected');
        }
        seenUrls.add(currentUrl);
        // Re-validate on each redirect
        validateUrl(currentUrl);
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), timeoutMs);
        try {
            const response = await fetch(currentUrl, {
                headers: {
                    'User-Agent': validatedUserAgent,
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                },
                signal: controller.signal,
                redirect: 'manual', // SECURITY: Manual redirect handling
            });
            // Handle redirects manually
            if (response.status >= 300 && response.status < 400) {
                await discardBody(response);
                const location = response.headers.get('location');
                if (!location) {
                    throw new NetworkError('Redirect response missing Location header');
                }
                // Resolve relative URLs
                currentUrl = new URL(location, currentUrl).href;
                redirectCount++;
                continue;
            }
            if (!response.ok) {
                await discardBody(response);
                if (response.status === 403 || response.status === 503) {
                    throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
                }
                throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`);
            }
            // SECURITY: Validate Content-Type
            const contentType = response.headers.get('content-type') || '';
            if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
                await discardBody(response);
                throw new WebPeelError('Unsupported content type. Only HTML is supported.');
            }
            // SECURITY: Stream response with size limit (prevent memory exhaustion)
            const reader = response.body?.getReader();
            if (!reader) {
                throw new NetworkError('Response body is not readable');
            }
            const chunks = [];
            let totalSize = 0;
            try {
                while (true) {
                    const { done, value } = await reader.read();
                    if (done) {
                        break;
                    }
                    totalSize += value.length;
                    if (totalSize > MAX_SIZE) {
                        await reader.cancel().catch(() => { });
                        throw new WebPeelError('Response too large (max 10MB)');
                    }
                    chunks.push(value);
                }
            }
            finally {
                reader.releaseLock();
            }
            // Combine chunks
            const combined = new Uint8Array(totalSize);
            let offset = 0;
            for (const chunk of chunks) {
                combined.set(chunk, offset);
                offset += chunk.length;
            }
            const html = new TextDecoder().decode(combined);
            if (!html || html.length < 100) {
                throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
            }
            // Check for Cloudflare challenge
            if (html.includes('cf-browser-verification') || html.includes('Just a moment...')) {
                throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
            }
            return {
                html,
                url: currentUrl,
                statusCode: response.status,
            };
        }
        catch (error) {
            if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
                throw error;
            }
            // The controller is only ever aborted by our own timer
            if (controller.signal.aborted || (error instanceof Error && error.name === 'AbortError')) {
                throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
            }
            throw new NetworkError(`Failed to fetch: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        finally {
            // Cleared here (not right after the headers arrive) so the timeout
            // also bounds the time spent streaming the body.
            clearTimeout(timer);
        }
    }
    throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
}
341
// Single Chromium instance reused by every browserFetch call.
let sharedBrowser = null;
// In-flight launch, shared so concurrent callers do not each start a browser.
let browserLaunchPromise = null;
// Number of pages currently open; compared against MAX_CONCURRENT_PAGES.
let activePagesCount = 0;
const MAX_CONCURRENT_PAGES = 5;
/**
 * Return the shared headless Chromium instance, launching it on first use or
 * after the previous instance has disconnected.
 *
 * Concurrent callers that arrive while a launch is in progress all await the
 * same launch, so at most one browser process is started at a time.
 *
 * @returns {Promise<object>} The connected Playwright browser.
 */
async function getBrowser() {
    // SECURITY: Check if browser is still connected and healthy
    if (sharedBrowser) {
        try {
            if (sharedBrowser.isConnected()) {
                return sharedBrowser;
            }
        }
        catch {
            // Browser is dead, recreate below
        }
        sharedBrowser = null;
    }
    if (!browserLaunchPromise) {
        browserLaunchPromise = chromium
            .launch({ headless: true })
            .then((browser) => {
                sharedBrowser = browser;
                return browser;
            })
            .finally(() => {
                // Allow a fresh attempt after either success or failure
                browserLaunchPromise = null;
            });
    }
    return browserLaunchPromise;
}
360
/**
 * Fetch using headless Chromium via Playwright.
 * Slower but can handle JavaScript-heavy sites and bypass some bot detection.
 *
 * At most MAX_CONCURRENT_PAGES pages are open at once; additional calls poll
 * for a free slot for up to 30 seconds. Images, fonts, media and stylesheets
 * are blocked for speed. The URL the page finally lands on is re-validated so
 * content reached through a redirect to a local/private host is not returned.
 *
 * @param {string} url - Absolute HTTP(S) URL to load.
 * @param {object} [options]
 * @param {string} [options.userAgent] - Optional User-Agent; random when omitted.
 * @param {number} [options.waitMs=0] - Extra wait after DOMContentLoaded (0-60000).
 * @param {number} [options.timeoutMs=30000] - Overall timeout for navigation and capture.
 * @returns {Promise<{html: string, url: string}>} Rendered HTML and final URL.
 * @throws {WebPeelError} Unsafe URL (initial or final), bad waitMs, or HTML over 10MB.
 * @throws {TimeoutError} Queue wait or page operation exceeded its limit.
 * @throws {BlockedError} The page produced an empty or tiny document.
 * @throws {NetworkError} Any other browser failure.
 */
export async function browserFetch(url, options = {}) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    const { userAgent, waitMs = 0, timeoutMs = 30000 } = options;
    // Validate user agent if provided
    const validatedUserAgent = userAgent ? validateUserAgent(userAgent) : getRandomUserAgent();
    // Validate wait time
    if (waitMs < 0 || waitMs > 60000) {
        throw new WebPeelError('Wait time must be between 0 and 60000ms');
    }
    // SECURITY: Limit concurrent browser pages with timeout
    const queueStartTime = Date.now();
    const QUEUE_TIMEOUT_MS = 30000; // 30 second max wait
    while (activePagesCount >= MAX_CONCURRENT_PAGES) {
        if (Date.now() - queueStartTime > QUEUE_TIMEOUT_MS) {
            throw new TimeoutError('Browser page queue timeout - too many concurrent requests');
        }
        await new Promise((resolve) => setTimeout(resolve, 100));
    }
    activePagesCount++;
    let page = null;
    let timeoutTimer = null;
    try {
        const browser = await getBrowser();
        page = await browser.newPage({
            userAgent: validatedUserAgent,
        });
        // Block images, fonts, and other heavy resources for speed
        await page.route('**/*', async (route) => {
            try {
                const resourceType = route.request().resourceType();
                if (['image', 'font', 'media', 'stylesheet'].includes(resourceType)) {
                    await route.abort();
                }
                else {
                    await route.continue();
                }
            }
            catch {
                // The page was closed while this request was in flight;
                // there is nothing left to abort or continue.
            }
        });
        // SECURITY: Wrap entire operation in timeout
        const fetchPromise = (async () => {
            await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: timeoutMs,
            });
            // Wait for additional time if requested (for dynamic content)
            if (waitMs > 0) {
                await page.waitForTimeout(waitMs);
            }
            const html = await page.content();
            const finalUrl = page.url();
            return { html, finalUrl };
        })();
        const timeoutPromise = new Promise((_, reject) => {
            timeoutTimer = setTimeout(() => reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`)), timeoutMs);
        });
        const { html, finalUrl } = await Promise.race([fetchPromise, timeoutPromise]);
        // SECURITY: The browser follows redirects on its own, so re-check
        // where we actually ended up before handing any content back.
        validateUrl(finalUrl);
        // SECURITY: Limit HTML size
        if (html.length > 10 * 1024 * 1024) { // 10MB limit
            throw new WebPeelError('Response too large (max 10MB)');
        }
        if (!html || html.length < 100) {
            throw new BlockedError('Empty or suspiciously small response from browser.');
        }
        return {
            html,
            url: finalUrl,
        };
    }
    catch (error) {
        if (error instanceof BlockedError || error instanceof WebPeelError || error instanceof TimeoutError) {
            throw error;
        }
        if (error instanceof Error && error.message.includes('Timeout')) {
            throw new TimeoutError(`Browser navigation timed out`);
        }
        throw new NetworkError(`Browser fetch failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    finally {
        // Without this the pending timer keeps the event loop alive for up to
        // timeoutMs after the fetch has already finished.
        if (timeoutTimer !== null) {
            clearTimeout(timeoutTimer);
        }
        // CRITICAL: Always close page and decrement counter
        if (page) {
            await page.close().catch(() => { });
        }
        activePagesCount--;
    }
}
447
/**
 * Run a fetch operation, retrying failures with exponential backoff.
 *
 * The delay before retry k (1-based) is baseDelayMs * 2^(k-1). BlockedError
 * and TimeoutError are rethrown immediately without further attempts.
 *
 * @param {() => Promise<*>} fn - Operation to run.
 * @param {number} [maxAttempts=3] - Maximum number of calls to fn.
 * @param {number} [baseDelayMs=1000] - Delay before the first retry.
 * @returns {Promise<*>} Whatever fn resolves with.
 * @throws The last error seen once attempts are exhausted, or a NetworkError
 *   when fn was never called.
 */
export async function retryFetch(fn, maxAttempts = 3, baseDelayMs = 1000) {
    let lastError = null;
    let attempt = 1;
    while (attempt <= maxAttempts) {
        try {
            const outcome = await fn();
            return outcome;
        }
        catch (error) {
            if (error instanceof Error) {
                lastError = error;
            }
            else {
                lastError = new Error('Unknown error');
            }
            // Don't retry on blocked errors or timeouts
            const giveUpNow = error instanceof BlockedError || error instanceof TimeoutError;
            if (giveUpNow) {
                throw error;
            }
            const retriesRemain = attempt < maxAttempts;
            if (retriesRemain) {
                const pauseMs = baseDelayMs * 2 ** (attempt - 1);
                await new Promise((wake) => {
                    setTimeout(wake, pauseMs);
                });
            }
        }
        attempt += 1;
    }
    throw lastError || new NetworkError('Retry failed');
}
470
/**
 * Clean up browser resources.
 *
 * The shared reference is dropped before the browser is closed, so a failing
 * close() can never leave a dead handle cached, and a second cleanup() call
 * made while the first is still closing is a no-op.
 *
 * @returns {Promise<void>}
 * @throws Whatever the browser's close() rejects with.
 */
export async function cleanup() {
    if (!sharedBrowser) {
        return;
    }
    const browser = sharedBrowser;
    sharedBrowser = null;
    await browser.close();
}
479
+ //# sourceMappingURL=fetcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,QAAQ,EAA2B,MAAM,YAAY,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAErF,MAAM,WAAW,GAAG;IAClB,uHAAuH;IACvH,iHAAiH;IACjH,uHAAuH;IACvH,kFAAkF;IAClF,uGAAuG;CACxG,CAAC;AAEF,SAAS,kBAAkB;IACzB,OAAO,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC;AACrE,CAAC;AAED;;;GAGG;AACH,SAAS,WAAW,CAAC,SAAiB;IACpC,eAAe;IACf,IAAI,SAAS,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QAC5B,MAAM,IAAI,YAAY,CAAC,oCAAoC,CAAC,CAAC;IAC/D,CAAC;IAED,uDAAuD;IACvD,IAAI,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,YAAY,CAAC,yCAAyC,CAAC,CAAC;IACpE,CAAC;IAED,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,YAAY,CAAC,oBAAoB,CAAC,CAAC;IAC/C,CAAC;IAED,qBAAqB;IACrB,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;QAChD,MAAM,IAAI,YAAY,CAAC,2CAA2C,CAAC,CAAC;IACtE,CAAC;IAED,iCAAiC;IACjC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAClB,MAAM,IAAI,YAAY,CAAC,kBAAkB,CAAC,CAAC;IAC7C,CAAC;IAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IAE5C,2BAA2B;IAC3B,MAAM,iBAAiB,GAAG,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;IACnD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,QAAQ,KAAK,OAAO,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,GAAG,OAAO,CAAC,CAAC,EAAE,CAAC;QAChG,MAAM,IAAI,YAAY,CAAC,oCAAoC,CAAC,CAAC;IAC/D,CAAC;IAED,iFAAiF;IACjF,MAAM,QAAQ,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAChD,IAAI,QAAQ,EAAE,CAAC;QACb,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAChC,CAAC;IAED,0CAA0C;IAC1C,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAChC,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,oBAAoB,CAAC,QAAgB;IAC5C,6BAA6B;IAC7B,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;IAEjD,wCAAwC;IACxC,MAAM,WAAW,GAAG,8CAA8C,CAAC;IACnE,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;IAC/C,IAAI,WAAW,EAAE,CAAC;QAChB,MAAM,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,
CAAC,MAAM,CAAC,CAAC;QAChD,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YAC1C,OAAO,MAAM,CAAC;QAChB,CAAC;QACD,MAAM,IAAI,YAAY,CAAC,sBAAsB,CAAC,CAAC;IACjD,CAAC;IAED,2BAA2B;IAC3B,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,OAAO;YACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;YACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;YACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;YAClB,GAAG,GAAG,IAAI;SACX,CAAC;IACJ,CAAC;IAED,wDAAwD;IACxD,IAAI,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC5B,0BAA0B;QAC1B,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YAC9B,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjC,IAAI,GAAG,IAAI,UAAU,EAAE,CAAC;gBACtB,OAAO;oBACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;oBAClB,GAAG,GAAG,IAAI;iBACX,CAAC;YACJ,CAAC;QACH,CAAC;QACD,kCAAkC;QAClC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YACvE,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAC1C,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QAClC,IAAI,GAAG,IAAI,UAAU,EAAE,CAAC;YACtB,OAAO;gBACL,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;gBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;gBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;gBAClB,GAAG,GAAG,IAAI;aACX,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,MAAgB;IAC3C,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC;IAE5B,wBAAwB;IACxB,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QACd,MAAM,IAAI,YAAY,CAAC,6CAA6C,CAAC,CAAC;IACxE,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC,KAAK,EAAE,EAAE,CAAC;QACb,MAAM,IAAI,YAAY,CAA
C,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,yBAAyB;IACzB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACpC,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,0BAA0B;IAC1B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,+CAA+C,CAAC,CAAC;IAC1E,CAAC;IAED,6BAA6B;IAC7B,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QACrD,MAAM,IAAI,YAAY,CAAC,4CAA4C,CAAC,CAAC;IACvE,CAAC;IAED,0BAA0B;IAC1B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,MAAM,IAAI,YAAY,CAAC,mDAAmD,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,kBAAkB;IAClB,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAE5D,gBAAgB;IAChB,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;QACjD,MAAM,IAAI,YAAY,CAAC,6CAA6C,CAAC,CAAC;IACxE,CAAC;IAED,2DAA2D;IAC3D,IAAI,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC/B,wBAAwB;QACxB,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;QAEnC,iEAAiE;QACjE,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3B,oBAAoB;YACpB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;gBAC/C,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;oBACvD,mBAAmB,CAAC,MAAM,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,gDAAgD;YAChD,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAC1C,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBACnC,MAAM,GAAG,GAAG,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;gBACjC,MAAM,MAAM,GAAG;oBACb,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI;oBACnB,CAAC,GAAG,KAAK,CAAC,CAAC,GAAG,IAAI;oBAClB,GAAG,GAAG,IAAI;iBACX,CAAC;gBACF,mBAAmB,CAAC,MAAM,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QACD,MAAM,
IAAI,YAAY,CAAC,qDAAqD,CAAC,CAAC;IAChF,CAAC;IAED,sDAAsD;IACtD,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACnD,MAAM,IAAI,YAAY,CAAC,sDAAsD,CAAC,CAAC;IACjF,CAAC;IAED,wBAAwB;IACxB,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC;QAChD,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QACrD,MAAM,IAAI,YAAY,CAAC,oDAAoD,CAAC,CAAC;IAC/E,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,SAAiB;IAC1C,IAAI,SAAS,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAC3B,MAAM,IAAI,YAAY,CAAC,0CAA0C,CAAC,CAAC;IACrE,CAAC;IACD,wCAAwC;IACxC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,YAAY,CAAC,wCAAwC,CAAC,CAAC;IACnE,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAQD;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,GAAW,EACX,SAAkB,EAClB,YAAoB,KAAK;IAEzB,yCAAyC;IACzC,WAAW,CAAC,GAAG,CAAC,CAAC;IAEjB,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,SAAS,CAAC,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC;IAE3F,MAAM,aAAa,GAAG,EAAE,CAAC;IACzB,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,IAAI,UAAU,GAAG,GAAG,CAAC;IACrB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IAEnC,OAAO,aAAa,IAAI,aAAa,EAAE,CAAC;QACtC,wBAAwB;QACxB,IAAI,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,YAAY,CAAC,wBAAwB,CAAC,CAAC;QACnD,CAAC;QACD,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAEzB,+BAA+B;QAC/B,WAAW,CAAC,UAAU,CAAC,CAAC;QAExB,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;QACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;QAE9D,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,UAAU,EAAE;gBACvC,OAAO,EAAE;oBACP,YAAY,EAAE,kBAAkB;oBAChC,QAAQ,EAAE,4EAA4E;oBACtF,iBAAiB,EAAE,gBAAgB;oBACnC,iBAAiB,EAAE,mBAAmB;oBACtC,KAAK,EAAE,GAAG;oBACV,YAAY,EAAE,YAAY;oBAC1B,2BAA2B,EAAE,GAAG;iBACjC;gBACD,MAAM,EAAE,UAAU,CAAC,MAAM;gBACzB,QAAQ,EAAE,QAAQ,EAAE,qCAAqC;aAC1D,CAAC,CAAC;YAEH,YAAY,CAAC,KAAK,CAAC,CAAC;YAEpB,4BAA4B;YAC5B,IAAI,QAAQ,CAAC,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBACpD,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;gBAClD,IAAI,CAAC,QAAQ,EAA
E,CAAC;oBACd,MAAM,IAAI,YAAY,CAAC,2CAA2C,CAAC,CAAC;gBACtE,CAAC;gBAED,wBAAwB;gBACxB,UAAU,GAAG,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC;gBAChD,aAAa,EAAE,CAAC;gBAChB,SAAS;YACX,CAAC;YAED,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;oBACvD,MAAM,IAAI,YAAY,CACpB,QAAQ,QAAQ,CAAC,MAAM,iEAAiE,CACzF,CAAC;gBACJ,CAAC;gBACD,MAAM,IAAI,YAAY,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YAC5E,CAAC;YAED,kCAAkC;YAClC,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,CAAC;gBACzF,MAAM,IAAI,YAAY,CAAC,mDAAmD,CAAC,CAAC;YAC9E,CAAC;YAED,wEAAwE;YACxE,MAAM,MAAM,GAAiB,EAAE,CAAC;YAChC,IAAI,SAAS,GAAG,CAAC,CAAC;YAClB,MAAM,QAAQ,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,OAAO;YAE1C,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,CAAC;YAC1C,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;YAC1D,CAAC;YAED,IAAI,CAAC;gBACH,OAAO,IAAI,EAAE,CAAC;oBACZ,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAC;oBAC5C,IAAI,IAAI;wBAAE,MAAM;oBAEhB,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC;oBAC1B,IAAI,SAAS,GAAG,QAAQ,EAAE,CAAC;wBACzB,MAAM,CAAC,MAAM,EAAE,CAAC;wBAChB,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;oBAC1D,CAAC;oBAED,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;oBAAS,CAAC;gBACT,MAAM,CAAC,WAAW,EAAE,CAAC;YACvB,CAAC;YAED,iBAAiB;YACjB,MAAM,QAAQ,GAAG,IAAI,UAAU,CAAC,SAAS,CAAC,CAAC;YAC3C,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;gBAC5B,MAAM,IAAI,KAAK,CAAC,MAAM,CAAC;YACzB,CAAC;YAED,MAAM,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAEhD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;gBAC/B,MAAM,IAAI,YAAY,CAAC,oEAAoE,CAAC,CAAC;YAC/F,CAAC;YAED,iCAAiC;YACjC,IAAI,IAAI,CAAC,QAAQ,CAAC,yBAAyB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,CAAC;gBAClF,MAAM,IAAI,YAAY,CAAC,+DAA+D,CAAC,CAAC;YAC1F,CAAC;YAED,OAAO;gBACL,IAAI
;gBACJ,GAAG,EAAE,UAAU;gBACf,UAAU,EAAE,QAAQ,CAAC,MAAM;aAC5B,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,YAAY,CAAC,KAAK,CAAC,CAAC;YAEpB,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;gBACpG,MAAM,KAAK,CAAC;YACd,CAAC;YAED,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAC1D,MAAM,IAAI,YAAY,CAAC,2BAA2B,SAAS,IAAI,CAAC,CAAC;YACnE,CAAC;YAED,MAAM,IAAI,YAAY,CAAC,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QACzG,CAAC;IACH,CAAC;IAED,MAAM,IAAI,YAAY,CAAC,2BAA2B,aAAa,GAAG,CAAC,CAAC;AACtE,CAAC;AAED,IAAI,aAAa,GAAmB,IAAI,CAAC;AACzC,IAAI,gBAAgB,GAAG,CAAC,CAAC;AACzB,MAAM,oBAAoB,GAAG,CAAC,CAAC;AAE/B,KAAK,UAAU,UAAU;IACvB,4DAA4D;IAC5D,IAAI,aAAa,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,IAAI,aAAa,CAAC,WAAW,EAAE,EAAE,CAAC;gBAChC,OAAO,aAAa,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,4BAA4B;YAC5B,aAAa,GAAG,IAAI,CAAC;QACvB,CAAC;IACH,CAAC;IAED,aAAa,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1D,OAAO,aAAa,CAAC;AACvB,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,GAAW,EACX,UAII,EAAE;IAEN,yCAAyC;IACzC,WAAW,CAAC,GAAG,CAAC,CAAC;IAEjB,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,CAAC,EAAE,SAAS,GAAG,KAAK,EAAE,GAAG,OAAO,CAAC;IAE7D,kCAAkC;IAClC,MAAM,kBAAkB,GAAG,SAAS,CAAC,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,kBAAkB,EAAE,CAAC;IAE3F,qBAAqB;IACrB,IAAI,MAAM,GAAG,CAAC,IAAI,MAAM,GAAG,KAAK,EAAE,CAAC;QACjC,MAAM,IAAI,YAAY,CAAC,yCAAyC,CAAC,CAAC;IACpE,CAAC;IAED,wDAAwD;IACxD,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAClC,MAAM,gBAAgB,GAAG,KAAK,CAAC,CAAC,qBAAqB;IAErD,OAAO,gBAAgB,IAAI,oBAAoB,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,cAAc,GAAG,gBAAgB,EAAE,CAAC;YACnD,MAAM,IAAI,YAAY,CAAC,2DAA2D,CAAC,CAAC;QACtF,CAAC;QACD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC;IACzD,CAAC;IAED,gBAAgB,EAAE,CAAC;IACnB,IAAI,IAAI,GAAgB,IAAI,CAAC;IAE7B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,UAAU,EAAE,CAAC;QACnC,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,CAAC;YAC3B,SAAS,EAAE,kBAAkB;SAC9B,CAAC,CAAC;QAEH,2DAA2D;QAC3D,MAAM,IAAI,C
AAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;YACjC,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;YACpD,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACpE,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;YACnB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,6CAA6C;QAC7C,MAAM,YAAY,GAAG,CAAC,KAAK,IAAI,EAAE;YAC/B,MAAM,IAAK,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpB,SAAS,EAAE,kBAAkB;gBAC7B,OAAO,EAAE,SAAS;aACnB,CAAC,CAAC;YAEH,8DAA8D;YAC9D,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC;gBACf,MAAM,IAAK,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;YACrC,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAK,CAAC,OAAO,EAAE,CAAC;YACnC,MAAM,QAAQ,GAAG,IAAK,CAAC,GAAG,EAAE,CAAC;YAE7B,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;QAC5B,CAAC,CAAC,EAAE,CAAC;QAEL,MAAM,cAAc,GAAG,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;YACtD,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,YAAY,CAAC,6BAA6B,SAAS,IAAI,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACpG,CAAC,CAAC,CAAC;QAEH,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,YAAY,EAAE,cAAc,CAAC,CAAC,CAAC;QAE9E,4BAA4B;QAC5B,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,EAAE,CAAC,CAAC,aAAa;YACjD,MAAM,IAAI,YAAY,CAAC,+BAA+B,CAAC,CAAC;QAC1D,CAAC;QAED,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC/B,MAAM,IAAI,YAAY,CAAC,oDAAoD,CAAC,CAAC;QAC/E,CAAC;QAED,OAAO;YACL,IAAI;YACJ,GAAG,EAAE,QAAQ;SACd,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;YACpG,MAAM,KAAK,CAAC;QACd,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;YAChE,MAAM,IAAI,YAAY,CAAC,8BAA8B,CAAC,CAAC;QACzD,CAAC;QAED,MAAM,IAAI,YAAY,CACpB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CACpF,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,oDAAoD;QACpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QACrC,CAAC;QACD,gBAAgB,EAAE,CAAC;IACrB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,EAAoB,EACpB,cAAsB,CAAC,EACvB,cAAsB,IAAI;IAE1B
,IAAI,SAAS,GAAiB,IAAI,CAAC;IAEnC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,WAAW,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,IAAI,CAAC;YACH,OAAO,MAAM,EAAE,EAAE,CAAC;QACpB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC;YAExE,4CAA4C;YAC5C,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,YAAY,YAAY,EAAE,CAAC;gBACnE,MAAM,KAAK,CAAC;YACd,CAAC;YAED,IAAI,OAAO,GAAG,WAAW,EAAE,CAAC;gBAC1B,MAAM,KAAK,GAAG,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,GAAG,CAAC,CAAC,CAAC;gBACrD,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,SAAS,IAAI,IAAI,YAAY,CAAC,cAAc,CAAC,CAAC;AACtD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO;IAC3B,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,aAAa,CAAC,KAAK,EAAE,CAAC;QAC5B,aAAa,GAAG,IAAI,CAAC;IACvB,CAAC;AACH,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * HTML to Markdown conversion with smart cleanup
3
+ */
4
+ /**
5
+ * Convert HTML to clean, readable Markdown
6
+ */
7
+ export declare function htmlToMarkdown(html: string): string;
8
+ /**
9
+ * Convert HTML to plain text (strip all formatting)
10
+ */
11
+ export declare function htmlToText(html: string): string;
12
+ /**
13
+ * Estimate token count (very rough approximation)
14
+ * Rule of thumb: 1 token ≈ 4 characters for English text
15
+ */
16
+ export declare function estimateTokens(text: string): number;
17
+ //# sourceMappingURL=markdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/core/markdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AA8DH;;GAEG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CA4DnD;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuB/C;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEnD"}
@@ -0,0 +1,143 @@
1
+ /**
2
+ * HTML to Markdown conversion with smart cleanup
3
+ */
4
+ import TurndownService from 'turndown';
5
+ import * as cheerio from 'cheerio';
6
/**
 * CSS selectors for page chrome that is stripped before conversion.
 * The groups below are concatenated in order; the attribute-substring
 * selectors at the end are deliberately broad and also match the more
 * specific cookie classes listed earlier.
 */
const JUNK_SELECTORS = [
    // Non-content tags and structural page chrome
    ...['script', 'style', 'nav', 'footer', 'header.site-header', 'aside'],
    // Sidebars and advertising
    ...['.sidebar', '.advertisement', '.ad'],
    // Cookie notices identified by exact class
    ...['.cookie-banner', '.cookie-notice'],
    // Engagement widgets and comment sections
    ...['.newsletter-signup', '.social-share', '.related-posts', '.comments', '#comments'],
    // Remaining cookie class plus substring matches on class / id
    ...['.cookie-consent', '[class*="cookie"]', '[id*="cookie"]'],
    // Overlays matched by class substring
    ...['[class*="banner"]', '[class*="popup"]', '[class*="modal"]'],
];
30
/**
 * Elements that are meaningful even with no text and no child elements.
 * They must survive the empty-element sweep: `img` feeds the image rule of
 * the Markdown converter, `br` / `hr` become line breaks and rules, and
 * empty `td` / `th` cells keep table columns aligned.
 */
const MEANINGFUL_WHEN_EMPTY = 'img, br, hr, td, th';

/**
 * Clean HTML before conversion.
 * Removes navigation, ads, cookie banners and other junk, then drops
 * elements that carry no content at all.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {string} Serialized HTML of the cleaned document.
 * @throws {Error} If the input is longer than 10 * 1024 * 1024 characters.
 */
function cleanHTML(html) {
    // SECURITY: Limit HTML size to prevent DoS
    if (html.length > 10 * 1024 * 1024) { // 10MB
        throw new Error('HTML too large to process (max 10MB)');
    }
    const $ = cheerio.load(html);
    // Remove junk elements
    for (const selector of JUNK_SELECTORS) {
        $(selector).remove();
    }
    // Remove elements that have neither text nor child elements. This also
    // covers empty <p> / <div>, so no separate `:empty` pass is needed.
    // Content-bearing empty elements (images, line breaks, rules, table
    // cells) are kept; removing them would silently drop every image.
    $('*').each((_, elem) => {
        const $elem = $(elem);
        if ($elem.is(MEANINGFUL_WHEN_EMPTY)) {
            return;
        }
        if ($elem.children().length === 0 && $elem.text().trim() === '') {
            $elem.remove();
        }
    });
    return $.html();
}
56
/**
 * Collapse consecutive blank lines into a single blank line without
 * touching the inside of fenced code blocks, where blank lines are content.
 * Uses a single linear pass (no regex over the whole document).
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with blank-line runs collapsed outside fences.
 */
function collapseBlankLines(markdown) {
    const kept = [];
    let openFence = null; // backtick run that opened the current fence
    let prevBlank = false;
    for (const line of markdown.split('\n')) {
        const trimmed = line.trim();
        const run = /^`{3,}/.exec(trimmed)?.[0];
        if (openFence !== null) {
            // Inside a fence every line is preserved verbatim.
            kept.push(line);
            if (run !== undefined && trimmed === run && run.length >= openFence.length) {
                openFence = null;
            }
            prevBlank = false;
            continue;
        }
        const blank = trimmed === '';
        if (blank && prevBlank) {
            continue;
        }
        kept.push(line);
        prevBlank = blank;
        if (run !== undefined) {
            openFence = run;
        }
    }
    return kept.join('\n');
}

/**
 * Convert HTML to clean, readable Markdown.
 *
 * The HTML is first passed through `cleanHTML`, then converted with
 * Turndown (ATX headings, fenced code, `-` bullets). Tables are kept as
 * HTML, images without alt text are dropped, and `<pre><code>` blocks are
 * emitted as fenced blocks tagged with the `language-*` class if present.
 * Output longer than 1024 * 1024 characters is truncated.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {string} Trimmed Markdown.
 * @throws {Error} Propagates the size-limit error from `cleanHTML`.
 */
export function htmlToMarkdown(html) {
    const MAX_MARKDOWN_LENGTH = 1024 * 1024; // 1MB limit for markdown
    const cleanedHTML = cleanHTML(html);
    const turndown = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
        emDelimiter: '_',
        strongDelimiter: '**',
    });
    // Preserve tables
    turndown.keep(['table', 'thead', 'tbody', 'tr', 'th', 'td']);
    // Custom rule: convert images to alt text or skip
    turndown.addRule('images', {
        filter: 'img',
        replacement: (_content, node) => {
            const alt = node.alt;
            const src = node.src;
            if (alt) {
                return `![${alt}](${src})`;
            }
            return '';
        },
    });
    // Custom rule: preserve code blocks
    turndown.addRule('codeBlocks', {
        filter: (node) => node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE',
        replacement: (_content, node) => {
            const codeNode = node.firstChild;
            const className = codeNode.getAttribute('class') || '';
            const language = className.match(/language-(\w+)/)?.[1] || '';
            const code = codeNode.textContent ?? '';
            // Use a fence longer than any fence-like line inside the code so
            // the block cannot be terminated early by its own content.
            let fence = '```';
            for (const [, run] of code.matchAll(/^[ \t]*(`{3,})/gm)) {
                if (run.length >= fence.length) {
                    fence = '`'.repeat(run.length + 1);
                }
            }
            return `\n\n${fence}${language}\n${code}\n${fence}\n\n`;
        },
    });
    let markdown = turndown.turndown(cleanedHTML);
    // SECURITY: bound the amount of text handed to post-processing
    if (markdown.length > MAX_MARKDOWN_LENGTH) {
        let end = MAX_MARKDOWN_LENGTH;
        // Do not cut between the two halves of a surrogate pair.
        const lastUnit = markdown.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
        markdown = markdown.slice(0, end);
    }
    // Clean up excessive newlines, then strip leading/trailing whitespace
    return collapseBlankLines(markdown).trim();
}
113
/**
 * Convert HTML to plain text (strip all formatting).
 *
 * The HTML is cleaned with `cleanHTML`, then the text of each heading,
 * paragraph and list item is emitted as its own paragraph. Text that
 * belongs to a nested heading/paragraph/list item is emitted only for that
 * nested element, so nested structures are not duplicated. If no such
 * element yields text, the text of `<body>` is used instead.
 *
 * @param {string} html - Raw HTML document or fragment.
 * @returns {string} Trimmed plain text with paragraphs separated by a blank line.
 * @throws {Error} Propagates the size-limit error from `cleanHTML`.
 */
export function htmlToText(html) {
    const BLOCK_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, li';
    const cleanedHTML = cleanHTML(html);
    const $ = cheerio.load(cleanedHTML);
    // Get text content, preserving some structure
    let text = '';
    $(BLOCK_SELECTOR).each((_, elem) => {
        // Work on a detached copy and strip nested blocks: they are visited
        // on their own, so including them here would repeat their text.
        const own = $(elem).clone();
        own.find(BLOCK_SELECTOR).remove();
        const content = own.text().trim();
        if (content) {
            text += content + '\n\n';
        }
    });
    // Fallback: if no structured content found, get all text
    if (!text.trim()) {
        text = $('body').text();
    }
    // Clean up excessive whitespace
    text = text.replace(/\n{3,}/g, '\n\n');
    text = text.replace(/[ \t]+/g, ' ');
    return text.trim();
}
136
/**
 * Rough token estimate for a piece of text.
 * Applies the common heuristic of about four characters per token
 * (English text) and rounds up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count; 0 for an empty string.
 */
export function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    const charCount = text.length;
    return Math.ceil(charCount / CHARS_PER_TOKEN);
}
143
+ //# sourceMappingURL=markdown.js.map