@robot-resources/scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,976 @@
1
+ import { Readability } from '@mozilla/readability';
2
+ import { parseHTML } from 'linkedom';
3
+ import TurndownService from 'turndown';
4
+ import robotsParser from 'robots-parser';
5
+ import { readFileSync } from 'fs';
6
+ import { homedir } from 'os';
7
+ import { join } from 'path';
8
+
9
// src/fetch.ts
// Pool of User-Agent strings: one honest bot UA plus common desktop browser
// UAs. A random entry is picked per request (getRandomUserAgent) to reduce
// trivial UA-based blocking.
var USER_AGENTS = [
  "Mozilla/5.0 (compatible; ScraperBot/1.0; +https://scraper.robotresources.ai)",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
];
// Plain-fetch defaults: 10 s timeout, 3 retries, 1 s base backoff.
var DEFAULT_TIMEOUT = 1e4;
var DEFAULT_MAX_RETRIES = 3;
var BASE_BACKOFF_MS = 1e3;
// Error type shared by every fetch path. `statusCode` is the HTTP status when
// one exists; `retryable` tells the retry loops whether another attempt makes
// sense (timeouts/5xx: yes; 4xx/invalid input: no).
var FetchError = class extends Error {
  constructor(message, statusCode, retryable = false) {
    super(message);
    this.statusCode = statusCode;
    this.retryable = retryable;
    this.name = "FetchError";
  }
};
27
/** Returns true when `url` parses and uses the http or https scheme. */
function isValidUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(parsed.protocol);
}
/** Picks one entry from USER_AGENTS at random. */
function getRandomUserAgent() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
/** Builds browser-like request headers; `userAgent` overrides the random pick. */
function buildHeaders(userAgent) {
  const ua = userAgent || getRandomUserAgent();
  return {
    "User-Agent": ua,
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    Connection: "keep-alive",
    "Cache-Control": "no-cache"
  };
}
/** Copies a fetch Headers object into a plain object with lower-cased keys. */
function headersToObject(headers) {
  const plain = {};
  headers.forEach((headerValue, headerName) => {
    plain[headerName.toLowerCase()] = headerValue;
  });
  return plain;
}
/** 5xx responses are considered transient and therefore retryable. */
function isRetryableStatus(status) {
  return status >= 500 && status < 600;
}
/** Promise-based delay. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/** Exponential backoff: BASE_BACKOFF_MS * 2^attempt. */
function getBackoffDelay(attempt) {
  return BASE_BACKOFF_MS * 2 ** attempt;
}
64
/**
 * GET `url` with the given headers, aborting after `timeout` ms.
 * Follows redirects. A timeout surfaces as a retryable FetchError; every
 * other failure propagates unchanged. The abort timer is always cleared.
 */
async function fetchWithTimeout(url, headers, timeout) {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), timeout);
  try {
    return await fetch(url, {
      method: "GET",
      headers,
      signal: controller.signal,
      redirect: "follow"
    });
  } catch (error) {
    const wasAborted = error instanceof Error && error.name === "AbortError";
    if (wasAborted) {
      throw new FetchError("Request timeout", void 0, true);
    }
    throw error;
  } finally {
    clearTimeout(abortTimer);
  }
}
84
/**
 * Fetches `url` over plain HTTP(S) with retries.
 *
 * Retry policy: network errors, timeouts, and 5xx responses are retried up to
 * `maxRetries` times with exponential backoff; 4xx responses and invalid URLs
 * fail immediately. Non-ok statuses outside 4xx/5xx (e.g. 304) fall through
 * and are returned with their body.
 *
 * @param {string} url - http/https URL to fetch.
 * @param {{timeout?: number, maxRetries?: number, userAgent?: string}} options
 * @returns {Promise<{html: string, url: string, statusCode: number, headers: Object}>}
 *   Body text, final (post-redirect) URL, status, and lower-cased headers.
 * @throws {FetchError} on invalid URL, 4xx, or when retries are exhausted.
 */
async function fetchUrl(url, options = {}) {
  const {
    timeout = DEFAULT_TIMEOUT,
    maxRetries = DEFAULT_MAX_RETRIES,
    userAgent
  } = options;
  if (!isValidUrl(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  const headers = buildHeaders(userAgent);
  let lastError = null;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const response = await fetchWithTimeout(url, headers, timeout);
      if (!response.ok) {
        const statusCode = response.status;
        if (statusCode >= 400 && statusCode < 500) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
        }
        if (isRetryableStatus(statusCode)) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
        }
      }
      const html = await response.text();
      const responseHeaders = headersToObject(response.headers);
      return {
        html,
        url: response.url,
        statusCode: response.status,
        headers: responseHeaders
      };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      // FetchError carries an explicit retryable flag; any other error
      // (network/DNS failure) is treated as transient. The original wrote
      // `: !(error instanceof FetchError)` for the else-arm, which is always
      // true in that branch — simplified to `true`.
      const isRetryable = error instanceof FetchError ? error.retryable : true;
      if (isRetryable && attempt < maxRetries) {
        await sleep(getBackoffDelay(attempt));
        continue;
      }
      break;
    }
  }
  throw lastError || new FetchError("Unknown fetch error");
}
129
+
130
// src/fetch-stealth.ts
// Stealth-fetch defaults: 10 s timeout, 2 retries, 1 s base backoff.
var DEFAULT_TIMEOUT2 = 1e4;
var DEFAULT_MAX_RETRIES2 = 2;
var BASE_BACKOFF_MS2 = 1e3;
/** Returns true when `url` parses and uses http or https. */
function isValidUrl2(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(parsed.protocol);
}
/** 5xx responses are transient and therefore retryable. */
function isRetryableStatus2(status) {
  return status >= 500 && status < 600;
}
/** Promise-based delay. */
function sleep2(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/** Exponential backoff: BASE_BACKOFF_MS2 * 2^attempt. */
function getBackoffDelay2(attempt) {
  return BASE_BACKOFF_MS2 * 2 ** attempt;
}
151
/**
 * Fetches `url` with impit's browser-impersonating client ("stealth" mode,
 * Chrome fingerprint). impit is an optional dependency, imported lazily; a
 * descriptive FetchError is thrown when it is not installed.
 *
 * Retry policy mirrors fetchUrl: transport errors and 5xx are retried with
 * exponential backoff; 4xx and invalid URLs fail immediately.
 *
 * @param {string} url - http/https URL.
 * @param {{timeout?: number, maxRetries?: number}} options
 * @returns {Promise<{html, url, statusCode, headers}>}
 * @throws {FetchError}
 */
async function fetchStealth(url, options = {}) {
  const {
    timeout = DEFAULT_TIMEOUT2,
    maxRetries = DEFAULT_MAX_RETRIES2
  } = options;
  if (!isValidUrl2(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  // Lazy optional import: impit needs Node >= 20 and is not a hard dependency.
  let Impit;
  try {
    ({ Impit } = await import('impit'));
  } catch {
    throw new FetchError(
      "impit is required for stealth mode. Install: npm install impit (requires Node >= 20)",
      void 0,
      false
    );
  }
  const client = new Impit({ browser: "chrome" });
  let lastError = null;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const response = await client.fetch(url, {
        signal: AbortSignal.timeout(timeout)
      });
      if (!response.ok) {
        // 4xx is terminal; 5xx is retryable; other non-ok statuses fall
        // through and are returned with their body.
        const statusCode = response.status;
        if (statusCode >= 400 && statusCode < 500) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
        }
        if (isRetryableStatus2(statusCode)) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
        }
      }
      const html = await response.text();
      const headers = {};
      response.headers.forEach((value, key) => {
        headers[key] = value;
      });
      return {
        html,
        // impit may not expose the final URL; fall back to the requested one.
        url: response.url ?? url,
        statusCode: response.status,
        headers
      };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      // FetchError carries an explicit retryable flag; everything else is
      // treated as transient. (NOTE(review): the else-arm below is always
      // true in this branch — could be simplified to `: true`.)
      const isRetryable = error instanceof FetchError ? error.retryable : !(error instanceof FetchError);
      const hasRetriesLeft = attempt < maxRetries;
      if (isRetryable && hasRetriesLeft) {
        const delay = getBackoffDelay2(attempt);
        await sleep2(delay);
        continue;
      }
      break;
    }
  }
  throw lastError || new FetchError("Unknown stealth fetch error");
}
210
+
211
// src/fetch-render.ts
// Render mode gets a longer default timeout (30 s) for full page loads.
var DEFAULT_TIMEOUT3 = 3e4;
/** Returns true when `url` parses and uses http or https. */
function isValidUrl3(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(parsed.protocol);
}
221
/**
 * Fetches `url` in headless Chromium (Playwright) so client-side rendered
 * markup is captured. Playwright is an optional dependency, imported lazily.
 * Waits for network idle; dialogs are auto-dismissed so navigation cannot
 * block on alert/confirm prompts. The browser is always closed.
 *
 * @param {string} url - http/https URL.
 * @param {{timeout?: number}} options - navigation timeout in ms (default 30 s).
 * @returns {Promise<{html, url, statusCode, headers}>} serialized DOM, final
 *   page URL, status, and response headers.
 * @throws {FetchError} invalid URL, playwright missing, 4xx (non-retryable),
 *   5xx (retryable), navigation timeout (retryable), or missing response.
 */
async function fetchRender(url, options = {}) {
  const { timeout = DEFAULT_TIMEOUT3 } = options;
  if (!isValidUrl3(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  // Lazy optional import: playwright is heavy and not a hard dependency.
  let chromium;
  try {
    ({ chromium } = await import('playwright'));
  } catch {
    throw new FetchError(
      "Playwright is required for render mode. Install: npm install playwright",
      void 0,
      false
    );
  }
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    // Auto-dismiss alert/confirm/prompt so goto() cannot hang on a dialog.
    page.on("dialog", (dialog) => dialog.dismiss());
    const response = await page.goto(url, {
      waitUntil: "networkidle",
      timeout
    });
    if (!response) {
      throw new FetchError(
        "Navigation returned no response (about:blank or same-URL redirect)",
        void 0,
        false
      );
    }
    const statusCode = response.status();
    if (statusCode >= 400 && statusCode < 500) {
      throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
    }
    if (statusCode >= 500) {
      throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
    }
    // page.content() serializes the rendered DOM, not the raw response body.
    const html = await page.content();
    const headers = response.headers();
    return {
      html,
      url: page.url(),
      statusCode,
      headers
    };
  } catch (error) {
    // Playwright navigation timeouts surface as TimeoutError; map them to a
    // retryable FetchError for the callers' retry logic.
    if (error instanceof Error && error.name === "TimeoutError") {
      throw new FetchError("Navigation timeout", void 0, true);
    }
    throw error;
  } finally {
    await browser.close();
  }
}
275
// Error thrown by the content-extraction stage. `code` is a machine-readable
// reason: "EMPTY_HTML" (blank input) or "NO_CONTENT" (Readability found
// nothing substantial).
var ExtractionError = class extends Error {
  constructor(message, code) {
    super(message);
    this.code = code;
    this.name = "ExtractionError";
  }
};
282
/**
 * Runs Mozilla Readability over the fetched HTML (parsed with linkedom) and
 * returns the main article content plus metadata, with DOM-scraping
 * fallbacks for title/date/site name when Readability leaves them empty.
 *
 * @param {{html: string}} fetchResult - result from one of the fetchers.
 * @returns {Promise<{content, title, author, publishedAt, siteName}>}
 * @throws {ExtractionError} EMPTY_HTML for blank input; NO_CONTENT when the
 *   extracted article is missing or shorter than 20 characters.
 */
async function extractContent(fetchResult) {
  const { html } = fetchResult;
  if (!html || !html.trim()) {
    throw new ExtractionError("Empty HTML content", "EMPTY_HTML");
  }
  const { document } = parseHTML(html);
  // charThreshold 50: accept fairly short articles.
  const reader = new Readability(document, {
    charThreshold: 50
  });
  const article = reader.parse();
  if (!article || !article.content || article.content.trim().length < 20) {
    throw new ExtractionError(
      "No content could be extracted from the page",
      "NO_CONTENT"
    );
  }
  // Prefer Readability's metadata; fall back to meta tags in the raw DOM.
  const result = {
    content: cleanContent(article.content),
    title: article.title || extractFallbackTitle(document),
    author: article.byline || void 0,
    publishedAt: article.publishedTime || extractPublishedTime(document),
    siteName: article.siteName || extractSiteName(document)
  };
  return result;
}
307
/**
 * Normalizes extracted HTML: removes whitespace between adjacent tags,
 * collapses runs of whitespace to a single space, and trims the result.
 */
function cleanContent(content) {
  const collapsedTags = content.replace(/>\s+</g, "><");
  const collapsedSpaces = collapsedTags.replace(/\s{2,}/g, " ");
  return collapsedSpaces.trim();
}
310
/**
 * Title fallback chain: og:title meta content, then <title> text, then the
 * first <h1> text. Returns undefined when none yields a non-empty value.
 */
function extractFallbackTitle(document) {
  const ogContent = document
    .querySelector('meta[property="og:title"]')
    ?.getAttribute("content");
  if (ogContent) return ogContent;
  const titleText = document.querySelector("title")?.textContent;
  if (titleText) return titleText.trim();
  const headingText = document.querySelector("h1")?.textContent;
  if (headingText) return headingText.trim();
  return void 0;
}
326
/**
 * Published-date fallback chain: article:published_time meta, then a
 * schema.org datePublished element (datetime attr, then content attr), then
 * the first <time datetime>. Returns undefined when nothing matches.
 */
function extractPublishedTime(document) {
  const metaValue = document
    .querySelector('meta[property="article:published_time"]')
    ?.getAttribute("content");
  if (metaValue) return metaValue;
  const schemaEl = document.querySelector('[itemprop="datePublished"]');
  if (schemaEl) {
    const schemaValue = schemaEl.getAttribute("datetime") || schemaEl.getAttribute("content");
    if (schemaValue) return schemaValue;
  }
  const timeValue = document.querySelector("time[datetime]")?.getAttribute("datetime");
  if (timeValue) return timeValue;
  return void 0;
}
348
/**
 * Site-name fallback chain: og:site_name meta, then application-name meta.
 * Returns undefined when neither yields a non-empty value.
 */
function extractSiteName(document) {
  const selectors = [
    'meta[property="og:site_name"]',
    'meta[name="application-name"]'
  ];
  for (const selector of selectors) {
    const value = document.querySelector(selector)?.getAttribute("content");
    if (value) return value;
  }
  return void 0;
}
361
/**
 * Builds the HTML→markdown converter (TurndownService) with GitHub-style
 * settings plus three custom rules: strip script/style/noscript/iframe,
 * drop elements with no text (except meaningful void elements), render
 * <pre><code class="language-x"> as a fenced block tagged "x", and render
 * del/s as ~~strikethrough~~.
 */
function createTurndownService() {
  const turndown = new TurndownService({
    headingStyle: "atx",
    hr: "---",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    fence: "```",
    emDelimiter: "*",
    strongDelimiter: "**",
    linkStyle: "inlined"
  });
  turndown.remove(["script", "style", "noscript", "iframe"]);
  // Drop elements whose text content is empty, keeping void elements that
  // carry meaning without text (images, line/thematic breaks, inputs).
  turndown.addRule("removeEmpty", {
    filter: (node) => {
      // nodeType 1 === element node.
      if (node.nodeType === 1) {
        const text = node.textContent || "";
        const isEmptyBlock = text.trim() === "" && !["IMG", "BR", "HR", "INPUT"].includes(node.nodeName);
        return isEmptyBlock;
      }
      return false;
    },
    replacement: () => ""
  });
  // Fenced code blocks with a language tag pulled from class="language-x".
  turndown.addRule("fencedCodeBlock", {
    filter: (node, options) => {
      return options.codeBlockStyle === "fenced" && node.nodeName === "PRE" && node.firstChild !== null && node.firstChild.nodeName === "CODE";
    },
    replacement: (_content, node, options) => {
      const codeNode = node.firstChild;
      const code = codeNode.textContent || "";
      const className = codeNode.getAttribute("class") || "";
      const langMatch = className.match(/language-(\w+)/);
      const lang = langMatch ? langMatch[1] : "";
      const fence = options.fence || "```";
      // Blank lines around the fence keep it a standalone markdown block.
      return `

${fence}${lang}
${code}
${fence}

`;
    }
  });
  turndown.addRule("strikethrough", {
    filter: ["del", "s"],
    replacement: (content) => `~~${content}~~`
  });
  return turndown;
}
410
// Module-level cache for the shared TurndownService instance.
var turndownInstance = null;
/** Returns the shared TurndownService, creating it on first use. */
function getTurndown() {
  if (turndownInstance === null) {
    turndownInstance = createTurndownService();
  }
  return turndownInstance;
}
417
/**
 * Tidies converted markdown: caps blank runs at one empty line, strips
 * trailing spaces/tabs per line, and removes leading/trailing newlines.
 */
function cleanMarkdown(markdown) {
  return markdown
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]+$/gm, "")
    .replace(/^\n+/, "")
    .replace(/\n+$/, "")
    .trim();
}
420
/**
 * Converts extracted HTML content to markdown and estimates its token
 * count. Empty or whitespace-only content short-circuits to an empty
 * result with zero tokens.
 */
async function convertToMarkdown(extractResult) {
  const { content } = extractResult;
  if (!content?.trim()) {
    return { markdown: "", tokenCount: 0 };
  }
  const raw = getTurndown().turndown(content);
  const markdown = cleanMarkdown(raw);
  return { markdown, tokenCount: estimateTokens(markdown) };
}
437
/**
 * Heuristic token estimate for markdown. Fenced code blocks, inline code,
 * and URLs are counted with denser chars-per-token ratios and removed
 * (replaced by a space) before the remaining prose is counted at ~4.3
 * chars/token. Returns 0 only for empty input, otherwise at least 1.
 */
function estimateTokens(text) {
  if (!text) return 0;
  let tokens = 0;
  // Counts every match of `pattern`, replacing it with a space so the
  // prose pass does not see it again.
  const consume = (source, pattern, charsPerToken) =>
    source.replace(pattern, (match) => {
      tokens += Math.ceil(match.length / charsPerToken);
      return " ";
    });
  let remaining = text;
  remaining = consume(remaining, /```[\s\S]*?```/g, 3.2);
  remaining = consume(remaining, /`[^`]+`/g, 3.5);
  remaining = consume(remaining, /https?:\/\/\S+/g, 5);
  const proseLength = remaining.replace(/\s+/g, " ").trim().length;
  if (proseLength > 0) {
    tokens += Math.ceil(proseLength / 4.3);
  }
  return Math.max(1, tokens);
}
459
// robots.txt support: parsed robots files are cached per robots.txt URL for
// one hour; fetches use a 5 s default timeout and the bot's UA token.
var DEFAULT_TTL_MS = 60 * 60 * 1e3;
var DEFAULT_TIMEOUT_MS = 5e3;
var BOT_USER_AGENT = "ScraperBot";
var cache = /* @__PURE__ */ new Map();
/** Maps any URL to its origin's /robots.txt URL; throws on invalid input. */
function getRobotsUrl(url) {
  const { protocol, host } = new URL(url);
  return `${protocol}//${host}/robots.txt`;
}
467
/**
 * Fetches and parses robots.txt for `url`'s origin, caching the parser for
 * DEFAULT_TTL_MS. Any failure (network error, timeout, non-2xx) yields an
 * empty — i.e. fully permissive — parser, which is cached the same way.
 */
async function getRobotsParser(url, timeout = DEFAULT_TIMEOUT_MS) {
  const robotsUrl = getRobotsUrl(url);
  const cached = cache.get(robotsUrl);
  if (cached && cached.expiresAt > Date.now()) {
    return cached.parser;
  }
  try {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    const response = await fetch(robotsUrl, {
      signal: controller.signal,
      headers: { "User-Agent": BOT_USER_AGENT }
    });
    // NOTE(review): if fetch rejects, this clearTimeout is skipped and the
    // abort timer stays pending until it fires — consider try/finally.
    clearTimeout(timeoutId);
    // Missing robots.txt (404 etc.) is treated as an empty, permissive file.
    const text = response.ok ? await response.text() : "";
    const parser = robotsParser(robotsUrl, text);
    cache.set(robotsUrl, {
      parser,
      expiresAt: Date.now() + DEFAULT_TTL_MS
    });
    return parser;
  } catch {
    // Unreachable robots.txt: cache a permissive parser so we do not re-hit
    // a failing origin on every request.
    const parser = robotsParser(robotsUrl, "");
    cache.set(robotsUrl, {
      parser,
      expiresAt: Date.now() + DEFAULT_TTL_MS
    });
    return parser;
  }
}
497
/**
 * True unless robots.txt explicitly disallows `url` for our bot UA.
 * An unknown/undecidable verdict counts as allowed.
 */
async function isAllowedByRobots(url, timeout) {
  const parser = await getRobotsParser(url, timeout);
  const verdict = parser.isAllowed(url, BOT_USER_AGENT);
  return verdict !== false;
}
/** Sitemap URLs listed in the origin's robots.txt. */
async function getSitemapUrls(url, timeout) {
  const parser = await getRobotsParser(url, timeout);
  return parser.getSitemaps();
}
/** Crawl-delay (seconds) for our UA, or null when robots.txt sets none. */
async function getCrawlDelay(url, timeout) {
  const parser = await getRobotsParser(url, timeout);
  const delay = parser.getCrawlDelay(BOT_USER_AGENT);
  return delay ?? null;
}
/** Drops every cached robots.txt parser. */
function clearRobotsCache() {
  cache.clear();
}
513
+
514
// src/sitemap.ts
// Sitemap fetching/parsing. Results are cached per sitemap URL for one hour;
// sitemap-index documents are followed at most MAX_RECURSION_DEPTH levels.
var DEFAULT_TIMEOUT_MS2 = 1e4;
var DEFAULT_TTL_MS2 = 60 * 60 * 1e3;
var MAX_RECURSION_DEPTH = 2;
var cache2 = /* @__PURE__ */ new Map();
/** Origin (scheme + host) of a URL; throws on invalid input. */
function getOrigin(url) {
  const { protocol, host } = new URL(url);
  return `${protocol}//${host}`;
}
/** True when the XML is a <sitemapindex> document rather than a <urlset>. */
function isSitemapIndex(xml) {
  return /<sitemapindex[\s>]/i.test(xml);
}
/** Extracts child sitemap <loc> URLs from a sitemap-index document. */
function extractSitemapIndexUrls(xml) {
  const urls = [];
  const sitemapBlockRegex = /<(?:\w+:)?sitemap\b[^>]*>([\s\S]*?)<\/(?:\w+:)?sitemap>/gi;
  for (const [, block] of xml.matchAll(sitemapBlockRegex)) {
    const loc = /<(?:\w+:)?loc\b[^>]*>([\s\S]*?)<\/(?:\w+:)?loc>/i
      .exec(block)?.[1]
      ?.trim();
    if (loc) urls.push(loc);
  }
  return urls;
}
540
/**
 * Parses <url> entries from a <urlset> sitemap document. Only same-origin
 * locations are kept; optional <lastmod> and numeric <priority> values are
 * attached when present.
 * (Fix: use Number.parseFloat/Number.isNaN instead of the coercing globals.)
 *
 * @param {string} xml - raw sitemap XML.
 * @param {string} origin - "scheme//host" the entries must belong to.
 * @returns {Array<{loc: string, lastmod?: string, priority?: number}>}
 */
function extractUrlEntries(xml, origin) {
  const entries = [];
  const urlBlockRegex = /<(?:\w+:)?url\b[^>]*>([\s\S]*?)<\/(?:\w+:)?url>/gi;
  let blockMatch;
  while ((blockMatch = urlBlockRegex.exec(xml)) !== null) {
    const block = blockMatch[1];
    const locMatch = /<(?:\w+:)?loc\b[^>]*>([\s\S]*?)<\/(?:\w+:)?loc>/i.exec(block);
    if (!locMatch) continue;
    const loc = locMatch[1].trim();
    if (!loc) continue;
    // Drop cross-origin and unparsable locations.
    try {
      if (getOrigin(loc) !== origin) continue;
    } catch {
      continue;
    }
    const entry = { loc };
    const lastmodMatch = /<(?:\w+:)?lastmod\b[^>]*>([\s\S]*?)<\/(?:\w+:)?lastmod>/i.exec(block);
    if (lastmodMatch) {
      const lastmod = lastmodMatch[1].trim();
      if (lastmod) entry.lastmod = lastmod;
    }
    const priorityMatch = /<(?:\w+:)?priority\b[^>]*>([\s\S]*?)<\/(?:\w+:)?priority>/i.exec(block);
    if (priorityMatch) {
      const priority = Number.parseFloat(priorityMatch[1].trim());
      // Non-numeric priorities are silently dropped.
      if (!Number.isNaN(priority)) entry.priority = priority;
    }
    entries.push(entry);
  }
  return entries;
}
570
/**
 * Fetches raw sitemap XML. Returns null on any failure (non-2xx, network
 * error, timeout) — sitemaps are strictly best-effort.
 *
 * Fix: the original only cleared the abort timer on the success path, so a
 * rejected fetch left the timer pending (keeping the event loop alive until
 * `timeout` elapsed). The timer is now cleared in a finally block.
 *
 * @param {string} url - sitemap URL.
 * @param {number} timeout - abort timeout in ms.
 * @returns {Promise<string|null>}
 */
async function fetchSitemapXml(url, timeout) {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: { "User-Agent": "ScraperBot/1.0" }
    });
    if (!response.ok) return null;
    return await response.text();
  } catch {
    return null;
  } finally {
    clearTimeout(abortTimer);
  }
}
585
/**
 * Recursively resolves one sitemap document. Index documents
 * (<sitemapindex>) are expanded child-by-child, sequentially, up to
 * MAX_RECURSION_DEPTH levels; leaf documents (<urlset>) yield same-origin
 * URL entries. Any fetch failure contributes an empty list.
 */
async function parseSitemapInternal(url, origin, timeout, depth) {
  if (depth >= MAX_RECURSION_DEPTH) return [];
  const xml = await fetchSitemapXml(url, timeout);
  if (!xml) return [];
  if (isSitemapIndex(xml)) {
    const sitemapUrls = extractSitemapIndexUrls(xml);
    const allEntries = [];
    // Sequential on purpose: avoids hammering the origin with parallel
    // sitemap fetches.
    for (const sitemapUrl of sitemapUrls) {
      const entries = await parseSitemapInternal(
        sitemapUrl,
        origin,
        timeout,
        depth + 1
      );
      allEntries.push(...entries);
    }
    return allEntries;
  }
  return extractUrlEntries(xml, origin);
}
605
/**
 * Public sitemap entry point: resolves `url` (following index documents)
 * and caches the resulting entries for DEFAULT_TTL_MS2. Invalid URLs return
 * an empty list immediately; fetch failures yield an empty, cached list.
 */
async function parseSitemap(url, timeout = DEFAULT_TIMEOUT_MS2) {
  const cached = cache2.get(url);
  if (cached && cached.expiresAt > Date.now()) {
    return cached.entries;
  }
  let origin;
  try {
    origin = getOrigin(url);
  } catch {
    // Unparsable sitemap URL — nothing to fetch (not cached).
    return [];
  }
  const entries = await parseSitemapInternal(url, origin, timeout, 0);
  cache2.set(url, {
    entries,
    expiresAt: Date.now() + DEFAULT_TTL_MS2
  });
  return entries;
}
/** Drops every cached sitemap result. */
function clearSitemapCache() {
  cache2.clear();
}
626
+
627
// src/fetch-mode.ts
// HTML fragments that identify Cloudflare/Akamai bot-challenge interstitials.
var CHALLENGE_MARKERS = [
  "cf-browser-verification",
  "Just a moment",
  "_cf_chl_opt",
  "akamai-challenge",
  "ak-challenge"
];
/** True when the fetched HTML looks like a bot-challenge page. */
function isChallengeResponse(fetchResult) {
  const { html } = fetchResult;
  return CHALLENGE_MARKERS.some((marker) => html.includes(marker));
}
638
// Modes accepted by fetchWithMode.
var VALID_MODES = ["fast", "stealth", "render", "auto"];
/**
 * Dispatches a fetch by mode:
 *   "fast"    — plain HTTP fetch (fetchUrl)
 *   "stealth" — impit browser impersonation (fetchStealth)
 *   "render"  — headless Chromium (fetchRender)
 *   "auto"    — plain fetch first, falling back to stealth when the body
 *               looks like a bot challenge or the server answers 403.
 * @throws {FetchError} for unknown modes; otherwise errors from the chosen
 *   fetcher propagate.
 */
async function fetchWithMode(url, mode, options) {
  if (!VALID_MODES.includes(mode)) {
    throw new FetchError(
      `Invalid fetch mode: '${mode}'. Valid modes: ${VALID_MODES.join(", ")}`,
      void 0,
      false
    );
  }
  if (mode === "stealth") return fetchStealth(url, options);
  if (mode === "render") return fetchRender(url, options);
  if (mode === "fast") return fetchUrl(url, options);
  // "auto": plain fetch with stealth fallback.
  try {
    const result = await fetchUrl(url, options);
    if (isChallengeResponse(result)) {
      return fetchStealth(url, options);
    }
    return result;
  } catch (err) {
    // A 403 is the usual "bot detected" response — retry via stealth.
    if (err instanceof FetchError && err.statusCode === 403) {
      return fetchStealth(url, options);
    }
    throw err;
  }
}
663
+
664
// src/crawl.ts
/** Returns true when `url` parses and uses http or https. */
function isValidUrl4(url) {
  try {
    const { protocol } = new URL(url);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}
/**
 * Canonical URL form used for dedupe: drops the fragment and one trailing
 * slash (except on the root path). Unparsable input is returned unchanged.
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);
    parsed.hash = "";
    const path = parsed.pathname;
    parsed.pathname =
      path.length > 1 && path.endsWith("/") ? path.slice(0, -1) : path;
    return parsed.toString();
  } catch {
    return url;
  }
}
687
// Asset/file extensions that are never worth crawling as pages.
var SKIP_EXTENSIONS = /* @__PURE__ */ new Set([
  ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".ico",
  ".mp4", ".mp3", ".wav", ".avi", ".zip", ".tar", ".gz", ".rar",
  ".css", ".js", ".xml", ".json", ".woff", ".woff2", ".ttf", ".eot"
]);
/**
 * Collects unique, normalized, same-origin page links from raw HTML.
 * Skips mailto/tel/javascript/fragment hrefs and known asset extensions;
 * unresolvable hrefs are ignored.
 */
function extractLinks(html, baseUrl) {
  let origin;
  try {
    origin = new URL(baseUrl).origin;
  } catch {
    return [];
  }
  const found = new Set();
  const anchorPattern = /<a\s+[^>]*href\s*=\s*["']([^"']+)["'][^>]*>/gi;
  for (const anchorMatch of html.matchAll(anchorPattern)) {
    const href = anchorMatch[1].trim();
    const isNonPageScheme =
      href.startsWith("mailto:") ||
      href.startsWith("tel:") ||
      href.startsWith("javascript:") ||
      href.startsWith("#");
    if (isNonPageScheme) continue;
    try {
      const resolved = new URL(href, baseUrl);
      if (resolved.origin !== origin) continue;
      const ext = resolved.pathname.toLowerCase().match(/\.\w+$/)?.[0];
      if (ext && SKIP_EXTENSIONS.has(ext)) continue;
      found.add(normalizeUrl(resolved.toString()));
    } catch {
      // Unresolvable href — skip it.
    }
  }
  return [...found];
}
737
/**
 * Glob match against a URL: `*` matches within one path segment, `**`
 * matches across segments. Note the generated regex is unanchored, so a
 * pattern may match anywhere inside the URL.
 */
function matchGlob(url, pattern) {
  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  const doubleStarHeld = escaped.replace(/\*\*/g, "{{DOUBLESTAR}}");
  const singleStarDone = doubleStarHeld.replace(/\*/g, "[^/]*");
  const source = singleStarDone.replace(/\{\{DOUBLESTAR\}\}/g, ".*");
  return new RegExp(source).test(url);
}
/**
 * Applies exclude patterns first (any match rejects the URL), then include
 * patterns (at least one must match when the list is non-empty).
 */
function matchesFilter(url, include, exclude) {
  if (exclude?.some((pattern) => matchGlob(url, pattern))) {
    return false;
  }
  if (include?.length) {
    return include.some((pattern) => matchGlob(url, pattern));
  }
  return true;
}
752
/** Promise-based delay used between crawl batches. */
function sleep3(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
755
/**
 * Breadth-first, same-origin crawl starting at `options.url`.
 *
 * Pipeline per page: robots check (optional) → fetchWithMode → Readability
 * extraction → markdown conversion. Pages are processed in batches of
 * `concurrency`; robots.txt crawl-delay (if any) is honored between batches.
 * When `depth > 0`, /sitemap.xml is used to seed the queue with depth-1
 * pages. Per-page failures are collected in `errors`, not thrown.
 *
 * @param {{url, depth?, limit?, mode?, include?, exclude?, timeout?,
 *          concurrency?, respectRobots?}} options
 * @returns {Promise<{pages, totalDiscovered, totalCrawled, totalSkipped,
 *          errors, duration}>}
 * @throws {FetchError} only for invalid options/start URL.
 */
async function crawl(options) {
  const startTime = Date.now();
  const {
    url: startUrl,
    depth: maxDepth = 2,
    limit = 50,
    mode = "auto",
    include,
    exclude,
    timeout,
    concurrency = 3,
    respectRobots = true
  } = options;
  // --- option validation (fail fast before any network work) ---
  if (!isValidUrl4(startUrl)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  if (maxDepth < 0) throw new FetchError("depth must be >= 0", void 0, false);
  if (limit < 1) throw new FetchError("limit must be >= 1", void 0, false);
  if (concurrency < 1) throw new FetchError("concurrency must be >= 1", void 0, false);
  if (timeout !== void 0 && (timeout <= 0 || Number.isNaN(timeout))) {
    throw new FetchError("timeout must be a positive number", void 0, false);
  }
  // Shared crawl state, mutated by the batch tasks below (safe: tasks only
  // interleave at await points on this single-threaded event loop).
  const pages = [];
  const errors = [];
  const visited = /* @__PURE__ */ new Set();
  let totalDiscovered = 0;
  let totalSkipped = 0;
  const normalizedStart = normalizeUrl(startUrl);
  const origin = new URL(startUrl).origin;
  let crawlDelay = null;
  // Robots gate for the start URL; a disallowed start aborts the whole crawl.
  if (respectRobots) {
    crawlDelay = await getCrawlDelay(startUrl, timeout);
    const allowed = await isAllowedByRobots(startUrl, timeout);
    if (!allowed) {
      return {
        pages: [],
        totalDiscovered: 1,
        totalCrawled: 0,
        totalSkipped: 1,
        errors: [],
        duration: Date.now() - startTime
      };
    }
  }
  const queue = [
    { url: normalizedStart, depth: 0 }
  ];
  // Seed the queue from /sitemap.xml (as depth-1 pages) when crawling past
  // the start page. Sitemap failures are logged and ignored.
  if (maxDepth > 0) {
    try {
      const sitemapEntries = await parseSitemap(`${origin}/sitemap.xml`, timeout);
      const seen = /* @__PURE__ */ new Set([normalizedStart]);
      for (const entry of sitemapEntries) {
        const normalized = normalizeUrl(entry.loc);
        if (!seen.has(normalized)) {
          seen.add(normalized);
          queue.push({ url: normalized, depth: 1 });
        }
      }
    } catch (err) {
      console.debug(`[scraper] Sitemap unavailable for ${origin}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }
  totalDiscovered = queue.length;
  // BFS in batches of `concurrency`; stops once `limit` pages are collected
  // or the queue drains.
  while (queue.length > 0 && pages.length < limit) {
    const batchSize = Math.min(concurrency, limit - pages.length, queue.length);
    const batch = queue.splice(0, batchSize);
    const tasks = batch.map(async ({ url, depth }) => {
      const normalized = normalizeUrl(url);
      if (visited.has(normalized)) {
        totalSkipped++;
        return;
      }
      visited.add(normalized);
      // include/exclude filters never apply to the start URL itself.
      if (normalized !== normalizedStart && !matchesFilter(normalized, include, exclude)) {
        totalSkipped++;
        return;
      }
      if (respectRobots) {
        const allowed = await isAllowedByRobots(url, timeout);
        if (!allowed) {
          totalSkipped++;
          return;
        }
      }
      try {
        const fetchResult = await fetchWithMode(url, mode, { timeout });
        const extractResult = await extractContent(fetchResult);
        const convertResult = await convertToMarkdown(extractResult);
        const pageResult = {
          markdown: convertResult.markdown,
          tokenCount: convertResult.tokenCount,
          title: extractResult.title,
          author: extractResult.author,
          siteName: extractResult.siteName,
          publishedAt: extractResult.publishedAt,
          url: fetchResult.url,
          depth
        };
        pages.push(pageResult);
        // Enqueue newly-discovered same-origin links for the next depth.
        if (depth < maxDepth) {
          const links = extractLinks(fetchResult.html, fetchResult.url);
          for (const link of links) {
            if (!visited.has(link)) {
              queue.push({ url: link, depth: depth + 1 });
              totalDiscovered++;
            }
          }
        }
      } catch (err) {
        // Per-page failures are recorded, not fatal to the crawl.
        errors.push({
          url,
          error: err instanceof Error ? err.message : String(err),
          depth
        });
      }
    });
    await Promise.allSettled(tasks);
    // Honor robots.txt crawl-delay (seconds) between batches.
    if (crawlDelay && crawlDelay > 0 && queue.length > 0) {
      await sleep3(crawlDelay * 1e3);
    }
  }
  return {
    pages,
    totalDiscovered,
    totalCrawled: pages.length,
    totalSkipped,
    errors,
    duration: Date.now() - startTime
  };
}
885
// Telemetry configuration. The API key is read once from
// ~/.robot-resources/config.json. Telemetry is disabled either by
// RR_TELEMETRY=off in the environment or `"telemetry": false` in that file;
// RR_PLATFORM_URL overrides the reporting endpoint.
var CONFIG_PATH = join(homedir(), ".robot-resources", "config.json");
var PLATFORM_URL = process.env.RR_PLATFORM_URL || "https://api.robotresources.ai";
var apiKey = null;
var keyLoaded = false;
/**
 * Loads the telemetry API key (memoized via `keyLoaded`). Returns null when
 * telemetry is disabled, the config file is missing/unreadable, or no
 * api_key is set — all of which silently turn telemetry off.
 */
function loadApiKey() {
  if (keyLoaded) return apiKey;
  keyLoaded = true;
  if (process.env.RR_TELEMETRY === "off") {
    return null;
  }
  try {
    const config = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
    if (config.telemetry === false) {
      return null;
    }
    apiKey = config.api_key || null;
    return apiKey;
  } catch {
    // Missing or invalid config file — telemetry stays off.
    return null;
  }
}
906
/**
 * Fire-and-forget telemetry POST to the platform; no-op without an API key.
 * NOTE(review): the payload includes the scraped URL and page title (see
 * scrape()), so enabling telemetry shares browsing targets with the
 * platform — worth stating prominently in user-facing docs.
 */
function reportScraperEvent(payload) {
  const key = loadApiKey();
  if (!key) return;
  fetch(`${PLATFORM_URL}/v1/telemetry`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${key}`
    },
    body: JSON.stringify({
      product: "scraper",
      event_type: payload.success ? "compress" : "error",
      payload
    })
  }).catch(() => {
    // Telemetry must never break scraping; delivery failures are ignored.
  });
}
923
+
924
// src/index.ts
/**
 * Scrapes one URL end-to-end: fetch (mode "auto" by default) → Readability
 * extraction → markdown conversion, then reports a telemetry event
 * (success or failure) when telemetry is configured.
 *
 * Note: robots.txt is consulted only when options.respectRobots is truthy —
 * unlike crawl(), where respectRobots defaults to true.
 *
 * @param {string} url
 * @param {{mode?, timeout?, maxRetries?, userAgent?, respectRobots?}} options
 * @returns {Promise<{markdown, tokenCount, title, author, siteName,
 *          publishedAt, url}>}
 * @throws {FetchError|ExtractionError}
 */
async function scrape(url, options = {}) {
  const startTime = Date.now();
  const mode = options.mode ?? "auto";
  if (options.timeout !== void 0 && (options.timeout <= 0 || Number.isNaN(options.timeout))) {
    throw new FetchError("timeout must be a positive number", void 0, false);
  }
  try {
    if (options.respectRobots) {
      const allowed = await isAllowedByRobots(url, options.timeout);
      if (!allowed) {
        throw new FetchError(`Blocked by robots.txt: ${url}`, void 0, false);
      }
    }
    const fetchResult = await fetchWithMode(url, mode, {
      timeout: options.timeout,
      maxRetries: options.maxRetries,
      userAgent: options.userAgent
    });
    const extractResult = await extractContent(fetchResult);
    const convertResult = await convertToMarkdown(extractResult);
    const result = {
      markdown: convertResult.markdown,
      tokenCount: convertResult.tokenCount,
      title: extractResult.title,
      author: extractResult.author,
      siteName: extractResult.siteName,
      publishedAt: extractResult.publishedAt,
      url: fetchResult.url
    };
    // Success event: includes the target URL, title, and latency.
    reportScraperEvent({
      url,
      tokenCount: convertResult.tokenCount,
      title: extractResult.title,
      latencyMs: Date.now() - startTime,
      success: true
    });
    return result;
  } catch (err) {
    // Failure event, then rethrow for the caller.
    reportScraperEvent({
      url,
      tokenCount: 0,
      latencyMs: Date.now() - startTime,
      success: false,
      error: err instanceof Error ? err.message : String(err)
    });
    throw err;
  }
}
973
+
974
+ export { ExtractionError, FetchError, clearRobotsCache, clearSitemapCache, convertToMarkdown, crawl, estimateTokens, extractContent, extractLinks, fetchRender, fetchStealth, fetchUrl, fetchWithMode, getCrawlDelay, getSitemapUrls, isAllowedByRobots, isChallengeResponse, normalizeUrl, parseSitemap, scrape };
975
+ //# sourceMappingURL=index.js.map
976
+ //# sourceMappingURL=index.js.map