@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Playwright-based scraper for JavaScript-rendered pages
3
+ *
4
+ * Used as a fallback when static HTML scraping fails (e.g., Next.js, React, Vue sites)
5
+ * Returns the same ExtractedArticle format as HTMLScraper for consistency
6
+ */
7
+ import { ExtractedArticle, ScrapingConfig } from './html-scraper';
8
+ export interface PlaywrightScraperConfig extends ScrapingConfig {
9
+ /** Optional CSS selector awaited after navigation and before extraction; the wait is bounded by `timeout` and skipped when unset */
10
+ waitForSelector?: string;
11
+ /** Timeout in ms applied to page navigation and to the `waitForSelector` wait (default: 30000) */
12
+ timeout?: number;
13
+ /** Abort image, font and media requests for faster loading (default: true); honoured by extractArticleLinks only - fetchRenderedContent never blocks media */
14
+ blockMedia?: boolean;
15
+ /** Viewport handed to the browser context created for each scrape (default: 1280 x 720) */
16
+ viewport?: {
17
+ width: number;
18
+ height: number;
19
+ };
20
+ }
21
+ export declare class PlaywrightScraper {
22
+ private browser;
23
+ private readonly userAgent;
24
+ private readonly defaultConfig;
25
+ /**
26
+ * Launch headless Chromium on first use and return the cached browser instance on later calls
27
+ */
28
+ private getBrowser;
29
+ /**
30
+ * Close the browser if one was launched and clear the cached instance; does nothing otherwise
31
+ */
32
+ close(): Promise<void>;
33
+ /**
34
+ * Extract article links from a JavaScript-rendered page; load/extraction errors are logged and whatever was collected so far (possibly an empty array) is returned
35
+ */
36
+ extractArticleLinks(url: string, config?: PlaywrightScraperConfig): Promise<ExtractedArticle[]>;
37
+ /**
38
+ * Fetch fully rendered HTML content from a page (media requests are not blocked here)
39
+ * Useful for content extraction on JS-rendered article pages; resolves to null when loading the page fails
40
+ */
41
+ fetchRenderedContent(url: string, config?: PlaywrightScraperConfig): Promise<string | null>;
42
+ /**
43
+ * Check if URL passes filters: rejected when any excludePattern matches, or when includePatterns are supplied and none of them match
44
+ */
45
+ private passesFilters;
46
+ /**
47
+ * Check if URL looks like an article: it contains a path segment such as article(s), post(s), story, stories, news or blog, or a YYYY/MM date path
48
+ */
49
+ private isLikelyArticleUrl;
50
+ /**
51
+ * Merge configurations: user values override the defaults, except the selector lists, where user entries are appended to the default entries
52
+ */
53
+ private mergeConfig;
54
+ }
55
/** Return the module's shared PlaywrightScraper, constructing it on the first call. */
+ export declare function getPlaywrightScraper(): PlaywrightScraper;
56
/** Close the shared scraper if one exists and drop the module's reference to it; resolves once its close() has finished. */
+ export declare function closePlaywrightScraper(): Promise<void>;
57
+ //# sourceMappingURL=playwright-scraper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"playwright-scraper.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/playwright-scraper.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAElE,MAAM,WAAW,uBAAwB,SAAQ,cAAc;IAC7D,8DAA8D;IAC9D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,gEAAgE;IAChE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uEAAuE;IACvE,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2BAA2B;IAC3B,QAAQ,CAAC,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC9C;AAED,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAA2H;IAErJ,OAAO,CAAC,QAAQ,CAAC,aAAa,CAsD5B;IAEF;;OAEG;YACW,UAAU;IAiBxB;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAQ5B;;OAEG;IACG,mBAAmB,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,GAAE,uBAA4B,GACnC,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAwJ9B;;;OAGG;IACG,oBAAoB,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,GAAE,uBAA4B,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;IA2CrG;;OAEG;IACH,OAAO,CAAC,aAAa;IAiBrB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAe1B;;OAEG;IACH,OAAO,CAAC,WAAW;CA6BpB;AAKD,wBAAgB,oBAAoB,IAAI,iBAAiB,CAKxD;AAGD,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAK5D"}
@@ -0,0 +1,355 @@
1
+ "use strict";
2
+ /**
3
+ * Playwright-based scraper for JavaScript-rendered pages
4
+ *
5
+ * Used as a fallback when static HTML scraping fails (e.g., Next.js, React, Vue sites)
6
+ * Returns the same ExtractedArticle format as HTMLScraper for consistency
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.PlaywrightScraper = void 0;
10
+ exports.getPlaywrightScraper = getPlaywrightScraper;
11
+ exports.closePlaywrightScraper = closePlaywrightScraper;
12
+ const playwright_1 = require("playwright");
13
/**
 * Playwright-based scraper for JavaScript-rendered pages.
 *
 * One headless Chromium instance is launched lazily and reused; every scrape
 * runs in its own browser context, which is closed when the scrape finishes.
 * Both public scrape methods are best-effort: page-level failures are logged
 * and reported through the return value rather than thrown.
 */
class PlaywrightScraper {
    constructor() {
        // Connected browser shared by all scrapes; null until the first launch.
        this.browser = null;
        // Launch currently in progress, so overlapping first calls share it
        // instead of each spawning (and leaking) their own Chromium process.
        this.browserLaunch = null;
        this.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
        this.defaultConfig = {
            timeout: 30000,
            blockMedia: true,
            viewport: { width: 1280, height: 720 },
            selectors: {
                articleLinks: [
                    'article a[href]',
                    '.article a[href]',
                    '.post a[href]',
                    '.story a[href]',
                    '.news-item a[href]',
                    '.card a[href]',
                    '[class*="article"] a[href]',
                    '[class*="post"] a[href]',
                    '[class*="news"] a[href]',
                    '[class*="story"] a[href]',
                    'h1 a[href]',
                    'h2 a[href]',
                    'h3 a[href]',
                    '.headline a[href]',
                    '.title a[href]',
                    // Common list patterns
                    'ul li a[href]',
                    '.list-item a[href]',
                    '[role="listitem"] a[href]',
                ],
                excludeSelectors: [
                    'nav',
                    'header',
                    'footer',
                    '.navigation',
                    '.menu',
                    '.sidebar',
                    '.advertisement',
                    '.ads',
                    '.comments',
                    '.social-share',
                    '[aria-hidden="true"]',
                ]
            },
            filters: {
                minTitleLength: 10,
                maxTitleLength: 300,
                excludePatterns: [
                    /\/(tag|category|author|search|archive|login|register|contact|about|privacy|terms)\//i,
                    /\.(pdf|jpg|jpeg|png|gif|mp4|zip|doc)$/i,
                    /#/,
                    /javascript:/i,
                    /mailto:/i
                ]
            },
            limits: {
                maxLinksPerPage: 100
            }
        };
    }
    /**
     * Return the shared browser, launching it when there is none or when the
     * cached one is no longer connected (e.g. the Chromium process died).
     *
     * @returns the connected browser instance
     * @throws whatever `chromium.launch` rejects with; a failed launch is not
     *         cached, so the next call retries.
     */
    async getBrowser() {
        if (this.browser) {
            if (this.browser.isConnected()) {
                return this.browser;
            }
            // A disconnected browser can never serve another context.
            this.browser = null;
        }
        if (!this.browserLaunch) {
            console.log('🎭 [Playwright] Launching browser...');
            this.browserLaunch = playwright_1.chromium
                .launch({
                headless: true,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--disable-gpu'
                ]
            })
                .then((browser) => {
                this.browser = browser;
                return browser;
            })
                .finally(() => {
                this.browserLaunch = null;
            });
        }
        return this.browserLaunch;
    }
    /**
     * Close the browser if one was launched; does nothing otherwise.
     *
     * The cached reference is cleared before closing so that a failing
     * `browser.close()` cannot leave a dead handle behind for later scrapes.
     */
    async close() {
        const browser = this.browser;
        if (!browser) {
            return;
        }
        this.browser = null;
        await browser.close();
        console.log('🎭 [Playwright] Browser closed');
    }
    /**
     * Extract article links from a JavaScript-rendered page.
     *
     * @param url    page to load
     * @param config overrides merged over the defaults (see mergeConfig)
     * @returns the extracted articles, capped at `limits.maxLinksPerPage`.
     *          Errors while loading or extracting are logged and whatever was
     *          collected so far (possibly nothing) is returned.
     * @throws only if the browser cannot be launched or a context cannot be
     *         created.
     */
    async extractArticleLinks(url, config = {}) {
        console.log(`🎭 [Playwright] Extracting articles from ${url}`);
        const mergedConfig = this.mergeConfig(this.defaultConfig, config);
        const browser = await this.getBrowser();
        const context = await browser.newContext({
            userAgent: this.userAgent,
            viewport: mergedConfig.viewport,
        });
        const articles = [];
        try {
            // Created inside the try so the context is closed even if this fails.
            const page = await context.newPage();
            // Block unnecessary resources for faster loading
            if (mergedConfig.blockMedia) {
                await this.blockMediaRequests(page);
            }
            console.log(`🎭 [Playwright] Loading ${url}...`);
            await this.loadPage(page, url, mergedConfig);
            // Remove excluded elements (one round-trip for all selectors)
            await page.evaluate((selectors) => {
                for (const sel of selectors) {
                    document.querySelectorAll(sel).forEach((el) => el.remove());
                }
            }, mergedConfig.selectors?.excludeSelectors || []);
            // Extract articles using configured selectors. The callback runs in
            // the page, so it must be self-contained and receive everything it
            // needs through its single serializable argument.
            const extractedData = await page.evaluate(({ selectors, minTitleLength }) => {
                const results = [];
                const seenUrls = new Set();
                for (const selector of selectors) {
                    document.querySelectorAll(selector).forEach((anchor) => {
                        const href = anchor.href;
                        if (!href || seenUrls.has(href))
                            return;
                        // Get title from link text or nearby heading
                        let title = anchor.textContent?.trim() || '';
                        let description;
                        let date;
                        // Try to find better title from parent article/card
                        const parent = anchor.closest('article, [class*="card"], [class*="post"], [class*="item"], li');
                        if (parent) {
                            const headingText = parent
                                .querySelector('h1, h2, h3, h4, .title, .headline')
                                ?.textContent?.trim();
                            if (headingText && headingText.length > title.length) {
                                title = headingText;
                            }
                            description = parent
                                .querySelector('p, .excerpt, .summary, .description')
                                ?.textContent?.trim();
                            const dateEl = parent.querySelector('time, [datetime], .date, .published');
                            date = dateEl?.getAttribute('datetime') || dateEl?.textContent?.trim();
                        }
                        if (!title || title.length < minTitleLength)
                            return;
                        // Mark the URL as seen only once it is accepted; otherwise an
                        // untitled thumbnail link would hide the titled link to the
                        // same article that follows it.
                        seenUrls.add(href);
                        results.push({
                            url: href,
                            title,
                            date,
                            description: description?.substring(0, 300)
                        });
                    });
                }
                return results;
            }, {
                selectors: mergedConfig.selectors?.articleLinks || [],
                minTitleLength: mergedConfig.filters?.minTitleLength ?? 10
            });
            // Process and filter results
            const maxLinks = mergedConfig.limits?.maxLinksPerPage || 100;
            const maxTitleLength = mergedConfig.filters?.maxTitleLength;
            for (const item of extractedData) {
                if (articles.length >= maxLinks)
                    break;
                // Apply URL filters
                if (!this.passesFilters(item.url, mergedConfig.filters))
                    continue;
                // Apply the configured upper bound on title length
                if (maxTitleLength != null && item.title.length > maxTitleLength)
                    continue;
                articles.push(this.toArticle(item));
            }
            console.log(`🎭 [Playwright] Extracted ${articles.length} articles from ${url}`);
        }
        catch (error) {
            console.error(`❌ [Playwright] Error extracting from ${url}:`, error);
        }
        finally {
            await context.close();
        }
        return articles;
    }
    /**
     * Fetch fully rendered HTML content from a page.
     * Useful for content extraction on JS-rendered article pages.
     *
     * @param url    page to load
     * @param config overrides merged over the defaults (see mergeConfig)
     * @returns the serialized HTML, or null when loading the page fails (the
     *          error is logged).
     * @throws only if the browser cannot be launched or a context cannot be
     *         created.
     */
    async fetchRenderedContent(url, config = {}) {
        console.log(`🎭 [Playwright] Fetching rendered content from ${url}`);
        const mergedConfig = this.mergeConfig(this.defaultConfig, config);
        const browser = await this.getBrowser();
        const context = await browser.newContext({
            userAgent: this.userAgent,
            viewport: mergedConfig.viewport,
        });
        try {
            // Created inside the try so the context is closed even if this fails.
            const page = await context.newPage();
            // Don't block images for content extraction - we might need them
            await this.loadPage(page, url, mergedConfig);
            // Get the full HTML
            const html = await page.content();
            console.log(`🎭 [Playwright] Fetched ${html.length} bytes of rendered HTML`);
            return html;
        }
        catch (error) {
            console.error(`❌ [Playwright] Error fetching content from ${url}:`, error);
            return null;
        }
        finally {
            await context.close();
        }
    }
    /**
     * Abort image, font and media requests on the page; let everything else through.
     */
    async blockMediaRequests(page) {
        await page.route('**/*', (route) => {
            const resourceType = route.request().resourceType();
            const settled = ['image', 'font', 'media'].includes(resourceType)
                ? route.abort()
                : route.continue();
            // Both calls return promises that reject when the page goes away
            // mid-request; nothing useful can be done then, and leaving them
            // floating would surface as an unhandled rejection.
            settled.catch(() => { });
        });
    }
    /**
     * Navigate to the URL and wait until the page is ready for extraction:
     * network idle, then the optional `waitForSelector`, then a fixed 1s grace
     * period for late rendering.
     */
    async loadPage(page, url, mergedConfig) {
        await page.goto(url, {
            waitUntil: 'networkidle',
            timeout: mergedConfig.timeout
        });
        // Wait for custom selector if specified
        if (mergedConfig.waitForSelector) {
            await page.waitForSelector(mergedConfig.waitForSelector, {
                timeout: mergedConfig.timeout
            });
        }
        // Give JS a moment to finish rendering
        await page.waitForTimeout(1000);
    }
    /**
     * Turn one raw in-page result into an article record.
     *
     * Confidence starts at 0.6 and gains 0.1 each for a parseable date, a
     * description, and an article-looking URL.
     */
    toArticle(item) {
        let publishedDate;
        if (item.date) {
            const parsed = new Date(item.date);
            if (!Number.isNaN(parsed.getTime())) {
                publishedDate = parsed;
            }
        }
        let confidence = 0.6; // Base confidence for Playwright extraction
        if (publishedDate)
            confidence += 0.1;
        if (item.description)
            confidence += 0.1;
        if (this.isLikelyArticleUrl(item.url))
            confidence += 0.1;
        return {
            url: item.url,
            title: item.title,
            publishedDate,
            description: item.description,
            confidence: Math.min(confidence, 1.0),
            source: 'link-text'
        };
    }
    /**
     * Check if URL passes filters.
     *
     * @returns false when any exclude pattern matches, or when include
     *          patterns are given and none of them match; true otherwise
     *          (including when no filters are supplied).
     */
    passesFilters(url, filters) {
        if (!filters)
            return true;
        const matches = (pattern) => {
            // Patterns with the g or y flag resume from lastIndex, which would
            // make the verdict depend on the previously tested URL.
            pattern.lastIndex = 0;
            return pattern.test(url);
        };
        // Check exclude patterns
        if (filters.excludePatterns?.some(matches)) {
            return false;
        }
        // Check include patterns if specified
        if (filters.includePatterns?.length && !filters.includePatterns.some(matches)) {
            return false;
        }
        return true;
    }
    /**
     * Check if URL looks like an article: a well-known section segment or a
     * year/month (optionally /day) date path.
     */
    isLikelyArticleUrl(url) {
        const articlePatterns = [
            /\/article[s]?\//i,
            /\/post[s]?\//i,
            /\/story\//i,
            /\/stories\//i,
            /\/news\//i,
            /\/blog\//i,
            /\/\d{4}\/\d{2}\/\d{2}\//,
            /\/\d{4}\/\d{2}\//
        ];
        return articlePatterns.some(pattern => pattern.test(url));
    }
    /**
     * Merge configurations.
     *
     * User values override the defaults. `filters` and `limits` are merged
     * key by key; the selector lists are the defaults followed by the user's
     * entries, with repeated selectors dropped so no selector is queried twice.
     */
    mergeConfig(defaultConfig, userConfig) {
        const union = (defaults, extras) => [
            ...new Set([...(defaults || []), ...(extras || [])])
        ];
        return {
            ...defaultConfig,
            ...userConfig,
            selectors: {
                ...defaultConfig.selectors,
                ...userConfig.selectors,
                articleLinks: union(defaultConfig.selectors?.articleLinks, userConfig.selectors?.articleLinks),
                excludeSelectors: union(defaultConfig.selectors?.excludeSelectors, userConfig.selectors?.excludeSelectors)
            },
            filters: {
                ...defaultConfig.filters,
                ...userConfig.filters
            },
            limits: {
                ...defaultConfig.limits,
                ...userConfig.limits
            }
        };
    }
}
339
+ exports.PlaywrightScraper = PlaywrightScraper;
340
// Global instance with lazy initialization
let globalPlaywrightScraper = null;
/**
 * Return the module's shared PlaywrightScraper, creating it on the first call.
 */
function getPlaywrightScraper() {
    if (!globalPlaywrightScraper) {
        globalPlaywrightScraper = new PlaywrightScraper();
    }
    return globalPlaywrightScraper;
}
/**
 * Cleanup function for graceful shutdown: closes the shared scraper, if any.
 *
 * The shared slot is released before the close is awaited, so that
 *  - a getPlaywrightScraper() call made while the close is still running gets
 *    a fresh instance instead of one that is shutting down, and
 *  - a close() that rejects does not leave a broken instance installed.
 *
 * @throws whatever the scraper's close() rejects with
 */
async function closePlaywrightScraper() {
    const scraper = globalPlaywrightScraper;
    if (!scraper) {
        return;
    }
    globalPlaywrightScraper = null;
    await scraper.close();
}
355
+ //# sourceMappingURL=playwright-scraper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"playwright-scraper.js","sourceRoot":"","sources":["../../../lib/web-scrapers/playwright-scraper.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAmYH,oDAKC;AAGD,wDAKC;AA9YD,2CAAqD;AAcrD,MAAa,iBAAiB;IAA9B;QACU,YAAO,GAAmB,IAAI,CAAC;QACtB,cAAS,GAAG,uHAAuH,CAAC;QAEpI,kBAAa,GAA4B;YACxD,OAAO,EAAE,KAAK;YACd,UAAU,EAAE,IAAI;YAChB,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;YACtC,SAAS,EAAE;gBACT,YAAY,EAAE;oBACZ,iBAAiB;oBACjB,kBAAkB;oBAClB,eAAe;oBACf,gBAAgB;oBAChB,oBAAoB;oBACpB,eAAe;oBACf,4BAA4B;oBAC5B,yBAAyB;oBACzB,yBAAyB;oBACzB,0BAA0B;oBAC1B,YAAY;oBACZ,YAAY;oBACZ,YAAY;oBACZ,mBAAmB;oBACnB,gBAAgB;oBAChB,uBAAuB;oBACvB,eAAe;oBACf,oBAAoB;oBACpB,2BAA2B;iBAC5B;gBACD,gBAAgB,EAAE;oBAChB,KAAK;oBACL,QAAQ;oBACR,QAAQ;oBACR,aAAa;oBACb,OAAO;oBACP,UAAU;oBACV,gBAAgB;oBAChB,MAAM;oBACN,WAAW;oBACX,eAAe;oBACf,sBAAsB;iBACvB;aACF;YACD,OAAO,EAAE;gBACP,cAAc,EAAE,EAAE;gBAClB,cAAc,EAAE,GAAG;gBACnB,eAAe,EAAE;oBACf,sFAAsF;oBACtF,wCAAwC;oBACxC,GAAG;oBACH,cAAc;oBACd,UAAU;iBACX;aACF;YACD,MAAM,EAAE;gBACN,eAAe,EAAE,GAAG;aACrB;SACF,CAAC;IAoTJ,CAAC;IAlTC;;OAEG;IACK,KAAK,CAAC,UAAU;QACtB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,OAAO,CAAC,GAAG,CAAC,sCAAsC,CAAC,CAAC;YACpD,IAAI,CAAC,OAAO,GAAG,MAAM,qBAAQ,CAAC,MAAM,CAAC;gBACnC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,iCAAiC;oBACjC,eAAe;iBAChB;aACF,CAAC,CAAC;QACL,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;YACpB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,mBAAmB,CACvB,GAAW,EACX,SAAkC,EAAE;QAEpC,OAAO,CAAC,GAAG,CAAC,4CAA4C,GAAG,EAAE,CAAC,CAAC;QAE/D,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;YACvC,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,QAAQ,EAAE,YAAY,CAAC,QAAQ;SAChC,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,MAAM,OAAO,
CAAC,OAAO,EAAE,CAAC;QACrC,MAAM,QAAQ,GAAuB,EAAE,CAAC;QAExC,IAAI,CAAC;YACH,iDAAiD;YACjD,IAAI,YAAY,CAAC,UAAU,EAAE,CAAC;gBAC5B,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;oBACjC,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;oBACpD,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;wBACtD,KAAK,CAAC,KAAK,EAAE,CAAC;oBAChB,CAAC;yBAAM,CAAC;wBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;oBACnB,CAAC;gBACH,CAAC,CAAC,CAAC;YACL,CAAC;YAED,mBAAmB;YACnB,OAAO,CAAC,GAAG,CAAC,2BAA2B,GAAG,KAAK,CAAC,CAAC;YACjD,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,YAAY,CAAC,OAAO;aAC9B,CAAC,CAAC;YAEH,wCAAwC;YACxC,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,MAAM,IAAI,CAAC,eAAe,CAAC,YAAY,CAAC,eAAe,EAAE;oBACvD,OAAO,EAAE,YAAY,CAAC,OAAO;iBAC9B,CAAC,CAAC;YACL,CAAC;YAED,uCAAuC;YACvC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,2BAA2B;YAC3B,KAAK,MAAM,QAAQ,IAAI,YAAY,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,EAAE,CAAC;gBACtE,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,EAAE;oBAC1B,QAAQ,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC,EAAE,QAAQ,CAAC,CAAC;YACf,CAAC;YAED,8CAA8C;YAC9C,MAAM,aAAa,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC,SAAS,EAAE,EAAE;gBACtD,MAAM,OAAO,GAKR,EAAE,CAAC;gBACR,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;gBAEnC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;oBACjC,MAAM,KAAK,GAAG,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;oBAElD,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;wBACrB,MAAM,MAAM,GAAG,IAAyB,CAAC;wBACzC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC;wBAEzB,IAAI,CAAC,IAAI,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC;4BAAE,OAAO;wBACxC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;wBAEnB,6CAA6C;wBAC7C,IAAI,KAAK,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;wBAE7C,oDAAoD;wBACpD,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,gEAAgE,CAAC,CAAC;wBAChG,IAAI,MAAM,EAAE,CAAC;4BACX,MAAM,OAAO,GAAG,MAAM,CAAC,aAAa,CAAC,mCAAmC,CAAC,CAAC;4BAC1E,IAAI,OAAO,EAAE,CAAC;gCACZ,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gCAChD,IAAI,WAAW,IAAI,WAAW,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC
;oCACrD,KAAK,GAAG,WAAW,CAAC;gCACtB,CAAC;4BACH,CAAC;4BAED,kBAAkB;4BAClB,MAAM,IAAI,GAAG,MAAM,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;4BACzE,MAAM,WAAW,GAAG,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;4BAE9C,WAAW;4BACX,MAAM,MAAM,GAAG,MAAM,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;4BAC3E,MAAM,IAAI,GAAG,MAAM,EAAE,YAAY,CAAC,UAAU,CAAC,IAAI,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;4BAE7E,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gCAChC,OAAO,CAAC,IAAI,CAAC;oCACX,GAAG,EAAE,IAAI;oCACT,KAAK;oCACL,IAAI;oCACJ,WAAW,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC;iCAC5C,CAAC,CAAC;4BACL,CAAC;wBACH,CAAC;6BAAM,IAAI,KAAK,IAAI,KAAK,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;4BACvC,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;wBACrC,CAAC;oBACH,CAAC,CAAC,CAAC;gBACL,CAAC;gBAED,OAAO,OAAO,CAAC;YACjB,CAAC,EAAE,YAAY,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC,CAAC;YAE/C,6BAA6B;YAC7B,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;gBACjC,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,eAAe,IAAI,GAAG,CAAC;oBAAE,MAAM;gBAE5E,oBAAoB;gBACpB,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,EAAE,YAAY,CAAC,OAAO,CAAC;oBAAE,SAAS;gBAElE,0BAA0B;gBAC1B,IAAI,aAA+B,CAAC;gBACpC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;oBACd,MAAM,MAAM,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACnC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;wBAC7B,aAAa,GAAG,MAAM,CAAC;oBACzB,CAAC;gBACH,CAAC;gBAED,6CAA6C;gBAC7C,IAAI,UAAU,GAAG,GAAG,CAAC,CAAC,4CAA4C;gBAClE,IAAI,aAAa;oBAAE,UAAU,IAAI,GAAG,CAAC;gBACrC,IAAI,IAAI,CAAC,WAAW;oBAAE,UAAU,IAAI,GAAG,CAAC;gBACxC,IAAI,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,GAAG,CAAC;oBAAE,UAAU,IAAI,GAAG,CAAC;gBAEzD,QAAQ,CAAC,IAAI,CAAC;oBACZ,GAAG,EAAE,IAAI,CAAC,GAAG;oBACb,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,aAAa;oBACb,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC;oBACrC,MAAM,EAAE,WAAW;iBACpB,CAAC,CAAC;YACL,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,6BAA6B,QAAQ,CAAC,MAAM,kBAAkB,GAAG,EAAE,CAAC,CAAC;QAEnF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;QACvE,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,KAAK,E
AAE,CAAC;QACxB,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,oBAAoB,CAAC,GAAW,EAAE,SAAkC,EAAE;QAC1E,OAAO,CAAC,GAAG,CAAC,kDAAkD,GAAG,EAAE,CAAC,CAAC;QAErE,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;YACvC,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,QAAQ,EAAE,YAAY,CAAC,QAAQ;SAChC,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;QAErC,IAAI,CAAC;YACH,iEAAiE;YACjE,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS,EAAE,aAAa;gBACxB,OAAO,EAAE,YAAY,CAAC,OAAO;aAC9B,CAAC,CAAC;YAEH,wCAAwC;YACxC,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,MAAM,IAAI,CAAC,eAAe,CAAC,YAAY,CAAC,eAAe,EAAE;oBACvD,OAAO,EAAE,YAAY,CAAC,OAAO;iBAC9B,CAAC,CAAC;YACL,CAAC;YAED,uCAAuC;YACvC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEhC,oBAAoB;YACpB,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,2BAA2B,IAAI,CAAC,MAAM,yBAAyB,CAAC,CAAC;YAE7E,OAAO,IAAI,CAAC;QAEd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,8CAA8C,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAC3E,OAAO,IAAI,CAAC;QACd,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,GAAW,EAAE,OAAmC;QACpE,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,yBAAyB;QACzB,IAAI,OAAO,CAAC,eAAe,EAAE,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,OAAO,KAAK,CAAC;QACf,CAAC;QAED,sCAAsC;QACtC,IAAI,OAAO,CAAC,eAAe,EAAE,MAAM;YAC/B,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAChE,OAAO,KAAK,CAAC;QACf,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,GAAW;QACpC,MAAM,eAAe,GAAG;YACtB,kBAAkB;YAClB,eAAe;YACf,YAAY;YACZ,cAAc;YACd,WAAW;YACX,WAAW;YACX,yBAAyB;YACzB,kBAAkB;SACnB,CAAC;QAEF,OAAO,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,aAAsC,EACtC,UAAmC;QAEnC,OAAO;YACL,GAAG,aAAa;YAChB,GAAG,UAAU;YACb,SAAS,EAAE;gBACT,GAAG,aAAa,CAAC,SAAS;gBA
C1B,GAAG,UAAU,CAAC,SAAS;gBACvB,YAAY,EAAE;oBACZ,GAAG,CAAC,aAAa,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC;oBAChD,GAAG,CAAC,UAAU,CAAC,SAAS,EAAE,YAAY,IAAI,EAAE,CAAC;iBAC9C;gBACD,gBAAgB,EAAE;oBAChB,GAAG,CAAC,aAAa,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,CAAC;oBACpD,GAAG,CAAC,UAAU,CAAC,SAAS,EAAE,gBAAgB,IAAI,EAAE,CAAC;iBAClD;aACF;YACD,OAAO,EAAE;gBACP,GAAG,aAAa,CAAC,OAAO;gBACxB,GAAG,UAAU,CAAC,OAAO;aACtB;YACD,MAAM,EAAE;gBACN,GAAG,aAAa,CAAC,MAAM;gBACvB,GAAG,UAAU,CAAC,MAAM;aACrB;SACF,CAAC;IACJ,CAAC;CACF;AA9WD,8CA8WC;AAED,2CAA2C;AAC3C,IAAI,uBAAuB,GAA6B,IAAI,CAAC;AAE7D,SAAgB,oBAAoB;IAClC,IAAI,CAAC,uBAAuB,EAAE,CAAC;QAC7B,uBAAuB,GAAG,IAAI,iBAAiB,EAAE,CAAC;IACpD,CAAC;IACD,OAAO,uBAAuB,CAAC;AACjC,CAAC;AAED,yCAAyC;AAClC,KAAK,UAAU,sBAAsB;IAC1C,IAAI,uBAAuB,EAAE,CAAC;QAC5B,MAAM,uBAAuB,CAAC,KAAK,EAAE,CAAC;QACtC,uBAAuB,GAAG,IAAI,CAAC;IACjC,CAAC;AACH,CAAC"}
@@ -0,0 +1,42 @@
1
+ export declare class RobotsChecker {
2
+ private cache;
3
+ private readonly cacheTimeout;
4
+ private readonly userAgent;
5
+ private readonly requestTimeout;
6
+ /**
7
+ * Check if a URL is allowed to be crawled according to robots.txt; resolves with the verdict plus the sitemaps list and, optionally, a crawl delay and a reason
8
+ */
9
+ isAllowed(url: string): Promise<{
10
+ allowed: boolean;
11
+ crawlDelay?: number;
12
+ sitemaps: string[];
13
+ reason?: string;
14
+ }>;
15
+ /**
16
+ * Get sitemaps listed in robots.txt for a domain; resolves with an array of strings (presumably sitemap URLs - confirm against the implementation)
17
+ */
18
+ getSitemaps(domain: string): Promise<string[]>;
19
+ /**
20
+ * Get the recommended crawl delay for a domain; may resolve to undefined (presumably when robots.txt sets none - the unit is not visible here, TODO confirm)
21
+ */
22
+ getCrawlDelay(domain: string): Promise<number | undefined>;
23
+ private getRobotsTxt;
24
+ private parseRobotsTxt;
25
+ private completeRule;
26
+ private checkRules;
27
+ private findBestMatchingRule;
28
+ private matchesPattern;
29
+ clearCache(): void;
30
+ getCacheStats(): {
31
+ size: number;
32
+ entries: {
33
+ url: string;
34
+ fetchedAt: string;
35
+ expiresAt: string;
36
+ rulesCount: number;
37
+ sitemapsCount: number;
38
+ }[];
39
+ };
40
+ }
41
/** Module-level RobotsChecker instance exported alongside the class (presumably the shared instance callers are expected to reuse - confirm against the implementation). */
+ export declare const globalRobotsChecker: RobotsChecker;
42
+ //# sourceMappingURL=robots-checker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots-checker.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/robots-checker.ts"],"names":[],"mappings":"AAeA,qBAAa,aAAa;IACxB,OAAO,CAAC,KAAK,CAAgC;IAC7C,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAuB;IACpD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAqB;IAC/C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAQ;IAEvC;;OAEG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;QACpC,OAAO,EAAE,OAAO,CAAC;QACjB,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;QACnB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB,CAAC;IAkCF;;OAEG;IACG,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAWpD;;OAEG;IACG,aAAa,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC;YAiBlD,YAAY;IAiD1B,OAAO,CAAC,cAAc;IA+EtB,OAAO,CAAC,YAAY;IAUpB,OAAO,CAAC,UAAU;IAkDlB,OAAO,CAAC,oBAAoB;IAW5B,OAAO,CAAC,cAAc;IA2BtB,UAAU;IAKV,aAAa;;;;;;;;;;CAYd;AAGD,eAAO,MAAM,mBAAmB,eAAsB,CAAC"}