@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,531 @@
1
+ "use strict";
2
// TypeScript interop helper: re-exports property `key` of `source` on `target`
// under the name `alias` (defaults to `key`). Reuses an already-defined
// global helper when one is present on `this`.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: plain value copy.
        return function (target, source, key, alias) {
            target[alias === undefined ? key : alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportedAs = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessor descriptors are reused as-is only for ES modules.
            needsLiveGetter = !source.__esModule;
        }
        else {
            // Data properties that can still change are exposed through a getter
            // so the binding always reflects the current value on `source`.
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportedAs, descriptor);
    };
})();
13
// TypeScript interop helper: attaches `value` to `target` as its "default"
// export. Reuses an already-defined global helper when one is present on `this`.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    // Legacy engines: plain assignment.
    return function (target, value) {
        target["default"] = value;
    };
})();
18
// TypeScript interop helper emulating `import * as ns from "mod"` for
// CommonJS modules. Reuses an already-defined global helper when one is
// present on `this`.
var __importStar = (this && this.__importStar) || (function () {
    // Lazily selects the key-enumeration strategy: on first call `ownKeys`
    // replaces itself with Object.getOwnPropertyNames, or with a
    // for-in/hasOwnProperty fallback when that function is unavailable.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        // Modules flagged as ES modules are returned untouched.
        if (mod && mod.__esModule) return mod;
        var result = {};
        // Bind every own key except "default" onto the namespace object...
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        // ...then expose the original module itself as the default export.
        __setModuleDefault(result, mod);
        return result;
    };
})();
35
// TypeScript interop helper emulating `import x from "mod"`: ES modules are
// returned as-is, anything else is wrapped so it is reachable via `.default`.
// Reuses an already-defined global helper when one is present on `this`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
38
+ Object.defineProperty(exports, "__esModule", { value: true });
39
+ exports.globalContentExtractor = exports.ContentExtractor = void 0;
40
+ const readability_1 = require("@mozilla/readability");
41
+ const jsdom_1 = require("jsdom");
42
+ const cheerio = __importStar(require("cheerio"));
43
+ const p_limit_1 = __importDefault(require("p-limit"));
44
+ const scraping_rate_limiter_1 = require("../scraping-rate-limiter");
45
+ const robots_checker_1 = require("./robots-checker");
46
/**
 * Fetches a web page and extracts its main article content.
 *
 * Extraction first tries Mozilla Readability and falls back to a set of
 * well-known CSS content selectors. Requests are gated by literal-hostname
 * SSRF checks, a robots.txt check and the shared rate limiter.
 */
class ContentExtractor {
    constructor() {
        this.userAgent = 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)';
        this.timeout = 15000; // 15 seconds
        this.maxContentSize = 10 * 1024 * 1024; // 10MB max
        this.minContentLength = 200; // Minimum 200 characters
        this.wordsPerMinute = 200; // Average reading speed

        // Patterns are tested against the normalized hostname (see hostnameOf).
        const privateRanges = [
            /^0\./, // 0.0.0.0/8 (unspecified / "this host")
            /^127\./, // 127.0.0.0/8 (loopback)
            /^10\./, // 10.0.0.0/8 (private)
            /^172\.(1[6-9]|2[0-9]|3[01])\./, // 172.16.0.0/12 (private)
            /^192\.168\./, // 192.168.0.0/16 (private)
            /^169\.254\./, // 169.254.0.0/16 (link-local)
            /^::$/, // IPv6 unspecified
            /^::1$/, // IPv6 loopback
            /^fe[89ab][0-9a-f]:/, // fe80::/10 (IPv6 link-local)
            /^f[cd][0-9a-f]{2}:/ // fc00::/7 (IPv6 unique local)
        ];
        // IPv4-mapped IPv6 literal as serialized by URL, e.g. "::ffff:7f00:1".
        const mappedIPv4 = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/;

        // URL.hostname keeps the square brackets around IPv6 literals and may
        // carry a trailing root dot ("localhost."); strip both so the
        // comparisons below see the bare host.
        const hostnameOf = (url) => {
            const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
            return hostname.startsWith('[') && hostname.endsWith(']')
                ? hostname.slice(1, -1)
                : hostname;
        };
        // Converts an IPv4-mapped IPv6 literal to dotted-quad form so the IPv4
        // checks apply to it; any other host is returned unchanged.
        const unmapIPv4 = (hostname) => {
            const match = mappedIPv4.exec(hostname);
            if (!match) {
                return hostname;
            }
            const high = Number.parseInt(match[1], 16);
            const low = Number.parseInt(match[2], 16);
            return [high >> 8, high & 0xff, low >> 8, low & 0xff].join('.');
        };

        // NOTE(review): these checks only inspect the literal hostname of the
        // URL that is passed in. Hosts that resolve to private addresses via
        // DNS, and redirects followed by fetch, are not covered here.
        this.ssrfProtection = {
            /** True when the URL's host is a private/loopback/link-local IP literal, or the URL cannot be parsed. */
            isPrivateIP: (url) => {
                try {
                    const hostname = unmapIPv4(hostnameOf(url));
                    return privateRanges.some(range => range.test(hostname));
                }
                catch {
                    return true; // If we can't parse, block it
                }
            },
            /** True when the URL's host is localhost or a loopback literal, or the URL cannot be parsed. */
            isLocalhost: (url) => {
                try {
                    const hostname = unmapIPv4(hostnameOf(url));
                    return hostname === 'localhost' || hostname === '127.0.0.1' || hostname === '::1';
                }
                catch {
                    return true;
                }
            },
            /** True only for http: and https: URLs. */
            isAllowedProtocol: (url) => {
                try {
                    const { protocol } = new URL(url);
                    return protocol === 'http:' || protocol === 'https:';
                }
                catch {
                    return false;
                }
            }
        };
    }
    /**
     * Extract content from a URL.
     *
     * @param {string} url - Page to fetch and extract.
     * @returns {Promise<object|null>} The extracted content, or null when the
     *   URL is rejected (SSRF / robots.txt), the fetch fails, or the extracted
     *   text is shorter than `minContentLength`. Never rejects: all errors are
     *   logged and mapped to null.
     */
    async extractContent(url) {
        console.log(`📖 [ContentExtractor] Starting content extraction from ${url}`);
        try {
            // SSRF protection
            if (!this.ssrfProtection.isAllowedProtocol(url)) {
                throw new Error(`Disallowed protocol: ${url}`);
            }
            if (this.ssrfProtection.isPrivateIP(url) || this.ssrfProtection.isLocalhost(url)) {
                throw new Error(`Private/local IP not allowed: ${url}`);
            }
            // Check robots.txt compliance
            const robotsCheck = await robots_checker_1.globalRobotsChecker.isAllowed(url);
            if (!robotsCheck.allowed) {
                console.warn(`🤖 [ContentExtractor] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
                return null;
            }
            const html = await this.fetchContent(url);
            if (!html) {
                return null;
            }
            // Extract content using multiple methods
            const extracted = await this.extractFromHTML(html, url);
            if (!extracted) {
                console.warn(`⚠️ [ContentExtractor] No content extracted from ${url}`);
                return null;
            }
            // Validate extracted content
            if (extracted.textContent.length < this.minContentLength) {
                console.warn(`⚠️ [ContentExtractor] Content too short (${extracted.textContent.length} chars): ${url}`);
                return null;
            }
            console.log(`✅ [ContentExtractor] Successfully extracted ${extracted.wordCount} words from ${url}`);
            return extracted;
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Error extracting content from ${url}:`, error);
            return null;
        }
    }
    /**
     * Extract content from multiple URLs with configurable concurrency.
     *
     * @param {string[]} urls - Pages to extract.
     * @param {object} [options]
     * @param {number} [options.concurrency] - Max parallel extractions (default 5).
     * @param {Function} [options.onProgress] - Called as
     *   `(completed, total, url)` exactly once per URL when it finishes.
     * @returns {Promise<Array<object|null>>} One entry per input URL, in input
     *   order; null for URLs that could not be extracted.
     */
    async extractBatch(urls, options = {}) {
        const concurrency = options.concurrency || 5;
        console.log(`📖 [ContentExtractor] Starting parallel batch extraction of ${urls.length} URLs (concurrency: ${concurrency})`);
        const limit = (0, p_limit_1.default)(concurrency);
        let completed = 0;
        // Progress is reported once per URL. A throwing callback is logged
        // instead of being counted twice or aborting the whole batch.
        const reportProgress = (url) => {
            completed++;
            try {
                options.onProgress?.(completed, urls.length, url);
            }
            catch (error) {
                console.error(`❌ [ContentExtractor] onProgress callback failed for ${url}:`, error);
            }
        };
        const results = await Promise.all(urls.map(url => limit(async () => {
            let result = null;
            try {
                result = await this.extractContent(url);
            }
            catch (error) {
                console.error(`❌ [ContentExtractor] Error in batch extraction for ${url}:`, error);
            }
            reportProgress(url);
            return result;
        })));
        const successful = results.filter(Boolean).length;
        console.log(`📖 [ContentExtractor] Batch complete: ${successful}/${urls.length} successful`);
        return results;
    }
    /**
     * Download the HTML of a page through the shared rate limiter.
     *
     * @param {string} url - Page to download.
     * @returns {Promise<string|null>} The response body, or null on any error
     *   (non-2xx status, timeout, oversized body, network failure).
     */
    async fetchContent(url) {
        try {
            return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
                const controller = new AbortController();
                const timeoutId = setTimeout(() => controller.abort(), this.timeout);
                try {
                    const response = await fetch(url, {
                        headers: {
                            'User-Agent': this.userAgent,
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                            'Accept-Language': 'en-US,en;q=0.9'
                        },
                        signal: controller.signal,
                    });
                    if (!response.ok) {
                        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
                    }
                    const contentLength = response.headers.get('content-length');
                    if (contentLength && Number.parseInt(contentLength, 10) > this.maxContentSize) {
                        throw new Error(`Content too large: ${contentLength} bytes`);
                    }
                    // The abort timer stays armed while the body is read so a
                    // stalled download cannot hang past `timeout`.
                    const html = await response.text();
                    if (html.length > this.maxContentSize) {
                        throw new Error(`Content too large: ${html.length} bytes`);
                    }
                    return html;
                }
                finally {
                    clearTimeout(timeoutId);
                }
            });
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Error fetching content from ${url}:`, error);
            return null;
        }
    }
    /**
     * Run the extraction strategies in order: Readability, then the
     * selector-based fallback.
     *
     * @param {string} html - Raw page HTML.
     * @param {string} url - URL the HTML was fetched from.
     * @returns {Promise<object|null>} The first result whose text reaches
     *   `minContentLength`, or null when both strategies fail. Fallback
     *   results carry the accumulated `errors` list.
     */
    async extractFromHTML(html, url) {
        const errors = [];
        try {
            // Try Readability first (most reliable)
            const readabilityResult = this.extractWithReadability(html, url);
            if (readabilityResult && readabilityResult.textContent.length >= this.minContentLength) {
                return {
                    ...readabilityResult,
                    extractionMethod: 'readability',
                    confidence: 0.9
                };
            }
            else {
                errors.push('Readability extraction failed or content too short');
            }
        }
        catch (error) {
            errors.push(`Readability error: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        try {
            // Fallback to manual extraction
            const fallbackResult = this.extractWithFallback(html, url);
            if (fallbackResult && fallbackResult.textContent.length >= this.minContentLength) {
                return {
                    ...fallbackResult,
                    extractionMethod: 'fallback',
                    confidence: 0.6,
                    errors
                };
            }
            else {
                errors.push('Fallback extraction failed or content too short');
            }
        }
        catch (error) {
            errors.push(`Fallback error: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        // If both methods fail, return null
        console.error(`❌ [ContentExtractor] All extraction methods failed for ${url}:`, errors);
        return null;
    }
    /**
     * Extract the article with Mozilla Readability on a jsdom document.
     *
     * @param {string} html - Raw page HTML.
     * @param {string} url - URL the HTML was fetched from.
     * @returns {object|null} Extracted content, or null when Readability finds
     *   no article or throws.
     */
    extractWithReadability(html, url) {
        try {
            const dom = new jsdom_1.JSDOM(html, { url });
            const document = dom.window.document;
            const reader = new readability_1.Readability(document);
            const article = reader.parse();
            if (!article) {
                return null;
            }
            // Extract structured data
            const structured = this.extractStructuredData(html, url);
            // Calculate metrics
            const wordCount = this.countWords(article.textContent ?? '');
            const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
            return {
                url,
                title: article.title || '',
                content: article.content || '',
                textContent: article.textContent || '',
                excerpt: article.excerpt || undefined,
                byline: article.byline || undefined,
                publishedTime: this.extractPublishedTime(html),
                siteName: article.siteName || this.extractSiteName(html),
                lang: this.extractLanguage(html),
                structured,
                wordCount,
                readingTime,
                confidence: 0.9,
                extractionMethod: 'readability',
                extractedAt: new Date()
            };
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Readability extraction failed:`, error);
            return null;
        }
    }
    /**
     * Extract the article by probing common content selectors with cheerio,
     * falling back to the whole <body>.
     *
     * @param {string} html - Raw page HTML.
     * @param {string} url - URL the HTML was fetched from.
     * @returns {object|null} Extracted content, or null when no candidate
     *   reaches `minContentLength` or an error occurs.
     */
    extractWithFallback(html, url) {
        try {
            const $ = cheerio.load(html);
            // Remove unwanted elements
            const unwantedSelectors = [
                'script', 'style', 'nav', 'header', 'footer',
                '.advertisement', '.ads', '.social-share', '.comments',
                '.sidebar', '.navigation', '.menu', '.popup', '.modal'
            ];
            unwantedSelectors.forEach(selector => $(selector).remove());
            // Extract title
            const title = $('h1').first().text().trim() ||
                $('title').text().trim() ||
                $('meta[property="og:title"]').attr('content') || '';
            // Try different content selectors
            const contentSelectors = [
                '.blog-post__body', // Arista blogs, HubSpot blogs
                'article',
                '.article-content',
                '.post-content',
                '.entry-content',
                '.content',
                'main',
                '#content',
                '.story-body'
            ];
            let content = '';
            // The matched element is tracked alongside its HTML so the text can
            // be read from it directly rather than re-parsing the HTML string.
            let contentElement = null;
            for (const selector of contentSelectors) {
                const element = $(selector).first();
                if (element.length > 0) {
                    content = element.html() || '';
                    contentElement = element;
                    if (content.length > this.minContentLength) {
                        break;
                    }
                }
            }
            // If no specific content area found, try to extract from body
            if (content.length < this.minContentLength) {
                contentElement = $('body').first();
                content = contentElement.html() || '';
            }
            if (!content || content.length < this.minContentLength) {
                return null;
            }
            const textContent = contentElement.text().trim();
            const wordCount = this.countWords(textContent);
            const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
            // Extract structured data
            const structured = this.extractStructuredData(html, url);
            // Only mark the excerpt as truncated when text was actually cut.
            const excerpt = textContent.length > 300
                ? textContent.substring(0, 300) + '...'
                : textContent;
            return {
                url,
                title,
                content,
                textContent,
                excerpt,
                publishedTime: this.extractPublishedTime(html),
                siteName: this.extractSiteName(html),
                lang: this.extractLanguage(html),
                structured,
                wordCount,
                readingTime,
                confidence: 0.6,
                extractionMethod: 'fallback',
                extractedAt: new Date()
            };
        }
        catch (error) {
            console.error(`❌ [ContentExtractor] Fallback extraction failed:`, error);
            return null;
        }
    }
    /**
     * Collect JSON-LD, OpenGraph, Twitter Card and basic microdata from a page.
     *
     * @param {string} html - Raw page HTML.
     * @param {string} url - URL the HTML was fetched from (currently unused).
     * @returns {object|undefined} An object with any of `jsonLd`, `openGraph`,
     *   `twitterCard`, `microdata`, or undefined when nothing was found.
     */
    extractStructuredData(html, url) {
        const structured = {};
        try {
            const $ = cheerio.load(html);
            // Extract JSON-LD
            const jsonLdScripts = [];
            $('script[type="application/ld+json"]').each((_, element) => {
                try {
                    const jsonText = $(element).html();
                    if (jsonText) {
                        const data = JSON.parse(jsonText);
                        jsonLdScripts.push(data);
                    }
                }
                catch {
                    // Skip malformed JSON-LD
                }
            });
            if (jsonLdScripts.length > 0) {
                structured.jsonLd = jsonLdScripts;
            }
            // Extract OpenGraph tags
            const openGraph = {};
            $('meta[property^="og:"]').each((_, element) => {
                const property = $(element).attr('property');
                const content = $(element).attr('content');
                if (property && content) {
                    openGraph[property] = content;
                }
            });
            if (Object.keys(openGraph).length > 0) {
                structured.openGraph = openGraph;
            }
            // Extract Twitter Card tags
            const twitterCard = {};
            $('meta[name^="twitter:"]').each((_, element) => {
                const name = $(element).attr('name');
                const content = $(element).attr('content');
                if (name && content) {
                    twitterCard[name] = content;
                }
            });
            if (Object.keys(twitterCard).length > 0) {
                structured.twitterCard = twitterCard;
            }
            // Extract microdata (basic support)
            const microdata = [];
            $('[itemscope]').each((_, element) => {
                const $item = $(element);
                const itemType = $item.attr('itemtype');
                if (itemType) {
                    const item = { '@type': itemType };
                    $item.find('[itemprop]').each((_, propElement) => {
                        const $prop = $(propElement);
                        const propName = $prop.attr('itemprop');
                        const propValue = $prop.attr('content') || $prop.text().trim();
                        if (propName && propValue) {
                            item[propName] = propValue;
                        }
                    });
                    microdata.push(item);
                }
            });
            if (microdata.length > 0) {
                structured.microdata = microdata;
            }
        }
        catch (error) {
            console.warn(`⚠️ [ContentExtractor] Error extracting structured data:`, error);
        }
        return Object.keys(structured).length > 0 ? structured : undefined;
    }
    /**
     * Find the publication date from common meta tags / elements.
     *
     * @param {string} html - Raw page HTML.
     * @returns {Date|undefined} The first candidate that parses to a valid
     *   Date, or undefined.
     */
    extractPublishedTime(html) {
        try {
            const $ = cheerio.load(html);
            // Try different selectors for published time
            const timeSelectors = [
                'meta[property="article:published_time"]',
                'meta[name="datePublished"]',
                'meta[name="publishdate"]',
                'time[datetime]',
                '.published-date',
                '.publish-date',
                '.article-date'
            ];
            for (const selector of timeSelectors) {
                const element = $(selector).first();
                if (element.length > 0) {
                    const timeStr = element.attr('content') || element.attr('datetime') || element.text().trim();
                    if (timeStr) {
                        const date = new Date(timeStr);
                        if (!Number.isNaN(date.getTime())) {
                            return date;
                        }
                    }
                }
            }
            return undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Read the site name from `og:site_name` or `application-name` meta tags.
     *
     * @param {string} html - Raw page HTML.
     * @returns {string|undefined}
     */
    extractSiteName(html) {
        try {
            const $ = cheerio.load(html);
            return $('meta[property="og:site_name"]').attr('content') ||
                $('meta[name="application-name"]').attr('content') ||
                undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Read the page language from `<html lang>` or language meta tags.
     *
     * @param {string} html - Raw page HTML.
     * @returns {string|undefined}
     */
    extractLanguage(html) {
        try {
            const $ = cheerio.load(html);
            return $('html').attr('lang') ||
                $('meta[name="language"]').attr('content') ||
                $('meta[http-equiv="content-language"]').attr('content') ||
                undefined;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Count whitespace-separated words.
     *
     * @param {string} text
     * @returns {number} 0 for empty/missing text.
     */
    countWords(text) {
        if (!text)
            return 0;
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
    /**
     * Validate extracted content quality.
     *
     * @param {object} content - Object with `title`, `content` (HTML) and
     *   `textContent` (plain text) string properties.
     * @returns {{isValid: boolean, issues: string[], score: number}} `score`
     *   starts at 1.0 and is reduced per issue (clamped at 0); `isValid` is
     *   true only when no issue was recorded and the score is at least 0.5.
     */
    validateContent(content) {
        const issues = [];
        let score = 1.0;
        // Check minimum content length
        if (content.textContent.length < this.minContentLength) {
            issues.push(`Content too short: ${content.textContent.length} characters`);
            score -= 0.5;
        }
        // Check title quality
        if (!content.title || content.title.length < 10) {
            issues.push('Missing or too short title');
            score -= 0.2;
        }
        else if (content.title.length > 200) {
            issues.push('Title too long');
            score -= 0.1;
        }
        // Check content-to-HTML ratio (detect pages with too much markup).
        // Skipped for empty HTML so the ratio is never computed by dividing by zero.
        const htmlLength = content.content.length;
        const textLength = content.textContent.length;
        if (htmlLength > 0 && textLength / htmlLength < 0.1) {
            issues.push('Low text-to-HTML ratio - may be poorly extracted');
            score -= 0.2;
        }
        // Check for duplicate content indicators. Sentences are trimmed before
        // comparison so leading whitespace after a period does not hide repeats.
        const sentences = content.textContent
            .split('.')
            .map(sentence => sentence.trim())
            .filter(sentence => sentence.length > 10);
        const duplicateRatio = sentences.length > 0
            ? (sentences.length - new Set(sentences).size) / sentences.length
            : 0;
        if (duplicateRatio > 0.3) {
            issues.push('High duplicate content detected');
            score -= 0.3;
        }
        return {
            isValid: issues.length === 0 && score >= 0.5,
            issues,
            score: Math.max(0, score)
        };
    }
}
528
+ exports.ContentExtractor = ContentExtractor;
529
+ // Default global instance
530
+ exports.globalContentExtractor = new ContentExtractor();
531
+ //# sourceMappingURL=content-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-extractor.js","sourceRoot":"","sources":["../../../lib/web-scrapers/content-extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAmD;AACnD,iCAA8B;AAC9B,iDAAmC;AACnC,sDAA6B;AAC7B,oEAA6D;AAC7D,qDAAuD;AAsCvD,MAAa,gBAAgB;IAQ3B;QAPiB,cAAS,GAAG,6EAA6E,CAAC;QAC1F,YAAO,GAAG,KAAK,CAAC,CAAC,aAAa;QAC9B,mBAAc,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,WAAW;QAC9C,qBAAgB,GAAG,GAAG,CAAC,CAAC,yBAAyB;QACjD,mBAAc,GAAG,GAAG,CAAC,CAAC,wBAAwB;QAI7D,IAAI,CAAC,cAAc,GAAG;YACpB,WAAW,EAAE,CAAC,GAAW,EAAW,EAAE;gBACpC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;oBAEjC,8BAA8B;oBAC9B,MAAM,aAAa,GAAG;wBACpB,QAAQ,EAAqB,yBAAyB;wBACtD,OAAO,EAAsB,uBAAuB;wBACpD,+BAA+B,EAAG,0BAA0B;wBAC5D,aAAa,EAAgB,2BAA2B;wBACxD,aAAa,EAAgB,8BAA8B;wBAC3D,OAAO,EAAsB,gBAAgB;wBAC7C,QAAQ,EAAqB,kBAAkB;wBAC/C,QAAQ,EAAqB,oBAAoB;wBACjD,QAAQ,CAAqB,oBAAoB;qBAClD,CAAC;oBAEF,OAAO,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;gBAC3D,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,IAAI,CAAC,CAAC,8BAA8B;gBAC7C,CAAC;YACH,CAAC;YAED,WAAW,EAAE,CAAC,GAAW,EAAW,EAAE;gBACpC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;oBAC/C,OAAO,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,KAAK,CAAC;gBACpF,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YAED,iBAAiB,EAAE,CAAC,GAAW,EAAW,EAAE;gBAC1C,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC5B,OAAO,MAAM,CAAC,QAAQ,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,QAAQ,CAAC;gBACrE,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC;SACF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,GAAW;QAC9B,OAAO,CAAC,GAAG,CAAC,0DAA0D,GAAG,EAAE,CAAC,CAAC;QAE7E,IAAI,CAAC;YACH,kBAAkB;YAClB,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC;gBAChD,MAAM,IAAI,KAAK,CAAC,wBAAwB,GAAG,EAAE,CAAC,CAAC;YACjD,CAAC;YAED,IAAI,IAAI,CAAC,cAAc,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,cAAc,
CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjF,MAAM,IAAI,KAAK,CAAC,iCAAiC,GAAG,EAAE,CAAC,CAAC;YAC1D,CAAC;YAED,8BAA8B;YAC9B,MAAM,WAAW,GAAG,MAAM,oCAAmB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,oDAAoD,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;gBAChG,OAAO,IAAI,CAAC;YACd,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YAC1C,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,OAAO,IAAI,CAAC;YACd,CAAC;YAED,yCAAyC;YACzC,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAExD,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;gBACvE,OAAO,IAAI,CAAC;YACd,CAAC;YAED,6BAA6B;YAC7B,IAAI,SAAS,CAAC,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACzD,OAAO,CAAC,IAAI,CAAC,4CAA4C,SAAS,CAAC,WAAW,CAAC,MAAM,YAAY,GAAG,EAAE,CAAC,CAAC;gBACxG,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,+CAA+C,SAAS,CAAC,SAAS,eAAe,GAAG,EAAE,CAAC,CAAC;YACpG,OAAO,SAAS,CAAC;QAEnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,sDAAsD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACnF,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY,CAChB,IAAc,EACd,UAGI,EAAE;QAEN,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,+DAA+D,IAAI,CAAC,MAAM,uBAAuB,WAAW,GAAG,CAAC,CAAC;QAE7H,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;QAClC,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CACb,KAAK,CAAC,KAAK,IAAI,EAAE;YACf,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;gBAC9C,SAAS,EAAE,CAAC;gBACZ,OAAO,CAAC,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;gBAClD,OAAO,MAAM,CAAC;YAChB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,KAAK,CAAC,sDAAsD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;gBACnF,SAAS,EAAE,CAAC;gBACZ,OAAO,CAAC,UAAU,EAAE,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC,CACH,CACF,CAAC;QAEF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,yCAAyC
,UAAU,IAAI,IAAI,CAAC,MAAM,aAAa,CAAC,CAAC;QAC7F,OAAO,OAAO,CAAC;IACjB,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,GAAW;QACpC,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAErE,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,OAAO,EAAE;4BACP,YAAY,EAAE,IAAI,CAAC,SAAS;4BAC5B,QAAQ,EAAE,iEAAiE;4BAC3E,iBAAiB,EAAE,gBAAgB;yBACpC;wBACD,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;oBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;wBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;oBACrE,CAAC;oBAED,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;oBAC7D,IAAI,aAAa,IAAI,QAAQ,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACnE,MAAM,IAAI,KAAK,CAAC,sBAAsB,aAAa,QAAQ,CAAC,CAAC;oBAC/D,CAAC;oBAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;oBAEnC,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;wBACtC,MAAM,IAAI,KAAK,CAAC,sBAAsB,IAAI,CAAC,MAAM,QAAQ,CAAC,CAAC;oBAC7D,CAAC;oBAED,OAAO,IAAI,CAAC;gBAEd,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,oDAAoD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACjF,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,GAAW;QACrD,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,IAAI,CAAC;YACH,wCAAwC;YACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC,sBAAsB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACjE,IAAI,iBAAiB,IAAI,iBAAiB,CAAC,WAAW,CAAC,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACvF,OAAO;oBACL,GAAG,iBAAiB;oBACpB,gBAAgB,EAAE,aAAa;oBAC/B,UAAU,EAAE,GAAG;iBAChB,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,oDAAoD,CAAC,CAAC;YACpE,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,sBAAsB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QAChG,CAAC;QAED,IAAI,CAAC;YACH,gCAAgC;YAChC,MAAM,cAAc,GAAG,IAAI,CA
AC,mBAAmB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAC3D,IAAI,cAAc,IAAI,cAAc,CAAC,WAAW,CAAC,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACjF,OAAO;oBACL,GAAG,cAAc;oBACjB,gBAAgB,EAAE,UAAU;oBAC5B,UAAU,EAAE,GAAG;oBACf,MAAM;iBACP,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,iDAAiD,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,mBAAmB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC;QAC7F,CAAC;QAED,oCAAoC;QACpC,OAAO,CAAC,KAAK,CAAC,0DAA0D,GAAG,GAAG,EAAE,MAAM,CAAC,CAAC;QACxF,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,sBAAsB,CAAC,IAAY,EAAE,GAAW;QACtD,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,aAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,MAAM,MAAM,GAAG,IAAI,yBAAW,CAAC,QAAQ,CAAC,CAAC;YACzC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,IAAI,CAAC;YACd,CAAC;YAED,0BAA0B;YAC1B,MAAM,UAAU,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEzD,oBAAoB;YACpB,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;YAC7D,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;YAE/D,OAAO;gBACL,GAAG;gBACH,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;gBAC1B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;gBAC9B,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;gBACtC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,aAAa,EAAE,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;gBAC9C,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBACxD,IAAI,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBAChC,UAAU;gBACV,SAAS;gBACT,WAAW;gBACX,UAAU,EAAE,GAAG;gBACf,gBAAgB,EAAE,aAAa;gBAC/B,WAAW,EAAE,IAAI,IAAI,EAAE;aACxB,CAAC;QAEJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,qDAAqD,EAAE,KAAK,CAAC,CAAC;YAC5E,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,mBAAmB,CAAC,IAAY,EAAE,GAAW;QACnD,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,2BAA2B;YAC3B,MAAM,iBAAiB,GAAG;gBACxB,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,QAAQ;gBAC5C,gBAAgB,EAAE,MAAM,EAAE,eAAe,EAAE,
WAAW;gBACtD,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ;aACvD,CAAC;YAEF,iBAAiB,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAE5D,+BAA+B;YAC/B,IAAI,OAAO,GAAG,EAAE,CAAC;YACjB,IAAI,KAAK,GAAG,EAAE,CAAC;YAEf,gBAAgB;YAChB,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;gBACnC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE;gBACxB,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;YAEvD,kCAAkC;YAClC,MAAM,gBAAgB,GAAG;gBACvB,kBAAkB,EAAO,8BAA8B;gBACvD,SAAS;gBACT,kBAAkB;gBAClB,eAAe;gBACf,gBAAgB;gBAChB,UAAU;gBACV,MAAM;gBACN,UAAU;gBACV,aAAa;aACd,CAAC;YAEF,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;gBACxC,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;gBACpC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;oBAC/B,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;wBAC3C,MAAM;oBACR,CAAC;gBACH,CAAC;YACH,CAAC;YAED,8DAA8D;YAC9D,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBAC3C,OAAO,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;YACnC,CAAC;YAED,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACvD,OAAO,IAAI,CAAC;YACd,CAAC;YAED,MAAM,WAAW,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,CAAC;YAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;YAE/D,0BAA0B;YAC1B,MAAM,UAAU,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEzD,OAAO;gBACL,GAAG;gBACH,KAAK;gBACL,OAAO;gBACP,WAAW;gBACX,OAAO,EAAE,WAAW,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK;gBAC9C,aAAa,EAAE,IAAI,CAAC,oBAAoB,CAAC,IAAI,CAAC;gBAC9C,QAAQ,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBACpC,IAAI,EAAE,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC;gBAChC,UAAU;gBACV,SAAS;gBACT,WAAW;gBACX,UAAU,EAAE,GAAG;gBACf,gBAAgB,EAAE,UAAU;gBAC5B,WAAW,EAAE,IAAI,IAAI,EAAE;aACxB,CAAC;QAEJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,kDAAkD,EAAE,KAAK,CAAC,CAAC;YACzE,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,qBAAqB,CAAC,IAA
Y,EAAE,GAAW;QACrD,MAAM,UAAU,GAAmC,EAAE,CAAC;QAEtD,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,kBAAkB;YAClB,MAAM,aAAa,GAAU,EAAE,CAAC;YAChC,CAAC,CAAC,oCAAoC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC1D,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;oBACnC,IAAI,QAAQ,EAAE,CAAC;wBACb,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBAClC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,yBAAyB;gBAC3B,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,UAAU,CAAC,MAAM,GAAG,aAAa,CAAC;YACpC,CAAC;YAED,yBAAyB;YACzB,MAAM,SAAS,GAA2B,EAAE,CAAC;YAC7C,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC7C,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBAC7C,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC3C,IAAI,QAAQ,IAAI,OAAO,EAAE,CAAC;oBACxB,SAAS,CAAC,QAAQ,CAAC,GAAG,OAAO,CAAC;gBAChC,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtC,UAAU,CAAC,SAAS,GAAG,SAAS,CAAC;YACnC,CAAC;YAED,4BAA4B;YAC5B,MAAM,WAAW,GAA2B,EAAE,CAAC;YAC/C,CAAC,CAAC,wBAAwB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC9C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,MAAM,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAC3C,IAAI,IAAI,IAAI,OAAO,EAAE,CAAC;oBACpB,WAAW,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC;gBAC9B,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxC,UAAU,CAAC,WAAW,GAAG,WAAW,CAAC;YACvC,CAAC;YAED,oCAAoC;YACpC,MAAM,SAAS,GAAU,EAAE,CAAC;YAC5B,CAAC,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACnC,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;gBACzB,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACxC,IAAI,QAAQ,EAAE,CAAC;oBACb,MAAM,IAAI,GAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;oBACxC,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE;wBAC/C,MAAM,KAAK,GAAG,CAAC,CAA
C,WAAW,CAAC,CAAC;wBAC7B,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;wBACxC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;wBAC/D,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;4BAC1B,IAAI,CAAC,QAAQ,CAAC,GAAG,SAAS,CAAC;wBAC7B,CAAC;oBACH,CAAC,CAAC,CAAC;oBACH,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,UAAU,CAAC,SAAS,GAAG,SAAS,CAAC;YACnC,CAAC;QAEH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,yDAAyD,EAAE,KAAK,CAAC,CAAC;QACjF,CAAC;QAED,OAAO,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;IACrE,CAAC;IAEO,oBAAoB,CAAC,IAAY;QACvC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,6CAA6C;YAC7C,MAAM,aAAa,GAAG;gBACpB,yCAAyC;gBACzC,4BAA4B;gBAC5B,0BAA0B;gBAC1B,gBAAgB;gBAChB,iBAAiB;gBACjB,eAAe;gBACf,eAAe;aAChB,CAAC;YAEF,KAAK,MAAM,QAAQ,IAAI,aAAa,EAAE,CAAC;gBACrC,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;gBACpC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACvB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;oBAC7F,IAAI,OAAO,EAAE,CAAC;wBACZ,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC;wBAC/B,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;4BAC3B,OAAO,IAAI,CAAC;wBACd,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,SAAS,CAAC;QACnB,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QAClC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,OAAO,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBACvD,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBAClD,SAAS,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,IAAY;QAClC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC;gBAC3B,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC;gBAC1C,CAAC,CAAC,qCAAqC,CAAC,CAAC,IAAI,CAAC,SA
AS,CAAC;gBACxD,SAAS,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,UAAU,CAAC,IAAY;QAC7B,IAAI,CAAC,IAAI;YAAE,OAAO,CAAC,CAAC;QACpB,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IACzE,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,OAAyB;QAKvC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,KAAK,GAAG,GAAG,CAAC;QAEhB,+BAA+B;QAC/B,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACvD,MAAM,CAAC,IAAI,CAAC,sBAAsB,OAAO,CAAC,WAAW,CAAC,MAAM,aAAa,CAAC,CAAC;YAC3E,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAChD,MAAM,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC;YAC1C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;aAAM,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YACtC,MAAM,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;YAC9B,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,kEAAkE;QAClE,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC;QAC1C,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,MAAM,CAAC;QAC9C,MAAM,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;QAEtC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;YAChB,MAAM,CAAC,IAAI,CAAC,kDAAkD,CAAC,CAAC;YAChE,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,yCAAyC;QACzC,MAAM,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC;QACnF,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAC3C,MAAM,cAAc,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC;QAEpF,IAAI,cAAc,GAAG,GAAG,EAAE,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;YAC/C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;QAED,OAAO;YACL,OAAO,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,IAAI,GAAG;YAC5C,MAAM;YACN,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC;SAC1B,CAAC;IACJ,CAAC;CACF;AA7iBD,4CA6iBC;AAED,0BAA0B;AACb,QAAA,sBAAsB,GAAG,IAAI,gBAAgB,EAAE,CAAC"}
@@ -0,0 +1,74 @@
1
+ declare const PERPLEXITY_MODELS: {
2
+ readonly SONAR: "llama-3.1-sonar-small-128k-online";
3
+ readonly SONAR_PRO: "llama-3.1-sonar-large-128k-online";
4
+ };
5
+ export interface ScrapingConfig {
6
+ selectors?: {
7
+ articleLinks?: string[];
8
+ titleSelectors?: string[];
9
+ dateSelectors?: string[];
10
+ excludeSelectors?: string[];
11
+ };
12
+ filters?: {
13
+ minTitleLength?: number;
14
+ maxTitleLength?: number;
15
+ includePatterns?: RegExp[];
16
+ excludePatterns?: RegExp[];
17
+ allowedDomains?: string[];
18
+ };
19
+ limits?: {
20
+ maxLinksPerPage?: number;
21
+ maxDepth?: number;
22
+ };
23
+ perplexityFallback?: {
24
+ enabled?: boolean;
25
+ model?: typeof PERPLEXITY_MODELS[keyof typeof PERPLEXITY_MODELS];
26
+ useForRobotsBlocked?: boolean;
27
+ useForParseFailed?: boolean;
28
+ searchRecency?: 'hour' | 'day' | 'week' | 'month';
29
+ };
30
+ }
31
+ export interface ExtractedArticle {
32
+ url: string;
33
+ title?: string;
34
+ publishedDate?: Date;
35
+ description?: string;
36
+ confidence: number;
37
+ source: 'link-text' | 'meta-data' | 'structured-data';
38
+ }
39
+ export declare class HTMLScraper {
40
+ private readonly userAgent;
41
+ private readonly timeout;
42
+ private readonly defaultConfig;
43
+ /**
44
+ * Extract article links from a webpage
45
+ */
46
+ extractArticleLinks(url: string, config?: ScrapingConfig): Promise<ExtractedArticle[]>;
47
+ /**
48
+ * Extract articles from multiple pages with pagination support
49
+ */
50
+ extractFromMultiplePages(startUrl: string, config?: ScrapingConfig, options?: {
51
+ maxPages?: number;
52
+ paginationSelector?: string;
53
+ nextPagePatterns?: RegExp[];
54
+ }): Promise<ExtractedArticle[]>;
55
+ private fetchPage;
56
+ private parseArticleLinks;
57
+ private extractArticleInfo;
58
+ private extractStructuredData;
59
+ private findNextPageUrls;
60
+ private deduplicateArticles;
61
+ private passesFilters;
62
+ private isLikelyArticleUrl;
63
+ private parseDate;
64
+ private resolveUrl;
65
+ private mergeConfig;
66
+ /**
67
+ * Use Perplexity API to extract articles when traditional scraping fails
68
+ * Requires PERPLEXITY_API_KEY environment variable to be set
69
+ */
70
+ private extractWithPerplexity;
71
+ }
72
+ export declare const globalHTMLScraper: HTMLScraper;
73
+ export {};
74
+ //# sourceMappingURL=html-scraper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-scraper.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/html-scraper.ts"],"names":[],"mappings":"AAMA,QAAA,MAAM,iBAAiB;;;CAGb,CAAC;AAEX,MAAM,WAAW,cAAc;IAC7B,SAAS,CAAC,EAAE;QACV,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;QACxB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC7B,CAAC;IACF,OAAO,CAAC,EAAE;QACR,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;QAC3B,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;QAC3B,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;KAC3B,CAAC;IACF,MAAM,CAAC,EAAE;QACP,eAAe,CAAC,EAAE,MAAM,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,kBAAkB,CAAC,EAAE;QACnB,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,KAAK,CAAC,EAAE,OAAO,iBAAiB,CAAC,MAAM,OAAO,iBAAiB,CAAC,CAAC;QACjE,mBAAmB,CAAC,EAAE,OAAO,CAAC;QAC9B,iBAAiB,CAAC,EAAE,OAAO,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,OAAO,CAAC;KACnD,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,IAAI,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,WAAW,GAAG,WAAW,GAAG,iBAAiB,CAAC;CACvD;AAED,qBAAa,WAAW;IACtB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiF;IAC3G,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAqE5B;IAEF;;OAEG;IACG,mBAAmB,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,GAAE,cAAmB,GAC1B,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAqD9B;;OAEG;IACG,wBAAwB,CAC5B,QAAQ,EAAE,MAAM,EAChB,MAAM,GAAE,cAAmB,EAC3B,OAAO,GAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;KACxB,GACL,OAAO,CAAC,gBAAgB,EAAE,CAAC;YAgDhB,SAAS;IAuCvB,OAAO,CAAC,iBAAiB;IA4DzB,OAAO,CAAC,kBAAkB;IAqE1B,OAAO,CAAC,qBAAqB;YAsCf,gBAAgB;IAuC9B,OAAO,CAAC,mBAAmB;IAa3B,OAAO,CAAC,aAAa;IAkCrB,OAAO,CAAC,kBAAkB;IAiB1B,OAAO,CAAC,SAAS;IA8BjB,OAAO,CAAC,UAAU;IAQlB,OAAO,CAAC,WAAW;IAiCnB;;;OAGG;YACW,qBAAqB;CAyEpC;AAGD,eAAO,MAAM,iBAAiB,aAAoB,CAAC"}