@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,840 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.globalSourceOrchestrator = exports.SourceOrchestrator = exports.SourceConfigSchema = exports.CandidateArticleSchema = void 0;
7
+ const zod_1 = require("zod");
8
+ const crypto_1 = __importDefault(require("crypto"));
9
+ const p_limit_1 = __importDefault(require("p-limit"));
10
+ const rss_utils_1 = require("./rss-utils");
11
+ const rss_discovery_1 = require("./web-scrapers/rss-discovery");
12
+ const sitemap_parser_1 = require("./web-scrapers/sitemap-parser");
13
+ const html_scraper_1 = require("./web-scrapers/html-scraper");
14
+ const content_extractor_1 = require("./web-scrapers/content-extractor");
15
+ const robots_checker_1 = require("./web-scrapers/robots-checker");
16
+ const playwright_scraper_1 = require("./web-scrapers/playwright-scraper");
17
+ const quality_scorer_1 = require("./quality-scorer");
18
+ // Create instances
19
+ const globalHTMLScraper = new html_scraper_1.HTMLScraper();
20
+ const globalContentExtractor = new content_extractor_1.ContentExtractor();
21
+ const globalRobotsChecker = new robots_checker_1.RobotsChecker();
22
+ const circuit_breaker_1 = require("./circuit-breaker");
23
// Zod schemas for type safety
// Shape of a single article candidate produced by any extraction path
// (RSS, sitemap, HTML link scraping, or feed/sitemap discovery).
exports.CandidateArticleSchema = zod_1.z.object({
    url: zod_1.z.string().url(),
    title: zod_1.z.string().min(1),
    publishedAt: zod_1.z.date(),
    content: zod_1.z.string().optional(),
    excerpt: zod_1.z.string().optional(),
    guid: zod_1.z.string(),
    // Extraction confidence in [0, 1]; used for sorting and stats downstream.
    confidence: zod_1.z.number().min(0).max(1),
    source: zod_1.z.enum(['rss', 'sitemap', 'html', 'discovery']),
    extractionMethod: zod_1.z.enum(['rss', 'sitemap', 'html-links', 'content-extraction']),
    metadata: zod_1.z.record(zod_1.z.any()).optional()
});
// Per-source configuration accepted by SourceOrchestrator.processSource().
// NOTE(review): processSource() also reads an optional `config.circuitBreaker`
// override, which this schema does not declare — confirm intended.
exports.SourceConfigSchema = zod_1.z.object({
    sourceType: zod_1.z.enum(['rss', 'sitemap', 'html', 'auto']),
    // Path filters; patterns support a trailing "/*" and "*" wildcards
    // (see matchesPattern()).
    allowPaths: zod_1.z.array(zod_1.z.string()).optional(),
    denyPaths: zod_1.z.array(zod_1.z.string()).optional(),
    maxDepth: zod_1.z.number().int().min(1).max(5).optional(),
    detectOnly: zod_1.z.boolean().optional(),
    scrapeConfig: zod_1.z.object({
        selectors: zod_1.z.object({
            articleLinks: zod_1.z.array(zod_1.z.string()).optional(),
            titleSelectors: zod_1.z.array(zod_1.z.string()).optional(),
            dateSelectors: zod_1.z.array(zod_1.z.string()).optional(),
            excludeSelectors: zod_1.z.array(zod_1.z.string()).optional()
        }).optional(),
        filters: zod_1.z.object({
            minTitleLength: zod_1.z.number().optional(),
            maxTitleLength: zod_1.z.number().optional(),
            // Strings compiled to case-insensitive RegExps in buildScrapingConfig().
            includePatterns: zod_1.z.array(zod_1.z.string()).optional(),
            excludePatterns: zod_1.z.array(zod_1.z.string()).optional()
        }).optional(),
        limits: zod_1.z.object({
            maxLinksPerPage: zod_1.z.number().optional(),
            maxPages: zod_1.z.number().optional()
        }).optional()
    }).optional()
});
61
+ class SourceOrchestrator {
62
constructor() {
    // Hard cap on the number of candidate articles kept per source
    // (applied both as a sitemap fetch limit and in finalizeResult()).
    this.maxArticlesPerSource = 1000;
    // NOTE(review): recentTimeframe is not referenced in this file's visible
    // code — confirm it is still used before relying on or removing it.
    this.recentTimeframe = 48 * 60 * 60 * 1000; // 48 hours
    /**
     * Common content section paths - prioritized for news/blog content
     */
    this.contentSectionPaths = [
        '/news', '/blog', '/articles', '/posts', '/stories',
        '/press', '/updates', '/announcements', '/insights',
        '/resources', '/publications', '/research', '/engineering'
    ];
    /**
     * Common blog subdomains to check when scraping root domains
     * Many companies host their blogs on separate subdomains
     */
    this.blogSubdomains = [
        'blog', 'blogs', 'news', 'newsroom', 'press',
        'engineering', 'developers', 'ai', 'research'
    ];
}
82
+ /**
83
+ * Infer path filters from the input URL
84
+ * e.g., if user enters anthropic.com/news, filter results to /news/* paths
85
+ */
86
+ inferPathFiltersFromUrl(url, config) {
87
+ try {
88
+ const urlObj = new URL(url);
89
+ const path = urlObj.pathname.toLowerCase();
90
+ // If URL has a meaningful path (not just /), infer allowPaths
91
+ if (path && path !== '/' && path.length > 1) {
92
+ for (const contentPath of this.contentSectionPaths) {
93
+ if (path.startsWith(contentPath)) {
94
+ // Only add if user hasn't explicitly set allowPaths
95
+ if (!config.allowPaths?.length) {
96
+ console.log(`🔍 [Orchestrator] Inferring path filter from URL: ${contentPath}/*`);
97
+ return {
98
+ ...config,
99
+ allowPaths: [`${contentPath}/*`, `${contentPath}`]
100
+ };
101
+ }
102
+ }
103
+ }
104
+ // For other paths, use the exact path as prefix
105
+ if (!config.allowPaths?.length && path.length > 3) {
106
+ const pathPrefix = path.endsWith('/') ? path.slice(0, -1) : path;
107
+ console.log(`🔍 [Orchestrator] Inferring path filter from URL: ${pathPrefix}/*`);
108
+ return {
109
+ ...config,
110
+ allowPaths: [`${pathPrefix}/*`, pathPrefix]
111
+ };
112
+ }
113
+ }
114
+ }
115
+ catch (error) {
116
+ // Ignore URL parsing errors
117
+ }
118
+ return config;
119
+ }
120
+ /**
121
+ * Discover content sections from sitemap when user enters root domain
122
+ * Returns prioritized list of content paths found
123
+ */
124
+ discoverContentSectionsFromSitemap(entries) {
125
+ const pathCounts = new Map();
126
+ for (const entry of entries) {
127
+ try {
128
+ const urlObj = new URL(entry.url);
129
+ const pathParts = urlObj.pathname.split('/').filter(Boolean);
130
+ if (pathParts.length >= 1) {
131
+ const firstPath = '/' + pathParts[0].toLowerCase();
132
+ // Only count if it's a potential content section
133
+ if (this.contentSectionPaths.includes(firstPath) ||
134
+ firstPath.match(/^\/(news|blog|post|article|stor|update|press)/i)) {
135
+ pathCounts.set(firstPath, (pathCounts.get(firstPath) || 0) + 1);
136
+ }
137
+ }
138
+ }
139
+ catch {
140
+ // Skip invalid URLs
141
+ }
142
+ }
143
+ // Sort by count (most content first) and return paths with 3+ entries
144
+ const sortedPaths = Array.from(pathCounts.entries())
145
+ .filter(([_, count]) => count >= 3)
146
+ .sort((a, b) => b[1] - a[1])
147
+ .map(([path]) => path);
148
+ if (sortedPaths.length > 0) {
149
+ console.log(`🔍 [Orchestrator] Discovered content sections: ${sortedPaths.join(', ')}`);
150
+ }
151
+ return sortedPaths;
152
+ }
153
+ /**
154
+ * Filter sitemap entries to content sections when processing root domain
155
+ * Also applies non-English locale filtering
156
+ */
157
+ filterToContentSections(articles, discoveredPaths) {
158
+ if (discoveredPaths.length === 0) {
159
+ return articles;
160
+ }
161
+ // Create allow patterns from discovered paths
162
+ const allowPatterns = discoveredPaths.flatMap(p => [`${p}/*`, p]);
163
+ return articles.filter(article => {
164
+ try {
165
+ const urlObj = new URL(article.url);
166
+ const path = urlObj.pathname.toLowerCase();
167
+ // Filter out non-English locale paths (e.g., /fr-be/, /de-ch/)
168
+ if ((0, quality_scorer_1.isNonEnglishLocalePath)(path)) {
169
+ return false;
170
+ }
171
+ return allowPatterns.some(pattern => this.matchesPattern(path, pattern));
172
+ }
173
+ catch {
174
+ return false;
175
+ }
176
+ });
177
+ }
178
+ /**
179
+ * Discover blog subdomains for a given domain
180
+ * e.g., for nvidia.com, check if blogs.nvidia.com exists
181
+ */
182
+ async discoverBlogSubdomains(domain) {
183
+ // Extract base domain (remove www. if present)
184
+ const baseDomain = domain.replace(/^www\./, '');
185
+ const discoveredSubdomains = [];
186
+ console.log(`🔍 [Orchestrator] Checking for blog subdomains of ${baseDomain}...`);
187
+ // Check each potential blog subdomain
188
+ for (const subdomain of this.blogSubdomains) {
189
+ const subdomainUrl = `https://${subdomain}.${baseDomain}`;
190
+ try {
191
+ // Quick HEAD request to check if subdomain exists
192
+ const controller = new AbortController();
193
+ const timeoutId = setTimeout(() => controller.abort(), 3000);
194
+ const response = await fetch(subdomainUrl, {
195
+ method: 'HEAD',
196
+ signal: controller.signal,
197
+ headers: {
198
+ 'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)'
199
+ }
200
+ });
201
+ clearTimeout(timeoutId);
202
+ if (response.ok || response.status === 301 || response.status === 302) {
203
+ console.log(`✅ [Orchestrator] Found blog subdomain: ${subdomainUrl}`);
204
+ discoveredSubdomains.push(subdomainUrl);
205
+ }
206
+ }
207
+ catch {
208
+ // Subdomain doesn't exist or timeout, skip it
209
+ }
210
+ }
211
+ return discoveredSubdomains;
212
+ }
213
/**
 * Main orchestration method - determines source type and extracts content.
 *
 * Builds a result skeleton, then delegates to auto-detection or the known
 * type handler under circuit-breaker protection. Never throws: failures are
 * recorded in result.errors and an (empty) result is returned.
 *
 * NOTE(review): this also reads an optional `config.circuitBreaker` override,
 * which SourceConfigSchema does not declare — confirm intended.
 */
async processSource(url, config = { sourceType: 'auto' }) {
    const startTime = Date.now();
    // Infer path filters from input URL
    config = this.inferPathFiltersFromUrl(url, config);
    console.log(`🎭 [Orchestrator] Processing source: ${url} (type: ${config.sourceType})`);
    // Result skeleton, mutated in place by the processing helpers below.
    const result = {
        articles: [],
        sourceInfo: {
            detectedType: 'html',
            extractionStats: {
                attempted: 0,
                successful: 0,
                failed: 0,
                filtered: 0
            }
        },
        processingTime: 0,
        errors: []
    };
    try {
        // Apply circuit breaker protection (use custom if provided, otherwise default)
        const breaker = config.circuitBreaker || circuit_breaker_1.circuitBreakers.scraping;
        return await breaker.execute(async () => {
            if (config.sourceType === 'auto') {
                return await this.autoDetectAndProcess(url, config, result);
            }
            else {
                return await this.processKnownType(url, config, result);
            }
        });
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        console.error(`❌ [Orchestrator] Failed to process source ${url}:`, errorMessage);
        result.errors.push(errorMessage);
        // NOTE(review): startTime is only used on this error path. Success paths
        // go through finalizeResult(), which derives the duration from the
        // initial 0 and so reports ~0ms — confirm whether startTime should be
        // threaded through to finalizeResult instead.
        result.processingTime = Date.now() - startTime;
        return result;
    }
}
255
/**
 * Auto-detect source type and process accordingly.
 *
 * Tries extraction strategies in decreasing order of reliability and returns
 * on the first one that yields any articles:
 *   1. direct RSS            2. RSS feeds discovered from the page HTML
 *   3. direct sitemap        4. sitemaps discovered from the domain
 *   5. blog subdomains       6. static HTML scraping
 *   7. Playwright (for JS-rendered pages)
 * Each failed step appends a message to result.errors and falls through to
 * the next; the step order is load-bearing, so do not reorder.
 */
async autoDetectAndProcess(url, config, result) {
    console.log(`🔍 [Orchestrator] Auto-detecting source type for ${url}`);
    // Step 1: Try RSS first (most reliable)
    try {
        const rssArticles = await this.processAsRSS(url);
        if (rssArticles.length > 0) {
            result.sourceInfo.detectedType = 'rss';
            // RSS is already curated content - only apply deny filters, not allow filters
            result.articles = this.applyPathFilters(rssArticles, config, { skipAllowFilters: true });
            console.log(`✅ [Orchestrator] Detected as RSS feed: ${result.articles.length} articles`);
            return this.finalizeResult(result);
        }
    }
    catch (error) {
        result.errors.push(`RSS detection failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 2: Discover RSS feeds from HTML
    try {
        const discoveredFeeds = await rss_discovery_1.globalRSSDiscovery.discoverFeeds(url);
        if (discoveredFeeds.length > 0) {
            result.sourceInfo.discoveredFeeds = discoveredFeeds;
            // Try the highest confidence discovered feed
            const bestFeed = discoveredFeeds[0];
            const rssArticles = await this.processAsRSS(bestFeed.url);
            if (rssArticles.length > 0) {
                result.sourceInfo.detectedType = 'rss';
                // RSS is already curated content - only apply deny filters, not allow filters
                result.articles = this.applyPathFilters(rssArticles, config, { skipAllowFilters: true });
                console.log(`✅ [Orchestrator] Using discovered RSS feed: ${result.articles.length} articles`);
                return this.finalizeResult(result);
            }
        }
    }
    catch (error) {
        result.errors.push(`RSS discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 3: Try sitemap parsing
    try {
        const sitemapArticles = await this.processAsSitemap(url);
        if (sitemapArticles.length > 0) {
            result.sourceInfo.detectedType = 'sitemap';
            // If processing root domain, auto-discover and filter to content sections
            const urlObj = new URL(url);
            const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
            if (isRootDomain && !config.allowPaths?.length) {
                const discoveredPaths = this.discoverContentSectionsFromSitemap(sitemapArticles);
                if (discoveredPaths.length > 0) {
                    result.articles = this.filterToContentSections(sitemapArticles, discoveredPaths);
                }
                else {
                    result.articles = this.applyPathFilters(sitemapArticles, config);
                }
            }
            else {
                result.articles = this.applyPathFilters(sitemapArticles, config);
            }
            console.log(`✅ [Orchestrator] Detected as sitemap: ${result.articles.length} articles`);
            return this.finalizeResult(result);
        }
    }
    catch (error) {
        result.errors.push(`Sitemap detection failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 4: Discover sitemaps from domain
    try {
        const urlObj = new URL(url);
        const discoveredSitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(urlObj.hostname);
        if (discoveredSitemaps.length > 0) {
            result.sourceInfo.discoveredSitemaps = discoveredSitemaps;
            // Try the first discovered sitemap
            const sitemapArticles = await this.processAsSitemap(discoveredSitemaps[0]);
            if (sitemapArticles.length > 0) {
                result.sourceInfo.detectedType = 'sitemap';
                // If processing root domain, auto-discover and filter to content sections
                const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
                if (isRootDomain && !config.allowPaths?.length) {
                    const discoveredPaths = this.discoverContentSectionsFromSitemap(sitemapArticles);
                    if (discoveredPaths.length > 0) {
                        result.articles = this.filterToContentSections(sitemapArticles, discoveredPaths);
                    }
                    else {
                        result.articles = this.applyPathFilters(sitemapArticles, config);
                    }
                }
                else {
                    result.articles = this.applyPathFilters(sitemapArticles, config);
                }
                console.log(`✅ [Orchestrator] Using discovered sitemap: ${result.articles.length} articles`);
                return this.finalizeResult(result);
            }
        }
    }
    catch (error) {
        result.errors.push(`Sitemap discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 5: Try blog subdomains (e.g., blogs.nvidia.com for nvidia.com)
    try {
        const urlObj = new URL(url);
        const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
        if (isRootDomain) {
            const blogSubdomains = await this.discoverBlogSubdomains(urlObj.hostname);
            for (const subdomainUrl of blogSubdomains) {
                try {
                    // Try RSS first on subdomain
                    const rssArticles = await this.processAsRSS(subdomainUrl);
                    if (rssArticles.length > 0) {
                        result.sourceInfo.detectedType = 'rss';
                        // NOTE(review): unlike steps 1-2, allow filters ARE applied to
                        // subdomain RSS here (no skipAllowFilters) — confirm intended.
                        result.articles = this.applyPathFilters(rssArticles, config);
                        console.log(`✅ [Orchestrator] Found RSS on subdomain ${subdomainUrl}: ${result.articles.length} articles`);
                        return this.finalizeResult(result);
                    }
                    // Try sitemap on subdomain
                    const subdomainHostname = new URL(subdomainUrl).hostname;
                    const subdomainSitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(subdomainHostname);
                    if (subdomainSitemaps.length > 0) {
                        result.sourceInfo.discoveredSitemaps = [
                            ...(result.sourceInfo.discoveredSitemaps || []),
                            ...subdomainSitemaps
                        ];
                        for (const sitemap of subdomainSitemaps) {
                            const sitemapArticles = await this.processAsSitemap(sitemap);
                            if (sitemapArticles.length > 0) {
                                result.sourceInfo.detectedType = 'sitemap';
                                result.articles = this.applyPathFilters(sitemapArticles, config);
                                console.log(`✅ [Orchestrator] Found sitemap on subdomain ${subdomainUrl}: ${result.articles.length} articles`);
                                return this.finalizeResult(result);
                            }
                        }
                    }
                }
                catch (subError) {
                    // Per-subdomain failures are logged but do not abort the loop.
                    console.log(`⚠️ [Orchestrator] Error processing subdomain ${subdomainUrl}:`, subError);
                }
            }
        }
    }
    catch (error) {
        result.errors.push(`Subdomain discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 6: Fall back to HTML scraping
    try {
        const htmlArticles = await this.processAsHTML(url, config);
        if (htmlArticles.length > 0) {
            result.sourceInfo.detectedType = 'html';
            result.articles = this.applyPathFilters(htmlArticles, config);
            console.log(`✅ [Orchestrator] Falling back to HTML scraping: ${result.articles.length} articles`);
            return this.finalizeResult(result);
        }
    }
    catch (error) {
        result.errors.push(`HTML scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // Step 7: Final fallback - Playwright for JS-rendered pages
    try {
        console.log(`🎭 [Orchestrator] Trying Playwright for JS-rendered content...`);
        const playwrightArticles = await this.processAsPlaywright(url, config);
        if (playwrightArticles.length > 0) {
            result.sourceInfo.detectedType = 'html'; // Still categorize as HTML source
            result.articles = this.applyPathFilters(playwrightArticles, config);
            console.log(`✅ [Orchestrator] Playwright extraction successful: ${result.articles.length} articles`);
            return this.finalizeResult(result);
        }
    }
    catch (error) {
        result.errors.push(`Playwright scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    // No articles found by any method
    console.log(`⚠️ [Orchestrator] No articles found for ${url}`);
    return this.finalizeResult(result);
}
428
+ /**
429
+ * Process source with known type
430
+ */
431
+ async processKnownType(url, config, result) {
432
+ console.log(`🎯 [Orchestrator] Processing as ${config.sourceType}: ${url}`);
433
+ try {
434
+ let articles = [];
435
+ switch (config.sourceType) {
436
+ case 'rss':
437
+ articles = await this.processAsRSS(url);
438
+ result.sourceInfo.detectedType = 'rss';
439
+ break;
440
+ case 'sitemap':
441
+ articles = await this.processAsSitemap(url);
442
+ result.sourceInfo.detectedType = 'sitemap';
443
+ break;
444
+ case 'html':
445
+ articles = await this.processAsHTML(url, config);
446
+ result.sourceInfo.detectedType = 'html';
447
+ break;
448
+ }
449
+ result.articles = this.applyPathFilters(articles, config);
450
+ console.log(`✅ [Orchestrator] Processed ${config.sourceType}: ${result.articles.length} articles`);
451
+ return this.finalizeResult(result);
452
+ }
453
+ catch (error) {
454
+ const errorMessage = error instanceof Error ? error.message : 'Unknown error';
455
+ result.errors.push(`${config.sourceType} processing failed: ${errorMessage}`);
456
+ return this.finalizeResult(result);
457
+ }
458
+ }
459
+ /**
460
+ * Process URL as RSS feed
461
+ */
462
+ async processAsRSS(url) {
463
+ const rssItems = await (0, rss_utils_1.fetchRSSFeed)(url);
464
+ const candidates = [];
465
+ for (const item of rssItems) {
466
+ try {
467
+ const publishedAt = new Date(item.pubDate);
468
+ if (isNaN(publishedAt.getTime())) {
469
+ continue;
470
+ }
471
+ candidates.push({
472
+ url: item.link,
473
+ title: item.title,
474
+ publishedAt,
475
+ content: item.content,
476
+ excerpt: item.contentSnippet,
477
+ guid: item.guid,
478
+ confidence: 0.9,
479
+ source: 'rss',
480
+ extractionMethod: 'rss',
481
+ metadata: {
482
+ originalGuid: item.guid,
483
+ rssSource: url
484
+ }
485
+ });
486
+ }
487
+ catch (error) {
488
+ console.warn(`⚠️ [Orchestrator] Error processing RSS item:`, error);
489
+ continue;
490
+ }
491
+ }
492
+ return candidates;
493
+ }
494
+ /**
495
+ * Process URL as sitemap
496
+ */
497
+ async processAsSitemap(url) {
498
+ // Don't filter by recency - we want all entries from the sitemap
499
+ // Path filtering and quality scoring will handle relevance
500
+ const sitemapEntries = await sitemap_parser_1.globalSitemapParser.parseSitemap(url, {
501
+ filterRecent: false, // Changed: get all entries, filter later by path
502
+ maxEntries: this.maxArticlesPerSource,
503
+ includeNews: true
504
+ });
505
+ const candidates = [];
506
+ for (const entry of sitemapEntries) {
507
+ try {
508
+ const publishedAt = entry.lastmod || new Date();
509
+ candidates.push({
510
+ url: entry.url,
511
+ title: entry.news?.title || this.extractTitleFromUrl(entry.url),
512
+ publishedAt,
513
+ guid: this.createGuid(entry.url, publishedAt.toISOString()),
514
+ confidence: entry.news ? 0.8 : 0.6,
515
+ source: 'sitemap',
516
+ extractionMethod: 'sitemap',
517
+ metadata: {
518
+ changefreq: entry.changefreq,
519
+ priority: entry.priority,
520
+ hasNews: !!entry.news,
521
+ sitemapSource: url
522
+ }
523
+ });
524
+ }
525
+ catch (error) {
526
+ console.warn(`⚠️ [Orchestrator] Error processing sitemap entry:`, error);
527
+ continue;
528
+ }
529
+ }
530
+ return candidates;
531
+ }
532
+ /**
533
+ * Process URL as HTML page
534
+ */
535
+ async processAsHTML(url, config) {
536
+ const scrapingConfig = this.buildScrapingConfig(config);
537
+ const extractedArticles = await globalHTMLScraper.extractFromMultiplePages(url, scrapingConfig, {
538
+ maxPages: config.scrapeConfig?.limits?.maxPages || 3
539
+ });
540
+ const candidates = [];
541
+ for (const article of extractedArticles) {
542
+ try {
543
+ const publishedAt = article.publishedDate || new Date();
544
+ candidates.push({
545
+ url: article.url,
546
+ title: article.title || this.extractTitleFromUrl(article.url),
547
+ publishedAt,
548
+ excerpt: article.description,
549
+ guid: this.createGuid(article.url, publishedAt.toISOString()),
550
+ confidence: article.confidence,
551
+ source: 'html',
552
+ extractionMethod: 'html-links',
553
+ metadata: {
554
+ extractionSource: article.source,
555
+ htmlSource: url
556
+ }
557
+ });
558
+ }
559
+ catch (error) {
560
+ console.warn(`⚠️ [Orchestrator] Error processing HTML article:`, error);
561
+ continue;
562
+ }
563
+ }
564
+ return candidates;
565
+ }
566
+ /**
567
+ * Process URL using Playwright for JavaScript-rendered pages
568
+ * Used as fallback when static HTML scraping fails
569
+ */
570
+ async processAsPlaywright(url, config) {
571
+ const playwrightScraper = (0, playwright_scraper_1.getPlaywrightScraper)();
572
+ const playwrightConfig = {
573
+ timeout: 30000,
574
+ blockMedia: true,
575
+ ...this.buildScrapingConfig(config)
576
+ };
577
+ const extractedArticles = await playwrightScraper.extractArticleLinks(url, playwrightConfig);
578
+ const candidates = [];
579
+ for (const article of extractedArticles) {
580
+ try {
581
+ const publishedAt = article.publishedDate || new Date();
582
+ candidates.push({
583
+ url: article.url,
584
+ title: article.title || this.extractTitleFromUrl(article.url),
585
+ publishedAt,
586
+ excerpt: article.description,
587
+ guid: this.createGuid(article.url, publishedAt.toISOString()),
588
+ confidence: article.confidence,
589
+ source: 'html',
590
+ extractionMethod: 'html-links',
591
+ metadata: {
592
+ extractionSource: 'playwright',
593
+ playwrightRendered: true,
594
+ htmlSource: url
595
+ }
596
+ });
597
+ }
598
+ catch (error) {
599
+ console.warn(`⚠️ [Orchestrator] Error processing Playwright article:`, error);
600
+ continue;
601
+ }
602
+ }
603
+ return candidates;
604
+ }
605
+ /**
606
+ * Apply path filtering based on allowPaths and denyPaths
607
+ * Also filters out non-English locale paths
608
+ *
609
+ * @param articles - Articles to filter
610
+ * @param config - Source configuration
611
+ * @param options - Filtering options
612
+ * @param options.skipAllowFilters - Skip allow path filtering (useful for RSS which is already curated)
613
+ */
614
+ applyPathFilters(articles, config, options = {}) {
615
+ return articles.filter(article => {
616
+ try {
617
+ const urlObj = new URL(article.url);
618
+ const path = urlObj.pathname.toLowerCase();
619
+ // Always filter out non-English locale paths (e.g., /fr-be/, /de-ch/)
620
+ if ((0, quality_scorer_1.isNonEnglishLocalePath)(path)) {
621
+ return false;
622
+ }
623
+ // Check deny patterns first (always apply)
624
+ if (config.denyPaths?.length) {
625
+ for (const pattern of config.denyPaths) {
626
+ if (this.matchesPattern(path, pattern)) {
627
+ console.log(`🚫 [Orchestrator] Article blocked by deny pattern "${pattern}": ${article.url}`);
628
+ return false;
629
+ }
630
+ }
631
+ }
632
+ // Skip allow pattern filtering for RSS (RSS is already curated content)
633
+ if (options.skipAllowFilters) {
634
+ return true;
635
+ }
636
+ // Check allow patterns (only for sitemap/HTML sources)
637
+ if (config.allowPaths?.length) {
638
+ for (const pattern of config.allowPaths) {
639
+ if (this.matchesPattern(path, pattern)) {
640
+ return true;
641
+ }
642
+ }
643
+ console.log(`🚫 [Orchestrator] Article not matching any allow pattern: ${article.url}`);
644
+ return false;
645
+ }
646
+ return true;
647
+ }
648
+ catch (error) {
649
+ console.warn(`⚠️ [Orchestrator] Error applying path filters to ${article.url}:`, error);
650
+ return true; // Default to allowing on error
651
+ }
652
+ });
653
+ }
654
+ /**
655
+ * Check if a path matches a pattern (supports wildcards)
656
+ */
657
+ matchesPattern(path, pattern) {
658
+ const patternLower = pattern.toLowerCase();
659
+ const pathLower = path.toLowerCase();
660
+ // Handle exact match
661
+ if (patternLower === pathLower) {
662
+ return true;
663
+ }
664
+ // Handle simple prefix patterns like /news/*
665
+ if (patternLower.endsWith('/*')) {
666
+ const prefix = patternLower.slice(0, -2); // Remove /*
667
+ return pathLower.startsWith(prefix + '/') || pathLower === prefix;
668
+ }
669
+ // Handle wildcard patterns with regex
670
+ const regexPattern = patternLower
671
+ .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape special chars except *
672
+ .replace(/\*/g, '.*'); // Convert * to .*
673
+ const regex = new RegExp('^' + regexPattern + '$', 'i');
674
+ return regex.test(pathLower);
675
+ }
676
+ /**
677
+ * Build scraping configuration from source config
678
+ */
679
+ buildScrapingConfig(config) {
680
+ const scrapingConfig = {};
681
+ if (config.scrapeConfig?.selectors) {
682
+ scrapingConfig.selectors = {
683
+ articleLinks: config.scrapeConfig.selectors.articleLinks,
684
+ titleSelectors: config.scrapeConfig.selectors.titleSelectors,
685
+ dateSelectors: config.scrapeConfig.selectors.dateSelectors,
686
+ excludeSelectors: config.scrapeConfig.selectors.excludeSelectors
687
+ };
688
+ }
689
+ if (config.scrapeConfig?.filters) {
690
+ scrapingConfig.filters = {
691
+ minTitleLength: config.scrapeConfig.filters.minTitleLength,
692
+ maxTitleLength: config.scrapeConfig.filters.maxTitleLength,
693
+ includePatterns: config.scrapeConfig.filters.includePatterns?.map(p => new RegExp(p, 'i')),
694
+ excludePatterns: config.scrapeConfig.filters.excludePatterns?.map(p => new RegExp(p, 'i'))
695
+ };
696
+ }
697
+ if (config.scrapeConfig?.limits) {
698
+ scrapingConfig.limits = config.scrapeConfig.limits;
699
+ }
700
+ return scrapingConfig;
701
+ }
702
+ /**
703
+ * Extract title from URL as fallback
704
+ */
705
+ extractTitleFromUrl(url) {
706
+ try {
707
+ const urlObj = new URL(url);
708
+ const pathParts = urlObj.pathname.split('/').filter(Boolean);
709
+ const lastPart = pathParts[pathParts.length - 1] || urlObj.hostname;
710
+ return lastPart
711
+ .replace(/[-_]/g, ' ')
712
+ .replace(/\.(html|htm|php|asp|jsp)$/i, '')
713
+ .split(' ')
714
+ .map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
715
+ .join(' ');
716
+ }
717
+ catch {
718
+ return 'Untitled Article';
719
+ }
720
+ }
721
+ /**
722
+ * Create a consistent GUID for an article
723
+ */
724
+ createGuid(url, publishedAt) {
725
+ return crypto_1.default.createHash('sha256').update(url + publishedAt).digest('hex');
726
+ }
727
/**
 * Finalize processing result: fix up the elapsed-time field, refresh
 * extraction stats, sort articles (confidence first, then recency),
 * and truncate to the per-source article limit.
 */
finalizeResult(result) {
    const endTime = Date.now();
    // NOTE(review): endTime - (Date.now() - x) simplifies to roughly x, so this
    // only produces an elapsed duration if result.processingTime already held
    // the elapsed time (not a start timestamp) — confirm against the site that
    // initializes result.processingTime.
    result.processingTime = endTime - (Date.now() - result.processingTime);
    // Update extraction stats ("successful" = articles at or above 0.5 confidence)
    result.sourceInfo.extractionStats = {
        attempted: result.articles.length,
        successful: result.articles.filter(a => a.confidence >= 0.5).length,
        failed: result.errors.length,
        filtered: 0 // This would be calculated during filtering
    };
    // Sort articles by confidence and recency: confidence wins only when the
    // gap exceeds 0.1; otherwise the newer article ranks first.
    result.articles.sort((a, b) => {
        const confidenceDiff = b.confidence - a.confidence;
        if (Math.abs(confidenceDiff) > 0.1)
            return confidenceDiff;
        return b.publishedAt.getTime() - a.publishedAt.getTime();
    });
    // Limit results
    result.articles = result.articles.slice(0, this.maxArticlesPerSource);
    console.log(`🎭 [Orchestrator] Processing complete: ${result.articles.length} articles in ${result.processingTime}ms`);
    return result;
}
752
+ /**
753
+ * Extract full content for articles (optional enhancement step)
754
+ */
755
+ async enhanceWithFullContent(articles, maxArticles = 10, options = {}) {
756
+ const concurrency = options.concurrency || 5;
757
+ const toEnhance = articles
758
+ .filter(a => !a.content || a.content.length < 2000) // Only enhance articles without full content
759
+ .slice(0, maxArticles);
760
+ if (toEnhance.length === 0) {
761
+ console.log(`📖 [Orchestrator] No articles need content enhancement`);
762
+ return articles;
763
+ }
764
+ console.log(`📖 [Orchestrator] Enhancing ${toEnhance.length} articles in PARALLEL (concurrency: ${concurrency})`);
765
+ const limit = (0, p_limit_1.default)(concurrency);
766
+ let completed = 0;
767
+ await Promise.allSettled(toEnhance.map(article => limit(async () => {
768
+ try {
769
+ const extractedContent = await globalContentExtractor.extractContent(article.url);
770
+ if (extractedContent) {
771
+ article.content = extractedContent.content;
772
+ article.excerpt = extractedContent.excerpt || article.excerpt;
773
+ article.confidence = Math.min(article.confidence + 0.1, 1.0);
774
+ article.metadata = {
775
+ ...article.metadata,
776
+ fullContentExtracted: true,
777
+ extractionMethod: extractedContent.extractionMethod,
778
+ wordCount: extractedContent.wordCount,
779
+ readingTime: extractedContent.readingTime
780
+ };
781
+ }
782
+ }
783
+ catch (error) {
784
+ console.warn(`⚠️ [Orchestrator] Failed to enhance article ${article.url}:`, error);
785
+ }
786
+ finally {
787
+ completed++;
788
+ options.onProgress?.(completed, toEnhance.length);
789
+ }
790
+ })));
791
+ console.log(`📖 [Orchestrator] Content enhancement complete: ${completed}/${toEnhance.length}`);
792
+ return articles;
793
+ }
794
+ /**
795
+ * Validate orchestrator configuration
796
+ */
797
+ static validateConfig(config) {
798
+ try {
799
+ return exports.SourceConfigSchema.parse(config);
800
+ }
801
+ catch (error) {
802
+ if (error instanceof zod_1.z.ZodError) {
803
+ throw new Error(`Invalid source configuration: ${error.errors.map(e => e.message).join(', ')}`);
804
+ }
805
+ throw error;
806
+ }
807
+ }
808
+ /**
809
+ * Get source statistics
810
+ */
811
+ async getSourceStats(url) {
812
+ const robotsCheck = await globalRobotsChecker.isAllowed(url);
813
+ const discoveredFeeds = await rss_discovery_1.globalRSSDiscovery.discoverFeeds(url);
814
+ let hasSitemap = false;
815
+ let estimatedArticleCount = 0;
816
+ try {
817
+ const urlObj = new URL(url);
818
+ const sitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(urlObj.hostname);
819
+ hasSitemap = sitemaps.length > 0;
820
+ if (hasSitemap) {
821
+ const recentEntries = await sitemap_parser_1.globalSitemapParser.getRecentEntries(urlObj.hostname, { hoursBack: 48, maxEntries: 100 });
822
+ estimatedArticleCount = recentEntries.length;
823
+ }
824
+ }
825
+ catch (error) {
826
+ // Ignore sitemap errors for stats
827
+ }
828
+ return {
829
+ robotsCompliant: robotsCheck.allowed,
830
+ hasRSSFeed: discoveredFeeds.length > 0,
831
+ hasSitemap,
832
+ detectedType: discoveredFeeds.length > 0 ? 'rss' : hasSitemap ? 'sitemap' : 'html',
833
+ estimatedArticleCount
834
+ };
835
+ }
836
+ }
837
// Public class export (compiled CommonJS binding).
exports.SourceOrchestrator = SourceOrchestrator;
// Export default instance — a shared singleton constructed with default settings.
exports.globalSourceOrchestrator = new SourceOrchestrator();
//# sourceMappingURL=source-orchestrator.js.map