@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.globalSourceOrchestrator = exports.SourceOrchestrator = exports.SourceConfigSchema = exports.CandidateArticleSchema = void 0;
|
|
7
|
+
const zod_1 = require("zod");
|
|
8
|
+
const crypto_1 = __importDefault(require("crypto"));
|
|
9
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
|
+
const rss_utils_1 = require("./rss-utils");
|
|
11
|
+
const rss_discovery_1 = require("./web-scrapers/rss-discovery");
|
|
12
|
+
const sitemap_parser_1 = require("./web-scrapers/sitemap-parser");
|
|
13
|
+
const html_scraper_1 = require("./web-scrapers/html-scraper");
|
|
14
|
+
const content_extractor_1 = require("./web-scrapers/content-extractor");
|
|
15
|
+
const robots_checker_1 = require("./web-scrapers/robots-checker");
|
|
16
|
+
const playwright_scraper_1 = require("./web-scrapers/playwright-scraper");
|
|
17
|
+
const quality_scorer_1 = require("./quality-scorer");
|
|
18
|
+
// Create instances
|
|
19
|
+
const globalHTMLScraper = new html_scraper_1.HTMLScraper();
|
|
20
|
+
const globalContentExtractor = new content_extractor_1.ContentExtractor();
|
|
21
|
+
const globalRobotsChecker = new robots_checker_1.RobotsChecker();
|
|
22
|
+
const circuit_breaker_1 = require("./circuit-breaker");
|
|
23
|
+
// Zod schemas for type safety
|
|
24
|
+
exports.CandidateArticleSchema = zod_1.z.object({
|
|
25
|
+
url: zod_1.z.string().url(),
|
|
26
|
+
title: zod_1.z.string().min(1),
|
|
27
|
+
publishedAt: zod_1.z.date(),
|
|
28
|
+
content: zod_1.z.string().optional(),
|
|
29
|
+
excerpt: zod_1.z.string().optional(),
|
|
30
|
+
guid: zod_1.z.string(),
|
|
31
|
+
confidence: zod_1.z.number().min(0).max(1),
|
|
32
|
+
source: zod_1.z.enum(['rss', 'sitemap', 'html', 'discovery']),
|
|
33
|
+
extractionMethod: zod_1.z.enum(['rss', 'sitemap', 'html-links', 'content-extraction']),
|
|
34
|
+
metadata: zod_1.z.record(zod_1.z.any()).optional()
|
|
35
|
+
});
|
|
36
|
+
exports.SourceConfigSchema = zod_1.z.object({
|
|
37
|
+
sourceType: zod_1.z.enum(['rss', 'sitemap', 'html', 'auto']),
|
|
38
|
+
allowPaths: zod_1.z.array(zod_1.z.string()).optional(),
|
|
39
|
+
denyPaths: zod_1.z.array(zod_1.z.string()).optional(),
|
|
40
|
+
maxDepth: zod_1.z.number().int().min(1).max(5).optional(),
|
|
41
|
+
detectOnly: zod_1.z.boolean().optional(),
|
|
42
|
+
scrapeConfig: zod_1.z.object({
|
|
43
|
+
selectors: zod_1.z.object({
|
|
44
|
+
articleLinks: zod_1.z.array(zod_1.z.string()).optional(),
|
|
45
|
+
titleSelectors: zod_1.z.array(zod_1.z.string()).optional(),
|
|
46
|
+
dateSelectors: zod_1.z.array(zod_1.z.string()).optional(),
|
|
47
|
+
excludeSelectors: zod_1.z.array(zod_1.z.string()).optional()
|
|
48
|
+
}).optional(),
|
|
49
|
+
filters: zod_1.z.object({
|
|
50
|
+
minTitleLength: zod_1.z.number().optional(),
|
|
51
|
+
maxTitleLength: zod_1.z.number().optional(),
|
|
52
|
+
includePatterns: zod_1.z.array(zod_1.z.string()).optional(),
|
|
53
|
+
excludePatterns: zod_1.z.array(zod_1.z.string()).optional()
|
|
54
|
+
}).optional(),
|
|
55
|
+
limits: zod_1.z.object({
|
|
56
|
+
maxLinksPerPage: zod_1.z.number().optional(),
|
|
57
|
+
maxPages: zod_1.z.number().optional()
|
|
58
|
+
}).optional()
|
|
59
|
+
}).optional()
|
|
60
|
+
});
|
|
61
|
+
class SourceOrchestrator {
|
|
62
|
+
constructor() {
|
|
63
|
+
this.maxArticlesPerSource = 1000;
|
|
64
|
+
this.recentTimeframe = 48 * 60 * 60 * 1000; // 48 hours
|
|
65
|
+
/**
|
|
66
|
+
* Common content section paths - prioritized for news/blog content
|
|
67
|
+
*/
|
|
68
|
+
this.contentSectionPaths = [
|
|
69
|
+
'/news', '/blog', '/articles', '/posts', '/stories',
|
|
70
|
+
'/press', '/updates', '/announcements', '/insights',
|
|
71
|
+
'/resources', '/publications', '/research', '/engineering'
|
|
72
|
+
];
|
|
73
|
+
/**
|
|
74
|
+
* Common blog subdomains to check when scraping root domains
|
|
75
|
+
* Many companies host their blogs on separate subdomains
|
|
76
|
+
*/
|
|
77
|
+
this.blogSubdomains = [
|
|
78
|
+
'blog', 'blogs', 'news', 'newsroom', 'press',
|
|
79
|
+
'engineering', 'developers', 'ai', 'research'
|
|
80
|
+
];
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Infer path filters from the input URL
|
|
84
|
+
* e.g., if user enters anthropic.com/news, filter results to /news/* paths
|
|
85
|
+
*/
|
|
86
|
+
inferPathFiltersFromUrl(url, config) {
|
|
87
|
+
try {
|
|
88
|
+
const urlObj = new URL(url);
|
|
89
|
+
const path = urlObj.pathname.toLowerCase();
|
|
90
|
+
// If URL has a meaningful path (not just /), infer allowPaths
|
|
91
|
+
if (path && path !== '/' && path.length > 1) {
|
|
92
|
+
for (const contentPath of this.contentSectionPaths) {
|
|
93
|
+
if (path.startsWith(contentPath)) {
|
|
94
|
+
// Only add if user hasn't explicitly set allowPaths
|
|
95
|
+
if (!config.allowPaths?.length) {
|
|
96
|
+
console.log(`🔍 [Orchestrator] Inferring path filter from URL: ${contentPath}/*`);
|
|
97
|
+
return {
|
|
98
|
+
...config,
|
|
99
|
+
allowPaths: [`${contentPath}/*`, `${contentPath}`]
|
|
100
|
+
};
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
// For other paths, use the exact path as prefix
|
|
105
|
+
if (!config.allowPaths?.length && path.length > 3) {
|
|
106
|
+
const pathPrefix = path.endsWith('/') ? path.slice(0, -1) : path;
|
|
107
|
+
console.log(`🔍 [Orchestrator] Inferring path filter from URL: ${pathPrefix}/*`);
|
|
108
|
+
return {
|
|
109
|
+
...config,
|
|
110
|
+
allowPaths: [`${pathPrefix}/*`, pathPrefix]
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
catch (error) {
|
|
116
|
+
// Ignore URL parsing errors
|
|
117
|
+
}
|
|
118
|
+
return config;
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Discover content sections from sitemap when user enters root domain
|
|
122
|
+
* Returns prioritized list of content paths found
|
|
123
|
+
*/
|
|
124
|
+
discoverContentSectionsFromSitemap(entries) {
|
|
125
|
+
const pathCounts = new Map();
|
|
126
|
+
for (const entry of entries) {
|
|
127
|
+
try {
|
|
128
|
+
const urlObj = new URL(entry.url);
|
|
129
|
+
const pathParts = urlObj.pathname.split('/').filter(Boolean);
|
|
130
|
+
if (pathParts.length >= 1) {
|
|
131
|
+
const firstPath = '/' + pathParts[0].toLowerCase();
|
|
132
|
+
// Only count if it's a potential content section
|
|
133
|
+
if (this.contentSectionPaths.includes(firstPath) ||
|
|
134
|
+
firstPath.match(/^\/(news|blog|post|article|stor|update|press)/i)) {
|
|
135
|
+
pathCounts.set(firstPath, (pathCounts.get(firstPath) || 0) + 1);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
catch {
|
|
140
|
+
// Skip invalid URLs
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
// Sort by count (most content first) and return paths with 3+ entries
|
|
144
|
+
const sortedPaths = Array.from(pathCounts.entries())
|
|
145
|
+
.filter(([_, count]) => count >= 3)
|
|
146
|
+
.sort((a, b) => b[1] - a[1])
|
|
147
|
+
.map(([path]) => path);
|
|
148
|
+
if (sortedPaths.length > 0) {
|
|
149
|
+
console.log(`🔍 [Orchestrator] Discovered content sections: ${sortedPaths.join(', ')}`);
|
|
150
|
+
}
|
|
151
|
+
return sortedPaths;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Filter sitemap entries to content sections when processing root domain
|
|
155
|
+
* Also applies non-English locale filtering
|
|
156
|
+
*/
|
|
157
|
+
filterToContentSections(articles, discoveredPaths) {
|
|
158
|
+
if (discoveredPaths.length === 0) {
|
|
159
|
+
return articles;
|
|
160
|
+
}
|
|
161
|
+
// Create allow patterns from discovered paths
|
|
162
|
+
const allowPatterns = discoveredPaths.flatMap(p => [`${p}/*`, p]);
|
|
163
|
+
return articles.filter(article => {
|
|
164
|
+
try {
|
|
165
|
+
const urlObj = new URL(article.url);
|
|
166
|
+
const path = urlObj.pathname.toLowerCase();
|
|
167
|
+
// Filter out non-English locale paths (e.g., /fr-be/, /de-ch/)
|
|
168
|
+
if ((0, quality_scorer_1.isNonEnglishLocalePath)(path)) {
|
|
169
|
+
return false;
|
|
170
|
+
}
|
|
171
|
+
return allowPatterns.some(pattern => this.matchesPattern(path, pattern));
|
|
172
|
+
}
|
|
173
|
+
catch {
|
|
174
|
+
return false;
|
|
175
|
+
}
|
|
176
|
+
});
|
|
177
|
+
}
|
|
178
|
+
/**
|
|
179
|
+
* Discover blog subdomains for a given domain
|
|
180
|
+
* e.g., for nvidia.com, check if blogs.nvidia.com exists
|
|
181
|
+
*/
|
|
182
|
+
async discoverBlogSubdomains(domain) {
|
|
183
|
+
// Extract base domain (remove www. if present)
|
|
184
|
+
const baseDomain = domain.replace(/^www\./, '');
|
|
185
|
+
const discoveredSubdomains = [];
|
|
186
|
+
console.log(`🔍 [Orchestrator] Checking for blog subdomains of ${baseDomain}...`);
|
|
187
|
+
// Check each potential blog subdomain
|
|
188
|
+
for (const subdomain of this.blogSubdomains) {
|
|
189
|
+
const subdomainUrl = `https://${subdomain}.${baseDomain}`;
|
|
190
|
+
try {
|
|
191
|
+
// Quick HEAD request to check if subdomain exists
|
|
192
|
+
const controller = new AbortController();
|
|
193
|
+
const timeoutId = setTimeout(() => controller.abort(), 3000);
|
|
194
|
+
const response = await fetch(subdomainUrl, {
|
|
195
|
+
method: 'HEAD',
|
|
196
|
+
signal: controller.signal,
|
|
197
|
+
headers: {
|
|
198
|
+
'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)'
|
|
199
|
+
}
|
|
200
|
+
});
|
|
201
|
+
clearTimeout(timeoutId);
|
|
202
|
+
if (response.ok || response.status === 301 || response.status === 302) {
|
|
203
|
+
console.log(`✅ [Orchestrator] Found blog subdomain: ${subdomainUrl}`);
|
|
204
|
+
discoveredSubdomains.push(subdomainUrl);
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
catch {
|
|
208
|
+
// Subdomain doesn't exist or timeout, skip it
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
return discoveredSubdomains;
|
|
212
|
+
}
|
|
213
|
+
/**
|
|
214
|
+
* Main orchestration method - determines source type and extracts content
|
|
215
|
+
*/
|
|
216
|
+
async processSource(url, config = { sourceType: 'auto' }) {
|
|
217
|
+
const startTime = Date.now();
|
|
218
|
+
// Infer path filters from input URL
|
|
219
|
+
config = this.inferPathFiltersFromUrl(url, config);
|
|
220
|
+
console.log(`🎭 [Orchestrator] Processing source: ${url} (type: ${config.sourceType})`);
|
|
221
|
+
const result = {
|
|
222
|
+
articles: [],
|
|
223
|
+
sourceInfo: {
|
|
224
|
+
detectedType: 'html',
|
|
225
|
+
extractionStats: {
|
|
226
|
+
attempted: 0,
|
|
227
|
+
successful: 0,
|
|
228
|
+
failed: 0,
|
|
229
|
+
filtered: 0
|
|
230
|
+
}
|
|
231
|
+
},
|
|
232
|
+
processingTime: 0,
|
|
233
|
+
errors: []
|
|
234
|
+
};
|
|
235
|
+
try {
|
|
236
|
+
// Apply circuit breaker protection (use custom if provided, otherwise default)
|
|
237
|
+
const breaker = config.circuitBreaker || circuit_breaker_1.circuitBreakers.scraping;
|
|
238
|
+
return await breaker.execute(async () => {
|
|
239
|
+
if (config.sourceType === 'auto') {
|
|
240
|
+
return await this.autoDetectAndProcess(url, config, result);
|
|
241
|
+
}
|
|
242
|
+
else {
|
|
243
|
+
return await this.processKnownType(url, config, result);
|
|
244
|
+
}
|
|
245
|
+
});
|
|
246
|
+
}
|
|
247
|
+
catch (error) {
|
|
248
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
249
|
+
console.error(`❌ [Orchestrator] Failed to process source ${url}:`, errorMessage);
|
|
250
|
+
result.errors.push(errorMessage);
|
|
251
|
+
result.processingTime = Date.now() - startTime;
|
|
252
|
+
return result;
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
/**
|
|
256
|
+
* Auto-detect source type and process accordingly
|
|
257
|
+
*/
|
|
258
|
+
async autoDetectAndProcess(url, config, result) {
|
|
259
|
+
console.log(`🔍 [Orchestrator] Auto-detecting source type for ${url}`);
|
|
260
|
+
// Step 1: Try RSS first (most reliable)
|
|
261
|
+
try {
|
|
262
|
+
const rssArticles = await this.processAsRSS(url);
|
|
263
|
+
if (rssArticles.length > 0) {
|
|
264
|
+
result.sourceInfo.detectedType = 'rss';
|
|
265
|
+
// RSS is already curated content - only apply deny filters, not allow filters
|
|
266
|
+
result.articles = this.applyPathFilters(rssArticles, config, { skipAllowFilters: true });
|
|
267
|
+
console.log(`✅ [Orchestrator] Detected as RSS feed: ${result.articles.length} articles`);
|
|
268
|
+
return this.finalizeResult(result);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
catch (error) {
|
|
272
|
+
result.errors.push(`RSS detection failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
273
|
+
}
|
|
274
|
+
// Step 2: Discover RSS feeds from HTML
|
|
275
|
+
try {
|
|
276
|
+
const discoveredFeeds = await rss_discovery_1.globalRSSDiscovery.discoverFeeds(url);
|
|
277
|
+
if (discoveredFeeds.length > 0) {
|
|
278
|
+
result.sourceInfo.discoveredFeeds = discoveredFeeds;
|
|
279
|
+
// Try the highest confidence discovered feed
|
|
280
|
+
const bestFeed = discoveredFeeds[0];
|
|
281
|
+
const rssArticles = await this.processAsRSS(bestFeed.url);
|
|
282
|
+
if (rssArticles.length > 0) {
|
|
283
|
+
result.sourceInfo.detectedType = 'rss';
|
|
284
|
+
// RSS is already curated content - only apply deny filters, not allow filters
|
|
285
|
+
result.articles = this.applyPathFilters(rssArticles, config, { skipAllowFilters: true });
|
|
286
|
+
console.log(`✅ [Orchestrator] Using discovered RSS feed: ${result.articles.length} articles`);
|
|
287
|
+
return this.finalizeResult(result);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
catch (error) {
|
|
292
|
+
result.errors.push(`RSS discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
293
|
+
}
|
|
294
|
+
// Step 3: Try sitemap parsing
|
|
295
|
+
try {
|
|
296
|
+
const sitemapArticles = await this.processAsSitemap(url);
|
|
297
|
+
if (sitemapArticles.length > 0) {
|
|
298
|
+
result.sourceInfo.detectedType = 'sitemap';
|
|
299
|
+
// If processing root domain, auto-discover and filter to content sections
|
|
300
|
+
const urlObj = new URL(url);
|
|
301
|
+
const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
|
|
302
|
+
if (isRootDomain && !config.allowPaths?.length) {
|
|
303
|
+
const discoveredPaths = this.discoverContentSectionsFromSitemap(sitemapArticles);
|
|
304
|
+
if (discoveredPaths.length > 0) {
|
|
305
|
+
result.articles = this.filterToContentSections(sitemapArticles, discoveredPaths);
|
|
306
|
+
}
|
|
307
|
+
else {
|
|
308
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
else {
|
|
312
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
313
|
+
}
|
|
314
|
+
console.log(`✅ [Orchestrator] Detected as sitemap: ${result.articles.length} articles`);
|
|
315
|
+
return this.finalizeResult(result);
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
catch (error) {
|
|
319
|
+
result.errors.push(`Sitemap detection failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
320
|
+
}
|
|
321
|
+
// Step 4: Discover sitemaps from domain
|
|
322
|
+
try {
|
|
323
|
+
const urlObj = new URL(url);
|
|
324
|
+
const discoveredSitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(urlObj.hostname);
|
|
325
|
+
if (discoveredSitemaps.length > 0) {
|
|
326
|
+
result.sourceInfo.discoveredSitemaps = discoveredSitemaps;
|
|
327
|
+
// Try the first discovered sitemap
|
|
328
|
+
const sitemapArticles = await this.processAsSitemap(discoveredSitemaps[0]);
|
|
329
|
+
if (sitemapArticles.length > 0) {
|
|
330
|
+
result.sourceInfo.detectedType = 'sitemap';
|
|
331
|
+
// If processing root domain, auto-discover and filter to content sections
|
|
332
|
+
const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
|
|
333
|
+
if (isRootDomain && !config.allowPaths?.length) {
|
|
334
|
+
const discoveredPaths = this.discoverContentSectionsFromSitemap(sitemapArticles);
|
|
335
|
+
if (discoveredPaths.length > 0) {
|
|
336
|
+
result.articles = this.filterToContentSections(sitemapArticles, discoveredPaths);
|
|
337
|
+
}
|
|
338
|
+
else {
|
|
339
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
else {
|
|
343
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
344
|
+
}
|
|
345
|
+
console.log(`✅ [Orchestrator] Using discovered sitemap: ${result.articles.length} articles`);
|
|
346
|
+
return this.finalizeResult(result);
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
catch (error) {
|
|
351
|
+
result.errors.push(`Sitemap discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
352
|
+
}
|
|
353
|
+
// Step 5: Try blog subdomains (e.g., blogs.nvidia.com for nvidia.com)
|
|
354
|
+
try {
|
|
355
|
+
const urlObj = new URL(url);
|
|
356
|
+
const isRootDomain = urlObj.pathname === '/' || urlObj.pathname === '';
|
|
357
|
+
if (isRootDomain) {
|
|
358
|
+
const blogSubdomains = await this.discoverBlogSubdomains(urlObj.hostname);
|
|
359
|
+
for (const subdomainUrl of blogSubdomains) {
|
|
360
|
+
try {
|
|
361
|
+
// Try RSS first on subdomain
|
|
362
|
+
const rssArticles = await this.processAsRSS(subdomainUrl);
|
|
363
|
+
if (rssArticles.length > 0) {
|
|
364
|
+
result.sourceInfo.detectedType = 'rss';
|
|
365
|
+
result.articles = this.applyPathFilters(rssArticles, config);
|
|
366
|
+
console.log(`✅ [Orchestrator] Found RSS on subdomain ${subdomainUrl}: ${result.articles.length} articles`);
|
|
367
|
+
return this.finalizeResult(result);
|
|
368
|
+
}
|
|
369
|
+
// Try sitemap on subdomain
|
|
370
|
+
const subdomainHostname = new URL(subdomainUrl).hostname;
|
|
371
|
+
const subdomainSitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(subdomainHostname);
|
|
372
|
+
if (subdomainSitemaps.length > 0) {
|
|
373
|
+
result.sourceInfo.discoveredSitemaps = [
|
|
374
|
+
...(result.sourceInfo.discoveredSitemaps || []),
|
|
375
|
+
...subdomainSitemaps
|
|
376
|
+
];
|
|
377
|
+
for (const sitemap of subdomainSitemaps) {
|
|
378
|
+
const sitemapArticles = await this.processAsSitemap(sitemap);
|
|
379
|
+
if (sitemapArticles.length > 0) {
|
|
380
|
+
result.sourceInfo.detectedType = 'sitemap';
|
|
381
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
382
|
+
console.log(`✅ [Orchestrator] Found sitemap on subdomain ${subdomainUrl}: ${result.articles.length} articles`);
|
|
383
|
+
return this.finalizeResult(result);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
catch (subError) {
|
|
389
|
+
console.log(`⚠️ [Orchestrator] Error processing subdomain ${subdomainUrl}:`, subError);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
catch (error) {
|
|
395
|
+
result.errors.push(`Subdomain discovery failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
396
|
+
}
|
|
397
|
+
// Step 6: Fall back to HTML scraping
|
|
398
|
+
try {
|
|
399
|
+
const htmlArticles = await this.processAsHTML(url, config);
|
|
400
|
+
if (htmlArticles.length > 0) {
|
|
401
|
+
result.sourceInfo.detectedType = 'html';
|
|
402
|
+
result.articles = this.applyPathFilters(htmlArticles, config);
|
|
403
|
+
console.log(`✅ [Orchestrator] Falling back to HTML scraping: ${result.articles.length} articles`);
|
|
404
|
+
return this.finalizeResult(result);
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
catch (error) {
|
|
408
|
+
result.errors.push(`HTML scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
409
|
+
}
|
|
410
|
+
// Step 7: Final fallback - Playwright for JS-rendered pages
|
|
411
|
+
try {
|
|
412
|
+
console.log(`🎭 [Orchestrator] Trying Playwright for JS-rendered content...`);
|
|
413
|
+
const playwrightArticles = await this.processAsPlaywright(url, config);
|
|
414
|
+
if (playwrightArticles.length > 0) {
|
|
415
|
+
result.sourceInfo.detectedType = 'html'; // Still categorize as HTML source
|
|
416
|
+
result.articles = this.applyPathFilters(playwrightArticles, config);
|
|
417
|
+
console.log(`✅ [Orchestrator] Playwright extraction successful: ${result.articles.length} articles`);
|
|
418
|
+
return this.finalizeResult(result);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
catch (error) {
|
|
422
|
+
result.errors.push(`Playwright scraping failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
423
|
+
}
|
|
424
|
+
// No articles found by any method
|
|
425
|
+
console.log(`⚠️ [Orchestrator] No articles found for ${url}`);
|
|
426
|
+
return this.finalizeResult(result);
|
|
427
|
+
}
|
|
428
|
+
/**
|
|
429
|
+
* Process source with known type
|
|
430
|
+
*/
|
|
431
|
+
async processKnownType(url, config, result) {
|
|
432
|
+
console.log(`🎯 [Orchestrator] Processing as ${config.sourceType}: ${url}`);
|
|
433
|
+
try {
|
|
434
|
+
let articles = [];
|
|
435
|
+
switch (config.sourceType) {
|
|
436
|
+
case 'rss':
|
|
437
|
+
articles = await this.processAsRSS(url);
|
|
438
|
+
result.sourceInfo.detectedType = 'rss';
|
|
439
|
+
break;
|
|
440
|
+
case 'sitemap':
|
|
441
|
+
articles = await this.processAsSitemap(url);
|
|
442
|
+
result.sourceInfo.detectedType = 'sitemap';
|
|
443
|
+
break;
|
|
444
|
+
case 'html':
|
|
445
|
+
articles = await this.processAsHTML(url, config);
|
|
446
|
+
result.sourceInfo.detectedType = 'html';
|
|
447
|
+
break;
|
|
448
|
+
}
|
|
449
|
+
result.articles = this.applyPathFilters(articles, config);
|
|
450
|
+
console.log(`✅ [Orchestrator] Processed ${config.sourceType}: ${result.articles.length} articles`);
|
|
451
|
+
return this.finalizeResult(result);
|
|
452
|
+
}
|
|
453
|
+
catch (error) {
|
|
454
|
+
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
|
|
455
|
+
result.errors.push(`${config.sourceType} processing failed: ${errorMessage}`);
|
|
456
|
+
return this.finalizeResult(result);
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
/**
|
|
460
|
+
* Process URL as RSS feed
|
|
461
|
+
*/
|
|
462
|
+
async processAsRSS(url) {
|
|
463
|
+
const rssItems = await (0, rss_utils_1.fetchRSSFeed)(url);
|
|
464
|
+
const candidates = [];
|
|
465
|
+
for (const item of rssItems) {
|
|
466
|
+
try {
|
|
467
|
+
const publishedAt = new Date(item.pubDate);
|
|
468
|
+
if (isNaN(publishedAt.getTime())) {
|
|
469
|
+
continue;
|
|
470
|
+
}
|
|
471
|
+
candidates.push({
|
|
472
|
+
url: item.link,
|
|
473
|
+
title: item.title,
|
|
474
|
+
publishedAt,
|
|
475
|
+
content: item.content,
|
|
476
|
+
excerpt: item.contentSnippet,
|
|
477
|
+
guid: item.guid,
|
|
478
|
+
confidence: 0.9,
|
|
479
|
+
source: 'rss',
|
|
480
|
+
extractionMethod: 'rss',
|
|
481
|
+
metadata: {
|
|
482
|
+
originalGuid: item.guid,
|
|
483
|
+
rssSource: url
|
|
484
|
+
}
|
|
485
|
+
});
|
|
486
|
+
}
|
|
487
|
+
catch (error) {
|
|
488
|
+
console.warn(`⚠️ [Orchestrator] Error processing RSS item:`, error);
|
|
489
|
+
continue;
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
return candidates;
|
|
493
|
+
}
|
|
494
|
+
/**
|
|
495
|
+
* Process URL as sitemap
|
|
496
|
+
*/
|
|
497
|
+
async processAsSitemap(url) {
|
|
498
|
+
// Don't filter by recency - we want all entries from the sitemap
|
|
499
|
+
// Path filtering and quality scoring will handle relevance
|
|
500
|
+
const sitemapEntries = await sitemap_parser_1.globalSitemapParser.parseSitemap(url, {
|
|
501
|
+
filterRecent: false, // Changed: get all entries, filter later by path
|
|
502
|
+
maxEntries: this.maxArticlesPerSource,
|
|
503
|
+
includeNews: true
|
|
504
|
+
});
|
|
505
|
+
const candidates = [];
|
|
506
|
+
for (const entry of sitemapEntries) {
|
|
507
|
+
try {
|
|
508
|
+
const publishedAt = entry.lastmod || new Date();
|
|
509
|
+
candidates.push({
|
|
510
|
+
url: entry.url,
|
|
511
|
+
title: entry.news?.title || this.extractTitleFromUrl(entry.url),
|
|
512
|
+
publishedAt,
|
|
513
|
+
guid: this.createGuid(entry.url, publishedAt.toISOString()),
|
|
514
|
+
confidence: entry.news ? 0.8 : 0.6,
|
|
515
|
+
source: 'sitemap',
|
|
516
|
+
extractionMethod: 'sitemap',
|
|
517
|
+
metadata: {
|
|
518
|
+
changefreq: entry.changefreq,
|
|
519
|
+
priority: entry.priority,
|
|
520
|
+
hasNews: !!entry.news,
|
|
521
|
+
sitemapSource: url
|
|
522
|
+
}
|
|
523
|
+
});
|
|
524
|
+
}
|
|
525
|
+
catch (error) {
|
|
526
|
+
console.warn(`⚠️ [Orchestrator] Error processing sitemap entry:`, error);
|
|
527
|
+
continue;
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
return candidates;
|
|
531
|
+
}
|
|
532
|
+
/**
|
|
533
|
+
* Process URL as HTML page
|
|
534
|
+
*/
|
|
535
|
+
async processAsHTML(url, config) {
|
|
536
|
+
const scrapingConfig = this.buildScrapingConfig(config);
|
|
537
|
+
const extractedArticles = await globalHTMLScraper.extractFromMultiplePages(url, scrapingConfig, {
|
|
538
|
+
maxPages: config.scrapeConfig?.limits?.maxPages || 3
|
|
539
|
+
});
|
|
540
|
+
const candidates = [];
|
|
541
|
+
for (const article of extractedArticles) {
|
|
542
|
+
try {
|
|
543
|
+
const publishedAt = article.publishedDate || new Date();
|
|
544
|
+
candidates.push({
|
|
545
|
+
url: article.url,
|
|
546
|
+
title: article.title || this.extractTitleFromUrl(article.url),
|
|
547
|
+
publishedAt,
|
|
548
|
+
excerpt: article.description,
|
|
549
|
+
guid: this.createGuid(article.url, publishedAt.toISOString()),
|
|
550
|
+
confidence: article.confidence,
|
|
551
|
+
source: 'html',
|
|
552
|
+
extractionMethod: 'html-links',
|
|
553
|
+
metadata: {
|
|
554
|
+
extractionSource: article.source,
|
|
555
|
+
htmlSource: url
|
|
556
|
+
}
|
|
557
|
+
});
|
|
558
|
+
}
|
|
559
|
+
catch (error) {
|
|
560
|
+
console.warn(`⚠️ [Orchestrator] Error processing HTML article:`, error);
|
|
561
|
+
continue;
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
return candidates;
|
|
565
|
+
}
|
|
566
|
+
/**
|
|
567
|
+
* Process URL using Playwright for JavaScript-rendered pages
|
|
568
|
+
* Used as fallback when static HTML scraping fails
|
|
569
|
+
*/
|
|
570
|
+
async processAsPlaywright(url, config) {
|
|
571
|
+
const playwrightScraper = (0, playwright_scraper_1.getPlaywrightScraper)();
|
|
572
|
+
const playwrightConfig = {
|
|
573
|
+
timeout: 30000,
|
|
574
|
+
blockMedia: true,
|
|
575
|
+
...this.buildScrapingConfig(config)
|
|
576
|
+
};
|
|
577
|
+
const extractedArticles = await playwrightScraper.extractArticleLinks(url, playwrightConfig);
|
|
578
|
+
const candidates = [];
|
|
579
|
+
for (const article of extractedArticles) {
|
|
580
|
+
try {
|
|
581
|
+
const publishedAt = article.publishedDate || new Date();
|
|
582
|
+
candidates.push({
|
|
583
|
+
url: article.url,
|
|
584
|
+
title: article.title || this.extractTitleFromUrl(article.url),
|
|
585
|
+
publishedAt,
|
|
586
|
+
excerpt: article.description,
|
|
587
|
+
guid: this.createGuid(article.url, publishedAt.toISOString()),
|
|
588
|
+
confidence: article.confidence,
|
|
589
|
+
source: 'html',
|
|
590
|
+
extractionMethod: 'html-links',
|
|
591
|
+
metadata: {
|
|
592
|
+
extractionSource: 'playwright',
|
|
593
|
+
playwrightRendered: true,
|
|
594
|
+
htmlSource: url
|
|
595
|
+
}
|
|
596
|
+
});
|
|
597
|
+
}
|
|
598
|
+
catch (error) {
|
|
599
|
+
console.warn(`⚠️ [Orchestrator] Error processing Playwright article:`, error);
|
|
600
|
+
continue;
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
return candidates;
|
|
604
|
+
}
|
|
605
|
+
/**
|
|
606
|
+
* Apply path filtering based on allowPaths and denyPaths
|
|
607
|
+
* Also filters out non-English locale paths
|
|
608
|
+
*
|
|
609
|
+
* @param articles - Articles to filter
|
|
610
|
+
* @param config - Source configuration
|
|
611
|
+
* @param options - Filtering options
|
|
612
|
+
* @param options.skipAllowFilters - Skip allow path filtering (useful for RSS which is already curated)
|
|
613
|
+
*/
|
|
614
|
+
applyPathFilters(articles, config, options = {}) {
|
|
615
|
+
return articles.filter(article => {
|
|
616
|
+
try {
|
|
617
|
+
const urlObj = new URL(article.url);
|
|
618
|
+
const path = urlObj.pathname.toLowerCase();
|
|
619
|
+
// Always filter out non-English locale paths (e.g., /fr-be/, /de-ch/)
|
|
620
|
+
if ((0, quality_scorer_1.isNonEnglishLocalePath)(path)) {
|
|
621
|
+
return false;
|
|
622
|
+
}
|
|
623
|
+
// Check deny patterns first (always apply)
|
|
624
|
+
if (config.denyPaths?.length) {
|
|
625
|
+
for (const pattern of config.denyPaths) {
|
|
626
|
+
if (this.matchesPattern(path, pattern)) {
|
|
627
|
+
console.log(`🚫 [Orchestrator] Article blocked by deny pattern "${pattern}": ${article.url}`);
|
|
628
|
+
return false;
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
// Skip allow pattern filtering for RSS (RSS is already curated content)
|
|
633
|
+
if (options.skipAllowFilters) {
|
|
634
|
+
return true;
|
|
635
|
+
}
|
|
636
|
+
// Check allow patterns (only for sitemap/HTML sources)
|
|
637
|
+
if (config.allowPaths?.length) {
|
|
638
|
+
for (const pattern of config.allowPaths) {
|
|
639
|
+
if (this.matchesPattern(path, pattern)) {
|
|
640
|
+
return true;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
console.log(`🚫 [Orchestrator] Article not matching any allow pattern: ${article.url}`);
|
|
644
|
+
return false;
|
|
645
|
+
}
|
|
646
|
+
return true;
|
|
647
|
+
}
|
|
648
|
+
catch (error) {
|
|
649
|
+
console.warn(`⚠️ [Orchestrator] Error applying path filters to ${article.url}:`, error);
|
|
650
|
+
return true; // Default to allowing on error
|
|
651
|
+
}
|
|
652
|
+
});
|
|
653
|
+
}
|
|
654
|
+
/**
|
|
655
|
+
* Check if a path matches a pattern (supports wildcards)
|
|
656
|
+
*/
|
|
657
|
+
matchesPattern(path, pattern) {
|
|
658
|
+
const patternLower = pattern.toLowerCase();
|
|
659
|
+
const pathLower = path.toLowerCase();
|
|
660
|
+
// Handle exact match
|
|
661
|
+
if (patternLower === pathLower) {
|
|
662
|
+
return true;
|
|
663
|
+
}
|
|
664
|
+
// Handle simple prefix patterns like /news/*
|
|
665
|
+
if (patternLower.endsWith('/*')) {
|
|
666
|
+
const prefix = patternLower.slice(0, -2); // Remove /*
|
|
667
|
+
return pathLower.startsWith(prefix + '/') || pathLower === prefix;
|
|
668
|
+
}
|
|
669
|
+
// Handle wildcard patterns with regex
|
|
670
|
+
const regexPattern = patternLower
|
|
671
|
+
.replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape special chars except *
|
|
672
|
+
.replace(/\*/g, '.*'); // Convert * to .*
|
|
673
|
+
const regex = new RegExp('^' + regexPattern + '$', 'i');
|
|
674
|
+
return regex.test(pathLower);
|
|
675
|
+
}
|
|
676
|
+
/**
|
|
677
|
+
* Build scraping configuration from source config
|
|
678
|
+
*/
|
|
679
|
+
buildScrapingConfig(config) {
|
|
680
|
+
const scrapingConfig = {};
|
|
681
|
+
if (config.scrapeConfig?.selectors) {
|
|
682
|
+
scrapingConfig.selectors = {
|
|
683
|
+
articleLinks: config.scrapeConfig.selectors.articleLinks,
|
|
684
|
+
titleSelectors: config.scrapeConfig.selectors.titleSelectors,
|
|
685
|
+
dateSelectors: config.scrapeConfig.selectors.dateSelectors,
|
|
686
|
+
excludeSelectors: config.scrapeConfig.selectors.excludeSelectors
|
|
687
|
+
};
|
|
688
|
+
}
|
|
689
|
+
if (config.scrapeConfig?.filters) {
|
|
690
|
+
scrapingConfig.filters = {
|
|
691
|
+
minTitleLength: config.scrapeConfig.filters.minTitleLength,
|
|
692
|
+
maxTitleLength: config.scrapeConfig.filters.maxTitleLength,
|
|
693
|
+
includePatterns: config.scrapeConfig.filters.includePatterns?.map(p => new RegExp(p, 'i')),
|
|
694
|
+
excludePatterns: config.scrapeConfig.filters.excludePatterns?.map(p => new RegExp(p, 'i'))
|
|
695
|
+
};
|
|
696
|
+
}
|
|
697
|
+
if (config.scrapeConfig?.limits) {
|
|
698
|
+
scrapingConfig.limits = config.scrapeConfig.limits;
|
|
699
|
+
}
|
|
700
|
+
return scrapingConfig;
|
|
701
|
+
}
|
|
702
|
+
/**
|
|
703
|
+
* Extract title from URL as fallback
|
|
704
|
+
*/
|
|
705
|
+
extractTitleFromUrl(url) {
|
|
706
|
+
try {
|
|
707
|
+
const urlObj = new URL(url);
|
|
708
|
+
const pathParts = urlObj.pathname.split('/').filter(Boolean);
|
|
709
|
+
const lastPart = pathParts[pathParts.length - 1] || urlObj.hostname;
|
|
710
|
+
return lastPart
|
|
711
|
+
.replace(/[-_]/g, ' ')
|
|
712
|
+
.replace(/\.(html|htm|php|asp|jsp)$/i, '')
|
|
713
|
+
.split(' ')
|
|
714
|
+
.map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
|
|
715
|
+
.join(' ');
|
|
716
|
+
}
|
|
717
|
+
catch {
|
|
718
|
+
return 'Untitled Article';
|
|
719
|
+
}
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Create a consistent GUID for an article
|
|
723
|
+
*/
|
|
724
|
+
createGuid(url, publishedAt) {
|
|
725
|
+
return crypto_1.default.createHash('sha256').update(url + publishedAt).digest('hex');
|
|
726
|
+
}
|
|
727
|
+
/**
|
|
728
|
+
* Finalize processing result
|
|
729
|
+
*/
|
|
730
|
+
finalizeResult(result) {
|
|
731
|
+
const endTime = Date.now();
|
|
732
|
+
result.processingTime = endTime - (Date.now() - result.processingTime);
|
|
733
|
+
// Update extraction stats
|
|
734
|
+
result.sourceInfo.extractionStats = {
|
|
735
|
+
attempted: result.articles.length,
|
|
736
|
+
successful: result.articles.filter(a => a.confidence >= 0.5).length,
|
|
737
|
+
failed: result.errors.length,
|
|
738
|
+
filtered: 0 // This would be calculated during filtering
|
|
739
|
+
};
|
|
740
|
+
// Sort articles by confidence and recency
|
|
741
|
+
result.articles.sort((a, b) => {
|
|
742
|
+
const confidenceDiff = b.confidence - a.confidence;
|
|
743
|
+
if (Math.abs(confidenceDiff) > 0.1)
|
|
744
|
+
return confidenceDiff;
|
|
745
|
+
return b.publishedAt.getTime() - a.publishedAt.getTime();
|
|
746
|
+
});
|
|
747
|
+
// Limit results
|
|
748
|
+
result.articles = result.articles.slice(0, this.maxArticlesPerSource);
|
|
749
|
+
console.log(`🎭 [Orchestrator] Processing complete: ${result.articles.length} articles in ${result.processingTime}ms`);
|
|
750
|
+
return result;
|
|
751
|
+
}
|
|
752
|
+
/**
|
|
753
|
+
* Extract full content for articles (optional enhancement step)
|
|
754
|
+
*/
|
|
755
|
+
async enhanceWithFullContent(articles, maxArticles = 10, options = {}) {
|
|
756
|
+
const concurrency = options.concurrency || 5;
|
|
757
|
+
const toEnhance = articles
|
|
758
|
+
.filter(a => !a.content || a.content.length < 2000) // Only enhance articles without full content
|
|
759
|
+
.slice(0, maxArticles);
|
|
760
|
+
if (toEnhance.length === 0) {
|
|
761
|
+
console.log(`📖 [Orchestrator] No articles need content enhancement`);
|
|
762
|
+
return articles;
|
|
763
|
+
}
|
|
764
|
+
console.log(`📖 [Orchestrator] Enhancing ${toEnhance.length} articles in PARALLEL (concurrency: ${concurrency})`);
|
|
765
|
+
const limit = (0, p_limit_1.default)(concurrency);
|
|
766
|
+
let completed = 0;
|
|
767
|
+
await Promise.allSettled(toEnhance.map(article => limit(async () => {
|
|
768
|
+
try {
|
|
769
|
+
const extractedContent = await globalContentExtractor.extractContent(article.url);
|
|
770
|
+
if (extractedContent) {
|
|
771
|
+
article.content = extractedContent.content;
|
|
772
|
+
article.excerpt = extractedContent.excerpt || article.excerpt;
|
|
773
|
+
article.confidence = Math.min(article.confidence + 0.1, 1.0);
|
|
774
|
+
article.metadata = {
|
|
775
|
+
...article.metadata,
|
|
776
|
+
fullContentExtracted: true,
|
|
777
|
+
extractionMethod: extractedContent.extractionMethod,
|
|
778
|
+
wordCount: extractedContent.wordCount,
|
|
779
|
+
readingTime: extractedContent.readingTime
|
|
780
|
+
};
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
catch (error) {
|
|
784
|
+
console.warn(`⚠️ [Orchestrator] Failed to enhance article ${article.url}:`, error);
|
|
785
|
+
}
|
|
786
|
+
finally {
|
|
787
|
+
completed++;
|
|
788
|
+
options.onProgress?.(completed, toEnhance.length);
|
|
789
|
+
}
|
|
790
|
+
})));
|
|
791
|
+
console.log(`📖 [Orchestrator] Content enhancement complete: ${completed}/${toEnhance.length}`);
|
|
792
|
+
return articles;
|
|
793
|
+
}
|
|
794
|
+
/**
|
|
795
|
+
* Validate orchestrator configuration
|
|
796
|
+
*/
|
|
797
|
+
static validateConfig(config) {
|
|
798
|
+
try {
|
|
799
|
+
return exports.SourceConfigSchema.parse(config);
|
|
800
|
+
}
|
|
801
|
+
catch (error) {
|
|
802
|
+
if (error instanceof zod_1.z.ZodError) {
|
|
803
|
+
throw new Error(`Invalid source configuration: ${error.errors.map(e => e.message).join(', ')}`);
|
|
804
|
+
}
|
|
805
|
+
throw error;
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
/**
|
|
809
|
+
* Get source statistics
|
|
810
|
+
*/
|
|
811
|
+
async getSourceStats(url) {
|
|
812
|
+
const robotsCheck = await globalRobotsChecker.isAllowed(url);
|
|
813
|
+
const discoveredFeeds = await rss_discovery_1.globalRSSDiscovery.discoverFeeds(url);
|
|
814
|
+
let hasSitemap = false;
|
|
815
|
+
let estimatedArticleCount = 0;
|
|
816
|
+
try {
|
|
817
|
+
const urlObj = new URL(url);
|
|
818
|
+
const sitemaps = await sitemap_parser_1.globalSitemapParser.discoverSitemaps(urlObj.hostname);
|
|
819
|
+
hasSitemap = sitemaps.length > 0;
|
|
820
|
+
if (hasSitemap) {
|
|
821
|
+
const recentEntries = await sitemap_parser_1.globalSitemapParser.getRecentEntries(urlObj.hostname, { hoursBack: 48, maxEntries: 100 });
|
|
822
|
+
estimatedArticleCount = recentEntries.length;
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
catch (error) {
|
|
826
|
+
// Ignore sitemap errors for stats
|
|
827
|
+
}
|
|
828
|
+
return {
|
|
829
|
+
robotsCompliant: robotsCheck.allowed,
|
|
830
|
+
hasRSSFeed: discoveredFeeds.length > 0,
|
|
831
|
+
hasSitemap,
|
|
832
|
+
detectedType: discoveredFeeds.length > 0 ? 'rss' : hasSitemap ? 'sitemap' : 'html',
|
|
833
|
+
estimatedArticleCount
|
|
834
|
+
};
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
exports.SourceOrchestrator = SourceOrchestrator;
|
|
838
|
+
// Export a shared default instance, constructed with no arguments when this module is evaluated
|
|
839
|
+
exports.globalSourceOrchestrator = new SourceOrchestrator();
|
|
840
|
+
//# sourceMappingURL=source-orchestrator.js.map
|