@tyroneross/blog-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,3236 @@
1
"use strict";
// ---------------------------------------------------------------------------
// esbuild-generated CommonJS interop helpers (do not hand-edit).
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Install a lazy, enumerable getter on `target` for every key in `all`.
// Used to populate the module's export object without eagerly evaluating
// each exported binding.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as getters, skipping `except` and
// any key already present on `to`; enumerability is taken from the source
// property descriptor (note `desc` is reused as a scratch variable).
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wrap a CommonJS module so it can be consumed with ESM `import` semantics
// (adds a synthetic `default` when the module was not transpiled from ESM).
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Mark the export object as an ES module and copy all exports onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
// src/index.ts
// Public export surface of the package. Each entry is wired up as a lazy
// getter via __export; `module.exports` is the __esModule-marked wrapper.
var index_exports = {};
__export(index_exports, {
  CircuitBreaker: () => CircuitBreaker,
  ContentExtractor: () => ContentExtractor,
  DEFAULT_DENY_PATHS: () => DEFAULT_DENY_PATHS,
  DEFAULT_QUALITY_CONFIG: () => DEFAULT_QUALITY_CONFIG,
  HTMLScraper: () => HTMLScraper,
  RSSDiscovery: () => RSSDiscovery,
  RobotsChecker: () => RobotsChecker,
  ScrapingRateLimiter: () => ScrapingRateLimiter,
  SitemapParser: () => SitemapParser,
  SourceOrchestrator: () => SourceOrchestrator,
  VERSION: () => VERSION,
  calculateArticleQualityScore: () => calculateArticleQualityScore,
  circuitBreakers: () => circuitBreakers,
  cleanText: () => cleanText,
  convertToMarkdown: () => convertToMarkdown,
  decodeHTMLEntities: () => decodeHTMLEntities,
  detectParagraphs: () => detectParagraphs,
  fetchRSSFeed: () => fetchRSSFeed,
  getQualityBreakdown: () => getQualityBreakdown,
  globalContentExtractor: () => globalContentExtractor,
  globalRSSDiscovery: () => globalRSSDiscovery,
  globalRateLimiter: () => globalRateLimiter,
  globalRobotsChecker: () => globalRobotsChecker,
  globalSitemapParser: () => globalSitemapParser,
  globalSourceOrchestrator: () => globalSourceOrchestrator,
  htmlToMarkdown: () => htmlToMarkdown,
  // NOTE: exported name differs from the internal binding (`normalizeWhitespace2`
  // is the bundler-renamed local for the public `normalizeWhitespace`).
  normalizeWhitespace: () => normalizeWhitespace2,
  quickScrape: () => quickScrape,
  removeUrls: () => removeUrls,
  scrape: () => scrape,
  shouldDenyUrl: () => shouldDenyUrl,
  stripHTML: () => stripHTML,
  stripNonArticleContent: () => stripNonArticleContent,
  truncateText: () => truncateText,
  validateContent: () => validateContent
});
module.exports = __toCommonJS(index_exports);
70
+
71
+ // src/orchestrator/source-orchestrator.ts
72
+ var import_zod = require("zod");
73
+ var import_crypto2 = __toESM(require("crypto"));
74
+
75
+ // src/utils/rss-utils.ts
76
+ var import_rss_parser = __toESM(require("rss-parser"));
77
+ var import_crypto = __toESM(require("crypto"));
78
// Module-level rss-parser instance shared by fetchRSSFeed below.
var parser = new import_rss_parser.default({
  timeout: 15e3,
  // Increased timeout (15s)
  headers: {
    "User-Agent": "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)"
  }
});
85
/**
 * Fetch an RSS/Atom feed and normalize its items.
 *
 * Best-effort: every failure path logs a diagnostic and resolves with an
 * empty array rather than throwing, so callers never need try/catch.
 *
 * @param {string} url - Feed URL to fetch.
 * @param {string} _sourceId - Accepted but unused in this implementation
 *   (kept for interface compatibility — presumably used by callers for
 *   attribution; TODO confirm).
 * @returns {Promise<Array<{title: string, link: string, pubDate: string,
 *   guid: string, content: string, contentSnippet: string}>>}
 */
async function fetchRSSFeed(url, _sourceId) {
  try {
    console.log(`\u{1F504} [RSS] Fetching feed from ${url}`);
    const feed = await parser.parseURL(url);
    if (!feed.items || feed.items.length === 0) {
      console.warn(`\u26A0\uFE0F [RSS] Feed from ${url} contains no items`);
      return [];
    }
    // Normalize each item, filling gaps with safe fallbacks:
    // missing guid falls back to the link, then a random UUID.
    const items = feed.items.map((item) => ({
      title: item.title || "Untitled",
      link: item.link || "",
      pubDate: item.pubDate || (/* @__PURE__ */ new Date()).toISOString(),
      guid: item.guid || item.link || import_crypto.default.randomUUID(),
      // `content:encoded` is the common RSS extension for full HTML bodies.
      content: item.content || item["content:encoded"] || "",
      contentSnippet: item.contentSnippet || ""
    }));
    console.log(`\u2705 [RSS] Successfully fetched ${items.length} items from ${url}`);
    return items;
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : "Unknown error";
    console.error(`\u274C [RSS] Failed to fetch RSS from ${url}:`, errorMessage);
    // Message sniffing below only refines the diagnostics; all failures
    // still resolve to an empty result.
    if (error instanceof Error) {
      if (error.message.includes("Invalid character")) {
        console.error(`\u{1F50D} [RSS] XML parsing error - feed may be malformed or contain HTML`);
      } else if (error.message.includes("timeout")) {
        console.error(`\u{1F50D} [RSS] Request timeout - server may be slow or unreachable`);
      } else if (error.message.includes("ENOTFOUND")) {
        console.error(`\u{1F50D} [RSS] Domain not found - check URL spelling`);
      } else if (error.message.includes("ECONNREFUSED")) {
        console.error(`\u{1F50D} [RSS] Connection refused - server may be down`);
      }
    }
    return [];
  }
}
120
+
121
+ // src/extractors/rss-discovery.ts
122
+ var cheerio = __toESM(require("cheerio"));
123
+
124
+ // src/utils/scraping-rate-limiter.ts
125
/**
 * Per-host rate limiter with priority queueing, retries, and exponential
 * backoff. Each hostname gets its own queue; requests to the same host are
 * spaced at least `baseDelay` ms apart, a global concurrency cap applies
 * across hosts, and transient failures (408/429/5xx, DNS/connection errors,
 * timeouts) are retried with capped exponential backoff.
 *
 * Fix vs. previous version: `shouldRetry`/`handleRequestError` dereferenced
 * `error.message` unconditionally. A non-Error throwable (string, plain
 * object) made `shouldRetry` throw inside `processQueue`'s catch path, so
 * the caller's promise never settled. All error-shape accesses are now
 * guarded.
 */
var ScrapingRateLimiter = class {
  /**
   * @param {object} [options]
   * @param {number} [options.requestsPerSecond=1] - per-host request rate
   * @param {number} [options.maxBackoff=30000] - cap on backoff wait (ms)
   * @param {number} [options.maxConcurrent=10] - global in-flight cap
   */
  constructor(options = {}) {
    this.hosts = /* @__PURE__ */ new Map();
    this.activeRequests = /* @__PURE__ */ new Set();
    this.baseDelay = Math.floor(1e3 / (options.requestsPerSecond || 1));
    this.maxBackoff = options.maxBackoff || 3e4;
    this.maxConcurrent = options.maxConcurrent || 10;
  }
  /**
   * Run `operation` subject to this limiter's per-host pacing.
   * @param {string} url - target URL; only its hostname is used for queueing
   * @param {() => Promise<any>} operation - the actual request to perform
   * @param {{priority?: number, maxRetries?: number}} [options]
   * @returns {Promise<any>} settles with the operation's final outcome
   * @throws {Error} rejects immediately if `url` cannot be parsed
   */
  async execute(url, operation, options = {}) {
    const host = this.extractHost(url);
    if (!host) {
      throw new Error(`Invalid URL: ${url}`);
    }
    return new Promise((resolve, reject) => {
      const request = {
        resolve,
        reject,
        operation,
        priority: options.priority || 0,
        retryCount: 0,
        maxRetries: options.maxRetries || 3,
        host
      };
      this.enqueueRequest(host, request);
    });
  }
  /** Lower-cased hostname of `url`, or null when unparseable. */
  extractHost(url) {
    try {
      const parsed = new URL(url);
      return parsed.hostname.toLowerCase();
    } catch {
      return null;
    }
  }
  /**
   * Insert a request into the host's queue (ordered by descending priority,
   * FIFO within equal priority) and kick off the queue processor if idle.
   */
  enqueueRequest(host, request) {
    if (!this.hosts.has(host)) {
      this.hosts.set(host, {
        lastRequest: 0,
        backoffUntil: 0,
        backoffMultiplier: 1,
        queue: [],
        processing: false
      });
    }
    const hostState = this.hosts.get(host);
    const insertIndex = hostState.queue.findIndex(
      (req) => req.priority < request.priority
    );
    if (insertIndex === -1) {
      hostState.queue.push(request);
    } else {
      hostState.queue.splice(insertIndex, 0, request);
    }
    if (!hostState.processing) {
      this.processQueue(host).catch((error) => {
        console.error(`[RateLimiter] Error processing queue for ${host}:`, error);
      });
    }
  }
  /**
   * Drain one host's queue sequentially, honoring the global concurrency
   * cap, any active backoff window, and the per-host minimum delay.
   */
  async processQueue(host) {
    const hostState = this.hosts.get(host);
    if (!hostState || hostState.processing) {
      return;
    }
    hostState.processing = true;
    try {
      while (hostState.queue.length > 0) {
        // Global concurrency cap: poll every 100ms until a slot frees up.
        if (this.activeRequests.size >= this.maxConcurrent) {
          await this.wait(100);
          continue;
        }
        // Respect an active backoff window for this host.
        if (Date.now() < hostState.backoffUntil) {
          const waitTime = hostState.backoffUntil - Date.now();
          await this.wait(Math.min(waitTime, 1e3));
          continue;
        }
        // Enforce the per-host minimum spacing between requests.
        const now = Date.now();
        const timeSinceLastRequest = now - hostState.lastRequest;
        if (timeSinceLastRequest < this.baseDelay) {
          const waitTime = this.baseDelay - timeSinceLastRequest;
          await this.wait(waitTime);
          continue;
        }
        const request = hostState.queue.shift();
        const requestId = `${host}-${Date.now()}-${Math.random()}`;
        this.activeRequests.add(requestId);
        try {
          hostState.lastRequest = Date.now();
          const result = await request.operation();
          // Success resets the host's backoff state.
          hostState.backoffMultiplier = 1;
          hostState.backoffUntil = 0;
          request.resolve(result);
        } catch (error) {
          await this.handleRequestError(hostState, request, error);
        } finally {
          this.activeRequests.delete(requestId);
        }
      }
    } finally {
      hostState.processing = false;
    }
  }
  /**
   * Decide whether a failed request is retried (requeued at the front,
   * possibly after arming a backoff window) or finally rejected with the
   * original throwable.
   */
  async handleRequestError(hostState, request, error) {
    // Tolerate non-Error throwables — never assume `.message` exists.
    const message = error instanceof Error ? error.message : String(error);
    const shouldRetry = this.shouldRetry(error, request);
    if (shouldRetry && request.retryCount < request.maxRetries) {
      request.retryCount++;
      if (this.shouldBackoff(error)) {
        const backoffTime = Math.min(
          this.baseDelay * hostState.backoffMultiplier * Math.pow(2, request.retryCount),
          this.maxBackoff
        );
        hostState.backoffUntil = Date.now() + backoffTime;
        hostState.backoffMultiplier = Math.min(hostState.backoffMultiplier * 1.5, 10);
        console.warn(
          `[RateLimiter] Backing off ${request.host} for ${backoffTime}ms (attempt ${request.retryCount}/${request.maxRetries}): ${message}`
        );
      }
      // Demote slightly so a repeatedly-failing request cannot starve others.
      request.priority = Math.max(request.priority - 1, -10);
      hostState.queue.unshift(request);
    } else {
      console.error(
        `[RateLimiter] Request failed for ${request.host} (${request.retryCount}/${request.maxRetries} retries): ${message}`
      );
      request.reject(error);
    }
  }
  /**
   * Retry on DNS/connection errors, 408/429/5xx statuses, aborts, and
   * timeout-flavored messages. All property accesses are optional-chained
   * so arbitrary throwables are handled safely.
   */
  shouldRetry(error, request) {
    if (request.retryCount >= request.maxRetries) {
      return false;
    }
    if (error?.code === "ENOTFOUND" || error?.code === "ECONNREFUSED") {
      return true;
    }
    if (error?.status) {
      const status = error.status;
      return status === 408 || status === 429 || status >= 500;
    }
    if (error?.name === "AbortError" || (typeof error?.message === "string" && error.message.includes("timeout"))) {
      return true;
    }
    return false;
  }
  /** Back off only for rate-limit/server errors and connection-level failures. */
  shouldBackoff(error) {
    if (error?.status) {
      const status = error.status;
      return status === 429 || status >= 500;
    }
    if (error?.code === "ECONNREFUSED" || error?.code === "ENOTFOUND") {
      return true;
    }
    return false;
  }
  /** Promise-based sleep. */
  wait(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
  // Utility method to get current queue stats
  getStats() {
    const stats = {};
    this.hosts.forEach((state, host) => {
      stats[host] = {
        queueLength: state.queue.length,
        processing: state.processing,
        backoffUntil: state.backoffUntil,
        backoffMultiplier: state.backoffMultiplier,
        lastRequest: state.lastRequest
      };
    });
    return {
      hosts: stats,
      activeRequests: this.activeRequests.size,
      maxConcurrent: this.maxConcurrent
    };
  }
};
299
// Process-wide limiter shared by all extractors: 1 request/second per host,
// 30s max backoff, at most 10 requests in flight globally.
var globalRateLimiter = new ScrapingRateLimiter({
  requestsPerSecond: 1,
  maxBackoff: 3e4,
  maxConcurrent: 10
});
304
+
305
+ // src/extractors/robots-checker.ts
306
/**
 * robots.txt fetcher/parser with a per-URL in-memory cache (24h TTL).
 *
 * Fail-open policy: any fetch/parse error, timeout, or missing robots.txt
 * results in `allowed: true`.
 *
 * NOTE(review): known deviations from the robots.txt spec to confirm/fix:
 *  - Consecutive `User-agent:` lines each start a NEW rule group here,
 *    whereas the spec groups them so they share the following directives.
 *  - The `$` end-of-URL anchor in patterns is not supported (it is escaped
 *    and matched literally).
 *  - User-agent matching is exact-string ("atomizenews/1.0"), while the
 *    spec calls for case-insensitive substring/token matching — a robots
 *    file naming `AtomizeNews` will fall through to `*`.
 */
var RobotsChecker = class {
  constructor() {
    // robotsUrl -> parsed RobotsTxt (with fetchedAt/expiresAt timestamps)
    this.cache = /* @__PURE__ */ new Map();
    this.cacheTimeout = 24 * 60 * 60 * 1e3;
    // 24 hours
    this.userAgent = "AtomizeNews/1.0";
    this.requestTimeout = 5e3;
  }
  // 5 seconds
  /**
   * Check if a URL is allowed to be crawled according to robots.txt.
   * Returns { allowed, crawlDelay?, sitemaps, reason }; fails open on error.
   */
  async isAllowed(url) {
    try {
      const urlObj = new URL(url);
      const robotsUrl = `${urlObj.protocol}//${urlObj.host}/robots.txt`;
      console.log(`\u{1F916} [Robots] Checking ${url} against ${robotsUrl}`);
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      if (!robotsTxt) {
        return {
          allowed: true,
          sitemaps: [],
          reason: "No robots.txt found - allowing by default"
        };
      }
      const result = this.checkRules(urlObj.pathname, robotsTxt);
      console.log(`\u{1F916} [Robots] ${result.allowed ? "\u2705 Allowed" : "\u274C Blocked"}: ${url} - ${result.reason}`);
      return result;
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error checking robots.txt for ${url}:`, error);
      return {
        allowed: true,
        sitemaps: [],
        reason: `Error checking robots.txt: ${error instanceof Error ? error.message : "Unknown error"}`
      };
    }
  }
  /**
   * Get sitemaps listed in robots.txt for a domain (always fetched over
   * https). Returns [] on any error.
   */
  async getSitemaps(domain) {
    try {
      const robotsUrl = `https://${domain}/robots.txt`;
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      return robotsTxt ? robotsTxt.sitemaps : [];
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error getting sitemaps for ${domain}:`, error);
      return [];
    }
  }
  /**
   * Get the recommended crawl delay (ms) for a domain, or undefined when
   * none applies or the lookup fails.
   */
  async getCrawlDelay(domain) {
    try {
      const robotsUrl = `https://${domain}/robots.txt`;
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      if (!robotsTxt) return void 0;
      const rule = this.findBestMatchingRule(robotsTxt.rules);
      return rule?.crawlDelay;
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error getting crawl delay for ${domain}:`, error);
      return void 0;
    }
  }
  /**
   * Fetch and parse robots.txt, serving from cache while unexpired.
   * Returns null for 404 / timeout / any fetch failure (callers fail open).
   */
  async getRobotsTxt(robotsUrl) {
    const cached = this.cache.get(robotsUrl);
    if (cached && Date.now() < cached.expiresAt) {
      return cached;
    }
    try {
      console.log(`\u{1F916} [Robots] Fetching ${robotsUrl}`);
      // Manual timeout via AbortController (fetch has no timeout option).
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.requestTimeout);
      const response = await fetch(robotsUrl, {
        headers: {
          "User-Agent": this.userAgent
        },
        signal: controller.signal
      });
      clearTimeout(timeoutId);
      if (!response.ok) {
        if (response.status === 404) {
          console.log(`\u{1F916} [Robots] No robots.txt found at ${robotsUrl}`);
          return null;
        }
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      const text = await response.text();
      const robotsTxt = this.parseRobotsTxt(text);
      this.cache.set(robotsUrl, robotsTxt);
      console.log(`\u{1F916} [Robots] Successfully parsed robots.txt for ${new URL(robotsUrl).hostname}`);
      return robotsTxt;
    } catch (error) {
      if (error instanceof Error && error.name === "AbortError") {
        console.warn(`\u26A0\uFE0F [Robots] Timeout fetching ${robotsUrl}`);
      } else {
        console.warn(`\u26A0\uFE0F [Robots] Error fetching ${robotsUrl}:`, error);
      }
      return null;
    }
  }
  /**
   * Line-oriented robots.txt parser. Builds one rule object per
   * `User-agent:` line plus a global sitemap list.
   * NOTE(review): consecutive User-agent lines are NOT grouped — the first
   * of the pair gets an empty rule; confirm against spec before relying on
   * multi-agent groups.
   */
  parseRobotsTxt(text) {
    // Drop blanks and full-line comments before parsing.
    const lines = text.split("\n").map((line) => line.trim()).filter((line) => line && !line.startsWith("#"));
    const rules = [];
    const globalSitemaps = [];
    let currentRule = null;
    for (const line of lines) {
      // Re-join on ":" so values containing colons (URLs) stay intact.
      const [key, ...valueParts] = line.split(":");
      const value = valueParts.join(":").trim();
      const lowerKey = key.toLowerCase().trim();
      switch (lowerKey) {
        case "user-agent":
          if (currentRule) {
            rules.push(this.completeRule(currentRule));
          }
          currentRule = {
            userAgent: value.toLowerCase(),
            disallows: [],
            allows: [],
            sitemaps: []
          };
          break;
        case "disallow":
          if (currentRule) {
            currentRule.disallows = currentRule.disallows || [];
            // An empty Disallow value means "allow everything" — skip it.
            if (value) {
              currentRule.disallows.push(value);
            }
          }
          break;
        case "allow":
          if (currentRule) {
            currentRule.allows = currentRule.allows || [];
            if (value) {
              currentRule.allows.push(value);
            }
          }
          break;
        case "crawl-delay":
          if (currentRule && value) {
            // Stored in milliseconds (robots.txt specifies seconds).
            const delay = parseFloat(value);
            if (!isNaN(delay)) {
              currentRule.crawlDelay = delay * 1e3;
            }
          }
          break;
        case "sitemap":
          if (value) {
            // Sitemaps are global per spec, but also recorded on the
            // current rule group when one is open.
            globalSitemaps.push(value);
            if (currentRule) {
              currentRule.sitemaps = currentRule.sitemaps || [];
              currentRule.sitemaps.push(value);
            }
          }
          break;
      }
    }
    if (currentRule) {
      rules.push(this.completeRule(currentRule));
    }
    const now = Date.now();
    return {
      rules,
      sitemaps: globalSitemaps,
      fetchedAt: now,
      expiresAt: now + this.cacheTimeout
    };
  }
  /** Fill in defaults so every rule has a complete, uniform shape. */
  completeRule(partial) {
    return {
      userAgent: partial.userAgent || "*",
      disallows: partial.disallows || [],
      allows: partial.allows || [],
      crawlDelay: partial.crawlDelay,
      sitemaps: partial.sitemaps || []
    };
  }
  /**
   * Evaluate `path` against the best-matching rule group.
   * Precedence here: explicit Allow wins over Disallow (checked first);
   * no matching rule means allowed.
   */
  checkRules(path, robotsTxt) {
    const rule = this.findBestMatchingRule(robotsTxt.rules);
    if (!rule) {
      return {
        allowed: true,
        sitemaps: robotsTxt.sitemaps,
        reason: "No applicable rules found"
      };
    }
    for (const allowPattern of rule.allows) {
      if (this.matchesPattern(path, allowPattern)) {
        return {
          allowed: true,
          crawlDelay: rule.crawlDelay,
          sitemaps: robotsTxt.sitemaps,
          reason: `Explicitly allowed by pattern: ${allowPattern}`
        };
      }
    }
    for (const disallowPattern of rule.disallows) {
      if (this.matchesPattern(path, disallowPattern)) {
        return {
          allowed: false,
          crawlDelay: rule.crawlDelay,
          sitemaps: robotsTxt.sitemaps,
          reason: `Blocked by pattern: ${disallowPattern}`
        };
      }
    }
    return {
      allowed: true,
      crawlDelay: rule.crawlDelay,
      sitemaps: robotsTxt.sitemaps,
      reason: "No matching disallow rules"
    };
  }
  /**
   * Prefer an exact user-agent match, then the `*` wildcard group.
   * NOTE(review): exact-string comparison against "atomizenews/1.0" —
   * spec-compliant matching is by substring/product token; verify.
   */
  findBestMatchingRule(rules) {
    const exactMatch = rules.find((rule) => rule.userAgent === this.userAgent.toLowerCase());
    if (exactMatch) return exactMatch;
    const wildcardMatch = rules.find((rule) => rule.userAgent === "*");
    if (wildcardMatch) return wildcardMatch;
    return null;
  }
  /**
   * Pattern match for robots paths: "" never matches, "/" matches all,
   * `*` is a wildcard, otherwise plain prefix match.
   * NOTE(review): the `$` end-anchor is escaped and treated literally.
   */
  matchesPattern(path, pattern) {
    if (pattern === "") {
      return false;
    }
    if (pattern === "/") {
      return true;
    }
    if (pattern.includes("*")) {
      // Escape regex metacharacters, then turn the escaped `*` into `.*`.
      const regexPattern = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
      const regex = new RegExp("^" + regexPattern);
      return regex.test(path);
    }
    return path.startsWith(pattern);
  }
  // Clear cache (useful for testing)
  clearCache() {
    this.cache.clear();
  }
  // Get cache stats
  getCacheStats() {
    return {
      size: this.cache.size,
      entries: Array.from(this.cache.entries()).map(([url, data]) => ({
        url,
        fetchedAt: new Date(data.fetchedAt).toISOString(),
        expiresAt: new Date(data.expiresAt).toISOString(),
        rulesCount: data.rules.length,
        sitemapsCount: data.sitemaps.length
      }))
    };
  }
};
559
// Process-wide robots.txt checker (shares one 24h cache across extractors).
var globalRobotsChecker = new RobotsChecker();
560
+
561
+ // src/extractors/rss-discovery.ts
562
/**
 * Multi-strategy RSS/Atom feed discovery for an arbitrary URL.
 *
 * Strategies, tried in order (later ones only when earlier ones found
 * nothing): (1) the URL itself is a feed (HEAD content-type check),
 * (2) `<link rel="alternate">` tags plus feed-looking anchors in the HTML,
 * (3) common well-known feed paths, (4) feed-looking URLs scraped from the
 * page text. Results carry a `confidence` score reflecting the strategy,
 * and all network access goes through globalRateLimiter / respects
 * globalRobotsChecker.
 */
var RSSDiscovery = class {
  constructor() {
    this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
    this.timeout = 1e4;
  }
  // 10 seconds
  // private readonly maxRedirects = 3; // Currently unused
  /**
   * Discover RSS feeds from a given URL.
   * Returns feeds sorted by descending confidence; [] on any failure.
   */
  async discoverFeeds(url) {
    console.log(`\u{1F50D} [RSSDiscovery] Starting feed discovery for ${url}`);
    // Keyed by feed URL to dedupe across strategies.
    const feeds = /* @__PURE__ */ new Map();
    try {
      // Strategy 1: the URL itself may already be a feed — short-circuits.
      const directFeed = await this.checkDirectFeed(url);
      if (directFeed) {
        feeds.set(directFeed.url, directFeed);
        console.log(`\u2705 [RSSDiscovery] Direct feed found: ${directFeed.url}`);
        return Array.from(feeds.values());
      }
      // Note: robots.txt is only consulted before fetching the HTML page,
      // not for the direct-feed HEAD probe above.
      const robotsCheck = await globalRobotsChecker.isAllowed(url);
      if (!robotsCheck.allowed) {
        console.warn(`\u{1F916} [RSSDiscovery] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
        return [];
      }
      const html = await this.fetchPage(url);
      if (!html) {
        return [];
      }
      // Strategy 2: <link rel="alternate"> tags and feed-looking anchors.
      const linkFeeds = this.extractFeedsFromHTML(html, url);
      linkFeeds.forEach((feed) => feeds.set(feed.url, feed));
      // Strategy 3: probe well-known feed paths, only if nothing found yet.
      if (feeds.size === 0) {
        const commonPathFeeds = await this.checkCommonPaths(url);
        commonPathFeeds.forEach((feed) => feeds.set(feed.url, feed));
      }
      // Strategy 4: last resort — scan visible page text for feed-like URLs.
      if (feeds.size === 0) {
        const contentFeeds = await this.scanForFeedContent(html, url);
        contentFeeds.forEach((feed) => feeds.set(feed.url, feed));
      }
      const discoveredFeeds = Array.from(feeds.values());
      discoveredFeeds.sort((a, b) => b.confidence - a.confidence);
      console.log(`\u{1F50D} [RSSDiscovery] Discovered ${discoveredFeeds.length} feeds for ${url}`);
      return discoveredFeeds;
    } catch (error) {
      console.error(`\u274C [RSSDiscovery] Error discovering feeds for ${url}:`, error);
      return [];
    }
  }
  /**
   * Check if the URL itself is a direct feed via a rate-limited HEAD
   * request; returns a confidence-1 result or null.
   */
  async checkDirectFeed(url) {
    try {
      const response = await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.timeout);
        try {
          const res = await fetch(url, {
            method: "HEAD",
            headers: { "User-Agent": this.userAgent },
            signal: controller.signal
          });
          clearTimeout(timeoutId);
          return res;
        } catch (error) {
          clearTimeout(timeoutId);
          throw error;
        }
      });
      const contentType = response.headers.get("content-type") || "";
      if (this.isFeedContentType(contentType)) {
        const type = this.determineFeedType(contentType);
        return {
          url,
          type,
          source: "link-tag",
          confidence: 1
        };
      }
      return null;
    } catch (error) {
      // Best-effort probe: any failure simply means "not a direct feed".
      return null;
    }
  }
  /**
   * Fetch an HTML page (rate-limited, with timeout); returns null unless
   * the response is OK and content-type is text/html.
   */
  async fetchPage(url) {
    try {
      return await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.timeout);
        try {
          const response = await fetch(url, {
            headers: { "User-Agent": this.userAgent },
            signal: controller.signal
          });
          clearTimeout(timeoutId);
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
          }
          const contentType = response.headers.get("content-type") || "";
          if (!contentType.includes("text/html")) {
            throw new Error(`Not HTML content: ${contentType}`);
          }
          return await response.text();
        } catch (error) {
          clearTimeout(timeoutId);
          throw error;
        }
      });
    } catch (error) {
      console.error(`\u274C [RSSDiscovery] Error fetching page ${url}:`, error);
      return null;
    }
  }
  /**
   * Extract feed URLs from HTML: `<link rel="alternate">` tags with a feed
   * content-type (confidence 0.9), then anchors whose href/text look
   * feed-like (confidence 0.6). No network access.
   */
  extractFeedsFromHTML(html, baseUrl) {
    const feeds = [];
    try {
      const $ = cheerio.load(html);
      $('link[rel="alternate"]').each((_, element) => {
        const $link = $(element);
        const type = $link.attr("type");
        const href = $link.attr("href");
        const title = $link.attr("title");
        if (href && this.isFeedContentType(type || "")) {
          const absoluteUrl = this.resolveUrl(href, baseUrl);
          if (absoluteUrl) {
            feeds.push({
              url: absoluteUrl,
              title: title || void 0,
              type: this.determineFeedType(type || ""),
              source: "link-tag",
              confidence: 0.9
            });
          }
        }
      });
      $("a[href]").each((_, element) => {
        const $link = $(element);
        const href = $link.attr("href");
        const text = $link.text().toLowerCase().trim();
        if (href && this.isFeedLikeLink(href, text)) {
          const absoluteUrl = this.resolveUrl(href, baseUrl);
          if (absoluteUrl && !feeds.some((f) => f.url === absoluteUrl)) {
            feeds.push({
              url: absoluteUrl,
              title: $link.text().trim() || void 0,
              type: this.guessFeedType(href),
              source: "content-scan",
              confidence: 0.6
            });
          }
        }
      });
    } catch (error) {
      console.error(`\u274C [RSSDiscovery] Error parsing HTML for feeds:`, error);
    }
    return feeds;
  }
  /**
   * Probe well-known feed paths on the site's origin (robots-checked and
   * HEAD-validated, sequentially). Confidence 0.7.
   */
  async checkCommonPaths(url) {
    const baseUrl = new URL(url);
    const commonPaths = [
      "/feed/",
      "/feed.xml",
      "/rss/",
      "/rss.xml",
      "/feeds/",
      "/feeds.xml",
      "/atom.xml",
      "/index.xml",
      "/blog/feed/",
      "/blog/rss.xml",
      "/news/feed/",
      "/news/rss.xml"
    ];
    const feeds = [];
    for (const path of commonPaths) {
      try {
        const testUrl = `${baseUrl.protocol}//${baseUrl.host}${path}`;
        const robotsCheck = await globalRobotsChecker.isAllowed(testUrl);
        if (!robotsCheck.allowed) {
          continue;
        }
        const isValid = await this.validateFeedUrl(testUrl);
        if (isValid) {
          feeds.push({
            url: testUrl,
            type: this.guessFeedType(path),
            source: "common-path",
            confidence: 0.7
          });
        }
      } catch (error) {
        // One bad path must not abort the remaining probes.
        continue;
      }
    }
    return feeds;
  }
  /**
   * Scan the page's visible text for URLs containing feed/rss/atom and
   * HEAD-validate each candidate. Lowest-confidence strategy (0.5).
   */
  async scanForFeedContent(html, baseUrl) {
    const feeds = [];
    try {
      const $ = cheerio.load(html);
      const text = $.text();
      const urlRegex = /https?:\/\/[^\s]+(?:feed|rss|atom)[^\s]*/gi;
      const matches = text.match(urlRegex);
      if (matches) {
        for (const match of matches) {
          // Trim a single trailing punctuation char picked up by the regex.
          const cleanUrl = match.replace(/[.,;:!?)]$/, "");
          const absoluteUrl = this.resolveUrl(cleanUrl, baseUrl);
          if (absoluteUrl && !feeds.some((f) => f.url === absoluteUrl)) {
            const isValid = await this.validateFeedUrl(absoluteUrl);
            if (isValid) {
              feeds.push({
                url: absoluteUrl,
                type: this.guessFeedType(absoluteUrl),
                source: "content-scan",
                confidence: 0.5
              });
            }
          }
        }
      }
    } catch (error) {
      console.error(`\u274C [RSSDiscovery] Error scanning content for feeds:`, error);
    }
    return feeds;
  }
  /**
   * Validate a candidate via a rate-limited HEAD request (5s timeout):
   * true iff the response is OK with a feed content-type.
   */
  async validateFeedUrl(url) {
    try {
      return await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), 5e3);
        try {
          const response = await fetch(url, {
            method: "HEAD",
            headers: { "User-Agent": this.userAgent },
            signal: controller.signal
          });
          clearTimeout(timeoutId);
          if (!response.ok) {
            return false;
          }
          const contentType = response.headers.get("content-type") || "";
          return this.isFeedContentType(contentType);
        } catch (error) {
          clearTimeout(timeoutId);
          return false;
        }
      });
    } catch (error) {
      return false;
    }
  }
  /** Resolve a possibly-relative URL against baseUrl; null if unparseable. */
  resolveUrl(url, baseUrl) {
    try {
      return new URL(url, baseUrl).toString();
    } catch {
      return null;
    }
  }
  /**
   * True when the content type indicates a feed. Note generic XML types
   * (text/xml, application/xml) also count, so some plain XML documents
   * may pass this check.
   */
  isFeedContentType(contentType) {
    const lowerType = contentType.toLowerCase();
    return lowerType.includes("application/rss+xml") || lowerType.includes("application/atom+xml") || lowerType.includes("application/rdf+xml") || lowerType.includes("text/xml") || lowerType.includes("application/xml");
  }
  /** Map a content type to "atom" | "rdf" | "rss" (rss is the default). */
  determineFeedType(contentType) {
    const lowerType = contentType.toLowerCase();
    if (lowerType.includes("atom")) return "atom";
    if (lowerType.includes("rdf")) return "rdf";
    return "rss";
  }
  /** Guess feed type from a URL or link text; defaults to "rss". */
  guessFeedType(urlOrText) {
    const lower = urlOrText.toLowerCase();
    if (lower.includes("atom")) return "atom";
    if (lower.includes("rdf")) return "rdf";
    return "rss";
  }
  /** Heuristic: href or anchor text mentions a feed-related keyword. */
  isFeedLikeLink(href, text) {
    const lowerHref = href.toLowerCase();
    const lowerText = text.toLowerCase();
    const feedKeywords = ["rss", "feed", "atom", "xml", "syndication"];
    return feedKeywords.some(
      (keyword) => lowerHref.includes(keyword) || lowerText.includes(keyword)
    );
  }
};
875
// Process-wide feed-discovery instance shared by the orchestrator.
var globalRSSDiscovery = new RSSDiscovery();
876
+
877
+ // src/extractors/sitemap-parser.ts
878
+ var cheerio2 = __toESM(require("cheerio"));
879
+ var SitemapParser = class {
880
+ constructor() {
881
+ this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
882
+ this.timeout = 15e3;
883
+ // 15 seconds for sitemaps
884
+ this.maxSitemapSize = 50 * 1024 * 1024;
885
+ // 50MB max
886
+ this.maxEntries = 5e4;
887
+ // Max entries per sitemap
888
+ this.recentTimeframe = 48 * 60 * 60 * 1e3;
889
+ }
890
+ // 48 hours in ms
891
+ /**
892
+ * Parse sitemap from URL and return entries
893
+ */
894
+ async parseSitemap(url, options = {}) {
895
+ console.log(`\u{1F5FA}\uFE0F [Sitemap] Starting to parse ${url}`);
896
+ try {
897
+ const robotsCheck = await globalRobotsChecker.isAllowed(url);
898
+ if (!robotsCheck.allowed) {
899
+ console.warn(`\u{1F916} [Sitemap] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
900
+ return [];
901
+ }
902
+ const xml = await this.fetchSitemap(url);
903
+ if (!xml) {
904
+ return [];
905
+ }
906
+ if (this.isSitemapIndex(xml)) {
907
+ return await this.parseSitemapIndex(xml, options);
908
+ } else {
909
+ return this.parseRegularSitemap(xml, options);
910
+ }
911
+ } catch (error) {
912
+ console.error(`\u274C [Sitemap] Error parsing sitemap ${url}:`, error);
913
+ return [];
914
+ }
915
+ }
916
+ /**
917
+ * Discover sitemaps from domain
918
+ */
919
+ async discoverSitemaps(domain) {
920
+ const sitemaps = [];
921
+ try {
922
+ const robotsSitemaps = await globalRobotsChecker.getSitemaps(domain);
923
+ sitemaps.push(...robotsSitemaps);
924
+ const commonPaths = [
925
+ "/sitemap.xml",
926
+ "/sitemap_index.xml",
927
+ "/sitemaps.xml",
928
+ "/sitemap/",
929
+ "/news-sitemap.xml"
930
+ ];
931
+ for (const path of commonPaths) {
932
+ const sitemapUrl = `https://${domain}${path}`;
933
+ if (sitemaps.includes(sitemapUrl)) {
934
+ continue;
935
+ }
936
+ const exists = await this.checkSitemapExists(sitemapUrl);
937
+ if (exists) {
938
+ sitemaps.push(sitemapUrl);
939
+ }
940
+ }
941
+ console.log(`\u{1F5FA}\uFE0F [Sitemap] Discovered ${sitemaps.length} sitemaps for ${domain}`);
942
+ return Array.from(new Set(sitemaps));
943
+ } catch (error) {
944
+ console.error(`\u274C [Sitemap] Error discovering sitemaps for ${domain}:`, error);
945
+ return [];
946
+ }
947
+ }
948
+ /**
949
+ * Get recent entries from all sitemaps for a domain
950
+ */
951
+ async getRecentEntries(domain, options = {}) {
952
+ const hoursBack = options.hoursBack || 48;
953
+ const maxEntries = options.maxEntries || 1e3;
954
+ const sitemaps = await this.discoverSitemaps(domain);
955
+ const allEntries = [];
956
+ for (const sitemapUrl of sitemaps) {
957
+ try {
958
+ const entries = await this.parseSitemap(sitemapUrl, {
959
+ filterRecent: true,
960
+ maxEntries: Math.floor(maxEntries / sitemaps.length),
961
+ // Distribute quota
962
+ includeNews: true
963
+ });
964
+ allEntries.push(...entries);
965
+ } catch (error) {
966
+ console.warn(`\u26A0\uFE0F [Sitemap] Error parsing ${sitemapUrl}:`, error);
967
+ continue;
968
+ }
969
+ }
970
+ const cutoffTime = new Date(Date.now() - hoursBack * 60 * 60 * 1e3);
971
+ const recentEntries = allEntries.filter((entry) => entry.lastmod && entry.lastmod >= cutoffTime).sort((a, b) => {
972
+ if (!a.lastmod || !b.lastmod) return 0;
973
+ return b.lastmod.getTime() - a.lastmod.getTime();
974
+ }).slice(0, maxEntries);
975
+ console.log(`\u{1F5FA}\uFE0F [Sitemap] Found ${recentEntries.length} recent entries from ${domain}`);
976
+ return recentEntries;
977
+ }
978
+ async fetchSitemap(url) {
979
+ try {
980
+ return await globalRateLimiter.execute(url, async () => {
981
+ const controller = new AbortController();
982
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
983
+ try {
984
+ const response = await fetch(url, {
985
+ headers: {
986
+ "User-Agent": this.userAgent,
987
+ "Accept": "application/xml, text/xml, */*"
988
+ },
989
+ signal: controller.signal
990
+ });
991
+ clearTimeout(timeoutId);
992
+ if (!response.ok) {
993
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
994
+ }
995
+ const contentLength = response.headers.get("content-length");
996
+ if (contentLength && parseInt(contentLength) > this.maxSitemapSize) {
997
+ throw new Error(`Sitemap too large: ${contentLength} bytes`);
998
+ }
999
+ const xml = await response.text();
1000
+ if (xml.length > this.maxSitemapSize) {
1001
+ throw new Error(`Sitemap too large: ${xml.length} bytes`);
1002
+ }
1003
+ return xml;
1004
+ } catch (error) {
1005
+ clearTimeout(timeoutId);
1006
+ throw error;
1007
+ }
1008
+ });
1009
+ } catch (error) {
1010
+ console.error(`\u274C [Sitemap] Error fetching ${url}:`, error);
1011
+ return null;
1012
+ }
1013
+ }
1014
+ async checkSitemapExists(url) {
1015
+ try {
1016
+ return await globalRateLimiter.execute(url, async () => {
1017
+ const controller = new AbortController();
1018
+ const timeoutId = setTimeout(() => controller.abort(), 5e3);
1019
+ try {
1020
+ const response = await fetch(url, {
1021
+ method: "HEAD",
1022
+ headers: { "User-Agent": this.userAgent },
1023
+ signal: controller.signal
1024
+ });
1025
+ clearTimeout(timeoutId);
1026
+ return response.ok;
1027
+ } catch (error) {
1028
+ clearTimeout(timeoutId);
1029
+ return false;
1030
+ }
1031
+ });
1032
+ } catch (error) {
1033
+ return false;
1034
+ }
1035
+ }
1036
+ isSitemapIndex(xml) {
1037
+ return xml.includes("<sitemapindex") || xml.includes("</sitemapindex>");
1038
+ }
1039
+ async parseSitemapIndex(xml, options) {
1040
+ console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsing sitemap index`);
1041
+ const $ = cheerio2.load(xml, { xmlMode: true });
1042
+ const sitemaps = [];
1043
+ const allEntries = [];
1044
+ $("sitemap").each((_, element) => {
1045
+ const $element = $(element);
1046
+ const loc = $element.find("loc").first().text().trim();
1047
+ if (loc) {
1048
+ sitemaps.push(loc);
1049
+ }
1050
+ });
1051
+ console.log(`\u{1F5FA}\uFE0F [Sitemap] Found ${sitemaps.length} sitemaps in index`);
1052
+ const entriesPerSitemap = Math.floor((options.maxEntries || this.maxEntries) / sitemaps.length);
1053
+ for (const sitemapUrl of sitemaps.slice(0, 10)) {
1054
+ try {
1055
+ const sitemapXml = await this.fetchSitemap(sitemapUrl);
1056
+ if (sitemapXml) {
1057
+ const entries = this.parseRegularSitemap(sitemapXml, {
1058
+ ...options,
1059
+ maxEntries: entriesPerSitemap
1060
+ });
1061
+ allEntries.push(...entries);
1062
+ }
1063
+ } catch (error) {
1064
+ console.warn(`\u26A0\uFE0F [Sitemap] Error parsing sitemap ${sitemapUrl}:`, error);
1065
+ continue;
1066
+ }
1067
+ }
1068
+ return allEntries;
1069
+ }
1070
/**
 * Parse a regular <urlset> sitemap into entry objects.
 * Each entry has { url } plus optional lastmod (Date), changefreq,
 * priority (number), images[] and news metadata depending on `options`.
 * Note: inside the cheerio .each callback, `return false` STOPS the
 * iteration (used for the maxEntries cap) while `return void 0`
 * merely skips to the next <url> element.
 */
parseRegularSitemap(xml, options) {
  console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsing regular sitemap`);
  const $ = cheerio2.load(xml, { xmlMode: true });
  const entries = [];
  const maxEntries = options.maxEntries || this.maxEntries;
  // When filterRecent is set, entries older than recentTimeframe are
  // dropped — but entries with NO parseable lastmod are kept.
  const cutoffTime = options.filterRecent ? new Date(Date.now() - this.recentTimeframe) : null;
  $("url").each((_index, element) => {
    if (entries.length >= maxEntries) {
      return false;
    }
    const $element = $(element);
    const loc = $element.find("loc").first().text().trim();
    if (!loc) return void 0;
    const entry = { url: loc };
    const lastmodText = $element.find("lastmod").first().text().trim();
    if (lastmodText) {
      const lastmod = new Date(lastmodText);
      if (!isNaN(lastmod.getTime())) {
        entry.lastmod = lastmod;
      }
    }
    if (cutoffTime && entry.lastmod && entry.lastmod < cutoffTime) {
      return void 0;
    }
    const changefreq = $element.find("changefreq").first().text().trim();
    if (changefreq) {
      entry.changefreq = changefreq;
    }
    const priorityText = $element.find("priority").first().text().trim();
    if (priorityText) {
      const priority = parseFloat(priorityText);
      if (!isNaN(priority)) {
        entry.priority = priority;
      }
    }
    if (options.includeImages) {
      // Image sitemap extension; "image\\:" escapes the namespace colon
      // for the cheerio selector.
      const images = [];
      $element.find("image\\:image").each((_, imgElement) => {
        const $img = $(imgElement);
        const imgLoc = $img.find("image\\:loc").first().text().trim();
        if (imgLoc) {
          images.push({
            loc: imgLoc,
            caption: $img.find("image\\:caption").first().text().trim() || void 0,
            title: $img.find("image\\:title").first().text().trim() || void 0
          });
        }
      });
      if (images.length > 0) {
        entry.images = images;
      }
    }
    if (options.includeNews) {
      // Google News sitemap extension; only attached when a title exists.
      const $news = $element.find("news\\:news");
      if ($news.length > 0) {
        const title = $news.find("news\\:title").first().text().trim();
        if (title) {
          entry.news = { title };
          const pubDateText = $news.find("news\\:publication_date").first().text().trim();
          if (pubDateText) {
            const pubDate = new Date(pubDateText);
            if (!isNaN(pubDate.getTime())) {
              entry.news.publishedDate = pubDate;
            }
          }
          const keywords = $news.find("news\\:keywords").first().text().trim();
          if (keywords) {
            entry.news.keywords = keywords.split(",").map((k) => k.trim());
          }
        }
      }
    }
    entries.push(entry);
    return void 0;
  });
  console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsed ${entries.length} entries from sitemap`);
  return entries;
}
1148
+ /**
1149
+ * Validate sitemap format
1150
+ */
1151
+ validateSitemapFormat(xml) {
1152
+ const errors = [];
1153
+ try {
1154
+ const $ = cheerio2.load(xml, { xmlMode: true });
1155
+ const hasUrlset = $("urlset").length > 0;
1156
+ const hasSitemapIndex = $("sitemapindex").length > 0;
1157
+ if (!hasUrlset && !hasSitemapIndex) {
1158
+ errors.push("Missing required root element: <urlset> or <sitemapindex>");
1159
+ }
1160
+ if (hasUrlset) {
1161
+ const urlCount = $("url").length;
1162
+ if (urlCount > 5e4) {
1163
+ errors.push(`Too many URLs: ${urlCount} (max: 50,000)`);
1164
+ }
1165
+ }
1166
+ $("url").each((index, element) => {
1167
+ const $element = $(element);
1168
+ const loc = $element.find("loc").first().text().trim();
1169
+ if (!loc) {
1170
+ errors.push(`URL entry ${index + 1} missing <loc> element`);
1171
+ } else {
1172
+ try {
1173
+ new URL(loc);
1174
+ } catch {
1175
+ errors.push(`Invalid URL in entry ${index + 1}: ${loc}`);
1176
+ }
1177
+ }
1178
+ const lastmod = $element.find("lastmod").first().text().trim();
1179
+ if (lastmod) {
1180
+ const date = new Date(lastmod);
1181
+ if (isNaN(date.getTime())) {
1182
+ errors.push(`Invalid lastmod date in entry ${index + 1}: ${lastmod}`);
1183
+ }
1184
+ }
1185
+ const priority = $element.find("priority").first().text().trim();
1186
+ if (priority) {
1187
+ const priorityNum = parseFloat(priority);
1188
+ if (isNaN(priorityNum) || priorityNum < 0 || priorityNum > 1) {
1189
+ errors.push(`Invalid priority in entry ${index + 1}: ${priority} (must be 0-1)`);
1190
+ }
1191
+ }
1192
+ });
1193
+ } catch (error) {
1194
+ errors.push(`XML parsing error: ${error instanceof Error ? error.message : "Unknown error"}`);
1195
+ }
1196
+ return {
1197
+ valid: errors.length === 0,
1198
+ errors
1199
+ };
1200
+ }
1201
+ };
1202
+ var globalSitemapParser = new SitemapParser();
1203
+
1204
+ // src/extractors/html-scraper.ts
1205
+ var cheerio3 = __toESM(require("cheerio"));
1206
// Perplexity model identifiers used by the HTML scraper's fallback path.
// NOTE(review): hosted model names change over time — verify these are
// still accepted by the Perplexity /chat/completions endpoint.
var PERPLEXITY_MODELS = {
  SONAR: "llama-3.1-sonar-small-128k-online",
  SONAR_PRO: "llama-3.1-sonar-large-128k-online"
};
1210
+ var HTMLScraper = class {
1211
constructor() {
  // Identifying UA string sent with every request (also used for
  // robots.txt matching elsewhere in this file).
  this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
  this.timeout = 1e4;
  // 10 seconds
  // Baseline scraping configuration; user config is merged on top via
  // mergeConfig() (arrays are concatenated, scalars overridden).
  this.defaultConfig = {
    selectors: {
      // CSS selectors that commonly wrap links to individual articles.
      articleLinks: [
        "article a[href]",
        ".article a[href]",
        ".post a[href]",
        ".story a[href]",
        ".news-item a[href]",
        ".content-item a[href]",
        "h1 a[href]",
        "h2 a[href]",
        "h3 a[href]",
        ".headline a[href]",
        ".title a[href]"
      ],
      // Where to look for a better title than the raw link text.
      titleSelectors: [
        "h1",
        "h2",
        "h3",
        ".headline",
        ".title",
        ".article-title",
        ".post-title",
        ".story-title"
      ],
      // Elements likely to hold a publication date.
      dateSelectors: [
        "time[datetime]",
        ".date",
        ".published",
        ".timestamp",
        ".publish-date",
        ".article-date"
      ],
      // Page regions removed before link extraction (boilerplate).
      excludeSelectors: [
        ".advertisement",
        ".ads",
        ".sidebar",
        ".footer",
        ".navigation",
        ".menu",
        ".comments",
        ".related"
      ]
    },
    filters: {
      minTitleLength: 10,
      maxTitleLength: 200,
      // URL shapes that suggest an article page (any match passes).
      includePatterns: [
        /\/article\//i,
        /\/post\//i,
        /\/story\//i,
        /\/news\//i,
        /\/blog\//i,
        /\/\d{4}\/\d{2}\/\d{2}\//,
        // Date patterns
        /\/\d{4}\/\d{2}\//
      ],
      // URL shapes rejected outright (listing pages, assets, non-http).
      excludePatterns: [
        /\/(tag|category|author|search|archive)\//i,
        /\/(login|register|contact|about)\//i,
        /\.(pdf|jpg|jpeg|png|gif|mp4|zip|doc)$/i,
        /#/,
        // Skip hash links
        /javascript:/i,
        /mailto:/i
      ]
    },
    limits: {
      maxLinksPerPage: 100,
      maxDepth: 3
    }
  };
}
1288
+ /**
1289
+ * Extract article links from a webpage
1290
+ */
1291
+ async extractArticleLinks(url, config = {}) {
1292
+ console.log(`\u{1F4F0} [HTMLScraper] Starting to extract articles from ${url}`);
1293
+ try {
1294
+ const robotsCheck = await globalRobotsChecker.isAllowed(url);
1295
+ if (!robotsCheck.allowed) {
1296
+ console.warn(`\u{1F916} [HTMLScraper] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
1297
+ if (config.perplexityFallback?.enabled && config.perplexityFallback?.useForRobotsBlocked) {
1298
+ console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback for robots-blocked URL`);
1299
+ return await this.extractWithPerplexity(url, config);
1300
+ }
1301
+ return [];
1302
+ }
1303
+ const html = await this.fetchPage(url);
1304
+ if (!html) {
1305
+ if (config.perplexityFallback?.enabled && config.perplexityFallback?.useForParseFailed) {
1306
+ console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback for failed fetch`);
1307
+ return await this.extractWithPerplexity(url, config);
1308
+ }
1309
+ return [];
1310
+ }
1311
+ const mergedConfig = this.mergeConfig(this.defaultConfig, config);
1312
+ const articles = this.parseArticleLinks(html, url, mergedConfig);
1313
+ if (articles.length === 0 && config.perplexityFallback?.enabled && config.perplexityFallback?.useForParseFailed) {
1314
+ console.log(`\u{1F504} [HTMLScraper] No articles found, attempting Perplexity fallback`);
1315
+ return await this.extractWithPerplexity(url, config);
1316
+ }
1317
+ console.log(`\u{1F4F0} [HTMLScraper] Extracted ${articles.length} article links from ${url}`);
1318
+ return articles;
1319
+ } catch (error) {
1320
+ console.error(`\u274C [HTMLScraper] Error extracting articles from ${url}:`, error);
1321
+ if (config.perplexityFallback?.enabled) {
1322
+ console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback after error`);
1323
+ return await this.extractWithPerplexity(url, config);
1324
+ }
1325
+ return [];
1326
+ }
1327
+ }
1328
+ /**
1329
+ * Extract articles from multiple pages with pagination support
1330
+ */
1331
+ async extractFromMultiplePages(startUrl, config = {}, options = {}) {
1332
+ const maxPages = options.maxPages || 5;
1333
+ const allArticles = [];
1334
+ const visitedUrls = /* @__PURE__ */ new Set();
1335
+ const urlsToVisit = [startUrl];
1336
+ let pageCount = 0;
1337
+ while (urlsToVisit.length > 0 && pageCount < maxPages) {
1338
+ const currentUrl = urlsToVisit.shift();
1339
+ if (visitedUrls.has(currentUrl)) {
1340
+ continue;
1341
+ }
1342
+ visitedUrls.add(currentUrl);
1343
+ pageCount++;
1344
+ console.log(`\u{1F4F0} [HTMLScraper] Processing page ${pageCount}/${maxPages}: ${currentUrl}`);
1345
+ try {
1346
+ const articles = await this.extractArticleLinks(currentUrl, config);
1347
+ allArticles.push(...articles);
1348
+ if (pageCount < maxPages) {
1349
+ const nextPageUrls = await this.findNextPageUrls(currentUrl, options);
1350
+ for (const nextUrl of nextPageUrls) {
1351
+ if (!visitedUrls.has(nextUrl)) {
1352
+ urlsToVisit.push(nextUrl);
1353
+ }
1354
+ }
1355
+ }
1356
+ } catch (error) {
1357
+ console.warn(`\u26A0\uFE0F [HTMLScraper] Error processing page ${currentUrl}:`, error);
1358
+ continue;
1359
+ }
1360
+ }
1361
+ const uniqueArticles = this.deduplicateArticles(allArticles);
1362
+ uniqueArticles.sort((a, b) => b.confidence - a.confidence);
1363
+ console.log(`\u{1F4F0} [HTMLScraper] Total extracted ${uniqueArticles.length} unique articles from ${pageCount} pages`);
1364
+ return uniqueArticles;
1365
+ }
1366
+ async fetchPage(url) {
1367
+ try {
1368
+ return await globalRateLimiter.execute(url, async () => {
1369
+ const controller = new AbortController();
1370
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
1371
+ try {
1372
+ const response = await fetch(url, {
1373
+ headers: {
1374
+ "User-Agent": this.userAgent,
1375
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
1376
+ },
1377
+ signal: controller.signal
1378
+ });
1379
+ clearTimeout(timeoutId);
1380
+ if (!response.ok) {
1381
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
1382
+ }
1383
+ const contentType = response.headers.get("content-type") || "";
1384
+ if (!contentType.includes("text/html")) {
1385
+ throw new Error(`Not HTML content: ${contentType}`);
1386
+ }
1387
+ return await response.text();
1388
+ } catch (error) {
1389
+ clearTimeout(timeoutId);
1390
+ throw error;
1391
+ }
1392
+ });
1393
+ } catch (error) {
1394
+ console.error(`\u274C [HTMLScraper] Error fetching page ${url}:`, error);
1395
+ return null;
1396
+ }
1397
+ }
1398
+ parseArticleLinks(html, baseUrl, config) {
1399
+ const articles = [];
1400
+ try {
1401
+ const $ = cheerio3.load(html);
1402
+ const seenUrls = /* @__PURE__ */ new Set();
1403
+ config.selectors?.excludeSelectors?.forEach((selector) => {
1404
+ $(selector).remove();
1405
+ });
1406
+ config.selectors?.articleLinks?.forEach((selector) => {
1407
+ $(selector).each((_, element) => {
1408
+ const $link = $(element);
1409
+ const href = $link.attr("href");
1410
+ if (!href) return;
1411
+ const absoluteUrl = this.resolveUrl(href, baseUrl);
1412
+ if (!absoluteUrl || seenUrls.has(absoluteUrl)) {
1413
+ return;
1414
+ }
1415
+ if (!this.passesFilters(absoluteUrl, config.filters)) {
1416
+ return;
1417
+ }
1418
+ seenUrls.add(absoluteUrl);
1419
+ const article = this.extractArticleInfo($link, $, absoluteUrl);
1420
+ if (article && articles.length < (config.limits?.maxLinksPerPage || 100)) {
1421
+ articles.push(article);
1422
+ }
1423
+ });
1424
+ });
1425
+ const structuredArticles = this.extractStructuredData($, baseUrl);
1426
+ structuredArticles.forEach((article) => {
1427
+ if (!seenUrls.has(article.url)) {
1428
+ seenUrls.add(article.url);
1429
+ articles.push(article);
1430
+ }
1431
+ });
1432
+ } catch (error) {
1433
+ console.error(`\u274C [HTMLScraper] Error parsing HTML:`, error);
1434
+ }
1435
+ return articles;
1436
+ }
1437
/**
 * Build an article record from a matched <a> element.
 * Confidence starts at 0.5 and is bumped by heuristics (better title
 * found in the enclosing article element, parseable date, description,
 * article-like URL, well-sized title), capped at 1.0.
 * Returns null when no usable title (>= 10 chars) can be found.
 */
extractArticleInfo($link, _$, url) {
  let title = $link.text().trim();
  let confidence = 0.5;
  let publishedDate;
  let description;
  // Link text too short to be a headline — look for a heading inside
  // the nearest article-like container instead.
  if (!title || title.length < 5) {
    const $parent2 = $link.closest("article, .article, .post, .story, .news-item");
    if ($parent2.length > 0) {
      const betterTitle = $parent2.find("h1, h2, h3, .headline, .title").first().text().trim();
      if (betterTitle && betterTitle.length > title.length) {
        title = betterTitle;
        confidence += 0.2;
      }
    }
  }
  // Publication date from a nearby time/date element (datetime attr
  // preferred over visible text).
  const $dateElement = $link.closest("article, .article, .post").find("time[datetime], .date, .published").first();
  if ($dateElement.length > 0) {
    const dateText = $dateElement.attr("datetime") || $dateElement.text().trim();
    if (dateText) {
      const date = this.parseDate(dateText);
      if (date) {
        publishedDate = date;
        confidence += 0.1;
      }
    }
  }
  // Short teaser text from the enclosing container, truncated to 300 chars.
  const $parent = $link.closest("article, .article, .post, .story");
  if ($parent.length > 0) {
    description = $parent.find(".excerpt, .summary, p").first().text().trim();
    if (description && description.length > 50) {
      description = description.substring(0, 300) + "...";
      confidence += 0.1;
    }
  }
  if (this.isLikelyArticleUrl(url)) {
    confidence += 0.2;
  }
  // Headline-sized titles are a good signal.
  if (title && title.length >= 20 && title.length <= 120) {
    confidence += 0.1;
  }
  if (!title || title.length < 10) {
    return null;
  }
  return {
    url,
    title,
    publishedDate,
    description,
    confidence: Math.min(confidence, 1),
    source: "link-text"
  };
}
1489
+ extractStructuredData($, baseUrl) {
1490
+ const articles = [];
1491
+ $('script[type="application/ld+json"]').each((_, element) => {
1492
+ try {
1493
+ const jsonText = $(element).html();
1494
+ if (!jsonText) return;
1495
+ const data = JSON.parse(jsonText);
1496
+ const items = Array.isArray(data) ? data : [data];
1497
+ for (const item of items) {
1498
+ if (item["@type"] === "Article" || item["@type"] === "NewsArticle") {
1499
+ const url = item.url || item.mainEntityOfPage?.["@id"];
1500
+ if (url) {
1501
+ const absoluteUrl = this.resolveUrl(url, baseUrl);
1502
+ if (absoluteUrl) {
1503
+ articles.push({
1504
+ url: absoluteUrl,
1505
+ title: item.headline || item.name,
1506
+ publishedDate: item.datePublished ? new Date(item.datePublished) : void 0,
1507
+ description: item.description,
1508
+ confidence: 0.9,
1509
+ source: "structured-data"
1510
+ });
1511
+ }
1512
+ }
1513
+ }
1514
+ }
1515
+ } catch (error) {
1516
+ }
1517
+ });
1518
+ return articles;
1519
+ }
1520
+ async findNextPageUrls(currentUrl, options) {
1521
+ try {
1522
+ const html = await this.fetchPage(currentUrl);
1523
+ if (!html) return [];
1524
+ const $ = cheerio3.load(html);
1525
+ const nextUrls = [];
1526
+ const paginationSelector = options.paginationSelector || 'a[rel="next"], .pagination a, .next a, .pager a, [class*="next"] a';
1527
+ $(paginationSelector).each((_, element) => {
1528
+ const $link = $(element);
1529
+ const href = $link.attr("href");
1530
+ const text = $link.text().toLowerCase().trim();
1531
+ if (href && (text.includes("next") || text.includes("\u2192") || text === ">")) {
1532
+ const absoluteUrl = this.resolveUrl(href, currentUrl);
1533
+ if (absoluteUrl) {
1534
+ nextUrls.push(absoluteUrl);
1535
+ }
1536
+ }
1537
+ });
1538
+ return Array.from(new Set(nextUrls));
1539
+ } catch (error) {
1540
+ console.warn(`\u26A0\uFE0F [HTMLScraper] Error finding next page URLs:`, error);
1541
+ return [];
1542
+ }
1543
+ }
1544
+ deduplicateArticles(articles) {
1545
+ const seen = /* @__PURE__ */ new Map();
1546
+ for (const article of articles) {
1547
+ const existing = seen.get(article.url);
1548
+ if (!existing || article.confidence > existing.confidence) {
1549
+ seen.set(article.url, article);
1550
+ }
1551
+ }
1552
+ return Array.from(seen.values());
1553
+ }
1554
+ passesFilters(url, filters) {
1555
+ if (!filters) return true;
1556
+ if (filters.excludePatterns?.some((pattern) => pattern.test(url))) {
1557
+ return false;
1558
+ }
1559
+ if (filters.includePatterns?.length && !filters.includePatterns.some((pattern) => pattern.test(url))) {
1560
+ return false;
1561
+ }
1562
+ if (filters.allowedDomains?.length) {
1563
+ try {
1564
+ const urlObj = new URL(url);
1565
+ const domain = urlObj.hostname.toLowerCase();
1566
+ if (!filters.allowedDomains.some(
1567
+ (allowed) => domain === allowed.toLowerCase() || domain.endsWith("." + allowed.toLowerCase())
1568
+ )) {
1569
+ return false;
1570
+ }
1571
+ } catch {
1572
+ return false;
1573
+ }
1574
+ }
1575
+ return true;
1576
+ }
1577
+ isLikelyArticleUrl(url) {
1578
+ const urlLower = url.toLowerCase();
1579
+ const articlePatterns = [
1580
+ /\/article[s]?\//,
1581
+ /\/post[s]?\//,
1582
+ /\/story\//,
1583
+ /\/stories\//,
1584
+ /\/news\//,
1585
+ /\/blog\//,
1586
+ /\/\d{4}\/\d{2}\/\d{2}\//,
1587
+ // Date-based URLs
1588
+ /\/\d{4}\/\d{2}\//
1589
+ ];
1590
+ return articlePatterns.some((pattern) => pattern.test(urlLower));
1591
+ }
1592
+ parseDate(dateString) {
1593
+ try {
1594
+ const date = new Date(dateString);
1595
+ if (isNaN(date.getTime())) {
1596
+ const formats = [
1597
+ /(\d{4})-(\d{2})-(\d{2})/,
1598
+ // YYYY-MM-DD
1599
+ /(\d{2})\/(\d{2})\/(\d{4})/,
1600
+ // MM/DD/YYYY
1601
+ /(\d{2})\.(\d{2})\.(\d{4})/
1602
+ // DD.MM.YYYY
1603
+ ];
1604
+ for (const format of formats) {
1605
+ const match = dateString.match(format);
1606
+ if (match) {
1607
+ const [, p1, p2, p3] = match;
1608
+ const testDate = /* @__PURE__ */ new Date(`${p1}-${p2}-${p3}`);
1609
+ if (!isNaN(testDate.getTime())) {
1610
+ return testDate;
1611
+ }
1612
+ }
1613
+ }
1614
+ return null;
1615
+ }
1616
+ return date;
1617
+ } catch {
1618
+ return null;
1619
+ }
1620
+ }
1621
+ resolveUrl(url, baseUrl) {
1622
+ try {
1623
+ return new URL(url, baseUrl).toString();
1624
+ } catch {
1625
+ return null;
1626
+ }
1627
+ }
1628
+ mergeConfig(defaultConfig, userConfig) {
1629
+ return {
1630
+ selectors: {
1631
+ ...defaultConfig.selectors,
1632
+ ...userConfig.selectors,
1633
+ articleLinks: [
1634
+ ...defaultConfig.selectors?.articleLinks || [],
1635
+ ...userConfig.selectors?.articleLinks || []
1636
+ ]
1637
+ },
1638
+ filters: {
1639
+ ...defaultConfig.filters,
1640
+ ...userConfig.filters,
1641
+ includePatterns: [
1642
+ ...defaultConfig.filters?.includePatterns || [],
1643
+ ...userConfig.filters?.includePatterns || []
1644
+ ],
1645
+ excludePatterns: [
1646
+ ...defaultConfig.filters?.excludePatterns || [],
1647
+ ...userConfig.filters?.excludePatterns || []
1648
+ ]
1649
+ },
1650
+ limits: {
1651
+ ...defaultConfig.limits,
1652
+ ...userConfig.limits
1653
+ },
1654
+ perplexityFallback: {
1655
+ ...defaultConfig.perplexityFallback,
1656
+ ...userConfig.perplexityFallback
1657
+ }
1658
+ };
1659
+ }
1660
/**
 * Use Perplexity API to extract articles when traditional scraping fails
 * Requires PERPLEXITY_API_KEY environment variable to be set
 * Returns [] when the key is missing or the API call fails.
 */
async extractWithPerplexity(url, config) {
  try {
    if (!process.env.PERPLEXITY_API_KEY) {
      console.warn(`\u26A0\uFE0F [HTMLScraper] Perplexity API key not configured - set PERPLEXITY_API_KEY env variable`);
      return [];
    }
    const domain = new URL(url).hostname;
    const query = `Find recent news articles and stories from ${domain}. List article titles and URLs.`;
    console.log(`\u{1F50D} [HTMLScraper] Using Perplexity to find articles from ${domain}`);
    const response = await fetch("https://api.perplexity.ai/chat/completions", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${process.env.PERPLEXITY_API_KEY}`
      },
      body: JSON.stringify({
        model: config.perplexityFallback?.model || PERPLEXITY_MODELS.SONAR,
        messages: [{ role: "user", content: query }],
        max_tokens: 1e3,
        return_citations: true,
        search_recency_filter: config.perplexityFallback?.searchRecency || "day"
      })
    });
    if (!response.ok) {
      throw new Error(`Perplexity API error: ${response.status} ${response.statusText}`);
    }
    const data = await response.json();
    const articles = [];
    // NOTE(review): assumes `data.citations` is a top-level array of URL
    // strings — confirm against the current Perplexity response schema.
    if (data.citations && Array.isArray(data.citations)) {
      for (const citation of data.citations) {
        try {
          const citationUrl = citation;
          const citationDomain = new URL(citationUrl).hostname;
          // Keep only citations from the target domain (or sharing its
          // first label, e.g. "example" in "example.com").
          if (citationDomain === domain || citationDomain.includes(domain.split(".")[0])) {
            articles.push({
              url: citationUrl,
              // Best-effort title: the last URL path segment (slug).
              title: citationUrl.split("/").pop() || domain,
              confidence: 0.7,
              source: "meta-data"
            });
          }
        } catch {
          continue;
        }
      }
    }
    const maxLinks = config.limits?.maxLinksPerPage || 100;
    const limitedArticles = articles.slice(0, maxLinks);
    console.log(`\u2728 [HTMLScraper] Perplexity found ${limitedArticles.length} articles`);
    return limitedArticles;
  } catch (error) {
    console.error(`\u274C [HTMLScraper] Perplexity fallback failed:`, error);
    return [];
  }
}
1719
+ };
1720
+ var globalHTMLScraper = new HTMLScraper();
1721
+
1722
+ // src/extractors/content-extractor.ts
1723
+ var import_readability = require("@mozilla/readability");
1724
+ var import_jsdom = require("jsdom");
1725
+ var cheerio4 = __toESM(require("cheerio"));
1726
+ var ContentExtractor = class {
1727
+ constructor() {
1728
+ this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
1729
+ this.timeout = 15e3;
1730
+ // 15 seconds
1731
+ this.maxContentSize = 10 * 1024 * 1024;
1732
+ // 10MB max
1733
+ this.minContentLength = 200;
1734
+ // Minimum 200 characters
1735
+ this.wordsPerMinute = 200;
1736
+ this.ssrfProtection = {
1737
+ isPrivateIP: (url) => {
1738
+ try {
1739
+ const urlObj = new URL(url);
1740
+ const hostname = urlObj.hostname;
1741
+ const privateRanges = [
1742
+ /^127\./,
1743
+ // 127.0.0.0/8 (loopback)
1744
+ /^10\./,
1745
+ // 10.0.0.0/8 (private)
1746
+ /^172\.(1[6-9]|2[0-9]|3[01])\./,
1747
+ // 172.16.0.0/12 (private)
1748
+ /^192\.168\./,
1749
+ // 192.168.0.0/16 (private)
1750
+ /^169\.254\./,
1751
+ // 169.254.0.0/16 (link-local)
1752
+ /^::1$/,
1753
+ // IPv6 loopback
1754
+ /^fe80:/,
1755
+ // IPv6 link-local
1756
+ /^fc00:/,
1757
+ // IPv6 unique local
1758
+ /^fd00:/
1759
+ // IPv6 unique local
1760
+ ];
1761
+ return privateRanges.some((range) => range.test(hostname));
1762
+ } catch {
1763
+ return true;
1764
+ }
1765
+ },
1766
+ isLocalhost: (url) => {
1767
+ try {
1768
+ const urlObj = new URL(url);
1769
+ const hostname = urlObj.hostname.toLowerCase();
1770
+ return hostname === "localhost" || hostname === "127.0.0.1" || hostname === "::1";
1771
+ } catch {
1772
+ return true;
1773
+ }
1774
+ },
1775
+ isAllowedProtocol: (url) => {
1776
+ try {
1777
+ const urlObj = new URL(url);
1778
+ return urlObj.protocol === "http:" || urlObj.protocol === "https:";
1779
+ } catch {
1780
+ return false;
1781
+ }
1782
+ }
1783
+ };
1784
+ }
1785
/**
 * Extract readable article content from a URL.
 * Guard order matters: SSRF checks (protocol, private/local host) run
 * before any network I/O, then robots.txt, then the fetch itself.
 * Returns the extracted-content object, or null on any block/failure;
 * all errors are caught internally.
 */
async extractContent(url) {
  console.log(`\u{1F4D6} [ContentExtractor] Starting content extraction from ${url}`);
  try {
    // Reject non-http(s) schemes outright.
    if (!this.ssrfProtection.isAllowedProtocol(url)) {
      throw new Error(`Disallowed protocol: ${url}`);
    }
    // Refuse private-range and loopback targets (SSRF protection).
    if (this.ssrfProtection.isPrivateIP(url) || this.ssrfProtection.isLocalhost(url)) {
      throw new Error(`Private/local IP not allowed: ${url}`);
    }
    const robotsCheck = await globalRobotsChecker.isAllowed(url);
    if (!robotsCheck.allowed) {
      console.warn(`\u{1F916} [ContentExtractor] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
      return null;
    }
    const html = await this.fetchContent(url);
    if (!html) {
      return null;
    }
    const extracted = await this.extractFromHTML(html, url);
    if (!extracted) {
      console.warn(`\u26A0\uFE0F [ContentExtractor] No content extracted from ${url}`);
      return null;
    }
    // Discard pages whose extracted text is under minContentLength chars.
    if (extracted.textContent.length < this.minContentLength) {
      console.warn(`\u26A0\uFE0F [ContentExtractor] Content too short (${extracted.textContent.length} chars): ${url}`);
      return null;
    }
    console.log(`\u2705 [ContentExtractor] Successfully extracted ${extracted.wordCount} words from ${url}`);
    return extracted;
  } catch (error) {
    console.error(`\u274C [ContentExtractor] Error extracting content from ${url}:`, error);
    return null;
  }
}
1822
+ /**
1823
+ * Extract content from multiple URLs
1824
+ */
1825
+ async extractBatch(urls) {
1826
+ console.log(`\u{1F4D6} [ContentExtractor] Starting batch extraction of ${urls.length} URLs`);
1827
+ const results = urls.map(
1828
+ (url) => this.extractContent(url).catch((error) => {
1829
+ console.error(`\u274C [ContentExtractor] Error in batch extraction for ${url}:`, error);
1830
+ return null;
1831
+ })
1832
+ );
1833
+ const extracted = await Promise.all(results);
1834
+ const successful = extracted.filter(Boolean).length;
1835
+ console.log(`\u{1F4D6} [ContentExtractor] Batch complete: ${successful}/${urls.length} successful`);
1836
+ return extracted;
1837
+ }
1838
+ async fetchContent(url) {
1839
+ try {
1840
+ return await globalRateLimiter.execute(url, async () => {
1841
+ const controller = new AbortController();
1842
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
1843
+ try {
1844
+ const response = await fetch(url, {
1845
+ headers: {
1846
+ "User-Agent": this.userAgent,
1847
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
1848
+ "Accept-Language": "en-US,en;q=0.9"
1849
+ },
1850
+ signal: controller.signal
1851
+ });
1852
+ clearTimeout(timeoutId);
1853
+ if (!response.ok) {
1854
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
1855
+ }
1856
+ const contentLength = response.headers.get("content-length");
1857
+ if (contentLength && parseInt(contentLength) > this.maxContentSize) {
1858
+ throw new Error(`Content too large: ${contentLength} bytes`);
1859
+ }
1860
+ const html = await response.text();
1861
+ if (html.length > this.maxContentSize) {
1862
+ throw new Error(`Content too large: ${html.length} bytes`);
1863
+ }
1864
+ return html;
1865
+ } catch (error) {
1866
+ clearTimeout(timeoutId);
1867
+ throw error;
1868
+ }
1869
+ });
1870
+ } catch (error) {
1871
+ console.error(`\u274C [ContentExtractor] Error fetching content from ${url}:`, error);
1872
+ return null;
1873
+ }
1874
+ }
1875
+ async extractFromHTML(html, url) {
1876
+ const errors = [];
1877
+ try {
1878
+ const readabilityResult = this.extractWithReadability(html, url);
1879
+ if (readabilityResult && readabilityResult.textContent.length >= this.minContentLength) {
1880
+ return {
1881
+ ...readabilityResult,
1882
+ extractionMethod: "readability",
1883
+ confidence: 0.9
1884
+ };
1885
+ } else {
1886
+ errors.push("Readability extraction failed or content too short");
1887
+ }
1888
+ } catch (error) {
1889
+ errors.push(`Readability error: ${error instanceof Error ? error.message : "Unknown error"}`);
1890
+ }
1891
+ try {
1892
+ const fallbackResult = this.extractWithFallback(html, url);
1893
+ if (fallbackResult && fallbackResult.textContent.length >= this.minContentLength) {
1894
+ return {
1895
+ ...fallbackResult,
1896
+ extractionMethod: "fallback",
1897
+ confidence: 0.6,
1898
+ errors
1899
+ };
1900
+ } else {
1901
+ errors.push("Fallback extraction failed or content too short");
1902
+ }
1903
+ } catch (error) {
1904
+ errors.push(`Fallback error: ${error instanceof Error ? error.message : "Unknown error"}`);
1905
+ }
1906
+ console.error(`\u274C [ContentExtractor] All extraction methods failed for ${url}:`, errors);
1907
+ return null;
1908
+ }
1909
  /**
   * Primary extraction path: run Mozilla Readability over a JSDOM parse of
   * the page and augment the result with metadata pulled from the raw HTML.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} url - Page URL, passed to JSDOM so relative URLs resolve.
   * @returns {object|null} Extraction result with confidence 0.9, or null if
   *   Readability finds no article (errors are logged, not thrown).
   */
  extractWithReadability(html, url) {
    try {
      const dom = new import_jsdom.JSDOM(html, { url });
      const document = dom.window.document;
      const reader = new import_readability.Readability(document);
      const article = reader.parse();
      if (!article) {
        return null;
      }
      // Structured data is mined from the raw HTML, not the Readability
      // output, because Readability strips <script> and <meta> tags.
      const structured = this.extractStructuredData(html, url);
      const wordCount = this.countWords(article.textContent);
      const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
      return {
        url,
        title: article.title || "",
        content: article.content || "",
        textContent: article.textContent || "",
        excerpt: article.excerpt || void 0,
        byline: article.byline || void 0,
        publishedTime: this.extractPublishedTime(html),
        // Prefer Readability's site name, fall back to meta tags.
        siteName: article.siteName || this.extractSiteName(html),
        lang: this.extractLanguage(html),
        structured,
        wordCount,
        readingTime,
        confidence: 0.9,
        extractionMethod: "readability",
        extractedAt: /* @__PURE__ */ new Date()
      };
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Readability extraction failed:`, error);
      return null;
    }
  }
1943
  /**
   * Fallback extraction path: strip boilerplate with cheerio and take the
   * first sufficiently large content region from a list of common
   * article-container selectors, finally falling back to <body>.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} url - Page URL (recorded on the result).
   * @returns {object|null} Extraction result with confidence 0.6, or null
   *   when no region reaches `this.minContentLength`.
   */
  extractWithFallback(html, url) {
    try {
      const $ = cheerio4.load(html);
      // Noise/chrome elements removed before measuring content regions.
      const unwantedSelectors = [
        "script",
        "style",
        "nav",
        "header",
        "footer",
        ".advertisement",
        ".ads",
        ".social-share",
        ".comments",
        ".sidebar",
        ".navigation",
        ".menu",
        ".popup",
        ".modal"
      ];
      unwantedSelectors.forEach((selector) => $(selector).remove());
      let content = "";
      let title = "";
      // Title preference: first <h1>, then <title>, then og:title.
      title = $("h1").first().text().trim() || $("title").text().trim() || $('meta[property="og:title"]').attr("content") || "";
      const contentSelectors = [
        "article",
        ".article-content",
        ".post-content",
        ".entry-content",
        ".content",
        "main",
        "#content",
        ".story-body"
      ];
      // First selector whose region is big enough wins.
      // NOTE(review): these thresholds compare HTML length (markup included)
      // against minContentLength, which elsewhere gates plain-text length —
      // tag-heavy regions can pass on markup weight alone; confirm intended.
      for (const selector of contentSelectors) {
        const element = $(selector).first();
        if (element.length > 0) {
          content = element.html() || "";
          if (content.length > this.minContentLength) {
            break;
          }
        }
      }
      if (content.length < this.minContentLength) {
        content = $("body").html() || "";
      }
      if (!content || content.length < this.minContentLength) {
        return null;
      }
      // NOTE(review): $(content) re-parses the selected HTML fragment as a
      // new cheerio node set purely to read its text — verify fragments that
      // start with text (no leading tag) are handled as intended.
      const textContent = $(content).text().trim();
      const wordCount = this.countWords(textContent);
      const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
      const structured = this.extractStructuredData(html, url);
      return {
        url,
        title,
        content,
        textContent,
        // Excerpt is always the first 300 chars plus an ellipsis.
        excerpt: textContent.substring(0, 300) + "...",
        publishedTime: this.extractPublishedTime(html),
        siteName: this.extractSiteName(html),
        lang: this.extractLanguage(html),
        structured,
        wordCount,
        readingTime,
        confidence: 0.6,
        extractionMethod: "fallback",
        extractedAt: /* @__PURE__ */ new Date()
      };
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Fallback extraction failed:`, error);
      return null;
    }
  }
2016
  /**
   * Collect machine-readable metadata from a page: JSON-LD blocks, Open
   * Graph and Twitter Card meta tags, and schema.org microdata.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} _url - Unused; kept for signature stability.
   * @returns {object|undefined} Object with any of `jsonLd`, `openGraph`,
   *   `twitterCard`, `microdata` keys, or undefined when nothing was found.
   */
  extractStructuredData(html, _url) {
    const structured = {};
    try {
      const $ = cheerio4.load(html);
      // 1. JSON-LD: every <script type="application/ld+json"> that parses.
      const jsonLdScripts = [];
      $('script[type="application/ld+json"]').each((_, element) => {
        try {
          const jsonText = $(element).html();
          if (jsonText) {
            const data = JSON.parse(jsonText);
            jsonLdScripts.push(data);
          }
        } catch {
          // Malformed JSON-LD blocks are skipped silently by design.
        }
      });
      if (jsonLdScripts.length > 0) {
        structured.jsonLd = jsonLdScripts;
      }
      // 2. Open Graph: all og:* meta properties, keyed by full property name.
      const openGraph = {};
      $('meta[property^="og:"]').each((_, element) => {
        const property = $(element).attr("property");
        const content = $(element).attr("content");
        if (property && content) {
          openGraph[property] = content;
        }
      });
      if (Object.keys(openGraph).length > 0) {
        structured.openGraph = openGraph;
      }
      // 3. Twitter Card: all twitter:* meta names.
      const twitterCard = {};
      $('meta[name^="twitter:"]').each((_, element) => {
        const name = $(element).attr("name");
        const content = $(element).attr("content");
        if (name && content) {
          twitterCard[name] = content;
        }
      });
      if (Object.keys(twitterCard).length > 0) {
        structured.twitterCard = twitterCard;
      }
      // 4. Microdata: each itemscope becomes one flat object of its itemprops.
      const microdata = [];
      $("[itemscope]").each((_, element) => {
        const $item = $(element);
        const itemType = $item.attr("itemtype");
        if (itemType) {
          const item = { "@type": itemType };
          $item.find("[itemprop]").each((_2, propElement) => {
            const $prop = $(propElement);
            const propName = $prop.attr("itemprop");
            // content attribute wins over element text.
            const propValue = $prop.attr("content") || $prop.text().trim();
            if (propName && propValue) {
              item[propName] = propValue;
            }
          });
          microdata.push(item);
        }
      });
      if (microdata.length > 0) {
        structured.microdata = microdata;
      }
    } catch (error) {
      console.warn(`\u26A0\uFE0F [ContentExtractor] Error extracting structured data:`, error);
    }
    return Object.keys(structured).length > 0 ? structured : void 0;
  }
2081
+ extractPublishedTime(html) {
2082
+ try {
2083
+ const $ = cheerio4.load(html);
2084
+ const timeSelectors = [
2085
+ 'meta[property="article:published_time"]',
2086
+ 'meta[name="datePublished"]',
2087
+ 'meta[name="publishdate"]',
2088
+ "time[datetime]",
2089
+ ".published-date",
2090
+ ".publish-date",
2091
+ ".article-date"
2092
+ ];
2093
+ for (const selector of timeSelectors) {
2094
+ const element = $(selector).first();
2095
+ if (element.length > 0) {
2096
+ const timeStr = element.attr("content") || element.attr("datetime") || element.text().trim();
2097
+ if (timeStr) {
2098
+ const date = new Date(timeStr);
2099
+ if (!isNaN(date.getTime())) {
2100
+ return date;
2101
+ }
2102
+ }
2103
+ }
2104
+ }
2105
+ return void 0;
2106
+ } catch {
2107
+ return void 0;
2108
+ }
2109
+ }
2110
+ extractSiteName(html) {
2111
+ try {
2112
+ const $ = cheerio4.load(html);
2113
+ return $('meta[property="og:site_name"]').attr("content") || $('meta[name="application-name"]').attr("content") || void 0;
2114
+ } catch {
2115
+ return void 0;
2116
+ }
2117
+ }
2118
+ extractLanguage(html) {
2119
+ try {
2120
+ const $ = cheerio4.load(html);
2121
+ return $("html").attr("lang") || $('meta[name="language"]').attr("content") || $('meta[http-equiv="content-language"]').attr("content") || void 0;
2122
+ } catch {
2123
+ return void 0;
2124
+ }
2125
+ }
2126
+ countWords(text) {
2127
+ if (!text) return 0;
2128
+ return text.trim().split(/\s+/).filter((word) => word.length > 0).length;
2129
+ }
2130
+ /**
2131
+ * Validate extracted content quality
2132
+ */
2133
+ validateContent(content) {
2134
+ const issues = [];
2135
+ let score = 1;
2136
+ if (content.textContent.length < this.minContentLength) {
2137
+ issues.push(`Content too short: ${content.textContent.length} characters`);
2138
+ score -= 0.5;
2139
+ }
2140
+ if (!content.title || content.title.length < 10) {
2141
+ issues.push("Missing or too short title");
2142
+ score -= 0.2;
2143
+ } else if (content.title.length > 200) {
2144
+ issues.push("Title too long");
2145
+ score -= 0.1;
2146
+ }
2147
+ const htmlLength = content.content.length;
2148
+ const textLength = content.textContent.length;
2149
+ const ratio = textLength / htmlLength;
2150
+ if (ratio < 0.1) {
2151
+ issues.push("Low text-to-HTML ratio - may be poorly extracted");
2152
+ score -= 0.2;
2153
+ }
2154
+ const sentences = content.textContent.split(".").filter((s) => s.trim().length > 10);
2155
+ const uniqueSentences = new Set(sentences);
2156
+ const duplicateRatio = (sentences.length - uniqueSentences.size) / sentences.length;
2157
+ if (duplicateRatio > 0.3) {
2158
+ issues.push("High duplicate content detected");
2159
+ score -= 0.3;
2160
+ }
2161
+ return {
2162
+ isValid: issues.length === 0 && score >= 0.5,
2163
+ issues,
2164
+ score: Math.max(0, score)
2165
+ };
2166
+ }
2167
+ };
2168
+ var globalContentExtractor = new ContentExtractor();
2169
+
2170
+ // src/utils/circuit-breaker.ts
2171
/**
 * Minimal circuit breaker with CLOSED / OPEN / HALF_OPEN states.
 *
 * After `failureThreshold` consecutive failures the circuit opens and
 * rejects calls immediately; once `resetTimeout` ms have passed since the
 * last failure, the next call runs in HALF_OPEN mode and either re-closes
 * the circuit (success) or re-opens it (failure). Each operation is also
 * bounded by `timeout` ms.
 */
var CircuitBreaker = class {
  /**
   * @param {{name: string, failureThreshold: number, timeout: number, resetTimeout: number}} options
   */
  constructor(options) {
    this.failures = 0;
    this.lastFailureTime = 0;
    this.state = "CLOSED";
    this.options = options;
  }
  /**
   * Run an async operation through the breaker.
   * @param {() => Promise<*>} operation
   * @returns {Promise<*>} The operation's result.
   * @throws When the circuit is open, the operation times out, or it fails.
   */
  async execute(operation) {
    if (this.state === "OPEN") {
      const stillCoolingDown = Date.now() - this.lastFailureTime < this.options.resetTimeout;
      if (stillCoolingDown) {
        throw new Error(`[CircuitBreaker:${this.options.name}] Circuit is OPEN - preventing request`);
      }
      // Cooldown elapsed: allow a single probe request.
      this.state = "HALF_OPEN";
      console.log(`\u{1F504} [CircuitBreaker:${this.options.name}] Circuit moving to HALF_OPEN state`);
    }
    try {
      const result = await this.executeWithTimeout(operation);
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }
  /**
   * Race the operation against a timeout rejection.
   * @param {() => Promise<*>} operation
   */
  async executeWithTimeout(operation) {
    let timer;
    const timeoutGate = new Promise((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`[CircuitBreaker:${this.options.name}] Operation timeout after ${this.options.timeout}ms`));
      }, this.options.timeout);
    });
    try {
      return await Promise.race([operation(), timeoutGate]);
    } finally {
      clearTimeout(timer);
    }
  }
  // A success fully resets the breaker, whatever state it was in.
  onSuccess() {
    this.failures = 0;
    this.state = "CLOSED";
  }
  // A failure bumps the counter and opens the circuit at the threshold.
  onFailure() {
    this.failures++;
    this.lastFailureTime = Date.now();
    if (this.failures >= this.options.failureThreshold) {
      this.state = "OPEN";
      console.error(`\u274C [CircuitBreaker:${this.options.name}] Circuit opened after ${this.failures} failures`);
    }
  }
  /**
   * Snapshot of breaker state for monitoring.
   * @returns {{state: string, failures: number, lastFailureTime: number}}
   */
  getState() {
    return {
      state: this.state,
      failures: this.failures,
      lastFailureTime: this.lastFailureTime
    };
  }
};
2230
// Shared, pre-configured breaker instances. Thresholds differ per workload:
// RSS fetches trip fast, page scraping tolerates more failures, and the test
// endpoints get longer timeouts and a longer cooldown.
var circuitBreakers = {
  rss: new CircuitBreaker({
    name: "RSS",
    failureThreshold: 3,
    timeout: 15e3,
    // 15 seconds
    resetTimeout: 3e4
    // 30 seconds
  }),
  scraping: new CircuitBreaker({
    name: "Scraping",
    failureThreshold: 5,
    timeout: 1e4,
    // 10 seconds
    resetTimeout: 3e4
    // 30 seconds
  }),
  scrapingTest: new CircuitBreaker({
    name: "ScrapingTest",
    failureThreshold: 3,
    timeout: 3e4,
    // 30 seconds for test endpoints
    resetTimeout: 6e4
    // 1 minute
  })
};
2256
+
2257
+ // src/orchestrator/source-orchestrator.ts
2258
// Module-level singletons shared by every SourceOrchestrator instance.
// (The "2" suffix is a bundler rename to avoid collisions with the
// same-named globals defined in other source modules of this bundle.)
var globalHTMLScraper2 = new HTMLScraper();
var globalContentExtractor2 = new ContentExtractor();
var globalRobotsChecker2 = new RobotsChecker();
2261
// Zod schema for a candidate article produced by any extraction strategy.
// `confidence` is a 0-1 score; `source`/`extractionMethod` record which
// pipeline produced the candidate.
var CandidateArticleSchema = import_zod.z.object({
  url: import_zod.z.string().url(),
  title: import_zod.z.string().min(1),
  publishedAt: import_zod.z.date(),
  content: import_zod.z.string().optional(),
  excerpt: import_zod.z.string().optional(),
  guid: import_zod.z.string(),
  confidence: import_zod.z.number().min(0).max(1),
  source: import_zod.z.enum(["rss", "sitemap", "html", "discovery"]),
  extractionMethod: import_zod.z.enum(["rss", "sitemap", "html-links", "content-extraction"]),
  metadata: import_zod.z.record(import_zod.z.any()).optional()
});
// Zod schema for per-source configuration: source-type hint ("auto" enables
// detection), path allow/deny patterns, crawl depth, and optional HTML
// scraping selectors/filters/limits.
var SourceConfigSchema = import_zod.z.object({
  sourceType: import_zod.z.enum(["rss", "sitemap", "html", "auto"]),
  allowPaths: import_zod.z.array(import_zod.z.string()).optional(),
  denyPaths: import_zod.z.array(import_zod.z.string()).optional(),
  maxDepth: import_zod.z.number().int().min(1).max(5).optional(),
  detectOnly: import_zod.z.boolean().optional(),
  scrapeConfig: import_zod.z.object({
    selectors: import_zod.z.object({
      articleLinks: import_zod.z.array(import_zod.z.string()).optional(),
      titleSelectors: import_zod.z.array(import_zod.z.string()).optional(),
      dateSelectors: import_zod.z.array(import_zod.z.string()).optional(),
      excludeSelectors: import_zod.z.array(import_zod.z.string()).optional()
    }).optional(),
    filters: import_zod.z.object({
      minTitleLength: import_zod.z.number().optional(),
      maxTitleLength: import_zod.z.number().optional(),
      includePatterns: import_zod.z.array(import_zod.z.string()).optional(),
      excludePatterns: import_zod.z.array(import_zod.z.string()).optional()
    }).optional(),
    limits: import_zod.z.object({
      maxLinksPerPage: import_zod.z.number().optional(),
      maxPages: import_zod.z.number().optional()
    }).optional()
  }).optional()
});
2298
var SourceOrchestrator = class {
  constructor() {
    // Upper bound on candidate articles kept per source after sorting.
    this.maxArticlesPerSource = 1e3;
  }
  // private readonly recentTimeframe = 48 * 60 * 60 * 1000; // 48 hours (currently unused)
  /**
   * Main orchestration method - determines source type and extracts content
   *
   * Runs the whole pipeline behind a circuit breaker. Always resolves with a
   * result object; individual strategy failures are collected in
   * `result.errors` rather than thrown.
   *
   * @param {string} url - Source URL (feed, sitemap, or HTML page).
   * @param {object} [config] - Source configuration; defaults to auto-detect.
   * @returns {Promise<object>} { articles, sourceInfo, processingTime, errors }
   */
  async processSource(url, config = { sourceType: "auto" }) {
    const startTime = Date.now();
    console.log(`\u{1F3AD} [Orchestrator] Processing source: ${url} (type: ${config.sourceType})`);
    // Result skeleton; detectedType defaults to "html" until proven otherwise.
    const result = {
      articles: [],
      sourceInfo: {
        detectedType: "html",
        extractionStats: {
          attempted: 0,
          successful: 0,
          failed: 0,
          filtered: 0
        }
      },
      processingTime: 0,
      errors: []
    };
    try {
      // Caller may supply its own breaker; otherwise use the shared
      // scraping breaker so one flapping source cannot stall the pipeline.
      const breaker = config.circuitBreaker || circuitBreakers.scraping;
      return await breaker.execute(async () => {
        if (config.sourceType === "auto") {
          return await this.autoDetectAndProcess(url, config, result);
        } else {
          return await this.processKnownType(url, config, result);
        }
      });
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      console.error(`\u274C [Orchestrator] Failed to process source ${url}:`, errorMessage);
      result.errors.push(errorMessage);
      result.processingTime = Date.now() - startTime;
      return result;
    }
  }
  /**
   * Auto-detect source type and process accordingly
   *
   * Detection order: direct RSS parse -> RSS feed discovery -> direct
   * sitemap parse -> sitemap discovery -> HTML scraping fallback. The first
   * strategy yielding articles wins; failures accumulate in result.errors.
   */
  async autoDetectAndProcess(url, config, result) {
    console.log(`\u{1F50D} [Orchestrator] Auto-detecting source type for ${url}`);
    // 1. Try the URL itself as an RSS feed.
    try {
      const rssArticles = await this.processAsRSS(url);
      if (rssArticles.length > 0) {
        result.sourceInfo.detectedType = "rss";
        result.articles = this.applyPathFilters(rssArticles, config);
        console.log(`\u2705 [Orchestrator] Detected as RSS feed: ${result.articles.length} articles`);
        return this.finalizeResult(result);
      }
    } catch (error) {
      result.errors.push(`RSS detection failed: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // 2. Try to discover a feed advertised by the page.
    try {
      const discoveredFeeds = await globalRSSDiscovery.discoverFeeds(url);
      if (discoveredFeeds.length > 0) {
        result.sourceInfo.discoveredFeeds = discoveredFeeds;
        // Feeds are assumed pre-ranked; take the first.
        const bestFeed = discoveredFeeds[0];
        const rssArticles = await this.processAsRSS(bestFeed.url);
        if (rssArticles.length > 0) {
          result.sourceInfo.detectedType = "rss";
          result.articles = this.applyPathFilters(rssArticles, config);
          console.log(`\u2705 [Orchestrator] Using discovered RSS feed: ${result.articles.length} articles`);
          return this.finalizeResult(result);
        }
      }
    } catch (error) {
      result.errors.push(`RSS discovery failed: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // 3. Try the URL itself as a sitemap.
    try {
      const sitemapArticles = await this.processAsSitemap(url);
      if (sitemapArticles.length > 0) {
        result.sourceInfo.detectedType = "sitemap";
        result.articles = this.applyPathFilters(sitemapArticles, config);
        console.log(`\u2705 [Orchestrator] Detected as sitemap: ${result.articles.length} articles`);
        return this.finalizeResult(result);
      }
    } catch (error) {
      result.errors.push(`Sitemap detection failed: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // 4. Try to discover sitemaps for the host.
    try {
      const urlObj = new URL(url);
      const discoveredSitemaps = await globalSitemapParser.discoverSitemaps(urlObj.hostname);
      if (discoveredSitemaps.length > 0) {
        result.sourceInfo.discoveredSitemaps = discoveredSitemaps;
        const sitemapArticles = await this.processAsSitemap(discoveredSitemaps[0]);
        if (sitemapArticles.length > 0) {
          result.sourceInfo.detectedType = "sitemap";
          result.articles = this.applyPathFilters(sitemapArticles, config);
          console.log(`\u2705 [Orchestrator] Using discovered sitemap: ${result.articles.length} articles`);
          return this.finalizeResult(result);
        }
      }
    } catch (error) {
      result.errors.push(`Sitemap discovery failed: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    // 5. Last resort: scrape the page's HTML for article links.
    try {
      const htmlArticles = await this.processAsHTML(url, config);
      result.sourceInfo.detectedType = "html";
      result.articles = this.applyPathFilters(htmlArticles, config);
      console.log(`\u2705 [Orchestrator] Falling back to HTML scraping: ${result.articles.length} articles`);
      return this.finalizeResult(result);
    } catch (error) {
      result.errors.push(`HTML scraping failed: ${error instanceof Error ? error.message : "Unknown error"}`);
      return this.finalizeResult(result);
    }
  }
  /**
   * Process source with known type
   *
   * Skips detection and dispatches straight to the configured strategy.
   */
  async processKnownType(url, config, result) {
    console.log(`\u{1F3AF} [Orchestrator] Processing as ${config.sourceType}: ${url}`);
    try {
      let articles = [];
      switch (config.sourceType) {
        case "rss":
          articles = await this.processAsRSS(url);
          result.sourceInfo.detectedType = "rss";
          break;
        case "sitemap":
          articles = await this.processAsSitemap(url);
          result.sourceInfo.detectedType = "sitemap";
          break;
        case "html":
          articles = await this.processAsHTML(url, config);
          result.sourceInfo.detectedType = "html";
          break;
      }
      result.articles = this.applyPathFilters(articles, config);
      console.log(`\u2705 [Orchestrator] Processed ${config.sourceType}: ${result.articles.length} articles`);
      return this.finalizeResult(result);
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      result.errors.push(`${config.sourceType} processing failed: ${errorMessage}`);
      return this.finalizeResult(result);
    }
  }
  /**
   * Process URL as RSS feed
   *
   * Maps feed items to candidate articles (confidence 0.9); items whose
   * pubDate cannot be parsed are skipped.
   */
  async processAsRSS(url) {
    const rssItems = await fetchRSSFeed(url);
    const candidates = [];
    for (const item of rssItems) {
      try {
        const publishedAt = new Date(item.pubDate);
        if (isNaN(publishedAt.getTime())) {
          // Unparseable date: drop the item rather than guess.
          continue;
        }
        candidates.push({
          url: item.link,
          title: item.title,
          publishedAt,
          content: item.content,
          excerpt: item.contentSnippet,
          guid: item.guid,
          confidence: 0.9,
          source: "rss",
          extractionMethod: "rss",
          metadata: {
            originalGuid: item.guid,
            rssSource: url
          }
        });
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Orchestrator] Error processing RSS item:`, error);
        continue;
      }
    }
    return candidates;
  }
  /**
   * Process URL as sitemap
   *
   * Maps recent sitemap entries to candidates. Entries with news metadata
   * get confidence 0.8, plain entries 0.6; missing lastmod falls back to now.
   */
  async processAsSitemap(url) {
    const sitemapEntries = await globalSitemapParser.parseSitemap(url, {
      filterRecent: true,
      maxEntries: this.maxArticlesPerSource,
      includeNews: true
    });
    const candidates = [];
    for (const entry of sitemapEntries) {
      try {
        const publishedAt = entry.lastmod || /* @__PURE__ */ new Date();
        candidates.push({
          url: entry.url,
          // Sitemaps rarely carry titles; derive one from the URL if needed.
          title: entry.news?.title || this.extractTitleFromUrl(entry.url),
          publishedAt,
          guid: this.createGuid(entry.url, publishedAt.toISOString()),
          confidence: entry.news ? 0.8 : 0.6,
          source: "sitemap",
          extractionMethod: "sitemap",
          metadata: {
            changefreq: entry.changefreq,
            priority: entry.priority,
            hasNews: !!entry.news,
            sitemapSource: url
          }
        });
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Orchestrator] Error processing sitemap entry:`, error);
        continue;
      }
    }
    return candidates;
  }
  /**
   * Process URL as HTML page
   *
   * Scrapes article links from up to maxPages pages (default 3) and maps
   * them to candidates; confidence comes from the scraper itself.
   */
  async processAsHTML(url, config) {
    const scrapingConfig = this.buildScrapingConfig(config);
    const extractedArticles = await globalHTMLScraper2.extractFromMultiplePages(url, scrapingConfig, {
      maxPages: config.scrapeConfig?.limits?.maxPages || 3
    });
    const candidates = [];
    for (const article of extractedArticles) {
      try {
        const publishedAt = article.publishedDate || /* @__PURE__ */ new Date();
        candidates.push({
          url: article.url,
          title: article.title || this.extractTitleFromUrl(article.url),
          publishedAt,
          excerpt: article.description,
          guid: this.createGuid(article.url, publishedAt.toISOString()),
          confidence: article.confidence,
          source: "html",
          extractionMethod: "html-links",
          metadata: {
            extractionSource: article.source,
            htmlSource: url
          }
        });
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Orchestrator] Error processing HTML article:`, error);
        continue;
      }
    }
    return candidates;
  }
  /**
   * Apply path filtering based on allowPaths and denyPaths
   *
   * Deny patterns are checked first and always win; if allow patterns are
   * configured, a URL must match at least one. URLs that fail to parse are
   * kept (fail-open).
   */
  applyPathFilters(articles, config) {
    if (!config.allowPaths?.length && !config.denyPaths?.length) {
      return articles;
    }
    return articles.filter((article) => {
      try {
        const urlObj = new URL(article.url);
        const path = urlObj.pathname.toLowerCase();
        if (config.denyPaths?.length) {
          for (const pattern of config.denyPaths) {
            if (this.matchesPattern(path, pattern)) {
              console.log(`\u{1F6AB} [Orchestrator] Article blocked by deny pattern "${pattern}": ${article.url}`);
              return false;
            }
          }
        }
        if (config.allowPaths?.length) {
          for (const pattern of config.allowPaths) {
            if (this.matchesPattern(path, pattern)) {
              return true;
            }
          }
          console.log(`\u{1F6AB} [Orchestrator] Article not matching any allow pattern: ${article.url}`);
          return false;
        }
        return true;
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Orchestrator] Error applying path filters to ${article.url}:`, error);
        return true;
      }
    });
  }
  /**
   * Check if a path matches a pattern (supports wildcards)
   *
   * Escapes regex metacharacters, then re-enables `*` (any run) and `?`
   * (single char) as wildcards; match is anchored and case-insensitive.
   */
  matchesPattern(path, pattern) {
    const regexPattern = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*").replace(/\\\?/g, ".");
    const regex = new RegExp("^" + regexPattern + "$", "i");
    return regex.test(path);
  }
  /**
   * Build scraping configuration from source config
   *
   * Copies selector/filter/limit sections when present; include/exclude
   * patterns are compiled to case-insensitive RegExps for the scraper.
   */
  buildScrapingConfig(config) {
    const scrapingConfig = {};
    if (config.scrapeConfig?.selectors) {
      scrapingConfig.selectors = {
        articleLinks: config.scrapeConfig.selectors.articleLinks,
        titleSelectors: config.scrapeConfig.selectors.titleSelectors,
        dateSelectors: config.scrapeConfig.selectors.dateSelectors,
        excludeSelectors: config.scrapeConfig.selectors.excludeSelectors
      };
    }
    if (config.scrapeConfig?.filters) {
      scrapingConfig.filters = {
        minTitleLength: config.scrapeConfig.filters.minTitleLength,
        maxTitleLength: config.scrapeConfig.filters.maxTitleLength,
        includePatterns: config.scrapeConfig.filters.includePatterns?.map((p) => new RegExp(p, "i")),
        excludePatterns: config.scrapeConfig.filters.excludePatterns?.map((p) => new RegExp(p, "i"))
      };
    }
    if (config.scrapeConfig?.limits) {
      scrapingConfig.limits = config.scrapeConfig.limits;
    }
    return scrapingConfig;
  }
  /**
   * Extract title from URL as fallback
   *
   * Title-cases the last path segment (hyphens/underscores become spaces,
   * common file extensions stripped); returns "Untitled Article" on failure.
   */
  extractTitleFromUrl(url) {
    try {
      const urlObj = new URL(url);
      const pathParts = urlObj.pathname.split("/").filter(Boolean);
      const lastPart = pathParts[pathParts.length - 1] || urlObj.hostname;
      return lastPart.replace(/[-_]/g, " ").replace(/\.(html|htm|php|asp|jsp)$/i, "").split(" ").map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()).join(" ");
    } catch {
      return "Untitled Article";
    }
  }
  /**
   * Create a consistent GUID for an article
   *
   * SHA-256 over URL + publish timestamp, so the same article yields the
   * same GUID across runs.
   */
  createGuid(url, publishedAt) {
    return import_crypto2.default.createHash("sha256").update(url + publishedAt).digest("hex");
  }
  /**
   * Finalize processing result
   *
   * Fills extraction stats, sorts articles (confidence first with a 0.1
   * tolerance band, then newest first), and truncates to the per-source cap.
   */
  finalizeResult(result) {
    const endTime = Date.now();
    // NOTE(review): result.processingTime is still 0 here (set by the
    // skeleton in processSource), so this expression evaluates to roughly
    // endTime - Date.now() ~= 0 and the real elapsed time is lost. The
    // start timestamp would need to be threaded in to fix this — confirm.
    result.processingTime = endTime - (Date.now() - result.processingTime);
    result.sourceInfo.extractionStats = {
      attempted: result.articles.length,
      successful: result.articles.filter((a) => a.confidence >= 0.5).length,
      failed: result.errors.length,
      filtered: 0
      // This would be calculated during filtering
    };
    result.articles.sort((a, b) => {
      const confidenceDiff = b.confidence - a.confidence;
      if (Math.abs(confidenceDiff) > 0.1) return confidenceDiff;
      return b.publishedAt.getTime() - a.publishedAt.getTime();
    });
    result.articles = result.articles.slice(0, this.maxArticlesPerSource);
    console.log(`\u{1F3AD} [Orchestrator] Processing complete: ${result.articles.length} articles in ${result.processingTime}ms`);
    return result;
  }
  /**
   * Extract full content for articles (optional enhancement step)
   *
   * Fetches and extracts full text (sequentially) for up to `maxArticles`
   * articles that currently have little or no content; successful
   * enhancement bumps confidence by 0.1 (capped at 1). Mutates and returns
   * the same array.
   */
  async enhanceWithFullContent(articles, maxArticles = 10) {
    console.log(`\u{1F4D6} [Orchestrator] Enhancing ${Math.min(articles.length, maxArticles)} articles with full content`);
    const toEnhance = articles.filter((a) => !a.content || a.content.length < 500).slice(0, maxArticles);
    for (const article of toEnhance) {
      try {
        const extractedContent = await globalContentExtractor2.extractContent(article.url);
        if (extractedContent) {
          article.content = extractedContent.content;
          article.excerpt = extractedContent.excerpt || article.excerpt;
          article.confidence = Math.min(article.confidence + 0.1, 1);
          article.metadata = {
            ...article.metadata,
            fullContentExtracted: true,
            extractionMethod: extractedContent.extractionMethod,
            wordCount: extractedContent.wordCount,
            readingTime: extractedContent.readingTime
          };
        }
      } catch (error) {
        // Enhancement is best-effort: a failed article keeps its old content.
        console.warn(`\u26A0\uFE0F [Orchestrator] Failed to enhance article ${article.url}:`, error);
        continue;
      }
    }
    console.log(`\u{1F4D6} [Orchestrator] Content enhancement complete`);
    return articles;
  }
  /**
   * Validate orchestrator configuration
   *
   * @param {object} config - Raw configuration object.
   * @returns {object} Parsed, validated configuration.
   * @throws {Error} With joined messages when zod validation fails.
   */
  static validateConfig(config) {
    try {
      return SourceConfigSchema.parse(config);
    } catch (error) {
      if (error instanceof import_zod.z.ZodError) {
        throw new Error(`Invalid source configuration: ${error.errors.map((e) => e.message).join(", ")}`);
      }
      throw error;
    }
  }
  /**
   * Get source statistics
   *
   * Probes robots.txt compliance, feed/sitemap availability, and a rough
   * recent-article count (last 48h, capped at 100) without processing.
   */
  async getSourceStats(url) {
    const robotsCheck = await globalRobotsChecker2.isAllowed(url);
    const discoveredFeeds = await globalRSSDiscovery.discoverFeeds(url);
    let hasSitemap = false;
    let estimatedArticleCount = 0;
    try {
      const urlObj = new URL(url);
      const sitemaps = await globalSitemapParser.discoverSitemaps(urlObj.hostname);
      hasSitemap = sitemaps.length > 0;
      if (hasSitemap) {
        const recentEntries = await globalSitemapParser.getRecentEntries(urlObj.hostname, { hoursBack: 48, maxEntries: 100 });
        estimatedArticleCount = recentEntries.length;
      }
    } catch (error) {
      // Best-effort probe: sitemap discovery failures are ignored by design.
    }
    return {
      robotsCompliant: robotsCheck.allowed,
      hasRSSFeed: discoveredFeeds.length > 0,
      hasSitemap,
      detectedType: discoveredFeeds.length > 0 ? "rss" : hasSitemap ? "sitemap" : "html",
      estimatedArticleCount
    };
  }
};
2721
// Shared default orchestrator instance used by the package entry points.
var globalSourceOrchestrator = new SourceOrchestrator();
2722
+
2723
+ // src/quality/quality-scorer.ts
2724
// Default quality-scoring weights (sum to 1.0) and the pass threshold used
// by calculateArticleQualityScore.
var DEFAULT_QUALITY_CONFIG = {
  contentWeight: 0.6,
  // Content validation (length, quality, ratio)
  dateWeight: 0.12,
  // Publication date presence
  authorWeight: 0.08,
  // Author/byline presence
  schemaWeight: 0.08,
  // Schema.org metadata
  readingTimeWeight: 0.12,
  // Substantial reading time (2+ min)
  threshold: 0.5
  // Minimum score to pass (50%)
};
// Default URL-path deny list: site chrome (about/contact/legal), index-style
// listing pages (tags/categories/authors/archive), and search pages.
// A trailing "/*" matches any path under that prefix.
var DEFAULT_DENY_PATHS = [
  "/",
  "/index",
  "/index.html",
  "/about",
  "/about/*",
  "/careers",
  "/careers/*",
  "/jobs",
  "/jobs/*",
  "/contact",
  "/contact/*",
  "/team",
  "/team/*",
  "/privacy",
  "/terms",
  "/legal/*",
  "/tag/*",
  "/tags/*",
  "/category/*",
  "/categories/*",
  "/author/*",
  "/authors/*",
  "/archive/*",
  "/search",
  "/search/*"
];
2765
/**
 * Validate an extraction result for basic quality signals.
 *
 * Starts from a perfect score of 1 and subtracts fixed penalties for:
 * short body text (< 200 chars, -0.5), an implausible title length
 * (outside 10-200 chars, -0.2), and a low text-to-HTML ratio (< 10%, -0.2).
 *
 * @param {object} extracted - Extraction result with optional `textContent`,
 *   `title`, and `content` (HTML) fields.
 * @returns {{isValid: boolean, score: number, reasons: string[]}} Verdict;
 *   valid when the penalized (pre-clamp) score is still at least 0.5.
 */
function validateContent(extracted) {
  const reasons = [];
  let score = 1;
  const bodyLength = extracted.textContent?.length || 0;
  if (bodyLength < 200) {
    reasons.push("Content too short (< 200 characters)");
    score -= 0.5;
  }
  const titleLength = extracted.title?.length || 0;
  const titleOk = titleLength >= 10 && titleLength <= 200;
  if (!titleOk) {
    reasons.push("Title length invalid (must be 10-200 characters)");
    score -= 0.2;
  }
  // Ratio check only runs when both HTML and text are present.
  if (extracted.content && extracted.textContent) {
    const textToHtml = extracted.textContent.length / extracted.content.length;
    if (textToHtml < 0.1) {
      reasons.push("Low text-to-HTML ratio (< 10%)");
      score -= 0.2;
    }
  }
  return {
    isValid: score >= 0.5,
    score: Math.min(1, Math.max(0, score)),
    // Clamp between 0-1
    reasons
  };
}
2795
/**
 * Compute an overall quality score in [0, 1] for an extracted article.
 *
 * The score is a weighted sum: content validation (scaled by its own
 * sub-score) plus flat bonuses for a publication date, a byline, an
 * Article-type schema.org entry, and a reading time of 2+ minutes.
 *
 * @param {object} extracted - Extraction result (title, textContent,
 *   publishedTime, byline, structured.jsonLd, readingTime, ...).
 * @param {object} [config] - Partial weight overrides merged over
 *   DEFAULT_QUALITY_CONFIG.
 * @returns {number} Quality score, capped at 1.
 */
function calculateArticleQualityScore(extracted, config = {}) {
  const cfg = { ...DEFAULT_QUALITY_CONFIG, ...config };
  // Normalize schema.org JSON-LD to a list (it may be a single object).
  const rawJsonLd = extracted.structured?.jsonLd;
  const schemaEntries = rawJsonLd ? (Array.isArray(rawJsonLd) ? rawJsonLd : [rawJsonLd]) : [];
  const isArticleSchema = (entry) => ["Article", "NewsArticle", "BlogPosting", "TechArticle", "ScholarlyArticle"].includes(entry["@type"]);
  let total = validateContent(extracted).score * cfg.contentWeight;
  if (extracted.publishedTime) {
    total += cfg.dateWeight;
  }
  if (extracted.byline) {
    total += cfg.authorWeight;
  }
  if (schemaEntries.some(isArticleSchema)) {
    total += cfg.schemaWeight;
  }
  if (extracted.readingTime && extracted.readingTime >= 2) {
    total += cfg.readingTimeWeight;
  }
  return Math.min(total, 1);
}
2821
/**
 * Check whether a URL's path matches any deny pattern.
 *
 * Two pattern forms are supported:
 *  - exact:    "/about"  matches only the path "/about"
 *  - wildcard: "/tag/*"  matches "/tag" itself and anything below it
 *    ("/tag/js"), but NOT unrelated siblings such as "/tagging-tips" —
 *    the previous bare startsWith() prefix test wrongly denied those.
 *
 * @param {string} url - Absolute URL to test; malformed input returns false.
 * @param {string[]} [denyPaths=DEFAULT_DENY_PATHS] - Patterns to test against.
 * @returns {boolean} true if the URL should be skipped.
 */
function shouldDenyUrl(url, denyPaths = DEFAULT_DENY_PATHS) {
  try {
    const path = new URL(url).pathname;
    return denyPaths.some((pattern) => {
      if (pattern === path) return true;
      if (pattern.endsWith("/*")) {
        const prefix = pattern.slice(0, -2);
        // Require a segment boundary after the prefix so "/tag/*" does
        // not match "/tagging-tips".
        return path === prefix || path.startsWith(prefix + "/");
      }
      return false;
    });
  } catch {
    // Unparseable URL: do not deny here; downstream validation handles it.
    return false;
  }
}
2837
/**
 * Produce a per-signal breakdown of the quality score (same signals and
 * weights as calculateArticleQualityScore), plus the summed total and
 * whether it passes the configured threshold.
 *
 * @param {object} extracted - Extraction result.
 * @param {object} [config] - Partial weight overrides merged over
 *   DEFAULT_QUALITY_CONFIG.
 * @returns {object} Breakdown with contentValidation, publishedDate,
 *   author, schema, readingTime, total, passesThreshold.
 */
function getQualityBreakdown(extracted, config = {}) {
  const cfg = { ...DEFAULT_QUALITY_CONFIG, ...config };
  const contentScore = validateContent(extracted).score * cfg.contentWeight;
  // Award the schema weight only when JSON-LD declares an article type.
  let schemaScore = 0;
  const rawJsonLd = extracted.structured?.jsonLd;
  if (rawJsonLd) {
    const candidates = Array.isArray(rawJsonLd) ? rawJsonLd : [rawJsonLd];
    const articleTypes = new Set(["Article", "NewsArticle", "BlogPosting", "TechArticle", "ScholarlyArticle"]);
    if (candidates.some((entry) => articleTypes.has(entry["@type"]))) {
      schemaScore = cfg.schemaWeight;
    }
  }
  const breakdown = {
    contentValidation: contentScore,
    publishedDate: extracted.publishedTime ? cfg.dateWeight : 0,
    author: extracted.byline ? cfg.authorWeight : 0,
    schema: schemaScore,
    readingTime: extracted.readingTime && extracted.readingTime >= 2 ? cfg.readingTimeWeight : 0,
    total: 0,
    passesThreshold: false
  };
  breakdown.total = breakdown.contentValidation + breakdown.publishedDate + breakdown.author + breakdown.schema + breakdown.readingTime;
  breakdown.passesThreshold = breakdown.total >= cfg.threshold;
  return breakdown;
}
2863
+
2864
+ // src/formatters/html-to-markdown.ts
2865
+ var import_turndown = __toESM(require("turndown"));
2866
/**
 * Convert article HTML to markdown via Turndown, then post-process the
 * result: insert smart paragraph breaks and normalize whitespace.
 *
 * @param {string} html - HTML fragment; falsy input yields "".
 * @returns {string} Cleaned markdown.
 */
function htmlToMarkdown(html) {
  if (!html) return "";
  const service = new import_turndown.default({
    headingStyle: "atx",
    // Use # for headings
    codeBlockStyle: "fenced",
    // Use ``` for code blocks
    bulletListMarker: "-",
    // Use - for lists
    emDelimiter: "*",
    // Use * for emphasis
    strongDelimiter: "**"
    // Use ** for strong
  });
  // Elements that never contain article prose are dropped outright.
  const noiseTags = [
    "script",
    "style",
    "nav",
    "header",
    "footer",
    "aside",
    "form",
    "button",
    "input",
    "select",
    "textarea",
    "iframe",
    "noscript"
  ];
  service.remove(noiseTags);
  // Structural containers are unwrapped: only their converted children
  // survive.
  service.addRule("cleanAttributes", {
    filter: ["div", "span", "p", "section", "article"],
    replacement: (content) => content
  });
  const converted = service.turndown(html);
  return normalizeWhitespace(smartParagraphDetection(converted));
}
2906
/**
 * Insert blank lines into markdown where structure implies a paragraph
 * break: after a heading followed by prose, before a heading that prose
 * runs straight into, and after the last item of a list.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with blank separator lines inserted.
 */
function smartParagraphDetection(markdown) {
  const HEADING = /^#{1,6}\s/;
  const LIST_ITEM = /^[-*+]\s/;
  const lines = markdown.split("\n");
  const output = [];
  lines.forEach((current, idx) => {
    const previous = idx > 0 ? lines[idx - 1] : "";
    const following = idx < lines.length - 1 ? lines[idx + 1] : "";
    output.push(current);
    // Blank line after a heading when non-heading text follows directly.
    if (HEADING.test(current) && following && !HEADING.test(following)) {
      output.push("");
    }
    // Blank line before a heading when the preceding line runs into it.
    if (HEADING.test(following) && current && !HEADING.test(current) && previous !== "") {
      output.push("");
    }
    // Blank line after the final list item when prose follows.
    if (LIST_ITEM.test(current) && following && !LIST_ITEM.test(following) && following !== "") {
      output.push("");
    }
  });
  return output.join("\n");
}
2926
/**
 * Normalize markdown whitespace: collapse runs of 3+ newlines to a double
 * newline, trim each line, then trim the whole string.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Whitespace-normalized markdown.
 */
function normalizeWhitespace(markdown) {
  const collapsed = markdown.replace(/\n{3,}/g, "\n\n");
  const trimmedLines = collapsed.split("\n").map((line) => line.trim()).join("\n");
  return trimmedLines.trim();
}
2932
/**
 * Best-effort removal of non-article boilerplate from an HTML fragment:
 * navigation/header/footer/aside/form elements, divs whose class or id
 * marks them as chrome (nav, ads, social, comments, ...), and the
 * class/id/data-* attributes themselves.
 *
 * `[\s\S]*?` is used instead of `.*?` so elements spanning multiple lines
 * are matched too — `.` does not cross newlines without the /s flag, so
 * the previous patterns silently left multi-line boilerplate in place.
 *
 * Note: regex-based HTML cleanup is inherently approximate (e.g. nested
 * same-name elements end at the first closing tag).
 *
 * @param {string} html - Raw HTML; falsy input yields "".
 * @returns {string} Cleaned HTML.
 */
function stripNonArticleContent(html) {
  if (!html) return "";
  const nonArticlePatterns = [
    /<nav\b[^>]*>[\s\S]*?<\/nav>/gi,
    /<header\b[^>]*>[\s\S]*?<\/header>/gi,
    /<footer\b[^>]*>[\s\S]*?<\/footer>/gi,
    /<aside\b[^>]*>[\s\S]*?<\/aside>/gi,
    /<form\b[^>]*>[\s\S]*?<\/form>/gi,
    /<div[^>]*class="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi,
    /<div[^>]*id="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi
  ];
  let cleaned = html;
  for (const pattern of nonArticlePatterns) {
    cleaned = cleaned.replace(pattern, "");
  }
  // Strip styling/identification attributes so downstream conversion sees
  // minimal markup.
  cleaned = cleaned.replace(/\s*class="[^"]*"/gi, "");
  cleaned = cleaned.replace(/\s*id="[^"]*"/gi, "");
  cleaned = cleaned.replace(/\s*data-[^=]*="[^"]*"/gi, "");
  return cleaned;
}
2952
/**
 * High-level HTML-to-markdown conversion.
 *
 * @param {string} html - Source HTML.
 * @param {object} [options]
 * @param {boolean} [options.cleanNonArticle=true] - Strip boilerplate
 *   (nav/header/footer/...) before converting.
 * @param {boolean} [options.smartParagraphs=true] - Accepted for API
 *   compatibility; paragraph detection always runs inside htmlToMarkdown.
 * @returns {string} Markdown text.
 */
function convertToMarkdown(html, options = {}) {
  const { cleanNonArticle = true, smartParagraphs: _smartParagraphs = true } = options;
  const source = cleanNonArticle ? stripNonArticleContent(html) : html;
  return htmlToMarkdown(source);
}
2964
+
2965
+ // src/formatters/text-cleaner.ts
2966
/**
 * Clean raw extracted text: decode HTML entities, normalize whitespace,
 * insert paragraph breaks, then trim.
 *
 * @param {string} text - Raw text; falsy input yields "".
 * @returns {string} Cleaned text.
 */
function cleanText(text) {
  if (!text) return "";
  const steps = [decodeHTMLEntities, normalizeWhitespace2, detectParagraphs];
  let result = text;
  for (const step of steps) {
    result = step(result);
  }
  return result.trim();
}
2975
/**
 * Decode common HTML entities (named, decimal, and hex numeric) in text.
 *
 * Decoding is done in a single pass, so each entity is decoded exactly
 * once: "&amp;lt;" yields "&lt;" — the previous sequential replaces
 * double-decoded it to "<". Numeric entities use String.fromCodePoint so
 * astral-plane characters (e.g. emoji, "&#128578;") decode correctly;
 * fromCharCode mangled anything above U+FFFF.
 *
 * Unknown named entities and out-of-range numeric values are left as-is.
 *
 * @param {string} text - Text possibly containing HTML entities.
 * @returns {string} Decoded text.
 */
function decodeHTMLEntities(text) {
  // Named entities this cleaner understands (lookup is case-sensitive,
  // matching the previous behavior: "&AMP;" stays untouched).
  const named = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    ndash: "\u2013",
    mdash: "\u2014",
    hellip: "\u2026",
    ldquo: '"',
    rdquo: '"',
    lsquo: "\u2018",
    rsquo: "\u2019"
  };
  return text.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|([a-zA-Z]+));/gi,
    (match, hex, dec, name) => {
      if (hex !== undefined || dec !== undefined) {
        const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
        // Reject values beyond the Unicode range rather than throwing.
        return codePoint <= 1114111 ? String.fromCodePoint(codePoint) : match;
      }
      return Object.prototype.hasOwnProperty.call(named, name) ? named[name] : match;
    }
  );
}
3006
/**
 * Normalize plain-text whitespace: tabs become spaces, runs of spaces
 * collapse to one, each line is trimmed, and 3+ consecutive newlines
 * collapse to a double newline.
 *
 * @param {string} text - Plain text.
 * @returns {string} Normalized text (not globally trimmed).
 */
function normalizeWhitespace2(text) {
  const detabbed = text.replace(/\t/g, " ");
  const singleSpaced = detabbed.replace(/ {2,}/g, " ");
  const trimmedLines = singleSpaced.split("\n").map((line) => line.trim()).join("\n");
  return trimmedLines.replace(/\n{3,}/g, "\n\n");
}
3013
/**
 * Heuristically re-introduce paragraph breaks into flattened text: drop
 * empty lines, then insert a blank line wherever a long sentence-ending
 * line (>40 chars, ends in . ! or ?) is followed by a long line (>20
 * chars) that starts with a capital letter or digit.
 *
 * @param {string} text - Plain text.
 * @returns {string} Text with paragraph separators inserted.
 */
function detectParagraphs(text) {
  const nonEmpty = text.split("\n").filter((line) => line.trim().length > 0);
  const output = [];
  nonEmpty.forEach((line, idx) => {
    output.push(line);
    const following = idx < nonEmpty.length - 1 ? nonEmpty[idx + 1] : "";
    const endsSentence = /[.!?]$/.test(line);
    const startsFresh = /^[A-Z0-9]/.test(following);
    // Length guards avoid breaking after short fragments or before
    // short continuations.
    if (endsSentence && startsFresh && line.length > 40 && following.length > 20) {
      output.push("");
    }
  });
  return output.join("\n");
}
3027
/**
 * Remove http(s) URLs from text. Each URL is deleted in place; the
 * surrounding whitespace is left untouched.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with URLs removed.
 */
function removeUrls(text) {
  const urlPattern = /https?:\/\/[^\s]+/g;
  return text.replace(urlPattern, "");
}
3030
/**
 * Truncate text to at most maxLength characters, preferring to cut at the
 * last word boundary within the limit, and append an ellipsis.
 *
 * @param {string} text - Input text.
 * @param {number} maxLength - Maximum number of characters to keep
 *   (before the appended ellipsis).
 * @returns {string} Original text if it fits, otherwise a truncated
 *   version ending in "…".
 */
function truncateText(text, maxLength) {
  if (text.length <= maxLength) return text;
  const head = text.substring(0, maxLength);
  const breakAt = head.lastIndexOf(" ");
  const kept = breakAt > 0 ? head.substring(0, breakAt) : head;
  return `${kept}\u2026`;
}
3039
/**
 * Reduce HTML to its bare text: drop script/style elements with their
 * contents, remove all remaining tags, then collapse whitespace runs to a
 * single space and trim.
 *
 * @param {string} html - HTML string.
 * @returns {string} Plain text.
 */
function stripHTML(html) {
  const withoutScripts = html.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "");
  const withoutStyles = withoutScripts.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "");
  const textOnly = withoutStyles.replace(/<[^>]+>/g, "");
  return textOnly.replace(/\s+/g, " ").trim();
}
3042
+
3043
+ // src/scraper.ts
3044
/**
 * Scrape a source URL end-to-end: discover candidate articles via the
 * orchestrator (RSS / sitemap / HTML), drop denied paths, optionally fetch
 * and extract full content for each candidate, then keep only articles at
 * or above the quality threshold.
 *
 * Never rejects: a fatal error is caught and returned as an empty result
 * with the message prepended to `errors`.
 *
 * @param {string} url - Source (blog/site) URL to scrape.
 * @param {object} [options]
 * @param {string} [options.sourceType="auto"] - Source handling passed to the orchestrator.
 * @param {number} [options.maxArticles=50] - Cap on returned articles.
 * @param {boolean} [options.extractFullContent=true] - Fetch/extract each article body.
 * @param {string[]} [options.denyPaths=DEFAULT_DENY_PATHS] - Path patterns to skip.
 * @param {number} [options.qualityThreshold=0.6] - Minimum quality score to keep.
 * @returns {Promise<object>} Result: articles, detectedType, confidence,
 *   extractionStats (funnel counters), processingTime, errors, timestamp.
 */
async function scrape(url, options = {}) {
  const startTime = Date.now();
  const {
    sourceType = "auto",
    maxArticles = 50,
    extractFullContent = true,
    denyPaths = DEFAULT_DENY_PATHS,
    qualityThreshold = 0.6
  } = options;
  console.log(`\u{1F3AF} [Scraper] Starting scrape of ${url}`);
  console.log(` Source type: ${sourceType}`);
  console.log(` Max articles: ${maxArticles}`);
  console.log(` Extract full content: ${extractFullContent}`);
  console.log(` Quality threshold: ${qualityThreshold}`);
  const errors = [];
  // Funnel counters, reported in extractionStats (and preserved in the
  // error result if a later stage throws).
  let totalDiscovered = 0;
  let afterDenyFilter = 0;
  let afterContentValidation = 0;
  let afterQualityFilter = 0;
  try {
    // Stage 1: discovery.
    const config = {
      sourceType,
      denyPaths
    };
    const orchestrationResult = await globalSourceOrchestrator.processSource(url, config);
    totalDiscovered = orchestrationResult.articles.length;
    errors.push(...orchestrationResult.errors);
    console.log(`\u{1F4E6} [Scraper] Discovered ${totalDiscovered} candidate articles`);
    // Stage 2: drop navigation/boilerplate paths.
    let candidateArticles = orchestrationResult.articles.filter((article) => {
      const shouldDeny = shouldDenyUrl(article.url, denyPaths);
      return !shouldDeny;
    });
    afterDenyFilter = candidateArticles.length;
    console.log(`\u{1F6AB} [Scraper] After deny filter: ${afterDenyFilter} articles`);
    // Stage 3: content extraction (or metadata-only mapping).
    let scrapedArticles = [];
    if (extractFullContent && candidateArticles.length > 0) {
      console.log(`\u{1F4D6} [Scraper] Extracting full content for ${Math.min(candidateArticles.length, maxArticles)} articles`);
      // Over-fetch (2x the cap) so extraction failures can be backfilled;
      // the loop breaks as soon as maxArticles successes are collected.
      const articlesToProcess = candidateArticles.slice(0, maxArticles * 2);
      for (const candidate of articlesToProcess) {
        try {
          const extractedContent = await globalContentExtractor.extractContent(candidate.url);
          if (!extractedContent) {
            errors.push(`Failed to extract content from ${candidate.url}`);
            continue;
          }
          const markdown = convertToMarkdown(extractedContent.content || "");
          const cleanedText = cleanText(extractedContent.textContent || "");
          const qualityScore = calculateArticleQualityScore(extractedContent);
          scrapedArticles.push({
            url: candidate.url,
            // Prefer extracted values, falling back to discovery metadata.
            title: extractedContent.title || candidate.title,
            publishedDate: extractedContent.publishedTime,
            description: extractedContent.excerpt || candidate.excerpt,
            fullContent: extractedContent.content,
            fullContentMarkdown: markdown,
            fullContentText: cleanedText,
            confidence: candidate.confidence,
            source: extractedContent.structured?.jsonLd ? "structured-data" : extractedContent.byline ? "meta-data" : "link-text",
            qualityScore,
            metadata: {
              ...candidate.metadata,
              wordCount: extractedContent.wordCount,
              readingTime: extractedContent.readingTime,
              byline: extractedContent.byline,
              siteName: extractedContent.siteName,
              lang: extractedContent.lang
            }
          });
          if (scrapedArticles.length >= maxArticles) {
            break;
          }
        } catch (error) {
          // Per-article failures are recorded and skipped, not fatal.
          const errorMsg = error instanceof Error ? error.message : "Unknown error";
          errors.push(`Error processing ${candidate.url}: ${errorMsg}`);
          continue;
        }
      }
    } else {
      // Metadata-only mode: no per-article fetch; map discovery data
      // directly into the result shape.
      scrapedArticles = candidateArticles.slice(0, maxArticles).map((candidate) => ({
        url: candidate.url,
        title: candidate.title,
        publishedDate: candidate.publishedAt,
        description: candidate.excerpt,
        confidence: candidate.confidence,
        source: candidate.source === "rss" ? "structured-data" : candidate.source === "sitemap" ? "meta-data" : "link-text",
        qualityScore: 0.5,
        // Default score when not extracting full content
        metadata: candidate.metadata
      }));
    }
    afterContentValidation = scrapedArticles.length;
    console.log(`\u2705 [Scraper] After content extraction: ${afterContentValidation} articles`);
    // Stage 4: quality gate.
    const filteredArticles = scrapedArticles.filter((article) => {
      const score = article.qualityScore ?? 0;
      return score >= qualityThreshold;
    });
    afterQualityFilter = filteredArticles.length;
    console.log(`\u2B50 [Scraper] After quality filter: ${afterQualityFilter} articles (threshold: ${qualityThreshold})`);
    const processingTime = Date.now() - startTime;
    const result = {
      url,
      detectedType: orchestrationResult.sourceInfo.detectedType,
      confidence: afterQualityFilter > 0 ? "high" : afterContentValidation > 0 ? "medium" : "low",
      articles: filteredArticles,
      extractionStats: {
        attempted: totalDiscovered,
        successful: afterQualityFilter,
        failed: errors.length,
        filtered: totalDiscovered - afterQualityFilter,
        totalDiscovered,
        afterDenyFilter,
        afterContentValidation,
        afterQualityFilter
      },
      processingTime,
      errors,
      timestamp: (/* @__PURE__ */ new Date()).toISOString()
    };
    console.log(`\u2728 [Scraper] Complete! ${afterQualityFilter} articles in ${processingTime}ms`);
    return result;
  } catch (error) {
    // Fatal path: return an empty result carrying the error message and
    // whatever funnel counters were reached before the failure.
    const errorMessage = error instanceof Error ? error.message : "Unknown error";
    console.error(`\u274C [Scraper] Fatal error:`, errorMessage);
    return {
      url,
      detectedType: "unknown",
      confidence: "low",
      articles: [],
      extractionStats: {
        attempted: totalDiscovered,
        successful: 0,
        failed: 1,
        filtered: totalDiscovered,
        totalDiscovered,
        afterDenyFilter,
        afterContentValidation,
        afterQualityFilter
      },
      processingTime: Date.now() - startTime,
      errors: [errorMessage, ...errors],
      timestamp: (/* @__PURE__ */ new Date()).toISOString()
    };
  }
}
3188
/**
 * Lightweight discovery pass: run scrape() without content extraction or
 * quality filtering and return only the discovered article URLs.
 *
 * @param {string} url - Source URL to scan.
 * @returns {Promise<string[]>} URLs of discovered candidate articles.
 */
async function quickScrape(url) {
  const discoveryOptions = {
    extractFullContent: false,
    maxArticles: 100,
    qualityThreshold: 0
  };
  const { articles } = await scrape(url, discoveryOptions);
  return articles.map((article) => article.url);
}
3196
+
3197
// src/index.ts
// Package version string, exported for consumers.
var VERSION = "0.1.0";
// Annotate the CommonJS export names for ESM import in node:
// NOTE: `0 && (...)` is dead code — it never executes at runtime. The
// bundler emits it so Node's CJS/ESM interop can statically detect the
// named exports listed below.
0 && (module.exports = {
  CircuitBreaker,
  ContentExtractor,
  DEFAULT_DENY_PATHS,
  DEFAULT_QUALITY_CONFIG,
  HTMLScraper,
  RSSDiscovery,
  RobotsChecker,
  ScrapingRateLimiter,
  SitemapParser,
  SourceOrchestrator,
  VERSION,
  calculateArticleQualityScore,
  circuitBreakers,
  cleanText,
  convertToMarkdown,
  decodeHTMLEntities,
  detectParagraphs,
  fetchRSSFeed,
  getQualityBreakdown,
  globalContentExtractor,
  globalRSSDiscovery,
  globalRateLimiter,
  globalRobotsChecker,
  globalSitemapParser,
  globalSourceOrchestrator,
  htmlToMarkdown,
  normalizeWhitespace,
  quickScrape,
  removeUrls,
  scrape,
  shouldDenyUrl,
  stripHTML,
  stripNonArticleContent,
  truncateText,
  validateContent
});