@tyroneross/blog-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +371 -0
- package/dist/index.d.mts +949 -0
- package/dist/index.d.ts +949 -0
- package/dist/index.js +3236 -0
- package/dist/index.mjs +3165 -0
- package/package.json +69 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,3236 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Module-interop helpers. The Object reflection primitives are aliased once
// and the CommonJS <-> ES module shims below are built on top of them.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Installs one enumerable getter on `target` for every key of `all`;
// each value of `all` is used as the getter function itself.
var __export = (target, all) => {
  for (var exportName in all) {
    __defProp(target, exportName, { enumerable: true, get: all[exportName] });
  }
};

// Mirrors the own properties of `from` onto `to` as live getters, skipping the
// key named by `except` and anything `to` already owns. Returns `to`.
var __copyProps = (to, from, except, desc) => {
  const isObjectLike = from && typeof from === "object";
  if (!isObjectLike && typeof from !== "function") {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    // A missing descriptor is treated as enumerable.
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

// Wraps a required module in an ES-module-shaped namespace object.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

// Produces a CommonJS exports object carrying a non-enumerable `__esModule: true` marker.
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
var index_exports = {};
|
|
32
|
+
__export(index_exports, {
|
|
33
|
+
CircuitBreaker: () => CircuitBreaker,
|
|
34
|
+
ContentExtractor: () => ContentExtractor,
|
|
35
|
+
DEFAULT_DENY_PATHS: () => DEFAULT_DENY_PATHS,
|
|
36
|
+
DEFAULT_QUALITY_CONFIG: () => DEFAULT_QUALITY_CONFIG,
|
|
37
|
+
HTMLScraper: () => HTMLScraper,
|
|
38
|
+
RSSDiscovery: () => RSSDiscovery,
|
|
39
|
+
RobotsChecker: () => RobotsChecker,
|
|
40
|
+
ScrapingRateLimiter: () => ScrapingRateLimiter,
|
|
41
|
+
SitemapParser: () => SitemapParser,
|
|
42
|
+
SourceOrchestrator: () => SourceOrchestrator,
|
|
43
|
+
VERSION: () => VERSION,
|
|
44
|
+
calculateArticleQualityScore: () => calculateArticleQualityScore,
|
|
45
|
+
circuitBreakers: () => circuitBreakers,
|
|
46
|
+
cleanText: () => cleanText,
|
|
47
|
+
convertToMarkdown: () => convertToMarkdown,
|
|
48
|
+
decodeHTMLEntities: () => decodeHTMLEntities,
|
|
49
|
+
detectParagraphs: () => detectParagraphs,
|
|
50
|
+
fetchRSSFeed: () => fetchRSSFeed,
|
|
51
|
+
getQualityBreakdown: () => getQualityBreakdown,
|
|
52
|
+
globalContentExtractor: () => globalContentExtractor,
|
|
53
|
+
globalRSSDiscovery: () => globalRSSDiscovery,
|
|
54
|
+
globalRateLimiter: () => globalRateLimiter,
|
|
55
|
+
globalRobotsChecker: () => globalRobotsChecker,
|
|
56
|
+
globalSitemapParser: () => globalSitemapParser,
|
|
57
|
+
globalSourceOrchestrator: () => globalSourceOrchestrator,
|
|
58
|
+
htmlToMarkdown: () => htmlToMarkdown,
|
|
59
|
+
normalizeWhitespace: () => normalizeWhitespace2,
|
|
60
|
+
quickScrape: () => quickScrape,
|
|
61
|
+
removeUrls: () => removeUrls,
|
|
62
|
+
scrape: () => scrape,
|
|
63
|
+
shouldDenyUrl: () => shouldDenyUrl,
|
|
64
|
+
stripHTML: () => stripHTML,
|
|
65
|
+
stripNonArticleContent: () => stripNonArticleContent,
|
|
66
|
+
truncateText: () => truncateText,
|
|
67
|
+
validateContent: () => validateContent
|
|
68
|
+
});
|
|
69
|
+
module.exports = __toCommonJS(index_exports);
|
|
70
|
+
|
|
71
|
+
// src/orchestrator/source-orchestrator.ts
|
|
72
|
+
var import_zod = require("zod");
|
|
73
|
+
var import_crypto2 = __toESM(require("crypto"));
|
|
74
|
+
|
|
75
|
+
// src/utils/rss-utils.ts
|
|
76
|
+
var import_rss_parser = __toESM(require("rss-parser"));
|
|
77
|
+
var import_crypto = __toESM(require("crypto"));
|
|
78
|
+
// rss-parser instance that fetchRSSFeed uses to download and parse feeds.
var parser = new import_rss_parser.default({
  headers: {
    "User-Agent": "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)"
  },
  // Allow a feed request 15 seconds before it times out.
  timeout: 15000
});
|
|
85
|
+
/**
 * Download the feed at `url` and return its entries in a uniform shape.
 * Failures are logged and reported as an empty array; nothing is thrown.
 *
 * @param {string} url - Address of the feed to fetch.
 * @param {*} _sourceId - Accepted but not read by this function.
 * @returns {Promise<Array<object>>} Entries with `title`, `link`, `pubDate`,
 *   `guid`, `content` and `contentSnippet`; `[]` when the feed is empty or fails.
 */
async function fetchRSSFeed(url, _sourceId) {
  // Fills in a fallback for every field the feed entry leaves out.
  const toFeedItem = (entry) => ({
    title: entry.title || "Untitled",
    link: entry.link || "",
    pubDate: entry.pubDate || (/* @__PURE__ */ new Date()).toISOString(),
    guid: entry.guid || entry.link || import_crypto.default.randomUUID(),
    content: entry.content || entry["content:encoded"] || "",
    contentSnippet: entry.contentSnippet || ""
  });

  try {
    console.log(`\u{1F504} [RSS] Fetching feed from ${url}`);
    const feed = await parser.parseURL(url);
    const entries = feed.items;
    if (!entries || entries.length === 0) {
      console.warn(`\u26A0\uFE0F [RSS] Feed from ${url} contains no items`);
      return [];
    }
    const normalised = entries.map((entry) => toFeedItem(entry));
    console.log(`\u2705 [RSS] Successfully fetched ${normalised.length} items from ${url}`);
    return normalised;
  } catch (error) {
    const isError = error instanceof Error;
    console.error(`\u274C [RSS] Failed to fetch RSS from ${url}:`, isError ? error.message : "Unknown error");
    if (isError) {
      // Ordered lookup: only the first matching fragment produces a hint.
      const hints = [
        ["Invalid character", `\u{1F50D} [RSS] XML parsing error - feed may be malformed or contain HTML`],
        ["timeout", `\u{1F50D} [RSS] Request timeout - server may be slow or unreachable`],
        ["ENOTFOUND", `\u{1F50D} [RSS] Domain not found - check URL spelling`],
        ["ECONNREFUSED", `\u{1F50D} [RSS] Connection refused - server may be down`]
      ];
      const matched = hints.find(([fragment]) => error.message.includes(fragment));
      if (matched) {
        console.error(matched[1]);
      }
    }
    return [];
  }
}
|
|
120
|
+
|
|
121
|
+
// src/extractors/rss-discovery.ts
|
|
122
|
+
var cheerio = __toESM(require("cheerio"));
|
|
123
|
+
|
|
124
|
+
// src/utils/scraping-rate-limiter.ts
|
|
125
|
+
/**
 * Per-host request scheduler.
 *
 * Each host gets its own priority queue that is drained one operation at a
 * time, with at least `baseDelay` milliseconds between the start of two
 * operations. Failures that look transient are retried, and selected failures
 * additionally put the whole host into an exponential back-off window.
 * `maxConcurrent` caps the number of operations in flight across all hosts.
 */
var ScrapingRateLimiter = class {
  /**
   * @param {object} [options]
   * @param {number} [options.requestsPerSecond=1] - Per-host request rate.
   * @param {number} [options.maxBackoff=30000] - Upper bound for a back-off window, in ms.
   * @param {number} [options.maxConcurrent=10] - Operations allowed in flight across all hosts.
   */
  constructor(options = {}) {
    // host name -> { lastRequest, backoffUntil, backoffMultiplier, queue, processing }
    this.hosts = /* @__PURE__ */ new Map();
    // Ids of the operations currently running.
    this.activeRequests = /* @__PURE__ */ new Set();
    this.baseDelay = Math.floor(1e3 / (options.requestsPerSecond || 1));
    this.maxBackoff = options.maxBackoff || 3e4;
    this.maxConcurrent = options.maxConcurrent || 10;
  }

  /**
   * Queue `operation` for the host of `url` and settle with its outcome.
   *
   * @param {string} url - Absolute URL; only its host name is used, for scheduling.
   * @param {Function} operation - Zero-argument function returning the result or a promise of it.
   * @param {object} [options]
   * @param {number} [options.priority=0] - Higher values are dequeued first.
   * @param {number} [options.maxRetries=3] - Retry budget; `0` disables retries.
   * @returns {Promise<*>} The operation's result; rejects with its last error,
   *   or with `Invalid URL: …` when `url` cannot be parsed.
   */
  async execute(url, operation, options = {}) {
    const host = this.extractHost(url);
    if (!host) {
      throw new Error(`Invalid URL: ${url}`);
    }
    return new Promise((resolve, reject) => {
      const request = {
        resolve,
        reject,
        operation,
        priority: options.priority || 0,
        retryCount: 0,
        // `??` rather than `||` so that an explicit 0 really means "never retry".
        maxRetries: options.maxRetries ?? 3,
        host
      };
      this.enqueueRequest(host, request);
    });
  }

  /** Lower-cased host name of `url`, or `null` when it is not a valid absolute URL. */
  extractHost(url) {
    try {
      const parsed = new URL(url);
      return parsed.hostname.toLowerCase();
    } catch {
      return null;
    }
  }

  /** Insert `request` into the host queue by priority and start draining if idle. */
  enqueueRequest(host, request) {
    if (!this.hosts.has(host)) {
      this.hosts.set(host, {
        lastRequest: 0,
        backoffUntil: 0,
        backoffMultiplier: 1,
        queue: [],
        processing: false
      });
    }
    const hostState = this.hosts.get(host);
    // Place the request ahead of the first strictly lower-priority entry, so
    // equal priorities stay in arrival order.
    const insertIndex = hostState.queue.findIndex(
      (req) => req.priority < request.priority
    );
    if (insertIndex === -1) {
      hostState.queue.push(request);
    } else {
      hostState.queue.splice(insertIndex, 0, request);
    }
    if (!hostState.processing) {
      this.processQueue(host).catch((error) => {
        console.error(`[RateLimiter] Error processing queue for ${host}:`, error);
      });
    }
  }

  /** Drain the queue of `host`, honouring the concurrency cap, back-off window and base delay. */
  async processQueue(host) {
    const hostState = this.hosts.get(host);
    if (!hostState || hostState.processing) {
      return;
    }
    hostState.processing = true;
    try {
      while (hostState.queue.length > 0) {
        // Global cap reached: poll again shortly.
        if (this.activeRequests.size >= this.maxConcurrent) {
          await this.wait(100);
          continue;
        }
        // Host is backing off: sleep in slices of at most one second.
        if (Date.now() < hostState.backoffUntil) {
          const waitTime = hostState.backoffUntil - Date.now();
          await this.wait(Math.min(waitTime, 1e3));
          continue;
        }
        const timeSinceLastRequest = Date.now() - hostState.lastRequest;
        if (timeSinceLastRequest < this.baseDelay) {
          await this.wait(this.baseDelay - timeSinceLastRequest);
          continue;
        }
        const request = hostState.queue.shift();
        const requestId = `${host}-${Date.now()}-${Math.random()}`;
        this.activeRequests.add(requestId);
        try {
          hostState.lastRequest = Date.now();
          const result = await request.operation();
          // A success clears any back-off state of the host.
          hostState.backoffMultiplier = 1;
          hostState.backoffUntil = 0;
          request.resolve(result);
        } catch (error) {
          await this.handleRequestError(hostState, request, error);
        } finally {
          this.activeRequests.delete(requestId);
        }
      }
    } finally {
      hostState.processing = false;
    }
  }

  /** Re-queue `request` when the failure is retryable, otherwise reject its promise. */
  async handleRequestError(hostState, request, error) {
    // Operations may reject with anything, so never assume an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    const shouldRetry = this.shouldRetry(error, request);
    if (shouldRetry && request.retryCount < request.maxRetries) {
      request.retryCount++;
      if (this.shouldBackoff(error)) {
        const backoffTime = Math.min(
          this.baseDelay * hostState.backoffMultiplier * Math.pow(2, request.retryCount),
          this.maxBackoff
        );
        hostState.backoffUntil = Date.now() + backoffTime;
        hostState.backoffMultiplier = Math.min(hostState.backoffMultiplier * 1.5, 10);
        console.warn(
          `[RateLimiter] Backing off ${request.host} for ${backoffTime}ms (attempt ${request.retryCount}/${request.maxRetries}): ${reason}`
        );
      }
      request.priority = Math.max(request.priority - 1, -10);
      // The retry goes to the front of the queue regardless of its lowered priority.
      hostState.queue.unshift(request);
    } else {
      console.error(
        `[RateLimiter] Request failed for ${request.host} (${request.retryCount}/${request.maxRetries} retries): ${reason}`
      );
      request.reject(error);
    }
  }

  /** True when `error` looks transient and `request` still has retry budget. */
  shouldRetry(error, request) {
    if (request.retryCount >= request.maxRetries) {
      return false;
    }
    const code = error?.code;
    if (code === "ENOTFOUND" || code === "ECONNREFUSED") {
      return true;
    }
    const status = error?.status;
    if (status) {
      return status === 408 || status === 429 || status >= 500;
    }
    if (error?.name === "AbortError") {
      return true;
    }
    return typeof error?.message === "string" && error.message.includes("timeout");
  }

  /** True when `error` should slow down the whole host, not just the failed request. */
  shouldBackoff(error) {
    const status = error?.status;
    if (status) {
      return status === 429 || status >= 500;
    }
    const code = error?.code;
    return code === "ECONNREFUSED" || code === "ENOTFOUND";
  }

  /** Promise that resolves after `ms` milliseconds. */
  wait(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Snapshot of every host's queue state plus the global concurrency counters. */
  getStats() {
    const stats = {};
    this.hosts.forEach((state, host) => {
      stats[host] = {
        queueLength: state.queue.length,
        processing: state.processing,
        backoffUntil: state.backoffUntil,
        backoffMultiplier: state.backoffMultiplier,
        lastRequest: state.lastRequest
      };
    });
    return {
      hosts: stats,
      activeRequests: this.activeRequests.size,
      maxConcurrent: this.maxConcurrent
    };
  }
};
|
|
299
|
+
// Module-wide limiter instance: one request per second per host, back-off
// capped at 30 seconds, at most ten operations in flight.
var globalRateLimiter = new ScrapingRateLimiter({
  maxConcurrent: 10,
  maxBackoff: 30000,
  requestsPerSecond: 1
});
|
|
304
|
+
|
|
305
|
+
// src/extractors/robots-checker.ts
|
|
306
|
+
/**
 * Downloads, caches and evaluates robots.txt files.
 *
 * Every lookup is best-effort: when robots.txt is missing or cannot be
 * fetched, the URL is treated as allowed.
 */
var RobotsChecker = class {
  constructor() {
    // Parsed robots.txt records keyed by the robots.txt URL.
    this.cache = /* @__PURE__ */ new Map();
    // Cached records stay valid for 24 hours.
    this.cacheTimeout = 24 * 60 * 60 * 1e3;
    this.userAgent = "AtomizeNews/1.0";
    // Abort a robots.txt download after 5 seconds.
    this.requestTimeout = 5e3;
  }

  /**
   * Check if a URL is allowed to be crawled according to robots.txt.
   *
   * @param {string} url - Absolute URL to test.
   * @returns {Promise<{allowed: boolean, sitemaps: string[], reason: string, crawlDelay?: number}>}
   *   Never rejects; any error yields `allowed: true` with the error in `reason`.
   */
  async isAllowed(url) {
    try {
      const urlObj = new URL(url);
      const robotsUrl = `${urlObj.protocol}//${urlObj.host}/robots.txt`;
      console.log(`\u{1F916} [Robots] Checking ${url} against ${robotsUrl}`);
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      if (!robotsTxt) {
        return {
          allowed: true,
          sitemaps: [],
          reason: "No robots.txt found - allowing by default"
        };
      }
      // Rules may target the query string (e.g. "/*?"), so match path + query.
      const result = this.checkRules(`${urlObj.pathname}${urlObj.search}`, robotsTxt);
      console.log(`\u{1F916} [Robots] ${result.allowed ? "\u2705 Allowed" : "\u274C Blocked"}: ${url} - ${result.reason}`);
      return result;
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error checking robots.txt for ${url}:`, error);
      return {
        allowed: true,
        sitemaps: [],
        reason: `Error checking robots.txt: ${error instanceof Error ? error.message : "Unknown error"}`
      };
    }
  }

  /**
   * Get sitemaps listed in robots.txt for a domain (fetched over https).
   *
   * @param {string} domain - Host name without scheme.
   * @returns {Promise<string[]>} Sitemap URLs, or `[]` when none are available.
   */
  async getSitemaps(domain) {
    try {
      const robotsUrl = `https://${domain}/robots.txt`;
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      return robotsTxt ? robotsTxt.sitemaps : [];
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error getting sitemaps for ${domain}:`, error);
      return [];
    }
  }

  /**
   * Get the recommended crawl delay for a domain.
   *
   * @param {string} domain - Host name without scheme.
   * @returns {Promise<number|undefined>} Delay in milliseconds, or `undefined` when not specified.
   */
  async getCrawlDelay(domain) {
    try {
      const robotsUrl = `https://${domain}/robots.txt`;
      const robotsTxt = await this.getRobotsTxt(robotsUrl);
      if (!robotsTxt) return void 0;
      const rule = this.findBestMatchingRule(robotsTxt.rules);
      return rule?.crawlDelay;
    } catch (error) {
      console.warn(`\u26A0\uFE0F [Robots] Error getting crawl delay for ${domain}:`, error);
      return void 0;
    }
  }

  /**
   * Return the parsed robots.txt for `robotsUrl`, from cache when still fresh.
   *
   * @param {string} robotsUrl - Full URL of the robots.txt file.
   * @returns {Promise<object|null>} Parsed record, or `null` on 404, timeout or any fetch error.
   */
  async getRobotsTxt(robotsUrl) {
    const cached = this.cache.get(robotsUrl);
    if (cached && Date.now() < cached.expiresAt) {
      return cached;
    }
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.requestTimeout);
    try {
      console.log(`\u{1F916} [Robots] Fetching ${robotsUrl}`);
      const response = await fetch(robotsUrl, {
        headers: {
          "User-Agent": this.userAgent
        },
        signal: controller.signal
      });
      if (!response.ok) {
        if (response.status === 404) {
          console.log(`\u{1F916} [Robots] No robots.txt found at ${robotsUrl}`);
          return null;
        }
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      const text = await response.text();
      const robotsTxt = this.parseRobotsTxt(text);
      this.cache.set(robotsUrl, robotsTxt);
      console.log(`\u{1F916} [Robots] Successfully parsed robots.txt for ${new URL(robotsUrl).hostname}`);
      return robotsTxt;
    } catch (error) {
      if (error instanceof Error && error.name === "AbortError") {
        console.warn(`\u26A0\uFE0F [Robots] Timeout fetching ${robotsUrl}`);
      } else {
        console.warn(`\u26A0\uFE0F [Robots] Error fetching ${robotsUrl}:`, error);
      }
      return null;
    } finally {
      // Always release the timer, including when fetch itself rejects.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Parse robots.txt text into per-agent rules plus the sitemap list.
   *
   * Consecutive `User-agent` lines form one group whose directives apply to
   * every agent of the group; one rule object is produced per agent.
   * Text after `#` on a line is a comment and is ignored.
   *
   * @param {string} text - Raw robots.txt contents.
   * @returns {{rules: object[], sitemaps: string[], fetchedAt: number, expiresAt: number}}
   */
  parseRobotsTxt(text) {
    const rules = [];
    const globalSitemaps = [];
    // Rules of the group currently being read.
    let group = [];
    // True while the lines read so far in this group are all User-agent lines.
    let readingAgents = false;
    const flushGroup = () => {
      for (const partial of group) {
        rules.push(this.completeRule(partial));
      }
      group = [];
    };
    for (const rawLine of text.split("\n")) {
      const line = rawLine.replace(/#.*$/, "").trim();
      const separator = line.indexOf(":");
      if (!line || separator === -1) {
        continue;
      }
      const key = line.slice(0, separator).trim().toLowerCase();
      const value = line.slice(separator + 1).trim();
      switch (key) {
        case "user-agent":
          if (!readingAgents) {
            flushGroup();
          }
          group.push({
            userAgent: value.toLowerCase(),
            disallows: [],
            allows: [],
            sitemaps: []
          });
          readingAgents = true;
          break;
        case "disallow":
          readingAgents = false;
          // An empty Disallow means "nothing is disallowed" and adds no pattern.
          if (value) {
            for (const rule of group) rule.disallows.push(value);
          }
          break;
        case "allow":
          readingAgents = false;
          if (value) {
            for (const rule of group) rule.allows.push(value);
          }
          break;
        case "crawl-delay": {
          readingAgents = false;
          const delay = Number.parseFloat(value);
          if (!Number.isNaN(delay)) {
            // robots.txt states seconds; callers work in milliseconds.
            for (const rule of group) rule.crawlDelay = delay * 1e3;
          }
          break;
        }
        case "sitemap":
          // Sitemap lines are file-wide and do not end a User-agent run.
          if (value) {
            globalSitemaps.push(value);
            for (const rule of group) rule.sitemaps.push(value);
          }
          break;
      }
    }
    flushGroup();
    const now = Date.now();
    return {
      rules,
      sitemaps: globalSitemaps,
      fetchedAt: now,
      expiresAt: now + this.cacheTimeout
    };
  }

  /** Fill in defaults for a partially populated rule (an empty agent becomes "*"). */
  completeRule(partial) {
    return {
      userAgent: partial.userAgent || "*",
      disallows: partial.disallows || [],
      allows: partial.allows || [],
      crawlDelay: partial.crawlDelay,
      sitemaps: partial.sitemaps || []
    };
  }

  /**
   * Decide whether `path` may be crawled under the rule that applies to this crawler.
   * The longest matching pattern wins; when Allow and Disallow tie, Allow wins.
   *
   * @param {string} path - Path (optionally with query string) to test.
   * @param {object} robotsTxt - Record produced by `parseRobotsTxt`.
   * @returns {{allowed: boolean, sitemaps: string[], reason: string, crawlDelay?: number}}
   */
  checkRules(path, robotsTxt) {
    const rule = this.findBestMatchingRule(robotsTxt.rules);
    if (!rule) {
      return {
        allowed: true,
        sitemaps: robotsTxt.sitemaps,
        reason: "No applicable rules found"
      };
    }
    const longestMatch = (patterns) => {
      let best = null;
      for (const pattern of patterns) {
        if (this.matchesPattern(path, pattern) && (best === null || pattern.length > best.length)) {
          best = pattern;
        }
      }
      return best;
    };
    const allowPattern = longestMatch(rule.allows);
    const disallowPattern = longestMatch(rule.disallows);
    const verdict = {
      crawlDelay: rule.crawlDelay,
      sitemaps: robotsTxt.sitemaps
    };
    if (disallowPattern !== null && (allowPattern === null || disallowPattern.length > allowPattern.length)) {
      return { allowed: false, ...verdict, reason: `Blocked by pattern: ${disallowPattern}` };
    }
    if (allowPattern !== null) {
      return { allowed: true, ...verdict, reason: `Explicitly allowed by pattern: ${allowPattern}` };
    }
    return { allowed: true, ...verdict, reason: "No matching disallow rules" };
  }

  /**
   * Pick the rule for this crawler: one naming it (with or without the
   * "/version" suffix) if present, otherwise the "*" rule, otherwise `null`.
   */
  findBestMatchingRule(rules) {
    const fullAgent = this.userAgent.toLowerCase();
    // robots.txt files normally name the product only, e.g. "atomizenews".
    const productToken = fullAgent.split("/")[0].trim();
    const specific = rules.find(
      (rule) => rule.userAgent === fullAgent || rule.userAgent === productToken
    );
    if (specific) return specific;
    return rules.find((rule) => rule.userAgent === "*") || null;
  }

  /**
   * Test `path` against one robots.txt pattern. Patterns are prefix matches;
   * `*` matches any run of characters and a trailing `$` anchors the end.
   */
  matchesPattern(path, pattern) {
    if (pattern === "") {
      return false;
    }
    if (pattern === "/") {
      return true;
    }
    const anchored = pattern.endsWith("$");
    const body = anchored ? pattern.slice(0, -1) : pattern;
    if (!anchored && !body.includes("*")) {
      return path.startsWith(body);
    }
    // Escape every literal piece and join the pieces with ".*"; runs of "*"
    // are collapsed first so a hostile file cannot build a slow expression.
    const source = body
      .replace(/\*+/g, "*")
      .split("*")
      .map((piece) => piece.replace(/[.+?^${}()|[\]\\]/g, "\\$&"))
      .join(".*");
    return new RegExp(`^${source}${anchored ? "$" : ""}`).test(path);
  }

  // Clear cache (useful for testing)
  clearCache() {
    this.cache.clear();
  }

  // Get cache stats
  getCacheStats() {
    return {
      size: this.cache.size,
      entries: Array.from(this.cache.entries()).map(([url, data]) => ({
        url,
        fetchedAt: new Date(data.fetchedAt).toISOString(),
        expiresAt: new Date(data.expiresAt).toISOString(),
        rulesCount: data.rules.length,
        sitemapsCount: data.sitemaps.length
      }))
    };
  }
};
|
|
559
|
+
// Module-wide RobotsChecker instance; callers that use it share one robots.txt cache.
var globalRobotsChecker = new RobotsChecker();
|
|
560
|
+
|
|
561
|
+
// src/extractors/rss-discovery.ts
|
|
562
|
+
var RSSDiscovery = class {
|
|
563
|
+
constructor() {
|
|
564
|
+
this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
|
|
565
|
+
this.timeout = 1e4;
|
|
566
|
+
}
|
|
567
|
+
// 10 seconds
|
|
568
|
+
// private readonly maxRedirects = 3; // Currently unused
|
|
569
|
+
/**
|
|
570
|
+
* Discover RSS feeds from a given URL
|
|
571
|
+
*/
|
|
572
|
+
async discoverFeeds(url) {
|
|
573
|
+
console.log(`\u{1F50D} [RSSDiscovery] Starting feed discovery for ${url}`);
|
|
574
|
+
const feeds = /* @__PURE__ */ new Map();
|
|
575
|
+
try {
|
|
576
|
+
const directFeed = await this.checkDirectFeed(url);
|
|
577
|
+
if (directFeed) {
|
|
578
|
+
feeds.set(directFeed.url, directFeed);
|
|
579
|
+
console.log(`\u2705 [RSSDiscovery] Direct feed found: ${directFeed.url}`);
|
|
580
|
+
return Array.from(feeds.values());
|
|
581
|
+
}
|
|
582
|
+
const robotsCheck = await globalRobotsChecker.isAllowed(url);
|
|
583
|
+
if (!robotsCheck.allowed) {
|
|
584
|
+
console.warn(`\u{1F916} [RSSDiscovery] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
|
|
585
|
+
return [];
|
|
586
|
+
}
|
|
587
|
+
const html = await this.fetchPage(url);
|
|
588
|
+
if (!html) {
|
|
589
|
+
return [];
|
|
590
|
+
}
|
|
591
|
+
const linkFeeds = this.extractFeedsFromHTML(html, url);
|
|
592
|
+
linkFeeds.forEach((feed) => feeds.set(feed.url, feed));
|
|
593
|
+
if (feeds.size === 0) {
|
|
594
|
+
const commonPathFeeds = await this.checkCommonPaths(url);
|
|
595
|
+
commonPathFeeds.forEach((feed) => feeds.set(feed.url, feed));
|
|
596
|
+
}
|
|
597
|
+
if (feeds.size === 0) {
|
|
598
|
+
const contentFeeds = await this.scanForFeedContent(html, url);
|
|
599
|
+
contentFeeds.forEach((feed) => feeds.set(feed.url, feed));
|
|
600
|
+
}
|
|
601
|
+
const discoveredFeeds = Array.from(feeds.values());
|
|
602
|
+
discoveredFeeds.sort((a, b) => b.confidence - a.confidence);
|
|
603
|
+
console.log(`\u{1F50D} [RSSDiscovery] Discovered ${discoveredFeeds.length} feeds for ${url}`);
|
|
604
|
+
return discoveredFeeds;
|
|
605
|
+
} catch (error) {
|
|
606
|
+
console.error(`\u274C [RSSDiscovery] Error discovering feeds for ${url}:`, error);
|
|
607
|
+
return [];
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
/**
|
|
611
|
+
* Check if the URL itself is a direct feed
|
|
612
|
+
*/
|
|
613
|
+
async checkDirectFeed(url) {
|
|
614
|
+
try {
|
|
615
|
+
const response = await globalRateLimiter.execute(url, async () => {
|
|
616
|
+
const controller = new AbortController();
|
|
617
|
+
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
618
|
+
try {
|
|
619
|
+
const res = await fetch(url, {
|
|
620
|
+
method: "HEAD",
|
|
621
|
+
headers: { "User-Agent": this.userAgent },
|
|
622
|
+
signal: controller.signal
|
|
623
|
+
});
|
|
624
|
+
clearTimeout(timeoutId);
|
|
625
|
+
return res;
|
|
626
|
+
} catch (error) {
|
|
627
|
+
clearTimeout(timeoutId);
|
|
628
|
+
throw error;
|
|
629
|
+
}
|
|
630
|
+
});
|
|
631
|
+
const contentType = response.headers.get("content-type") || "";
|
|
632
|
+
if (this.isFeedContentType(contentType)) {
|
|
633
|
+
const type = this.determineFeedType(contentType);
|
|
634
|
+
return {
|
|
635
|
+
url,
|
|
636
|
+
type,
|
|
637
|
+
source: "link-tag",
|
|
638
|
+
confidence: 1
|
|
639
|
+
};
|
|
640
|
+
}
|
|
641
|
+
return null;
|
|
642
|
+
} catch (error) {
|
|
643
|
+
return null;
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
/**
|
|
647
|
+
* Fetch HTML page content
|
|
648
|
+
*/
|
|
649
|
+
async fetchPage(url) {
|
|
650
|
+
try {
|
|
651
|
+
return await globalRateLimiter.execute(url, async () => {
|
|
652
|
+
const controller = new AbortController();
|
|
653
|
+
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
654
|
+
try {
|
|
655
|
+
const response = await fetch(url, {
|
|
656
|
+
headers: { "User-Agent": this.userAgent },
|
|
657
|
+
signal: controller.signal
|
|
658
|
+
});
|
|
659
|
+
clearTimeout(timeoutId);
|
|
660
|
+
if (!response.ok) {
|
|
661
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
662
|
+
}
|
|
663
|
+
const contentType = response.headers.get("content-type") || "";
|
|
664
|
+
if (!contentType.includes("text/html")) {
|
|
665
|
+
throw new Error(`Not HTML content: ${contentType}`);
|
|
666
|
+
}
|
|
667
|
+
return await response.text();
|
|
668
|
+
} catch (error) {
|
|
669
|
+
clearTimeout(timeoutId);
|
|
670
|
+
throw error;
|
|
671
|
+
}
|
|
672
|
+
});
|
|
673
|
+
} catch (error) {
|
|
674
|
+
console.error(`\u274C [RSSDiscovery] Error fetching page ${url}:`, error);
|
|
675
|
+
return null;
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
/**
|
|
679
|
+
* Extract feed URLs from HTML link tags
|
|
680
|
+
*/
|
|
681
|
+
extractFeedsFromHTML(html, baseUrl) {
|
|
682
|
+
const feeds = [];
|
|
683
|
+
try {
|
|
684
|
+
const $ = cheerio.load(html);
|
|
685
|
+
$('link[rel="alternate"]').each((_, element) => {
|
|
686
|
+
const $link = $(element);
|
|
687
|
+
const type = $link.attr("type");
|
|
688
|
+
const href = $link.attr("href");
|
|
689
|
+
const title = $link.attr("title");
|
|
690
|
+
if (href && this.isFeedContentType(type || "")) {
|
|
691
|
+
const absoluteUrl = this.resolveUrl(href, baseUrl);
|
|
692
|
+
if (absoluteUrl) {
|
|
693
|
+
feeds.push({
|
|
694
|
+
url: absoluteUrl,
|
|
695
|
+
title: title || void 0,
|
|
696
|
+
type: this.determineFeedType(type || ""),
|
|
697
|
+
source: "link-tag",
|
|
698
|
+
confidence: 0.9
|
|
699
|
+
});
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
});
|
|
703
|
+
$("a[href]").each((_, element) => {
|
|
704
|
+
const $link = $(element);
|
|
705
|
+
const href = $link.attr("href");
|
|
706
|
+
const text = $link.text().toLowerCase().trim();
|
|
707
|
+
if (href && this.isFeedLikeLink(href, text)) {
|
|
708
|
+
const absoluteUrl = this.resolveUrl(href, baseUrl);
|
|
709
|
+
if (absoluteUrl && !feeds.some((f) => f.url === absoluteUrl)) {
|
|
710
|
+
feeds.push({
|
|
711
|
+
url: absoluteUrl,
|
|
712
|
+
title: $link.text().trim() || void 0,
|
|
713
|
+
type: this.guessFeedType(href),
|
|
714
|
+
source: "content-scan",
|
|
715
|
+
confidence: 0.6
|
|
716
|
+
});
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
});
|
|
720
|
+
} catch (error) {
|
|
721
|
+
console.error(`\u274C [RSSDiscovery] Error parsing HTML for feeds:`, error);
|
|
722
|
+
}
|
|
723
|
+
return feeds;
|
|
724
|
+
}
|
|
725
|
+
/**
|
|
726
|
+
* Check common feed paths
|
|
727
|
+
*/
|
|
728
|
+
async checkCommonPaths(url) {
|
|
729
|
+
const baseUrl = new URL(url);
|
|
730
|
+
const commonPaths = [
|
|
731
|
+
"/feed/",
|
|
732
|
+
"/feed.xml",
|
|
733
|
+
"/rss/",
|
|
734
|
+
"/rss.xml",
|
|
735
|
+
"/feeds/",
|
|
736
|
+
"/feeds.xml",
|
|
737
|
+
"/atom.xml",
|
|
738
|
+
"/index.xml",
|
|
739
|
+
"/blog/feed/",
|
|
740
|
+
"/blog/rss.xml",
|
|
741
|
+
"/news/feed/",
|
|
742
|
+
"/news/rss.xml"
|
|
743
|
+
];
|
|
744
|
+
const feeds = [];
|
|
745
|
+
for (const path of commonPaths) {
|
|
746
|
+
try {
|
|
747
|
+
const testUrl = `${baseUrl.protocol}//${baseUrl.host}${path}`;
|
|
748
|
+
const robotsCheck = await globalRobotsChecker.isAllowed(testUrl);
|
|
749
|
+
if (!robotsCheck.allowed) {
|
|
750
|
+
continue;
|
|
751
|
+
}
|
|
752
|
+
const isValid = await this.validateFeedUrl(testUrl);
|
|
753
|
+
if (isValid) {
|
|
754
|
+
feeds.push({
|
|
755
|
+
url: testUrl,
|
|
756
|
+
type: this.guessFeedType(path),
|
|
757
|
+
source: "common-path",
|
|
758
|
+
confidence: 0.7
|
|
759
|
+
});
|
|
760
|
+
}
|
|
761
|
+
} catch (error) {
|
|
762
|
+
continue;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
return feeds;
|
|
766
|
+
}
|
|
767
|
+
/**
|
|
768
|
+
* Scan HTML content for feed-like patterns
|
|
769
|
+
*/
|
|
770
|
+
async scanForFeedContent(html, baseUrl) {
|
|
771
|
+
const feeds = [];
|
|
772
|
+
try {
|
|
773
|
+
const $ = cheerio.load(html);
|
|
774
|
+
const text = $.text();
|
|
775
|
+
const urlRegex = /https?:\/\/[^\s]+(?:feed|rss|atom)[^\s]*/gi;
|
|
776
|
+
const matches = text.match(urlRegex);
|
|
777
|
+
if (matches) {
|
|
778
|
+
for (const match of matches) {
|
|
779
|
+
const cleanUrl = match.replace(/[.,;:!?)]$/, "");
|
|
780
|
+
const absoluteUrl = this.resolveUrl(cleanUrl, baseUrl);
|
|
781
|
+
if (absoluteUrl && !feeds.some((f) => f.url === absoluteUrl)) {
|
|
782
|
+
const isValid = await this.validateFeedUrl(absoluteUrl);
|
|
783
|
+
if (isValid) {
|
|
784
|
+
feeds.push({
|
|
785
|
+
url: absoluteUrl,
|
|
786
|
+
type: this.guessFeedType(absoluteUrl),
|
|
787
|
+
source: "content-scan",
|
|
788
|
+
confidence: 0.5
|
|
789
|
+
});
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
} catch (error) {
|
|
795
|
+
console.error(`\u274C [RSSDiscovery] Error scanning content for feeds:`, error);
|
|
796
|
+
}
|
|
797
|
+
return feeds;
|
|
798
|
+
}
|
|
799
|
+
/**
|
|
800
|
+
* Validate if a URL is actually a feed
|
|
801
|
+
*/
|
|
802
|
+
async validateFeedUrl(url) {
|
|
803
|
+
try {
|
|
804
|
+
return await globalRateLimiter.execute(url, async () => {
|
|
805
|
+
const controller = new AbortController();
|
|
806
|
+
const timeoutId = setTimeout(() => controller.abort(), 5e3);
|
|
807
|
+
try {
|
|
808
|
+
const response = await fetch(url, {
|
|
809
|
+
method: "HEAD",
|
|
810
|
+
headers: { "User-Agent": this.userAgent },
|
|
811
|
+
signal: controller.signal
|
|
812
|
+
});
|
|
813
|
+
clearTimeout(timeoutId);
|
|
814
|
+
if (!response.ok) {
|
|
815
|
+
return false;
|
|
816
|
+
}
|
|
817
|
+
const contentType = response.headers.get("content-type") || "";
|
|
818
|
+
return this.isFeedContentType(contentType);
|
|
819
|
+
} catch (error) {
|
|
820
|
+
clearTimeout(timeoutId);
|
|
821
|
+
return false;
|
|
822
|
+
}
|
|
823
|
+
});
|
|
824
|
+
} catch (error) {
|
|
825
|
+
return false;
|
|
826
|
+
}
|
|
827
|
+
}
|
|
828
|
+
/**
|
|
829
|
+
* Resolve relative URLs to absolute URLs
|
|
830
|
+
*/
|
|
831
|
+
resolveUrl(url, baseUrl) {
|
|
832
|
+
try {
|
|
833
|
+
return new URL(url, baseUrl).toString();
|
|
834
|
+
} catch {
|
|
835
|
+
return null;
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
/**
|
|
839
|
+
* Check if content type indicates a feed
|
|
840
|
+
*/
|
|
841
|
+
isFeedContentType(contentType) {
|
|
842
|
+
const lowerType = contentType.toLowerCase();
|
|
843
|
+
return lowerType.includes("application/rss+xml") || lowerType.includes("application/atom+xml") || lowerType.includes("application/rdf+xml") || lowerType.includes("text/xml") || lowerType.includes("application/xml");
|
|
844
|
+
}
|
|
845
|
+
/**
|
|
846
|
+
* Determine feed type from content type
|
|
847
|
+
*/
|
|
848
|
+
determineFeedType(contentType) {
|
|
849
|
+
const lowerType = contentType.toLowerCase();
|
|
850
|
+
if (lowerType.includes("atom")) return "atom";
|
|
851
|
+
if (lowerType.includes("rdf")) return "rdf";
|
|
852
|
+
return "rss";
|
|
853
|
+
}
|
|
854
|
+
/**
|
|
855
|
+
* Guess feed type from URL or text
|
|
856
|
+
*/
|
|
857
|
+
guessFeedType(urlOrText) {
|
|
858
|
+
const lower = urlOrText.toLowerCase();
|
|
859
|
+
if (lower.includes("atom")) return "atom";
|
|
860
|
+
if (lower.includes("rdf")) return "rdf";
|
|
861
|
+
return "rss";
|
|
862
|
+
}
|
|
863
|
+
/**
|
|
864
|
+
* Check if a link looks like it could be a feed
|
|
865
|
+
*/
|
|
866
|
+
isFeedLikeLink(href, text) {
|
|
867
|
+
const lowerHref = href.toLowerCase();
|
|
868
|
+
const lowerText = text.toLowerCase();
|
|
869
|
+
const feedKeywords = ["rss", "feed", "atom", "xml", "syndication"];
|
|
870
|
+
return feedKeywords.some(
|
|
871
|
+
(keyword) => lowerHref.includes(keyword) || lowerText.includes(keyword)
|
|
872
|
+
);
|
|
873
|
+
}
|
|
874
|
+
};
|
|
875
|
+
// Module-level RSSDiscovery instance, created once when this file is loaded.
var globalRSSDiscovery = new RSSDiscovery();
|
|
876
|
+
|
|
877
|
+
// src/extractors/sitemap-parser.ts
|
|
878
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
879
|
+
/**
 * Discovers, downloads and parses XML sitemaps (plain sitemaps and sitemap
 * indexes). All network access goes through `globalRateLimiter`, and
 * `parseSitemap` consults `globalRobotsChecker` before fetching.
 */
var SitemapParser = class {
  constructor() {
    this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
    // 15 seconds for sitemaps
    this.timeout = 15e3;
    // 50MB max
    this.maxSitemapSize = 50 * 1024 * 1024;
    // Max entries per sitemap
    this.maxEntries = 5e4;
    // 48 hours in ms
    this.recentTimeframe = 48 * 60 * 60 * 1e3;
  }
  /**
   * Parse the sitemap at `url` and return its entries.
   *
   * @param {string} url - Sitemap or sitemap-index URL.
   * @param {object} [options] - Forwarded to the parsers: `maxEntries`,
   *   `filterRecent`, `hoursBack`, `includeImages`, `includeNews`.
   * @returns {Promise<object[]>} Parsed entries; an empty array when the URL
   *   is blocked by robots.txt, cannot be fetched, or parsing fails.
   */
  async parseSitemap(url, options = {}) {
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Starting to parse ${url}`);
    try {
      const robotsCheck = await globalRobotsChecker.isAllowed(url);
      if (!robotsCheck.allowed) {
        console.warn(`\u{1F916} [Sitemap] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
        return [];
      }
      const xml = await this.fetchSitemap(url);
      if (!xml) {
        return [];
      }
      if (this.isSitemapIndex(xml)) {
        return await this.parseSitemapIndex(xml, options);
      }
      return this.parseRegularSitemap(xml, options);
    } catch (error) {
      console.error(`\u274C [Sitemap] Error parsing sitemap ${url}:`, error);
      return [];
    }
  }
  /**
   * Discover sitemap URLs for a domain: those reported by the robots checker
   * plus any well-known paths that answer a HEAD request.
   *
   * @param {string} domain - Host name (no scheme); probed over https.
   * @returns {Promise<string[]>} De-duplicated sitemap URLs, or [] on error.
   */
  async discoverSitemaps(domain) {
    try {
      // A Set keeps discovery order while guaranteeing uniqueness, so the
      // logged count matches what is actually returned.
      const found = new Set(await globalRobotsChecker.getSitemaps(domain));
      const commonPaths = [
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/sitemaps.xml",
        "/sitemap/",
        "/news-sitemap.xml"
      ];
      for (const path of commonPaths) {
        const sitemapUrl = `https://${domain}${path}`;
        if (found.has(sitemapUrl)) {
          continue;
        }
        if (await this.checkSitemapExists(sitemapUrl)) {
          found.add(sitemapUrl);
        }
      }
      const sitemaps = Array.from(found);
      console.log(`\u{1F5FA}\uFE0F [Sitemap] Discovered ${sitemaps.length} sitemaps for ${domain}`);
      return sitemaps;
    } catch (error) {
      console.error(`\u274C [Sitemap] Error discovering sitemaps for ${domain}:`, error);
      return [];
    }
  }
  /**
   * Collect entries modified within the last `hoursBack` hours across all
   * sitemaps discovered for a domain, newest first.
   *
   * @param {string} domain - Host name to inspect.
   * @param {{hoursBack?: number, maxEntries?: number}} [options] - Window
   *   size in hours (default 48) and overall result cap (default 1000).
   * @returns {Promise<object[]>} At most `maxEntries` entries that carry a
   *   `lastmod` inside the window, sorted by `lastmod` descending.
   */
  async getRecentEntries(domain, options = {}) {
    const hoursBack = options.hoursBack || 48;
    const maxEntries = options.maxEntries || 1e3;
    const sitemaps = await this.discoverSitemaps(domain);
    // Distribute the quota, but never let a share round down to 0: a zero
    // would be treated as "unset" downstream and fall back to the 50k default.
    const perSitemapQuota = Math.max(1, Math.floor(maxEntries / Math.max(1, sitemaps.length)));
    const allEntries = [];
    for (const sitemapUrl of sitemaps) {
      try {
        const entries = await this.parseSitemap(sitemapUrl, {
          filterRecent: true,
          // Keep the per-sitemap recency filter in step with the final one.
          hoursBack,
          maxEntries: perSitemapQuota,
          includeNews: true
        });
        allEntries.push(...entries);
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Sitemap] Error parsing ${sitemapUrl}:`, error);
      }
    }
    const cutoffTime = new Date(Date.now() - hoursBack * 60 * 60 * 1e3);
    const recentEntries = allEntries
      .filter((entry) => entry.lastmod && entry.lastmod >= cutoffTime)
      .sort((a, b) => b.lastmod.getTime() - a.lastmod.getTime())
      .slice(0, maxEntries);
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Found ${recentEntries.length} recent entries from ${domain}`);
    return recentEntries;
  }
  /**
   * Download a sitemap document.
   *
   * @param {string} url - Sitemap URL.
   * @returns {Promise<string|null>} The response body, or null when the
   *   request fails, times out waiting for the response, returns a non-2xx
   *   status, or exceeds `maxSitemapSize`.
   */
  async fetchSitemap(url) {
    try {
      return await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.timeout);
        let response;
        try {
          response = await fetch(url, {
            headers: {
              "User-Agent": this.userAgent,
              "Accept": "application/xml, text/xml, */*"
            },
            signal: controller.signal
          });
        } finally {
          // The timer only guards the wait for the response headers.
          clearTimeout(timeoutId);
        }
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }
        const contentLength = response.headers.get("content-length");
        if (contentLength && Number.parseInt(contentLength, 10) > this.maxSitemapSize) {
          throw new Error(`Sitemap too large: ${contentLength} bytes`);
        }
        const xml = await response.text();
        // Second guard for responses without a content-length header; this
        // compares string length (UTF-16 code units), not encoded bytes.
        if (xml.length > this.maxSitemapSize) {
          throw new Error(`Sitemap too large: ${xml.length} bytes`);
        }
        return xml;
      });
    } catch (error) {
      console.error(`\u274C [Sitemap] Error fetching ${url}:`, error);
      return null;
    }
  }
  /**
   * Probe a sitemap URL with a HEAD request (5 second timeout).
   *
   * @param {string} url - Candidate sitemap URL.
   * @returns {Promise<boolean>} True only for a 2xx response; any error
   *   yields false.
   */
  async checkSitemapExists(url) {
    try {
      return await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), 5e3);
        try {
          const response = await fetch(url, {
            method: "HEAD",
            headers: { "User-Agent": this.userAgent },
            signal: controller.signal
          });
          return response.ok;
        } catch (error) {
          return false;
        } finally {
          clearTimeout(timeoutId);
        }
      });
    } catch (error) {
      return false;
    }
  }
  /**
   * @param {string} xml - Sitemap document text.
   * @returns {boolean} True when the text contains a `<sitemapindex` tag.
   */
  isSitemapIndex(xml) {
    return xml.includes("<sitemapindex") || xml.includes("</sitemapindex>");
  }
  /**
   * Parse a sitemap index: fetch up to the first 10 child sitemaps it lists
   * and concatenate their entries.
   *
   * @param {string} xml - Sitemap index document.
   * @param {object} [options] - Same options as `parseSitemap`; `maxEntries`
   *   is split evenly between the child sitemaps that are actually fetched.
   * @returns {Promise<object[]>} Entries from every child that could be
   *   fetched and parsed.
   */
  async parseSitemapIndex(xml, options = {}) {
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsing sitemap index`);
    const $ = cheerio2.load(xml, { xmlMode: true });
    const sitemaps = [];
    const allEntries = [];
    $("sitemap").each((_, element) => {
      const loc = $(element).find("loc").first().text().trim();
      if (loc) {
        sitemaps.push(loc);
      }
    });
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Found ${sitemaps.length} sitemaps in index`);
    const selected = sitemaps.slice(0, 10);
    const totalQuota = options.maxEntries || this.maxEntries;
    // Divide by the number of sitemaps that will really be read, and keep the
    // share at 1 or more so it is never mistaken for "no limit given".
    const entriesPerSitemap = Math.max(1, Math.floor(totalQuota / Math.max(1, selected.length)));
    for (const sitemapUrl of selected) {
      try {
        const sitemapXml = await this.fetchSitemap(sitemapUrl);
        if (sitemapXml) {
          const entries = this.parseRegularSitemap(sitemapXml, {
            ...options,
            maxEntries: entriesPerSitemap
          });
          allEntries.push(...entries);
        }
      } catch (error) {
        console.warn(`\u26A0\uFE0F [Sitemap] Error parsing sitemap ${sitemapUrl}:`, error);
      }
    }
    return allEntries;
  }
  /**
   * Parse the `<url>` entries of a regular sitemap.
   *
   * @param {string} xml - Sitemap document.
   * @param {object} [options]
   * @param {number} [options.maxEntries] - Stop after this many entries
   *   (defaults to `this.maxEntries`).
   * @param {boolean} [options.filterRecent] - Drop entries whose `lastmod`
   *   is older than the recency window. Entries without a valid `lastmod`
   *   are kept.
   * @param {number} [options.hoursBack] - Recency window in hours; when
   *   absent the window is `this.recentTimeframe`.
   * @param {boolean} [options.includeImages] - Collect `image:image` data.
   * @param {boolean} [options.includeNews] - Collect `news:news` data.
   * @returns {object[]} Entries with `url` and, when present and valid,
   *   `lastmod`, `changefreq`, `priority`, `images`, `news`.
   */
  parseRegularSitemap(xml, options = {}) {
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsing regular sitemap`);
    const $ = cheerio2.load(xml, { xmlMode: true });
    const entries = [];
    const maxEntries = options.maxEntries || this.maxEntries;
    const hasCustomWindow = typeof options.hoursBack === "number" && options.hoursBack > 0;
    const windowMs = hasCustomWindow ? options.hoursBack * 60 * 60 * 1e3 : this.recentTimeframe;
    const cutoffTime = options.filterRecent ? new Date(Date.now() - windowMs) : null;
    const textOf = ($scope, selector) => $scope.find(selector).first().text().trim();
    $("url").each((_index, element) => {
      if (entries.length >= maxEntries) {
        // Returning false stops the cheerio iteration.
        return false;
      }
      const $element = $(element);
      const loc = textOf($element, "loc");
      if (!loc) return void 0;
      const entry = { url: loc };
      const lastmodText = textOf($element, "lastmod");
      if (lastmodText) {
        const lastmod = new Date(lastmodText);
        if (!Number.isNaN(lastmod.getTime())) {
          entry.lastmod = lastmod;
        }
      }
      if (cutoffTime && entry.lastmod && entry.lastmod < cutoffTime) {
        return void 0;
      }
      const changefreq = textOf($element, "changefreq");
      if (changefreq) {
        entry.changefreq = changefreq;
      }
      const priorityText = textOf($element, "priority");
      if (priorityText) {
        const priority = Number.parseFloat(priorityText);
        if (!Number.isNaN(priority)) {
          entry.priority = priority;
        }
      }
      if (options.includeImages) {
        const images = [];
        $element.find("image\\:image").each((_, imgElement) => {
          const $img = $(imgElement);
          const imgLoc = textOf($img, "image\\:loc");
          if (imgLoc) {
            images.push({
              loc: imgLoc,
              caption: textOf($img, "image\\:caption") || void 0,
              title: textOf($img, "image\\:title") || void 0
            });
          }
        });
        if (images.length > 0) {
          entry.images = images;
        }
      }
      if (options.includeNews) {
        const $news = $element.find("news\\:news");
        const title = $news.length > 0 ? textOf($news, "news\\:title") : "";
        if (title) {
          entry.news = { title };
          const pubDateText = textOf($news, "news\\:publication_date");
          if (pubDateText) {
            const pubDate = new Date(pubDateText);
            if (!Number.isNaN(pubDate.getTime())) {
              entry.news.publishedDate = pubDate;
            }
          }
          const keywords = textOf($news, "news\\:keywords");
          if (keywords) {
            entry.news.keywords = keywords.split(",").map((k) => k.trim());
          }
        }
      }
      entries.push(entry);
      return void 0;
    });
    console.log(`\u{1F5FA}\uFE0F [Sitemap] Parsed ${entries.length} entries from sitemap`);
    return entries;
  }
  /**
   * Validate sitemap format.
   *
   * Checks for a `<urlset>` or `<sitemapindex>` root, the 50,000 URL limit,
   * and per-entry `<loc>`, `<lastmod>` and `<priority>` validity.
   *
   * @param {string} xml - Sitemap document.
   * @returns {{valid: boolean, errors: string[]}} `valid` is true when no
   *   problems were recorded.
   */
  validateSitemapFormat(xml) {
    const errors = [];
    try {
      const $ = cheerio2.load(xml, { xmlMode: true });
      const hasUrlset = $("urlset").length > 0;
      const hasSitemapIndex = $("sitemapindex").length > 0;
      if (!hasUrlset && !hasSitemapIndex) {
        errors.push("Missing required root element: <urlset> or <sitemapindex>");
      }
      if (hasUrlset) {
        const urlCount = $("url").length;
        if (urlCount > 5e4) {
          errors.push(`Too many URLs: ${urlCount} (max: 50,000)`);
        }
      }
      $("url").each((index, element) => {
        const $element = $(element);
        const loc = $element.find("loc").first().text().trim();
        if (!loc) {
          errors.push(`URL entry ${index + 1} missing <loc> element`);
        } else {
          try {
            new URL(loc);
          } catch {
            errors.push(`Invalid URL in entry ${index + 1}: ${loc}`);
          }
        }
        const lastmod = $element.find("lastmod").first().text().trim();
        if (lastmod && Number.isNaN(new Date(lastmod).getTime())) {
          errors.push(`Invalid lastmod date in entry ${index + 1}: ${lastmod}`);
        }
        const priority = $element.find("priority").first().text().trim();
        if (priority) {
          const priorityNum = Number.parseFloat(priority);
          if (Number.isNaN(priorityNum) || priorityNum < 0 || priorityNum > 1) {
            errors.push(`Invalid priority in entry ${index + 1}: ${priority} (must be 0-1)`);
          }
        }
      });
    } catch (error) {
      errors.push(`XML parsing error: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    return {
      valid: errors.length === 0,
      errors
    };
  }
};
|
|
1202
|
+
// Module-level SitemapParser instance, created once when this file is loaded.
var globalSitemapParser = new SitemapParser();
|
|
1203
|
+
|
|
1204
|
+
// src/extractors/html-scraper.ts
|
|
1205
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
1206
|
+
// Model identifiers keyed by tier name.
// NOTE(review): confirm these identifiers are still accepted by the
// Perplexity API before relying on the fallback path.
var PERPLEXITY_MODELS = {
  // Smaller "sonar" variant.
  SONAR: "llama-3.1-sonar-small-128k-online",
  // Larger "sonar" variant.
  SONAR_PRO: "llama-3.1-sonar-large-128k-online",
};
|
|
1210
|
+
var HTMLScraper = class {
|
|
1211
|
+
constructor() {
|
|
1212
|
+
this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
|
|
1213
|
+
this.timeout = 1e4;
|
|
1214
|
+
// 10 seconds
|
|
1215
|
+
this.defaultConfig = {
|
|
1216
|
+
selectors: {
|
|
1217
|
+
articleLinks: [
|
|
1218
|
+
"article a[href]",
|
|
1219
|
+
".article a[href]",
|
|
1220
|
+
".post a[href]",
|
|
1221
|
+
".story a[href]",
|
|
1222
|
+
".news-item a[href]",
|
|
1223
|
+
".content-item a[href]",
|
|
1224
|
+
"h1 a[href]",
|
|
1225
|
+
"h2 a[href]",
|
|
1226
|
+
"h3 a[href]",
|
|
1227
|
+
".headline a[href]",
|
|
1228
|
+
".title a[href]"
|
|
1229
|
+
],
|
|
1230
|
+
titleSelectors: [
|
|
1231
|
+
"h1",
|
|
1232
|
+
"h2",
|
|
1233
|
+
"h3",
|
|
1234
|
+
".headline",
|
|
1235
|
+
".title",
|
|
1236
|
+
".article-title",
|
|
1237
|
+
".post-title",
|
|
1238
|
+
".story-title"
|
|
1239
|
+
],
|
|
1240
|
+
dateSelectors: [
|
|
1241
|
+
"time[datetime]",
|
|
1242
|
+
".date",
|
|
1243
|
+
".published",
|
|
1244
|
+
".timestamp",
|
|
1245
|
+
".publish-date",
|
|
1246
|
+
".article-date"
|
|
1247
|
+
],
|
|
1248
|
+
excludeSelectors: [
|
|
1249
|
+
".advertisement",
|
|
1250
|
+
".ads",
|
|
1251
|
+
".sidebar",
|
|
1252
|
+
".footer",
|
|
1253
|
+
".navigation",
|
|
1254
|
+
".menu",
|
|
1255
|
+
".comments",
|
|
1256
|
+
".related"
|
|
1257
|
+
]
|
|
1258
|
+
},
|
|
1259
|
+
filters: {
|
|
1260
|
+
minTitleLength: 10,
|
|
1261
|
+
maxTitleLength: 200,
|
|
1262
|
+
includePatterns: [
|
|
1263
|
+
/\/article\//i,
|
|
1264
|
+
/\/post\//i,
|
|
1265
|
+
/\/story\//i,
|
|
1266
|
+
/\/news\//i,
|
|
1267
|
+
/\/blog\//i,
|
|
1268
|
+
/\/\d{4}\/\d{2}\/\d{2}\//,
|
|
1269
|
+
// Date patterns
|
|
1270
|
+
/\/\d{4}\/\d{2}\//
|
|
1271
|
+
],
|
|
1272
|
+
excludePatterns: [
|
|
1273
|
+
/\/(tag|category|author|search|archive)\//i,
|
|
1274
|
+
/\/(login|register|contact|about)\//i,
|
|
1275
|
+
/\.(pdf|jpg|jpeg|png|gif|mp4|zip|doc)$/i,
|
|
1276
|
+
/#/,
|
|
1277
|
+
// Skip hash links
|
|
1278
|
+
/javascript:/i,
|
|
1279
|
+
/mailto:/i
|
|
1280
|
+
]
|
|
1281
|
+
},
|
|
1282
|
+
limits: {
|
|
1283
|
+
maxLinksPerPage: 100,
|
|
1284
|
+
maxDepth: 3
|
|
1285
|
+
}
|
|
1286
|
+
};
|
|
1287
|
+
}
|
|
1288
|
+
/**
|
|
1289
|
+
* Extract article links from a webpage
|
|
1290
|
+
*/
|
|
1291
|
+
async extractArticleLinks(url, config = {}) {
|
|
1292
|
+
console.log(`\u{1F4F0} [HTMLScraper] Starting to extract articles from ${url}`);
|
|
1293
|
+
try {
|
|
1294
|
+
const robotsCheck = await globalRobotsChecker.isAllowed(url);
|
|
1295
|
+
if (!robotsCheck.allowed) {
|
|
1296
|
+
console.warn(`\u{1F916} [HTMLScraper] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
|
|
1297
|
+
if (config.perplexityFallback?.enabled && config.perplexityFallback?.useForRobotsBlocked) {
|
|
1298
|
+
console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback for robots-blocked URL`);
|
|
1299
|
+
return await this.extractWithPerplexity(url, config);
|
|
1300
|
+
}
|
|
1301
|
+
return [];
|
|
1302
|
+
}
|
|
1303
|
+
const html = await this.fetchPage(url);
|
|
1304
|
+
if (!html) {
|
|
1305
|
+
if (config.perplexityFallback?.enabled && config.perplexityFallback?.useForParseFailed) {
|
|
1306
|
+
console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback for failed fetch`);
|
|
1307
|
+
return await this.extractWithPerplexity(url, config);
|
|
1308
|
+
}
|
|
1309
|
+
return [];
|
|
1310
|
+
}
|
|
1311
|
+
const mergedConfig = this.mergeConfig(this.defaultConfig, config);
|
|
1312
|
+
const articles = this.parseArticleLinks(html, url, mergedConfig);
|
|
1313
|
+
if (articles.length === 0 && config.perplexityFallback?.enabled && config.perplexityFallback?.useForParseFailed) {
|
|
1314
|
+
console.log(`\u{1F504} [HTMLScraper] No articles found, attempting Perplexity fallback`);
|
|
1315
|
+
return await this.extractWithPerplexity(url, config);
|
|
1316
|
+
}
|
|
1317
|
+
console.log(`\u{1F4F0} [HTMLScraper] Extracted ${articles.length} article links from ${url}`);
|
|
1318
|
+
return articles;
|
|
1319
|
+
} catch (error) {
|
|
1320
|
+
console.error(`\u274C [HTMLScraper] Error extracting articles from ${url}:`, error);
|
|
1321
|
+
if (config.perplexityFallback?.enabled) {
|
|
1322
|
+
console.log(`\u{1F504} [HTMLScraper] Attempting Perplexity fallback after error`);
|
|
1323
|
+
return await this.extractWithPerplexity(url, config);
|
|
1324
|
+
}
|
|
1325
|
+
return [];
|
|
1326
|
+
}
|
|
1327
|
+
}
|
|
1328
|
+
/**
|
|
1329
|
+
* Extract articles from multiple pages with pagination support
|
|
1330
|
+
*/
|
|
1331
|
+
async extractFromMultiplePages(startUrl, config = {}, options = {}) {
|
|
1332
|
+
const maxPages = options.maxPages || 5;
|
|
1333
|
+
const allArticles = [];
|
|
1334
|
+
const visitedUrls = /* @__PURE__ */ new Set();
|
|
1335
|
+
const urlsToVisit = [startUrl];
|
|
1336
|
+
let pageCount = 0;
|
|
1337
|
+
while (urlsToVisit.length > 0 && pageCount < maxPages) {
|
|
1338
|
+
const currentUrl = urlsToVisit.shift();
|
|
1339
|
+
if (visitedUrls.has(currentUrl)) {
|
|
1340
|
+
continue;
|
|
1341
|
+
}
|
|
1342
|
+
visitedUrls.add(currentUrl);
|
|
1343
|
+
pageCount++;
|
|
1344
|
+
console.log(`\u{1F4F0} [HTMLScraper] Processing page ${pageCount}/${maxPages}: ${currentUrl}`);
|
|
1345
|
+
try {
|
|
1346
|
+
const articles = await this.extractArticleLinks(currentUrl, config);
|
|
1347
|
+
allArticles.push(...articles);
|
|
1348
|
+
if (pageCount < maxPages) {
|
|
1349
|
+
const nextPageUrls = await this.findNextPageUrls(currentUrl, options);
|
|
1350
|
+
for (const nextUrl of nextPageUrls) {
|
|
1351
|
+
if (!visitedUrls.has(nextUrl)) {
|
|
1352
|
+
urlsToVisit.push(nextUrl);
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
}
|
|
1356
|
+
} catch (error) {
|
|
1357
|
+
console.warn(`\u26A0\uFE0F [HTMLScraper] Error processing page ${currentUrl}:`, error);
|
|
1358
|
+
continue;
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1361
|
+
const uniqueArticles = this.deduplicateArticles(allArticles);
|
|
1362
|
+
uniqueArticles.sort((a, b) => b.confidence - a.confidence);
|
|
1363
|
+
console.log(`\u{1F4F0} [HTMLScraper] Total extracted ${uniqueArticles.length} unique articles from ${pageCount} pages`);
|
|
1364
|
+
return uniqueArticles;
|
|
1365
|
+
}
|
|
1366
|
+
async fetchPage(url) {
|
|
1367
|
+
try {
|
|
1368
|
+
return await globalRateLimiter.execute(url, async () => {
|
|
1369
|
+
const controller = new AbortController();
|
|
1370
|
+
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
1371
|
+
try {
|
|
1372
|
+
const response = await fetch(url, {
|
|
1373
|
+
headers: {
|
|
1374
|
+
"User-Agent": this.userAgent,
|
|
1375
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
|
|
1376
|
+
},
|
|
1377
|
+
signal: controller.signal
|
|
1378
|
+
});
|
|
1379
|
+
clearTimeout(timeoutId);
|
|
1380
|
+
if (!response.ok) {
|
|
1381
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
1382
|
+
}
|
|
1383
|
+
const contentType = response.headers.get("content-type") || "";
|
|
1384
|
+
if (!contentType.includes("text/html")) {
|
|
1385
|
+
throw new Error(`Not HTML content: ${contentType}`);
|
|
1386
|
+
}
|
|
1387
|
+
return await response.text();
|
|
1388
|
+
} catch (error) {
|
|
1389
|
+
clearTimeout(timeoutId);
|
|
1390
|
+
throw error;
|
|
1391
|
+
}
|
|
1392
|
+
});
|
|
1393
|
+
} catch (error) {
|
|
1394
|
+
console.error(`\u274C [HTMLScraper] Error fetching page ${url}:`, error);
|
|
1395
|
+
return null;
|
|
1396
|
+
}
|
|
1397
|
+
}
|
|
1398
|
+
parseArticleLinks(html, baseUrl, config) {
|
|
1399
|
+
const articles = [];
|
|
1400
|
+
try {
|
|
1401
|
+
const $ = cheerio3.load(html);
|
|
1402
|
+
const seenUrls = /* @__PURE__ */ new Set();
|
|
1403
|
+
config.selectors?.excludeSelectors?.forEach((selector) => {
|
|
1404
|
+
$(selector).remove();
|
|
1405
|
+
});
|
|
1406
|
+
config.selectors?.articleLinks?.forEach((selector) => {
|
|
1407
|
+
$(selector).each((_, element) => {
|
|
1408
|
+
const $link = $(element);
|
|
1409
|
+
const href = $link.attr("href");
|
|
1410
|
+
if (!href) return;
|
|
1411
|
+
const absoluteUrl = this.resolveUrl(href, baseUrl);
|
|
1412
|
+
if (!absoluteUrl || seenUrls.has(absoluteUrl)) {
|
|
1413
|
+
return;
|
|
1414
|
+
}
|
|
1415
|
+
if (!this.passesFilters(absoluteUrl, config.filters)) {
|
|
1416
|
+
return;
|
|
1417
|
+
}
|
|
1418
|
+
seenUrls.add(absoluteUrl);
|
|
1419
|
+
const article = this.extractArticleInfo($link, $, absoluteUrl);
|
|
1420
|
+
if (article && articles.length < (config.limits?.maxLinksPerPage || 100)) {
|
|
1421
|
+
articles.push(article);
|
|
1422
|
+
}
|
|
1423
|
+
});
|
|
1424
|
+
});
|
|
1425
|
+
const structuredArticles = this.extractStructuredData($, baseUrl);
|
|
1426
|
+
structuredArticles.forEach((article) => {
|
|
1427
|
+
if (!seenUrls.has(article.url)) {
|
|
1428
|
+
seenUrls.add(article.url);
|
|
1429
|
+
articles.push(article);
|
|
1430
|
+
}
|
|
1431
|
+
});
|
|
1432
|
+
} catch (error) {
|
|
1433
|
+
console.error(`\u274C [HTMLScraper] Error parsing HTML:`, error);
|
|
1434
|
+
}
|
|
1435
|
+
return articles;
|
|
1436
|
+
}
|
|
1437
|
+
extractArticleInfo($link, _$, url) {
|
|
1438
|
+
let title = $link.text().trim();
|
|
1439
|
+
let confidence = 0.5;
|
|
1440
|
+
let publishedDate;
|
|
1441
|
+
let description;
|
|
1442
|
+
if (!title || title.length < 5) {
|
|
1443
|
+
const $parent2 = $link.closest("article, .article, .post, .story, .news-item");
|
|
1444
|
+
if ($parent2.length > 0) {
|
|
1445
|
+
const betterTitle = $parent2.find("h1, h2, h3, .headline, .title").first().text().trim();
|
|
1446
|
+
if (betterTitle && betterTitle.length > title.length) {
|
|
1447
|
+
title = betterTitle;
|
|
1448
|
+
confidence += 0.2;
|
|
1449
|
+
}
|
|
1450
|
+
}
|
|
1451
|
+
}
|
|
1452
|
+
const $dateElement = $link.closest("article, .article, .post").find("time[datetime], .date, .published").first();
|
|
1453
|
+
if ($dateElement.length > 0) {
|
|
1454
|
+
const dateText = $dateElement.attr("datetime") || $dateElement.text().trim();
|
|
1455
|
+
if (dateText) {
|
|
1456
|
+
const date = this.parseDate(dateText);
|
|
1457
|
+
if (date) {
|
|
1458
|
+
publishedDate = date;
|
|
1459
|
+
confidence += 0.1;
|
|
1460
|
+
}
|
|
1461
|
+
}
|
|
1462
|
+
}
|
|
1463
|
+
const $parent = $link.closest("article, .article, .post, .story");
|
|
1464
|
+
if ($parent.length > 0) {
|
|
1465
|
+
description = $parent.find(".excerpt, .summary, p").first().text().trim();
|
|
1466
|
+
if (description && description.length > 50) {
|
|
1467
|
+
description = description.substring(0, 300) + "...";
|
|
1468
|
+
confidence += 0.1;
|
|
1469
|
+
}
|
|
1470
|
+
}
|
|
1471
|
+
if (this.isLikelyArticleUrl(url)) {
|
|
1472
|
+
confidence += 0.2;
|
|
1473
|
+
}
|
|
1474
|
+
if (title && title.length >= 20 && title.length <= 120) {
|
|
1475
|
+
confidence += 0.1;
|
|
1476
|
+
}
|
|
1477
|
+
if (!title || title.length < 10) {
|
|
1478
|
+
return null;
|
|
1479
|
+
}
|
|
1480
|
+
return {
|
|
1481
|
+
url,
|
|
1482
|
+
title,
|
|
1483
|
+
publishedDate,
|
|
1484
|
+
description,
|
|
1485
|
+
confidence: Math.min(confidence, 1),
|
|
1486
|
+
source: "link-text"
|
|
1487
|
+
};
|
|
1488
|
+
}
|
|
1489
|
+
extractStructuredData($, baseUrl) {
|
|
1490
|
+
const articles = [];
|
|
1491
|
+
$('script[type="application/ld+json"]').each((_, element) => {
|
|
1492
|
+
try {
|
|
1493
|
+
const jsonText = $(element).html();
|
|
1494
|
+
if (!jsonText) return;
|
|
1495
|
+
const data = JSON.parse(jsonText);
|
|
1496
|
+
const items = Array.isArray(data) ? data : [data];
|
|
1497
|
+
for (const item of items) {
|
|
1498
|
+
if (item["@type"] === "Article" || item["@type"] === "NewsArticle") {
|
|
1499
|
+
const url = item.url || item.mainEntityOfPage?.["@id"];
|
|
1500
|
+
if (url) {
|
|
1501
|
+
const absoluteUrl = this.resolveUrl(url, baseUrl);
|
|
1502
|
+
if (absoluteUrl) {
|
|
1503
|
+
articles.push({
|
|
1504
|
+
url: absoluteUrl,
|
|
1505
|
+
title: item.headline || item.name,
|
|
1506
|
+
publishedDate: item.datePublished ? new Date(item.datePublished) : void 0,
|
|
1507
|
+
description: item.description,
|
|
1508
|
+
confidence: 0.9,
|
|
1509
|
+
source: "structured-data"
|
|
1510
|
+
});
|
|
1511
|
+
}
|
|
1512
|
+
}
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
} catch (error) {
|
|
1516
|
+
}
|
|
1517
|
+
});
|
|
1518
|
+
return articles;
|
|
1519
|
+
}
|
|
1520
|
+
async findNextPageUrls(currentUrl, options) {
|
|
1521
|
+
try {
|
|
1522
|
+
const html = await this.fetchPage(currentUrl);
|
|
1523
|
+
if (!html) return [];
|
|
1524
|
+
const $ = cheerio3.load(html);
|
|
1525
|
+
const nextUrls = [];
|
|
1526
|
+
const paginationSelector = options.paginationSelector || 'a[rel="next"], .pagination a, .next a, .pager a, [class*="next"] a';
|
|
1527
|
+
$(paginationSelector).each((_, element) => {
|
|
1528
|
+
const $link = $(element);
|
|
1529
|
+
const href = $link.attr("href");
|
|
1530
|
+
const text = $link.text().toLowerCase().trim();
|
|
1531
|
+
if (href && (text.includes("next") || text.includes("\u2192") || text === ">")) {
|
|
1532
|
+
const absoluteUrl = this.resolveUrl(href, currentUrl);
|
|
1533
|
+
if (absoluteUrl) {
|
|
1534
|
+
nextUrls.push(absoluteUrl);
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
});
|
|
1538
|
+
return Array.from(new Set(nextUrls));
|
|
1539
|
+
} catch (error) {
|
|
1540
|
+
console.warn(`\u26A0\uFE0F [HTMLScraper] Error finding next page URLs:`, error);
|
|
1541
|
+
return [];
|
|
1542
|
+
}
|
|
1543
|
+
}
|
|
1544
|
+
deduplicateArticles(articles) {
|
|
1545
|
+
const seen = /* @__PURE__ */ new Map();
|
|
1546
|
+
for (const article of articles) {
|
|
1547
|
+
const existing = seen.get(article.url);
|
|
1548
|
+
if (!existing || article.confidence > existing.confidence) {
|
|
1549
|
+
seen.set(article.url, article);
|
|
1550
|
+
}
|
|
1551
|
+
}
|
|
1552
|
+
return Array.from(seen.values());
|
|
1553
|
+
}
|
|
1554
|
+
passesFilters(url, filters) {
|
|
1555
|
+
if (!filters) return true;
|
|
1556
|
+
if (filters.excludePatterns?.some((pattern) => pattern.test(url))) {
|
|
1557
|
+
return false;
|
|
1558
|
+
}
|
|
1559
|
+
if (filters.includePatterns?.length && !filters.includePatterns.some((pattern) => pattern.test(url))) {
|
|
1560
|
+
return false;
|
|
1561
|
+
}
|
|
1562
|
+
if (filters.allowedDomains?.length) {
|
|
1563
|
+
try {
|
|
1564
|
+
const urlObj = new URL(url);
|
|
1565
|
+
const domain = urlObj.hostname.toLowerCase();
|
|
1566
|
+
if (!filters.allowedDomains.some(
|
|
1567
|
+
(allowed) => domain === allowed.toLowerCase() || domain.endsWith("." + allowed.toLowerCase())
|
|
1568
|
+
)) {
|
|
1569
|
+
return false;
|
|
1570
|
+
}
|
|
1571
|
+
} catch {
|
|
1572
|
+
return false;
|
|
1573
|
+
}
|
|
1574
|
+
}
|
|
1575
|
+
return true;
|
|
1576
|
+
}
|
|
1577
|
+
isLikelyArticleUrl(url) {
|
|
1578
|
+
const urlLower = url.toLowerCase();
|
|
1579
|
+
const articlePatterns = [
|
|
1580
|
+
/\/article[s]?\//,
|
|
1581
|
+
/\/post[s]?\//,
|
|
1582
|
+
/\/story\//,
|
|
1583
|
+
/\/stories\//,
|
|
1584
|
+
/\/news\//,
|
|
1585
|
+
/\/blog\//,
|
|
1586
|
+
/\/\d{4}\/\d{2}\/\d{2}\//,
|
|
1587
|
+
// Date-based URLs
|
|
1588
|
+
/\/\d{4}\/\d{2}\//
|
|
1589
|
+
];
|
|
1590
|
+
return articlePatterns.some((pattern) => pattern.test(urlLower));
|
|
1591
|
+
}
|
|
1592
|
+
parseDate(dateString) {
|
|
1593
|
+
try {
|
|
1594
|
+
const date = new Date(dateString);
|
|
1595
|
+
if (isNaN(date.getTime())) {
|
|
1596
|
+
const formats = [
|
|
1597
|
+
/(\d{4})-(\d{2})-(\d{2})/,
|
|
1598
|
+
// YYYY-MM-DD
|
|
1599
|
+
/(\d{2})\/(\d{2})\/(\d{4})/,
|
|
1600
|
+
// MM/DD/YYYY
|
|
1601
|
+
/(\d{2})\.(\d{2})\.(\d{4})/
|
|
1602
|
+
// DD.MM.YYYY
|
|
1603
|
+
];
|
|
1604
|
+
for (const format of formats) {
|
|
1605
|
+
const match = dateString.match(format);
|
|
1606
|
+
if (match) {
|
|
1607
|
+
const [, p1, p2, p3] = match;
|
|
1608
|
+
const testDate = /* @__PURE__ */ new Date(`${p1}-${p2}-${p3}`);
|
|
1609
|
+
if (!isNaN(testDate.getTime())) {
|
|
1610
|
+
return testDate;
|
|
1611
|
+
}
|
|
1612
|
+
}
|
|
1613
|
+
}
|
|
1614
|
+
return null;
|
|
1615
|
+
}
|
|
1616
|
+
return date;
|
|
1617
|
+
} catch {
|
|
1618
|
+
return null;
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
resolveUrl(url, baseUrl) {
|
|
1622
|
+
try {
|
|
1623
|
+
return new URL(url, baseUrl).toString();
|
|
1624
|
+
} catch {
|
|
1625
|
+
return null;
|
|
1626
|
+
}
|
|
1627
|
+
}
|
|
1628
|
+
mergeConfig(defaultConfig, userConfig) {
|
|
1629
|
+
return {
|
|
1630
|
+
selectors: {
|
|
1631
|
+
...defaultConfig.selectors,
|
|
1632
|
+
...userConfig.selectors,
|
|
1633
|
+
articleLinks: [
|
|
1634
|
+
...defaultConfig.selectors?.articleLinks || [],
|
|
1635
|
+
...userConfig.selectors?.articleLinks || []
|
|
1636
|
+
]
|
|
1637
|
+
},
|
|
1638
|
+
filters: {
|
|
1639
|
+
...defaultConfig.filters,
|
|
1640
|
+
...userConfig.filters,
|
|
1641
|
+
includePatterns: [
|
|
1642
|
+
...defaultConfig.filters?.includePatterns || [],
|
|
1643
|
+
...userConfig.filters?.includePatterns || []
|
|
1644
|
+
],
|
|
1645
|
+
excludePatterns: [
|
|
1646
|
+
...defaultConfig.filters?.excludePatterns || [],
|
|
1647
|
+
...userConfig.filters?.excludePatterns || []
|
|
1648
|
+
]
|
|
1649
|
+
},
|
|
1650
|
+
limits: {
|
|
1651
|
+
...defaultConfig.limits,
|
|
1652
|
+
...userConfig.limits
|
|
1653
|
+
},
|
|
1654
|
+
perplexityFallback: {
|
|
1655
|
+
...defaultConfig.perplexityFallback,
|
|
1656
|
+
...userConfig.perplexityFallback
|
|
1657
|
+
}
|
|
1658
|
+
};
|
|
1659
|
+
}
|
|
1660
|
+
/**
|
|
1661
|
+
* Use Perplexity API to extract articles when traditional scraping fails
|
|
1662
|
+
* Requires PERPLEXITY_API_KEY environment variable to be set
|
|
1663
|
+
*/
|
|
1664
|
+
async extractWithPerplexity(url, config) {
|
|
1665
|
+
try {
|
|
1666
|
+
if (!process.env.PERPLEXITY_API_KEY) {
|
|
1667
|
+
console.warn(`\u26A0\uFE0F [HTMLScraper] Perplexity API key not configured - set PERPLEXITY_API_KEY env variable`);
|
|
1668
|
+
return [];
|
|
1669
|
+
}
|
|
1670
|
+
const domain = new URL(url).hostname;
|
|
1671
|
+
const query = `Find recent news articles and stories from ${domain}. List article titles and URLs.`;
|
|
1672
|
+
console.log(`\u{1F50D} [HTMLScraper] Using Perplexity to find articles from ${domain}`);
|
|
1673
|
+
const response = await fetch("https://api.perplexity.ai/chat/completions", {
|
|
1674
|
+
method: "POST",
|
|
1675
|
+
headers: {
|
|
1676
|
+
"Content-Type": "application/json",
|
|
1677
|
+
"Authorization": `Bearer ${process.env.PERPLEXITY_API_KEY}`
|
|
1678
|
+
},
|
|
1679
|
+
body: JSON.stringify({
|
|
1680
|
+
model: config.perplexityFallback?.model || PERPLEXITY_MODELS.SONAR,
|
|
1681
|
+
messages: [{ role: "user", content: query }],
|
|
1682
|
+
max_tokens: 1e3,
|
|
1683
|
+
return_citations: true,
|
|
1684
|
+
search_recency_filter: config.perplexityFallback?.searchRecency || "day"
|
|
1685
|
+
})
|
|
1686
|
+
});
|
|
1687
|
+
if (!response.ok) {
|
|
1688
|
+
throw new Error(`Perplexity API error: ${response.status} ${response.statusText}`);
|
|
1689
|
+
}
|
|
1690
|
+
const data = await response.json();
|
|
1691
|
+
const articles = [];
|
|
1692
|
+
if (data.citations && Array.isArray(data.citations)) {
|
|
1693
|
+
for (const citation of data.citations) {
|
|
1694
|
+
try {
|
|
1695
|
+
const citationUrl = citation;
|
|
1696
|
+
const citationDomain = new URL(citationUrl).hostname;
|
|
1697
|
+
if (citationDomain === domain || citationDomain.includes(domain.split(".")[0])) {
|
|
1698
|
+
articles.push({
|
|
1699
|
+
url: citationUrl,
|
|
1700
|
+
title: citationUrl.split("/").pop() || domain,
|
|
1701
|
+
confidence: 0.7,
|
|
1702
|
+
source: "meta-data"
|
|
1703
|
+
});
|
|
1704
|
+
}
|
|
1705
|
+
} catch {
|
|
1706
|
+
continue;
|
|
1707
|
+
}
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
const maxLinks = config.limits?.maxLinksPerPage || 100;
|
|
1711
|
+
const limitedArticles = articles.slice(0, maxLinks);
|
|
1712
|
+
console.log(`\u2728 [HTMLScraper] Perplexity found ${limitedArticles.length} articles`);
|
|
1713
|
+
return limitedArticles;
|
|
1714
|
+
} catch (error) {
|
|
1715
|
+
console.error(`\u274C [HTMLScraper] Perplexity fallback failed:`, error);
|
|
1716
|
+
return [];
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
};
|
|
1720
|
+
// Module-level HTMLScraper instance, created once when this bundle is loaded.
var globalHTMLScraper = new HTMLScraper();
|
|
1721
|
+
|
|
1722
|
+
// src/extractors/content-extractor.ts
|
|
1723
|
+
var import_readability = require("@mozilla/readability");
|
|
1724
|
+
var import_jsdom = require("jsdom");
|
|
1725
|
+
var cheerio4 = __toESM(require("cheerio"));
|
|
1726
|
+
/**
 * Downloads article pages and extracts their readable content.
 *
 * Extraction first tries Mozilla Readability (via jsdom) and falls back to a
 * cheerio selector heuristic. Before any download the URL must pass the SSRF
 * guards in `ssrfProtection` and the robots.txt check.
 */
var ContentExtractor = class {
  constructor() {
    this.userAgent = "Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)";
    // Abort a download after 15 seconds.
    this.timeout = 15e3;
    // Reject responses larger than 10MB.
    this.maxContentSize = 10 * 1024 * 1024;
    // Extractions with fewer than 200 characters of text are discarded.
    this.minContentLength = 200;
    // Reading speed used to derive `readingTime` from the word count.
    this.wordsPerMinute = 200;

    // `URL.hostname` keeps the square brackets around IPv6 literals
    // ("[::1]"), so they must be removed before matching address patterns;
    // without this none of the IPv6 rules below could ever match.
    const hostnameOf = (url) => new URL(url).hostname.toLowerCase().replace(/^\[|\]$/g, "");

    const blockedHostPatterns = [
      /^0\.\d+\.\d+\.\d+$/, // 0.0.0.0/8 ("this host")
      /^127\./, // 127.0.0.0/8 (loopback)
      /^10\./, // 10.0.0.0/8 (private)
      /^172\.(1[6-9]|2[0-9]|3[01])\./, // 172.16.0.0/12 (private)
      /^192\.168\./, // 192.168.0.0/16 (private)
      /^169\.254\./, // 169.254.0.0/16 (link-local)
      /^::$/, // IPv6 unspecified address
      /^::1$/, // IPv6 loopback
      /^::ffff:/, // IPv4-mapped IPv6 literals are rejected wholesale rather than decoded
      /^fe[89ab][0-9a-f]:/, // fe80::/10 (IPv6 link-local)
      /^f[cd][0-9a-f]{2}:/ // fc00::/7 (IPv6 unique local)
    ];

    this.ssrfProtection = {
      /** `true` when the URL's host is a private/loopback address literal, or the URL cannot be parsed. */
      isPrivateIP: (url) => {
        try {
          const hostname = hostnameOf(url);
          return blockedHostPatterns.some((range) => range.test(hostname));
        } catch {
          // Fail closed: an unparseable URL is treated as unsafe.
          return true;
        }
      },
      /** `true` when the URL targets the local machine, or the URL cannot be parsed. */
      isLocalhost: (url) => {
        try {
          const hostname = hostnameOf(url);
          return hostname === "localhost" || hostname.endsWith(".localhost") || hostname === "127.0.0.1" || hostname === "::1";
        } catch {
          return true;
        }
      },
      /** `true` only for http: and https: URLs. */
      isAllowedProtocol: (url) => {
        try {
          const { protocol } = new URL(url);
          return protocol === "http:" || protocol === "https:";
        } catch {
          return false;
        }
      }
    };
  }

  /**
   * Extract content from a URL.
   *
   * @param {string} url - Page to download and extract.
   * @returns {Promise<object|null>} The extracted content, or `null` when the
   *   URL is rejected (protocol, private address, robots.txt), the download
   *   fails, or too little text is found. Never rejects.
   */
  async extractContent(url) {
    console.log(`\u{1F4D6} [ContentExtractor] Starting content extraction from ${url}`);
    try {
      if (!this.ssrfProtection.isAllowedProtocol(url)) {
        throw new Error(`Disallowed protocol: ${url}`);
      }
      if (this.ssrfProtection.isPrivateIP(url) || this.ssrfProtection.isLocalhost(url)) {
        throw new Error(`Private/local IP not allowed: ${url}`);
      }
      const robotsCheck = await globalRobotsChecker.isAllowed(url);
      if (!robotsCheck.allowed) {
        console.warn(`\u{1F916} [ContentExtractor] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
        return null;
      }
      const html = await this.fetchContent(url);
      if (!html) {
        return null;
      }
      const extracted = await this.extractFromHTML(html, url);
      if (!extracted) {
        console.warn(`\u26A0\uFE0F [ContentExtractor] No content extracted from ${url}`);
        return null;
      }
      if (extracted.textContent.length < this.minContentLength) {
        console.warn(`\u26A0\uFE0F [ContentExtractor] Content too short (${extracted.textContent.length} chars): ${url}`);
        return null;
      }
      console.log(`\u2705 [ContentExtractor] Successfully extracted ${extracted.wordCount} words from ${url}`);
      return extracted;
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Error extracting content from ${url}:`, error);
      return null;
    }
  }

  /**
   * Extract content from multiple URLs.
   *
   * All URLs are started at once; the result array has one entry per input
   * URL, in the same order, with `null` for URLs that failed.
   *
   * @param {string[]} urls - Pages to extract.
   * @returns {Promise<Array<object|null>>}
   */
  async extractBatch(urls) {
    console.log(`\u{1F4D6} [ContentExtractor] Starting batch extraction of ${urls.length} URLs`);
    const results = urls.map(
      (url) => this.extractContent(url).catch((error) => {
        console.error(`\u274C [ContentExtractor] Error in batch extraction for ${url}:`, error);
        return null;
      })
    );
    const extracted = await Promise.all(results);
    const successful = extracted.filter(Boolean).length;
    console.log(`\u{1F4D6} [ContentExtractor] Batch complete: ${successful}/${urls.length} successful`);
    return extracted;
  }

  /**
   * Download the HTML for a URL through the shared rate limiter.
   *
   * @param {string} url - Page to download.
   * @returns {Promise<string|null>} Response body, or `null` on any failure
   *   (network error, non-2xx status, timeout, oversized body).
   */
  async fetchContent(url) {
    try {
      return await globalRateLimiter.execute(url, async () => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.timeout);
        try {
          const response = await fetch(url, {
            headers: {
              "User-Agent": this.userAgent,
              "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
              "Accept-Language": "en-US,en;q=0.9"
            },
            signal: controller.signal
          });
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
          }
          const contentLength = response.headers.get("content-length");
          if (contentLength && Number.parseInt(contentLength, 10) > this.maxContentSize) {
            throw new Error(`Content too large: ${contentLength} bytes`);
          }
          const html = await response.text();
          // NOTE: `html.length` counts UTF-16 code units, not bytes; the
          // limit is therefore approximate for non-ASCII pages.
          if (html.length > this.maxContentSize) {
            throw new Error(`Content too large: ${html.length} bytes`);
          }
          return html;
        } finally {
          // The timer stays armed until the body has been read, so a server
          // that sends headers and then stalls is aborted as well.
          clearTimeout(timeoutId);
        }
      });
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Error fetching content from ${url}:`, error);
      return null;
    }
  }

  /**
   * Run the extraction strategies in order: Readability, then the cheerio
   * fallback.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} url - URL the HTML was loaded from.
   * @returns {Promise<object|null>} First result with at least
   *   `minContentLength` characters of text, or `null` when both fail. A
   *   fallback result carries the accumulated `errors` from Readability.
   */
  async extractFromHTML(html, url) {
    const errors = [];
    try {
      const readabilityResult = this.extractWithReadability(html, url);
      if (readabilityResult && readabilityResult.textContent.length >= this.minContentLength) {
        return {
          ...readabilityResult,
          extractionMethod: "readability",
          confidence: 0.9
        };
      }
      errors.push("Readability extraction failed or content too short");
    } catch (error) {
      errors.push(`Readability error: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    try {
      const fallbackResult = this.extractWithFallback(html, url);
      if (fallbackResult && fallbackResult.textContent.length >= this.minContentLength) {
        return {
          ...fallbackResult,
          extractionMethod: "fallback",
          confidence: 0.6,
          errors
        };
      }
      errors.push("Fallback extraction failed or content too short");
    } catch (error) {
      errors.push(`Fallback error: ${error instanceof Error ? error.message : "Unknown error"}`);
    }
    console.error(`\u274C [ContentExtractor] All extraction methods failed for ${url}:`, errors);
    return null;
  }

  /**
   * Extract the article with Mozilla Readability on a jsdom document.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} url - Page URL, passed to jsdom as the document URL.
   * @returns {object|null} Extraction result, or `null` when Readability finds
   *   no article or throws.
   */
  extractWithReadability(html, url) {
    try {
      const dom = new import_jsdom.JSDOM(html, { url });
      const reader = new import_readability.Readability(dom.window.document);
      const article = reader.parse();
      if (!article) {
        return null;
      }
      const structured = this.extractStructuredData(html, url);
      const wordCount = this.countWords(article.textContent);
      const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
      return {
        url,
        title: article.title || "",
        content: article.content || "",
        textContent: article.textContent || "",
        excerpt: article.excerpt || void 0,
        byline: article.byline || void 0,
        publishedTime: this.extractPublishedTime(html),
        siteName: article.siteName || this.extractSiteName(html),
        lang: this.extractLanguage(html),
        structured,
        wordCount,
        readingTime,
        confidence: 0.9,
        extractionMethod: "readability",
        extractedAt: /* @__PURE__ */ new Date()
      };
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Readability extraction failed:`, error);
      return null;
    }
  }

  /**
   * Selector-based extraction used when Readability fails.
   *
   * Removes boilerplate elements, then takes the first common content
   * container whose HTML is longer than `minContentLength`, falling back to
   * `<body>`.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} url - Page URL (copied into the result).
   * @returns {object|null} Extraction result, or `null` when no container has
   *   enough markup or an error occurs.
   */
  extractWithFallback(html, url) {
    try {
      const $ = cheerio4.load(html);
      const unwantedSelectors = [
        "script",
        "style",
        "nav",
        "header",
        "footer",
        ".advertisement",
        ".ads",
        ".social-share",
        ".comments",
        ".sidebar",
        ".navigation",
        ".menu",
        ".popup",
        ".modal"
      ];
      unwantedSelectors.forEach((selector) => $(selector).remove());

      const title = $("h1").first().text().trim() || $("title").text().trim() || $('meta[property="og:title"]').attr("content") || "";

      const contentSelectors = [
        "article",
        ".article-content",
        ".post-content",
        ".entry-content",
        ".content",
        "main",
        "#content",
        ".story-body"
      ];
      let content = "";
      // Element that `content` was serialized from; its text is read directly
      // below instead of feeding the HTML string back through `$(...)`, where
      // a string can be interpreted as a selector rather than as markup.
      let container = null;
      for (const selector of contentSelectors) {
        const element = $(selector).first();
        if (element.length > 0) {
          content = element.html() || "";
          container = element;
          if (content.length > this.minContentLength) {
            break;
          }
        }
      }
      if (content.length < this.minContentLength) {
        container = $("body");
        content = container.html() || "";
      }
      if (!content || content.length < this.minContentLength) {
        return null;
      }
      const textContent = container.text().trim();
      const wordCount = this.countWords(textContent);
      const readingTime = Math.ceil(wordCount / this.wordsPerMinute);
      const structured = this.extractStructuredData(html, url);
      return {
        url,
        title,
        content,
        textContent,
        excerpt: textContent.substring(0, 300) + "...",
        publishedTime: this.extractPublishedTime(html),
        siteName: this.extractSiteName(html),
        lang: this.extractLanguage(html),
        structured,
        wordCount,
        readingTime,
        confidence: 0.6,
        extractionMethod: "fallback",
        extractedAt: /* @__PURE__ */ new Date()
      };
    } catch (error) {
      console.error(`\u274C [ContentExtractor] Fallback extraction failed:`, error);
      return null;
    }
  }

  /**
   * Collect JSON-LD, Open Graph, Twitter Card and microdata from the page.
   *
   * @param {string} html - Raw page HTML.
   * @param {string} _url - Unused.
   * @returns {object|undefined} Object with any of `jsonLd`, `openGraph`,
   *   `twitterCard`, `microdata`; `undefined` when none were found.
   */
  extractStructuredData(html, _url) {
    const structured = {};
    try {
      const $ = cheerio4.load(html);

      const jsonLdScripts = [];
      $('script[type="application/ld+json"]').each((_, element) => {
        try {
          const jsonText = $(element).html();
          if (jsonText) {
            jsonLdScripts.push(JSON.parse(jsonText));
          }
        } catch {
          // Malformed JSON-LD block: skip it and keep the others.
        }
      });
      if (jsonLdScripts.length > 0) {
        structured.jsonLd = jsonLdScripts;
      }

      const openGraph = {};
      $('meta[property^="og:"]').each((_, element) => {
        const property = $(element).attr("property");
        const content = $(element).attr("content");
        if (property && content) {
          openGraph[property] = content;
        }
      });
      if (Object.keys(openGraph).length > 0) {
        structured.openGraph = openGraph;
      }

      const twitterCard = {};
      $('meta[name^="twitter:"]').each((_, element) => {
        const name = $(element).attr("name");
        const content = $(element).attr("content");
        if (name && content) {
          twitterCard[name] = content;
        }
      });
      if (Object.keys(twitterCard).length > 0) {
        structured.twitterCard = twitterCard;
      }

      const microdata = [];
      $("[itemscope]").each((_, element) => {
        const $item = $(element);
        const itemType = $item.attr("itemtype");
        if (itemType) {
          const item = { "@type": itemType };
          $item.find("[itemprop]").each((_2, propElement) => {
            const $prop = $(propElement);
            const propName = $prop.attr("itemprop");
            const propValue = $prop.attr("content") || $prop.text().trim();
            if (propName && propValue) {
              item[propName] = propValue;
            }
          });
          microdata.push(item);
        }
      });
      if (microdata.length > 0) {
        structured.microdata = microdata;
      }
    } catch (error) {
      console.warn(`\u26A0\uFE0F [ContentExtractor] Error extracting structured data:`, error);
    }
    return Object.keys(structured).length > 0 ? structured : void 0;
  }

  /**
   * Find the publication date from common meta tags, `<time>` elements and
   * date class names.
   *
   * @param {string} html - Raw page HTML.
   * @returns {Date|undefined} First candidate that parses to a valid date.
   */
  extractPublishedTime(html) {
    try {
      const $ = cheerio4.load(html);
      const timeSelectors = [
        'meta[property="article:published_time"]',
        'meta[name="datePublished"]',
        'meta[name="publishdate"]',
        "time[datetime]",
        ".published-date",
        ".publish-date",
        ".article-date"
      ];
      for (const selector of timeSelectors) {
        const element = $(selector).first();
        if (element.length > 0) {
          const timeStr = element.attr("content") || element.attr("datetime") || element.text().trim();
          if (timeStr) {
            const date = new Date(timeStr);
            if (!Number.isNaN(date.getTime())) {
              return date;
            }
          }
        }
      }
      return void 0;
    } catch {
      return void 0;
    }
  }

  /**
   * Read the site name from `og:site_name` or `application-name`.
   *
   * @param {string} html - Raw page HTML.
   * @returns {string|undefined}
   */
  extractSiteName(html) {
    try {
      const $ = cheerio4.load(html);
      return $('meta[property="og:site_name"]').attr("content") || $('meta[name="application-name"]').attr("content") || void 0;
    } catch {
      return void 0;
    }
  }

  /**
   * Read the page language from `<html lang>` or language meta tags.
   *
   * @param {string} html - Raw page HTML.
   * @returns {string|undefined}
   */
  extractLanguage(html) {
    try {
      const $ = cheerio4.load(html);
      return $("html").attr("lang") || $('meta[name="language"]').attr("content") || $('meta[http-equiv="content-language"]').attr("content") || void 0;
    } catch {
      return void 0;
    }
  }

  /**
   * Count whitespace-separated words.
   *
   * @param {string} text - Text to count; falsy input counts as 0.
   * @returns {number}
   */
  countWords(text) {
    if (!text) return 0;
    return text.trim().split(/\s+/).filter((word) => word.length > 0).length;
  }

  /**
   * Validate extracted content quality.
   *
   * Starts from a score of 1 and subtracts a penalty for each issue found
   * (short text, missing/overlong title, low text-to-HTML ratio, repeated
   * sentences).
   *
   * @param {{title: string, content: string, textContent: string}} content
   * @returns {{isValid: boolean, issues: string[], score: number}} `isValid`
   *   requires no issues and a score of at least 0.5; `score` is clamped at 0.
   */
  validateContent(content) {
    const issues = [];
    let score = 1;

    if (content.textContent.length < this.minContentLength) {
      issues.push(`Content too short: ${content.textContent.length} characters`);
      score -= 0.5;
    }

    if (!content.title || content.title.length < 10) {
      issues.push("Missing or too short title");
      score -= 0.2;
    } else if (content.title.length > 200) {
      issues.push("Title too long");
      score -= 0.1;
    }

    // Guard the divisions: empty HTML or a text without sentences would
    // otherwise produce NaN ratios.
    const htmlLength = content.content.length;
    const textLength = content.textContent.length;
    if (htmlLength > 0 && textLength / htmlLength < 0.1) {
      issues.push("Low text-to-HTML ratio - may be poorly extracted");
      score -= 0.2;
    }

    const sentences = content.textContent.split(".").filter((s) => s.trim().length > 10);
    const uniqueSentences = new Set(sentences);
    if (sentences.length > 0 && (sentences.length - uniqueSentences.size) / sentences.length > 0.3) {
      issues.push("High duplicate content detected");
      score -= 0.3;
    }

    return {
      isValid: issues.length === 0 && score >= 0.5,
      issues,
      score: Math.max(0, score)
    };
  }
};
|
|
2168
|
+
// Module-level ContentExtractor instance, created once when this bundle is loaded.
var globalContentExtractor = new ContentExtractor();
|
|
2169
|
+
|
|
2170
|
+
// src/utils/circuit-breaker.ts
|
|
2171
|
+
/**
 * Minimal circuit breaker around an async operation.
 *
 * States:
 *   - CLOSED: operations run normally; consecutive failures are counted.
 *   - OPEN: entered once `failureThreshold` failures accumulate; calls are
 *     rejected immediately until `resetTimeout` ms have passed since the last
 *     failure.
 *   - HALF_OPEN: the next call after `resetTimeout` is let through; success
 *     closes the circuit, failure re-opens it (the failure count is not reset
 *     while open, so it is still at or above the threshold).
 */
var CircuitBreaker = class {
  /**
   * @param {{name: string, failureThreshold: number, timeout: number, resetTimeout: number}} options
   *   `timeout` and `resetTimeout` are in milliseconds.
   */
  constructor(options) {
    this.failures = 0;
    this.lastFailureTime = 0;
    this.state = "CLOSED";
    this.options = options;
  }

  /**
   * Run `operation` under the breaker.
   *
   * @param {() => Promise<*>|*} operation - Work to perform.
   * @returns {Promise<*>} The operation's result.
   * @throws {Error} When the circuit is open, the operation times out, or the
   *   operation itself fails (its error is rethrown unchanged).
   */
  async execute(operation) {
    if (this.state === "OPEN") {
      if (Date.now() - this.lastFailureTime < this.options.resetTimeout) {
        throw new Error(`[CircuitBreaker:${this.options.name}] Circuit is OPEN - preventing request`);
      } else {
        this.state = "HALF_OPEN";
        console.log(`\u{1F504} [CircuitBreaker:${this.options.name}] Circuit moving to HALF_OPEN state`);
      }
    }
    try {
      const result = await this.executeWithTimeout(operation);
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /**
   * Race `operation` against `options.timeout`.
   *
   * The timer is always cleared once the race settles, including when
   * `operation` throws synchronously or returns a plain value; previously
   * those cases left the timer pending until it fired.
   *
   * @param {() => Promise<*>|*} operation
   * @returns {Promise<*>}
   */
  async executeWithTimeout(operation) {
    let timer;
    const deadline = new Promise((_, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`[CircuitBreaker:${this.options.name}] Operation timeout after ${this.options.timeout}ms`));
      }, this.options.timeout);
    });
    // Calling `operation` inside an executor turns a synchronous throw into a
    // rejection and lets non-promise return values resolve normally.
    const work = new Promise((resolve) => resolve(operation()));
    try {
      return await Promise.race([work, deadline]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Reset the failure count and close the circuit. */
  onSuccess() {
    this.failures = 0;
    this.state = "CLOSED";
  }

  /** Record a failure and open the circuit once the threshold is reached. */
  onFailure() {
    this.failures++;
    this.lastFailureTime = Date.now();
    if (this.failures >= this.options.failureThreshold) {
      this.state = "OPEN";
      console.error(`\u274C [CircuitBreaker:${this.options.name}] Circuit opened after ${this.failures} failures`);
    }
  }

  /**
   * @returns {{state: string, failures: number, lastFailureTime: number}}
   *   Snapshot of the breaker's current state.
   */
  getState() {
    return {
      state: this.state,
      failures: this.failures,
      lastFailureTime: this.lastFailureTime
    };
  }
};
|
|
2230
|
+
// Pre-configured breakers, keyed by the kind of work they guard.
// `timeout` and `resetTimeout` are in milliseconds.
var circuitBreakers = Object.fromEntries(
  [
    // 15 second operation timeout, 30 second cool-down
    ["rss", { name: "RSS", failureThreshold: 3, timeout: 15000, resetTimeout: 30000 }],
    // 10 second operation timeout, 30 second cool-down
    ["scraping", { name: "Scraping", failureThreshold: 5, timeout: 10000, resetTimeout: 30000 }],
    // 30 seconds for test endpoints, 1 minute cool-down
    ["scrapingTest", { name: "ScrapingTest", failureThreshold: 3, timeout: 30000, resetTimeout: 60000 }]
  ].map(([key, options]) => [key, new CircuitBreaker(options)])
);
|
|
2256
|
+
|
|
2257
|
+
// src/orchestrator/source-orchestrator.ts
|
|
2258
|
+
// HTMLScraper instance owned by the source-orchestrator module; it is a
// separate object from `globalHTMLScraper`.
var globalHTMLScraper2 = new HTMLScraper();
|
|
2259
|
+
// ContentExtractor instance owned by the source-orchestrator module; it is a
// separate object from `globalContentExtractor`.
var globalContentExtractor2 = new ContentExtractor();
|
|
2260
|
+
// RobotsChecker instance owned by the source-orchestrator module.
var globalRobotsChecker2 = new RobotsChecker();
|
|
2261
|
+
// Zod schema for one candidate article produced by any discovery strategy.
var CandidateArticleSchema = (() => {
  const { z } = import_zod;
  const optionalText = () => z.string().optional();
  return z.object({
    url: z.string().url(),
    title: z.string().min(1),
    publishedAt: z.date(),
    content: optionalText(),
    excerpt: optionalText(),
    guid: z.string(),
    // Confidence is a fraction between 0 and 1 inclusive.
    confidence: z.number().min(0).max(1),
    source: z.enum(["rss", "sitemap", "html", "discovery"]),
    extractionMethod: z.enum(["rss", "sitemap", "html-links", "content-extraction"]),
    metadata: z.record(z.any()).optional()
  });
})();
|
|
2273
|
+
// Zod schema for the per-source configuration accepted by the orchestrator.
var SourceConfigSchema = (() => {
  const { z } = import_zod;
  // Each call builds a fresh schema, mirroring one inline definition per field.
  const optionalStrings = () => z.array(z.string()).optional();
  const optionalNumber = () => z.number().optional();

  const selectors = z.object({
    articleLinks: optionalStrings(),
    titleSelectors: optionalStrings(),
    dateSelectors: optionalStrings(),
    excludeSelectors: optionalStrings()
  }).optional();

  const filters = z.object({
    minTitleLength: optionalNumber(),
    maxTitleLength: optionalNumber(),
    includePatterns: optionalStrings(),
    excludePatterns: optionalStrings()
  }).optional();

  const limits = z.object({
    maxLinksPerPage: optionalNumber(),
    maxPages: optionalNumber()
  }).optional();

  return z.object({
    sourceType: z.enum(["rss", "sitemap", "html", "auto"]),
    allowPaths: optionalStrings(),
    denyPaths: optionalStrings(),
    // Integer between 1 and 5 inclusive.
    maxDepth: z.number().int().min(1).max(5).optional(),
    detectOnly: z.boolean().optional(),
    scrapeConfig: z.object({ selectors, filters, limits }).optional()
  });
})();
|
|
2298
|
+
var SourceOrchestrator = class {
|
|
2299
|
+
constructor() {
|
|
2300
|
+
this.maxArticlesPerSource = 1e3;
|
|
2301
|
+
}
|
|
2302
|
+
// private readonly recentTimeframe = 48 * 60 * 60 * 1000; // 48 hours (currently unused)
|
|
2303
|
+
/**
|
|
2304
|
+
* Main orchestration method - determines source type and extracts content
|
|
2305
|
+
*/
|
|
2306
|
+
async processSource(url, config = { sourceType: "auto" }) {
|
|
2307
|
+
const startTime = Date.now();
|
|
2308
|
+
console.log(`\u{1F3AD} [Orchestrator] Processing source: ${url} (type: ${config.sourceType})`);
|
|
2309
|
+
const result = {
|
|
2310
|
+
articles: [],
|
|
2311
|
+
sourceInfo: {
|
|
2312
|
+
detectedType: "html",
|
|
2313
|
+
extractionStats: {
|
|
2314
|
+
attempted: 0,
|
|
2315
|
+
successful: 0,
|
|
2316
|
+
failed: 0,
|
|
2317
|
+
filtered: 0
|
|
2318
|
+
}
|
|
2319
|
+
},
|
|
2320
|
+
processingTime: 0,
|
|
2321
|
+
errors: []
|
|
2322
|
+
};
|
|
2323
|
+
try {
|
|
2324
|
+
const breaker = config.circuitBreaker || circuitBreakers.scraping;
|
|
2325
|
+
return await breaker.execute(async () => {
|
|
2326
|
+
if (config.sourceType === "auto") {
|
|
2327
|
+
return await this.autoDetectAndProcess(url, config, result);
|
|
2328
|
+
} else {
|
|
2329
|
+
return await this.processKnownType(url, config, result);
|
|
2330
|
+
}
|
|
2331
|
+
});
|
|
2332
|
+
} catch (error) {
|
|
2333
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
2334
|
+
console.error(`\u274C [Orchestrator] Failed to process source ${url}:`, errorMessage);
|
|
2335
|
+
result.errors.push(errorMessage);
|
|
2336
|
+
result.processingTime = Date.now() - startTime;
|
|
2337
|
+
return result;
|
|
2338
|
+
}
|
|
2339
|
+
}
|
|
2340
|
+
/**
|
|
2341
|
+
* Auto-detect source type and process accordingly
|
|
2342
|
+
*/
|
|
2343
|
+
async autoDetectAndProcess(url, config, result) {
|
|
2344
|
+
console.log(`\u{1F50D} [Orchestrator] Auto-detecting source type for ${url}`);
|
|
2345
|
+
try {
|
|
2346
|
+
const rssArticles = await this.processAsRSS(url);
|
|
2347
|
+
if (rssArticles.length > 0) {
|
|
2348
|
+
result.sourceInfo.detectedType = "rss";
|
|
2349
|
+
result.articles = this.applyPathFilters(rssArticles, config);
|
|
2350
|
+
console.log(`\u2705 [Orchestrator] Detected as RSS feed: ${result.articles.length} articles`);
|
|
2351
|
+
return this.finalizeResult(result);
|
|
2352
|
+
}
|
|
2353
|
+
} catch (error) {
|
|
2354
|
+
result.errors.push(`RSS detection failed: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
2355
|
+
}
|
|
2356
|
+
try {
|
|
2357
|
+
const discoveredFeeds = await globalRSSDiscovery.discoverFeeds(url);
|
|
2358
|
+
if (discoveredFeeds.length > 0) {
|
|
2359
|
+
result.sourceInfo.discoveredFeeds = discoveredFeeds;
|
|
2360
|
+
const bestFeed = discoveredFeeds[0];
|
|
2361
|
+
const rssArticles = await this.processAsRSS(bestFeed.url);
|
|
2362
|
+
if (rssArticles.length > 0) {
|
|
2363
|
+
result.sourceInfo.detectedType = "rss";
|
|
2364
|
+
result.articles = this.applyPathFilters(rssArticles, config);
|
|
2365
|
+
console.log(`\u2705 [Orchestrator] Using discovered RSS feed: ${result.articles.length} articles`);
|
|
2366
|
+
return this.finalizeResult(result);
|
|
2367
|
+
}
|
|
2368
|
+
}
|
|
2369
|
+
} catch (error) {
|
|
2370
|
+
result.errors.push(`RSS discovery failed: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
2371
|
+
}
|
|
2372
|
+
try {
|
|
2373
|
+
const sitemapArticles = await this.processAsSitemap(url);
|
|
2374
|
+
if (sitemapArticles.length > 0) {
|
|
2375
|
+
result.sourceInfo.detectedType = "sitemap";
|
|
2376
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
2377
|
+
console.log(`\u2705 [Orchestrator] Detected as sitemap: ${result.articles.length} articles`);
|
|
2378
|
+
return this.finalizeResult(result);
|
|
2379
|
+
}
|
|
2380
|
+
} catch (error) {
|
|
2381
|
+
result.errors.push(`Sitemap detection failed: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
2382
|
+
}
|
|
2383
|
+
try {
|
|
2384
|
+
const urlObj = new URL(url);
|
|
2385
|
+
const discoveredSitemaps = await globalSitemapParser.discoverSitemaps(urlObj.hostname);
|
|
2386
|
+
if (discoveredSitemaps.length > 0) {
|
|
2387
|
+
result.sourceInfo.discoveredSitemaps = discoveredSitemaps;
|
|
2388
|
+
const sitemapArticles = await this.processAsSitemap(discoveredSitemaps[0]);
|
|
2389
|
+
if (sitemapArticles.length > 0) {
|
|
2390
|
+
result.sourceInfo.detectedType = "sitemap";
|
|
2391
|
+
result.articles = this.applyPathFilters(sitemapArticles, config);
|
|
2392
|
+
console.log(`\u2705 [Orchestrator] Using discovered sitemap: ${result.articles.length} articles`);
|
|
2393
|
+
return this.finalizeResult(result);
|
|
2394
|
+
}
|
|
2395
|
+
}
|
|
2396
|
+
} catch (error) {
|
|
2397
|
+
result.errors.push(`Sitemap discovery failed: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
2398
|
+
}
|
|
2399
|
+
try {
|
|
2400
|
+
const htmlArticles = await this.processAsHTML(url, config);
|
|
2401
|
+
result.sourceInfo.detectedType = "html";
|
|
2402
|
+
result.articles = this.applyPathFilters(htmlArticles, config);
|
|
2403
|
+
console.log(`\u2705 [Orchestrator] Falling back to HTML scraping: ${result.articles.length} articles`);
|
|
2404
|
+
return this.finalizeResult(result);
|
|
2405
|
+
} catch (error) {
|
|
2406
|
+
result.errors.push(`HTML scraping failed: ${error instanceof Error ? error.message : "Unknown error"}`);
|
|
2407
|
+
return this.finalizeResult(result);
|
|
2408
|
+
}
|
|
2409
|
+
}
|
|
2410
|
+
/**
|
|
2411
|
+
* Process source with known type
|
|
2412
|
+
*/
|
|
2413
|
+
async processKnownType(url, config, result) {
|
|
2414
|
+
console.log(`\u{1F3AF} [Orchestrator] Processing as ${config.sourceType}: ${url}`);
|
|
2415
|
+
try {
|
|
2416
|
+
let articles = [];
|
|
2417
|
+
switch (config.sourceType) {
|
|
2418
|
+
case "rss":
|
|
2419
|
+
articles = await this.processAsRSS(url);
|
|
2420
|
+
result.sourceInfo.detectedType = "rss";
|
|
2421
|
+
break;
|
|
2422
|
+
case "sitemap":
|
|
2423
|
+
articles = await this.processAsSitemap(url);
|
|
2424
|
+
result.sourceInfo.detectedType = "sitemap";
|
|
2425
|
+
break;
|
|
2426
|
+
case "html":
|
|
2427
|
+
articles = await this.processAsHTML(url, config);
|
|
2428
|
+
result.sourceInfo.detectedType = "html";
|
|
2429
|
+
break;
|
|
2430
|
+
}
|
|
2431
|
+
result.articles = this.applyPathFilters(articles, config);
|
|
2432
|
+
console.log(`\u2705 [Orchestrator] Processed ${config.sourceType}: ${result.articles.length} articles`);
|
|
2433
|
+
return this.finalizeResult(result);
|
|
2434
|
+
} catch (error) {
|
|
2435
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
2436
|
+
result.errors.push(`${config.sourceType} processing failed: ${errorMessage}`);
|
|
2437
|
+
return this.finalizeResult(result);
|
|
2438
|
+
}
|
|
2439
|
+
}
|
|
2440
|
+
/**
|
|
2441
|
+
* Process URL as RSS feed
|
|
2442
|
+
*/
|
|
2443
|
+
async processAsRSS(url) {
|
|
2444
|
+
const rssItems = await fetchRSSFeed(url);
|
|
2445
|
+
const candidates = [];
|
|
2446
|
+
for (const item of rssItems) {
|
|
2447
|
+
try {
|
|
2448
|
+
const publishedAt = new Date(item.pubDate);
|
|
2449
|
+
if (isNaN(publishedAt.getTime())) {
|
|
2450
|
+
continue;
|
|
2451
|
+
}
|
|
2452
|
+
candidates.push({
|
|
2453
|
+
url: item.link,
|
|
2454
|
+
title: item.title,
|
|
2455
|
+
publishedAt,
|
|
2456
|
+
content: item.content,
|
|
2457
|
+
excerpt: item.contentSnippet,
|
|
2458
|
+
guid: item.guid,
|
|
2459
|
+
confidence: 0.9,
|
|
2460
|
+
source: "rss",
|
|
2461
|
+
extractionMethod: "rss",
|
|
2462
|
+
metadata: {
|
|
2463
|
+
originalGuid: item.guid,
|
|
2464
|
+
rssSource: url
|
|
2465
|
+
}
|
|
2466
|
+
});
|
|
2467
|
+
} catch (error) {
|
|
2468
|
+
console.warn(`\u26A0\uFE0F [Orchestrator] Error processing RSS item:`, error);
|
|
2469
|
+
continue;
|
|
2470
|
+
}
|
|
2471
|
+
}
|
|
2472
|
+
return candidates;
|
|
2473
|
+
}
|
|
2474
|
+
/**
|
|
2475
|
+
* Process URL as sitemap
|
|
2476
|
+
*/
|
|
2477
|
+
async processAsSitemap(url) {
|
|
2478
|
+
const sitemapEntries = await globalSitemapParser.parseSitemap(url, {
|
|
2479
|
+
filterRecent: true,
|
|
2480
|
+
maxEntries: this.maxArticlesPerSource,
|
|
2481
|
+
includeNews: true
|
|
2482
|
+
});
|
|
2483
|
+
const candidates = [];
|
|
2484
|
+
for (const entry of sitemapEntries) {
|
|
2485
|
+
try {
|
|
2486
|
+
const publishedAt = entry.lastmod || /* @__PURE__ */ new Date();
|
|
2487
|
+
candidates.push({
|
|
2488
|
+
url: entry.url,
|
|
2489
|
+
title: entry.news?.title || this.extractTitleFromUrl(entry.url),
|
|
2490
|
+
publishedAt,
|
|
2491
|
+
guid: this.createGuid(entry.url, publishedAt.toISOString()),
|
|
2492
|
+
confidence: entry.news ? 0.8 : 0.6,
|
|
2493
|
+
source: "sitemap",
|
|
2494
|
+
extractionMethod: "sitemap",
|
|
2495
|
+
metadata: {
|
|
2496
|
+
changefreq: entry.changefreq,
|
|
2497
|
+
priority: entry.priority,
|
|
2498
|
+
hasNews: !!entry.news,
|
|
2499
|
+
sitemapSource: url
|
|
2500
|
+
}
|
|
2501
|
+
});
|
|
2502
|
+
} catch (error) {
|
|
2503
|
+
console.warn(`\u26A0\uFE0F [Orchestrator] Error processing sitemap entry:`, error);
|
|
2504
|
+
continue;
|
|
2505
|
+
}
|
|
2506
|
+
}
|
|
2507
|
+
return candidates;
|
|
2508
|
+
}
|
|
2509
|
+
/**
|
|
2510
|
+
* Process URL as HTML page
|
|
2511
|
+
*/
|
|
2512
|
+
async processAsHTML(url, config) {
|
|
2513
|
+
const scrapingConfig = this.buildScrapingConfig(config);
|
|
2514
|
+
const extractedArticles = await globalHTMLScraper2.extractFromMultiplePages(url, scrapingConfig, {
|
|
2515
|
+
maxPages: config.scrapeConfig?.limits?.maxPages || 3
|
|
2516
|
+
});
|
|
2517
|
+
const candidates = [];
|
|
2518
|
+
for (const article of extractedArticles) {
|
|
2519
|
+
try {
|
|
2520
|
+
const publishedAt = article.publishedDate || /* @__PURE__ */ new Date();
|
|
2521
|
+
candidates.push({
|
|
2522
|
+
url: article.url,
|
|
2523
|
+
title: article.title || this.extractTitleFromUrl(article.url),
|
|
2524
|
+
publishedAt,
|
|
2525
|
+
excerpt: article.description,
|
|
2526
|
+
guid: this.createGuid(article.url, publishedAt.toISOString()),
|
|
2527
|
+
confidence: article.confidence,
|
|
2528
|
+
source: "html",
|
|
2529
|
+
extractionMethod: "html-links",
|
|
2530
|
+
metadata: {
|
|
2531
|
+
extractionSource: article.source,
|
|
2532
|
+
htmlSource: url
|
|
2533
|
+
}
|
|
2534
|
+
});
|
|
2535
|
+
} catch (error) {
|
|
2536
|
+
console.warn(`\u26A0\uFE0F [Orchestrator] Error processing HTML article:`, error);
|
|
2537
|
+
continue;
|
|
2538
|
+
}
|
|
2539
|
+
}
|
|
2540
|
+
return candidates;
|
|
2541
|
+
}
|
|
2542
|
+
/**
|
|
2543
|
+
* Apply path filtering based on allowPaths and denyPaths
|
|
2544
|
+
*/
|
|
2545
|
+
applyPathFilters(articles, config) {
|
|
2546
|
+
if (!config.allowPaths?.length && !config.denyPaths?.length) {
|
|
2547
|
+
return articles;
|
|
2548
|
+
}
|
|
2549
|
+
return articles.filter((article) => {
|
|
2550
|
+
try {
|
|
2551
|
+
const urlObj = new URL(article.url);
|
|
2552
|
+
const path = urlObj.pathname.toLowerCase();
|
|
2553
|
+
if (config.denyPaths?.length) {
|
|
2554
|
+
for (const pattern of config.denyPaths) {
|
|
2555
|
+
if (this.matchesPattern(path, pattern)) {
|
|
2556
|
+
console.log(`\u{1F6AB} [Orchestrator] Article blocked by deny pattern "${pattern}": ${article.url}`);
|
|
2557
|
+
return false;
|
|
2558
|
+
}
|
|
2559
|
+
}
|
|
2560
|
+
}
|
|
2561
|
+
if (config.allowPaths?.length) {
|
|
2562
|
+
for (const pattern of config.allowPaths) {
|
|
2563
|
+
if (this.matchesPattern(path, pattern)) {
|
|
2564
|
+
return true;
|
|
2565
|
+
}
|
|
2566
|
+
}
|
|
2567
|
+
console.log(`\u{1F6AB} [Orchestrator] Article not matching any allow pattern: ${article.url}`);
|
|
2568
|
+
return false;
|
|
2569
|
+
}
|
|
2570
|
+
return true;
|
|
2571
|
+
} catch (error) {
|
|
2572
|
+
console.warn(`\u26A0\uFE0F [Orchestrator] Error applying path filters to ${article.url}:`, error);
|
|
2573
|
+
return true;
|
|
2574
|
+
}
|
|
2575
|
+
});
|
|
2576
|
+
}
|
|
2577
|
+
/**
|
|
2578
|
+
* Check if a path matches a pattern (supports wildcards)
|
|
2579
|
+
*/
|
|
2580
|
+
matchesPattern(path, pattern) {
|
|
2581
|
+
const regexPattern = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*").replace(/\\\?/g, ".");
|
|
2582
|
+
const regex = new RegExp("^" + regexPattern + "$", "i");
|
|
2583
|
+
return regex.test(path);
|
|
2584
|
+
}
|
|
2585
|
+
/**
|
|
2586
|
+
* Build scraping configuration from source config
|
|
2587
|
+
*/
|
|
2588
|
+
buildScrapingConfig(config) {
|
|
2589
|
+
const scrapingConfig = {};
|
|
2590
|
+
if (config.scrapeConfig?.selectors) {
|
|
2591
|
+
scrapingConfig.selectors = {
|
|
2592
|
+
articleLinks: config.scrapeConfig.selectors.articleLinks,
|
|
2593
|
+
titleSelectors: config.scrapeConfig.selectors.titleSelectors,
|
|
2594
|
+
dateSelectors: config.scrapeConfig.selectors.dateSelectors,
|
|
2595
|
+
excludeSelectors: config.scrapeConfig.selectors.excludeSelectors
|
|
2596
|
+
};
|
|
2597
|
+
}
|
|
2598
|
+
if (config.scrapeConfig?.filters) {
|
|
2599
|
+
scrapingConfig.filters = {
|
|
2600
|
+
minTitleLength: config.scrapeConfig.filters.minTitleLength,
|
|
2601
|
+
maxTitleLength: config.scrapeConfig.filters.maxTitleLength,
|
|
2602
|
+
includePatterns: config.scrapeConfig.filters.includePatterns?.map((p) => new RegExp(p, "i")),
|
|
2603
|
+
excludePatterns: config.scrapeConfig.filters.excludePatterns?.map((p) => new RegExp(p, "i"))
|
|
2604
|
+
};
|
|
2605
|
+
}
|
|
2606
|
+
if (config.scrapeConfig?.limits) {
|
|
2607
|
+
scrapingConfig.limits = config.scrapeConfig.limits;
|
|
2608
|
+
}
|
|
2609
|
+
return scrapingConfig;
|
|
2610
|
+
}
|
|
2611
|
+
/**
|
|
2612
|
+
* Extract title from URL as fallback
|
|
2613
|
+
*/
|
|
2614
|
+
extractTitleFromUrl(url) {
|
|
2615
|
+
try {
|
|
2616
|
+
const urlObj = new URL(url);
|
|
2617
|
+
const pathParts = urlObj.pathname.split("/").filter(Boolean);
|
|
2618
|
+
const lastPart = pathParts[pathParts.length - 1] || urlObj.hostname;
|
|
2619
|
+
return lastPart.replace(/[-_]/g, " ").replace(/\.(html|htm|php|asp|jsp)$/i, "").split(" ").map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()).join(" ");
|
|
2620
|
+
} catch {
|
|
2621
|
+
return "Untitled Article";
|
|
2622
|
+
}
|
|
2623
|
+
}
|
|
2624
|
+
/**
|
|
2625
|
+
* Create a consistent GUID for an article
|
|
2626
|
+
*/
|
|
2627
|
+
createGuid(url, publishedAt) {
|
|
2628
|
+
return import_crypto2.default.createHash("sha256").update(url + publishedAt).digest("hex");
|
|
2629
|
+
}
|
|
2630
|
+
/**
|
|
2631
|
+
* Finalize processing result
|
|
2632
|
+
*/
|
|
2633
|
+
finalizeResult(result) {
|
|
2634
|
+
const endTime = Date.now();
|
|
2635
|
+
result.processingTime = endTime - (Date.now() - result.processingTime);
|
|
2636
|
+
result.sourceInfo.extractionStats = {
|
|
2637
|
+
attempted: result.articles.length,
|
|
2638
|
+
successful: result.articles.filter((a) => a.confidence >= 0.5).length,
|
|
2639
|
+
failed: result.errors.length,
|
|
2640
|
+
filtered: 0
|
|
2641
|
+
// This would be calculated during filtering
|
|
2642
|
+
};
|
|
2643
|
+
result.articles.sort((a, b) => {
|
|
2644
|
+
const confidenceDiff = b.confidence - a.confidence;
|
|
2645
|
+
if (Math.abs(confidenceDiff) > 0.1) return confidenceDiff;
|
|
2646
|
+
return b.publishedAt.getTime() - a.publishedAt.getTime();
|
|
2647
|
+
});
|
|
2648
|
+
result.articles = result.articles.slice(0, this.maxArticlesPerSource);
|
|
2649
|
+
console.log(`\u{1F3AD} [Orchestrator] Processing complete: ${result.articles.length} articles in ${result.processingTime}ms`);
|
|
2650
|
+
return result;
|
|
2651
|
+
}
|
|
2652
|
+
/**
|
|
2653
|
+
* Extract full content for articles (optional enhancement step)
|
|
2654
|
+
*/
|
|
2655
|
+
async enhanceWithFullContent(articles, maxArticles = 10) {
|
|
2656
|
+
console.log(`\u{1F4D6} [Orchestrator] Enhancing ${Math.min(articles.length, maxArticles)} articles with full content`);
|
|
2657
|
+
const toEnhance = articles.filter((a) => !a.content || a.content.length < 500).slice(0, maxArticles);
|
|
2658
|
+
for (const article of toEnhance) {
|
|
2659
|
+
try {
|
|
2660
|
+
const extractedContent = await globalContentExtractor2.extractContent(article.url);
|
|
2661
|
+
if (extractedContent) {
|
|
2662
|
+
article.content = extractedContent.content;
|
|
2663
|
+
article.excerpt = extractedContent.excerpt || article.excerpt;
|
|
2664
|
+
article.confidence = Math.min(article.confidence + 0.1, 1);
|
|
2665
|
+
article.metadata = {
|
|
2666
|
+
...article.metadata,
|
|
2667
|
+
fullContentExtracted: true,
|
|
2668
|
+
extractionMethod: extractedContent.extractionMethod,
|
|
2669
|
+
wordCount: extractedContent.wordCount,
|
|
2670
|
+
readingTime: extractedContent.readingTime
|
|
2671
|
+
};
|
|
2672
|
+
}
|
|
2673
|
+
} catch (error) {
|
|
2674
|
+
console.warn(`\u26A0\uFE0F [Orchestrator] Failed to enhance article ${article.url}:`, error);
|
|
2675
|
+
continue;
|
|
2676
|
+
}
|
|
2677
|
+
}
|
|
2678
|
+
console.log(`\u{1F4D6} [Orchestrator] Content enhancement complete`);
|
|
2679
|
+
return articles;
|
|
2680
|
+
}
|
|
2681
|
+
/**
|
|
2682
|
+
* Validate orchestrator configuration
|
|
2683
|
+
*/
|
|
2684
|
+
static validateConfig(config) {
|
|
2685
|
+
try {
|
|
2686
|
+
return SourceConfigSchema.parse(config);
|
|
2687
|
+
} catch (error) {
|
|
2688
|
+
if (error instanceof import_zod.z.ZodError) {
|
|
2689
|
+
throw new Error(`Invalid source configuration: ${error.errors.map((e) => e.message).join(", ")}`);
|
|
2690
|
+
}
|
|
2691
|
+
throw error;
|
|
2692
|
+
}
|
|
2693
|
+
}
|
|
2694
|
+
/**
|
|
2695
|
+
* Get source statistics
|
|
2696
|
+
*/
|
|
2697
|
+
async getSourceStats(url) {
|
|
2698
|
+
const robotsCheck = await globalRobotsChecker2.isAllowed(url);
|
|
2699
|
+
const discoveredFeeds = await globalRSSDiscovery.discoverFeeds(url);
|
|
2700
|
+
let hasSitemap = false;
|
|
2701
|
+
let estimatedArticleCount = 0;
|
|
2702
|
+
try {
|
|
2703
|
+
const urlObj = new URL(url);
|
|
2704
|
+
const sitemaps = await globalSitemapParser.discoverSitemaps(urlObj.hostname);
|
|
2705
|
+
hasSitemap = sitemaps.length > 0;
|
|
2706
|
+
if (hasSitemap) {
|
|
2707
|
+
const recentEntries = await globalSitemapParser.getRecentEntries(urlObj.hostname, { hoursBack: 48, maxEntries: 100 });
|
|
2708
|
+
estimatedArticleCount = recentEntries.length;
|
|
2709
|
+
}
|
|
2710
|
+
} catch (error) {
|
|
2711
|
+
}
|
|
2712
|
+
return {
|
|
2713
|
+
robotsCompliant: robotsCheck.allowed,
|
|
2714
|
+
hasRSSFeed: discoveredFeeds.length > 0,
|
|
2715
|
+
hasSitemap,
|
|
2716
|
+
detectedType: discoveredFeeds.length > 0 ? "rss" : hasSitemap ? "sitemap" : "html",
|
|
2717
|
+
estimatedArticleCount
|
|
2718
|
+
};
|
|
2719
|
+
}
|
|
2720
|
+
};
|
|
2721
|
+
// Shared module-level SourceOrchestrator instance, constructed with no arguments.
var globalSourceOrchestrator = new SourceOrchestrator();
|
|
2722
|
+
|
|
2723
|
+
// src/quality/quality-scorer.ts
|
|
2724
|
+
// Default weights for article quality scoring. The five weights add up to 1.
var DEFAULT_QUALITY_CONFIG = {
  // Content validation (length, quality, ratio)
  contentWeight: 0.6,
  // Publication date presence
  dateWeight: 0.12,
  // Author/byline presence
  authorWeight: 0.08,
  // Schema.org metadata
  schemaWeight: 0.08,
  // Substantial reading time (2+ min)
  readingTimeWeight: 0.12,
  // Minimum score to pass (50%)
  threshold: 0.5
};
|
|
2738
|
+
// URL paths rejected by default. A trailing "/*" marks a wildcard entry.
var DEFAULT_DENY_PATHS = [
  // Landing pages
  "/", "/index", "/index.html",
  // Company pages
  "/about", "/about/*",
  "/careers", "/careers/*",
  "/jobs", "/jobs/*",
  "/contact", "/contact/*",
  "/team", "/team/*",
  // Legal pages
  "/privacy", "/terms", "/legal/*",
  // Taxonomy and listing pages
  "/tag/*", "/tags/*",
  "/category/*", "/categories/*",
  "/author/*", "/authors/*",
  "/archive/*",
  // Search
  "/search", "/search/*"
];
|
|
2765
|
+
/**
 * Run basic sanity checks on extracted article content.
 *
 * Starts from a score of 1 and subtracts a penalty per failed check:
 *   - text shorter than 200 characters: 0.5
 *   - title shorter than 10 or longer than 200 characters: 0.2
 *   - text-to-HTML length ratio below 10%: 0.2 (only when both the HTML
 *     `content` and `textContent` are present)
 *
 * @param {object} extracted - Extracted article; reads `textContent`,
 *   `title` and `content`.
 * @returns {{isValid: boolean, score: number, reasons: string[]}} `isValid`
 *   is true when the unclamped score is at least 0.5; `score` is clamped to
 *   the range 0..1; `reasons` lists every failed check.
 */
function validateContent(extracted) {
  const reasons = [];
  let score = 1;
  const penalize = (amount, reason) => {
    reasons.push(reason);
    score -= amount;
  };

  if ((extracted.textContent?.length || 0) < 200) {
    penalize(0.5, "Content too short (< 200 characters)");
  }

  const titleLength = extracted.title?.length || 0;
  if (titleLength < 10 || titleLength > 200) {
    penalize(0.2, "Title length invalid (must be 10-200 characters)");
  }

  if (extracted.content && extracted.textContent) {
    const textToHtmlRatio = extracted.textContent.length / extracted.content.length;
    if (textToHtmlRatio < 0.1) {
      penalize(0.2, "Low text-to-HTML ratio (< 10%)");
    }
  }

  return {
    isValid: score >= 0.5,
    // Clamp between 0-1
    score: Math.max(0, Math.min(1, score)),
    reasons
  };
}
|
|
2795
|
+
/**
 * Compute a 0..1 quality score for an extracted article.
 *
 * The score is the content-validation score scaled by `contentWeight`, plus
 * a fixed bonus for each signal present: a published time, a byline, an
 * article-type JSON-LD schema, and a reading time of at least 2.
 *
 * JSON-LD handling is tolerant: `structured.jsonLd` may be a single node or
 * an array, null or non-object entries are ignored, `@type` may be a string
 * or an array of strings, and nodes nested under `@graph` are inspected too.
 *
 * @param {object} extracted - Extracted article data.
 * @param {object} [config={}] - Overrides merged over `DEFAULT_QUALITY_CONFIG`.
 * @returns {number} Score capped at 1.
 */
function calculateArticleQualityScore(extracted, config = {}) {
  const finalConfig = { ...DEFAULT_QUALITY_CONFIG, ...config };
  const ARTICLE_TYPES = ["Article", "NewsArticle", "BlogPosting", "TechArticle", "ScholarlyArticle"];

  // True when a JSON-LD node (or a node inside its @graph) declares an article type.
  const describesArticle = (node) => {
    if (node === null || typeof node !== "object") return false;
    const declared = node["@type"];
    const types = Array.isArray(declared) ? declared : [declared];
    if (types.some((type) => ARTICLE_TYPES.includes(type))) return true;
    const graph = node["@graph"];
    return Array.isArray(graph) && graph.some(describesArticle);
  };

  let score = validateContent(extracted).score * finalConfig.contentWeight;
  if (extracted.publishedTime) {
    score += finalConfig.dateWeight;
  }
  if (extracted.byline) {
    score += finalConfig.authorWeight;
  }
  const jsonLd = extracted.structured?.jsonLd;
  if (jsonLd) {
    const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
    if (nodes.some(describesArticle)) {
      score += finalConfig.schemaWeight;
    }
  }
  if (extracted.readingTime && extracted.readingTime >= 2) {
    score += finalConfig.readingTimeWeight;
  }
  return Math.min(score, 1);
}
|
|
2821
|
+
/**
 * Decide whether a URL's path is on the deny list.
 *
 * Patterns are either exact paths ("/about") or directory wildcards ending
 * in "/*" ("/about/*"). A wildcard matches the directory itself and anything
 * below it, but not sibling paths that merely share the prefix:
 * "/about/*" matches "/about" and "/about/team", not "/about-our-launch".
 * A trailing slash on the URL path is ignored for comparison. Matching is
 * case-sensitive.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string[]} [denyPaths=DEFAULT_DENY_PATHS] - Patterns to test against.
 * @returns {boolean} True when the path matches a pattern; false otherwise,
 *   including when the URL cannot be parsed.
 */
function shouldDenyUrl(url, denyPaths = DEFAULT_DENY_PATHS) {
  try {
    const path = new URL(url).pathname;
    // "/privacy/" should be treated like "/privacy"; the root stays "/".
    const trimmed = path.replace(/\/+$/, "") || "/";
    return denyPaths.some((pattern) => {
      if (pattern === path || pattern === trimmed) return true;
      if (!pattern.endsWith("/*")) return false;
      const prefix = pattern.slice(0, -2);
      // Require a path-segment boundary after the prefix.
      return trimmed === prefix || path.startsWith(`${prefix}/`);
    });
  } catch {
    return false;
  }
}
|
|
2837
|
+
/**
 * Break an article's quality score into its weighted components.
 *
 * Each component is either its configured weight or 0, except
 * `contentValidation`, which is the validation score scaled by
 * `contentWeight`. `total` is the plain sum of the components and
 * `passesThreshold` compares it against `threshold`.
 *
 * JSON-LD handling is tolerant: `structured.jsonLd` may be a single node or
 * an array, null or non-object entries are ignored, `@type` may be a string
 * or an array of strings, and nodes nested under `@graph` are inspected too.
 *
 * @param {object} extracted - Extracted article data.
 * @param {object} [config={}] - Overrides merged over `DEFAULT_QUALITY_CONFIG`.
 * @returns {{contentValidation: number, publishedDate: number, author: number, schema: number, readingTime: number, total: number, passesThreshold: boolean}}
 */
function getQualityBreakdown(extracted, config = {}) {
  const finalConfig = { ...DEFAULT_QUALITY_CONFIG, ...config };
  const ARTICLE_TYPES = ["Article", "NewsArticle", "BlogPosting", "TechArticle", "ScholarlyArticle"];

  // True when a JSON-LD node (or a node inside its @graph) declares an article type.
  const describesArticle = (node) => {
    if (node === null || typeof node !== "object") return false;
    const declared = node["@type"];
    const types = Array.isArray(declared) ? declared : [declared];
    if (types.some((type) => ARTICLE_TYPES.includes(type))) return true;
    const graph = node["@graph"];
    return Array.isArray(graph) && graph.some(describesArticle);
  };

  const validation = validateContent(extracted);
  const hasReadingTime = extracted.readingTime && extracted.readingTime >= 2;
  const breakdown = {
    contentValidation: validation.score * finalConfig.contentWeight,
    publishedDate: extracted.publishedTime ? finalConfig.dateWeight : 0,
    author: extracted.byline ? finalConfig.authorWeight : 0,
    schema: 0,
    readingTime: hasReadingTime ? finalConfig.readingTimeWeight : 0,
    total: 0,
    passesThreshold: false
  };

  const jsonLd = extracted.structured?.jsonLd;
  if (jsonLd) {
    const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
    if (nodes.some(describesArticle)) {
      breakdown.schema = finalConfig.schemaWeight;
    }
  }

  breakdown.total = breakdown.contentValidation + breakdown.publishedDate + breakdown.author + breakdown.schema + breakdown.readingTime;
  breakdown.passesThreshold = breakdown.total >= finalConfig.threshold;
  return breakdown;
}
|
|
2863
|
+
|
|
2864
|
+
// src/formatters/html-to-markdown.ts
|
|
2865
|
+
var import_turndown = __toESM(require("turndown"));
|
|
2866
|
+
/**
 * Convert an HTML fragment to Markdown with Turndown.
 *
 * Uses ATX headings, fenced code blocks, "-" bullets, "*" emphasis and "**"
 * strong. Scripts, styles, navigation chrome and form controls are dropped
 * entirely. The raw Turndown output is then passed through
 * `smartParagraphDetection` and `normalizeWhitespace`.
 *
 * @param {string} html - HTML to convert.
 * @returns {string} Markdown text; "" for empty or missing input.
 */
function htmlToMarkdown(html) {
  if (!html) return "";
  const turndownService = new import_turndown.default({
    // Use # for headings
    headingStyle: "atx",
    // Use ``` for code blocks
    codeBlockStyle: "fenced",
    // Use - for lists
    bulletListMarker: "-",
    // Use * for emphasis
    emDelimiter: "*",
    // Use ** for strong
    strongDelimiter: "**"
  });
  turndownService.remove([
    "script",
    "style",
    "nav",
    "header",
    "footer",
    "aside",
    "form",
    "button",
    "input",
    "select",
    "textarea",
    "iframe",
    "noscript"
  ]);
  turndownService.addRule("cleanAttributes", {
    filter: ["div", "span", "p", "section", "article"],
    // Rules added with addRule take precedence over Turndown's built-in
    // paragraph and default-block handling. Returning bare content here glued
    // adjacent <p>/<div> blocks together, so block-level elements are padded
    // with blank lines, as Turndown's default replacement does.
    replacement: (content, node) => node.isBlock ? `\n\n${content}\n\n` : content
  });
  const rawMarkdown = turndownService.turndown(html);
  return normalizeWhitespace(smartParagraphDetection(rawMarkdown));
}
|
|
2906
|
+
/**
 * Insert blank lines around headings and after lists in Markdown text.
 *
 * For each line, one blank line is appended after it when:
 *   - it is a heading and the next line is non-empty and not a heading;
 *   - the next line is a heading, this line is non-empty and not a heading,
 *     and the previous line is non-empty;
 *   - it is a list item and the next line is non-empty and not a list item.
 * The rules are independent, so more than one may fire for the same line.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with the extra blank lines inserted.
 */
function smartParagraphDetection(markdown) {
  const HEADING = /^#{1,6}\s/;
  const LIST_ITEM = /^[-*+]\s/;
  const lines = markdown.split("\n");
  const output = [];

  lines.forEach((current, index) => {
    const previous = index > 0 ? lines[index - 1] : "";
    const following = index < lines.length - 1 ? lines[index + 1] : "";
    const currentIsHeading = HEADING.test(current);
    const followingIsHeading = HEADING.test(following);

    output.push(current);
    if (currentIsHeading && following && !followingIsHeading) {
      output.push("");
    }
    if (followingIsHeading && current && !currentIsHeading && previous !== "") {
      output.push("");
    }
    if (LIST_ITEM.test(current) && following && !LIST_ITEM.test(following)) {
      output.push("");
    }
  });

  return output.join("\n");
}
|
|
2926
|
+
/**
 * Tidy whitespace in Markdown text.
 *
 * Trims every line, collapses runs of blank lines to a single blank line and
 * trims the whole document. Lines are trimmed before blank-line runs are
 * collapsed, so whitespace-only lines cannot leave extra blank lines behind.
 * Inside fenced code blocks (delimited by lines starting with ```) leading
 * indentation is preserved and only trailing whitespace is removed.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Normalized Markdown.
 */
function normalizeWhitespace(markdown) {
  let insideFence = false;
  const lines = markdown.split("\n").map((line) => {
    const trimmed = line.trim();
    if (trimmed.startsWith("```")) {
      insideFence = !insideFence;
      return trimmed;
    }
    // Indentation is significant inside code blocks.
    return insideFence ? line.trimEnd() : trimmed;
  });
  return lines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
2932
|
+
/**
 * Remove page chrome from an HTML string with regular expressions.
 *
 * Drops <nav>, <header>, <footer>, <aside> and <form> elements, plus <div>
 * elements whose double-quoted class or id contains one of: nav, menu,
 * sidebar, advertisement, ads, social, share, comment, popup, modal. It then
 * strips double-quoted class, id and data-* attributes from what remains.
 *
 * Element bodies may span multiple lines. Matching is non-greedy and not
 * nesting-aware: a removed <div> ends at the first closing </div>, so nested
 * divs can leave a trailing fragment behind.
 *
 * @param {string} html - HTML to clean.
 * @returns {string} Cleaned HTML; "" for empty or missing input.
 */
function stripNonArticleContent(html) {
  if (!html) return "";
  // [\s\S] instead of "." so element bodies can contain newlines.
  const nonArticlePatterns = [
    /<nav\b[^>]*>[\s\S]*?<\/nav>/gi,
    /<header\b[^>]*>[\s\S]*?<\/header>/gi,
    /<footer\b[^>]*>[\s\S]*?<\/footer>/gi,
    /<aside\b[^>]*>[\s\S]*?<\/aside>/gi,
    /<form\b[^>]*>[\s\S]*?<\/form>/gi,
    /<div[^>]*class="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi,
    /<div[^>]*id="[^"]*(?:nav|menu|sidebar|advertisement|ads|social|share|comment|popup|modal)[^"]*"[^>]*>[\s\S]*?<\/div>/gi
  ];
  let cleaned = html;
  for (const pattern of nonArticlePatterns) {
    cleaned = cleaned.replace(pattern, "");
  }
  // Require leading whitespace so `id="..."` cannot match the tail of
  // another attribute such as `data-id="..."`.
  return cleaned.replace(/\s+(?:class|id|data-[^\s=]*)="[^"]*"/gi, "");
}
|
|
2952
|
+
/**
 * Convert HTML to Markdown, optionally stripping non-article chrome first.
 *
 * @param {string} html - HTML to convert.
 * @param {object} [options={}] - Conversion options.
 * @param {boolean} [options.cleanNonArticle=true] - Run
 *   `stripNonArticleContent` before converting.
 * @param {boolean} [options.smartParagraphs] - Accepted but has no effect here.
 * @returns {string} Markdown produced by `htmlToMarkdown`.
 */
function convertToMarkdown(html, options = {}) {
  // Only an explicit undefined falls back to the default of true.
  const shouldClean = options.cleanNonArticle === undefined ? true : options.cleanNonArticle;
  const input = shouldClean ? stripNonArticleContent(html) : html;
  return htmlToMarkdown(input);
}
|
|
2964
|
+
|
|
2965
|
+
// src/formatters/text-cleaner.ts
|
|
2966
|
+
/**
 * Clean plain text extracted from a page.
 *
 * Runs, in order: HTML entity decoding, whitespace normalization and
 * paragraph detection, then trims the result.
 *
 * @param {string} text - Raw text.
 * @returns {string} Cleaned text; "" for empty or missing input.
 */
function cleanText(text) {
  if (!text) return "";
  const steps = [decodeHTMLEntities, normalizeWhitespace2, detectParagraphs];
  return steps.reduce((current, step) => step(current), text).trim();
}
|
|
2975
|
+
/**
 * Decode a small set of named HTML entities plus decimal and hexadecimal
 * numeric character references.
 *
 * Decoding is done in a single pass, so an escaped ampersand followed by
 * another entity name is decoded once only, not twice. Unknown named
 * entities and numeric references above U+10FFFF are left unchanged.
 * Numeric references use `String.fromCodePoint`, so astral characters such
 * as emoji decode correctly.
 *
 * NOTE(review): this listing shows the entity table with its keys already
 * entity-decoded; the entity names below are restored from the replacement
 * values. Confirm against the original source.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} Text with the supported entities decoded.
 */
function decodeHTMLEntities(text) {
  // Keyed by entity name, without the leading ampersand or trailing semicolon.
  const namedEntities = new Map([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["ndash", "\u2013"],
    ["mdash", "\u2014"],
    ["hellip", "\u2026"],
    ["ldquo", '"'],
    ["rdquo", '"'],
    ["lsquo", "\u2018"],
    ["rsquo", "\u2019"]
  ]);
  const MAX_CODE_POINT = 0x10ffff;
  const ENTITY = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g;

  return text.replace(ENTITY, (match, decimal, hex, name) => {
    if (name !== undefined) {
      return namedEntities.has(name) ? namedEntities.get(name) : match;
    }
    const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex, 16);
    // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
  });
}
|
|
3006
|
+
/**
 * Normalize whitespace in plain text.
 *
 * Every run of tabs and spaces becomes a single space, each line is trimmed,
 * and runs of three or more newlines are reduced to two.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalizeWhitespace2(text) {
  return text
    .replace(/[ \t]+/g, " ")
    .split("\n")
    .map((line) => line.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n");
}
|
|
3013
|
+
/**
 * Re-flow plain text into paragraphs.
 *
 * Blank lines in the input are dropped. A blank line is then inserted after
 * a line when that line ends in ".", "!" or "?" and is longer than 40
 * characters, and the next line starts with an uppercase ASCII letter or a
 * digit and is longer than 20 characters.
 *
 * @param {string} text - Text to re-flow.
 * @returns {string} Text with paragraph breaks inserted.
 */
function detectParagraphs(text) {
  const lines = text.split("\n").filter((line) => line.trim().length > 0);
  return lines
    .flatMap((line, index) => {
      const next = lines[index + 1] ?? "";
      const endsSentence = /[.!?]$/.test(line);
      const nextStartsSentence = /^[A-Z0-9]/.test(next);
      // The length checks avoid breaking after short lines.
      const startsNewParagraph = endsSentence && nextStartsSentence && line.length > 40 && next.length > 20;
      return startsNewParagraph ? [line, ""] : [line];
    })
    .join("\n");
}
|
|
3027
|
+
/**
 * Remove every http:// or https:// URL from the text.
 *
 * A URL runs up to the next whitespace character; surrounding whitespace is
 * left in place.
 *
 * @param {string} text - Text to scrub.
 * @returns {string} Text without the URLs.
 */
function removeUrls(text) {
  const URL_PATTERN = /https?:\/\/\S+/g;
  return text.replace(URL_PATTERN, "");
}
|
|
3030
|
+
/**
 * Shorten text to at most `maxLength` characters, ending with an ellipsis.
 *
 * Text that already fits is returned unchanged. Otherwise the text is cut so
 * that the result, including the one-character ellipsis, is no longer than
 * `maxLength`. The cut backs up to the previous space when it would fall
 * inside a word, never splits a surrogate pair, and drops trailing
 * whitespace before the ellipsis. For `maxLength` below 1 only the ellipsis
 * is returned.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum length of the result in UTF-16 units.
 * @returns {string} The original text or its truncated form.
 */
function truncateText(text, maxLength) {
  if (text.length <= maxLength) return text;
  const ELLIPSIS = "\u2026";
  // Reserve room for the ellipsis so the result honours maxLength.
  const budget = Math.max(0, maxLength - ELLIPSIS.length);
  let head = text.substring(0, budget);

  // Do not leave the first half of a surrogate pair dangling.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  // Only back up to a space when the cut landed in the middle of a word.
  const cutsMidWord = !/\s/.test(text.charAt(head.length));
  if (cutsMidWord) {
    const lastSpace = head.lastIndexOf(" ");
    if (lastSpace > 0) {
      head = head.substring(0, lastSpace);
    }
  }
  return head.trimEnd() + ELLIPSIS;
}
|
|
3039
|
+
/**
 * Reduce an HTML string to plain text.
 *
 * Removes <script> and <style> elements with their content, HTML comments,
 * and all remaining tags, then collapses whitespace runs to single spaces
 * and trims. Block-level tags and <br> are replaced by a space so text from
 * adjacent blocks stays separated; inline tags are removed without adding
 * whitespace. Entities are not decoded.
 *
 * @param {string} html - HTML to strip.
 * @returns {string} Plain text on a single line.
 */
function stripHTML(html) {
  const BLOCK_TAG = /<\/?(?:address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|figure|footer|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>/gi;
  return html
    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "")
    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "")
    // Comments may contain ">" and would otherwise be cut at the first one.
    .replace(/<!--[\s\S]*?-->/g, "")
    // Keep words from neighbouring blocks apart.
    .replace(BLOCK_TAG, " ")
    .replace(/<[^>]+>/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
3042
|
+
|
|
3043
|
+
// src/scraper.ts
|
|
3044
|
+
async function scrape(url, options = {}) {
|
|
3045
|
+
const startTime = Date.now();
|
|
3046
|
+
const {
|
|
3047
|
+
sourceType = "auto",
|
|
3048
|
+
maxArticles = 50,
|
|
3049
|
+
extractFullContent = true,
|
|
3050
|
+
denyPaths = DEFAULT_DENY_PATHS,
|
|
3051
|
+
qualityThreshold = 0.6
|
|
3052
|
+
} = options;
|
|
3053
|
+
console.log(`\u{1F3AF} [Scraper] Starting scrape of ${url}`);
|
|
3054
|
+
console.log(` Source type: ${sourceType}`);
|
|
3055
|
+
console.log(` Max articles: ${maxArticles}`);
|
|
3056
|
+
console.log(` Extract full content: ${extractFullContent}`);
|
|
3057
|
+
console.log(` Quality threshold: ${qualityThreshold}`);
|
|
3058
|
+
const errors = [];
|
|
3059
|
+
let totalDiscovered = 0;
|
|
3060
|
+
let afterDenyFilter = 0;
|
|
3061
|
+
let afterContentValidation = 0;
|
|
3062
|
+
let afterQualityFilter = 0;
|
|
3063
|
+
try {
|
|
3064
|
+
const config = {
|
|
3065
|
+
sourceType,
|
|
3066
|
+
denyPaths
|
|
3067
|
+
};
|
|
3068
|
+
const orchestrationResult = await globalSourceOrchestrator.processSource(url, config);
|
|
3069
|
+
totalDiscovered = orchestrationResult.articles.length;
|
|
3070
|
+
errors.push(...orchestrationResult.errors);
|
|
3071
|
+
console.log(`\u{1F4E6} [Scraper] Discovered ${totalDiscovered} candidate articles`);
|
|
3072
|
+
let candidateArticles = orchestrationResult.articles.filter((article) => {
|
|
3073
|
+
const shouldDeny = shouldDenyUrl(article.url, denyPaths);
|
|
3074
|
+
return !shouldDeny;
|
|
3075
|
+
});
|
|
3076
|
+
afterDenyFilter = candidateArticles.length;
|
|
3077
|
+
console.log(`\u{1F6AB} [Scraper] After deny filter: ${afterDenyFilter} articles`);
|
|
3078
|
+
let scrapedArticles = [];
|
|
3079
|
+
if (extractFullContent && candidateArticles.length > 0) {
|
|
3080
|
+
console.log(`\u{1F4D6} [Scraper] Extracting full content for ${Math.min(candidateArticles.length, maxArticles)} articles`);
|
|
3081
|
+
const articlesToProcess = candidateArticles.slice(0, maxArticles * 2);
|
|
3082
|
+
for (const candidate of articlesToProcess) {
|
|
3083
|
+
try {
|
|
3084
|
+
const extractedContent = await globalContentExtractor.extractContent(candidate.url);
|
|
3085
|
+
if (!extractedContent) {
|
|
3086
|
+
errors.push(`Failed to extract content from ${candidate.url}`);
|
|
3087
|
+
continue;
|
|
3088
|
+
}
|
|
3089
|
+
const markdown = convertToMarkdown(extractedContent.content || "");
|
|
3090
|
+
const cleanedText = cleanText(extractedContent.textContent || "");
|
|
3091
|
+
const qualityScore = calculateArticleQualityScore(extractedContent);
|
|
3092
|
+
scrapedArticles.push({
|
|
3093
|
+
url: candidate.url,
|
|
3094
|
+
title: extractedContent.title || candidate.title,
|
|
3095
|
+
publishedDate: extractedContent.publishedTime,
|
|
3096
|
+
description: extractedContent.excerpt || candidate.excerpt,
|
|
3097
|
+
fullContent: extractedContent.content,
|
|
3098
|
+
fullContentMarkdown: markdown,
|
|
3099
|
+
fullContentText: cleanedText,
|
|
3100
|
+
confidence: candidate.confidence,
|
|
3101
|
+
source: extractedContent.structured?.jsonLd ? "structured-data" : extractedContent.byline ? "meta-data" : "link-text",
|
|
3102
|
+
qualityScore,
|
|
3103
|
+
metadata: {
|
|
3104
|
+
...candidate.metadata,
|
|
3105
|
+
wordCount: extractedContent.wordCount,
|
|
3106
|
+
readingTime: extractedContent.readingTime,
|
|
3107
|
+
byline: extractedContent.byline,
|
|
3108
|
+
siteName: extractedContent.siteName,
|
|
3109
|
+
lang: extractedContent.lang
|
|
3110
|
+
}
|
|
3111
|
+
});
|
|
3112
|
+
if (scrapedArticles.length >= maxArticles) {
|
|
3113
|
+
break;
|
|
3114
|
+
}
|
|
3115
|
+
} catch (error) {
|
|
3116
|
+
const errorMsg = error instanceof Error ? error.message : "Unknown error";
|
|
3117
|
+
errors.push(`Error processing ${candidate.url}: ${errorMsg}`);
|
|
3118
|
+
continue;
|
|
3119
|
+
}
|
|
3120
|
+
}
|
|
3121
|
+
} else {
|
|
3122
|
+
scrapedArticles = candidateArticles.slice(0, maxArticles).map((candidate) => ({
|
|
3123
|
+
url: candidate.url,
|
|
3124
|
+
title: candidate.title,
|
|
3125
|
+
publishedDate: candidate.publishedAt,
|
|
3126
|
+
description: candidate.excerpt,
|
|
3127
|
+
confidence: candidate.confidence,
|
|
3128
|
+
source: candidate.source === "rss" ? "structured-data" : candidate.source === "sitemap" ? "meta-data" : "link-text",
|
|
3129
|
+
qualityScore: 0.5,
|
|
3130
|
+
// Default score when not extracting full content
|
|
3131
|
+
metadata: candidate.metadata
|
|
3132
|
+
}));
|
|
3133
|
+
}
|
|
3134
|
+
afterContentValidation = scrapedArticles.length;
|
|
3135
|
+
console.log(`\u2705 [Scraper] After content extraction: ${afterContentValidation} articles`);
|
|
3136
|
+
const filteredArticles = scrapedArticles.filter((article) => {
|
|
3137
|
+
const score = article.qualityScore ?? 0;
|
|
3138
|
+
return score >= qualityThreshold;
|
|
3139
|
+
});
|
|
3140
|
+
afterQualityFilter = filteredArticles.length;
|
|
3141
|
+
console.log(`\u2B50 [Scraper] After quality filter: ${afterQualityFilter} articles (threshold: ${qualityThreshold})`);
|
|
3142
|
+
const processingTime = Date.now() - startTime;
|
|
3143
|
+
const result = {
|
|
3144
|
+
url,
|
|
3145
|
+
detectedType: orchestrationResult.sourceInfo.detectedType,
|
|
3146
|
+
confidence: afterQualityFilter > 0 ? "high" : afterContentValidation > 0 ? "medium" : "low",
|
|
3147
|
+
articles: filteredArticles,
|
|
3148
|
+
extractionStats: {
|
|
3149
|
+
attempted: totalDiscovered,
|
|
3150
|
+
successful: afterQualityFilter,
|
|
3151
|
+
failed: errors.length,
|
|
3152
|
+
filtered: totalDiscovered - afterQualityFilter,
|
|
3153
|
+
totalDiscovered,
|
|
3154
|
+
afterDenyFilter,
|
|
3155
|
+
afterContentValidation,
|
|
3156
|
+
afterQualityFilter
|
|
3157
|
+
},
|
|
3158
|
+
processingTime,
|
|
3159
|
+
errors,
|
|
3160
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
3161
|
+
};
|
|
3162
|
+
console.log(`\u2728 [Scraper] Complete! ${afterQualityFilter} articles in ${processingTime}ms`);
|
|
3163
|
+
return result;
|
|
3164
|
+
} catch (error) {
|
|
3165
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
3166
|
+
console.error(`\u274C [Scraper] Fatal error:`, errorMessage);
|
|
3167
|
+
return {
|
|
3168
|
+
url,
|
|
3169
|
+
detectedType: "unknown",
|
|
3170
|
+
confidence: "low",
|
|
3171
|
+
articles: [],
|
|
3172
|
+
extractionStats: {
|
|
3173
|
+
attempted: totalDiscovered,
|
|
3174
|
+
successful: 0,
|
|
3175
|
+
failed: 1,
|
|
3176
|
+
filtered: totalDiscovered,
|
|
3177
|
+
totalDiscovered,
|
|
3178
|
+
afterDenyFilter,
|
|
3179
|
+
afterContentValidation,
|
|
3180
|
+
afterQualityFilter
|
|
3181
|
+
},
|
|
3182
|
+
processingTime: Date.now() - startTime,
|
|
3183
|
+
errors: [errorMessage, ...errors],
|
|
3184
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
3185
|
+
};
|
|
3186
|
+
}
|
|
3187
|
+
}
|
|
3188
|
+
/**
 * Collect article URLs for a site without extracting article bodies.
 *
 * Delegates to `scrape` with `extractFullContent: false` and
 * `qualityThreshold: 0`. On that path `scrape` builds each article straight
 * from the discovered candidate and gives it a default quality score of 0.5,
 * so the zero threshold keeps every candidate it returns.
 *
 * @param {string} url - Site or feed URL, forwarded unchanged to `scrape`.
 * @param {{ maxArticles?: number }} [options] - Optional settings.
 *   `maxArticles` caps how many candidates `scrape` returns (default 100,
 *   the value that used to be hard-coded here).
 * @returns {Promise<Array>} The `url` field of every article in the result,
 *   in the order `scrape` returned them.
 */
async function quickScrape(url, options) {
  // `?.` / `??` keep every legacy call shape working: a missing, null, or
  // non-object second argument (for example the index supplied by
  // `urls.map(quickScrape)`) falls back to the historical cap of 100.
  const maxArticles = options?.maxArticles ?? 100;
  const { articles } = await scrape(url, {
    extractFullContent: false,
    maxArticles,
    qualityThreshold: 0
  });
  return articles.map((article) => article.url);
}
|
|
3196
|
+
|
|
3197
|
+
// src/index.ts
|
|
3198
|
+
// Library version string; it is one of the names listed in the CommonJS
// export annotation at the end of this file.
// NOTE(review): presumably mirrors the "version" field of package.json —
// confirm both are bumped together on release.
var VERSION = "0.1.0";
|
|
3199
|
+
// Annotate the CommonJS export names for ESM import in node.
// The leading `0 &&` short-circuits, so the assignment below is never
// evaluated at runtime and none of the listed identifiers are read here.
// The statement exists only so that tooling which scans the source text
// (Node's named-export detection for CommonJS modules) can see these names.
// NOTE(review): the real export wiring is presumably done near the top of
// the bundle, outside this section — keep this list in sync with it.
|
|
3200
|
+
0 && (module.exports = {
|
|
3201
|
+
CircuitBreaker,
|
|
3202
|
+
ContentExtractor,
|
|
3203
|
+
DEFAULT_DENY_PATHS,
|
|
3204
|
+
DEFAULT_QUALITY_CONFIG,
|
|
3205
|
+
HTMLScraper,
|
|
3206
|
+
RSSDiscovery,
|
|
3207
|
+
RobotsChecker,
|
|
3208
|
+
ScrapingRateLimiter,
|
|
3209
|
+
SitemapParser,
|
|
3210
|
+
SourceOrchestrator,
|
|
3211
|
+
VERSION,
|
|
3212
|
+
calculateArticleQualityScore,
|
|
3213
|
+
circuitBreakers,
|
|
3214
|
+
cleanText,
|
|
3215
|
+
convertToMarkdown,
|
|
3216
|
+
decodeHTMLEntities,
|
|
3217
|
+
detectParagraphs,
|
|
3218
|
+
fetchRSSFeed,
|
|
3219
|
+
getQualityBreakdown,
|
|
3220
|
+
globalContentExtractor,
|
|
3221
|
+
globalRSSDiscovery,
|
|
3222
|
+
globalRateLimiter,
|
|
3223
|
+
globalRobotsChecker,
|
|
3224
|
+
globalSitemapParser,
|
|
3225
|
+
globalSourceOrchestrator,
|
|
3226
|
+
htmlToMarkdown,
|
|
3227
|
+
normalizeWhitespace,
|
|
3228
|
+
quickScrape,
|
|
3229
|
+
removeUrls,
|
|
3230
|
+
scrape,
|
|
3231
|
+
shouldDenyUrl,
|
|
3232
|
+
stripHTML,
|
|
3233
|
+
stripNonArticleContent,
|
|
3234
|
+
truncateText,
|
|
3235
|
+
validateContent
|
|
3236
|
+
});
|