@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @package @tyroneross/scraper-testing
|
|
4
|
+
* Article quality scoring system
|
|
5
|
+
*
|
|
6
|
+
* No LLM required - uses metadata and content signals to determine article quality
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.DEFAULT_ALLOW_PATHS = exports.DEFAULT_DENY_PATHS = exports.DEFAULT_QUALITY_CONFIG = void 0;
|
|
10
|
+
exports.validateContent = validateContent;
|
|
11
|
+
exports.calculateArticleQualityScore = calculateArticleQualityScore;
|
|
12
|
+
exports.isNonEnglishLocalePath = isNonEnglishLocalePath;
|
|
13
|
+
exports.shouldDenyUrl = shouldDenyUrl;
|
|
14
|
+
exports.getQualityBreakdown = getQualityBreakdown;
|
|
15
|
+
/**
|
|
16
|
+
* Default quality score configuration
|
|
17
|
+
* These weights were optimized through testing with 1,788 real articles
|
|
18
|
+
*/
|
|
19
|
+
exports.DEFAULT_QUALITY_CONFIG = {
|
|
20
|
+
contentWeight: 0.60, // Content validation (length, quality, ratio)
|
|
21
|
+
dateWeight: 0.12, // Publication date presence
|
|
22
|
+
authorWeight: 0.08, // Author/byline presence
|
|
23
|
+
schemaWeight: 0.08, // Schema.org metadata
|
|
24
|
+
readingTimeWeight: 0.12, // Substantial reading time (2+ min)
|
|
25
|
+
threshold: 0.50, // Minimum score to pass (50%)
|
|
26
|
+
};
|
|
27
|
+
/**
|
|
28
|
+
* Default patterns to block non-article pages
|
|
29
|
+
* These cover common non-article paths across websites
|
|
30
|
+
*/
|
|
31
|
+
exports.DEFAULT_DENY_PATHS = [
|
|
32
|
+
'/',
|
|
33
|
+
'/index',
|
|
34
|
+
'/index.html',
|
|
35
|
+
'/about',
|
|
36
|
+
'/about/*',
|
|
37
|
+
'/careers',
|
|
38
|
+
'/careers/*',
|
|
39
|
+
'/jobs',
|
|
40
|
+
'/jobs/*',
|
|
41
|
+
'/contact',
|
|
42
|
+
'/contact/*',
|
|
43
|
+
'/team',
|
|
44
|
+
'/team/*',
|
|
45
|
+
'/privacy',
|
|
46
|
+
'/terms',
|
|
47
|
+
'/legal/*',
|
|
48
|
+
'/tag/*',
|
|
49
|
+
'/tags/*',
|
|
50
|
+
'/category/*',
|
|
51
|
+
'/categories/*',
|
|
52
|
+
'/author/*',
|
|
53
|
+
'/authors/*',
|
|
54
|
+
'/archive/*',
|
|
55
|
+
'/search',
|
|
56
|
+
'/search/*',
|
|
57
|
+
// Non-article content pages
|
|
58
|
+
'/use-cases',
|
|
59
|
+
'/use-cases/*',
|
|
60
|
+
'/solutions',
|
|
61
|
+
'/solutions/*',
|
|
62
|
+
'/products',
|
|
63
|
+
'/products/*',
|
|
64
|
+
'/services',
|
|
65
|
+
'/services/*',
|
|
66
|
+
'/partners',
|
|
67
|
+
'/partners/*',
|
|
68
|
+
'/support',
|
|
69
|
+
'/support/*',
|
|
70
|
+
'/help',
|
|
71
|
+
'/help/*',
|
|
72
|
+
'/faq',
|
|
73
|
+
'/faq/*',
|
|
74
|
+
'/pricing',
|
|
75
|
+
'/pricing/*',
|
|
76
|
+
'/features',
|
|
77
|
+
'/features/*',
|
|
78
|
+
'/demo',
|
|
79
|
+
'/demo/*',
|
|
80
|
+
'/login',
|
|
81
|
+
'/signup',
|
|
82
|
+
'/register',
|
|
83
|
+
'/account',
|
|
84
|
+
'/account/*',
|
|
85
|
+
'/dashboard',
|
|
86
|
+
'/dashboard/*',
|
|
87
|
+
'/settings',
|
|
88
|
+
'/settings/*',
|
|
89
|
+
'/trust-center',
|
|
90
|
+
'/trust-center/*',
|
|
91
|
+
'/ai-trust-center',
|
|
92
|
+
'/ai-trust-center/*',
|
|
93
|
+
'/safety',
|
|
94
|
+
'/safety/*',
|
|
95
|
+
'/compliance',
|
|
96
|
+
'/compliance/*',
|
|
97
|
+
'/certification',
|
|
98
|
+
'/certification/*',
|
|
99
|
+
'/industries',
|
|
100
|
+
'/industries/*',
|
|
101
|
+
'/platform',
|
|
102
|
+
'/platform/*',
|
|
103
|
+
'/developers',
|
|
104
|
+
'/developers/*',
|
|
105
|
+
'/documentation',
|
|
106
|
+
'/documentation/*',
|
|
107
|
+
'/docs',
|
|
108
|
+
'/docs/*',
|
|
109
|
+
'/api',
|
|
110
|
+
'/api/*',
|
|
111
|
+
'/download',
|
|
112
|
+
'/download/*',
|
|
113
|
+
'/downloads',
|
|
114
|
+
'/downloads/*',
|
|
115
|
+
// Non-English language paths (filter to English only)
|
|
116
|
+
'/cs-cz/*', // Czech
|
|
117
|
+
'/de-de/*', // German
|
|
118
|
+
'/de-at/*', // German (Austria)
|
|
119
|
+
'/de-ch/*', // German (Swiss)
|
|
120
|
+
'/fr-fr/*', // French
|
|
121
|
+
'/fr-ca/*', // French (Canada)
|
|
122
|
+
'/es-es/*', // Spanish
|
|
123
|
+
'/es-mx/*', // Spanish (Mexico)
|
|
124
|
+
'/es-la/*', // Spanish (Latin America)
|
|
125
|
+
'/it-it/*', // Italian
|
|
126
|
+
'/ja-jp/*', // Japanese
|
|
127
|
+
'/ko-kr/*', // Korean
|
|
128
|
+
'/zh-cn/*', // Chinese (Simplified)
|
|
129
|
+
'/zh-tw/*', // Chinese (Traditional)
|
|
130
|
+
'/zh-hk/*', // Chinese (Hong Kong)
|
|
131
|
+
'/pt-br/*', // Portuguese (Brazil)
|
|
132
|
+
'/pt-pt/*', // Portuguese
|
|
133
|
+
'/ru-ru/*', // Russian
|
|
134
|
+
'/pl-pl/*', // Polish
|
|
135
|
+
'/nl-nl/*', // Dutch
|
|
136
|
+
'/sv-se/*', // Swedish
|
|
137
|
+
'/nb-no/*', // Norwegian
|
|
138
|
+
'/da-dk/*', // Danish
|
|
139
|
+
'/fi-fi/*', // Finnish
|
|
140
|
+
'/tr-tr/*', // Turkish
|
|
141
|
+
'/ar-ae/*', // Arabic
|
|
142
|
+
'/he-il/*', // Hebrew
|
|
143
|
+
'/th-th/*', // Thai
|
|
144
|
+
'/vi-vn/*', // Vietnamese
|
|
145
|
+
'/id-id/*', // Indonesian
|
|
146
|
+
// Short language codes
|
|
147
|
+
'/de/*',
|
|
148
|
+
'/fr/*',
|
|
149
|
+
'/es/*',
|
|
150
|
+
'/it/*',
|
|
151
|
+
'/ja/*',
|
|
152
|
+
'/ko/*',
|
|
153
|
+
'/zh/*',
|
|
154
|
+
'/pt/*',
|
|
155
|
+
'/ru/*',
|
|
156
|
+
'/pl/*',
|
|
157
|
+
'/nl/*',
|
|
158
|
+
];
|
|
159
|
+
/**
|
|
160
|
+
* Default patterns for content sections (blog, news, articles)
|
|
161
|
+
* Used for allow-listing paths when scraping
|
|
162
|
+
*/
|
|
163
|
+
exports.DEFAULT_ALLOW_PATHS = [
|
|
164
|
+
'/news/*',
|
|
165
|
+
'/blog/*',
|
|
166
|
+
'/articles/*',
|
|
167
|
+
'/posts/*',
|
|
168
|
+
'/stories/*',
|
|
169
|
+
'/press/*',
|
|
170
|
+
'/updates/*',
|
|
171
|
+
'/announcements/*',
|
|
172
|
+
'/insights/*',
|
|
173
|
+
'/resources/*',
|
|
174
|
+
'/publications/*',
|
|
175
|
+
'/research/*',
|
|
176
|
+
'/engineering/*',
|
|
177
|
+
];
|
|
178
|
+
/**
 * Tier 2 filter: judge whether extracted content looks like a real article body.
 *
 * Starts from a perfect score of 1.0 and subtracts a fixed penalty for each
 * failed check:
 *   - body text shorter than 200 characters        (-0.5)
 *   - title shorter than 10 or longer than 200     (-0.2)
 *   - plain text under 10% of the HTML length      (-0.2)
 *
 * @param extracted - Extracted content; reads `textContent`, `title` and `content`
 * @returns `isValid` (raw score >= 0.5), the score clamped to 0-1, and one
 *          human-readable reason per failed check
 */
function validateContent(extracted) {
    const bodyLength = extracted.textContent?.length || 0;
    const headlineLength = extracted.title?.length || 0;
    // The ratio is only evaluated when both the markup and its plain text are present.
    const markupIsMostlyTags = Boolean(extracted.content && extracted.textContent) &&
        extracted.textContent.length / extracted.content.length < 0.1;
    // Checks are listed in the order their penalties are applied.
    const checks = [
        {
            failed: bodyLength < 200,
            penalty: 0.5, // Heavy penalty for short content
            reason: 'Content too short (< 200 characters)',
        },
        {
            failed: headlineLength < 10 || headlineLength > 200,
            penalty: 0.2,
            reason: 'Title length invalid (must be 10-200 characters)',
        },
        {
            failed: markupIsMostlyTags,
            penalty: 0.2,
            reason: 'Low text-to-HTML ratio (< 10%)',
        },
    ];
    const reasons = [];
    let score = 1.0;
    for (const check of checks) {
        if (!check.failed) {
            continue;
        }
        reasons.push(check.reason);
        score -= check.penalty;
    }
    return {
        // Validity is decided on the raw score, before clamping.
        isValid: score >= 0.5,
        score: Math.max(0, Math.min(1.0, score)),
        reasons,
    };
}
|
|
218
|
+
/**
 * Calculate article quality score (Tier 3 filtering).
 *
 * The score is the sum of the weighted components below (default weights
 * shown; each can be overridden through `config`):
 * - Content validation (60%): `validateContent` score times the content weight
 * - Publication date (12%): granted when `publishedTime` is truthy
 * - Author/byline (8%): granted when `byline` is truthy
 * - Schema.org metadata (8%): granted when any JSON-LD node declares an
 *   article type (Article, NewsArticle, BlogPosting, TechArticle, ScholarlyArticle)
 * - Reading time (12%): granted when `readingTime` is at least 2
 *
 * @param extracted - Extracted content from article
 * @param config - Optional overrides merged over DEFAULT_QUALITY_CONFIG
 * @returns Quality score, capped at 1.0
 */
function calculateArticleQualityScore(extracted, config = {}) {
    const finalConfig = { ...exports.DEFAULT_QUALITY_CONFIG, ...config };
    let score = 0;
    // Tier 2: content validation contributes proportionally to its own score.
    const validation = validateContent(extracted);
    score += validation.score * finalConfig.contentWeight;
    // Tier 3: each metadata signal is all-or-nothing.
    if (extracted.publishedTime) {
        score += finalConfig.dateWeight;
    }
    if (extracted.byline) {
        score += finalConfig.authorWeight;
    }
    if (extracted.structured?.jsonLd) {
        const articleTypes = ['Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle'];
        const schemas = Array.isArray(extracted.structured.jsonLd)
            ? extracted.structured.jsonLd
            : [extracted.structured.jsonLd];
        const hasArticleType = schemas.some((node) => {
            // `?.` tolerates null/undefined entries in the JSON-LD list.
            const declared = node?.['@type'];
            // JSON-LD allows `@type` to be one string or an array of strings.
            const types = Array.isArray(declared) ? declared : [declared];
            return types.some((type) => articleTypes.includes(type));
        });
        if (hasArticleType) {
            score += finalConfig.schemaWeight;
        }
    }
    // Articles should take at least 2 minutes to read.
    if (extracted.readingTime && extracted.readingTime >= 2) {
        score += finalConfig.readingTimeWeight;
    }
    // Custom weights may sum to more than 1, so cap the result.
    return Math.min(score, 1.0);
}
|
|
271
|
+
/**
 * Matches a path whose first segment looks like a locale in xx-yy or xx_yy
 * form (e.g. /en-us/, /fr-be, /de_ch/...). Case-insensitive.
 */
const LOCALE_PATH_REGEX = /^\/[a-z]{2}[-_][a-z]{2}(?:\/|$)/i;
/**
 * Matches only the US English locale prefix (/en-us or /en_us). Case-insensitive.
 */
const US_ENGLISH_LOCALE_REGEX = /^\/en[-_]us(?:\/|$)/i;
/**
 * Check if a path should be filtered out because it carries a locale prefix
 * other than US English.
 *
 * Returns true (should filter) for:
 * - /fr-be/, /de-de/, /ja-jp/, /zh-cn/ (non-English locales)
 * - /en-gb/, /en-au/, /en-ca/ (non-US English locales)
 *
 * Returns false (should keep) for:
 * - /en-us/ (US English only)
 * - /blog/, /news/, /articles/ (no locale prefix - treated as US English)
 *
 * @param path - URL pathname, starting with "/"
 * @returns True when the path has a locale prefix that is not en-us
 */
function isNonEnglishLocalePath(path) {
    // If path has a locale prefix (xx-yy format)
    if (LOCALE_PATH_REGEX.test(path)) {
        // Only allow /en-us/
        return !US_ENGLISH_LOCALE_REGEX.test(path);
    }
    // No locale prefix - allow (assume US English)
    return false;
}
/**
 * Check if a URL should be denied based on its path.
 *
 * A URL is denied when its path has a non-US-English locale prefix, or when
 * it matches one of `denyPaths`:
 * - a plain pattern must equal the path (a trailing slash on the path is ignored);
 * - a pattern ending in `/*` matches the path before the `/*` and everything
 *   below it. The match respects segment boundaries, so `/de/*` matches
 *   `/de` and `/de/blog` but not `/design` or `/developers`.
 *
 * @param url - Absolute URL to check
 * @param denyPaths - Patterns to deny (supports a trailing `/*` wildcard)
 * @returns True if the URL should be denied; false for URLs that cannot be parsed
 */
function shouldDenyUrl(url, denyPaths = exports.DEFAULT_DENY_PATHS) {
    try {
        const path = new URL(url).pathname;
        // First check for non-English locale patterns (e.g., /fr-be/, /de-ch/)
        if (isNonEnglishLocalePath(path)) {
            return true;
        }
        // "/privacy/" and "/privacy" address the same page; the root "/" is kept as-is.
        const trimmedPath = path.length > 1 ? path.replace(/\/+$/, '') || '/' : path;
        return denyPaths.some((pattern) => {
            // Exact match
            if (pattern === path || pattern === trimmedPath) {
                return true;
            }
            // Wildcard match (e.g., /about/*)
            if (pattern.endsWith('/*')) {
                const prefix = pattern.slice(0, -2); // Remove /*
                // Require a segment boundary so "/de/*" does not swallow "/design".
                return path === prefix || path.startsWith(`${prefix}/`);
            }
            return false;
        });
    }
    catch {
        return false; // Invalid URL (or unusable pattern list), don't deny
    }
}
|
|
330
|
+
/**
 * Get quality score breakdown for debugging.
 * Reports how much each signal contributed, which helps explain why an
 * article scored a certain way.
 *
 * @param extracted - Extracted content from article
 * @param config - Optional overrides merged over DEFAULT_QUALITY_CONFIG
 * @returns Per-component contributions (`contentValidation`, `publishedDate`,
 *          `author`, `schema`, `readingTime`), their sum in `total` (not
 *          capped), and `passesThreshold` (total >= configured threshold)
 */
function getQualityBreakdown(extracted, config = {}) {
    const finalConfig = { ...exports.DEFAULT_QUALITY_CONFIG, ...config };
    const validation = validateContent(extracted);
    const breakdown = {
        contentValidation: validation.score * finalConfig.contentWeight,
        publishedDate: extracted.publishedTime ? finalConfig.dateWeight : 0,
        author: extracted.byline ? finalConfig.authorWeight : 0,
        schema: 0,
        readingTime: extracted.readingTime && extracted.readingTime >= 2 ? finalConfig.readingTimeWeight : 0,
        total: 0,
        passesThreshold: false,
    };
    // Schema.org signal: granted when any JSON-LD node declares an article type.
    if (extracted.structured?.jsonLd) {
        const articleTypes = ['Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle'];
        const schemas = Array.isArray(extracted.structured.jsonLd)
            ? extracted.structured.jsonLd
            : [extracted.structured.jsonLd];
        const hasArticleType = schemas.some((node) => {
            // `?.` tolerates null/undefined entries in the JSON-LD list.
            const declared = node?.['@type'];
            // JSON-LD allows `@type` to be one string or an array of strings.
            const types = Array.isArray(declared) ? declared : [declared];
            return types.some((type) => articleTypes.includes(type));
        });
        if (hasArticleType) {
            breakdown.schema = finalConfig.schemaWeight;
        }
    }
    breakdown.total =
        breakdown.contentValidation +
            breakdown.publishedDate +
            breakdown.author +
            breakdown.schema +
            breakdown.readingTime;
    breakdown.passesThreshold = breakdown.total >= finalConfig.threshold;
    return breakdown;
}
|
|
376
|
+
//# sourceMappingURL=quality-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"quality-scorer.js","sourceRoot":"","sources":["../../lib/quality-scorer.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAiLH,0CAsCC;AAgBD,oEAoDC;AAuBD,wDAQC;AASD,sCAyBC;AAUD,kDAyDC;AA3ZD;;;GAGG;AACU,QAAA,sBAAsB,GAAiC;IAClE,aAAa,EAAE,IAAI,EAAO,8CAA8C;IACxE,UAAU,EAAE,IAAI,EAAW,4BAA4B;IACvD,YAAY,EAAE,IAAI,EAAS,yBAAyB;IACpD,YAAY,EAAE,IAAI,EAAS,sBAAsB;IACjD,iBAAiB,EAAE,IAAI,EAAI,oCAAoC;IAC/D,SAAS,EAAE,IAAI,EAAY,8BAA8B;CAC1D,CAAC;AAEF;;;GAGG;AACU,QAAA,kBAAkB,GAAG;IAChC,GAAG;IACH,QAAQ;IACR,aAAa;IACb,QAAQ;IACR,UAAU;IACV,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,UAAU;IACV,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,SAAS;IACT,aAAa;IACb,eAAe;IACf,WAAW;IACX,YAAY;IACZ,YAAY;IACZ,SAAS;IACT,WAAW;IACX,4BAA4B;IAC5B,YAAY;IACZ,cAAc;IACd,YAAY;IACZ,cAAc;IACd,WAAW;IACX,aAAa;IACb,WAAW;IACX,aAAa;IACb,WAAW;IACX,aAAa;IACb,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,MAAM;IACN,QAAQ;IACR,UAAU;IACV,YAAY;IACZ,WAAW;IACX,aAAa;IACb,OAAO;IACP,SAAS;IACT,QAAQ;IACR,SAAS;IACT,WAAW;IACX,UAAU;IACV,YAAY;IACZ,YAAY;IACZ,cAAc;IACd,WAAW;IACX,aAAa;IACb,eAAe;IACf,iBAAiB;IACjB,kBAAkB;IAClB,oBAAoB;IACpB,SAAS;IACT,WAAW;IACX,aAAa;IACb,eAAe;IACf,gBAAgB;IAChB,kBAAkB;IAClB,aAAa;IACb,eAAe;IACf,WAAW;IACX,aAAa;IACb,aAAa;IACb,eAAe;IACf,gBAAgB;IAChB,kBAAkB;IAClB,OAAO;IACP,SAAS;IACT,MAAM;IACN,QAAQ;IACR,WAAW;IACX,aAAa;IACb,YAAY;IACZ,cAAc;IACd,sDAAsD;IACtD,UAAU,EAAG,QAAQ;IACrB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,mBAAmB;IAChC,UAAU,EAAG,iBAAiB;IAC9B,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,kBAAkB;IAC/B,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,mBAAmB;IAChC,UAAU,EAAG,0BAA0B;IACvC,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,WAAW;IACxB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,uBAAuB;IACpC,UAAU,EAAG,wBAAwB;IACrC,UAAU,EAAG,sBAAsB;IACnC,UAAU,EAAG,sBAAsB;IACnC,UAAU,EAAG,aAAa;IAC1B,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,QAAQ;IACrB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,YAAY;IACzB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,OAAO;IACpB,UAAU,EAAG,aAAa;IAC1B,UAAU,EAAG,aAAa;IAC1
B,uBAAuB;IACvB,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;CACR,CAAC;AAEF;;;GAGG;AACU,QAAA,mBAAmB,GAAG;IACjC,SAAS;IACT,SAAS;IACT,aAAa;IACb,UAAU;IACV,YAAY;IACZ,UAAU;IACV,YAAY;IACZ,kBAAkB;IAClB,aAAa;IACb,cAAc;IACd,iBAAiB;IACjB,aAAa;IACb,gBAAgB;CACjB,CAAC;AAEF;;;;;;GAMG;AACH,SAAgB,eAAe,CAAC,SAA2B;IACzD,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,KAAK,GAAG,GAAG,CAAC,CAAC,8CAA8C;IAE/D,gDAAgD;IAChD,MAAM,aAAa,GAAG,SAAS,CAAC,WAAW,EAAE,MAAM,IAAI,CAAC,CAAC;IACzD,IAAI,aAAa,GAAG,GAAG,EAAE,CAAC;QACxB,OAAO,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;QACrD,KAAK,IAAI,GAAG,CAAC,CAAC,kCAAkC;IAClD,CAAC;IAED,0CAA0C;IAC1C,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,MAAM,IAAI,CAAC,CAAC;IACjD,IAAI,WAAW,GAAG,EAAE,IAAI,WAAW,GAAG,GAAG,EAAE,CAAC;QAC1C,OAAO,CAAC,IAAI,CAAC,kDAAkD,CAAC,CAAC;QACjE,KAAK,IAAI,GAAG,CAAC;IACf,CAAC;IAED,yDAAyD;IACzD,IAAI,SAAS,CAAC,OAAO,IAAI,SAAS,CAAC,WAAW,EAAE,CAAC;QAC/C,MAAM,UAAU,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC;QAC5C,MAAM,UAAU,GAAG,SAAS,CAAC,WAAW,CAAC,MAAM,CAAC;QAChD,MAAM,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;QAEtC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;YAChB,OAAO,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;YAC/C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;IACH,CAAC;IAED,yDAAyD;IACzD,MAAM,OAAO,GAAG,KAAK,IAAI,GAAG,CAAC;IAE7B,OAAO;QACL,OAAO;QACP,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,EAAE,oBAAoB;QAC9D,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,SAAgB,4BAA4B,CAC1C,SAA2B,EAC3B,SAA6B,EAAE;IAE/B,MAAM,WAAW,GAAG,EAAE,GAAG,8BAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IAC7D,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,qDAAqD;IACrD,MAAM,UAAU,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAC9C,KAAK,IAAI,UAAU,CAAC,KAAK,GAAG,WAAW,CAAC,aAAa,CAAC;IAEtD,mCAAmC;IAEnC,+CAA+C;IAC/C,IAAI,SAAS,CAAC,aAAa,EAAE,CAAC;QAC5B,KAAK,IAAI,WAAW,CAAC,UAAU,CAAC;IAClC,CAAC;IAED,2CAA2C;IAC3C,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;QACrB,KAAK,IAAI,WAAW,CAAC,YAAY,CAAC;IACpC,CAAC;IAED,yDAAyD;IACzD,IAAI,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACjC,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YACxD,C
AAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM;YAC7B,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAElC,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YACxB,OAAO,CACL,IAAI,KAAK,SAAS;gBAClB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,kBAAkB,CAC5B,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,IAAI,cAAc,EAAE,CAAC;YACnB,KAAK,IAAI,WAAW,CAAC,YAAY,CAAC;QACpC,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,gDAAgD;IAChD,IAAI,SAAS,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC;QACxD,KAAK,IAAI,WAAW,CAAC,iBAAiB,CAAC;IACzC,CAAC;IAED,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC,aAAa;AAC5C,CAAC;AAED;;GAEG;AACH,MAAM,iBAAiB,GAAG,kCAAkC,CAAC;AAE7D;;GAEG;AACH,MAAM,uBAAuB,GAAG,sBAAsB,CAAC;AAEvD;;;;;;;;;;GAUG;AACH,SAAgB,sBAAsB,CAAC,IAAY;IACjD,6CAA6C;IAC7C,IAAI,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QACjC,qBAAqB;QACrB,OAAO,CAAC,uBAAuB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7C,CAAC;IACD,+CAA+C;IAC/C,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,aAAa,CAAC,GAAW,EAAE,YAAsB,0BAAkB;IACjF,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC;QAE7B,uEAAuE;QACvE,IAAI,sBAAsB,CAAC,IAAI,CAAC,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE;YAChC,cAAc;YACd,IAAI,OAAO,KAAK,IAAI;gBAAE,OAAO,IAAI,CAAC;YAElC,kCAAkC;YAClC,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,YAAY;gBACjD,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;YACjC,CAAC;YAED,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC,CAAC,0BAA0B;IAC1C,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,SAAgB,mBAAmB,CACjC,SAA2B,EAC3B,SAA6B,EAAE;IAU/B,MAAM,WAAW,GAAG,EAAE,GAAG,8BAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IAC7D,MAAM,UAAU,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAE9C,MAAM,SAAS,GAAG;QAChB,iBAAiB,EAAE,UAAU,CAAC,KAAK,GAAG,WAAW,CAAC,aAAa;QAC/D,aAAa,EAAE,SAAS,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACnE,MAAM,EAAE,SAA
S,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACvD,MAAM,EAAE,CAAC;QACT,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC;QACpG,KAAK,EAAE,CAAC;QACR,eAAe,EAAE,KAAK;KACvB,CAAC;IAEF,eAAe;IACf,IAAI,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACjC,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YACxD,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM;YAC7B,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAElC,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YACxB,OAAO,CACL,IAAI,KAAK,SAAS;gBAClB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,kBAAkB,CAC5B,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,IAAI,cAAc,EAAE,CAAC;YACnB,SAAS,CAAC,MAAM,GAAG,WAAW,CAAC,YAAY,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,SAAS,CAAC,KAAK;QACb,SAAS,CAAC,iBAAiB;YAC3B,SAAS,CAAC,aAAa;YACvB,SAAS,CAAC,MAAM;YAChB,SAAS,CAAC,MAAM;YAChB,SAAS,CAAC,WAAW,CAAC;IAExB,SAAS,CAAC,eAAe,GAAG,SAAS,CAAC,KAAK,IAAI,WAAW,CAAC,SAAS,CAAC;IAErE,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * One entry of a parsed feed, in the shape returned by `fetchRSSFeed`.
 */
export interface RSSItem {
    /** Entry title. */
    title: string;
    /** Entry URL. */
    link: string;
    /** Publication date as a string, as reported by the feed. */
    pubDate: string;
    /** Identifier for the entry. */
    guid: string;
    /** Entry body, when the feed provides one. */
    content?: string;
    /** Short text excerpt of the entry, when available. */
    contentSnippet?: string;
}
|
|
9
|
+
/**
 * Build a deduplication key for an article from its content instead of its URL.
 *
 * The key is derived from the normalized title, the publication day and the
 * source name, so the same article is recognised despite:
 * - URL tracking parameters (utm_source, etc.)
 * - URL redirects (HTTP vs HTTPS, www vs non-www)
 * - Cross-posting (same article on multiple sites)
 *
 * @param title - Article title
 * @param link - Article URL (legacy, kept for backward compatibility)
 * @param publishedAt - Publication date (defaults to now)
 * @param source - Source name (defaults to 'unknown')
 * @returns SHA-256 hash as hex string
 */
export declare function createGuidHash(
    title: string,
    link: string,
    publishedAt?: Date,
    source?: string,
): string;
|
|
23
|
+
/**
 * Fetch and parse the feed at `url`.
 *
 * @param url - Feed URL
 * @param sourceId - Optional identifier of the source the feed belongs to
 * @returns The feed's entries; an empty array when the feed has no items or
 *          cannot be fetched or parsed
 */
export declare function fetchRSSFeed(
    url: string,
    sourceId?: string,
): Promise<RSSItem[]>;
|
|
24
|
+
/**
 * Check whether `url` serves a usable RSS/Atom feed.
 *
 * @param url - Feed URL to validate
 * @returns A report whose `isValid` flag tells whether the feed is usable.
 *          `error` describes the problem when it is not; `feedTitle`,
 *          `itemCount` and `contentType` are filled in when known.
 */
export declare function validateRSSFeed(url: string): Promise<{
    /** True when the feed passed validation. */
    isValid: boolean;
    /** Human-readable reason for a failed validation. */
    error?: string;
    /** Title reported by the feed. */
    feedTitle?: string;
    /** Number of entries in the feed. */
    itemCount?: number;
    /** Content-Type header returned for the URL. */
    contentType?: string;
}>;
|
|
31
|
+
//# sourceMappingURL=rss-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rss-utils.d.ts","sourceRoot":"","sources":["../../lib/rss-utils.ts"],"names":[],"mappings":"AAUA,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,cAAc,CAC5B,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,IAAiB,EAC9B,MAAM,GAAE,MAAkB,GACzB,MAAM,CA0BR;AAED,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC,CA4CrF;AAGD,wBAAsB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IAC1D,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,CAAC,CA2FD"}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.createGuidHash = createGuidHash;
|
|
7
|
+
exports.fetchRSSFeed = fetchRSSFeed;
|
|
8
|
+
exports.validateRSSFeed = validateRSSFeed;
|
|
9
|
+
const rss_parser_1 = __importDefault(require("rss-parser"));
|
|
10
|
+
const crypto_1 = __importDefault(require("crypto"));
|
|
11
|
+
const parser = new rss_parser_1.default({
|
|
12
|
+
timeout: 15000, // Increased timeout
|
|
13
|
+
headers: {
|
|
14
|
+
'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)'
|
|
15
|
+
}
|
|
16
|
+
});
|
|
17
|
+
/**
 * Creates a content-based hash for article deduplication.
 * Uses normalized title + date + source instead of URL to handle:
 * - URL tracking parameters (utm_source, etc.)
 * - URL redirects (HTTP vs HTTPS, www vs non-www)
 * - Cross-posting (same article on multiple sites)
 *
 * The hashed key is `<normalized title>|<YYYY-MM-DD>|<normalized source>`.
 * When `publishedAt` cannot be interpreted as a date, the literal
 * `unknown-date` is used as the date part, so the result stays deterministic
 * instead of throwing.
 *
 * @param title - Article title
 * @param link - Article URL (legacy, kept for backward compatibility; not part of the hash)
 * @param publishedAt - Publication date (defaults to now). A date string or
 *                      timestamp is also accepted and parsed with `new Date()`.
 * @param source - Source name (defaults to 'unknown')
 * @returns SHA-256 hash as hex string
 */
function createGuidHash(title, link, publishedAt = new Date(), source = 'unknown') {
    // Normalize title: lowercase, collapse whitespace, remove punctuation.
    // NOTE: `\w` is ASCII-only, so non-ASCII letters are stripped as well and
    // titles written entirely in a non-Latin script normalize to the same
    // (empty) string. Left unchanged so previously generated hashes stay valid.
    const normalizedTitle = title
        .trim()
        .toLowerCase()
        .replace(/\s+/g, ' ')
        .replace(/[^\w\s]/g, '');
    // Extract date bucket (YYYY-MM-DD, UTC) - same day articles might be duplicates.
    // `toISOString()` throws on an Invalid Date, so validate before calling it;
    // `null` is mapped to NaN because `new Date(null)` would silently mean 1970.
    const parsedDate = publishedAt instanceof Date ? publishedAt : new Date(publishedAt ?? NaN);
    const dateKey = Number.isNaN(parsedDate.getTime())
        ? 'unknown-date'
        : parsedDate.toISOString().split('T')[0];
    // Normalize source name
    const normalizedSource = source.toLowerCase().trim();
    // Create composite key: title | date | source
    const composite = `${normalizedTitle}|${dateKey}|${normalizedSource}`;
    // Generate SHA-256 hash
    const hash = crypto_1.default.createHash('sha256').update(composite).digest('hex');
    // Log for debugging (in development)
    if (process.env.NODE_ENV === 'development') {
        console.log(`🔍 [GuidHash] Generated hash for: "${title.substring(0, 50)}..." from ${source} on ${dateKey}`);
    }
    return hash;
}
|
|
51
|
+
/**
 * Fetch the feed at `url` with the shared parser and map its entries to
 * plain items.
 *
 * Failures are logged and reported as an empty result rather than thrown, so
 * one broken feed does not abort the caller.
 *
 * @param url - Feed URL
 * @param sourceId - Accepted for interface compatibility; not used here
 * @returns The mapped feed entries, or an empty array when the feed has no
 *          items or could not be fetched/parsed
 */
async function fetchRSSFeed(url, sourceId) {
    try {
        console.log(`🔄 [RSS] Fetching feed from ${url}`);
        const feed = await parser.parseURL(url);
        if (!feed.items || feed.items.length === 0) {
            console.warn(`⚠️ [RSS] Feed from ${url} contains no items`);
            return [];
        }
        // Fill in defaults so every returned item has the required fields.
        const items = feed.items.map((item) => ({
            title: item.title || 'Untitled',
            link: item.link || '',
            pubDate: item.pubDate || new Date().toISOString(),
            guid: item.guid || item.link || crypto_1.default.randomUUID(),
            content: item.content || item['content:encoded'] || '',
            contentSnippet: item.contentSnippet || ''
        }));
        console.log(`✅ [RSS] Successfully fetched ${items.length} items from ${url}`);
        return items;
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        console.error(`❌ [RSS] Failed to fetch RSS from ${url}:`, errorMessage);
        // Log specific error types for debugging; the first matching hint wins.
        if (error instanceof Error) {
            const hints = [
                ['Invalid character', '🔍 [RSS] XML parsing error - feed may be malformed or contain HTML'],
                ['timeout', '🔍 [RSS] Request timeout - server may be slow or unreachable'],
                ['ENOTFOUND', '🔍 [RSS] Domain not found - check URL spelling'],
                ['ECONNREFUSED', '🔍 [RSS] Connection refused - server may be down'],
            ];
            const hint = hints.find(([needle]) => error.message.includes(needle));
            if (hint) {
                console.error(hint[1]);
            }
        }
        return [];
    }
}
|
|
92
|
+
// Enhanced RSS validation with better error handling
|
|
93
|
+
/**
 * Translate an error thrown during feed validation into a short,
 * user-facing message.
 *
 * `fetch()` reports network failures as a generic `TypeError: fetch failed`
 * whose underlying error (ENOTFOUND, ECONNREFUSED, ...) is attached as
 * `error.cause`, so both the error and its cause are inspected.
 *
 * @param {unknown} error - Value caught from the validation `try` block.
 * @returns {string} Message suitable for the `error` field of the result.
 */
function describeRSSValidationError(error) {
    if (!(error instanceof Error)) {
        return 'Unknown validation error';
    }
    const cause = Reflect.get(error, 'cause');
    const causeDetails = cause instanceof Error
        ? `${Reflect.get(cause, 'code') || ''} ${cause.message}`
        : '';
    const details = `${error.message} ${causeDetails}`;
    // AbortSignal.timeout() rejects with a "TimeoutError" DOMException;
    // "AbortError" is kept for manually aborted signals.
    if (error.name === 'TimeoutError' || error.name === 'AbortError' || /timed?\s*out/i.test(details)) {
        return 'Request timeout - URL may be unreachable';
    }
    if (details.includes('Invalid character')) {
        return 'Invalid RSS/XML format - feed may be malformed';
    }
    if (details.includes('ENOTFOUND')) {
        return 'Domain not found - check URL spelling';
    }
    if (details.includes('ECONNREFUSED')) {
        return 'Connection refused - server may be down';
    }
    return error.message;
}
/**
 * Check that a URL is reachable and serves a parseable, non-empty RSS/Atom feed.
 *
 * Steps: a HEAD request with a 10 s timeout checks reachability and the
 * content type, then the feed is fetched and parsed with the module-level
 * `parser` and must have a title and at least one item.
 *
 * This function never throws; every failure is reported through the result.
 *
 * @param {string} url - Feed URL to validate.
 * @returns {Promise<{isValid: boolean, error?: string, contentType?: string, feedTitle?: string, itemCount?: number}>}
 *   `isValid: true` with `feedTitle`, `itemCount` and `contentType` on success;
 *   otherwise `isValid: false` with an `error` message and whatever metadata
 *   was collected before the failure.
 */
async function validateRSSFeed(url) {
    try {
        console.log(`🔍 [Validation] Validating RSS feed: ${url}`);
        // First check if URL is reachable
        const response = await fetch(url, {
            method: 'HEAD',
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)'
            },
            signal: AbortSignal.timeout(10000)
        });
        if (!response.ok) {
            return {
                isValid: false,
                error: `HTTP ${response.status}: ${response.statusText}`,
                contentType: response.headers.get('content-type') || undefined
            };
        }
        const contentType = response.headers.get('content-type') || '';
        // Media types are case-insensitive, so match on a lowercased copy while
        // still returning the header value exactly as the server sent it.
        const normalizedType = contentType.toLowerCase();
        // Check if content type suggests RSS/XML
        const isRssContentType = normalizedType.includes('application/rss+xml') ||
            normalizedType.includes('application/xml') ||
            normalizedType.includes('text/xml') ||
            normalizedType.includes('application/atom+xml');
        if (!isRssContentType && normalizedType.includes('text/html')) {
            return {
                isValid: false,
                error: 'URL returns HTML content instead of RSS feed',
                contentType
            };
        }
        // Now fetch and parse the actual RSS content
        const feed = await parser.parseURL(url);
        if (!feed.title) {
            return {
                isValid: false,
                error: 'RSS feed has no title - may be malformed',
                contentType
            };
        }
        if (!feed.items || feed.items.length === 0) {
            return {
                isValid: false,
                error: 'RSS feed contains no items',
                contentType,
                feedTitle: feed.title
            };
        }
        console.log(`✅ [Validation] RSS feed validated successfully: ${feed.title} (${feed.items.length} items)`);
        return {
            isValid: true,
            feedTitle: feed.title,
            itemCount: feed.items.length,
            contentType
        };
    }
    catch (error) {
        const errorMessage = describeRSSValidationError(error);
        console.error(`❌ [Validation] RSS validation failed for ${url}:`, errorMessage);
        return {
            isValid: false,
            error: errorMessage
        };
    }
}
|
|
175
|
+
//# sourceMappingURL=rss-utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rss-utils.js","sourceRoot":"","sources":["../../lib/rss-utils.ts"],"names":[],"mappings":";;;;;AAgCA,wCA+BC;AAED,oCA4CC;AAGD,0CAiGC;AAjND,4DAAgC;AAChC,oDAA4B;AAE5B,MAAM,MAAM,GAAG,IAAI,oBAAM,CAAC;IACxB,OAAO,EAAE,KAAK,EAAE,oBAAoB;IACpC,OAAO,EAAE;QACP,YAAY,EAAE,6EAA6E;KAC5F;CACF,CAAC,CAAC;AAWH;;;;;;;;;;;;GAYG;AACH,SAAgB,cAAc,CAC5B,KAAa,EACb,IAAY,EACZ,cAAoB,IAAI,IAAI,EAAE,EAC9B,SAAiB,SAAS;IAE1B,sEAAsE;IACtE,MAAM,eAAe,GAAG,KAAK;SAC1B,IAAI,EAAE;SACN,WAAW,EAAE;SACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,yCAAyC;IAErE,2EAA2E;IAC3E,MAAM,OAAO,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAExD,wBAAwB;IACxB,MAAM,gBAAgB,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;IAErD,8CAA8C;IAC9C,MAAM,SAAS,GAAG,GAAG,eAAe,IAAI,OAAO,IAAI,gBAAgB,EAAE,CAAC;IAEtE,wBAAwB;IACxB,MAAM,IAAI,GAAG,gBAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAEzE,qCAAqC;IACrC,IAAI,OAAO,CAAC,GAAG,CAAC,QAAQ,KAAK,aAAa,EAAE,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,sCAAsC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,MAAM,OAAO,OAAO,EAAE,CAAC,CAAC;IAC/G,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAEM,KAAK,UAAU,YAAY,CAAC,GAAW,EAAE,QAAiB;IAC/D,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;IAEvB,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;QAElD,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QAExC,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO,CAAC,IAAI,CAAC,sBAAsB,GAAG,oBAAoB,CAAC,CAAC;YAC5D,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACpC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,UAAU;YAC/B,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACjD,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,IAAI,gBAAM,CAAC,UAAU,EAAE;YACnD,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE;YACtD,cAAc,EAAE,IAAI,CAAC,cAAc,IAAI,EAAE;SAC1C,CAAC,CAAC,CAAC;QAEJ,OAAO,CAAC,GAAG,CAAC,gCAAgC,KAAK
,CAAC,MAAM,eAAe,GAAG,EAAE,CAAC,CAAC;QAC9E,OAAO,KAAK,CAAC;IAEf,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QAC9E,OAAO,CAAC,KAAK,CAAC,oCAAoC,GAAG,GAAG,EAAE,YAAY,CAAC,CAAC;QAExE,yCAAyC;QACzC,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;gBAChD,OAAO,CAAC,KAAK,CAAC,oEAAoE,CAAC,CAAC;YACtF,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7C,OAAO,CAAC,KAAK,CAAC,8DAA8D,CAAC,CAAC;YAChF,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC/C,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;gBAClD,OAAO,CAAC,KAAK,CAAC,kDAAkD,CAAC,CAAC;YACpE,CAAC;QACH,CAAC;QAED,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,qDAAqD;AAC9C,KAAK,UAAU,eAAe,CAAC,GAAW;IAO/C,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,wCAAwC,GAAG,EAAE,CAAC,CAAC;QAE3D,kCAAkC;QAClC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,YAAY,EAAE,6EAA6E;aAC5F;YACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;SACnC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE;gBACxD,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,SAAS;aAC/D,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAE/D,yCAAyC;QACzC,MAAM,gBAAgB,GACpB,WAAW,CAAC,QAAQ,CAAC,qBAAqB,CAAC;YAC3C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC;YACvC,WAAW,CAAC,QAAQ,CAAC,UAAU,CAAC;YAChC,WAAW,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC;QAE/C,IAAI,CAAC,gBAAgB,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;YAC3D,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,8CAA8C;gBACrD,WAAW;aACZ,CAAC;QACJ,CAAC;QAED,6CAA6C;QAC7C,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QAExC,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;YAChB,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,0CAA0C;gBACjD,WAAW;aACZ,CAAC;QACJ,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,M
AAM,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,4BAA4B;gBACnC,WAAW;gBACX,SAAS,EAAE,IAAI,CAAC,KAAK;aACtB,CAAC;QACJ,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,mDAAmD,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,CAAC,MAAM,SAAS,CAAC,CAAC;QAE1G,OAAO;YACL,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;YAC5B,WAAW;SACZ,CAAC;IAEJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,YAAY,GAAG,0BAA0B,CAAC;QAE9C,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBACrE,YAAY,GAAG,0CAA0C,CAAC;YAC5D,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;gBACvD,YAAY,GAAG,gDAAgD,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,CAAC;gBAC3D,YAAY,GAAG,uCAAuC,CAAC;YACzD,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;gBAClD,YAAY,GAAG,yCAAyC,CAAC;YAC3D,CAAC;iBAAM,CAAC;gBACN,YAAY,GAAG,KAAK,CAAC,OAAO,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,4CAA4C,GAAG,GAAG,EAAE,YAAY,CAAC,CAAC;QAEhF,OAAO;YACL,OAAO,EAAE,KAAK;YACd,KAAK,EAAE,YAAY;SACpB,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
 * Optional tuning values accepted by the `ScrapingRateLimiter` constructor
 * and by `createRateLimiter`. Every field may be omitted; the defaults that
 * apply in that case are not visible in this declaration file.
 */
export interface RateLimiterConfig {
|
|
2
|
+
/** Request rate budget. NOTE(review): presumably applied per host — confirm against the implementation. */
requestsPerSecond?: number;
|
|
3
|
+
/** Upper bound on backoff delay. Presumably milliseconds (the presets use 30000 and 15000) — TODO confirm. */
maxBackoff?: number;
|
|
4
|
+
/** Concurrency cap. Presumably across all hosts — TODO confirm. */
maxConcurrent?: number;
|
|
5
|
+
/** Concurrency cap for a single host, going by the name — TODO confirm. */
maxConcurrentPerHost?: number;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Named, read-only configurations. Each preset carries the same four fields
 * as `RateLimiterConfig`, typed as literal values. The preset names are the
 * keys that make up the `RateLimiterPreset` type.
 */
export declare const RATE_LIMITER_PRESETS: {
|
|
8
|
+
/** Lowest limits of the three presets: 1 request/second, 10 concurrent, 2 per host. */
readonly conservative: {
|
|
9
|
+
readonly requestsPerSecond: 1;
|
|
10
|
+
readonly maxBackoff: 30000;
|
|
11
|
+
readonly maxConcurrent: 10;
|
|
12
|
+
readonly maxConcurrentPerHost: 2;
|
|
13
|
+
};
|
|
14
|
+
/** Middle preset: 2 requests/second, 20 concurrent, 3 per host. */
readonly moderate: {
|
|
15
|
+
readonly requestsPerSecond: 2;
|
|
16
|
+
readonly maxBackoff: 30000;
|
|
17
|
+
readonly maxConcurrent: 20;
|
|
18
|
+
readonly maxConcurrentPerHost: 3;
|
|
19
|
+
};
|
|
20
|
+
/** Highest limits: 4 requests/second, 30 concurrent, 5 per host, and a shorter maxBackoff (15000). */
readonly aggressive: {
|
|
21
|
+
readonly requestsPerSecond: 4;
|
|
22
|
+
readonly maxBackoff: 15000;
|
|
23
|
+
readonly maxConcurrent: 30;
|
|
24
|
+
readonly maxConcurrentPerHost: 5;
|
|
25
|
+
};
|
|
26
|
+
};
|
|
27
|
+
/** Name of a built-in preset: one of the keys of `RATE_LIMITER_PRESETS` (`conservative`, `moderate`, `aggressive`). */
export type RateLimiterPreset = keyof typeof RATE_LIMITER_PRESETS;
|
|
28
|
+
/**
 * Rate limiter for scraping requests, configured through `RateLimiterConfig`.
 *
 * Only the type surface is declared here; the scheduling, retry and backoff
 * behaviour lives in the implementation file and is not visible from this
 * declaration.
 */
export declare class ScrapingRateLimiter {
|
|
29
|
+
private hosts;
|
|
30
|
+
private readonly baseDelay;
|
|
31
|
+
private readonly maxBackoff;
|
|
32
|
+
private readonly maxConcurrent;
|
|
33
|
+
private readonly maxConcurrentPerHost;
|
|
34
|
+
private activeRequests;
|
|
35
|
+
/** Create a limiter; `options` may be omitted entirely. */
constructor(options?: RateLimiterConfig);
|
|
36
|
+
/** Create a limiter from a preset name (a key of `RATE_LIMITER_PRESETS`). */
static fromPreset(preset: RateLimiterPreset): ScrapingRateLimiter;
|
|
37
|
+
/**
 * Run `operation` for `url` and resolve with the value it produces.
 * NOTE(review): presumably the call is queued per host and retried up to
 * `options.maxRetries`, with `options.priority` ordering the queue —
 * confirm against the implementation.
 */
execute<T>(url: string, operation: () => Promise<T>, options?: {
|
|
38
|
+
priority?: number;
|
|
39
|
+
maxRetries?: number;
|
|
40
|
+
}): Promise<T>;
|
|
41
|
+
private extractHost;
|
|
42
|
+
private enqueueRequest;
|
|
43
|
+
private processQueue;
|
|
44
|
+
private handleRequestError;
|
|
45
|
+
private shouldRetry;
|
|
46
|
+
private shouldBackoff;
|
|
47
|
+
private wait;
|
|
48
|
+
/** Return a string-keyed statistics object; the keys and value shapes are not declared here. */
getStats(): Record<string, any>;
|
|
49
|
+
}
|
|
50
|
+
/** Pre-constructed `ScrapingRateLimiter` instance exported by the module. NOTE(review): its configuration is not visible in this declaration — confirm in the implementation. */
export declare const globalRateLimiter: ScrapingRateLimiter;
|
|
51
|
+
/**
 * Build a `ScrapingRateLimiter` from either an explicit `RateLimiterConfig`
 * object or the name of a built-in preset.
 *
 * @param config - Configuration object, or a preset name (`RateLimiterPreset`).
 * @returns A new limiter instance.
 */
export declare function createRateLimiter(config: RateLimiterConfig | RateLimiterPreset): ScrapingRateLimiter;
|
|
52
|
+
//# sourceMappingURL=scraping-rate-limiter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scraping-rate-limiter.d.ts","sourceRoot":"","sources":["../../lib/scraping-rate-limiter.ts"],"names":[],"mappings":"AAmBA,MAAM,WAAW,iBAAiB;IAChC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC/B;AAGD,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;CAsBvB,CAAC;AAEX,MAAM,MAAM,iBAAiB,GAAG,MAAM,OAAO,oBAAoB,CAAC;AAElE,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,KAAK,CAAgC;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAS;IACvC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;IAC9C,OAAO,CAAC,cAAc,CAAqB;gBAE/B,OAAO,GAAE,iBAAsB;IAO3C,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE,iBAAiB,GAAG,mBAAmB;IAI3D,OAAO,CAAC,CAAC,EACb,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EAC3B,OAAO,GAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,UAAU,CAAC,EAAE,MAAM,CAAC;KAChB,GACL,OAAO,CAAC,CAAC,CAAC;IAqBb,OAAO,CAAC,WAAW;IASnB,OAAO,CAAC,cAAc;YAiCR,YAAY;YAiEZ,kBAAkB;IAuChC,OAAO,CAAC,WAAW;IAyBnB,OAAO,CAAC,aAAa;IAerB,OAAO,CAAC,IAAI;IAKZ,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC;CAmBhC;AAGD,eAAO,MAAM,iBAAiB,qBAA6C,CAAC;AAG5E,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,iBAAiB,GAAG,iBAAiB,GAAG,mBAAmB,CAKpG"}
|