@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,376 @@
1
+ "use strict";
2
+ /**
3
+ * @package @tyroneross/blog-scraper
4
+ * Article quality scoring system
5
+ *
6
+ * No LLM required - uses metadata and content signals to determine article quality
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.DEFAULT_ALLOW_PATHS = exports.DEFAULT_DENY_PATHS = exports.DEFAULT_QUALITY_CONFIG = void 0;
10
+ exports.validateContent = validateContent;
11
+ exports.calculateArticleQualityScore = calculateArticleQualityScore;
12
+ exports.isNonEnglishLocalePath = isNonEnglishLocalePath;
13
+ exports.shouldDenyUrl = shouldDenyUrl;
14
+ exports.getQualityBreakdown = getQualityBreakdown;
15
+ /**
16
+ * Default quality score configuration
17
+ * These weights were optimized through testing with 1,788 real articles
18
+ */
19
+ exports.DEFAULT_QUALITY_CONFIG = {
20
+ contentWeight: 0.60, // Content validation (length, quality, ratio)
21
+ dateWeight: 0.12, // Publication date presence
22
+ authorWeight: 0.08, // Author/byline presence
23
+ schemaWeight: 0.08, // Schema.org metadata
24
+ readingTimeWeight: 0.12, // Substantial reading time (2+ min)
25
+ threshold: 0.50, // Minimum score to pass (50%)
26
+ };
27
+ /**
28
+ * Default patterns to block non-article pages
29
+ * These cover common non-article paths across websites
30
+ */
31
+ exports.DEFAULT_DENY_PATHS = [
32
+ '/',
33
+ '/index',
34
+ '/index.html',
35
+ '/about',
36
+ '/about/*',
37
+ '/careers',
38
+ '/careers/*',
39
+ '/jobs',
40
+ '/jobs/*',
41
+ '/contact',
42
+ '/contact/*',
43
+ '/team',
44
+ '/team/*',
45
+ '/privacy',
46
+ '/terms',
47
+ '/legal/*',
48
+ '/tag/*',
49
+ '/tags/*',
50
+ '/category/*',
51
+ '/categories/*',
52
+ '/author/*',
53
+ '/authors/*',
54
+ '/archive/*',
55
+ '/search',
56
+ '/search/*',
57
+ // Non-article content pages
58
+ '/use-cases',
59
+ '/use-cases/*',
60
+ '/solutions',
61
+ '/solutions/*',
62
+ '/products',
63
+ '/products/*',
64
+ '/services',
65
+ '/services/*',
66
+ '/partners',
67
+ '/partners/*',
68
+ '/support',
69
+ '/support/*',
70
+ '/help',
71
+ '/help/*',
72
+ '/faq',
73
+ '/faq/*',
74
+ '/pricing',
75
+ '/pricing/*',
76
+ '/features',
77
+ '/features/*',
78
+ '/demo',
79
+ '/demo/*',
80
+ '/login',
81
+ '/signup',
82
+ '/register',
83
+ '/account',
84
+ '/account/*',
85
+ '/dashboard',
86
+ '/dashboard/*',
87
+ '/settings',
88
+ '/settings/*',
89
+ '/trust-center',
90
+ '/trust-center/*',
91
+ '/ai-trust-center',
92
+ '/ai-trust-center/*',
93
+ '/safety',
94
+ '/safety/*',
95
+ '/compliance',
96
+ '/compliance/*',
97
+ '/certification',
98
+ '/certification/*',
99
+ '/industries',
100
+ '/industries/*',
101
+ '/platform',
102
+ '/platform/*',
103
+ '/developers',
104
+ '/developers/*',
105
+ '/documentation',
106
+ '/documentation/*',
107
+ '/docs',
108
+ '/docs/*',
109
+ '/api',
110
+ '/api/*',
111
+ '/download',
112
+ '/download/*',
113
+ '/downloads',
114
+ '/downloads/*',
115
+ // Non-English language paths (filter to English only)
116
+ '/cs-cz/*', // Czech
117
+ '/de-de/*', // German
118
+ '/de-at/*', // German (Austria)
119
+ '/de-ch/*', // German (Swiss)
120
+ '/fr-fr/*', // French
121
+ '/fr-ca/*', // French (Canada)
122
+ '/es-es/*', // Spanish
123
+ '/es-mx/*', // Spanish (Mexico)
124
+ '/es-la/*', // Spanish (Latin America)
125
+ '/it-it/*', // Italian
126
+ '/ja-jp/*', // Japanese
127
+ '/ko-kr/*', // Korean
128
+ '/zh-cn/*', // Chinese (Simplified)
129
+ '/zh-tw/*', // Chinese (Traditional)
130
+ '/zh-hk/*', // Chinese (Hong Kong)
131
+ '/pt-br/*', // Portuguese (Brazil)
132
+ '/pt-pt/*', // Portuguese
133
+ '/ru-ru/*', // Russian
134
+ '/pl-pl/*', // Polish
135
+ '/nl-nl/*', // Dutch
136
+ '/sv-se/*', // Swedish
137
+ '/nb-no/*', // Norwegian
138
+ '/da-dk/*', // Danish
139
+ '/fi-fi/*', // Finnish
140
+ '/tr-tr/*', // Turkish
141
+ '/ar-ae/*', // Arabic
142
+ '/he-il/*', // Hebrew
143
+ '/th-th/*', // Thai
144
+ '/vi-vn/*', // Vietnamese
145
+ '/id-id/*', // Indonesian
146
+ // Short language codes
147
+ '/de/*',
148
+ '/fr/*',
149
+ '/es/*',
150
+ '/it/*',
151
+ '/ja/*',
152
+ '/ko/*',
153
+ '/zh/*',
154
+ '/pt/*',
155
+ '/ru/*',
156
+ '/pl/*',
157
+ '/nl/*',
158
+ ];
159
+ /**
160
+ * Default patterns for content sections (blog, news, articles)
161
+ * Used for allow-listing paths when scraping
162
+ */
163
+ exports.DEFAULT_ALLOW_PATHS = [
164
+ '/news/*',
165
+ '/blog/*',
166
+ '/articles/*',
167
+ '/posts/*',
168
+ '/stories/*',
169
+ '/press/*',
170
+ '/updates/*',
171
+ '/announcements/*',
172
+ '/insights/*',
173
+ '/resources/*',
174
+ '/publications/*',
175
+ '/research/*',
176
+ '/engineering/*',
177
+ ];
178
/**
 * Tier 2 filter: sanity-check the content extracted from a page.
 *
 * Scoring starts at 1.0 and a penalty is subtracted for every failed check:
 *  - body text shorter than 200 characters        -> -0.5
 *  - title shorter than 10 or longer than 200     -> -0.2
 *  - text is less than 10% of the HTML length     -> -0.2
 *    (only evaluated when both `content` and `textContent` are non-empty)
 *
 * @param extracted - Extracted article; reads `textContent`, `title` and `content`
 * @returns `isValid` (score of at least 0.5), the score clamped to 0-1, and
 *          the list of human-readable reasons for every penalty applied
 */
function validateContent(extracted) {
    const problems = [];
    let remaining = 1.0;
    // Record a failed check and subtract its penalty from the running score.
    const penalize = (message, amount) => {
        problems.push(message);
        remaining -= amount;
    };
    // Missing text or title counts as zero characters.
    const bodyChars = extracted.textContent?.length || 0;
    const titleChars = extracted.title?.length || 0;
    if (bodyChars < 200) {
        // Heaviest penalty: a short body alone leaves the score at exactly 0.5.
        penalize('Content too short (< 200 characters)', 0.5);
    }
    const titleInRange = titleChars >= 10 && titleChars <= 200;
    if (!titleInRange) {
        penalize('Title length invalid (must be 10-200 characters)', 0.2);
    }
    if (extracted.content && extracted.textContent) {
        const textShare = extracted.textContent.length / extracted.content.length;
        if (textShare < 0.1) {
            penalize('Low text-to-HTML ratio (< 10%)', 0.2);
        }
    }
    // Validity is decided on the raw score; the reported score is clamped to 0-1.
    const clamped = Math.max(0, Math.min(1.0, remaining));
    return {
        isValid: remaining >= 0.5,
        score: clamped,
        reasons: problems,
    };
}
218
/**
 * Calculate article quality score (Tier 3 filtering).
 *
 * Score breakdown (default weights):
 * - Content validation (60%): result of validateContent()
 * - Publication date (12%): `publishedTime` is present
 * - Author/byline (8%): `byline` is present
 * - Schema.org metadata (8%): JSON-LD declares an article-like `@type`
 * - Reading time (12%): `readingTime` is at least 2 (minutes)
 *
 * JSON-LD handling accepts a single object or an array of objects, tolerates
 * null / non-object entries, and recognises `@type` given either as a string
 * or as an array of strings (both forms are valid JSON-LD).
 *
 * @param extracted - Extracted content from article
 * @param config - Optional overrides merged over DEFAULT_QUALITY_CONFIG
 * @returns Quality score, capped at 1.0
 */
function calculateArticleQualityScore(extracted, config = {}) {
    const finalConfig = { ...exports.DEFAULT_QUALITY_CONFIG, ...config };
    // schema.org types that mark a page as an article
    const ARTICLE_TYPES = ['Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle'];
    // True when one JSON-LD node declares an article-like @type.
    const declaresArticleType = (node) => {
        // Guard: arrays scraped from pages can contain null or primitive entries.
        if (node === null || typeof node !== 'object') {
            return false;
        }
        const declared = node['@type'];
        const types = Array.isArray(declared) ? declared : [declared];
        return types.some((type) => ARTICLE_TYPES.includes(type));
    };
    // Tier 2: content validation (60% weight by default)
    let score = validateContent(extracted).score * finalConfig.contentWeight;
    // Tier 3: article metadata signals
    // Has publication date (12% weight by default)
    if (extracted.publishedTime) {
        score += finalConfig.dateWeight;
    }
    // Has author/byline (8% weight by default)
    if (extracted.byline) {
        score += finalConfig.authorWeight;
    }
    // Has article schema.org metadata (8% weight by default)
    const jsonLd = extracted.structured?.jsonLd;
    if (jsonLd) {
        const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
        if (nodes.some(declaresArticleType)) {
            score += finalConfig.schemaWeight;
        }
    }
    // Substantial reading time (12% weight by default):
    // articles should take at least 2 minutes to read.
    if (extracted.readingTime && extracted.readingTime >= 2) {
        score += finalConfig.readingTimeWeight;
    }
    return Math.min(score, 1.0); // Cap at 1.0
}
271
/**
 * Matches a leading locale segment in xx-yy or xx_yy form
 * (e.g. /en-us/, /fr-be), case-insensitively.
 */
const LOCALE_PATH_REGEX = /^\/[a-z]{2}[-_][a-z]{2}(?:\/|$)/i;
/**
 * Matches a leading US English locale segment (/en-us or /en_us),
 * case-insensitively.
 */
const US_ENGLISH_LOCALE_REGEX = /^\/en[-_]us(?:\/|$)/i;
/**
 * Check if a path should be filtered out because it starts with a locale
 * prefix other than US English.
 *
 * Returns true (should filter) for:
 * - /fr-be/, /de-de/, /ja-jp/, /zh-cn/ (non-English locales)
 * - /en-gb/, /en-au/, /en-ca/ (non-US English locales)
 *
 * Returns false (should keep) for:
 * - /en-us/ (US English only)
 * - /blog/, /news/, /articles/ (no locale prefix - treated as US English)
 *
 * @param path - URL pathname, starting with '/'
 * @returns True when the path has a locale prefix that is not en-us
 */
function isNonEnglishLocalePath(path) {
    // If path has a locale prefix (xx-yy format), only /en-us/ is allowed
    if (LOCALE_PATH_REGEX.test(path)) {
        return !US_ENGLISH_LOCALE_REGEX.test(path);
    }
    // No locale prefix - allow (assume US English)
    return false;
}
/**
 * Check if a URL should be denied based on path patterns.
 *
 * A URL is denied when its pathname
 *  1. starts with a non-US-English locale prefix, or
 *  2. equals a pattern exactly, or
 *  3. falls inside a wildcard pattern's subtree: '/about/*' matches '/about'
 *     and '/about/anything', but not '/about-us' or '/aboutness'.
 *
 * Wildcards are matched on a path-segment boundary. A plain prefix test would
 * make '/de/*' deny '/demo-day' and '/it/*' deny '/items', silently dropping
 * legitimate articles.
 *
 * @param url - Absolute URL to check
 * @param denyPaths - Patterns to deny (exact paths or '/x/*' wildcards)
 * @returns True if URL should be denied; false for unparsable URLs
 */
function shouldDenyUrl(url, denyPaths = exports.DEFAULT_DENY_PATHS) {
    try {
        const path = new URL(url).pathname;
        // First check for non-English locale patterns (e.g., /fr-be/, /de-ch/)
        if (isNonEnglishLocalePath(path)) {
            return true;
        }
        return denyPaths.some((pattern) => {
            // Exact match
            if (pattern === path) {
                return true;
            }
            // Wildcard match (e.g., /about/*)
            if (pattern.endsWith('/*')) {
                const prefix = pattern.slice(0, -2); // Remove /*
                // Match the section root itself or anything below it, never a
                // sibling path that merely shares leading characters.
                return path === prefix || path.startsWith(`${prefix}/`);
            }
            return false;
        });
    }
    catch {
        return false; // Invalid URL (or malformed pattern list), don't deny
    }
}
330
/**
 * Get quality score breakdown for debugging.
 * Reports each component that feeds the article quality score, so it is
 * possible to see why an article scored the way it did.
 *
 * JSON-LD handling accepts a single object or an array of objects, tolerates
 * null / non-object entries, and recognises `@type` given either as a string
 * or as an array of strings (both forms are valid JSON-LD).
 *
 * @param extracted - Extracted content from article
 * @param config - Optional overrides merged over DEFAULT_QUALITY_CONFIG
 * @returns Per-component contributions, their sum (`total`, not capped) and
 *          whether the sum reaches the configured threshold
 */
function getQualityBreakdown(extracted, config = {}) {
    const finalConfig = { ...exports.DEFAULT_QUALITY_CONFIG, ...config };
    const validation = validateContent(extracted);
    // schema.org types that mark a page as an article
    const ARTICLE_TYPES = ['Article', 'NewsArticle', 'BlogPosting', 'TechArticle', 'ScholarlyArticle'];
    // True when one JSON-LD node declares an article-like @type.
    const declaresArticleType = (node) => {
        // Guard: arrays scraped from pages can contain null or primitive entries.
        if (node === null || typeof node !== 'object') {
            return false;
        }
        const declared = node['@type'];
        const types = Array.isArray(declared) ? declared : [declared];
        return types.some((type) => ARTICLE_TYPES.includes(type));
    };
    const breakdown = {
        contentValidation: validation.score * finalConfig.contentWeight,
        publishedDate: extracted.publishedTime ? finalConfig.dateWeight : 0,
        author: extracted.byline ? finalConfig.authorWeight : 0,
        schema: 0,
        readingTime: extracted.readingTime && extracted.readingTime >= 2 ? finalConfig.readingTimeWeight : 0,
        total: 0,
        passesThreshold: false,
    };
    // Check schema
    const jsonLd = extracted.structured?.jsonLd;
    if (jsonLd) {
        const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
        if (nodes.some(declaresArticleType)) {
            breakdown.schema = finalConfig.schemaWeight;
        }
    }
    // Summed in a fixed order so the total is reproducible.
    breakdown.total =
        breakdown.contentValidation +
            breakdown.publishedDate +
            breakdown.author +
            breakdown.schema +
            breakdown.readingTime;
    breakdown.passesThreshold = breakdown.total >= finalConfig.threshold;
    return breakdown;
}
376
+ //# sourceMappingURL=quality-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"quality-scorer.js","sourceRoot":"","sources":["../../lib/quality-scorer.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAiLH,0CAsCC;AAgBD,oEAoDC;AAuBD,wDAQC;AASD,sCAyBC;AAUD,kDAyDC;AA3ZD;;;GAGG;AACU,QAAA,sBAAsB,GAAiC;IAClE,aAAa,EAAE,IAAI,EAAO,8CAA8C;IACxE,UAAU,EAAE,IAAI,EAAW,4BAA4B;IACvD,YAAY,EAAE,IAAI,EAAS,yBAAyB;IACpD,YAAY,EAAE,IAAI,EAAS,sBAAsB;IACjD,iBAAiB,EAAE,IAAI,EAAI,oCAAoC;IAC/D,SAAS,EAAE,IAAI,EAAY,8BAA8B;CAC1D,CAAC;AAEF;;;GAGG;AACU,QAAA,kBAAkB,GAAG;IAChC,GAAG;IACH,QAAQ;IACR,aAAa;IACb,QAAQ;IACR,UAAU;IACV,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,UAAU;IACV,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,SAAS;IACT,aAAa;IACb,eAAe;IACf,WAAW;IACX,YAAY;IACZ,YAAY;IACZ,SAAS;IACT,WAAW;IACX,4BAA4B;IAC5B,YAAY;IACZ,cAAc;IACd,YAAY;IACZ,cAAc;IACd,WAAW;IACX,aAAa;IACb,WAAW;IACX,aAAa;IACb,WAAW;IACX,aAAa;IACb,UAAU;IACV,YAAY;IACZ,OAAO;IACP,SAAS;IACT,MAAM;IACN,QAAQ;IACR,UAAU;IACV,YAAY;IACZ,WAAW;IACX,aAAa;IACb,OAAO;IACP,SAAS;IACT,QAAQ;IACR,SAAS;IACT,WAAW;IACX,UAAU;IACV,YAAY;IACZ,YAAY;IACZ,cAAc;IACd,WAAW;IACX,aAAa;IACb,eAAe;IACf,iBAAiB;IACjB,kBAAkB;IAClB,oBAAoB;IACpB,SAAS;IACT,WAAW;IACX,aAAa;IACb,eAAe;IACf,gBAAgB;IAChB,kBAAkB;IAClB,aAAa;IACb,eAAe;IACf,WAAW;IACX,aAAa;IACb,aAAa;IACb,eAAe;IACf,gBAAgB;IAChB,kBAAkB;IAClB,OAAO;IACP,SAAS;IACT,MAAM;IACN,QAAQ;IACR,WAAW;IACX,aAAa;IACb,YAAY;IACZ,cAAc;IACd,sDAAsD;IACtD,UAAU,EAAG,QAAQ;IACrB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,mBAAmB;IAChC,UAAU,EAAG,iBAAiB;IAC9B,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,kBAAkB;IAC/B,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,mBAAmB;IAChC,UAAU,EAAG,0BAA0B;IACvC,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,WAAW;IACxB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,uBAAuB;IACpC,UAAU,EAAG,wBAAwB;IACrC,UAAU,EAAG,sBAAsB;IACnC,UAAU,EAAG,sBAAsB;IACnC,UAAU,EAAG,aAAa;IAC1B,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,QAAQ;IACrB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,YAAY;IACzB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,UAAU;IACvB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,SAAS;IACtB,UAAU,EAAG,OAAO;IACpB,UAAU,EAAG,aAAa;IAC1B,UAAU,EAAG,aAAa;IA
C1B,uBAAuB;IACvB,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;CACR,CAAC;AAEF;;;GAGG;AACU,QAAA,mBAAmB,GAAG;IACjC,SAAS;IACT,SAAS;IACT,aAAa;IACb,UAAU;IACV,YAAY;IACZ,UAAU;IACV,YAAY;IACZ,kBAAkB;IAClB,aAAa;IACb,cAAc;IACd,iBAAiB;IACjB,aAAa;IACb,gBAAgB;CACjB,CAAC;AAEF;;;;;;GAMG;AACH,SAAgB,eAAe,CAAC,SAA2B;IACzD,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,KAAK,GAAG,GAAG,CAAC,CAAC,8CAA8C;IAE/D,gDAAgD;IAChD,MAAM,aAAa,GAAG,SAAS,CAAC,WAAW,EAAE,MAAM,IAAI,CAAC,CAAC;IACzD,IAAI,aAAa,GAAG,GAAG,EAAE,CAAC;QACxB,OAAO,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;QACrD,KAAK,IAAI,GAAG,CAAC,CAAC,kCAAkC;IAClD,CAAC;IAED,0CAA0C;IAC1C,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,MAAM,IAAI,CAAC,CAAC;IACjD,IAAI,WAAW,GAAG,EAAE,IAAI,WAAW,GAAG,GAAG,EAAE,CAAC;QAC1C,OAAO,CAAC,IAAI,CAAC,kDAAkD,CAAC,CAAC;QACjE,KAAK,IAAI,GAAG,CAAC;IACf,CAAC;IAED,yDAAyD;IACzD,IAAI,SAAS,CAAC,OAAO,IAAI,SAAS,CAAC,WAAW,EAAE,CAAC;QAC/C,MAAM,UAAU,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC;QAC5C,MAAM,UAAU,GAAG,SAAS,CAAC,WAAW,CAAC,MAAM,CAAC;QAChD,MAAM,KAAK,GAAG,UAAU,GAAG,UAAU,CAAC;QAEtC,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;YAChB,OAAO,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;YAC/C,KAAK,IAAI,GAAG,CAAC;QACf,CAAC;IACH,CAAC;IAED,yDAAyD;IACzD,MAAM,OAAO,GAAG,KAAK,IAAI,GAAG,CAAC;IAE7B,OAAO;QACL,OAAO;QACP,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,EAAE,oBAAoB;QAC9D,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,SAAgB,4BAA4B,CAC1C,SAA2B,EAC3B,SAA6B,EAAE;IAE/B,MAAM,WAAW,GAAG,EAAE,GAAG,8BAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IAC7D,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,qDAAqD;IACrD,MAAM,UAAU,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAC9C,KAAK,IAAI,UAAU,CAAC,KAAK,GAAG,WAAW,CAAC,aAAa,CAAC;IAEtD,mCAAmC;IAEnC,+CAA+C;IAC/C,IAAI,SAAS,CAAC,aAAa,EAAE,CAAC;QAC5B,KAAK,IAAI,WAAW,CAAC,UAAU,CAAC;IAClC,CAAC;IAED,2CAA2C;IAC3C,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;QACrB,KAAK,IAAI,WAAW,CAAC,YAAY,CAAC;IACpC,CAAC;IAED,yDAAyD;IACzD,IAAI,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACjC,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YACxD
,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM;YAC7B,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAElC,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YACxB,OAAO,CACL,IAAI,KAAK,SAAS;gBAClB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,kBAAkB,CAC5B,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,IAAI,cAAc,EAAE,CAAC;YACnB,KAAK,IAAI,WAAW,CAAC,YAAY,CAAC;QACpC,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,gDAAgD;IAChD,IAAI,SAAS,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC;QACxD,KAAK,IAAI,WAAW,CAAC,iBAAiB,CAAC;IACzC,CAAC;IAED,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC,aAAa;AAC5C,CAAC;AAED;;GAEG;AACH,MAAM,iBAAiB,GAAG,kCAAkC,CAAC;AAE7D;;GAEG;AACH,MAAM,uBAAuB,GAAG,sBAAsB,CAAC;AAEvD;;;;;;;;;;GAUG;AACH,SAAgB,sBAAsB,CAAC,IAAY;IACjD,6CAA6C;IAC7C,IAAI,iBAAiB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QACjC,qBAAqB;QACrB,OAAO,CAAC,uBAAuB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7C,CAAC;IACD,+CAA+C;IAC/C,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,aAAa,CAAC,GAAW,EAAE,YAAsB,0BAAkB;IACjF,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC;QAE7B,uEAAuE;QACvE,IAAI,sBAAsB,CAAC,IAAI,CAAC,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,SAAS,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE;YAChC,cAAc;YACd,IAAI,OAAO,KAAK,IAAI;gBAAE,OAAO,IAAI,CAAC;YAElC,kCAAkC;YAClC,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,YAAY;gBACjD,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;YACjC,CAAC;YAED,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC,CAAC,0BAA0B;IAC1C,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,SAAgB,mBAAmB,CACjC,SAA2B,EAC3B,SAA6B,EAAE;IAU/B,MAAM,WAAW,GAAG,EAAE,GAAG,8BAAsB,EAAE,GAAG,MAAM,EAAE,CAAC;IAC7D,MAAM,UAAU,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;IAE9C,MAAM,SAAS,GAAG;QAChB,iBAAiB,EAAE,UAAU,CAAC,KAAK,GAAG,WAAW,CAAC,aAAa;QAC/D,aAAa,EAAE,SAAS,CAAC,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACnE,MAAM,EAAE,S
AAS,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACvD,MAAM,EAAE,CAAC;QACT,WAAW,EAAE,SAAS,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC;QACpG,KAAK,EAAE,CAAC;QACR,eAAe,EAAE,KAAK;KACvB,CAAC;IAEF,eAAe;IACf,IAAI,SAAS,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;QACjC,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YACxD,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM;YAC7B,CAAC,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAElC,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE;YAC7C,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;YACxB,OAAO,CACL,IAAI,KAAK,SAAS;gBAClB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,aAAa;gBACtB,IAAI,KAAK,kBAAkB,CAC5B,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,IAAI,cAAc,EAAE,CAAC;YACnB,SAAS,CAAC,MAAM,GAAG,WAAW,CAAC,YAAY,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,SAAS,CAAC,KAAK;QACb,SAAS,CAAC,iBAAiB;YAC3B,SAAS,CAAC,aAAa;YACvB,SAAS,CAAC,MAAM;YAChB,SAAS,CAAC,MAAM;YAChB,SAAS,CAAC,WAAW,CAAC;IAExB,SAAS,CAAC,eAAe,GAAG,SAAS,CAAC,KAAK,IAAI,WAAW,CAAC,SAAS,CAAC;IAErE,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,31 @@
1
+ export interface RSSItem { // Plain feed-item record; fetchRSSFeed resolves to an array of these
2
+ title: string; // Item title; fetchRSSFeed substitutes 'Untitled' when the feed omits it
3
+ link: string; // Item URL; fetchRSSFeed uses '' when the feed omits it
4
+ pubDate: string; // Publication date string from the feed; fetchRSSFeed falls back to the current time in ISO 8601 form
5
+ guid: string; // Feed-provided id; fetchRSSFeed falls back to the link, then to a random UUID
6
+ content?: string; // Item body; fetchRSSFeed reads `content`, then `content:encoded`, else ''
7
+ contentSnippet?: string; // Snippet field copied from the parsed item ('' when absent)
8
+ }
9
+ /**
10
+ * Creates a content-based hash for article deduplication.
11
+ * Hashes normalized title + publication day + source instead of the URL, so the result is unaffected by:
12
+ * - URL tracking parameters (utm_source, etc.)
13
+ * - URL redirects (HTTP vs HTTPS, www vs non-www)
14
+ * - Cross-posting, but only when callers pass the same `source` for every copy (source is part of the hash)
15
+ *
16
+ * @param title - Article title (trimmed, lowercased, whitespace collapsed, punctuation removed before hashing)
17
+ * @param link - Article URL; not read by the implementation (legacy, kept for backward compatibility)
18
+ * @param publishedAt - Publication date (defaults to now); only its YYYY-MM-DD part from toISOString() is hashed
19
+ * @param source - Source name (defaults to 'unknown'); lowercased and trimmed before hashing
20
+ * @returns SHA-256 hash as hex string
21
+ */
22
+ export declare function createGuidHash(title: string, link: string, publishedAt?: Date, source?: string): string;
23
+ export declare function fetchRSSFeed(url: string, sourceId?: string): Promise<RSSItem[]>; // Parses the feed at `url`; resolves to [] for an empty feed or on any error (errors are logged, not rethrown). `sourceId` is accepted but not read by the implementation.
24
+ export declare function validateRSSFeed(url: string): Promise<{ // Preflights `url`, then parses it; failures are reported through `isValid`/`error` rather than thrown
25
+ isValid: boolean; // true only when the feed parsed, has a title, and contains at least one item
26
+ error?: string; // Human-readable reason; present whenever isValid is false
27
+ feedTitle?: string; // Parsed feed title; set on success and on the "no items" failure
28
+ itemCount?: number; // Number of parsed items; set only on success
29
+ contentType?: string; // Content-Type reported by the preflight response, when available; absent if the request itself failed
30
+ }>;
31
+ //# sourceMappingURL=rss-utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rss-utils.d.ts","sourceRoot":"","sources":["../../lib/rss-utils.ts"],"names":[],"mappings":"AAUA,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,cAAc,CAC5B,KAAK,EAAE,MAAM,EACb,IAAI,EAAE,MAAM,EACZ,WAAW,GAAE,IAAiB,EAC9B,MAAM,GAAE,MAAkB,GACzB,MAAM,CA0BR;AAED,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC,CA4CrF;AAGD,wBAAsB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;IAC1D,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,CAAC,CA2FD"}
@@ -0,0 +1,175 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.createGuidHash = createGuidHash;
7
+ exports.fetchRSSFeed = fetchRSSFeed;
8
+ exports.validateRSSFeed = validateRSSFeed;
9
+ const rss_parser_1 = __importDefault(require("rss-parser"));
10
+ const crypto_1 = __importDefault(require("crypto"));
11
+ const parser = new rss_parser_1.default({ // Module-wide rss-parser instance shared by fetchRSSFeed and validateRSSFeed
12
+ timeout: 15000, // Request timeout; presumably milliseconds (15 s) — confirm against rss-parser options
13
+ headers: { // Headers handed to rss-parser for its feed requests
14
+ 'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)' // Same UA string validateRSSFeed sends on its HEAD preflight
15
+ }
16
+ });
17
/**
 * Creates a content-based hash for article deduplication.
 *
 * The hash is SHA-256 over `normalizedTitle|YYYY-MM-DD|normalizedSource`, so
 * it is unaffected by anything that only changes the URL:
 * - URL tracking parameters (utm_source, etc.)
 * - URL redirects (HTTP vs HTTPS, www vs non-www)
 * Cross-posted copies collapse to one hash only when callers pass the same
 * `source` for each copy, because the source name is part of the key.
 *
 * @param title - Article title
 * @param link - Article URL; not used in the hash (legacy, kept for backward compatibility)
 * @param publishedAt - Publication date (defaults to now). A missing or
 *        invalid Date falls back to the current time instead of throwing.
 * @param source - Source name (defaults to 'unknown')
 * @returns SHA-256 hash as hex string
 */
function createGuidHash(title, link, publishedAt = new Date(), source = 'unknown') {
    // Normalize title: lowercase, collapse whitespace, remove punctuation.
    // NOTE(review): `\w` is ASCII-only, so non-Latin letters are stripped too
    // and such titles can collide. Changing the pattern (or the step order)
    // would change every stored hash, so it is left untouched here.
    const normalizedTitle = title
        .trim()
        .toLowerCase()
        .replace(/\s+/g, ' ')
        .replace(/[^\w\s]/g, '');
    // An Invalid Date (e.g. built from a malformed pubDate) would make
    // toISOString() throw a RangeError; use "now", matching the default.
    const hasUsableDate = publishedAt instanceof Date && !Number.isNaN(publishedAt.getTime());
    const effectiveDate = hasUsableDate ? publishedAt : new Date();
    // Date bucket (YYYY-MM-DD, UTC): same-day articles may be duplicates
    const dateKey = effectiveDate.toISOString().split('T')[0];
    // Normalize source name
    const normalizedSource = source.toLowerCase().trim();
    // Composite key: title | date | source
    const composite = `${normalizedTitle}|${dateKey}|${normalizedSource}`;
    const hash = crypto_1.default.createHash('sha256').update(composite).digest('hex');
    // Log for debugging (in development)
    if (process.env.NODE_ENV === 'development') {
        console.log(`🔍 [GuidHash] Generated hash for: "${title.substring(0, 50)}..." from ${source} on ${dateKey}`);
    }
    return hash;
}
51
/**
 * Fetch and parse the feed at `url` and return its items in RSSItem shape.
 *
 * This is deliberately best-effort: an empty feed or any fetch/parse error is
 * logged and results in an empty array rather than a rejection.
 *
 * @param url - Feed URL handed to the shared rss-parser instance
 * @param sourceId - Accepted for API compatibility; currently not used
 * @returns Normalized items, or [] when the feed is empty or could not be read
 */
async function fetchRSSFeed(url, sourceId) {
    // One timestamp per call, so every undated item from this fetch receives
    // the same fallback pubDate instead of a slightly different one each.
    const fallbackPubDate = new Date().toISOString();
    try {
        console.log(`🔄 [RSS] Fetching feed from ${url}`);
        const feed = await parser.parseURL(url);
        if (!feed.items || feed.items.length === 0) {
            console.warn(`⚠️ [RSS] Feed from ${url} contains no items`);
            return [];
        }
        const items = feed.items.map((item) => ({
            title: item.title || 'Untitled',
            link: item.link || '',
            pubDate: item.pubDate || fallbackPubDate,
            // Prefer the feed's own id, then the link, then a random id.
            guid: item.guid || item.link || crypto_1.default.randomUUID(),
            content: item.content || item['content:encoded'] || '',
            contentSnippet: item.contentSnippet || '',
        }));
        console.log(`✅ [RSS] Successfully fetched ${items.length} items from ${url}`);
        return items;
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : 'Unknown error';
        console.error(`❌ [RSS] Failed to fetch RSS from ${url}:`, errorMessage);
        // Log a hint for well-known failure types; only the first match is reported.
        if (error instanceof Error) {
            const hints = [
                ['Invalid character', `🔍 [RSS] XML parsing error - feed may be malformed or contain HTML`],
                ['timeout', `🔍 [RSS] Request timeout - server may be slow or unreachable`],
                ['ENOTFOUND', `🔍 [RSS] Domain not found - check URL spelling`],
                ['ECONNREFUSED', `🔍 [RSS] Connection refused - server may be down`],
            ];
            const matched = hints.find(([needle]) => error.message.includes(needle));
            if (matched) {
                console.error(matched[1]);
            }
        }
        return [];
    }
}
92
/**
 * Collect the messages and error codes of an error and its `cause` chain
 * into one string that can be searched for known failure markers.
 */
function collectRssErrorText(error) {
    const parts = [];
    let current = error;
    // Bounded walk so a cyclic cause chain cannot loop forever.
    for (let depth = 0; current instanceof Error && depth < 5; depth++) {
        parts.push(current.message);
        if (typeof current.code === 'string') {
            parts.push(current.code);
        }
        current = current.cause;
    }
    return parts.join(' ');
}
/**
 * Validate that `url` serves a usable RSS/Atom feed.
 *
 * Steps:
 *  1. HEAD preflight (10 s timeout) to check reachability and content type.
 *     Servers that reject HEAD itself (405/501) are not treated as invalid;
 *     the check moves on to the real fetch.
 *  2. Parse the feed with the shared rss-parser instance and require a title
 *     and at least one item.
 *
 * Never rejects: every failure is reported through `isValid: false` + `error`.
 *
 * @param url - Feed URL to validate
 * @returns Validation result with optional feed title, item count and content type
 */
async function validateRSSFeed(url) {
    try {
        console.log(`🔍 [Validation] Validating RSS feed: ${url}`);
        // First check if URL is reachable
        const response = await fetch(url, {
            method: 'HEAD',
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)'
            },
            signal: AbortSignal.timeout(10000)
        });
        // 405/501 mean the server does not implement HEAD, not that the feed
        // is broken; a GET may still succeed, so let the parser decide.
        const headUnsupported = response.status === 405 || response.status === 501;
        if (!response.ok && !headUnsupported) {
            return {
                isValid: false,
                error: `HTTP ${response.status}: ${response.statusText}`,
                contentType: response.headers.get('content-type') || undefined
            };
        }
        // The content type of a "HEAD not supported" error page says nothing
        // about the feed, so it is ignored in that case.
        const contentType = headUnsupported ? '' : (response.headers.get('content-type') || '');
        // Check if content type suggests RSS/XML
        const isRssContentType = contentType.includes('application/rss+xml') ||
            contentType.includes('application/xml') ||
            contentType.includes('text/xml') ||
            contentType.includes('application/atom+xml');
        if (!isRssContentType && contentType.includes('text/html')) {
            return {
                isValid: false,
                error: 'URL returns HTML content instead of RSS feed',
                contentType
            };
        }
        // Now fetch and parse the actual RSS content
        const feed = await parser.parseURL(url);
        if (!feed.title) {
            return {
                isValid: false,
                error: 'RSS feed has no title - may be malformed',
                contentType
            };
        }
        if (!feed.items || feed.items.length === 0) {
            return {
                isValid: false,
                error: 'RSS feed contains no items',
                contentType,
                feedTitle: feed.title
            };
        }
        console.log(`✅ [Validation] RSS feed validated successfully: ${feed.title} (${feed.items.length} items)`);
        return {
            isValid: true,
            feedTitle: feed.title,
            itemCount: feed.items.length,
            contentType
        };
    }
    catch (error) {
        let errorMessage = 'Unknown validation error';
        if (error instanceof Error) {
            // fetch() reports network failures as a generic "fetch failed"
            // error whose real reason (ENOTFOUND, ECONNREFUSED, ...) sits on
            // `error.cause`, so the whole chain is inspected.
            const detail = collectRssErrorText(error);
            // AbortSignal.timeout() aborts with a TimeoutError; AbortError is
            // kept for manually aborted requests.
            const timedOut = error.name === 'TimeoutError' ||
                error.name === 'AbortError' ||
                detail.toLowerCase().includes('timeout');
            if (timedOut) {
                errorMessage = 'Request timeout - URL may be unreachable';
            }
            else if (detail.includes('Invalid character')) {
                errorMessage = 'Invalid RSS/XML format - feed may be malformed';
            }
            else if (detail.includes('ENOTFOUND')) {
                errorMessage = 'Domain not found - check URL spelling';
            }
            else if (detail.includes('ECONNREFUSED')) {
                errorMessage = 'Connection refused - server may be down';
            }
            else {
                errorMessage = error.message;
            }
        }
        console.error(`❌ [Validation] RSS validation failed for ${url}:`, errorMessage);
        return {
            isValid: false,
            error: errorMessage
        };
    }
}
175
+ //# sourceMappingURL=rss-utils.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rss-utils.js","sourceRoot":"","sources":["../../lib/rss-utils.ts"],"names":[],"mappings":";;;;;AAgCA,wCA+BC;AAED,oCA4CC;AAGD,0CAiGC;AAjND,4DAAgC;AAChC,oDAA4B;AAE5B,MAAM,MAAM,GAAG,IAAI,oBAAM,CAAC;IACxB,OAAO,EAAE,KAAK,EAAE,oBAAoB;IACpC,OAAO,EAAE;QACP,YAAY,EAAE,6EAA6E;KAC5F;CACF,CAAC,CAAC;AAWH;;;;;;;;;;;;GAYG;AACH,SAAgB,cAAc,CAC5B,KAAa,EACb,IAAY,EACZ,cAAoB,IAAI,IAAI,EAAE,EAC9B,SAAiB,SAAS;IAE1B,sEAAsE;IACtE,MAAM,eAAe,GAAG,KAAK;SAC1B,IAAI,EAAE;SACN,WAAW,EAAE;SACb,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,yCAAyC;IAErE,2EAA2E;IAC3E,MAAM,OAAO,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAExD,wBAAwB;IACxB,MAAM,gBAAgB,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;IAErD,8CAA8C;IAC9C,MAAM,SAAS,GAAG,GAAG,eAAe,IAAI,OAAO,IAAI,gBAAgB,EAAE,CAAC;IAEtE,wBAAwB;IACxB,MAAM,IAAI,GAAG,gBAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAEzE,qCAAqC;IACrC,IAAI,OAAO,CAAC,GAAG,CAAC,QAAQ,KAAK,aAAa,EAAE,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,sCAAsC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,MAAM,OAAO,OAAO,EAAE,CAAC,CAAC;IAC/G,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAEM,KAAK,UAAU,YAAY,CAAC,GAAW,EAAE,QAAiB;IAC/D,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;IAEvB,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;QAElD,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QAExC,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO,CAAC,IAAI,CAAC,sBAAsB,GAAG,oBAAoB,CAAC,CAAC;YAC5D,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACpC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,UAAU;YAC/B,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACjD,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,IAAI,gBAAM,CAAC,UAAU,EAAE;YACnD,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE;YACtD,cAAc,EAAE,IAAI,CAAC,cAAc,IAAI,EAAE;SAC1C,CAAC,CAAC,CAAC;QAEJ,OAAO,CAAC,GAAG,CAAC,gCAAgC,KA
AK,CAAC,MAAM,eAAe,GAAG,EAAE,CAAC,CAAC;QAC9E,OAAO,KAAK,CAAC;IAEf,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QAC9E,OAAO,CAAC,KAAK,CAAC,oCAAoC,GAAG,GAAG,EAAE,YAAY,CAAC,CAAC;QAExE,yCAAyC;QACzC,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;gBAChD,OAAO,CAAC,KAAK,CAAC,oEAAoE,CAAC,CAAC;YACtF,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC7C,OAAO,CAAC,KAAK,CAAC,8DAA8D,CAAC,CAAC;YAChF,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAC/C,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;gBAClD,OAAO,CAAC,KAAK,CAAC,kDAAkD,CAAC,CAAC;YACpE,CAAC;QACH,CAAC;QAED,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,qDAAqD;AAC9C,KAAK,UAAU,eAAe,CAAC,GAAW;IAO/C,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,wCAAwC,GAAG,EAAE,CAAC,CAAC;QAE3D,kCAAkC;QAClC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,YAAY,EAAE,6EAA6E;aAC5F;YACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;SACnC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE;gBACxD,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,SAAS;aAC/D,CAAC;QACJ,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAE/D,yCAAyC;QACzC,MAAM,gBAAgB,GACpB,WAAW,CAAC,QAAQ,CAAC,qBAAqB,CAAC;YAC3C,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC;YACvC,WAAW,CAAC,QAAQ,CAAC,UAAU,CAAC;YAChC,WAAW,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC;QAE/C,IAAI,CAAC,gBAAgB,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;YAC3D,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,8CAA8C;gBACrD,WAAW;aACZ,CAAC;QACJ,CAAC;QAED,6CAA6C;QAC7C,MAAM,IAAI,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QAExC,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;YAChB,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,0CAA0C;gBACjD,WAAW;aACZ,CAAC;QACJ,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC
,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,4BAA4B;gBACnC,WAAW;gBACX,SAAS,EAAE,IAAI,CAAC,KAAK;aACtB,CAAC;QACJ,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,mDAAmD,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,CAAC,MAAM,SAAS,CAAC,CAAC;QAE1G,OAAO;YACL,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;YAC5B,WAAW;SACZ,CAAC;IAEJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,YAAY,GAAG,0BAA0B,CAAC;QAE9C,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBACrE,YAAY,GAAG,0CAA0C,CAAC;YAC5D,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;gBACvD,YAAY,GAAG,gDAAgD,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,CAAC;gBAC3D,YAAY,GAAG,uCAAuC,CAAC;YACzD,CAAC;iBAAM,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;gBAClD,YAAY,GAAG,yCAAyC,CAAC;YAC3D,CAAC;iBAAM,CAAC;gBACN,YAAY,GAAG,KAAK,CAAC,OAAO,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,4CAA4C,GAAG,GAAG,EAAE,YAAY,CAAC,CAAC;QAEhF,OAAO;YACL,OAAO,EAAE,KAAK;YACd,KAAK,EAAE,YAAY;SACpB,CAAC;IACJ,CAAC;AACH,CAAC"}
@@ -0,0 +1,52 @@
1
+ export interface RateLimiterConfig {
2
+ requestsPerSecond?: number;
3
+ maxBackoff?: number;
4
+ maxConcurrent?: number;
5
+ maxConcurrentPerHost?: number;
6
+ }
7
+ export declare const RATE_LIMITER_PRESETS: {
8
+ readonly conservative: {
9
+ readonly requestsPerSecond: 1;
10
+ readonly maxBackoff: 30000;
11
+ readonly maxConcurrent: 10;
12
+ readonly maxConcurrentPerHost: 2;
13
+ };
14
+ readonly moderate: {
15
+ readonly requestsPerSecond: 2;
16
+ readonly maxBackoff: 30000;
17
+ readonly maxConcurrent: 20;
18
+ readonly maxConcurrentPerHost: 3;
19
+ };
20
+ readonly aggressive: {
21
+ readonly requestsPerSecond: 4;
22
+ readonly maxBackoff: 15000;
23
+ readonly maxConcurrent: 30;
24
+ readonly maxConcurrentPerHost: 5;
25
+ };
26
+ };
27
+ export type RateLimiterPreset = keyof typeof RATE_LIMITER_PRESETS;
28
+ export declare class ScrapingRateLimiter {
29
+ private hosts;
30
+ private readonly baseDelay;
31
+ private readonly maxBackoff;
32
+ private readonly maxConcurrent;
33
+ private readonly maxConcurrentPerHost;
34
+ private activeRequests;
35
+ constructor(options?: RateLimiterConfig);
36
+ static fromPreset(preset: RateLimiterPreset): ScrapingRateLimiter;
37
+ execute<T>(url: string, operation: () => Promise<T>, options?: {
38
+ priority?: number;
39
+ maxRetries?: number;
40
+ }): Promise<T>;
41
+ private extractHost;
42
+ private enqueueRequest;
43
+ private processQueue;
44
+ private handleRequestError;
45
+ private shouldRetry;
46
+ private shouldBackoff;
47
+ private wait;
48
+ getStats(): Record<string, any>;
49
+ }
50
+ export declare const globalRateLimiter: ScrapingRateLimiter;
51
+ export declare function createRateLimiter(config: RateLimiterConfig | RateLimiterPreset): ScrapingRateLimiter;
52
+ //# sourceMappingURL=scraping-rate-limiter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scraping-rate-limiter.d.ts","sourceRoot":"","sources":["../../lib/scraping-rate-limiter.ts"],"names":[],"mappings":"AAmBA,MAAM,WAAW,iBAAiB;IAChC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC/B;AAGD,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;CAsBvB,CAAC;AAEX,MAAM,MAAM,iBAAiB,GAAG,MAAM,OAAO,oBAAoB,CAAC;AAElE,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,KAAK,CAAgC;IAC7C,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAS;IACvC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;IAC9C,OAAO,CAAC,cAAc,CAAqB;gBAE/B,OAAO,GAAE,iBAAsB;IAO3C,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE,iBAAiB,GAAG,mBAAmB;IAI3D,OAAO,CAAC,CAAC,EACb,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EAC3B,OAAO,GAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,UAAU,CAAC,EAAE,MAAM,CAAC;KAChB,GACL,OAAO,CAAC,CAAC,CAAC;IAqBb,OAAO,CAAC,WAAW;IASnB,OAAO,CAAC,cAAc;YAiCR,YAAY;YAiEZ,kBAAkB;IAuChC,OAAO,CAAC,WAAW;IAyBnB,OAAO,CAAC,aAAa;IAerB,OAAO,CAAC,IAAI;IAKZ,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC;CAmBhC;AAGD,eAAO,MAAM,iBAAiB,qBAA6C,CAAC;AAG5E,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,iBAAiB,GAAG,iBAAiB,GAAG,mBAAmB,CAKpG"}