@tyroneross/blog-scraper 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +254 -279
  3. package/dist/lib/circuit-breaker.d.ts +29 -0
  4. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  5. package/dist/lib/circuit-breaker.js +89 -0
  6. package/dist/lib/circuit-breaker.js.map +1 -0
  7. package/dist/lib/content-extractor.d.ts +13 -0
  8. package/dist/lib/content-extractor.d.ts.map +1 -0
  9. package/dist/lib/content-extractor.js +75 -0
  10. package/dist/lib/content-extractor.js.map +1 -0
  11. package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
  12. package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
  13. package/dist/lib/formatters/html-to-markdown.js +146 -0
  14. package/dist/lib/formatters/html-to-markdown.js.map +1 -0
  15. package/dist/lib/formatters/text-cleaner.d.ts +44 -0
  16. package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
  17. package/dist/lib/formatters/text-cleaner.js +143 -0
  18. package/dist/lib/formatters/text-cleaner.js.map +1 -0
  19. package/dist/lib/index.d.ts +96 -0
  20. package/dist/lib/index.d.ts.map +1 -0
  21. package/dist/lib/index.js +184 -0
  22. package/dist/lib/index.js.map +1 -0
  23. package/dist/lib/quality-scorer.d.ts +83 -0
  24. package/dist/lib/quality-scorer.d.ts.map +1 -0
  25. package/dist/lib/quality-scorer.js +376 -0
  26. package/dist/lib/quality-scorer.js.map +1 -0
  27. package/dist/lib/rss-utils.d.ts +31 -0
  28. package/dist/lib/rss-utils.d.ts.map +1 -0
  29. package/dist/lib/rss-utils.js +175 -0
  30. package/dist/lib/rss-utils.js.map +1 -0
  31. package/dist/lib/scraping-rate-limiter.d.ts +52 -0
  32. package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
  33. package/dist/lib/scraping-rate-limiter.js +238 -0
  34. package/dist/lib/scraping-rate-limiter.js.map +1 -0
  35. package/dist/lib/source-orchestrator.d.ts +306 -0
  36. package/dist/lib/source-orchestrator.d.ts.map +1 -0
  37. package/dist/lib/source-orchestrator.js +840 -0
  38. package/dist/lib/source-orchestrator.js.map +1 -0
  39. package/dist/lib/types.d.ts +143 -0
  40. package/dist/lib/types.d.ts.map +1 -0
  41. package/dist/lib/types.js +7 -0
  42. package/dist/lib/types.js.map +1 -0
  43. package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
  44. package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
  45. package/dist/lib/web-scrapers/content-extractor.js +531 -0
  46. package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
  47. package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
  48. package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
  49. package/dist/lib/web-scrapers/html-scraper.js +598 -0
  50. package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
  51. package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
  52. package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
  53. package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
  54. package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
  55. package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
  56. package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
  57. package/dist/lib/web-scrapers/robots-checker.js +285 -0
  58. package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
  59. package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
  60. package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
  61. package/dist/lib/web-scrapers/rss-discovery.js +384 -0
  62. package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
  63. package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
  64. package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
  65. package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
  66. package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
  67. package/package.json +54 -33
  68. package/dist/index.d.mts +0 -949
  69. package/dist/index.d.ts +0 -949
  70. package/dist/index.js +0 -3236
  71. package/dist/index.mjs +0 -3165
@@ -0,0 +1,285 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true }); // Defines `__esModule: true` on the CommonJS exports object.
3
+ exports.globalRobotsChecker = exports.RobotsChecker = void 0; // Initialises both export slots to undefined; the real values are assigned after the class body.
4
/**
 * Fetches, caches and evaluates robots.txt files for the crawler identified
 * by `this.userAgent`.
 *
 * Policy: every failure to obtain or evaluate a robots.txt (missing file,
 * HTTP error, timeout, malformed URL) results in "allowed" — the checker is
 * best-effort and never throws from its public methods.
 *
 * Matching follows the Robots Exclusion Protocol: the group addressed to our
 * product token is preferred over the `*` group, the longest matching
 * pattern decides, and `Allow` wins when an `Allow` and a `Disallow` pattern
 * of equal length both match.
 */
class RobotsChecker {
    constructor() {
        // Parsed robots.txt documents keyed by the robots.txt URL they came from.
        this.cache = new Map();
        this.cacheTimeout = 24 * 60 * 60 * 1000; // 24 hours
        this.userAgent = 'AtomizeNews/1.0';
        this.requestTimeout = 5000; // 5 seconds
    }
    /**
     * Check if a URL is allowed to be crawled according to robots.txt
     *
     * @param url Absolute URL that is about to be fetched.
     * @returns `{ allowed, crawlDelay?, sitemaps, reason }`. `crawlDelay` is in
     *   milliseconds and only present when a matching group declares one.
     */
    async isAllowed(url) {
        try {
            const urlObj = new URL(url);
            const robotsUrl = `${urlObj.protocol}//${urlObj.host}/robots.txt`;
            console.log(`🤖 [Robots] Checking ${url} against ${robotsUrl}`);
            const robotsTxt = await this.getRobotsTxt(robotsUrl);
            if (!robotsTxt) {
                // If robots.txt doesn't exist or can't be fetched, allow by default
                return {
                    allowed: true,
                    sitemaps: [],
                    reason: 'No robots.txt found - allowing by default'
                };
            }
            // Rules may target the query string (e.g. "Disallow: /search?q="),
            // so match against path + query rather than the bare pathname.
            const result = this.checkRules(urlObj.pathname + urlObj.search, robotsTxt);
            console.log(`🤖 [Robots] ${result.allowed ? '✅ Allowed' : '❌ Blocked'}: ${url} - ${result.reason}`);
            return result;
        }
        catch (error) {
            console.warn(`⚠️ [Robots] Error checking robots.txt for ${url}:`, error);
            // On error, default to allowing the request
            return {
                allowed: true,
                sitemaps: [],
                reason: `Error checking robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}`
            };
        }
    }
    /**
     * Get sitemaps listed in robots.txt for a domain
     *
     * @param domain Host name (no scheme); the file is requested over https.
     * @returns Sitemap URLs in file order, or an empty array when unavailable.
     */
    async getSitemaps(domain) {
        try {
            const robotsUrl = `https://${domain}/robots.txt`;
            const robotsTxt = await this.getRobotsTxt(robotsUrl);
            return robotsTxt ? robotsTxt.sitemaps : [];
        }
        catch (error) {
            console.warn(`⚠️ [Robots] Error getting sitemaps for ${domain}:`, error);
            return [];
        }
    }
    /**
     * Get the recommended crawl delay for a domain
     *
     * @param domain Host name (no scheme); the file is requested over https.
     * @returns Delay in milliseconds, or `undefined` when none applies.
     */
    async getCrawlDelay(domain) {
        try {
            const robotsUrl = `https://${domain}/robots.txt`;
            const robotsTxt = await this.getRobotsTxt(robotsUrl);
            if (!robotsTxt)
                return undefined;
            // Find the most specific rule for our user agent
            const rule = this.findBestMatchingRule(robotsTxt.rules);
            return rule?.crawlDelay;
        }
        catch (error) {
            console.warn(`⚠️ [Robots] Error getting crawl delay for ${domain}:`, error);
            return undefined;
        }
    }
    /**
     * Return the parsed robots.txt for `robotsUrl`, from cache when still
     * fresh, otherwise by fetching it.
     *
     * @returns The parsed document, or `null` on 404, other HTTP errors,
     *   network errors and timeouts (failures are logged, not thrown).
     */
    async getRobotsTxt(robotsUrl) {
        // Check cache first
        const cached = this.cache.get(robotsUrl);
        if (cached && Date.now() < cached.expiresAt) {
            return cached;
        }
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.requestTimeout);
        try {
            console.log(`🤖 [Robots] Fetching ${robotsUrl}`);
            const response = await fetch(robotsUrl, {
                headers: {
                    'User-Agent': this.userAgent,
                },
                signal: controller.signal,
            });
            if (!response.ok) {
                if (response.status === 404) {
                    console.log(`🤖 [Robots] No robots.txt found at ${robotsUrl}`);
                    return null;
                }
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }
            // The abort timer is still armed here, so a stalled body download
            // is bounded by the same timeout as the request itself.
            const text = await response.text();
            const robotsTxt = this.parseRobotsTxt(text);
            // Cache the result
            this.cache.set(robotsUrl, robotsTxt);
            console.log(`🤖 [Robots] Successfully parsed robots.txt for ${new URL(robotsUrl).hostname}`);
            return robotsTxt;
        }
        catch (error) {
            if (error instanceof Error && error.name === 'AbortError') {
                console.warn(`⚠️ [Robots] Timeout fetching ${robotsUrl}`);
            }
            else {
                console.warn(`⚠️ [Robots] Error fetching ${robotsUrl}:`, error);
            }
            return null;
        }
        finally {
            // Always release the timer, including when fetch() itself rejects.
            clearTimeout(timeoutId);
        }
    }
    /**
     * Parse the text of a robots.txt file.
     *
     * Handles LF / CRLF / CR line endings, a leading byte-order mark, full-line
     * and inline `#` comments, and groups that start with several consecutive
     * `User-agent` lines (each listed agent receives the group's directives).
     *
     * @param text Raw robots.txt content.
     * @returns `{ rules, sitemaps, fetchedAt, expiresAt }`, with one rule per
     *   `User-agent` line in file order.
     */
    parseRobotsTxt(text) {
        const rules = [];
        const globalSitemaps = [];
        // Rules opened by the current run of User-agent lines.
        let group = [];
        // True once the current group has received a directive; the next
        // User-agent line then opens a new group instead of joining this one.
        let groupHasDirectives = false;
        const source = typeof text === 'string' ? text.replace(/^\uFEFF/, '') : '';
        for (const rawLine of source.split(/\r\n|\r|\n/)) {
            const commentStart = rawLine.indexOf('#');
            const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
            const separator = line.indexOf(':');
            if (separator === -1) {
                continue; // blank, comment-only or malformed line
            }
            const key = line.slice(0, separator).trim().toLowerCase();
            const value = line.slice(separator + 1).trim();
            switch (key) {
                case 'user-agent': {
                    if (groupHasDirectives) {
                        group = [];
                        groupHasDirectives = false;
                    }
                    const rule = this.completeRule({ userAgent: value.toLowerCase() });
                    rules.push(rule);
                    group.push(rule);
                    break;
                }
                case 'disallow':
                    if (group.length > 0) {
                        groupHasDirectives = true;
                        // An empty Disallow means "allow everything": nothing to record.
                        if (value) {
                            for (const rule of group) {
                                rule.disallows.push(value);
                            }
                        }
                    }
                    break;
                case 'allow':
                    if (group.length > 0) {
                        groupHasDirectives = true;
                        if (value) {
                            for (const rule of group) {
                                rule.allows.push(value);
                            }
                        }
                    }
                    break;
                case 'crawl-delay':
                    if (group.length > 0) {
                        groupHasDirectives = true;
                        const delay = parseFloat(value);
                        // Ignore unparsable, negative and infinite values.
                        if (Number.isFinite(delay) && delay >= 0) {
                            for (const rule of group) {
                                rule.crawlDelay = delay * 1000; // Convert to milliseconds
                            }
                        }
                    }
                    break;
                case 'sitemap':
                    // Sitemap lines are file-global; they are also recorded on
                    // the group they appear in.
                    if (value) {
                        globalSitemaps.push(value);
                        for (const rule of group) {
                            rule.sitemaps.push(value);
                        }
                    }
                    break;
            }
        }
        const now = Date.now();
        return {
            rules,
            sitemaps: globalSitemaps,
            fetchedAt: now,
            expiresAt: now + this.cacheTimeout
        };
    }
    /**
     * Fill in defaults for a partially specified rule.
     *
     * @param partial Object that may lack any of the rule fields.
     * @returns A rule with `userAgent` (default `'*'`), `disallows`, `allows`,
     *   `sitemaps` (default empty arrays) and `crawlDelay` (may be undefined).
     */
    completeRule(partial) {
        return {
            userAgent: partial.userAgent || '*',
            disallows: partial.disallows || [],
            allows: partial.allows || [],
            crawlDelay: partial.crawlDelay,
            sitemaps: partial.sitemaps || []
        };
    }
    /**
     * Decide whether `path` may be crawled under a parsed robots.txt.
     *
     * The longest matching pattern wins; when an Allow and a Disallow pattern
     * of the same length both match, Allow wins.
     *
     * @param path URL path (optionally followed by `?query`).
     * @param robotsTxt Result of `parseRobotsTxt`.
     * @returns `{ allowed, crawlDelay?, sitemaps, reason }`.
     */
    checkRules(path, robotsTxt) {
        // Find the best matching rule for our user agent
        const rule = this.findBestMatchingRule(robotsTxt.rules);
        if (!rule) {
            return {
                allowed: true,
                sitemaps: robotsTxt.sitemaps,
                reason: 'No applicable rules found'
            };
        }
        // Longest pattern in `patterns` that matches `path`, or '' when none
        // does (stored patterns are never empty, so '' is a safe sentinel).
        const longestMatch = (patterns) => patterns.reduce((best, pattern) => (pattern.length > best.length && this.matchesPattern(path, pattern) ? pattern : best), '');
        const allowPattern = longestMatch(rule.allows);
        const disallowPattern = longestMatch(rule.disallows);
        if (disallowPattern && disallowPattern.length > allowPattern.length) {
            return {
                allowed: false,
                crawlDelay: rule.crawlDelay,
                sitemaps: robotsTxt.sitemaps,
                reason: `Blocked by pattern: ${disallowPattern}`
            };
        }
        if (allowPattern) {
            return {
                allowed: true,
                crawlDelay: rule.crawlDelay,
                sitemaps: robotsTxt.sitemaps,
                reason: `Explicitly allowed by pattern: ${allowPattern}`
            };
        }
        // If no rules match, allow by default
        return {
            allowed: true,
            crawlDelay: rule.crawlDelay,
            sitemaps: robotsTxt.sitemaps,
            reason: 'No matching disallow rules'
        };
    }
    /**
     * Select the rule that applies to our crawler.
     *
     * Groups naming our user agent — either the full string or just the
     * product token before the `/version` suffix — take priority over the `*`
     * group. When several groups qualify they are merged into one rule.
     *
     * @param rules Rules produced by `parseRobotsTxt`.
     * @returns The applicable rule, or `null` when no group addresses us.
     */
    findBestMatchingRule(rules) {
        const fullName = this.userAgent.toLowerCase();
        const productToken = fullName.split('/')[0].trim();
        const ours = rules.filter(rule => rule.userAgent === fullName || rule.userAgent === productToken);
        const candidates = ours.length > 0 ? ours : rules.filter(rule => rule.userAgent === '*');
        if (candidates.length === 0)
            return null;
        if (candidates.length === 1)
            return candidates[0];
        // Several groups address the same agent: combine their directives.
        return {
            userAgent: candidates[0].userAgent,
            disallows: candidates.flatMap(rule => rule.disallows),
            allows: candidates.flatMap(rule => rule.allows),
            crawlDelay: candidates.map(rule => rule.crawlDelay).find(delay => delay !== undefined),
            sitemaps: [...new Set(candidates.flatMap(rule => rule.sitemaps))]
        };
    }
    /**
     * Test a path against one robots.txt path pattern.
     *
     * Supported syntax: plain prefixes, `*` (any run of characters, possibly
     * empty) and a trailing `$` (pattern must reach the end of the path).
     * Matching is done by a linear scan rather than a generated regular
     * expression, because patterns come from untrusted remote files.
     *
     * @param path URL path (optionally followed by `?query`).
     * @param pattern Value of an Allow/Disallow line.
     * @returns `true` when the pattern matches the path.
     */
    matchesPattern(path, pattern) {
        if (pattern === '') {
            // Empty disallow means allow everything
            return false;
        }
        if (pattern === '/') {
            // Root disallow means disallow everything
            return true;
        }
        const anchored = pattern.endsWith('$');
        const body = anchored ? pattern.slice(0, -1) : pattern;
        const segments = body.split('*');
        // The text before the first wildcard must be a literal prefix.
        if (!path.startsWith(segments[0])) {
            return false;
        }
        let position = segments[0].length;
        const lastIndex = segments.length - 1;
        for (let i = 1; i <= lastIndex; i++) {
            const segment = segments[i];
            if (anchored && i === lastIndex) {
                // The final segment of an anchored pattern must close out the
                // path without overlapping what earlier segments consumed.
                return path.length - segment.length >= position && path.endsWith(segment);
            }
            // Leftmost placement leaves the most room for later segments.
            const found = path.indexOf(segment, position);
            if (found === -1) {
                return false;
            }
            position = found + segment.length;
        }
        // Only reached for patterns without wildcards, or unanchored patterns
        // whose segments were all found.
        return anchored ? position === path.length : true;
    }
    // Clear cache (useful for testing)
    clearCache() {
        this.cache.clear();
    }
    /**
     * Get cache stats
     *
     * @returns `{ size, entries }` where each entry reports the cached URL,
     *   ISO-8601 fetch/expiry timestamps and the rule and sitemap counts.
     */
    getCacheStats() {
        return {
            size: this.cache.size,
            entries: Array.from(this.cache.entries()).map(([url, data]) => ({
                url,
                fetchedAt: new Date(data.fetchedAt).toISOString(),
                expiresAt: new Date(data.expiresAt).toISOString(),
                rulesCount: data.rules.length,
                sitemapsCount: data.sitemaps.length
            }))
        };
    }
}
282
+ exports.RobotsChecker = RobotsChecker; // Exposes the class so callers can create their own checker instances.
283
+ // Default global instance, constructed once when this module is evaluated
284
+ exports.globalRobotsChecker = new RobotsChecker(); // Holds its own cache Map (created in the constructor).
285
+ //# sourceMappingURL=robots-checker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots-checker.js","sourceRoot":"","sources":["../../../lib/web-scrapers/robots-checker.ts"],"names":[],"mappings":";;;AAeA,MAAa,aAAa;IAA1B;QACU,UAAK,GAAG,IAAI,GAAG,EAAqB,CAAC;QAC5B,iBAAY,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,WAAW;QAC/C,cAAS,GAAG,iBAAiB,CAAC;QAC9B,mBAAc,GAAG,IAAI,CAAC,CAAC,YAAY;IAiUtD,CAAC;IA/TC;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QAMzB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,MAAM,SAAS,GAAG,GAAG,MAAM,CAAC,QAAQ,KAAK,MAAM,CAAC,IAAI,aAAa,CAAC;YAElE,OAAO,CAAC,GAAG,CAAC,wBAAwB,GAAG,YAAY,SAAS,EAAE,CAAC,CAAC;YAEhE,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrD,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,oEAAoE;gBACpE,OAAO;oBACL,OAAO,EAAE,IAAI;oBACb,QAAQ,EAAE,EAAE;oBACZ,MAAM,EAAE,2CAA2C;iBACpD,CAAC;YACJ,CAAC;YAED,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAE3D,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,KAAK,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;YAEpG,OAAO,MAAM,CAAC;QAEhB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,6CAA6C,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACzE,4CAA4C;YAC5C,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE,EAAE;gBACZ,MAAM,EAAE,8BAA8B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;aACjG,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,MAAc;QAC9B,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,MAAM,aAAa,CAAC;YACjD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrD,OAAO,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,0CAA0C,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;YACzE,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,aAAa,CAAC,MAAc;QAChC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,MAAM,aAAa,CAAC;YACjD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAErD,IAAI,CAAC,SAAS;gBAAE,OAAO,SAAS,CAAC;YAEjC,iDAAiD;YACjD,MAAM,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YACxD,OAAO,IAAI,EAAE,UAAU,CAAC;QAE1B,CAAC;QAAC,
OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,6CAA6C,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;YAC5E,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,SAAiB;QAC1C,oBAAoB;QACpB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACzC,IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC,SAAS,EAAE,CAAC;YAC5C,OAAO,MAAM,CAAC;QAChB,CAAC;QAED,IAAI,CAAC;YACH,OAAO,CAAC,GAAG,CAAC,wBAAwB,SAAS,EAAE,CAAC,CAAC;YAEjD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,cAAc,CAAC,CAAC;YAE5E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;gBACtC,OAAO,EAAE;oBACP,YAAY,EAAE,IAAI,CAAC,SAAS;iBAC7B;gBACD,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;oBAC5B,OAAO,CAAC,GAAG,CAAC,sCAAsC,SAAS,EAAE,CAAC,CAAC;oBAC/D,OAAO,IAAI,CAAC;gBACd,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YACrE,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAE5C,mBAAmB;YACnB,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;YAErC,OAAO,CAAC,GAAG,CAAC,kDAAkD,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;YAC7F,OAAO,SAAS,CAAC;QAEnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAC1D,OAAO,CAAC,IAAI,CAAC,gCAAgC,SAAS,EAAE,CAAC,CAAC;YAC5D,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,8BAA8B,SAAS,GAAG,EAAE,KAAK,CAAC,CAAC;YAClE,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACtG,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,MAAM,cAAc,GAAa,EAAE,CAAC;QAEpC,IAAI,WAAW,GAA+B,IAAI,CAAC;QAEnD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;Y
AC7C,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;YAE1C,QAAQ,QAAQ,EAAE,CAAC;gBACjB,KAAK,YAAY;oBACf,mBAAmB;oBACnB,IAAI,WAAW,EAAE,CAAC;wBAChB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC;oBAC7C,CAAC;oBACD,WAAW,GAAG;wBACZ,SAAS,EAAE,KAAK,CAAC,WAAW,EAAE;wBAC9B,SAAS,EAAE,EAAE;wBACb,MAAM,EAAE,EAAE;wBACV,QAAQ,EAAE,EAAE;qBACb,CAAC;oBACF,MAAM;gBAER,KAAK,UAAU;oBACb,IAAI,WAAW,EAAE,CAAC;wBAChB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,IAAI,EAAE,CAAC;wBACpD,IAAI,KAAK,EAAE,CAAC;4BACV,WAAW,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACpC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,OAAO;oBACV,IAAI,WAAW,EAAE,CAAC;wBAChB,WAAW,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,IAAI,EAAE,CAAC;wBAC9C,IAAI,KAAK,EAAE,CAAC;4BACV,WAAW,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACjC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,aAAa;oBAChB,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;wBAChC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;4BAClB,WAAW,CAAC,UAAU,GAAG,KAAK,GAAG,IAAI,CAAC,CAAC,0BAA0B;wBACnE,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,SAAS;oBACZ,IAAI,KAAK,EAAE,CAAC;wBACV,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBAC3B,IAAI,WAAW,EAAE,CAAC;4BAChB,WAAW,CAAC,QAAQ,GAAG,WAAW,CAAC,QAAQ,IAAI,EAAE,CAAC;4BAClD,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACnC,CAAC;oBACH,CAAC;oBACD,MAAM;YACV,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,IAAI,WAAW,EAAE,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,OAAO;YACL,KAAK;YACL,QAAQ,EAAE,cAAc;YACxB,SAAS,EAAE,GAAG;YACd,SAAS,EAAE,GAAG,GAAG,IAAI,CAAC,YAAY;SACnC,CAAC;IACJ,CAAC;IAEO,YAAY,CAAC,OAA4B;QAC/C,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,GAAG;YACnC,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,EAAE;YAClC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,EAAE;YAC5B,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,EAAE;SACjC,CAAC;IACJ,CAAC;IAEO,UAAU,CAAC,IAAY,EAAE,SAAoB;QAMnD,iDAAiD;QACjD,MAAM,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,SAAS,CAAC,KAAK,CA
AC,CAAC;QAExD,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE,SAAS,CAAC,QAAQ;gBAC5B,MAAM,EAAE,2BAA2B;aACpC,CAAC;QACJ,CAAC;QAED,6DAA6D;QAC7D,KAAK,MAAM,YAAY,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YACvC,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,YAAY,CAAC,EAAE,CAAC;gBAC5C,OAAO;oBACL,OAAO,EAAE,IAAI;oBACb,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;oBAC5B,MAAM,EAAE,kCAAkC,YAAY,EAAE;iBACzD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,KAAK,MAAM,eAAe,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YAC7C,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;gBAC/C,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;oBAC5B,MAAM,EAAE,uBAAuB,eAAe,EAAE;iBACjD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,OAAO;YACL,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;YAC5B,MAAM,EAAE,4BAA4B;SACrC,CAAC;IACJ,CAAC;IAEO,oBAAoB,CAAC,KAAmB;QAC9C,gEAAgE;QAChE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC;QACvF,IAAI,UAAU;YAAE,OAAO,UAAU,CAAC;QAElC,MAAM,aAAa,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,KAAK,GAAG,CAAC,CAAC;QACjE,IAAI,aAAa;YAAE,OAAO,aAAa,CAAC;QAExC,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,OAAe;QAClD,IAAI,OAAO,KAAK,EAAE,EAAE,CAAC;YACnB,wCAAwC;YACxC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,OAAO,KAAK,GAAG,EAAE,CAAC;YACpB,0CAA0C;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC1B,+CAA+C;YAC/C,MAAM,YAAY,GAAG,OAAO;iBACzB,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,6BAA6B;iBACpE,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,kBAAkB;YAE7C,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,GAAG,GAAG,YAAY,CAAC,CAAC;YAC7C,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;QAED,wDAAwD;QACxD,OAAO,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAClC,CAAC;IAED,mCAAmC;IACnC,UAAU;QACR,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,kBAAkB;IAClB,aAAa;QACX,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KA
AK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC9D,GAAG;gBACH,SAAS,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;gBACjD,SAAS,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;gBACjD,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;gBAC7B,aAAa,EAAE,IAAI,CAAC,QAAQ,CAAC,MAAM;aACpC,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF;AArUD,sCAqUC;AAED,0BAA0B;AACb,QAAA,mBAAmB,GAAG,IAAI,aAAa,EAAE,CAAC"}
@@ -0,0 +1,62 @@
1
+ export interface DiscoveredFeed { // Element type of the array that RSSDiscovery.discoverFeeds resolves to.
2
+ url: string; // Presumably the feed's URL — whether it is always absolute is not visible here; TODO confirm.
3
+ title?: string; // Optional; may be absent.
4
+ type: 'rss' | 'atom' | 'rdf'; // Feed format, restricted to these three literals.
5
+ source: 'link-tag' | 'common-path' | 'content-scan'; // NOTE(review): looks like the discovery step that produced the entry (link tags / common paths / content scan) — confirm against the implementation.
6
+ confidence: number; // Numeric score; its scale and range are not visible in this declaration — TODO confirm.
7
+ }
8
+ export declare class RSSDiscovery { // discoverFeeds is the only public member; every other member below is private.
9
+ private readonly userAgent; // Type and value are not visible in this declaration file.
10
+ private readonly timeout; // Type, value and unit are not visible in this declaration file.
11
+ private readonly maxRedirects; // Type and value are not visible in this declaration file.
12
+ /**
13
+ * Discover RSS feeds from a given URL
 *
 * @param url URL to inspect.
 * @returns Promise resolving to an array of DiscoveredFeed entries.
14
+ */
15
+ discoverFeeds(url: string): Promise<DiscoveredFeed[]>;
16
+ /**
17
+ * Check if the URL itself is a direct feed
18
+ */
19
+ private checkDirectFeed;
20
+ /**
21
+ * Fetch HTML page content
22
+ */
23
+ private fetchPage;
24
+ /**
25
+ * Extract feed URLs from HTML link tags
26
+ */
27
+ private extractFeedsFromHTML;
28
+ /**
29
+ * Check common feed paths
30
+ */
31
+ private checkCommonPaths;
32
+ /**
33
+ * Scan HTML content for feed-like patterns
34
+ */
35
+ private scanForFeedContent;
36
+ /**
37
+ * Validate if a URL is actually a feed
38
+ */
39
+ private validateFeedUrl;
40
+ /**
41
+ * Resolve relative URLs to absolute URLs
42
+ */
43
+ private resolveUrl;
44
+ /**
45
+ * Check if content type indicates a feed
46
+ */
47
+ private isFeedContentType;
48
+ /**
49
+ * Determine feed type from content type
50
+ */
51
+ private determineFeedType;
52
+ /**
53
+ * Guess feed type from URL or text
54
+ */
55
+ private guessFeedType;
56
+ /**
57
+ * Check if a link looks like it could be a feed
58
+ */
59
+ private isFeedLikeLink;
60
+ }
61
+ export declare const globalRSSDiscovery: RSSDiscovery; // Ready-made RSSDiscovery instance exported by the module.
62
+ //# sourceMappingURL=rss-discovery.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rss-discovery.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/rss-discovery.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,KAAK,GAAG,MAAM,GAAG,KAAK,CAAC;IAC7B,MAAM,EAAE,UAAU,GAAG,aAAa,GAAG,cAAc,CAAC;IACpD,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiF;IAC3G,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAK;IAElC;;OAEG;IACG,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;IAuD3D;;OAEG;YACW,eAAe;IAuC7B;;OAEG;YACW,SAAS;IAmCvB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAsD5B;;OAEG;YACW,gBAAgB;IA+C9B;;OAEG;YACW,kBAAkB;IAqChC;;OAEG;YACW,eAAe;IAgC7B;;OAEG;IACH,OAAO,CAAC,UAAU;IAQlB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IASzB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAOzB;;OAEG;IACH,OAAO,CAAC,aAAa;IAOrB;;OAEG;IACH,OAAO,CAAC,cAAc;CAUvB;AAGD,eAAO,MAAM,kBAAkB,cAAqB,CAAC"}