@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.globalRobotsChecker = exports.RobotsChecker = void 0;
|
|
4
|
+
class RobotsChecker {
|
|
5
|
+
constructor() {
|
|
6
|
+
this.cache = new Map();
|
|
7
|
+
this.cacheTimeout = 24 * 60 * 60 * 1000; // 24 hours
|
|
8
|
+
this.userAgent = 'AtomizeNews/1.0';
|
|
9
|
+
this.requestTimeout = 5000; // 5 seconds
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Check if a URL is allowed to be crawled according to robots.txt
|
|
13
|
+
*/
|
|
14
|
+
async isAllowed(url) {
|
|
15
|
+
try {
|
|
16
|
+
const urlObj = new URL(url);
|
|
17
|
+
const robotsUrl = `${urlObj.protocol}//${urlObj.host}/robots.txt`;
|
|
18
|
+
console.log(`🤖 [Robots] Checking ${url} against ${robotsUrl}`);
|
|
19
|
+
const robotsTxt = await this.getRobotsTxt(robotsUrl);
|
|
20
|
+
if (!robotsTxt) {
|
|
21
|
+
// If robots.txt doesn't exist or can't be fetched, allow by default
|
|
22
|
+
return {
|
|
23
|
+
allowed: true,
|
|
24
|
+
sitemaps: [],
|
|
25
|
+
reason: 'No robots.txt found - allowing by default'
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
const result = this.checkRules(urlObj.pathname, robotsTxt);
|
|
29
|
+
console.log(`🤖 [Robots] ${result.allowed ? '✅ Allowed' : '❌ Blocked'}: ${url} - ${result.reason}`);
|
|
30
|
+
return result;
|
|
31
|
+
}
|
|
32
|
+
catch (error) {
|
|
33
|
+
console.warn(`⚠️ [Robots] Error checking robots.txt for ${url}:`, error);
|
|
34
|
+
// On error, default to allowing the request
|
|
35
|
+
return {
|
|
36
|
+
allowed: true,
|
|
37
|
+
sitemaps: [],
|
|
38
|
+
reason: `Error checking robots.txt: ${error instanceof Error ? error.message : 'Unknown error'}`
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Get sitemaps listed in robots.txt for a domain
|
|
44
|
+
*/
|
|
45
|
+
async getSitemaps(domain) {
|
|
46
|
+
try {
|
|
47
|
+
const robotsUrl = `https://${domain}/robots.txt`;
|
|
48
|
+
const robotsTxt = await this.getRobotsTxt(robotsUrl);
|
|
49
|
+
return robotsTxt ? robotsTxt.sitemaps : [];
|
|
50
|
+
}
|
|
51
|
+
catch (error) {
|
|
52
|
+
console.warn(`⚠️ [Robots] Error getting sitemaps for ${domain}:`, error);
|
|
53
|
+
return [];
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Get the recommended crawl delay for a domain
|
|
58
|
+
*/
|
|
59
|
+
async getCrawlDelay(domain) {
|
|
60
|
+
try {
|
|
61
|
+
const robotsUrl = `https://${domain}/robots.txt`;
|
|
62
|
+
const robotsTxt = await this.getRobotsTxt(robotsUrl);
|
|
63
|
+
if (!robotsTxt)
|
|
64
|
+
return undefined;
|
|
65
|
+
// Find the most specific rule for our user agent
|
|
66
|
+
const rule = this.findBestMatchingRule(robotsTxt.rules);
|
|
67
|
+
return rule?.crawlDelay;
|
|
68
|
+
}
|
|
69
|
+
catch (error) {
|
|
70
|
+
console.warn(`⚠️ [Robots] Error getting crawl delay for ${domain}:`, error);
|
|
71
|
+
return undefined;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
async getRobotsTxt(robotsUrl) {
|
|
75
|
+
// Check cache first
|
|
76
|
+
const cached = this.cache.get(robotsUrl);
|
|
77
|
+
if (cached && Date.now() < cached.expiresAt) {
|
|
78
|
+
return cached;
|
|
79
|
+
}
|
|
80
|
+
try {
|
|
81
|
+
console.log(`🤖 [Robots] Fetching ${robotsUrl}`);
|
|
82
|
+
const controller = new AbortController();
|
|
83
|
+
const timeoutId = setTimeout(() => controller.abort(), this.requestTimeout);
|
|
84
|
+
const response = await fetch(robotsUrl, {
|
|
85
|
+
headers: {
|
|
86
|
+
'User-Agent': this.userAgent,
|
|
87
|
+
},
|
|
88
|
+
signal: controller.signal,
|
|
89
|
+
});
|
|
90
|
+
clearTimeout(timeoutId);
|
|
91
|
+
if (!response.ok) {
|
|
92
|
+
if (response.status === 404) {
|
|
93
|
+
console.log(`🤖 [Robots] No robots.txt found at ${robotsUrl}`);
|
|
94
|
+
return null;
|
|
95
|
+
}
|
|
96
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
97
|
+
}
|
|
98
|
+
const text = await response.text();
|
|
99
|
+
const robotsTxt = this.parseRobotsTxt(text);
|
|
100
|
+
// Cache the result
|
|
101
|
+
this.cache.set(robotsUrl, robotsTxt);
|
|
102
|
+
console.log(`🤖 [Robots] Successfully parsed robots.txt for ${new URL(robotsUrl).hostname}`);
|
|
103
|
+
return robotsTxt;
|
|
104
|
+
}
|
|
105
|
+
catch (error) {
|
|
106
|
+
if (error instanceof Error && error.name === 'AbortError') {
|
|
107
|
+
console.warn(`⚠️ [Robots] Timeout fetching ${robotsUrl}`);
|
|
108
|
+
}
|
|
109
|
+
else {
|
|
110
|
+
console.warn(`⚠️ [Robots] Error fetching ${robotsUrl}:`, error);
|
|
111
|
+
}
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
parseRobotsTxt(text) {
|
|
116
|
+
const lines = text.split('\n').map(line => line.trim()).filter(line => line && !line.startsWith('#'));
|
|
117
|
+
const rules = [];
|
|
118
|
+
const globalSitemaps = [];
|
|
119
|
+
let currentRule = null;
|
|
120
|
+
for (const line of lines) {
|
|
121
|
+
const [key, ...valueParts] = line.split(':');
|
|
122
|
+
const value = valueParts.join(':').trim();
|
|
123
|
+
const lowerKey = key.toLowerCase().trim();
|
|
124
|
+
switch (lowerKey) {
|
|
125
|
+
case 'user-agent':
|
|
126
|
+
// Start a new rule
|
|
127
|
+
if (currentRule) {
|
|
128
|
+
rules.push(this.completeRule(currentRule));
|
|
129
|
+
}
|
|
130
|
+
currentRule = {
|
|
131
|
+
userAgent: value.toLowerCase(),
|
|
132
|
+
disallows: [],
|
|
133
|
+
allows: [],
|
|
134
|
+
sitemaps: []
|
|
135
|
+
};
|
|
136
|
+
break;
|
|
137
|
+
case 'disallow':
|
|
138
|
+
if (currentRule) {
|
|
139
|
+
currentRule.disallows = currentRule.disallows || [];
|
|
140
|
+
if (value) {
|
|
141
|
+
currentRule.disallows.push(value);
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
break;
|
|
145
|
+
case 'allow':
|
|
146
|
+
if (currentRule) {
|
|
147
|
+
currentRule.allows = currentRule.allows || [];
|
|
148
|
+
if (value) {
|
|
149
|
+
currentRule.allows.push(value);
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
break;
|
|
153
|
+
case 'crawl-delay':
|
|
154
|
+
if (currentRule && value) {
|
|
155
|
+
const delay = parseFloat(value);
|
|
156
|
+
if (!isNaN(delay)) {
|
|
157
|
+
currentRule.crawlDelay = delay * 1000; // Convert to milliseconds
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
break;
|
|
161
|
+
case 'sitemap':
|
|
162
|
+
if (value) {
|
|
163
|
+
globalSitemaps.push(value);
|
|
164
|
+
if (currentRule) {
|
|
165
|
+
currentRule.sitemaps = currentRule.sitemaps || [];
|
|
166
|
+
currentRule.sitemaps.push(value);
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
break;
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
// Don't forget the last rule
|
|
173
|
+
if (currentRule) {
|
|
174
|
+
rules.push(this.completeRule(currentRule));
|
|
175
|
+
}
|
|
176
|
+
const now = Date.now();
|
|
177
|
+
return {
|
|
178
|
+
rules,
|
|
179
|
+
sitemaps: globalSitemaps,
|
|
180
|
+
fetchedAt: now,
|
|
181
|
+
expiresAt: now + this.cacheTimeout
|
|
182
|
+
};
|
|
183
|
+
}
|
|
184
|
+
completeRule(partial) {
|
|
185
|
+
return {
|
|
186
|
+
userAgent: partial.userAgent || '*',
|
|
187
|
+
disallows: partial.disallows || [],
|
|
188
|
+
allows: partial.allows || [],
|
|
189
|
+
crawlDelay: partial.crawlDelay,
|
|
190
|
+
sitemaps: partial.sitemaps || []
|
|
191
|
+
};
|
|
192
|
+
}
|
|
193
|
+
checkRules(path, robotsTxt) {
|
|
194
|
+
// Find the best matching rule for our user agent
|
|
195
|
+
const rule = this.findBestMatchingRule(robotsTxt.rules);
|
|
196
|
+
if (!rule) {
|
|
197
|
+
return {
|
|
198
|
+
allowed: true,
|
|
199
|
+
sitemaps: robotsTxt.sitemaps,
|
|
200
|
+
reason: 'No applicable rules found'
|
|
201
|
+
};
|
|
202
|
+
}
|
|
203
|
+
// Check Allow rules first (they have priority over Disallow)
|
|
204
|
+
for (const allowPattern of rule.allows) {
|
|
205
|
+
if (this.matchesPattern(path, allowPattern)) {
|
|
206
|
+
return {
|
|
207
|
+
allowed: true,
|
|
208
|
+
crawlDelay: rule.crawlDelay,
|
|
209
|
+
sitemaps: robotsTxt.sitemaps,
|
|
210
|
+
reason: `Explicitly allowed by pattern: ${allowPattern}`
|
|
211
|
+
};
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
// Check Disallow rules
|
|
215
|
+
for (const disallowPattern of rule.disallows) {
|
|
216
|
+
if (this.matchesPattern(path, disallowPattern)) {
|
|
217
|
+
return {
|
|
218
|
+
allowed: false,
|
|
219
|
+
crawlDelay: rule.crawlDelay,
|
|
220
|
+
sitemaps: robotsTxt.sitemaps,
|
|
221
|
+
reason: `Blocked by pattern: ${disallowPattern}`
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
// If no rules match, allow by default
|
|
226
|
+
return {
|
|
227
|
+
allowed: true,
|
|
228
|
+
crawlDelay: rule.crawlDelay,
|
|
229
|
+
sitemaps: robotsTxt.sitemaps,
|
|
230
|
+
reason: 'No matching disallow rules'
|
|
231
|
+
};
|
|
232
|
+
}
|
|
233
|
+
findBestMatchingRule(rules) {
|
|
234
|
+
// Priority order: exact match for our user agent, then wildcard
|
|
235
|
+
const exactMatch = rules.find(rule => rule.userAgent === this.userAgent.toLowerCase());
|
|
236
|
+
if (exactMatch)
|
|
237
|
+
return exactMatch;
|
|
238
|
+
const wildcardMatch = rules.find(rule => rule.userAgent === '*');
|
|
239
|
+
if (wildcardMatch)
|
|
240
|
+
return wildcardMatch;
|
|
241
|
+
return null;
|
|
242
|
+
}
|
|
243
|
+
matchesPattern(path, pattern) {
|
|
244
|
+
if (pattern === '') {
|
|
245
|
+
// Empty disallow means allow everything
|
|
246
|
+
return false;
|
|
247
|
+
}
|
|
248
|
+
if (pattern === '/') {
|
|
249
|
+
// Root disallow means disallow everything
|
|
250
|
+
return true;
|
|
251
|
+
}
|
|
252
|
+
// Handle wildcards - simplified pattern matching
|
|
253
|
+
if (pattern.includes('*')) {
|
|
254
|
+
// Convert robots.txt wildcard pattern to regex
|
|
255
|
+
const regexPattern = pattern
|
|
256
|
+
.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') // Escape regex special chars
|
|
257
|
+
.replace(/\\\*/g, '.*'); // Convert * to .*
|
|
258
|
+
const regex = new RegExp('^' + regexPattern);
|
|
259
|
+
return regex.test(path);
|
|
260
|
+
}
|
|
261
|
+
// Simple prefix matching for patterns without wildcards
|
|
262
|
+
return path.startsWith(pattern);
|
|
263
|
+
}
|
|
264
|
+
// Clear cache (useful for testing)
|
|
265
|
+
clearCache() {
|
|
266
|
+
this.cache.clear();
|
|
267
|
+
}
|
|
268
|
+
// Get cache stats
|
|
269
|
+
getCacheStats() {
|
|
270
|
+
return {
|
|
271
|
+
size: this.cache.size,
|
|
272
|
+
entries: Array.from(this.cache.entries()).map(([url, data]) => ({
|
|
273
|
+
url,
|
|
274
|
+
fetchedAt: new Date(data.fetchedAt).toISOString(),
|
|
275
|
+
expiresAt: new Date(data.expiresAt).toISOString(),
|
|
276
|
+
rulesCount: data.rules.length,
|
|
277
|
+
sitemapsCount: data.sitemaps.length
|
|
278
|
+
}))
|
|
279
|
+
};
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
exports.RobotsChecker = RobotsChecker;
|
|
283
|
+
// Default shared instance: every caller that uses it shares one robots.txt cache
|
|
284
|
+
exports.globalRobotsChecker = new RobotsChecker();
|
|
285
|
+
//# sourceMappingURL=robots-checker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-checker.js","sourceRoot":"","sources":["../../../lib/web-scrapers/robots-checker.ts"],"names":[],"mappings":";;;AAeA,MAAa,aAAa;IAA1B;QACU,UAAK,GAAG,IAAI,GAAG,EAAqB,CAAC;QAC5B,iBAAY,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,WAAW;QAC/C,cAAS,GAAG,iBAAiB,CAAC;QAC9B,mBAAc,GAAG,IAAI,CAAC,CAAC,YAAY;IAiUtD,CAAC;IA/TC;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QAMzB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC5B,MAAM,SAAS,GAAG,GAAG,MAAM,CAAC,QAAQ,KAAK,MAAM,CAAC,IAAI,aAAa,CAAC;YAElE,OAAO,CAAC,GAAG,CAAC,wBAAwB,GAAG,YAAY,SAAS,EAAE,CAAC,CAAC;YAEhE,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrD,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,oEAAoE;gBACpE,OAAO;oBACL,OAAO,EAAE,IAAI;oBACb,QAAQ,EAAE,EAAE;oBACZ,MAAM,EAAE,2CAA2C;iBACpD,CAAC;YACJ,CAAC;YAED,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAE3D,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,KAAK,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;YAEpG,OAAO,MAAM,CAAC;QAEhB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,6CAA6C,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACzE,4CAA4C;YAC5C,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE,EAAE;gBACZ,MAAM,EAAE,8BAA8B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;aACjG,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,MAAc;QAC9B,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,MAAM,aAAa,CAAC;YACjD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrD,OAAO,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,0CAA0C,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;YACzE,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,aAAa,CAAC,MAAc;QAChC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,WAAW,MAAM,aAAa,CAAC;YACjD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YAErD,IAAI,CAAC,SAAS;gBAAE,OAAO,SAAS,CAAC;YAEjC,iDAAiD;YACjD,MAAM,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YACxD,OAAO,IAAI,EAAE,UAAU,CAAC;QAE1B,CAAC;QAAC,OA
AO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,6CAA6C,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;YAC5E,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,YAAY,CAAC,SAAiB;QAC1C,oBAAoB;QACpB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACzC,IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC,SAAS,EAAE,CAAC;YAC5C,OAAO,MAAM,CAAC;QAChB,CAAC;QAED,IAAI,CAAC;YACH,OAAO,CAAC,GAAG,CAAC,wBAAwB,SAAS,EAAE,CAAC,CAAC;YAEjD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,cAAc,CAAC,CAAC;YAE5E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;gBACtC,OAAO,EAAE;oBACP,YAAY,EAAE,IAAI,CAAC,SAAS;iBAC7B;gBACD,MAAM,EAAE,UAAU,CAAC,MAAM;aAC1B,CAAC,CAAC;YAEH,YAAY,CAAC,SAAS,CAAC,CAAC;YAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;oBAC5B,OAAO,CAAC,GAAG,CAAC,sCAAsC,SAAS,EAAE,CAAC,CAAC;oBAC/D,OAAO,IAAI,CAAC;gBACd,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;YACrE,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAE5C,mBAAmB;YACnB,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;YAErC,OAAO,CAAC,GAAG,CAAC,kDAAkD,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;YAC7F,OAAO,SAAS,CAAC;QAEnB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;gBAC1D,OAAO,CAAC,IAAI,CAAC,gCAAgC,SAAS,EAAE,CAAC,CAAC;YAC5D,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,8BAA8B,SAAS,GAAG,EAAE,KAAK,CAAC,CAAC;YAClE,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACtG,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,MAAM,cAAc,GAAa,EAAE,CAAC;QAEpC,IAAI,WAAW,GAA+B,IAAI,CAAC;QAEnD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC
7C,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;YAE1C,QAAQ,QAAQ,EAAE,CAAC;gBACjB,KAAK,YAAY;oBACf,mBAAmB;oBACnB,IAAI,WAAW,EAAE,CAAC;wBAChB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC;oBAC7C,CAAC;oBACD,WAAW,GAAG;wBACZ,SAAS,EAAE,KAAK,CAAC,WAAW,EAAE;wBAC9B,SAAS,EAAE,EAAE;wBACb,MAAM,EAAE,EAAE;wBACV,QAAQ,EAAE,EAAE;qBACb,CAAC;oBACF,MAAM;gBAER,KAAK,UAAU;oBACb,IAAI,WAAW,EAAE,CAAC;wBAChB,WAAW,CAAC,SAAS,GAAG,WAAW,CAAC,SAAS,IAAI,EAAE,CAAC;wBACpD,IAAI,KAAK,EAAE,CAAC;4BACV,WAAW,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACpC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,OAAO;oBACV,IAAI,WAAW,EAAE,CAAC;wBAChB,WAAW,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,IAAI,EAAE,CAAC;wBAC9C,IAAI,KAAK,EAAE,CAAC;4BACV,WAAW,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACjC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,aAAa;oBAChB,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;wBAChC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;4BAClB,WAAW,CAAC,UAAU,GAAG,KAAK,GAAG,IAAI,CAAC,CAAC,0BAA0B;wBACnE,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,SAAS;oBACZ,IAAI,KAAK,EAAE,CAAC;wBACV,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBAC3B,IAAI,WAAW,EAAE,CAAC;4BAChB,WAAW,CAAC,QAAQ,GAAG,WAAW,CAAC,QAAQ,IAAI,EAAE,CAAC;4BAClD,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;wBACnC,CAAC;oBACH,CAAC;oBACD,MAAM;YACV,CAAC;QACH,CAAC;QAED,6BAA6B;QAC7B,IAAI,WAAW,EAAE,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,OAAO;YACL,KAAK;YACL,QAAQ,EAAE,cAAc;YACxB,SAAS,EAAE,GAAG;YACd,SAAS,EAAE,GAAG,GAAG,IAAI,CAAC,YAAY;SACnC,CAAC;IACJ,CAAC;IAEO,YAAY,CAAC,OAA4B;QAC/C,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,GAAG;YACnC,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,EAAE;YAClC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,EAAE;YAC5B,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,EAAE;SACjC,CAAC;IACJ,CAAC;IAEO,UAAU,CAAC,IAAY,EAAE,SAAoB;QAMnD,iDAAiD;QACjD,MAAM,IAAI,GAAG,IAAI,CAAC,oBAAoB,CAAC,SAAS,CAAC,KAAK,CAAC
,CAAC;QAExD,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,QAAQ,EAAE,SAAS,CAAC,QAAQ;gBAC5B,MAAM,EAAE,2BAA2B;aACpC,CAAC;QACJ,CAAC;QAED,6DAA6D;QAC7D,KAAK,MAAM,YAAY,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YACvC,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,YAAY,CAAC,EAAE,CAAC;gBAC5C,OAAO;oBACL,OAAO,EAAE,IAAI;oBACb,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;oBAC5B,MAAM,EAAE,kCAAkC,YAAY,EAAE;iBACzD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,KAAK,MAAM,eAAe,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YAC7C,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;gBAC/C,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;oBAC5B,MAAM,EAAE,uBAAuB,eAAe,EAAE;iBACjD,CAAC;YACJ,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,OAAO;YACL,OAAO,EAAE,IAAI;YACb,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,QAAQ,EAAE,SAAS,CAAC,QAAQ;YAC5B,MAAM,EAAE,4BAA4B;SACrC,CAAC;IACJ,CAAC;IAEO,oBAAoB,CAAC,KAAmB;QAC9C,gEAAgE;QAChE,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,CAAC;QACvF,IAAI,UAAU;YAAE,OAAO,UAAU,CAAC;QAElC,MAAM,aAAa,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,KAAK,GAAG,CAAC,CAAC;QACjE,IAAI,aAAa;YAAE,OAAO,aAAa,CAAC;QAExC,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,OAAe;QAClD,IAAI,OAAO,KAAK,EAAE,EAAE,CAAC;YACnB,wCAAwC;YACxC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,OAAO,KAAK,GAAG,EAAE,CAAC;YACpB,0CAA0C;YAC1C,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC1B,+CAA+C;YAC/C,MAAM,YAAY,GAAG,OAAO;iBACzB,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,6BAA6B;iBACpE,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,kBAAkB;YAE7C,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,GAAG,GAAG,YAAY,CAAC,CAAC;YAC7C,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;QAED,wDAAwD;QACxD,OAAO,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;IAClC,CAAC;IAED,mCAAmC;IACnC,UAAU;QACR,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,kBAAkB;IAClB,aAAa;QACX,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK
,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC9D,GAAG;gBACH,SAAS,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;gBACjD,SAAS,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;gBACjD,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;gBAC7B,aAAa,EAAE,IAAI,CAAC,QAAQ,CAAC,MAAM;aACpC,CAAC,CAAC;SACJ,CAAC;IACJ,CAAC;CACF;AArUD,sCAqUC;AAED,0BAA0B;AACb,QAAA,mBAAmB,GAAG,IAAI,aAAa,EAAE,CAAC"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * One feed candidate, as returned by `RSSDiscovery.discoverFeeds`.
 */
export interface DiscoveredFeed {
    /** Address of the feed. */
    url: string;

    /** Title of the feed; may be absent. */
    title?: string;

    /** Syndication format of the feed. */
    type:
        | 'rss'
        | 'atom'
        | 'rdf';

    /**
     * How the feed was located.
     * NOTE(review): presumably maps to the link-tag extraction, common-path
     * probing and content-scan steps of RSSDiscovery — confirm against the
     * implementation.
     */
    source:
        | 'link-tag'
        | 'common-path'
        | 'content-scan';

    /** Confidence score for this candidate — TODO confirm the value range. */
    confidence: number;
}
|
|
8
|
+
/**
 * Type declaration for the feed-discovery helper.
 *
 * Only `discoverFeeds` is public; the remaining members are private
 * implementation details listed by name only.
 */
export declare class RSSDiscovery {
    private readonly userAgent;
    private readonly timeout;
    private readonly maxRedirects;

    /**
     * Finds the RSS feeds associated with a URL.
     *
     * @param url URL to inspect.
     * @returns A promise for the discovered feeds.
     */
    discoverFeeds(url: string): Promise<DiscoveredFeed[]>;

    /** Tests whether the URL is itself a feed. */
    private checkDirectFeed;

    /** Retrieves the HTML content of a page. */
    private fetchPage;

    /** Pulls feed URLs out of the HTML link tags. */
    private extractFeedsFromHTML;

    /** Probes common feed paths. */
    private checkCommonPaths;

    /** Looks through HTML content for feed-like patterns. */
    private scanForFeedContent;

    /** Verifies that a URL really is a feed. */
    private validateFeedUrl;

    /** Turns a relative URL into an absolute one. */
    private resolveUrl;

    /** Tells whether a content type denotes a feed. */
    private isFeedContentType;

    /** Derives the feed type from a content type. */
    private determineFeedType;

    /** Infers the feed type from a URL or text. */
    private guessFeedType;

    /** Tells whether a link looks like a possible feed. */
    private isFeedLikeLink;
}
|
|
61
|
+
/** Module-level `RSSDiscovery` instance exported alongside the class. */
export declare const globalRSSDiscovery: RSSDiscovery;
|
|
62
|
+
//# sourceMappingURL=rss-discovery.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rss-discovery.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/rss-discovery.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,cAAc;IAC7B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,KAAK,GAAG,MAAM,GAAG,KAAK,CAAC;IAC7B,MAAM,EAAE,UAAU,GAAG,aAAa,GAAG,cAAc,CAAC;IACpD,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiF;IAC3G,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAK;IAElC;;OAEG;IACG,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;IAuD3D;;OAEG;YACW,eAAe;IAuC7B;;OAEG;YACW,SAAS;IAmCvB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAsD5B;;OAEG;YACW,gBAAgB;IA+C9B;;OAEG;YACW,kBAAkB;IAqChC;;OAEG;YACW,eAAe;IAgC7B;;OAEG;IACH,OAAO,CAAC,UAAU;IAQlB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IASzB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAOzB;;OAEG;IACH,OAAO,CAAC,aAAa;IAOrB;;OAEG;IACH,OAAO,CAAC,cAAc;CAUvB;AAGD,eAAO,MAAM,kBAAkB,cAAqB,CAAC"}
|