@tyroneross/blog-scraper 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +254 -279
- package/dist/lib/circuit-breaker.d.ts +29 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +89 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/content-extractor.d.ts +13 -0
- package/dist/lib/content-extractor.d.ts.map +1 -0
- package/dist/lib/content-extractor.js +75 -0
- package/dist/lib/content-extractor.js.map +1 -0
- package/dist/lib/formatters/html-to-markdown.d.ts +21 -0
- package/dist/lib/formatters/html-to-markdown.d.ts.map +1 -0
- package/dist/lib/formatters/html-to-markdown.js +146 -0
- package/dist/lib/formatters/html-to-markdown.js.map +1 -0
- package/dist/lib/formatters/text-cleaner.d.ts +44 -0
- package/dist/lib/formatters/text-cleaner.d.ts.map +1 -0
- package/dist/lib/formatters/text-cleaner.js +143 -0
- package/dist/lib/formatters/text-cleaner.js.map +1 -0
- package/dist/lib/index.d.ts +96 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +184 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/quality-scorer.d.ts +83 -0
- package/dist/lib/quality-scorer.d.ts.map +1 -0
- package/dist/lib/quality-scorer.js +376 -0
- package/dist/lib/quality-scorer.js.map +1 -0
- package/dist/lib/rss-utils.d.ts +31 -0
- package/dist/lib/rss-utils.d.ts.map +1 -0
- package/dist/lib/rss-utils.js +175 -0
- package/dist/lib/rss-utils.js.map +1 -0
- package/dist/lib/scraping-rate-limiter.d.ts +52 -0
- package/dist/lib/scraping-rate-limiter.d.ts.map +1 -0
- package/dist/lib/scraping-rate-limiter.js +238 -0
- package/dist/lib/scraping-rate-limiter.js.map +1 -0
- package/dist/lib/source-orchestrator.d.ts +306 -0
- package/dist/lib/source-orchestrator.d.ts.map +1 -0
- package/dist/lib/source-orchestrator.js +840 -0
- package/dist/lib/source-orchestrator.js.map +1 -0
- package/dist/lib/types.d.ts +143 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +7 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts +62 -0
- package/dist/lib/web-scrapers/content-extractor.d.ts.map +1 -0
- package/dist/lib/web-scrapers/content-extractor.js +531 -0
- package/dist/lib/web-scrapers/content-extractor.js.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts +74 -0
- package/dist/lib/web-scrapers/html-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/html-scraper.js +598 -0
- package/dist/lib/web-scrapers/html-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts +57 -0
- package/dist/lib/web-scrapers/playwright-scraper.d.ts.map +1 -0
- package/dist/lib/web-scrapers/playwright-scraper.js +355 -0
- package/dist/lib/web-scrapers/playwright-scraper.js.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts +42 -0
- package/dist/lib/web-scrapers/robots-checker.d.ts.map +1 -0
- package/dist/lib/web-scrapers/robots-checker.js +285 -0
- package/dist/lib/web-scrapers/robots-checker.js.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts +62 -0
- package/dist/lib/web-scrapers/rss-discovery.d.ts.map +1 -0
- package/dist/lib/web-scrapers/rss-discovery.js +384 -0
- package/dist/lib/web-scrapers/rss-discovery.js.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts +65 -0
- package/dist/lib/web-scrapers/sitemap-parser.d.ts.map +1 -0
- package/dist/lib/web-scrapers/sitemap-parser.js +430 -0
- package/dist/lib/web-scrapers/sitemap-parser.js.map +1 -0
- package/package.json +54 -33
- package/dist/index.d.mts +0 -949
- package/dist/index.d.ts +0 -949
- package/dist/index.js +0 -3236
- package/dist/index.mjs +0 -3165
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
/**
 * Compiler-emitted helper backing `import * as ns from "..."` for CommonJS
 * modules.
 *
 * A module flagged `__esModule` is returned untouched. Any other value gets a
 * fresh namespace object: every own property except "default" is bound via
 * `__createBinding`, and the module itself becomes the namespace's `default`
 * via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use: native getOwnPropertyNames, or a for-in fallback.
    var listOwnKeys = null;
    function ownKeys(obj) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || function (o) {
                var names = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return listOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, keys[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.globalRSSDiscovery = exports.RSSDiscovery = void 0;
|
|
37
|
+
const cheerio = __importStar(require("cheerio"));
|
|
38
|
+
const scraping_rate_limiter_1 = require("../scraping-rate-limiter");
|
|
39
|
+
const robots_checker_1 = require("./robots-checker");
|
|
40
|
+
class RSSDiscovery {
|
|
41
|
+
constructor() {
|
|
42
|
+
this.userAgent = 'Mozilla/5.0 (compatible; AtomizeNews/1.0; +https://atomize-news.vercel.app)';
|
|
43
|
+
this.timeout = 10000; // 10 seconds
|
|
44
|
+
this.maxRedirects = 3;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Discover RSS feeds from a given URL
|
|
48
|
+
*/
|
|
49
|
+
async discoverFeeds(url) {
|
|
50
|
+
console.log(`🔍 [RSSDiscovery] Starting feed discovery for ${url}`);
|
|
51
|
+
const feeds = new Map();
|
|
52
|
+
try {
|
|
53
|
+
// Step 1: Check if the URL itself is a feed
|
|
54
|
+
const directFeed = await this.checkDirectFeed(url);
|
|
55
|
+
if (directFeed) {
|
|
56
|
+
feeds.set(directFeed.url, directFeed);
|
|
57
|
+
console.log(`✅ [RSSDiscovery] Direct feed found: ${directFeed.url}`);
|
|
58
|
+
return Array.from(feeds.values());
|
|
59
|
+
}
|
|
60
|
+
// Step 2: Check robots.txt compliance
|
|
61
|
+
const robotsCheck = await robots_checker_1.globalRobotsChecker.isAllowed(url);
|
|
62
|
+
if (!robotsCheck.allowed) {
|
|
63
|
+
console.warn(`🤖 [RSSDiscovery] URL blocked by robots.txt: ${url} - ${robotsCheck.reason}`);
|
|
64
|
+
return [];
|
|
65
|
+
}
|
|
66
|
+
// Step 3: Fetch and parse HTML page
|
|
67
|
+
const html = await this.fetchPage(url);
|
|
68
|
+
if (!html) {
|
|
69
|
+
return [];
|
|
70
|
+
}
|
|
71
|
+
// Step 4: Extract feeds from link tags in HTML
|
|
72
|
+
const linkFeeds = this.extractFeedsFromHTML(html, url);
|
|
73
|
+
linkFeeds.forEach(feed => feeds.set(feed.url, feed));
|
|
74
|
+
// Step 5: Try common feed paths if no feeds found in HTML
|
|
75
|
+
if (feeds.size === 0) {
|
|
76
|
+
const commonPathFeeds = await this.checkCommonPaths(url);
|
|
77
|
+
commonPathFeeds.forEach(feed => feeds.set(feed.url, feed));
|
|
78
|
+
}
|
|
79
|
+
// Step 6: Content-based feed discovery (look for feed-like content)
|
|
80
|
+
if (feeds.size === 0) {
|
|
81
|
+
const contentFeeds = await this.scanForFeedContent(html, url);
|
|
82
|
+
contentFeeds.forEach(feed => feeds.set(feed.url, feed));
|
|
83
|
+
}
|
|
84
|
+
const discoveredFeeds = Array.from(feeds.values());
|
|
85
|
+
discoveredFeeds.sort((a, b) => b.confidence - a.confidence); // Sort by confidence descending
|
|
86
|
+
console.log(`🔍 [RSSDiscovery] Discovered ${discoveredFeeds.length} feeds for ${url}`);
|
|
87
|
+
return discoveredFeeds;
|
|
88
|
+
}
|
|
89
|
+
catch (error) {
|
|
90
|
+
console.error(`❌ [RSSDiscovery] Error discovering feeds for ${url}:`, error);
|
|
91
|
+
return [];
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
/**
|
|
95
|
+
* Check if the URL itself is a direct feed
|
|
96
|
+
*/
|
|
97
|
+
async checkDirectFeed(url) {
|
|
98
|
+
try {
|
|
99
|
+
const response = await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
|
|
100
|
+
const controller = new AbortController();
|
|
101
|
+
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
102
|
+
try {
|
|
103
|
+
const res = await fetch(url, {
|
|
104
|
+
method: 'HEAD',
|
|
105
|
+
headers: { 'User-Agent': this.userAgent },
|
|
106
|
+
signal: controller.signal,
|
|
107
|
+
});
|
|
108
|
+
clearTimeout(timeoutId);
|
|
109
|
+
return res;
|
|
110
|
+
}
|
|
111
|
+
catch (error) {
|
|
112
|
+
clearTimeout(timeoutId);
|
|
113
|
+
throw error;
|
|
114
|
+
}
|
|
115
|
+
});
|
|
116
|
+
const contentType = response.headers.get('content-type') || '';
|
|
117
|
+
if (this.isFeedContentType(contentType)) {
|
|
118
|
+
const type = this.determineFeedType(contentType);
|
|
119
|
+
return {
|
|
120
|
+
url,
|
|
121
|
+
type,
|
|
122
|
+
source: 'link-tag',
|
|
123
|
+
confidence: 1.0
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
return null;
|
|
127
|
+
}
|
|
128
|
+
catch (error) {
|
|
129
|
+
// Not a direct feed, continue with other discovery methods
|
|
130
|
+
return null;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Fetch HTML page content
|
|
135
|
+
*/
|
|
136
|
+
async fetchPage(url) {
|
|
137
|
+
try {
|
|
138
|
+
return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
|
|
139
|
+
const controller = new AbortController();
|
|
140
|
+
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
141
|
+
try {
|
|
142
|
+
const response = await fetch(url, {
|
|
143
|
+
headers: { 'User-Agent': this.userAgent },
|
|
144
|
+
signal: controller.signal,
|
|
145
|
+
});
|
|
146
|
+
clearTimeout(timeoutId);
|
|
147
|
+
if (!response.ok) {
|
|
148
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
149
|
+
}
|
|
150
|
+
const contentType = response.headers.get('content-type') || '';
|
|
151
|
+
if (!contentType.includes('text/html')) {
|
|
152
|
+
throw new Error(`Not HTML content: ${contentType}`);
|
|
153
|
+
}
|
|
154
|
+
return await response.text();
|
|
155
|
+
}
|
|
156
|
+
catch (error) {
|
|
157
|
+
clearTimeout(timeoutId);
|
|
158
|
+
throw error;
|
|
159
|
+
}
|
|
160
|
+
});
|
|
161
|
+
}
|
|
162
|
+
catch (error) {
|
|
163
|
+
console.error(`❌ [RSSDiscovery] Error fetching page ${url}:`, error);
|
|
164
|
+
return null;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Extract feed URLs from HTML link tags
|
|
169
|
+
*/
|
|
170
|
+
extractFeedsFromHTML(html, baseUrl) {
|
|
171
|
+
const feeds = [];
|
|
172
|
+
try {
|
|
173
|
+
const $ = cheerio.load(html);
|
|
174
|
+
// Look for RSS/Atom feed links
|
|
175
|
+
$('link[rel="alternate"]').each((_, element) => {
|
|
176
|
+
const $link = $(element);
|
|
177
|
+
const type = $link.attr('type');
|
|
178
|
+
const href = $link.attr('href');
|
|
179
|
+
const title = $link.attr('title');
|
|
180
|
+
if (href && this.isFeedContentType(type || '')) {
|
|
181
|
+
const absoluteUrl = this.resolveUrl(href, baseUrl);
|
|
182
|
+
if (absoluteUrl) {
|
|
183
|
+
feeds.push({
|
|
184
|
+
url: absoluteUrl,
|
|
185
|
+
title: title || undefined,
|
|
186
|
+
type: this.determineFeedType(type || ''),
|
|
187
|
+
source: 'link-tag',
|
|
188
|
+
confidence: 0.9
|
|
189
|
+
});
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
});
|
|
193
|
+
// Look for other potential feed links
|
|
194
|
+
$('a[href]').each((_, element) => {
|
|
195
|
+
const $link = $(element);
|
|
196
|
+
const href = $link.attr('href');
|
|
197
|
+
const text = $link.text().toLowerCase().trim();
|
|
198
|
+
if (href && this.isFeedLikeLink(href, text)) {
|
|
199
|
+
const absoluteUrl = this.resolveUrl(href, baseUrl);
|
|
200
|
+
if (absoluteUrl && !feeds.some(f => f.url === absoluteUrl)) {
|
|
201
|
+
feeds.push({
|
|
202
|
+
url: absoluteUrl,
|
|
203
|
+
title: $link.text().trim() || undefined,
|
|
204
|
+
type: this.guessFeedType(href),
|
|
205
|
+
source: 'content-scan',
|
|
206
|
+
confidence: 0.6
|
|
207
|
+
});
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
});
|
|
211
|
+
}
|
|
212
|
+
catch (error) {
|
|
213
|
+
console.error(`❌ [RSSDiscovery] Error parsing HTML for feeds:`, error);
|
|
214
|
+
}
|
|
215
|
+
return feeds;
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Check common feed paths
|
|
219
|
+
*/
|
|
220
|
+
async checkCommonPaths(url) {
|
|
221
|
+
const baseUrl = new URL(url);
|
|
222
|
+
const commonPaths = [
|
|
223
|
+
'/feed/',
|
|
224
|
+
'/feed.xml',
|
|
225
|
+
'/rss/',
|
|
226
|
+
'/rss.xml',
|
|
227
|
+
'/feeds/',
|
|
228
|
+
'/feeds.xml',
|
|
229
|
+
'/atom.xml',
|
|
230
|
+
'/index.xml',
|
|
231
|
+
'/blog/feed/',
|
|
232
|
+
'/blog/rss.xml',
|
|
233
|
+
'/news/feed/',
|
|
234
|
+
'/news/rss.xml'
|
|
235
|
+
];
|
|
236
|
+
const feeds = [];
|
|
237
|
+
for (const path of commonPaths) {
|
|
238
|
+
try {
|
|
239
|
+
const testUrl = `${baseUrl.protocol}//${baseUrl.host}${path}`;
|
|
240
|
+
// Check robots.txt for this specific path
|
|
241
|
+
const robotsCheck = await robots_checker_1.globalRobotsChecker.isAllowed(testUrl);
|
|
242
|
+
if (!robotsCheck.allowed) {
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
const isValid = await this.validateFeedUrl(testUrl);
|
|
246
|
+
if (isValid) {
|
|
247
|
+
feeds.push({
|
|
248
|
+
url: testUrl,
|
|
249
|
+
type: this.guessFeedType(path),
|
|
250
|
+
source: 'common-path',
|
|
251
|
+
confidence: 0.7
|
|
252
|
+
});
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
catch (error) {
|
|
256
|
+
// Continue checking other paths
|
|
257
|
+
continue;
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
return feeds;
|
|
261
|
+
}
|
|
262
|
+
/**
|
|
263
|
+
* Scan HTML content for feed-like patterns
|
|
264
|
+
*/
|
|
265
|
+
async scanForFeedContent(html, baseUrl) {
|
|
266
|
+
const feeds = [];
|
|
267
|
+
try {
|
|
268
|
+
const $ = cheerio.load(html);
|
|
269
|
+
// Look for URLs in the content that might be feeds
|
|
270
|
+
const text = $.text();
|
|
271
|
+
const urlRegex = /https?:\/\/[^\s]+(?:feed|rss|atom)[^\s]*/gi;
|
|
272
|
+
const matches = text.match(urlRegex);
|
|
273
|
+
if (matches) {
|
|
274
|
+
for (const match of matches) {
|
|
275
|
+
const cleanUrl = match.replace(/[.,;:!?)]$/, ''); // Remove trailing punctuation
|
|
276
|
+
const absoluteUrl = this.resolveUrl(cleanUrl, baseUrl);
|
|
277
|
+
if (absoluteUrl && !feeds.some(f => f.url === absoluteUrl)) {
|
|
278
|
+
const isValid = await this.validateFeedUrl(absoluteUrl);
|
|
279
|
+
if (isValid) {
|
|
280
|
+
feeds.push({
|
|
281
|
+
url: absoluteUrl,
|
|
282
|
+
type: this.guessFeedType(absoluteUrl),
|
|
283
|
+
source: 'content-scan',
|
|
284
|
+
confidence: 0.5
|
|
285
|
+
});
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
catch (error) {
|
|
292
|
+
console.error(`❌ [RSSDiscovery] Error scanning content for feeds:`, error);
|
|
293
|
+
}
|
|
294
|
+
return feeds;
|
|
295
|
+
}
|
|
296
|
+
/**
|
|
297
|
+
* Validate if a URL is actually a feed
|
|
298
|
+
*/
|
|
299
|
+
async validateFeedUrl(url) {
|
|
300
|
+
try {
|
|
301
|
+
return await scraping_rate_limiter_1.globalRateLimiter.execute(url, async () => {
|
|
302
|
+
const controller = new AbortController();
|
|
303
|
+
const timeoutId = setTimeout(() => controller.abort(), 5000); // Shorter timeout for validation
|
|
304
|
+
try {
|
|
305
|
+
const response = await fetch(url, {
|
|
306
|
+
method: 'HEAD',
|
|
307
|
+
headers: { 'User-Agent': this.userAgent },
|
|
308
|
+
signal: controller.signal,
|
|
309
|
+
});
|
|
310
|
+
clearTimeout(timeoutId);
|
|
311
|
+
if (!response.ok) {
|
|
312
|
+
return false;
|
|
313
|
+
}
|
|
314
|
+
const contentType = response.headers.get('content-type') || '';
|
|
315
|
+
return this.isFeedContentType(contentType);
|
|
316
|
+
}
|
|
317
|
+
catch (error) {
|
|
318
|
+
clearTimeout(timeoutId);
|
|
319
|
+
return false;
|
|
320
|
+
}
|
|
321
|
+
});
|
|
322
|
+
}
|
|
323
|
+
catch (error) {
|
|
324
|
+
return false;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Resolve relative URLs to absolute URLs
|
|
329
|
+
*/
|
|
330
|
+
resolveUrl(url, baseUrl) {
|
|
331
|
+
try {
|
|
332
|
+
return new URL(url, baseUrl).toString();
|
|
333
|
+
}
|
|
334
|
+
catch {
|
|
335
|
+
return null;
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
/**
|
|
339
|
+
* Check if content type indicates a feed
|
|
340
|
+
*/
|
|
341
|
+
isFeedContentType(contentType) {
|
|
342
|
+
const lowerType = contentType.toLowerCase();
|
|
343
|
+
return lowerType.includes('application/rss+xml') ||
|
|
344
|
+
lowerType.includes('application/atom+xml') ||
|
|
345
|
+
lowerType.includes('application/rdf+xml') ||
|
|
346
|
+
lowerType.includes('text/xml') ||
|
|
347
|
+
lowerType.includes('application/xml');
|
|
348
|
+
}
|
|
349
|
+
/**
|
|
350
|
+
* Determine feed type from content type
|
|
351
|
+
*/
|
|
352
|
+
determineFeedType(contentType) {
|
|
353
|
+
const lowerType = contentType.toLowerCase();
|
|
354
|
+
if (lowerType.includes('atom'))
|
|
355
|
+
return 'atom';
|
|
356
|
+
if (lowerType.includes('rdf'))
|
|
357
|
+
return 'rdf';
|
|
358
|
+
return 'rss'; // Default to RSS
|
|
359
|
+
}
|
|
360
|
+
/**
|
|
361
|
+
* Guess feed type from URL or text
|
|
362
|
+
*/
|
|
363
|
+
guessFeedType(urlOrText) {
|
|
364
|
+
const lower = urlOrText.toLowerCase();
|
|
365
|
+
if (lower.includes('atom'))
|
|
366
|
+
return 'atom';
|
|
367
|
+
if (lower.includes('rdf'))
|
|
368
|
+
return 'rdf';
|
|
369
|
+
return 'rss'; // Default to RSS
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Check if a link looks like it could be a feed
|
|
373
|
+
*/
|
|
374
|
+
isFeedLikeLink(href, text) {
|
|
375
|
+
const lowerHref = href.toLowerCase();
|
|
376
|
+
const lowerText = text.toLowerCase();
|
|
377
|
+
const feedKeywords = ['rss', 'feed', 'atom', 'xml', 'syndication'];
|
|
378
|
+
return feedKeywords.some(keyword => lowerHref.includes(keyword) || lowerText.includes(keyword));
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
exports.RSSDiscovery = RSSDiscovery;
|
|
382
|
+
// Default global instance
|
|
383
|
+
exports.globalRSSDiscovery = new RSSDiscovery();
|
|
384
|
+
//# sourceMappingURL=rss-discovery.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rss-discovery.js","sourceRoot":"","sources":["../../../lib/web-scrapers/rss-discovery.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iDAAmC;AACnC,oEAA6D;AAC7D,qDAAuD;AAUvD,MAAa,YAAY;IAAzB;QACmB,cAAS,GAAG,6EAA6E,CAAC;QAC1F,YAAO,GAAG,KAAK,CAAC,CAAC,aAAa;QAC9B,iBAAY,GAAG,CAAC,CAAC;IA0XpC,CAAC;IAxXC;;OAEG;IACH,KAAK,CAAC,aAAa,CAAC,GAAW;QAC7B,OAAO,CAAC,GAAG,CAAC,iDAAiD,GAAG,EAAE,CAAC,CAAC;QAEpE,MAAM,KAAK,GAAG,IAAI,GAAG,EAA0B,CAAC;QAEhD,IAAI,CAAC;YACH,4CAA4C;YAC5C,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;YACnD,IAAI,UAAU,EAAE,CAAC;gBACf,KAAK,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;gBACtC,OAAO,CAAC,GAAG,CAAC,uCAAuC,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC;gBACrE,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;YACpC,CAAC;YAED,sCAAsC;YACtC,MAAM,WAAW,GAAG,MAAM,oCAAmB,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAC7D,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5F,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,oCAAoC;YACpC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YACvC,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,+CAA+C;YAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YACvD,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;YAErD,0DAA0D;YAC1D,IAAI,KAAK,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;gBACrB,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;gBACzD,eAAe,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;YAC7D,CAAC;YAED,oEAAoE;YACpE,IAAI,KAAK,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;gBACrB,MAAM,YAAY,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAC9D,YAAY,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAED,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;YACnD,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,g
CAAgC;YAE7F,OAAO,CAAC,GAAG,CAAC,gCAAgC,eAAe,CAAC,MAAM,cAAc,GAAG,EAAE,CAAC,CAAC;YACvF,OAAO,eAAe,CAAC;QAEzB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,gDAAgD,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YAC7E,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAC,GAAW;QACvC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBAC/D,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAErE,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAC3B,MAAM,EAAE,MAAM;wBACd,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;wBACzC,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBACH,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,OAAO,GAAG,CAAC;gBACb,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAE/D,IAAI,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,EAAE,CAAC;gBACxC,MAAM,IAAI,GAAG,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;gBACjD,OAAO;oBACL,GAAG;oBACH,IAAI;oBACJ,MAAM,EAAE,UAAU;oBAClB,UAAU,EAAE,GAAG;iBAChB,CAAC;YACJ,CAAC;YAED,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,2DAA2D;YAC3D,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,SAAS,CAAC,GAAW;QACjC,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAErE,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;wBACzC,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;oBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;wBACjB,MAAM,IAAI,KAAK,CAAC,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;oBACrE,CAAC;oBAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;oBAC/D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;wBACvC,MAAM,IAAI,K
AAK,CAAC,qBAAqB,WAAW,EAAE,CAAC,CAAC;oBACtD,CAAC;oBAED,OAAO,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBAC/B,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACrE,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,oBAAoB,CAAC,IAAY,EAAE,OAAe;QACxD,MAAM,KAAK,GAAqB,EAAE,CAAC;QAEnC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,+BAA+B;YAC/B,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC7C,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;gBACzB,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBAElC,IAAI,IAAI,IAAI,IAAI,CAAC,iBAAiB,CAAC,IAAI,IAAI,EAAE,CAAC,EAAE,CAAC;oBAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnD,IAAI,WAAW,EAAE,CAAC;wBAChB,KAAK,CAAC,IAAI,CAAC;4BACT,GAAG,EAAE,WAAW;4BAChB,KAAK,EAAE,KAAK,IAAI,SAAS;4BACzB,IAAI,EAAE,IAAI,CAAC,iBAAiB,CAAC,IAAI,IAAI,EAAE,CAAC;4BACxC,MAAM,EAAE,UAAU;4BAClB,UAAU,EAAE,GAAG;yBAChB,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,sCAAsC;YACtC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBAC/B,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;gBACzB,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;gBAE/C,IAAI,IAAI,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC;oBAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnD,IAAI,WAAW,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,WAAW,CAAC,EAAE,CAAC;wBAC3D,KAAK,CAAC,IAAI,CAAC;4BACT,GAAG,EAAE,WAAW;4BAChB,KAAK,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,SAAS;4BACvC,IAAI,EAAE,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC;4BAC9B,MAAM,EAAE,cAAc;4BACtB,UAAU,EAAE,GAAG;yBAChB,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;YACH,CAAC,CAAC,CAAC;QAEL,CAAC;QAAC,OAAO,K
AAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,gDAAgD,EAAE,KAAK,CAAC,CAAC;QACzE,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,gBAAgB,CAAC,GAAW;QACxC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC7B,MAAM,WAAW,GAAG;YAClB,QAAQ;YACR,WAAW;YACX,OAAO;YACP,UAAU;YACV,SAAS;YACT,YAAY;YACZ,WAAW;YACX,YAAY;YACZ,aAAa;YACb,eAAe;YACf,aAAa;YACb,eAAe;SAChB,CAAC;QAEF,MAAM,KAAK,GAAqB,EAAE,CAAC;QAEnC,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,GAAG,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,IAAI,GAAG,IAAI,EAAE,CAAC;gBAE9D,0CAA0C;gBAC1C,MAAM,WAAW,GAAG,MAAM,oCAAmB,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;gBACjE,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;oBACzB,SAAS;gBACX,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;gBACpD,IAAI,OAAO,EAAE,CAAC;oBACZ,KAAK,CAAC,IAAI,CAAC;wBACT,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC;wBAC9B,MAAM,EAAE,aAAa;wBACrB,UAAU,EAAE,GAAG;qBAChB,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,gCAAgC;gBAChC,SAAS;YACX,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,kBAAkB,CAAC,IAAY,EAAE,OAAe;QAC5D,MAAM,KAAK,GAAqB,EAAE,CAAC;QAEnC,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7B,mDAAmD;YACnD,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,4CAA4C,CAAC;YAC9D,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;YAErC,IAAI,OAAO,EAAE,CAAC;gBACZ,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;oBAC5B,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC,8BAA8B;oBAChF,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;oBAEvD,IAAI,WAAW,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,WAAW,CAAC,EAAE,CAAC;wBAC3D,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAAC;wBACxD,IAAI,OAAO,EAAE,CAAC;4BACZ,KAAK,CAAC,IAAI,CAAC;gCACT,GAAG,EAAE,WAAW;gCAChB,IAAI,EAAE,IAAI,CAAC,aAAa,CAAC,WAAW,CAAC;gCACrC,MAAM,EAAE,cAAc;gCACtB,UAAU,EAAE,GAAG;6BAChB,CAAC,CAAC;wBACL,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QAEH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,
oDAAoD,EAAE,KAAK,CAAC,CAAC;QAC7E,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAC,GAAW;QACvC,IAAI,CAAC;YACH,OAAO,MAAM,yCAAiB,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,IAAI,EAAE;gBACrD,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC,CAAC,iCAAiC;gBAE/F,IAAI,CAAC;oBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;wBAChC,MAAM,EAAE,MAAM;wBACd,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,SAAS,EAAE;wBACzC,MAAM,EAAE,UAAU,CAAC,MAAM;qBAC1B,CAAC,CAAC;oBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;oBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;wBACjB,OAAO,KAAK,CAAC;oBACf,CAAC;oBAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;oBAC/D,OAAO,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;gBAE7C,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,YAAY,CAAC,SAAS,CAAC,CAAC;oBACxB,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,GAAW,EAAE,OAAe;QAC7C,IAAI,CAAC;YACH,OAAO,IAAI,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC;QAC1C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,WAAmB;QAC3C,MAAM,SAAS,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC;QAC5C,OAAO,SAAS,CAAC,QAAQ,CAAC,qBAAqB,CAAC;YACzC,SAAS,CAAC,QAAQ,CAAC,sBAAsB,CAAC;YAC1C,SAAS,CAAC,QAAQ,CAAC,qBAAqB,CAAC;YACzC,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC;YAC9B,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,WAAmB;QAC3C,MAAM,SAAS,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC;QAC5C,IAAI,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,MAAM,CAAC;QAC9C,IAAI,SAAS,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;QAC5C,OAAO,KAAK,CAAC,CAAC,iBAAiB;IACjC,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,SAAiB;QACrC,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC;QACtC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,OAAO,MAAM,CAAC;QAC1C,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;QACxC,OAAO,KAAK,CAAC,CAAC,iBAAiB;IACjC,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,IAAY,EAAE,IAAY;QAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QACrC
,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;QAErC,MAAM,YAAY,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;QAEnE,OAAO,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CACjC,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,OAAO,CAAC,CAC3D,CAAC;IACJ,CAAC;CACF;AA7XD,oCA6XC;AAED,0BAA0B;AACb,QAAA,kBAAkB,GAAG,IAAI,YAAY,EAAE,CAAC"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
 * One URL record produced from a sitemap.
 *
 * NOTE(review): the field names match the `<url>` child elements of the
 * sitemaps.org protocol; the parser that fills them is not part of this
 * declaration file, so confirm the exact mapping there.
 */
export interface SitemapEntry {
    /** Address of the page. */
    url: string;
    /** Last-modification date, when the sitemap supplies one. */
    lastmod?: Date;
    /** Declared change frequency, limited to the listed literals. */
    changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
    /** Declared priority; presumably in the 0.0–1.0 range of the protocol — verify. */
    priority?: number;
    /** Image records attached to this URL, if any. */
    images?: SitemapImage[];
    /** News metadata attached to this URL, if any. */
    news?: SitemapNews;
}
|
|
9
|
+
/** Image record attached to a {@link SitemapEntry}. */
export interface SitemapImage {
    /** Location (URL) of the image. */
    loc: string;
    /** Optional caption text. */
    caption?: string;
    /** Optional image title. */
    title?: string;
}
|
|
14
|
+
/** News metadata attached to a {@link SitemapEntry}. */
export interface SitemapNews {
    /** Article title (the only required field). */
    title: string;
    /** Publication date, when available. */
    publishedDate?: Date;
    /** Keywords associated with the article, when available. */
    keywords?: string[];
}
|
|
19
|
+
/** A sitemap index: a list of references to other sitemaps. */
export interface SitemapIndex {
    /** Referenced sitemaps, each with its location and optional last-modified date. */
    sitemaps: Array<{
        loc: string;
        lastmod?: Date;
    }>;
}
|
|
25
|
+
/**
 * Public surface of the sitemap parser.
 *
 * Only the signatures are declared here; the behaviour notes below are
 * limited to what the names and types show. NOTE(review): confirm details
 * against the implementation in sitemap-parser.js.
 */
export declare class SitemapParser {
    // Private configuration; the values are set in the implementation.
    private readonly userAgent;
    private readonly timeout;
    private readonly maxSitemapSize;
    private readonly maxEntries;
    private readonly recentTimeframe;
    /**
     * Parse sitemap from URL and return entries.
     *
     * @param url Sitemap URL.
     * @param options Optional switches: `filterRecent`, `maxEntries`,
     *   `includeImages`, `includeNews`.
     * @returns The parsed entries.
     */
    parseSitemap(url: string, options?: {
        filterRecent?: boolean;
        maxEntries?: number;
        includeImages?: boolean;
        includeNews?: boolean;
    }): Promise<SitemapEntry[]>;
    /**
     * Discover sitemaps from domain.
     *
     * @param domain Domain to inspect.
     * @returns Sitemap URLs found for the domain.
     */
    discoverSitemaps(domain: string): Promise<string[]>;
    /**
     * Get recent entries from all sitemaps for a domain.
     *
     * @param domain Domain to inspect.
     * @param options Optional limits: `hoursBack` and `maxEntries`.
     * @returns The matching entries.
     */
    getRecentEntries(domain: string, options?: {
        hoursBack?: number;
        maxEntries?: number;
    }): Promise<SitemapEntry[]>;
    // Internal helpers; signatures are hidden from the declaration output.
    private fetchSitemap;
    private checkSitemapExists;
    private isSitemapIndex;
    private parseSitemapIndex;
    private parseRegularSitemap;
    /**
     * Validate sitemap format.
     *
     * @param xml Sitemap document text.
     * @returns `valid` plus the list of `errors` found.
     */
    validateSitemapFormat(xml: string): {
        valid: boolean;
        errors: string[];
    };
}
|
|
64
|
+
export declare const globalSitemapParser: SitemapParser;
|
|
65
|
+
//# sourceMappingURL=sitemap-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap-parser.d.ts","sourceRoot":"","sources":["../../../lib/web-scrapers/sitemap-parser.ts"],"names":[],"mappings":"AAKA,MAAM,WAAW,YAAY;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE,IAAI,CAAC;IACf,UAAU,CAAC,EAAE,QAAQ,GAAG,QAAQ,GAAG,OAAO,GAAG,QAAQ,GAAG,SAAS,GAAG,QAAQ,GAAG,OAAO,CAAC;IACvF,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,MAAM,CAAC,EAAE,YAAY,EAAE,CAAC;IACxB,IAAI,CAAC,EAAE,WAAW,CAAC;CACpB;AAED,MAAM,WAAW,YAAY;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,CAAC,EAAE,IAAI,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE;QACR,GAAG,EAAE,MAAM,CAAC;QACZ,OAAO,CAAC,EAAE,IAAI,CAAC;KAChB,EAAE,CAAC;CACL;AAED,qBAAa,aAAa;IACxB,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiF;IAC3G,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAoB;IACnD,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAuB;IAEvD;;OAEG;IACG,YAAY,CAChB,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;QACP,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,aAAa,CAAC,EAAE,OAAO,CAAC;QACxB,WAAW,CAAC,EAAE,OAAO,CAAC;KAClB,GACL,OAAO,CAAC,YAAY,EAAE,CAAC;IA6B1B;;OAEG;IACG,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAwCzD;;OAEG;IACG,gBAAgB,CACpB,MAAM,EAAE,MAAM,EACd,OAAO,GAAE;QAAE,SAAS,CAAC,EAAE,MAAM,CAAC;QAAC,UAAU,CAAC,EAAE,MAAM,CAAA;KAAO,GACxD,OAAO,CAAC,YAAY,EAAE,CAAC;YAmCZ,YAAY;YA6CZ,kBAAkB;IA0BhC,OAAO,CAAC,cAAc;YAIR,iBAAiB;IA6E/B,OAAO,CAAC,mBAAmB;IA6G3B;;OAEG;IACH,qBAAqB,CAAC,GAAG,EAAE,MAAM,GAAG;QAAE,KAAK,EAAE,OAAO,CAAC;QAAC,MAAM,EAAE,MAAM,EAAE,CAAA;KAAE;CAiEzE;AAGD,eAAO,MAAM,mBAAmB,eAAsB,CAAC"}
|