crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
import zlib from 'zlib';
|
|
3
|
+
import { promisify } from 'util';
|
|
4
|
+
import { CacheManager } from '../core/cache/CacheManager.js';
|
|
5
|
+
import { normalizeUrl } from './urlNormalizer.js';
|
|
6
|
+
|
|
7
|
+
// Promise-returning wrapper around the callback-style zlib.gunzip, so gzip
// payloads can be decompressed with `await`.
const gunzip = promisify(zlib.gunzip);
|
|
8
|
+
|
|
9
|
+
/**
 * Fetches and parses XML sitemaps, sitemap indexes and RSS/Atom feeds.
 *
 * A parser instance keeps per-session state (`stats`, `processedSitemaps`)
 * that `parseSitemap` resets on every call, so a single instance should not
 * run several `parseSitemap` calls concurrently.
 */
export class SitemapParser {
  /**
   * @param {Object} [options]
   * @param {string} [options.userAgent='CrawlForge/1.0'] - User-Agent header sent with every request
   * @param {number} [options.timeout=10000] - Milliseconds to wait for response headers
   * @param {number} [options.maxRecursionDepth=3] - Default nesting limit for sitemap indexes
   * @param {number} [options.maxUrlsPerSitemap=50000] - Maximum URL entries kept per document
   * @param {number} [options.maxSitemapsPerIndex=50] - Maximum child sitemaps followed per index
   * @param {boolean} [options.enableCaching=true] - Create a CacheManager for parsed results
   * @param {number} [options.cacheTTL=3600000] - Cache time-to-live in milliseconds
   * @param {boolean} [options.validateUrls=true] - Drop `<url>` entries that are not parseable URLs
   */
  constructor(options = {}) {
    const {
      userAgent = 'CrawlForge/1.0',
      timeout = 10000,
      maxRecursionDepth = 3,
      maxUrlsPerSitemap = 50000,
      maxSitemapsPerIndex = 50,
      enableCaching = true,
      cacheTTL = 3600000, // 1 hour
      validateUrls = true
    } = options;

    this.userAgent = userAgent;
    this.timeout = timeout;
    this.maxRecursionDepth = maxRecursionDepth;
    this.maxUrlsPerSitemap = maxUrlsPerSitemap;
    this.maxSitemapsPerIndex = maxSitemapsPerIndex;
    this.validateUrls = validateUrls;

    // Initialize cache if enabled
    this.cache = enableCaching ? new CacheManager({
      maxSize: 500,
      ttl: cacheTTL,
      diskCacheDir: './cache/sitemaps',
      enableDiskCache: true
    }) : null;

    // Track processed sitemaps to avoid infinite loops
    this.processedSitemaps = new Set();

    // Statistics
    this.stats = this._createEmptyStats();
  }

  /**
   * Parse a sitemap from a URL with full feature support
   * @param {string} url - Sitemap URL
   * @param {Object} [options] - Parsing options
   * @param {boolean} [options.includeMetadata=true] - Collect image/video/news metadata
   * @param {boolean} [options.followIndexes=true] - Recurse into sitemap indexes
   * @param {number} [options.maxDepth] - Nesting limit (defaults to `maxRecursionDepth`)
   * @returns {Promise<Object>} `{ success, urls, metadata, sitemaps, statistics, lastUpdated }`;
   *   on an unexpected failure `{ success: false, error, urls: [], metadata: {}, sitemaps: [], statistics }`
   */
  async parseSitemap(url, options = {}) {
    const {
      includeMetadata = true,
      followIndexes = true,
      maxDepth = this.maxRecursionDepth
    } = options;

    // Reset stats for new parsing session
    this.stats = this._createEmptyStats();
    this.processedSitemaps.clear();

    try {
      const result = await this._parseSitemapRecursive(url, 0, maxDepth, {
        includeMetadata,
        followIndexes
      });

      return {
        success: true,
        urls: result.urls,
        metadata: includeMetadata ? result.metadata : {},
        sitemaps: result.sitemaps,
        statistics: this.getStatistics(),
        lastUpdated: new Date().toISOString()
      };
    } catch (error) {
      this.stats.errors++;
      return {
        success: false,
        error: error.message,
        urls: [],
        metadata: {},
        sitemaps: [],
        statistics: this.getStatistics()
      };
    }
  }

  /**
   * Parse sitemap index files and return all contained sitemaps
   * @param {string} indexUrl - Sitemap index URL
   * @returns {Promise<Array>} Array of `{ url, lastmod }`; empty when the index
   *   cannot be fetched or parsed
   */
  async parseSitemapIndex(indexUrl) {
    try {
      const content = await this._fetchSitemapContent(indexUrl);
      if (!content) return [];

      const $ = load(content, { xmlMode: true });
      const sitemaps = [];

      // Parse sitemap index entries
      $('sitemap').each((_, element) => {
        const $sitemap = $(element);
        const loc = $sitemap.find('loc').text().trim();

        if (loc) {
          sitemaps.push({
            url: this._tryNormalize(loc),
            lastmod: $sitemap.find('lastmod').text().trim() || null
          });
        }
      });

      return sitemaps;
    } catch (error) {
      console.warn(`Failed to parse sitemap index ${indexUrl}:`, error.message);
      return [];
    }
  }

  /**
   * Extract all URLs from parsed sitemap data
   * @param {Object} sitemapData - Parsed sitemap result (as returned by `parseSitemap`)
   * @param {Object} [filters] - URL filtering options
   * @param {boolean} [filters.includeImages=false] - Append `metadata.images` entries
   * @param {boolean} [filters.includeVideos=false] - Append `metadata.videos` entries
   * @param {boolean} [filters.includeNews=false] - Append `metadata.news` entries
   * @param {string} [filters.domainFilter] - Case-insensitive substring the hostname must contain
   * @param {string|RegExp} [filters.pathFilter] - Pattern (applied case-insensitively) the pathname must match
   * @param {string} [filters.changeFreqFilter] - Exact `changefreq` value to keep
   * @param {{min?: number, max?: number}} [filters.priorityFilter] - Inclusive priority range
   * @param {string|Date} [filters.lastModAfter] - Drop entries modified before this date
   * @param {string|Date} [filters.lastModBefore] - Drop entries modified after this date
   * @returns {Array} Filtered URLs with metadata
   * @throws {SyntaxError} When `pathFilter` is not a valid regular expression
   */
  extractUrls(sitemapData, filters = {}) {
    const {
      includeImages = false,
      includeVideos = false,
      includeNews = false,
      domainFilter = null,
      pathFilter = null,
      changeFreqFilter = null,
      priorityFilter = null,
      lastModAfter = null,
      lastModBefore = null
    } = filters;

    if (!sitemapData?.success || !sitemapData.urls) {
      return [];
    }

    // Each active filter contributes one predicate; an entry is kept only
    // when every predicate accepts it.
    const predicates = [];

    if (domainFilter) {
      const needle = domainFilter.toLowerCase();
      predicates.push((entry) => {
        try {
          return new URL(entry.loc).hostname.toLowerCase().includes(needle);
        } catch {
          return false;
        }
      });
    }

    if (pathFilter) {
      const pattern = new RegExp(pathFilter, 'i');
      predicates.push((entry) => {
        try {
          return pattern.test(new URL(entry.loc).pathname);
        } catch {
          return false;
        }
      });
    }

    if (changeFreqFilter) {
      predicates.push((entry) => entry.changefreq === changeFreqFilter);
    }

    if (priorityFilter) {
      const { min = 0, max = 1 } = priorityFilter;
      predicates.push((entry) => {
        const raw = entry.priority;
        // Entries without a priority get 0.5; a numeric 0 must stay 0.
        const priority = raw === null || raw === undefined || raw === ''
          ? 0.5
          : Number.parseFloat(raw);
        return priority >= min && priority <= max;
      });
    }

    if (lastModAfter || lastModBefore) {
      const after = lastModAfter ? new Date(lastModAfter) : null;
      const before = lastModBefore ? new Date(lastModBefore) : null;
      predicates.push((entry) => {
        // Entries with a missing or unparsable lastmod are kept.
        if (!entry.lastmod) return true;
        const modified = new Date(entry.lastmod);
        if (Number.isNaN(modified.getTime())) return true;
        if (after && modified < after) return false;
        if (before && modified > before) return false;
        return true;
      });
    }

    const result = [...sitemapData.urls]
      .filter((entry) => predicates.every((accepts) => accepts(entry)))
      .map((entry) => ({
        url: entry.loc,
        lastmod: entry.lastmod,
        changefreq: entry.changefreq,
        priority: entry.priority,
        type: 'standard'
      }));

    // Add images, videos, news if requested and available. `metadata` may be
    // absent (for example on hand-built input), so default it.
    const metadata = sitemapData.metadata ?? {};

    if (includeImages && Array.isArray(metadata.images)) {
      for (const img of metadata.images) {
        result.push({
          url: img.loc,
          caption: img.caption,
          title: img.title,
          type: 'image'
        });
      }
    }

    if (includeVideos && Array.isArray(metadata.videos)) {
      for (const vid of metadata.videos) {
        result.push({
          url: vid.content_loc || vid.player_loc,
          title: vid.title,
          description: vid.description,
          duration: vid.duration,
          type: 'video'
        });
      }
    }

    if (includeNews && Array.isArray(metadata.news)) {
      for (const news of metadata.news) {
        result.push({
          url: news.loc,
          title: news.title,
          publication: news.publication,
          publication_date: news.publication_date,
          type: 'news'
        });
      }
    }

    return result;
  }

  /**
   * Discover sitemap URLs from various sources
   * @param {string} baseUrl - Base URL of the website
   * @param {Object} [sources] - Sources to check
   * @param {boolean} [sources.checkRobotsTxt=true] - Read `Sitemap:` lines from robots.txt
   * @param {boolean} [sources.checkCommonPaths=true] - Probe well-known sitemap/feed paths
   * @param {boolean} [sources.checkSitemapIndex] - Accepted for compatibility; not used
   * @returns {Promise<Array>} Array of discovered sitemap URLs
   * @throws {TypeError} When `baseUrl` is not a valid absolute URL
   */
  async discoverSitemaps(baseUrl, sources = {}) {
    const {
      checkRobotsTxt = true,
      checkCommonPaths = true
    } = sources;

    const discovered = new Set();
    const urlObj = new URL(baseUrl);
    const baseOrigin = `${urlObj.protocol}//${urlObj.host}`;

    // Check robots.txt for sitemap declarations
    if (checkRobotsTxt) {
      try {
        const response = await this._fetchWithTimeoutResponse(`${baseOrigin}/robots.txt`);
        if (response?.ok) {
          const robotsContent = await response.text();
          // [ \t]* (not \s*) keeps an empty "Sitemap:" line from swallowing
          // the directive on the following line.
          for (const match of robotsContent.matchAll(/^[ \t]*Sitemap:[ \t]*(\S.*)$/gim)) {
            try {
              // Resolve against the origin so relative declarations work.
              discovered.add(new URL(match[1].trim(), baseOrigin).href);
            } catch {
              // Ignore declarations that are not usable URLs
            }
          }
        } else {
          // An error page is not a robots.txt; release its body unread.
          response?.body?.cancel().catch(() => {});
        }
      } catch (error) {
        console.warn('Failed to check robots.txt for sitemaps:', error.message);
      }
    }

    // Check common sitemap paths
    if (checkCommonPaths) {
      const commonPaths = [
        '/sitemap.xml',
        '/sitemap_index.xml',
        '/sitemap-index.xml',
        '/sitemaps.xml',
        '/sitemap1.xml',
        '/feeds/all.xml',
        '/rss.xml',
        '/atom.xml'
      ];

      // The probes are independent, so run them together instead of paying
      // up to one full timeout per path in sequence.
      const probes = await Promise.all(commonPaths.map(async (path) => {
        const sitemapUrl = `${baseOrigin}${path}`;
        try {
          const response = await this._fetchWithTimeoutResponse(sitemapUrl);
          // Only the status matters here; release the body unread.
          response?.body?.cancel().catch(() => {});
          return response?.ok ? sitemapUrl : null;
        } catch {
          // Continue checking other paths
          return null;
        }
      }));

      for (const found of probes) {
        if (found) discovered.add(found);
      }
    }

    return Array.from(discovered);
  }

  /**
   * Recursive sitemap parsing with depth control
   *
   * Failures are logged and counted in `stats.errors`, and an empty result is
   * returned instead of throwing.
   * @private
   * @param {string} url - Sitemap or sitemap index URL
   * @param {number} currentDepth - Depth of `url` (0 for the root)
   * @param {number} maxDepth - Documents at this depth or deeper are not fetched
   * @param {Object} options - `{ includeMetadata, followIndexes }`
   * @returns {Promise<Object>} `{ urls, metadata, sitemaps }`
   */
  async _parseSitemapRecursive(url, currentDepth, maxDepth, options) {
    if (currentDepth >= maxDepth || this.processedSitemaps.has(url)) {
      return { urls: [], metadata: {}, sitemaps: [] };
    }

    this.processedSitemaps.add(url);
    this.stats.sitemapsProcessed++;

    // Check cache first
    const cacheKey = this.cache?.generateKey(url, { depth: currentDepth });
    if (this.cache && cacheKey) {
      const cached = await this.cache.get(cacheKey);
      if (cached) {
        this.stats.cacheHits++;
        return cached;
      }
    }

    try {
      const content = await this._fetchSitemapContent(url);
      if (!content) {
        throw new Error(`Failed to fetch sitemap content from ${url}`);
      }

      const result = await this._parseSitemapContent(content, url, options);

      // Handle sitemap indexes recursively
      const canDescend = currentDepth < maxDepth - 1;
      if (options.followIndexes && canDescend && result.sitemaps && result.sitemaps.length > 0) {
        // Limit to prevent abuse
        for (const sitemapUrl of result.sitemaps.slice(0, this.maxSitemapsPerIndex)) {
          const childResult = await this._parseSitemapRecursive(
            sitemapUrl,
            currentDepth + 1,
            maxDepth,
            options
          );

          // Append one by one: push(...bigArray) passes every element as a
          // call argument and can throw RangeError on large sitemaps.
          for (const entry of childResult.urls) {
            result.urls.push(entry);
          }
          this._mergeMetadata(result.metadata, childResult.metadata);
        }
      }

      // Cache the result
      if (this.cache && cacheKey) {
        await this.cache.set(cacheKey, result);
      }

      return result;
    } catch (error) {
      this.stats.errors++;
      console.warn(`Failed to parse sitemap ${url}:`, error.message);
      return { urls: [], metadata: {}, sitemaps: [] };
    }
  }

  /**
   * Fetch and decompress sitemap content
   *
   * Gzip is detected from the payload's magic bytes rather than from the URL
   * suffix or the Content-Encoding header: `fetch` already decodes bodies sent
   * with `Content-Encoding: gzip`, so trusting the header would gunzip plain
   * text, while `.gz` files may be addressed with a query string.
   * @private
   * @param {string} url - Sitemap URL
   * @returns {Promise<string|null>} Decoded text, or null on a non-OK response or any error
   */
  async _fetchSitemapContent(url) {
    try {
      const response = await this._fetchWithTimeoutResponse(url);
      if (!response || !response.ok) {
        response?.body?.cancel().catch(() => {});
        return null;
      }

      const payload = Buffer.from(await response.arrayBuffer());
      const isGzip = payload.length >= 2 && payload[0] === 0x1f && payload[1] === 0x8b;

      if (!isGzip) {
        return new TextDecoder('utf-8').decode(payload);
      }

      // Handle compressed content
      const decompressed = await gunzip(payload);
      // Savings = bytes that did not have to be transferred.
      this.stats.compressionSavings += Math.max(0, decompressed.length - payload.length);
      return new TextDecoder('utf-8').decode(decompressed);
    } catch (error) {
      console.warn(`Failed to fetch sitemap content from ${url}:`, error.message);
      return null;
    }
  }

  /**
   * Parse sitemap content with format detection
   *
   * Parser errors and unrecognized formats are logged and produce an empty result.
   * @private
   * @param {string} content - Raw document text
   * @param {string} url - URL the content came from (used in messages)
   * @param {Object} options - `{ includeMetadata }`
   * @returns {Promise<Object>} `{ urls, metadata, sitemaps }`
   */
  async _parseSitemapContent(content, url, options) {
    try {
      // Detect format and parse accordingly
      if (content.includes('<sitemapindex')) {
        return this._parseSitemapIndex(content, url);
      }
      if (content.includes('<urlset') || content.includes('<url>')) {
        return this._parseStandardSitemap(content, url, options);
      }
      if (content.includes('<rss') || content.includes('<feed')) {
        return this._parseRSSAtomFeed(content, url);
      }
      throw new Error(`Unrecognized sitemap format for ${url}`);
    } catch (error) {
      console.warn(`Failed to parse sitemap content:`, error.message);
      return { urls: [], metadata: {}, sitemaps: [] };
    }
  }

  /**
   * Parse standard XML sitemap
   * @private
   * @param {string} content - Raw `<urlset>` document
   * @param {string} url - URL the content came from (unused)
   * @param {Object} options - `{ includeMetadata }`
   * @returns {Object} `{ urls, metadata, sitemaps }`
   */
  _parseStandardSitemap(content, url, options) {
    const $ = load(content, { xmlMode: true });
    const result = {
      urls: [],
      metadata: {},
      sitemaps: []
    };

    // Parse standard URLs
    $('url').each((_, element) => {
      // Returning false stops the cheerio iteration once the cap is reached.
      if (result.urls.length >= this.maxUrlsPerSitemap) return false;

      const $url = $(element);
      const loc = $url.find('loc').text().trim();
      if (!loc) return undefined;

      const normalized = this._tryNormalize(loc);

      if (this.validateUrls) {
        try {
          new URL(normalized);
        } catch {
          // Skip invalid URLs
          return undefined;
        }
      }

      result.urls.push({
        loc: normalized,
        lastmod: $url.find('lastmod').text().trim() || null,
        changefreq: $url.find('changefreq').text().trim() || null,
        priority: $url.find('priority').text().trim() || null
      });
      this.stats.urlsFound++;
      return undefined;
    });

    // Parse additional metadata if requested
    if (options.includeMetadata) {
      // Parse image sitemaps
      result.metadata.images = [];
      $('image\\:image, image').each((_, element) => {
        const $img = $(element);
        const loc = $img.find('image\\:loc, loc').text().trim();
        if (loc) {
          result.metadata.images.push({
            loc,
            caption: $img.find('image\\:caption, caption').text().trim(),
            title: $img.find('image\\:title, title').text().trim(),
            geo_location: $img.find('image\\:geo_location').text().trim()
          });
        }
      });

      // Parse video sitemaps
      result.metadata.videos = [];
      $('video\\:video, video').each((_, element) => {
        const $vid = $(element);
        const contentLoc = $vid.find('video\\:content_loc, content_loc').text().trim();
        const playerLoc = $vid.find('video\\:player_loc, player_loc').text().trim();

        if (contentLoc || playerLoc) {
          result.metadata.videos.push({
            content_loc: contentLoc,
            player_loc: playerLoc,
            title: $vid.find('video\\:title, title').text().trim(),
            description: $vid.find('video\\:description, description').text().trim(),
            thumbnail_loc: $vid.find('video\\:thumbnail_loc, thumbnail_loc').text().trim(),
            duration: $vid.find('video\\:duration, duration').text().trim()
          });
        }
      });

      // Parse news sitemaps
      result.metadata.news = [];
      $('news\\:news, news').each((_, element) => {
        const $news = $(element);
        const title = $news.find('news\\:title, title').text().trim();
        const publication = $news.find('news\\:publication news\\:name, publication name').text().trim();

        if (title) {
          result.metadata.news.push({
            // A news element has no URL of its own; the article address is the
            // <loc> of the enclosing <url>. extractUrls reads it as `news.loc`.
            loc: $news.closest('url').find('loc').first().text().trim(),
            title,
            publication,
            publication_date: $news.find('news\\:publication_date, publication_date').text().trim(),
            keywords: $news.find('news\\:keywords, keywords').text().trim()
          });
        }
      });
    }

    return result;
  }

  /**
   * Parse sitemap index
   * @private
   * @param {string} content - Raw `<sitemapindex>` document
   * @param {string} url - URL the content came from (unused)
   * @returns {Object} `{ urls: [], metadata: {}, sitemaps }` with child sitemap URLs
   */
  _parseSitemapIndex(content, url) {
    const $ = load(content, { xmlMode: true });
    const result = {
      urls: [],
      metadata: {},
      sitemaps: []
    };

    $('sitemap').each((_, element) => {
      const loc = $(element).find('loc').text().trim();
      if (loc) {
        result.sitemaps.push(this._tryNormalize(loc));
      }
    });

    return result;
  }

  /**
   * Parse RSS/Atom feeds as fallback
   *
   * Feed entries carry no sitemap hints, so each one is given
   * `changefreq: 'weekly'` and `priority: '0.5'`.
   * @private
   * @param {string} content - Raw feed document
   * @param {string} url - URL the content came from (unused)
   * @returns {Object} `{ urls, metadata: { feedType }, sitemaps: [] }`
   */
  _parseRSSAtomFeed(content, url) {
    const $ = load(content, { xmlMode: true });
    const result = {
      urls: [],
      metadata: { feedType: null },
      sitemaps: []
    };

    const addEntry = (link, lastmod) => {
      result.urls.push({
        loc: this._tryNormalize(link),
        lastmod,
        changefreq: 'weekly',
        priority: '0.5'
      });
      this.stats.urlsFound++;
    };

    // Detect feed type
    if (content.includes('<rss')) {
      result.metadata.feedType = 'rss';
      $('item').each((_, element) => {
        const $item = $(element);
        const link = $item.find('link').text().trim();
        const pubDate = $item.find('pubDate').text().trim();

        if (link && result.urls.length < this.maxUrlsPerSitemap) {
          // An unparsable pubDate becomes null instead of aborting the feed.
          addEntry(link, this._toIsoDate(pubDate));
        }
      });
    } else if (content.includes('<feed')) {
      result.metadata.feedType = 'atom';
      $('entry').each((_, element) => {
        const $entry = $(element);
        const link = $entry.find('link').attr('href') || $entry.find('link').text().trim();
        const updated = $entry.find('updated').text().trim();

        if (link && result.urls.length < this.maxUrlsPerSitemap) {
          addEntry(link, updated || null);
        }
      });
    }

    return result;
  }

  /**
   * Fetch with timeout
   * @private
   * @param {string} url - URL to fetch
   * @returns {Promise<string|null>} Response body text (whatever the HTTP
   *   status), or null when no response object was returned
   */
  async _fetchWithTimeout(url) {
    const response = await this._fetchWithTimeoutResponse(url);
    return response ? await response.text() : null;
  }

  /**
   * Fetch with timeout returning response object
   *
   * The timer is cleared as soon as `fetch` settles, so the timeout bounds
   * the wait for response headers, not the time spent reading the body.
   * @private
   * @param {string} url - URL to fetch
   * @returns {Promise<Response>} The fetch response
   * @throws Rethrows whatever `fetch` rejects with, including the abort error on timeout
   */
  async _fetchWithTimeoutResponse(url) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      return await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'application/xml,text/xml,text/plain,*/*',
          'Accept-Encoding': 'gzip, deflate'
        }
      });
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Get parsing statistics
   * @returns {Object} Raw counters plus `cacheHitRate` (percent),
   *   `averageUrlsPerSitemap` and `compressionSavingsKB`
   */
  getStatistics() {
    const { sitemapsProcessed, cacheHits, urlsFound, compressionSavings } = this.stats;
    const hasProcessed = sitemapsProcessed > 0;

    return {
      ...this.stats,
      cacheHitRate: hasProcessed ? (cacheHits / sitemapsProcessed) * 100 : 0,
      averageUrlsPerSitemap: hasProcessed ? urlsFound / sitemapsProcessed : 0,
      compressionSavingsKB: Math.round(compressionSavings / 1024)
    };
  }

  /**
   * Clear all caches
   * @returns {Promise<void>}
   */
  async clearCache() {
    this.processedSitemaps.clear();
    if (this.cache) {
      await this.cache.clear();
    }
  }

  /**
   * Build a zeroed statistics record.
   * @private
   * @returns {Object} Fresh counters
   */
  _createEmptyStats() {
    return {
      sitemapsProcessed: 0,
      urlsFound: 0,
      errors: 0,
      cacheHits: 0,
      compressionSavings: 0
    };
  }

  /**
   * Merge a child sitemap's metadata into its parent's. Array values (image,
   * video and news lists) are concatenated so that every child contributes;
   * any other value overwrites the existing one.
   * @private
   * @param {Object} target - Metadata object to extend (mutated)
   * @param {Object} source - Metadata to merge in (left untouched)
   */
  _mergeMetadata(target, source) {
    for (const [key, value] of Object.entries(source ?? {})) {
      if (!Array.isArray(value)) {
        target[key] = value;
      } else if (Array.isArray(target[key])) {
        target[key] = target[key].concat(value);
      } else {
        // Copy so the parent never shares an array with the child result.
        target[key] = value.slice();
      }
    }
  }

  /**
   * Normalize a URL, falling back to the raw value if the normalizer throws.
   * `normalizeUrl` lives in another module and its failure behaviour is not
   * visible here; the guard ensures one malformed entry cannot abort the
   * parsing of a whole document.
   * @private
   * @param {string} rawUrl - URL text taken from the document
   * @returns {string} Normalized URL, or `rawUrl` unchanged
   */
  _tryNormalize(rawUrl) {
    try {
      return normalizeUrl(rawUrl);
    } catch {
      return rawUrl;
    }
  }

  /**
   * Convert a date string to ISO 8601.
   * @private
   * @param {string} value - Date text (for example an RSS `pubDate`)
   * @returns {string|null} ISO timestamp, or null when empty or unparsable
   */
  _toIsoDate(value) {
    if (!value) return null;
    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
  }
}
|
|
677
|
+
|
|
678
|
+
// Also expose the class as the module's default export, alongside the named
// `SitemapParser` export on the class declaration.
export default SitemapParser;
|