crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,678 @@
1
+ import { load } from 'cheerio';
2
+ import zlib from 'zlib';
3
+ import { promisify } from 'util';
4
+ import { CacheManager } from '../core/cache/CacheManager.js';
5
+ import { normalizeUrl } from './urlNormalizer.js';
6
+
7
+ const gunzip = promisify(zlib.gunzip);
8
+
9
+ export class SitemapParser {
10
+ constructor(options = {}) {
11
+ const {
12
+ userAgent = 'CrawlForge/1.0',
13
+ timeout = 10000,
14
+ maxRecursionDepth = 3,
15
+ maxUrlsPerSitemap = 50000,
16
+ enableCaching = true,
17
+ cacheTTL = 3600000, // 1 hour
18
+ validateUrls = true
19
+ } = options;
20
+
21
+ this.userAgent = userAgent;
22
+ this.timeout = timeout;
23
+ this.maxRecursionDepth = maxRecursionDepth;
24
+ this.maxUrlsPerSitemap = maxUrlsPerSitemap;
25
+ this.validateUrls = validateUrls;
26
+
27
+ // Initialize cache if enabled
28
+ this.cache = enableCaching ? new CacheManager({
29
+ maxSize: 500,
30
+ ttl: cacheTTL,
31
+ diskCacheDir: './cache/sitemaps',
32
+ enableDiskCache: true
33
+ }) : null;
34
+
35
+ // Track processed sitemaps to avoid infinite loops
36
+ this.processedSitemaps = new Set();
37
+
38
+ // Statistics
39
+ this.stats = {
40
+ sitemapsProcessed: 0,
41
+ urlsFound: 0,
42
+ errors: 0,
43
+ cacheHits: 0,
44
+ compressionSavings: 0
45
+ };
46
+ }
47
+
48
+ /**
49
+ * Parse a sitemap from a URL with full feature support
50
+ * @param {string} url - Sitemap URL
51
+ * @param {Object} options - Parsing options
52
+ * @returns {Promise<Object>} Parsed sitemap data
53
+ */
54
+ async parseSitemap(url, options = {}) {
55
+ const {
56
+ includeMetadata = true,
57
+ followIndexes = true,
58
+ maxDepth = this.maxRecursionDepth
59
+ } = options;
60
+
61
+ // Reset stats for new parsing session
62
+ this.stats = {
63
+ sitemapsProcessed: 0,
64
+ urlsFound: 0,
65
+ errors: 0,
66
+ cacheHits: 0,
67
+ compressionSavings: 0
68
+ };
69
+ this.processedSitemaps.clear();
70
+
71
+ try {
72
+ const result = await this._parseSitemapRecursive(url, 0, maxDepth, {
73
+ includeMetadata,
74
+ followIndexes
75
+ });
76
+
77
+ return {
78
+ success: true,
79
+ urls: result.urls,
80
+ metadata: includeMetadata ? result.metadata : {},
81
+ sitemaps: result.sitemaps,
82
+ statistics: this.getStatistics(),
83
+ lastUpdated: new Date().toISOString()
84
+ };
85
+ } catch (error) {
86
+ this.stats.errors++;
87
+ return {
88
+ success: false,
89
+ error: error.message,
90
+ urls: [],
91
+ metadata: {},
92
+ sitemaps: [],
93
+ statistics: this.getStatistics()
94
+ };
95
+ }
96
+ }
97
+
98
+ /**
99
+ * Parse sitemap index files and return all contained sitemaps
100
+ * @param {string} indexUrl - Sitemap index URL
101
+ * @returns {Promise<Array>} Array of sitemap URLs with metadata
102
+ */
103
+ async parseSitemapIndex(indexUrl) {
104
+ try {
105
+ const content = await this._fetchSitemapContent(indexUrl);
106
+ if (!content) return [];
107
+
108
+ const $ = load(content, { xmlMode: true });
109
+ const sitemaps = [];
110
+
111
+ // Parse sitemap index entries
112
+ $('sitemap').each((_, element) => {
113
+ const $sitemap = $(element);
114
+ const loc = $sitemap.find('loc').text().trim();
115
+
116
+ if (loc) {
117
+ const sitemap = {
118
+ url: normalizeUrl(loc),
119
+ lastmod: $sitemap.find('lastmod').text().trim() || null
120
+ };
121
+ sitemaps.push(sitemap);
122
+ }
123
+ });
124
+
125
+ return sitemaps;
126
+ } catch (error) {
127
+ console.warn(`Failed to parse sitemap index ${indexUrl}:`, error.message);
128
+ return [];
129
+ }
130
+ }
131
+
132
+ /**
133
+ * Extract all URLs from parsed sitemap data
134
+ * @param {Object} sitemapData - Parsed sitemap result
135
+ * @param {Object} filters - URL filtering options
136
+ * @returns {Array} Filtered URLs with metadata
137
+ */
138
+ extractUrls(sitemapData, filters = {}) {
139
+ const {
140
+ includeImages = false,
141
+ includeVideos = false,
142
+ includeNews = false,
143
+ domainFilter = null,
144
+ pathFilter = null,
145
+ changeFreqFilter = null,
146
+ priorityFilter = null,
147
+ lastModAfter = null,
148
+ lastModBefore = null
149
+ } = filters;
150
+
151
+ if (!sitemapData.success || !sitemapData.urls) {
152
+ return [];
153
+ }
154
+
155
+ let urls = [...sitemapData.urls];
156
+
157
+ // Apply domain filter
158
+ if (domainFilter) {
159
+ const domain = domainFilter.toLowerCase();
160
+ urls = urls.filter(url => {
161
+ try {
162
+ return new URL(url.loc).hostname.toLowerCase().includes(domain);
163
+ } catch {
164
+ return false;
165
+ }
166
+ });
167
+ }
168
+
169
+ // Apply path filter
170
+ if (pathFilter) {
171
+ const regex = new RegExp(pathFilter, 'i');
172
+ urls = urls.filter(url => {
173
+ try {
174
+ return regex.test(new URL(url.loc).pathname);
175
+ } catch {
176
+ return false;
177
+ }
178
+ });
179
+ }
180
+
181
+ // Apply change frequency filter
182
+ if (changeFreqFilter) {
183
+ urls = urls.filter(url => url.changefreq === changeFreqFilter);
184
+ }
185
+
186
+ // Apply priority filter
187
+ if (priorityFilter) {
188
+ const { min = 0, max = 1 } = priorityFilter;
189
+ urls = urls.filter(url => {
190
+ const priority = parseFloat(url.priority || 0.5);
191
+ return priority >= min && priority <= max;
192
+ });
193
+ }
194
+
195
+ // Apply date filters
196
+ if (lastModAfter || lastModBefore) {
197
+ urls = urls.filter(url => {
198
+ if (!url.lastmod) return true;
199
+
200
+ const urlDate = new Date(url.lastmod);
201
+ if (isNaN(urlDate.getTime())) return true;
202
+
203
+ if (lastModAfter && urlDate < new Date(lastModAfter)) return false;
204
+ if (lastModBefore && urlDate > new Date(lastModBefore)) return false;
205
+
206
+ return true;
207
+ });
208
+ }
209
+
210
+ // Include additional content types if requested
211
+ const result = urls.map(url => ({
212
+ url: url.loc,
213
+ lastmod: url.lastmod,
214
+ changefreq: url.changefreq,
215
+ priority: url.priority,
216
+ type: 'standard'
217
+ }));
218
+
219
+ // Add images, videos, news if requested and available
220
+ if (includeImages && sitemapData.metadata.images) {
221
+ sitemapData.metadata.images.forEach(img => {
222
+ result.push({
223
+ url: img.loc,
224
+ caption: img.caption,
225
+ title: img.title,
226
+ type: 'image'
227
+ });
228
+ });
229
+ }
230
+
231
+ if (includeVideos && sitemapData.metadata.videos) {
232
+ sitemapData.metadata.videos.forEach(vid => {
233
+ result.push({
234
+ url: vid.content_loc || vid.player_loc,
235
+ title: vid.title,
236
+ description: vid.description,
237
+ duration: vid.duration,
238
+ type: 'video'
239
+ });
240
+ });
241
+ }
242
+
243
+ if (includeNews && sitemapData.metadata.news) {
244
+ sitemapData.metadata.news.forEach(news => {
245
+ result.push({
246
+ url: news.loc,
247
+ title: news.title,
248
+ publication: news.publication,
249
+ publication_date: news.publication_date,
250
+ type: 'news'
251
+ });
252
+ });
253
+ }
254
+
255
+ return result;
256
+ }
257
+
258
+ /**
259
+ * Discover sitemap URLs from various sources
260
+ * @param {string} baseUrl - Base URL of the website
261
+ * @param {Object} sources - Sources to check
262
+ * @returns {Promise<Array>} Array of discovered sitemap URLs
263
+ */
264
+ async discoverSitemaps(baseUrl, sources = {}) {
265
+ const {
266
+ checkRobotsTxt = true,
267
+ checkCommonPaths = true,
268
+ checkSitemapIndex = true
269
+ } = sources;
270
+
271
+ const discovered = new Set();
272
+ const urlObj = new URL(baseUrl);
273
+ const baseOrigin = `${urlObj.protocol}//${urlObj.host}`;
274
+
275
+ // Check robots.txt for sitemap declarations
276
+ if (checkRobotsTxt) {
277
+ try {
278
+ const robotsUrl = `${baseOrigin}/robots.txt`;
279
+ const robotsContent = await this._fetchWithTimeout(robotsUrl);
280
+ if (robotsContent) {
281
+ const sitemapMatches = robotsContent.match(/^Sitemap:\s*(.+)$/gmi);
282
+ if (sitemapMatches) {
283
+ sitemapMatches.forEach(match => {
284
+ const url = match.replace(/^Sitemap:\s*/i, '').trim();
285
+ if (url) discovered.add(url);
286
+ });
287
+ }
288
+ }
289
+ } catch (error) {
290
+ console.warn('Failed to check robots.txt for sitemaps:', error.message);
291
+ }
292
+ }
293
+
294
+ // Check common sitemap paths
295
+ if (checkCommonPaths) {
296
+ const commonPaths = [
297
+ '/sitemap.xml',
298
+ '/sitemap_index.xml',
299
+ '/sitemap-index.xml',
300
+ '/sitemaps.xml',
301
+ '/sitemap1.xml',
302
+ '/feeds/all.xml',
303
+ '/rss.xml',
304
+ '/atom.xml'
305
+ ];
306
+
307
+ for (const path of commonPaths) {
308
+ const sitemapUrl = `${baseOrigin}${path}`;
309
+ try {
310
+ const response = await this._fetchWithTimeoutResponse(sitemapUrl);
311
+ if (response && response.ok) {
312
+ discovered.add(sitemapUrl);
313
+ }
314
+ } catch {
315
+ // Continue checking other paths
316
+ }
317
+ }
318
+ }
319
+
320
+ return Array.from(discovered);
321
+ }
322
+
323
+ /**
324
+ * Recursive sitemap parsing with depth control
325
+ * @private
326
+ */
327
+ async _parseSitemapRecursive(url, currentDepth, maxDepth, options) {
328
+ if (currentDepth >= maxDepth || this.processedSitemaps.has(url)) {
329
+ return { urls: [], metadata: {}, sitemaps: [] };
330
+ }
331
+
332
+ this.processedSitemaps.add(url);
333
+ this.stats.sitemapsProcessed++;
334
+
335
+ // Check cache first
336
+ const cacheKey = this.cache?.generateKey(url, { depth: currentDepth });
337
+ if (this.cache && cacheKey) {
338
+ const cached = await this.cache.get(cacheKey);
339
+ if (cached) {
340
+ this.stats.cacheHits++;
341
+ return cached;
342
+ }
343
+ }
344
+
345
+ try {
346
+ const content = await this._fetchSitemapContent(url);
347
+ if (!content) {
348
+ throw new Error(`Failed to fetch sitemap content from ${url}`);
349
+ }
350
+
351
+ const result = await this._parseSitemapContent(content, url, options);
352
+
353
+ // Handle sitemap indexes recursively
354
+ if (options.followIndexes && result.sitemaps && result.sitemaps.length > 0) {
355
+ for (const sitemapUrl of result.sitemaps.slice(0, 50)) { // Limit to prevent abuse
356
+ if (currentDepth < maxDepth - 1) {
357
+ const childResult = await this._parseSitemapRecursive(
358
+ sitemapUrl,
359
+ currentDepth + 1,
360
+ maxDepth,
361
+ options
362
+ );
363
+
364
+ result.urls.push(...childResult.urls);
365
+ Object.assign(result.metadata, childResult.metadata);
366
+ }
367
+ }
368
+ }
369
+
370
+ // Cache the result
371
+ if (this.cache && cacheKey) {
372
+ await this.cache.set(cacheKey, result);
373
+ }
374
+
375
+ return result;
376
+ } catch (error) {
377
+ this.stats.errors++;
378
+ console.warn(`Failed to parse sitemap ${url}:`, error.message);
379
+ return { urls: [], metadata: {}, sitemaps: [] };
380
+ }
381
+ }
382
+
383
+ /**
384
+ * Fetch and decompress sitemap content
385
+ * @private
386
+ */
387
+ async _fetchSitemapContent(url) {
388
+ try {
389
+ const response = await this._fetchWithTimeoutResponse(url);
390
+ if (!response || !response.ok) {
391
+ return null;
392
+ }
393
+
394
+ const contentType = response.headers.get('content-type') || '';
395
+ const contentEncoding = response.headers.get('content-encoding') || '';
396
+
397
+ let content;
398
+
399
+ // Handle compressed content
400
+ if (url.endsWith('.gz') || contentEncoding.includes('gzip')) {
401
+ const buffer = await response.arrayBuffer();
402
+ const decompressed = await gunzip(Buffer.from(buffer));
403
+ content = decompressed.toString('utf8');
404
+ this.stats.compressionSavings += buffer.byteLength - decompressed.length;
405
+ } else {
406
+ content = await response.text();
407
+ }
408
+
409
+ return content;
410
+ } catch (error) {
411
+ console.warn(`Failed to fetch sitemap content from ${url}:`, error.message);
412
+ return null;
413
+ }
414
+ }
415
+
416
+ /**
417
+ * Parse sitemap content with format detection
418
+ * @private
419
+ */
420
+ async _parseSitemapContent(content, url, options) {
421
+ const result = {
422
+ urls: [],
423
+ metadata: {},
424
+ sitemaps: []
425
+ };
426
+
427
+ try {
428
+ // Detect format and parse accordingly
429
+ if (content.includes('<sitemapindex')) {
430
+ return this._parseSitemapIndex(content, url);
431
+ } else if (content.includes('<urlset') || content.includes('<url>')) {
432
+ return this._parseStandardSitemap(content, url, options);
433
+ } else if (content.includes('<rss') || content.includes('<feed')) {
434
+ return this._parseRSSAtomFeed(content, url);
435
+ } else {
436
+ throw new Error(`Unrecognized sitemap format for ${url}`);
437
+ }
438
+ } catch (error) {
439
+ console.warn(`Failed to parse sitemap content:`, error.message);
440
+ return result;
441
+ }
442
+ }
443
+
444
+ /**
445
+ * Parse standard XML sitemap
446
+ * @private
447
+ */
448
+ _parseStandardSitemap(content, url, options) {
449
+ const $ = load(content, { xmlMode: true });
450
+ const result = {
451
+ urls: [],
452
+ metadata: {},
453
+ sitemaps: []
454
+ };
455
+
456
+ // Parse standard URLs
457
+ $('url').each((_, element) => {
458
+ const $url = $(element);
459
+ const loc = $url.find('loc').text().trim();
460
+
461
+ if (loc && result.urls.length < this.maxUrlsPerSitemap) {
462
+ const urlData = {
463
+ loc: normalizeUrl(loc),
464
+ lastmod: $url.find('lastmod').text().trim() || null,
465
+ changefreq: $url.find('changefreq').text().trim() || null,
466
+ priority: $url.find('priority').text().trim() || null
467
+ };
468
+
469
+ if (this.validateUrls) {
470
+ try {
471
+ new URL(urlData.loc);
472
+ result.urls.push(urlData);
473
+ this.stats.urlsFound++;
474
+ } catch {
475
+ // Skip invalid URLs
476
+ }
477
+ } else {
478
+ result.urls.push(urlData);
479
+ this.stats.urlsFound++;
480
+ }
481
+ }
482
+ });
483
+
484
+ // Parse additional metadata if requested
485
+ if (options.includeMetadata) {
486
+ // Parse image sitemaps
487
+ result.metadata.images = [];
488
+ $('image\\:image, image').each((_, element) => {
489
+ const $img = $(element);
490
+ const loc = $img.find('image\\:loc, loc').text().trim();
491
+ if (loc) {
492
+ result.metadata.images.push({
493
+ loc,
494
+ caption: $img.find('image\\:caption, caption').text().trim(),
495
+ title: $img.find('image\\:title, title').text().trim(),
496
+ geo_location: $img.find('image\\:geo_location').text().trim()
497
+ });
498
+ }
499
+ });
500
+
501
+ // Parse video sitemaps
502
+ result.metadata.videos = [];
503
+ $('video\\:video, video').each((_, element) => {
504
+ const $vid = $(element);
505
+ const contentLoc = $vid.find('video\\:content_loc, content_loc').text().trim();
506
+ const playerLoc = $vid.find('video\\:player_loc, player_loc').text().trim();
507
+
508
+ if (contentLoc || playerLoc) {
509
+ result.metadata.videos.push({
510
+ content_loc: contentLoc,
511
+ player_loc: playerLoc,
512
+ title: $vid.find('video\\:title, title').text().trim(),
513
+ description: $vid.find('video\\:description, description').text().trim(),
514
+ thumbnail_loc: $vid.find('video\\:thumbnail_loc, thumbnail_loc').text().trim(),
515
+ duration: $vid.find('video\\:duration, duration').text().trim()
516
+ });
517
+ }
518
+ });
519
+
520
+ // Parse news sitemaps
521
+ result.metadata.news = [];
522
+ $('news\\:news, news').each((_, element) => {
523
+ const $news = $(element);
524
+ const title = $news.find('news\\:title, title').text().trim();
525
+ const publication = $news.find('news\\:publication news\\:name, publication name').text().trim();
526
+
527
+ if (title) {
528
+ result.metadata.news.push({
529
+ title,
530
+ publication,
531
+ publication_date: $news.find('news\\:publication_date, publication_date').text().trim(),
532
+ keywords: $news.find('news\\:keywords, keywords').text().trim()
533
+ });
534
+ }
535
+ });
536
+ }
537
+
538
+ return result;
539
+ }
540
+
541
+ /**
542
+ * Parse sitemap index
543
+ * @private
544
+ */
545
+ _parseSitemapIndex(content, url) {
546
+ const $ = load(content, { xmlMode: true });
547
+ const result = {
548
+ urls: [],
549
+ metadata: {},
550
+ sitemaps: []
551
+ };
552
+
553
+ $('sitemap').each((_, element) => {
554
+ const $sitemap = $(element);
555
+ const loc = $sitemap.find('loc').text().trim();
556
+
557
+ if (loc) {
558
+ result.sitemaps.push(normalizeUrl(loc));
559
+ }
560
+ });
561
+
562
+ return result;
563
+ }
564
+
565
+ /**
566
+ * Parse RSS/Atom feeds as fallback
567
+ * @private
568
+ */
569
+ _parseRSSAtomFeed(content, url) {
570
+ const $ = load(content, { xmlMode: true });
571
+ const result = {
572
+ urls: [],
573
+ metadata: { feedType: null },
574
+ sitemaps: []
575
+ };
576
+
577
+ // Detect feed type
578
+ if (content.includes('<rss')) {
579
+ result.metadata.feedType = 'rss';
580
+ $('item').each((_, element) => {
581
+ const $item = $(element);
582
+ const link = $item.find('link').text().trim();
583
+ const pubDate = $item.find('pubDate').text().trim();
584
+
585
+ if (link && result.urls.length < this.maxUrlsPerSitemap) {
586
+ result.urls.push({
587
+ loc: normalizeUrl(link),
588
+ lastmod: pubDate ? new Date(pubDate).toISOString() : null,
589
+ changefreq: 'weekly',
590
+ priority: '0.5'
591
+ });
592
+ this.stats.urlsFound++;
593
+ }
594
+ });
595
+ } else if (content.includes('<feed')) {
596
+ result.metadata.feedType = 'atom';
597
+ $('entry').each((_, element) => {
598
+ const $entry = $(element);
599
+ const link = $entry.find('link').attr('href') || $entry.find('link').text().trim();
600
+ const updated = $entry.find('updated').text().trim();
601
+
602
+ if (link && result.urls.length < this.maxUrlsPerSitemap) {
603
+ result.urls.push({
604
+ loc: normalizeUrl(link),
605
+ lastmod: updated || null,
606
+ changefreq: 'weekly',
607
+ priority: '0.5'
608
+ });
609
+ this.stats.urlsFound++;
610
+ }
611
+ });
612
+ }
613
+
614
+ return result;
615
+ }
616
+
617
+ /**
618
+ * Fetch with timeout
619
+ * @private
620
+ */
621
+ async _fetchWithTimeout(url) {
622
+ const response = await this._fetchWithTimeoutResponse(url);
623
+ return response ? await response.text() : null;
624
+ }
625
+
626
+ /**
627
+ * Fetch with timeout returning response object
628
+ * @private
629
+ */
630
+ async _fetchWithTimeoutResponse(url) {
631
+ const controller = new AbortController();
632
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
633
+
634
+ try {
635
+ const response = await fetch(url, {
636
+ signal: controller.signal,
637
+ headers: {
638
+ 'User-Agent': this.userAgent,
639
+ 'Accept': 'application/xml,text/xml,text/plain,*/*',
640
+ 'Accept-Encoding': 'gzip, deflate'
641
+ }
642
+ });
643
+ clearTimeout(timeoutId);
644
+ return response;
645
+ } catch (error) {
646
+ clearTimeout(timeoutId);
647
+ throw error;
648
+ }
649
+ }
650
+
651
+ /**
652
+ * Get parsing statistics
653
+ */
654
+ getStatistics() {
655
+ return {
656
+ ...this.stats,
657
+ cacheHitRate: this.stats.sitemapsProcessed > 0
658
+ ? (this.stats.cacheHits / this.stats.sitemapsProcessed) * 100
659
+ : 0,
660
+ averageUrlsPerSitemap: this.stats.sitemapsProcessed > 0
661
+ ? this.stats.urlsFound / this.stats.sitemapsProcessed
662
+ : 0,
663
+ compressionSavingsKB: Math.round(this.stats.compressionSavings / 1024)
664
+ };
665
+ }
666
+
667
+ /**
668
+ * Clear all caches
669
+ */
670
+ async clearCache() {
671
+ this.processedSitemaps.clear();
672
+ if (this.cache) {
673
+ await this.cache.clear();
674
+ }
675
+ }
676
+ }
677
+
678
+ export default SitemapParser;