crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,1037 @@
1
+ /**
2
+ * SnapshotManager - Snapshot Storage and Management
3
+ * Handles snapshot storage with compression, delta storage for efficiency,
4
+ * retention policies, cleanup, and change history querying
5
+ */
6
+
7
+ import { promises as fs } from 'fs';
8
+ import path from 'path';
9
+ import { createHash } from 'crypto';
10
+ import { gzip, gunzip } from 'zlib';
11
+ import { promisify } from 'util';
12
+ import { z } from 'zod';
13
+ import { EventEmitter } from 'events';
14
+
15
// Promise-returning wrappers around zlib's callback-style gzip and gunzip.
const [gzipAsync, gunzipAsync] = [gzip, gunzip].map((zlibFn) => promisify(zlibFn));
17
+
18
// Zod schema for one snapshot record: identity, content, capture metadata,
// and optional compression / delta bookkeeping.
const SnapshotSchema = (() => {
  // Facts recorded about the content at capture time.
  const metadataShape = z.object({
    timestamp: z.number(),
    contentHash: z.string(),
    contentLength: z.number(),
    contentType: z.string().optional(),
    userAgent: z.string().optional(),
    statusCode: z.number().optional(),
    headers: z.record(z.string()).optional(),
    extractionOptions: z.object({}).optional()
  });

  // How the stored bytes were compressed, if at all.
  const compressionShape = z.object({
    enabled: z.boolean().default(true),
    algorithm: z.enum(['gzip', 'none']).default('gzip'),
    originalSize: z.number().optional(),
    compressedSize: z.number().optional(),
    compressionRatio: z.number().optional()
  });

  // Link to a base snapshot when the record is stored as a delta.
  const deltaShape = z.object({
    enabled: z.boolean().default(false),
    baseSnapshotId: z.string().optional(),
    deltaData: z.string().optional(),
    deltaSize: z.number().optional()
  });

  return z.object({
    id: z.string(),
    url: z.string().url(),
    content: z.string(),
    metadata: metadataShape,
    compression: compressionShape.optional(),
    delta: deltaShape.optional()
  });
})();
46
+
47
// Zod schema for the retention policy; every field has a default, so an
// empty object parses to the full default policy.
const RetentionPolicySchema = (() => {
  const KIB = 1024;
  const MIB = 1024 * KIB;
  const GIB = 1024 * MIB;
  const MINUTE_MS = 60000;
  const HOUR_MS = 3600000;
  const DAY_MS = 24 * HOUR_MS;

  return z.object({
    maxSnapshots: z.number().min(1).default(100),
    maxAge: z.number().min(HOUR_MS).default(30 * DAY_MS), // 30 days
    maxStorageSize: z.number().min(10 * MIB).default(GIB), // 1GB
    compressionThreshold: z.number().min(KIB).default(10 * KIB), // 10KB
    enableDeltaStorage: z.boolean().default(true),
    deltaThreshold: z.number().min(0).max(1).default(0.8), // Similarity threshold for delta storage
    autoCleanup: z.boolean().default(true),
    cleanupInterval: z.number().min(MINUTE_MS).default(DAY_MS) // 24 hours
  });
})();
57
+
58
// Zod schema for snapshot queries: URL / time-window selection, paging,
// sort order, and optional size / content-type filters.
const QuerySchema = (() => {
  const sortableFields = z.enum(['timestamp', 'size', 'similarity']);
  const sortDirections = z.enum(['asc', 'desc']);

  const filterShape = z.object({
    minSize: z.number().optional(),
    maxSize: z.number().optional(),
    contentType: z.string().optional(),
    hasChanges: z.boolean().optional()
  });

  return z.object({
    url: z.string().url().optional(),
    startTime: z.number().optional(),
    endTime: z.number().optional(),
    limit: z.number().min(1).max(1000).default(50),
    offset: z.number().min(0).default(0),
    includeDelta: z.boolean().default(true),
    includeContent: z.boolean().default(false),
    sortBy: sortableFields.default('timestamp'),
    sortOrder: sortDirections.default('desc'),
    filters: filterShape.optional()
  });
})();
75
+
76
+ export class SnapshotManager extends EventEmitter {
77
+ constructor(options = {}) {
78
+ super();
79
+
80
+ this.options = {
81
+ storageDir: options.storageDir || './snapshots',
82
+ metadataDir: options.metadataDir || './snapshots/metadata',
83
+ tempDir: options.tempDir || './snapshots/temp',
84
+ enableCompression: options.enableCompression !== false,
85
+ enableDeltaStorage: options.enableDeltaStorage !== false,
86
+ enableEncryption: options.enableEncryption || false,
87
+ encryptionKey: options.encryptionKey,
88
+ maxConcurrentOperations: options.maxConcurrentOperations || 10,
89
+ cacheEnabled: options.cacheEnabled !== false,
90
+ cacheSize: options.cacheSize || 100,
91
+ ...options
92
+ };
93
+
94
+ this.retentionPolicy = RetentionPolicySchema.parse(options.retentionPolicy || {});
95
+
96
+ // In-memory cache for frequently accessed snapshots
97
+ this.snapshotCache = new Map();
98
+ this.metadataCache = new Map();
99
+
100
+ // Storage statistics
101
+ this.stats = {
102
+ totalSnapshots: 0,
103
+ totalStorageSize: 0,
104
+ compressedSnapshots: 0,
105
+ deltaSnapshots: 0,
106
+ averageCompressionRatio: 0,
107
+ averageDeltaSize: 0,
108
+ cacheHits: 0,
109
+ cacheMisses: 0,
110
+ cleanupOperations: 0,
111
+ lastCleanup: null,
112
+ operationCounts: {
113
+ store: 0,
114
+ retrieve: 0,
115
+ delete: 0,
116
+ query: 0
117
+ }
118
+ };
119
+
120
+ // Active operations tracking
121
+ this.activeOperations = new Map();
122
+ this.operationQueue = [];
123
+
124
+ // Cleanup timer
125
+ this.cleanupTimer = null;
126
+
127
+ this.initialize();
128
+ }
129
+
130
+ async initialize() {
131
+ try {
132
+ // Create storage directories
133
+ await this.createDirectories();
134
+
135
+ // Load existing snapshot metadata
136
+ await this.loadMetadata();
137
+
138
+ // Start cleanup timer if enabled
139
+ if (this.retentionPolicy.autoCleanup) {
140
+ this.startCleanupTimer();
141
+ }
142
+
143
+ // Initialize cache
144
+ if (this.options.cacheEnabled) {
145
+ await this.initializeCache();
146
+ }
147
+
148
+ this.emit('initialized', {
149
+ totalSnapshots: this.stats.totalSnapshots,
150
+ storageSize: this.stats.totalStorageSize
151
+ });
152
+
153
+ } catch (error) {
154
+ this.emit('error', { operation: 'initialize', error: error.message });
155
+ throw error;
156
+ }
157
+ }
158
+
159
+ /**
160
+ * Store a new snapshot
161
+ * @param {string} url - URL of the snapshot
162
+ * @param {string} content - Content to store
163
+ * @param {Object} metadata - Additional metadata
164
+ * @param {Object} options - Storage options
165
+ * @returns {Object} - Stored snapshot information
166
+ */
167
+ async storeSnapshot(url, content, metadata = {}, options = {}) {
168
+ const operationId = this.generateOperationId();
169
+
170
+ try {
171
+ this.activeOperations.set(operationId, { type: 'store', url, startTime: Date.now() });
172
+
173
+ const snapshotId = this.generateSnapshotId(url, metadata.timestamp || Date.now());
174
+ const contentHash = this.hashContent(content);
175
+
176
+ // Check for similar existing snapshots for delta storage
177
+ let deltaInfo = null;
178
+ if (this.retentionPolicy.enableDeltaStorage) {
179
+ deltaInfo = await this.findSimilarSnapshot(url, contentHash, content);
180
+ }
181
+
182
+ // Prepare snapshot data
183
+ const snapshot = {
184
+ id: snapshotId,
185
+ url,
186
+ content,
187
+ metadata: {
188
+ timestamp: Date.now(),
189
+ contentHash,
190
+ contentLength: content.length,
191
+ contentType: metadata.contentType || 'text/html',
192
+ userAgent: metadata.userAgent,
193
+ statusCode: metadata.statusCode,
194
+ headers: metadata.headers,
195
+ extractionOptions: metadata.extractionOptions,
196
+ ...metadata
197
+ },
198
+ compression: {
199
+ enabled: false,
200
+ algorithm: 'none',
201
+ originalSize: content.length
202
+ },
203
+ delta: {
204
+ enabled: false
205
+ }
206
+ };
207
+
208
+ let finalContent = content;
209
+ let isCompressed = false;
210
+ let isDelta = false;
211
+
212
+ // Apply delta storage if similar snapshot found
213
+ if (deltaInfo && deltaInfo.similarity > this.retentionPolicy.deltaThreshold) {
214
+ const deltaData = this.createDelta(deltaInfo.content, content);
215
+ if (deltaData.length < content.length * 0.7) { // Only use delta if it's significantly smaller
216
+ finalContent = deltaData;
217
+ isDelta = true;
218
+
219
+ snapshot.delta = {
220
+ enabled: true,
221
+ baseSnapshotId: deltaInfo.snapshotId,
222
+ deltaData: deltaData,
223
+ deltaSize: deltaData.length
224
+ };
225
+
226
+ this.stats.deltaSnapshots++;
227
+ }
228
+ }
229
+
230
+ // Apply compression if enabled and above threshold
231
+ if (this.options.enableCompression &&
232
+ finalContent.length > this.retentionPolicy.compressionThreshold) {
233
+
234
+ const compressed = await gzipAsync(finalContent);
235
+ const compressionRatio = compressed.length / finalContent.length;
236
+
237
+ if (compressionRatio < 0.8) { // Only compress if it reduces size by at least 20%
238
+ finalContent = compressed;
239
+ isCompressed = true;
240
+
241
+ snapshot.compression = {
242
+ enabled: true,
243
+ algorithm: 'gzip',
244
+ originalSize: content.length,
245
+ compressedSize: compressed.length,
246
+ compressionRatio
247
+ };
248
+
249
+ this.stats.compressedSnapshots++;
250
+ this.updateCompressionStats(compressionRatio);
251
+ }
252
+ }
253
+
254
+ // Store snapshot to disk
255
+ const filePath = await this.writeSnapshotFile(snapshotId, finalContent);
256
+
257
+ // Store metadata
258
+ await this.storeMetadata(snapshotId, snapshot);
259
+
260
+ // Update cache
261
+ if (this.options.cacheEnabled) {
262
+ this.updateCache(snapshotId, snapshot);
263
+ }
264
+
265
+ // Update statistics
266
+ this.updateStorageStats(snapshot, isDelta);
267
+ this.stats.operationCounts.store++;
268
+
269
+ this.activeOperations.delete(operationId);
270
+
271
+ this.emit('snapshotStored', {
272
+ snapshotId,
273
+ url,
274
+ size: finalContent.length,
275
+ originalSize: content.length,
276
+ compressed: isCompressed,
277
+ delta: isDelta,
278
+ filePath
279
+ });
280
+
281
+ return {
282
+ snapshotId,
283
+ url,
284
+ timestamp: snapshot.metadata.timestamp,
285
+ contentHash,
286
+ size: finalContent.length,
287
+ originalSize: content.length,
288
+ compressed: isCompressed,
289
+ compressionRatio: snapshot.compression.compressionRatio,
290
+ delta: isDelta,
291
+ deltaSize: snapshot.delta.deltaSize
292
+ };
293
+
294
+ } catch (error) {
295
+ this.activeOperations.delete(operationId);
296
+ this.emit('error', { operation: 'storeSnapshot', url, error: error.message });
297
+ throw new Error(`Failed to store snapshot: ${error.message}`);
298
+ }
299
+ }
300
+
301
+ /**
302
+ * Retrieve a snapshot by ID
303
+ * @param {string} snapshotId - Snapshot ID
304
+ * @param {Object} options - Retrieval options
305
+ * @returns {Object} - Retrieved snapshot
306
+ */
307
+ async retrieveSnapshot(snapshotId, options = {}) {
308
+ const operationId = this.generateOperationId();
309
+
310
+ try {
311
+ this.activeOperations.set(operationId, {
312
+ type: 'retrieve',
313
+ snapshotId,
314
+ startTime: Date.now()
315
+ });
316
+
317
+ // Check cache first
318
+ if (this.options.cacheEnabled && this.snapshotCache.has(snapshotId)) {
319
+ this.stats.cacheHits++;
320
+ const cached = this.snapshotCache.get(snapshotId);
321
+ this.activeOperations.delete(operationId);
322
+ return cached;
323
+ }
324
+
325
+ this.stats.cacheMisses++;
326
+
327
+ // Load metadata
328
+ const metadata = await this.loadSnapshotMetadata(snapshotId);
329
+ if (!metadata) {
330
+ throw new Error(`Snapshot not found: ${snapshotId}`);
331
+ }
332
+
333
+ // Read snapshot file
334
+ let content = await this.readSnapshotFile(snapshotId);
335
+
336
+ // Decompress if needed
337
+ if (metadata.compression && metadata.compression.enabled) {
338
+ if (metadata.compression.algorithm === 'gzip') {
339
+ content = await gunzipAsync(content);
340
+ content = content.toString();
341
+ }
342
+ }
343
+
344
+ // Reconstruct from delta if needed
345
+ if (metadata.delta && metadata.delta.enabled) {
346
+ const baseSnapshot = await this.retrieveSnapshot(metadata.delta.baseSnapshotId, options);
347
+ content = this.applyDelta(baseSnapshot.content, content);
348
+ }
349
+
350
+ const snapshot = {
351
+ ...metadata,
352
+ content: options.includeContent !== false ? content : undefined,
353
+ retrievedAt: Date.now()
354
+ };
355
+
356
+ // Update cache
357
+ if (this.options.cacheEnabled) {
358
+ this.updateCache(snapshotId, snapshot);
359
+ }
360
+
361
+ this.stats.operationCounts.retrieve++;
362
+ this.activeOperations.delete(operationId);
363
+
364
+ this.emit('snapshotRetrieved', {
365
+ snapshotId,
366
+ url: metadata.url,
367
+ size: content.length,
368
+ fromCache: false
369
+ });
370
+
371
+ return snapshot;
372
+
373
+ } catch (error) {
374
+ this.activeOperations.delete(operationId);
375
+ this.emit('error', { operation: 'retrieveSnapshot', snapshotId, error: error.message });
376
+ throw new Error(`Failed to retrieve snapshot: ${error.message}`);
377
+ }
378
+ }
379
+
380
+ /**
381
+ * Query snapshots with filters
382
+ * @param {Object} query - Query parameters
383
+ * @returns {Array} - Matching snapshots
384
+ */
385
+ async querySnapshots(query = {}) {
386
+ try {
387
+ const validated = QuerySchema.parse(query);
388
+
389
+ // Load all metadata that matches URL filter
390
+ let snapshots = [];
391
+
392
+ for (const [snapshotId, metadata] of this.metadataCache) {
393
+ if (validated.url && metadata.url !== validated.url) continue;
394
+
395
+ if (validated.startTime && metadata.metadata.timestamp < validated.startTime) continue;
396
+ if (validated.endTime && metadata.metadata.timestamp > validated.endTime) continue;
397
+
398
+ if (validated.filters) {
399
+ const filters = validated.filters;
400
+
401
+ if (filters.minSize && metadata.metadata.contentLength < filters.minSize) continue;
402
+ if (filters.maxSize && metadata.metadata.contentLength > filters.maxSize) continue;
403
+ if (filters.contentType && metadata.metadata.contentType !== filters.contentType) continue;
404
+ }
405
+
406
+ snapshots.push({
407
+ ...metadata,
408
+ content: undefined // Don't include content by default
409
+ });
410
+ }
411
+
412
+ // Sort snapshots
413
+ snapshots.sort((a, b) => {
414
+ const aValue = this.getSortValue(a, validated.sortBy);
415
+ const bValue = this.getSortValue(b, validated.sortBy);
416
+
417
+ if (validated.sortOrder === 'desc') {
418
+ return bValue - aValue;
419
+ } else {
420
+ return aValue - bValue;
421
+ }
422
+ });
423
+
424
+ // Apply pagination
425
+ const start = validated.offset;
426
+ const end = start + validated.limit;
427
+ snapshots = snapshots.slice(start, end);
428
+
429
+ // Include content if requested
430
+ if (validated.includeContent) {
431
+ snapshots = await Promise.all(
432
+ snapshots.map(async (snapshot) => {
433
+ const fullSnapshot = await this.retrieveSnapshot(snapshot.id, { includeContent: true });
434
+ return fullSnapshot;
435
+ })
436
+ );
437
+ }
438
+
439
+ this.stats.operationCounts.query++;
440
+
441
+ this.emit('snapshotsQueried', {
442
+ query: validated,
443
+ resultCount: snapshots.length,
444
+ totalMatching: snapshots.length + validated.offset
445
+ });
446
+
447
+ return {
448
+ snapshots,
449
+ totalCount: snapshots.length,
450
+ query: validated,
451
+ executedAt: Date.now()
452
+ };
453
+
454
+ } catch (error) {
455
+ this.emit('error', { operation: 'querySnapshots', query, error: error.message });
456
+ throw new Error(`Failed to query snapshots: ${error.message}`);
457
+ }
458
+ }
459
+
460
+ /**
461
+ * Get change history for a URL
462
+ * @param {string} url - URL to get history for
463
+ * @param {Object} options - History options
464
+ * @returns {Array} - Change history
465
+ */
466
+ async getChangeHistory(url, options = {}) {
467
+ try {
468
+ const snapshots = await this.querySnapshots({
469
+ url,
470
+ limit: options.limit || 100,
471
+ sortBy: 'timestamp',
472
+ sortOrder: 'desc',
473
+ includeContent: false
474
+ });
475
+
476
+ const history = [];
477
+ const snapshotList = snapshots.snapshots;
478
+
479
+ for (let i = 0; i < snapshotList.length - 1; i++) {
480
+ const current = snapshotList[i];
481
+ const previous = snapshotList[i + 1];
482
+
483
+ // Calculate change metrics
484
+ const changeMetrics = await this.calculateChangeMetrics(previous, current);
485
+
486
+ history.push({
487
+ timestamp: current.metadata.timestamp,
488
+ snapshotId: current.id,
489
+ previousSnapshotId: previous.id,
490
+ changes: changeMetrics,
491
+ timeDelta: current.metadata.timestamp - previous.metadata.timestamp,
492
+ sizeDelta: current.metadata.contentLength - previous.metadata.contentLength
493
+ });
494
+ }
495
+
496
+ return {
497
+ url,
498
+ history,
499
+ totalSnapshots: snapshotList.length,
500
+ timespan: snapshotList.length > 0 ?
501
+ snapshotList[0].metadata.timestamp - snapshotList[snapshotList.length - 1].metadata.timestamp : 0
502
+ };
503
+
504
+ } catch (error) {
505
+ this.emit('error', { operation: 'getChangeHistory', url, error: error.message });
506
+ throw new Error(`Failed to get change history: ${error.message}`);
507
+ }
508
+ }
509
+
510
+ /**
511
+ * Delete snapshots
512
+ * @param {string|Array} snapshotIds - Snapshot ID(s) to delete
513
+ * @returns {Object} - Deletion results
514
+ */
515
+ async deleteSnapshots(snapshotIds) {
516
+ const ids = Array.isArray(snapshotIds) ? snapshotIds : [snapshotIds];
517
+ const results = {
518
+ deleted: [],
519
+ failed: [],
520
+ totalSize: 0
521
+ };
522
+
523
+ try {
524
+ for (const snapshotId of ids) {
525
+ try {
526
+ const metadata = await this.loadSnapshotMetadata(snapshotId);
527
+ if (!metadata) {
528
+ results.failed.push({ snapshotId, error: 'Snapshot not found' });
529
+ continue;
530
+ }
531
+
532
+ // Delete file
533
+ await this.deleteSnapshotFile(snapshotId);
534
+
535
+ // Delete metadata
536
+ await this.deleteSnapshotMetadata(snapshotId);
537
+
538
+ // Remove from cache
539
+ this.snapshotCache.delete(snapshotId);
540
+ this.metadataCache.delete(snapshotId);
541
+
542
+ // Update statistics
543
+ this.stats.totalSnapshots--;
544
+ this.stats.totalStorageSize -= metadata.metadata.contentLength;
545
+
546
+ results.deleted.push(snapshotId);
547
+ results.totalSize += metadata.metadata.contentLength;
548
+
549
+ this.emit('snapshotDeleted', { snapshotId, size: metadata.metadata.contentLength });
550
+
551
+ } catch (error) {
552
+ results.failed.push({ snapshotId, error: error.message });
553
+ }
554
+ }
555
+
556
+ this.stats.operationCounts.delete += results.deleted.length;
557
+
558
+ return results;
559
+
560
+ } catch (error) {
561
+ this.emit('error', { operation: 'deleteSnapshots', snapshotIds, error: error.message });
562
+ throw new Error(`Failed to delete snapshots: ${error.message}`);
563
+ }
564
+ }
565
+
566
+ /**
567
+ * Clean up old snapshots based on retention policy
568
+ * @returns {Object} - Cleanup results
569
+ */
570
+ async cleanupSnapshots() {
571
+ const startTime = Date.now();
572
+
573
+ try {
574
+ const cleanupResults = {
575
+ deletedCount: 0,
576
+ freedSpace: 0,
577
+ errors: []
578
+ };
579
+
580
+ const allSnapshots = Array.from(this.metadataCache.values());
581
+ const now = Date.now();
582
+
583
+ // Group snapshots by URL for intelligent cleanup
584
+ const snapshotsByUrl = new Map();
585
+ allSnapshots.forEach(snapshot => {
586
+ const url = snapshot.url;
587
+ if (!snapshotsByUrl.has(url)) {
588
+ snapshotsByUrl.set(url, []);
589
+ }
590
+ snapshotsByUrl.get(url).push(snapshot);
591
+ });
592
+
593
+ // Cleanup by retention policy rules
594
+ for (const [url, snapshots] of snapshotsByUrl) {
595
+ const sortedSnapshots = snapshots.sort((a, b) =>
596
+ b.metadata.timestamp - a.metadata.timestamp
597
+ );
598
+
599
+ const toDelete = [];
600
+
601
+ // Rule 1: Respect maximum snapshot limit per URL
602
+ if (sortedSnapshots.length > this.retentionPolicy.maxSnapshots) {
603
+ toDelete.push(...sortedSnapshots.slice(this.retentionPolicy.maxSnapshots));
604
+ }
605
+
606
+ // Rule 2: Delete snapshots older than maxAge
607
+ const ageThreshold = now - this.retentionPolicy.maxAge;
608
+ sortedSnapshots.forEach(snapshot => {
609
+ if (snapshot.metadata.timestamp < ageThreshold && !toDelete.includes(snapshot)) {
610
+ toDelete.push(snapshot);
611
+ }
612
+ });
613
+
614
+ // Delete marked snapshots
615
+ if (toDelete.length > 0) {
616
+ const deleteResult = await this.deleteSnapshots(toDelete.map(s => s.id));
617
+ cleanupResults.deletedCount += deleteResult.deleted.length;
618
+ cleanupResults.freedSpace += deleteResult.totalSize;
619
+ cleanupResults.errors.push(...deleteResult.failed);
620
+ }
621
+ }
622
+
623
+ // Rule 3: Check total storage size
624
+ if (this.stats.totalStorageSize > this.retentionPolicy.maxStorageSize) {
625
+ const excess = this.stats.totalStorageSize - this.retentionPolicy.maxStorageSize;
626
+ const additionalCleanup = await this.cleanupBySize(excess);
627
+
628
+ cleanupResults.deletedCount += additionalCleanup.deletedCount;
629
+ cleanupResults.freedSpace += additionalCleanup.freedSpace;
630
+ }
631
+
632
+ const cleanupTime = Date.now() - startTime;
633
+
634
+ this.stats.cleanupOperations++;
635
+ this.stats.lastCleanup = Date.now();
636
+
637
+ this.emit('cleanupCompleted', {
638
+ ...cleanupResults,
639
+ cleanupTime,
640
+ remainingSnapshots: this.stats.totalSnapshots,
641
+ remainingSize: this.stats.totalStorageSize
642
+ });
643
+
644
+ return {
645
+ ...cleanupResults,
646
+ cleanupTime
647
+ };
648
+
649
+ } catch (error) {
650
+ this.emit('error', { operation: 'cleanupSnapshots', error: error.message });
651
+ throw new Error(`Failed to cleanup snapshots: ${error.message}`);
652
+ }
653
+ }
654
+
655
+ // File system operations
656
+
657
+ async createDirectories() {
658
+ const dirs = [
659
+ this.options.storageDir,
660
+ this.options.metadataDir,
661
+ this.options.tempDir
662
+ ];
663
+
664
+ for (const dir of dirs) {
665
+ await fs.mkdir(dir, { recursive: true });
666
+ }
667
+ }
668
+
669
+ async writeSnapshotFile(snapshotId, content) {
670
+ const filePath = path.join(this.options.storageDir, `${snapshotId}.snap`);
671
+
672
+ if (Buffer.isBuffer(content)) {
673
+ await fs.writeFile(filePath, content);
674
+ } else {
675
+ await fs.writeFile(filePath, content, 'utf8');
676
+ }
677
+
678
+ return filePath;
679
+ }
680
+
681
+ async readSnapshotFile(snapshotId) {
682
+ const filePath = path.join(this.options.storageDir, `${snapshotId}.snap`);
683
+ return await fs.readFile(filePath);
684
+ }
685
+
686
+ async deleteSnapshotFile(snapshotId) {
687
+ const filePath = path.join(this.options.storageDir, `${snapshotId}.snap`);
688
+ await fs.unlink(filePath);
689
+ }
690
+
691
+ async storeMetadata(snapshotId, metadata) {
692
+ const filePath = path.join(this.options.metadataDir, `${snapshotId}.meta`);
693
+ await fs.writeFile(filePath, JSON.stringify(metadata, null, 2), 'utf8');
694
+
695
+ // Update in-memory cache
696
+ this.metadataCache.set(snapshotId, metadata);
697
+ }
698
+
699
+ async loadSnapshotMetadata(snapshotId) {
700
+ // Check cache first
701
+ if (this.metadataCache.has(snapshotId)) {
702
+ return this.metadataCache.get(snapshotId);
703
+ }
704
+
705
+ // Load from disk
706
+ try {
707
+ const filePath = path.join(this.options.metadataDir, `${snapshotId}.meta`);
708
+ const content = await fs.readFile(filePath, 'utf8');
709
+ const metadata = JSON.parse(content);
710
+
711
+ // Cache it
712
+ this.metadataCache.set(snapshotId, metadata);
713
+
714
+ return metadata;
715
+ } catch (error) {
716
+ return null;
717
+ }
718
+ }
719
+
720
+ async deleteSnapshotMetadata(snapshotId) {
721
+ const filePath = path.join(this.options.metadataDir, `${snapshotId}.meta`);
722
+ await fs.unlink(filePath);
723
+
724
+ this.metadataCache.delete(snapshotId);
725
+ }
726
+
727
+ async loadMetadata() {
728
+ try {
729
+ const metadataFiles = await fs.readdir(this.options.metadataDir);
730
+ let totalSize = 0;
731
+ let totalSnapshots = 0;
732
+
733
+ for (const file of metadataFiles) {
734
+ if (file.endsWith('.meta')) {
735
+ const snapshotId = file.replace('.meta', '');
736
+ const metadata = await this.loadSnapshotMetadata(snapshotId);
737
+
738
+ if (metadata) {
739
+ totalSnapshots++;
740
+ totalSize += metadata.metadata.contentLength || 0;
741
+ }
742
+ }
743
+ }
744
+
745
+ this.stats.totalSnapshots = totalSnapshots;
746
+ this.stats.totalStorageSize = totalSize;
747
+
748
+ } catch (error) {
749
+ // Directory doesn't exist yet, that's okay
750
+ this.stats.totalSnapshots = 0;
751
+ this.stats.totalStorageSize = 0;
752
+ }
753
+ }
754
+
755
+ // Utility methods
756
+
757
+ generateSnapshotId(url, timestamp) {
758
+ const hash = createHash('sha256');
759
+ hash.update(`${url}-${timestamp}-${Math.random()}`);
760
+ return hash.digest('hex').substring(0, 16);
761
+ }
762
+
763
+ generateOperationId() {
764
+ return `op-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
765
+ }
766
+
767
+ hashContent(content) {
768
+ const hash = createHash('sha256');
769
+ hash.update(content);
770
+ return hash.digest('hex');
771
+ }
772
+
773
+ async findSimilarSnapshot(url, contentHash, content) {
774
+ // Find recent snapshots for the same URL
775
+ const recentSnapshots = await this.querySnapshots({
776
+ url,
777
+ limit: 10,
778
+ sortBy: 'timestamp',
779
+ sortOrder: 'desc',
780
+ includeContent: false
781
+ });
782
+
783
+ for (const snapshot of recentSnapshots.snapshots) {
784
+ if (snapshot.metadata.contentHash === contentHash) {
785
+ // Exact match
786
+ return {
787
+ snapshotId: snapshot.id,
788
+ similarity: 1.0,
789
+ content: null
790
+ };
791
+ }
792
+
793
+ // Load content for similarity comparison
794
+ const fullSnapshot = await this.retrieveSnapshot(snapshot.id, { includeContent: true });
795
+ const similarity = this.calculateContentSimilarity(content, fullSnapshot.content);
796
+
797
+ if (similarity > this.retentionPolicy.deltaThreshold) {
798
+ return {
799
+ snapshotId: snapshot.id,
800
+ similarity,
801
+ content: fullSnapshot.content
802
+ };
803
+ }
804
+ }
805
+
806
+ return null;
807
+ }
808
+
809
+ calculateContentSimilarity(content1, content2) {
810
+ // Simple similarity calculation based on content length difference
811
+ // In production, you might want to use more sophisticated algorithms
812
+ const len1 = content1.length;
813
+ const len2 = content2.length;
814
+
815
+ if (len1 === 0 && len2 === 0) return 1.0;
816
+ if (len1 === 0 || len2 === 0) return 0.0;
817
+
818
+ const lengthSimilarity = 1 - Math.abs(len1 - len2) / Math.max(len1, len2);
819
+
820
+ // Additional similarity checks can be added here
821
+ // For example, using diff algorithms or content hashing
822
+
823
+ return lengthSimilarity;
824
+ }
825
+
826
+ createDelta(baseContent, currentContent) {
827
+ // Simple delta implementation - in production, consider using proper diff libraries
828
+ // This is a placeholder that would create a compressed diff
829
+ const deltaObject = {
830
+ type: 'diff',
831
+ base: baseContent.length,
832
+ current: currentContent.length,
833
+ // In a real implementation, you'd store the actual diff data
834
+ operations: [] // diff operations would go here
835
+ };
836
+
837
+ return JSON.stringify(deltaObject);
838
+ }
839
+
840
+ applyDelta(baseContent, deltaData) {
841
+ try {
842
+ const delta = JSON.parse(deltaData);
843
+
844
+ // In a real implementation, you'd apply the diff operations
845
+ // For now, return the base content as a fallback
846
+ return baseContent;
847
+ } catch (error) {
848
+ throw new Error(`Failed to apply delta: ${error.message}`);
849
+ }
850
+ }
851
+
852
+ async calculateChangeMetrics(previousSnapshot, currentSnapshot) {
853
+ // Calculate various change metrics between snapshots
854
+ const metrics = {
855
+ sizeDelta: currentSnapshot.metadata.contentLength - previousSnapshot.metadata.contentLength,
856
+ timeDelta: currentSnapshot.metadata.timestamp - previousSnapshot.metadata.timestamp,
857
+ hashChanged: currentSnapshot.metadata.contentHash !== previousSnapshot.metadata.contentHash,
858
+ contentTypeChanged: currentSnapshot.metadata.contentType !== previousSnapshot.metadata.contentType,
859
+ similarity: 0
860
+ };
861
+
862
+ // Calculate content similarity if different hashes
863
+ if (metrics.hashChanged) {
864
+ // This would require loading both snapshots' content
865
+ // For now, estimate based on size difference
866
+ metrics.similarity = 1 - Math.abs(metrics.sizeDelta) / Math.max(
867
+ currentSnapshot.metadata.contentLength,
868
+ previousSnapshot.metadata.contentLength
869
+ );
870
+ } else {
871
+ metrics.similarity = 1.0;
872
+ }
873
+
874
+ return metrics;
875
+ }
876
+
877
+ getSortValue(snapshot, sortBy) {
878
+ switch (sortBy) {
879
+ case 'timestamp':
880
+ return snapshot.metadata.timestamp;
881
+ case 'size':
882
+ return snapshot.metadata.contentLength;
883
+ case 'similarity':
884
+ return snapshot.similarity || 0;
885
+ default:
886
+ return snapshot.metadata.timestamp;
887
+ }
888
+ }
889
+
890
+ updateCache(snapshotId, snapshot) {
891
+ // Simple LRU-like cache management
892
+ if (this.snapshotCache.size >= this.options.cacheSize) {
893
+ const firstKey = this.snapshotCache.keys().next().value;
894
+ this.snapshotCache.delete(firstKey);
895
+ }
896
+
897
+ this.snapshotCache.set(snapshotId, snapshot);
898
+ }
899
+
900
+ async initializeCache() {
901
+ // Pre-load recent snapshots into cache
902
+ const recentSnapshots = await this.querySnapshots({
903
+ limit: Math.min(this.options.cacheSize, 50),
904
+ sortBy: 'timestamp',
905
+ sortOrder: 'desc',
906
+ includeContent: false
907
+ });
908
+
909
+ for (const snapshot of recentSnapshots.snapshots) {
910
+ this.metadataCache.set(snapshot.id, snapshot);
911
+ }
912
+ }
913
+
914
+ updateStorageStats(snapshot, isDelta) {
915
+ this.stats.totalSnapshots++;
916
+ this.stats.totalStorageSize += snapshot.metadata.contentLength;
917
+
918
+ if (isDelta) {
919
+ this.updateDeltaStats(snapshot.delta.deltaSize);
920
+ }
921
+ }
922
+
923
+ updateCompressionStats(ratio) {
924
+ const currentAvg = this.stats.averageCompressionRatio;
925
+ const count = this.stats.compressedSnapshots;
926
+
927
+ this.stats.averageCompressionRatio =
928
+ (currentAvg * (count - 1) + ratio) / count;
929
+ }
930
+
931
+ updateDeltaStats(deltaSize) {
932
+ const currentAvg = this.stats.averageDeltaSize;
933
+ const count = this.stats.deltaSnapshots;
934
+
935
+ this.stats.averageDeltaSize =
936
+ (currentAvg * (count - 1) + deltaSize) / count;
937
+ }
938
+
939
+ async cleanupBySize(targetReduction) {
940
+ // Clean up oldest snapshots to free up space
941
+ const allSnapshots = Array.from(this.metadataCache.values());
942
+ const sorted = allSnapshots.sort((a, b) =>
943
+ a.metadata.timestamp - b.metadata.timestamp
944
+ );
945
+
946
+ let freedSpace = 0;
947
+ const toDelete = [];
948
+
949
+ for (const snapshot of sorted) {
950
+ if (freedSpace >= targetReduction) break;
951
+
952
+ toDelete.push(snapshot.id);
953
+ freedSpace += snapshot.metadata.contentLength;
954
+ }
955
+
956
+ const deleteResult = await this.deleteSnapshots(toDelete);
957
+
958
+ return {
959
+ deletedCount: deleteResult.deleted.length,
960
+ freedSpace: deleteResult.totalSize
961
+ };
962
+ }
963
+
964
+ startCleanupTimer() {
965
+ if (this.cleanupTimer) {
966
+ clearInterval(this.cleanupTimer);
967
+ }
968
+
969
+ this.cleanupTimer = setInterval(async () => {
970
+ try {
971
+ await this.cleanupSnapshots();
972
+ } catch (error) {
973
+ this.emit('error', { operation: 'scheduledCleanup', error: error.message });
974
+ }
975
+ }, this.retentionPolicy.cleanupInterval);
976
+ }
977
+
978
+ stopCleanupTimer() {
979
+ if (this.cleanupTimer) {
980
+ clearInterval(this.cleanupTimer);
981
+ this.cleanupTimer = null;
982
+ }
983
+ }
984
+
985
+ // Public API methods
986
+
987
+ getStats() {
988
+ return {
989
+ ...this.stats,
990
+ cacheSize: this.snapshotCache.size,
991
+ metadataCacheSize: this.metadataCache.size,
992
+ activeOperations: this.activeOperations.size,
993
+ averageSnapshotSize: this.stats.totalSnapshots > 0 ?
994
+ this.stats.totalStorageSize / this.stats.totalSnapshots : 0,
995
+ storageEfficiency: {
996
+ compressionRatio: this.stats.averageCompressionRatio,
997
+ deltaRatio: this.stats.averageDeltaSize,
998
+ compressedPercentage: this.stats.totalSnapshots > 0 ?
999
+ (this.stats.compressedSnapshots / this.stats.totalSnapshots) * 100 : 0,
1000
+ deltaPercentage: this.stats.totalSnapshots > 0 ?
1001
+ (this.stats.deltaSnapshots / this.stats.totalSnapshots) * 100 : 0
1002
+ }
1003
+ };
1004
+ }
1005
+
1006
+ getRetentionPolicy() {
1007
+ return { ...this.retentionPolicy };
1008
+ }
1009
+
1010
+ updateRetentionPolicy(newPolicy) {
1011
+ this.retentionPolicy = RetentionPolicySchema.parse({
1012
+ ...this.retentionPolicy,
1013
+ ...newPolicy
1014
+ });
1015
+
1016
+ this.emit('retentionPolicyUpdated', this.retentionPolicy);
1017
+ }
1018
+
1019
+ async shutdown() {
1020
+ this.stopCleanupTimer();
1021
+
1022
+ // Wait for active operations to complete
1023
+ const maxWaitTime = 30000; // 30 seconds
1024
+ const startTime = Date.now();
1025
+
1026
+ while (this.activeOperations.size > 0 && (Date.now() - startTime) < maxWaitTime) {
1027
+ await new Promise(resolve => setTimeout(resolve, 100));
1028
+ }
1029
+
1030
+ this.emit('shutdown', {
1031
+ pendingOperations: this.activeOperations.size,
1032
+ shutdownTime: Date.now() - startTime
1033
+ });
1034
+ }
1035
+ }
1036
+
1037
+ export default SnapshotManager;