crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,173 @@
1
+ /**
2
+ * Memory Monitoring and Leak Detection Utility
3
+ * Tracks memory usage patterns and detects potential leaks
4
+ */
5
+
6
/**
 * Periodically samples process memory usage and flags sustained heap growth
 * that may indicate a leak.
 *
 * Samples are kept in a bounded in-memory window (`options.maxSamples`).
 */
export class MemoryMonitor {
  /**
   * @param {Object} [options]
   * @param {number} [options.sampleInterval=30000] - Milliseconds between samples.
   * @param {number} [options.maxSamples=100] - Maximum number of samples retained.
   * @param {number} [options.leakThreshold=52428800] - Heap growth (bytes) across the
   *   recent window that triggers a leak warning.
   * @param {boolean} [options.enableLogging=true] - Whether `log()` writes to stderr.
   * @param {Function|null} [options.alertCallback=null] - Called with an alert object
   *   when a potential leak is detected.
   */
  constructor(options = {}) {
    // Treat null/undefined as "not provided" so an explicit `undefined`
    // (e.g. forwarded from another config object) cannot erase a default.
    const withDefault = (value, fallback) => (value == null ? fallback : value);

    this.options = {
      ...options,
      sampleInterval: withDefault(options.sampleInterval, 30000), // 30 seconds
      maxSamples: withDefault(options.maxSamples, 100),
      leakThreshold: withDefault(options.leakThreshold, 50 * 1024 * 1024), // 50MB
      enableLogging: withDefault(options.enableLogging, true),
      alertCallback: withDefault(options.alertCallback, null)
    };

    this.samples = [];
    this.isMonitoring = false;
    this.intervalId = null;
    this.leakWarnings = 0;
  }

  /**
   * Convert a byte count to megabytes rounded to two decimals.
   * @param {number} bytes
   * @returns {number}
   */
  static toMB(bytes) {
    return Math.round(bytes / 1024 / 1024 * 100) / 100;
  }

  /**
   * Start periodic sampling. Does nothing if monitoring is already active.
   * One sample is taken immediately.
   */
  start() {
    if (this.isMonitoring) {
      return;
    }

    this.isMonitoring = true;
    this.log('Starting memory monitoring...');

    this.intervalId = setInterval(() => {
      this.takeSample();
      this.analyzeMemoryTrend();
    }, this.options.sampleInterval);

    // A diagnostics timer must not be the only thing keeping the process alive.
    if (this.intervalId && typeof this.intervalId.unref === 'function') {
      this.intervalId.unref();
    }

    // Take initial sample
    this.takeSample();
  }

  /**
   * Stop periodic sampling. Does nothing if monitoring is not active.
   * Collected samples are kept.
   */
  stop() {
    if (!this.isMonitoring) {
      return;
    }

    this.isMonitoring = false;

    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }

    this.log('Memory monitoring stopped');
  }

  /**
   * Record one sample from `process.memoryUsage()` and append it to the window,
   * dropping the oldest sample once `maxSamples` is exceeded.
   * @returns {Object} The recorded sample.
   */
  takeSample() {
    const usage = process.memoryUsage();

    const sample = {
      timestamp: Date.now(),
      heapUsed: usage.heapUsed,
      heapTotal: usage.heapTotal,
      external: usage.external,
      rss: usage.rss,
      heapUsedMB: MemoryMonitor.toMB(usage.heapUsed),
      heapTotalMB: MemoryMonitor.toMB(usage.heapTotal)
    };

    this.samples.push(sample);

    // Keep only the last N samples
    if (this.samples.length > this.options.maxSamples) {
      this.samples.shift();
    }

    return sample;
  }

  /**
   * Summarize the retained samples.
   * @returns {Object|null} Current, peak and average heap usage (in MB) plus
   *   monitoring counters, or `null` when no sample has been taken yet.
   */
  getStats() {
    if (this.samples.length === 0) {
      return null;
    }

    const latest = this.samples[this.samples.length - 1];

    // Fold instead of Math.max(...array) so a large maxSamples cannot exceed
    // the engine's argument-count limit.
    let peak = -Infinity;
    let total = 0;
    for (const sample of this.samples) {
      if (sample.heapUsed > peak) {
        peak = sample.heapUsed;
      }
      total += sample.heapUsed;
    }
    const average = total / this.samples.length;

    return {
      current: {
        heapUsedMB: latest.heapUsedMB,
        heapTotalMB: latest.heapTotalMB,
        timestamp: latest.timestamp
      },
      peak: {
        heapUsedMB: MemoryMonitor.toMB(peak)
      },
      average: {
        heapUsedMB: MemoryMonitor.toMB(average)
      },
      samples: this.samples.length,
      leakWarnings: this.leakWarnings,
      isMonitoring: this.isMonitoring
    };
  }

  /**
   * Inspect the last (up to) 10 samples and raise a leak warning when heap usage
   * rose in at least 80% of the sample-to-sample intervals AND the net growth over
   * that window exceeds `options.leakThreshold`.
   */
  analyzeMemoryTrend() {
    if (this.samples.length < 5) {
      return; // Need at least 5 samples for trend analysis
    }

    const recentSamples = this.samples.slice(-10); // Last 10 samples
    // N samples only yield N - 1 comparisons; the ratio must be taken against
    // that number, otherwise the threshold is unreachable for small windows.
    const intervals = recentSamples.length - 1;
    let increasingCount = 0;

    for (let i = 1; i < recentSamples.length; i++) {
      if (recentSamples[i].heapUsed > recentSamples[i - 1].heapUsed) {
        increasingCount++;
      }
    }

    if (increasingCount < intervals * 0.8) {
      return;
    }

    const memoryGrowth = recentSamples[recentSamples.length - 1].heapUsed - recentSamples[0].heapUsed;
    if (memoryGrowth <= this.options.leakThreshold) {
      return;
    }

    this.leakWarnings++;
    const growthMB = MemoryMonitor.toMB(memoryGrowth);

    this.log(`Warning: Potential memory leak detected. Memory grew by ${growthMB}MB over ${recentSamples.length} samples`);

    this.dispatchAlert({
      type: 'memory_leak',
      growthMB,
      samples: recentSamples.length,
      warnings: this.leakWarnings
    });
  }

  /**
   * Invoke the configured alert callback, if any, without letting its failure
   * escape. This runs inside the sampling timer, where an uncaught exception or
   * unhandled rejection would otherwise surface at process level.
   * @param {Object} alert - Alert payload passed to the callback.
   */
  dispatchAlert(alert) {
    const callback = this.options.alertCallback;
    if (typeof callback !== 'function') {
      return;
    }

    const report = (error) => {
      const reason = error && error.message ? error.message : String(error);
      // Reported even when trace logging is off so the failure is never silent.
      console.error(`[MemoryMonitor] Alert callback failed: ${reason}`);
    };

    try {
      const outcome = callback.call(this.options, alert);
      if (outcome && typeof outcome.then === 'function') {
        outcome.then(undefined, report);
      }
    } catch (error) {
      report(error);
    }
  }

  /**
   * Write a message to stderr when logging is enabled.
   * @param {string} message
   */
  log(message) {
    if (this.options.enableLogging) {
      console.error(`[MemoryMonitor] ${message}`);
    }
  }
}
167
+
168
// Shared, process-wide monitor instance.
// Logging is switched on only when NODE_ENV is exactly 'development'.
export const memoryMonitor = new MemoryMonitor({
  leakThreshold: 100 * 1024 * 1024, // 100MB of growth before warning
  sampleInterval: 30000, // sample every 30 seconds
  enableLogging: process.env.NODE_ENV === 'development'
});
@@ -0,0 +1,386 @@
1
+ /**
2
+ * RetryManager - Comprehensive retry management with multiple strategies
3
+ * Handles exponential backoff, circuit breaking integration, and retry policies
4
+ */
5
+
6
/**
 * RetryManager - runs async operations under a configurable retry policy.
 *
 * Supports linear, exponential, fibonacci and fixed backoff, optional full
 * jitter, a delay cap, configurable retryable error codes / HTTP statuses,
 * and basic statistics.
 */
export class RetryManager {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxRetries=3] - Retries allowed after the first attempt.
   * @param {number} [options.baseDelay=1000] - Base delay in milliseconds.
   * @param {number} [options.maxDelay=30000] - Upper bound applied to every delay.
   * @param {string} [options.strategy='exponential'] - 'linear' | 'exponential' |
   *   'fibonacci' | 'fixed'. Unknown names fall back to exponential.
   * @param {boolean} [options.jitter=true] - Randomize each delay in [0, delay).
   * @param {string[]} [options.retryableErrors] - `error.code` values that are retryable.
   * @param {number[]} [options.retryableStatusCodes] - `error.response.status` values
   *   that are retryable.
   * @param {Function|null} [options.onRetry] - Called as (error, attempt, delay, context)
   *   before each wait.
   * @param {Function|null} [options.onFailure] - Called as (error, attempt, context)
   *   when the operation finally fails.
   */
  constructor(options = {}) {
    const {
      maxRetries = 3,
      baseDelay = 1000,
      maxDelay = 30000,
      strategy = 'exponential',
      jitter = true,
      retryableErrors = ['ECONNRESET', 'ENOTFOUND', 'ECONNREFUSED', 'ETIMEDOUT'],
      retryableStatusCodes = [408, 429, 500, 502, 503, 504],
      onRetry = null,
      onFailure = null
    } = options;

    this.maxRetries = maxRetries;
    this.baseDelay = baseDelay;
    this.maxDelay = maxDelay;
    this.strategy = strategy;
    this.jitter = jitter;
    this.retryableErrors = new Set(retryableErrors);
    this.retryableStatusCodes = new Set(retryableStatusCodes);
    this.onRetry = onRetry;
    this.onFailure = onFailure;

    // Strategy implementations
    this.strategies = {
      linear: this.linearBackoff.bind(this),
      exponential: this.exponentialBackoff.bind(this),
      fibonacci: this.fibonacciBackoff.bind(this),
      fixed: this.fixedBackoff.bind(this)
    };

    // Statistics tracking
    this.resetStats();
  }

  /**
   * Execute an operation with retry logic.
   *
   * The operation is always attempted at least once, even when `maxRetries`
   * is zero, negative or not a number. Errors thrown by `onRetry` / `onFailure`
   * propagate to the caller.
   *
   * @param {Function} operation - Async function to execute
   * @param {Object} context - Context information for logging/callbacks
   * @returns {Promise} Result of successful operation
   * @throws The original error when it is not retryable.
   * @throws {RetryExhaustedError} When a retryable error persists after all retries.
   */
  async execute(operation, context = {}) {
    // Normalize so a negative/NaN/fractional value can neither skip the
    // operation entirely nor bypass the exhaustion branch below.
    const maxRetries = this.maxRetries > 0 ? Math.floor(this.maxRetries) : 0;
    let totalDelay = 0;

    for (let attempt = 0; ; attempt++) {
      this.stats.totalAttempts++;

      try {
        const result = await operation();

        this.stats.successfulOperations++;
        if (attempt > 0) {
          this.stats.successfulRetries++;
          this.updateAverageDelay(totalDelay);
        }

        return result;
      } catch (error) {
        const retryable = this.isRetryableError(error);
        const exhausted = attempt >= maxRetries;

        if (!retryable || exhausted) {
          this.stats.failedOperations++;
          if (this.onFailure) {
            await this.onFailure(error, attempt, context);
          }
          if (!retryable) {
            // Non-retryable errors surface unchanged.
            throw error;
          }
          throw new RetryExhaustedError(
            `Operation failed after ${maxRetries} retries: ${error.message}`,
            error,
            attempt
          );
        }

        // Calculate delay for next attempt
        const delay = this.calculateDelay(attempt);
        totalDelay += delay;

        this.stats.totalRetries++;

        // Call retry callback if provided
        if (this.onRetry) {
          await this.onRetry(error, attempt, delay, context);
        }

        // Wait before retrying
        await this.delay(delay);
      }
    }
  }

  /**
   * Try each operation in order (each with full retry logic) and return the
   * result of the first one that succeeds.
   * @param {Array<Function>} operations - Array of async functions
   * @param {Object} context - Context information
   * @returns {Promise} Result of first successful operation
   * @throws {Error} When `operations` is empty or every operation fails; in the
   *   latter case the last failure is attached as `cause`.
   */
  async executeAny(operations, context = {}) {
    if (!operations || operations.length === 0) {
      throw new Error('executeAny requires at least one operation');
    }

    let lastError;

    for (const operation of operations) {
      try {
        return await this.execute(operation, context);
      } catch (error) {
        lastError = error;
      }
    }

    // Thrown values are not guaranteed to be Error instances.
    const reason = lastError != null && lastError.message !== undefined
      ? lastError.message
      : String(lastError);

    throw new Error(
      `All ${operations.length} operations failed. Last error: ${reason}`,
      { cause: lastError }
    );
  }

  /**
   * Execute operation with circuit breaker integration: every attempt is
   * routed through `circuitBreaker.execute(domain, operation)`.
   * @param {Function} operation - Async function to execute
   * @param {CircuitBreaker} circuitBreaker - Circuit breaker instance
   * @param {string} domain - Domain for circuit breaker
   * @param {Object} context - Context information
   * @returns {Promise} Result of successful operation
   */
  async executeWithCircuitBreaker(operation, circuitBreaker, domain, context = {}) {
    return this.execute(
      async () => circuitBreaker.execute(domain, operation),
      { ...context, domain }
    );
  }

  /**
   * Check if an error is retryable based on configuration.
   *
   * Order of checks: configured `error.code`; then `error.response.status`
   * (decisive when present); then fetch TypeErrors; then well-known message
   * fragments (timeouts, socket hang up, open circuit breaker).
   *
   * @param {Error} error - Error to check
   * @returns {boolean} Whether the error is retryable
   */
  isRetryableError(error) {
    // `throw null` / `throw undefined` carry no information to classify.
    if (error == null) {
      return false;
    }

    // Check error codes
    if (error.code && this.retryableErrors.has(error.code)) {
      return true;
    }

    // Check HTTP status codes
    if (error.response && error.response.status) {
      return this.retryableStatusCodes.has(error.response.status);
    }

    const message = typeof error.message === 'string' ? error.message : '';

    // Check error types
    if (error instanceof TypeError && message.includes('fetch')) {
      return true;
    }

    // Timeout, dropped-socket and open-circuit errors are retryable
    const retryableFragments = ['timeout', 'ETIMEDOUT', 'socket hang up', 'Circuit breaker is OPEN'];
    return retryableFragments.some((fragment) => message.includes(fragment));
  }

  /**
   * Calculate delay based on configured strategy, capped at `maxDelay` and
   * optionally jittered.
   * @param {number} attempt - Current attempt number (0-based)
   * @returns {number} Delay in milliseconds
   */
  calculateDelay(attempt) {
    // Own-property lookup so names such as 'toString' cannot resolve to
    // inherited Object.prototype members.
    const isKnown = Object.prototype.hasOwnProperty.call(this.strategies, this.strategy);
    const strategy = (isKnown && this.strategies[this.strategy]) || this.strategies.exponential;
    let delay = strategy(attempt);

    // Apply maximum delay cap
    delay = Math.min(delay, this.maxDelay);

    // Apply jitter to prevent thundering herd
    if (this.jitter) {
      delay = this.addJitter(delay);
    }

    return Math.max(0, delay);
  }

  /**
   * Linear backoff strategy: baseDelay * (attempt + 1).
   * @param {number} attempt - Attempt number
   * @returns {number} Delay in milliseconds
   */
  linearBackoff(attempt) {
    return this.baseDelay * (attempt + 1);
  }

  /**
   * Exponential backoff strategy: baseDelay * 2^attempt.
   * @param {number} attempt - Attempt number
   * @returns {number} Delay in milliseconds
   */
  exponentialBackoff(attempt) {
    return this.baseDelay * Math.pow(2, attempt);
  }

  /**
   * Fibonacci backoff strategy: baseDelay * (1, 1, 2, 3, 5, ...).
   * @param {number} attempt - Attempt number
   * @returns {number} Delay in milliseconds
   */
  fibonacciBackoff(attempt) {
    if (attempt <= 1) return this.baseDelay;

    let a = 1;
    let b = 1;
    for (let i = 2; i <= attempt; i++) {
      [a, b] = [b, a + b];
    }

    return this.baseDelay * b;
  }

  /**
   * Fixed delay strategy: always baseDelay.
   * @param {number} attempt - Attempt number (unused)
   * @returns {number} Delay in milliseconds
   */
  fixedBackoff(attempt) {
    return this.baseDelay;
  }

  /**
   * Add jitter to delay to prevent thundering herd.
   * @param {number} delay - Base delay
   * @returns {number} Jittered delay
   */
  addJitter(delay) {
    // Use full jitter: random value between 0 and delay
    return Math.random() * delay;
  }

  /**
   * Promise-based delay utility.
   * @param {number} ms - Milliseconds to delay
   * @returns {Promise} Promise that resolves after delay
   */
  delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Fold one operation's accumulated delay into the running average.
   * Must be called after `successfulRetries` has been incremented.
   * @param {number} totalDelay - Total delay for this operation
   */
  updateAverageDelay(totalDelay) {
    const currentAverage = this.stats.averageRetryDelay;
    const count = this.stats.successfulRetries;
    this.stats.averageRetryDelay = ((currentAverage * (count - 1)) + totalDelay) / count;
  }

  /**
   * Get retry statistics.
   *
   * `successRate` is the percentage of finished operations that succeeded
   * (operations, not attempts). `retryRate` is the percentage of attempts
   * that were followed by a retry.
   *
   * @returns {Object} Statistics object
   */
  getStats() {
    const { totalAttempts, totalRetries, successfulOperations, failedOperations } = this.stats;
    const finishedOperations = successfulOperations + failedOperations;

    return {
      ...this.stats,
      successRate: finishedOperations > 0
        ? (successfulOperations / finishedOperations) * 100
        : 0,
      retryRate: totalAttempts > 0
        ? (totalRetries / totalAttempts) * 100
        : 0
    };
  }

  /**
   * Reset statistics to zero.
   */
  resetStats() {
    this.stats = {
      totalAttempts: 0,
      totalRetries: 0,
      successfulRetries: 0,
      successfulOperations: 0,
      failedOperations: 0,
      averageRetryDelay: 0
    };
  }

  /**
   * Create a configured retry manager for specific use cases.
   * @param {string} preset - Preset name ('aggressive', 'conservative', 'network', 'api')
   * @returns {RetryManager} Configured retry manager
   * @throws {Error} When the preset name is unknown.
   */
  static createPreset(preset) {
    const presets = {
      aggressive: {
        maxRetries: 5,
        baseDelay: 500,
        maxDelay: 10000,
        strategy: 'exponential',
        jitter: true
      },
      conservative: {
        maxRetries: 2,
        baseDelay: 2000,
        maxDelay: 60000,
        strategy: 'linear',
        jitter: false
      },
      network: {
        maxRetries: 3,
        baseDelay: 1000,
        maxDelay: 30000,
        strategy: 'exponential',
        jitter: true,
        retryableErrors: ['ECONNRESET', 'ENOTFOUND', 'ECONNREFUSED', 'ETIMEDOUT', 'EPIPE']
      },
      api: {
        maxRetries: 4,
        baseDelay: 1000,
        maxDelay: 16000,
        strategy: 'exponential',
        jitter: true,
        retryableStatusCodes: [408, 429, 500, 502, 503, 504, 520, 521, 522, 523, 524]
      }
    };

    // Own-property check: 'constructor' / 'toString' must not count as presets.
    if (!Object.prototype.hasOwnProperty.call(presets, preset)) {
      throw new Error(`Unknown preset: ${preset}. Available presets: ${Object.keys(presets).join(', ')}`);
    }

    return new RetryManager(presets[preset]);
  }
}
355
+
356
/**
 * Error raised when an operation keeps failing with a retryable error until
 * the retry budget is used up.
 *
 * The last underlying failure is available both as `originalError` and, when
 * provided, through the standard `cause` property so generic error reporters
 * can follow the chain.
 */
export class RetryExhaustedError extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {*} originalError - The last error thrown by the operation.
   * @param {number} attempts - Value supplied by the thrower describing how
   *   many attempts/retries were made.
   */
  constructor(message, originalError, attempts) {
    // Only pass options when there is a cause, so no `cause: undefined`
    // own property is created.
    super(message, originalError === undefined ? undefined : { cause: originalError });
    this.name = 'RetryExhaustedError';
    this.originalError = originalError;
    this.attempts = attempts;
  }
}
367
+
368
/**
 * Decorator factory that wraps a method so every call runs through
 * `retryManager.execute`.
 *
 * The returned function follows the (target, propertyKey, descriptor)
 * decorator signature and may also be applied manually to a property
 * descriptor. The wrapped method always returns a Promise.
 *
 * @param {RetryManager} retryManager - Retry manager instance
 * @param {Object} context - Context for the operation
 * @returns {Function} Decorator function
 * @throws {TypeError} (from the decorator) when the decorated member is not a method.
 */
export function withRetry(retryManager, context = {}) {
  return function(target, propertyKey, descriptor) {
    const originalMethod = descriptor ? descriptor.value : undefined;

    // Fail at decoration time with a clear message instead of an opaque
    // TypeError on first call (or on `descriptor.value` access).
    if (typeof originalMethod !== 'function') {
      throw new TypeError(
        `withRetry can only decorate methods; "${String(propertyKey)}" is not a function`
      );
    }

    descriptor.value = async function(...args) {
      // Preserve the caller's `this` for the wrapped method.
      return retryManager.execute(() => originalMethod.apply(this, args), context);
    };

    return descriptor;
  };
}
385
+
386
+ export default RetryManager;