crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,515 @@
1
/**
 * Enhanced CircuitBreaker - Advanced circuit breaker pattern implementation
 * Features: Multiple states, health monitoring, automatic recovery, metrics
 *
 * A single instance tracks one independent circuit per service id. Each
 * circuit is CLOSED (calls pass through), OPEN (calls are rejected until
 * `resetTimeout` has elapsed) or HALF_OPEN (a limited number of probe calls
 * is let through to test whether the service has recovered).
 */
export class CircuitBreaker {
  /**
   * @param {Object} [options] - Breaker configuration
   * @param {number} [options.threshold=5] - Failure count that opens a circuit
   * @param {number} [options.timeout=30000] - Per-operation timeout in ms
   * @param {number} [options.resetTimeout=60000] - Time in ms a circuit stays OPEN
   * @param {number} [options.halfOpenMaxCalls=3] - Probe calls allowed while HALF_OPEN
   * @param {number} [options.monitoringWindow=60000] - Sliding metrics window in ms
   * @param {number} [options.errorThresholdPercentage=50] - Error rate (%) that opens a circuit
   * @param {number} [options.minimumThroughput=10] - Executed calls required before the
   *   percentage check applies
   * @param {Function|null} [options.onStateChange] - Called as (serviceId, oldState, newState, circuit)
   * @param {Function|null} [options.onFailure] - Called as (serviceId, error, duration)
   * @param {Function|null} [options.onSuccess] - Called as (serviceId, duration)
   * @param {string} [options.name='default'] - Name of this breaker instance
   */
  constructor(options = {}) {
    const {
      threshold = 5,
      timeout = 30000,
      resetTimeout = 60000,
      halfOpenMaxCalls = 3,
      monitoringWindow = 60000,
      errorThresholdPercentage = 50,
      minimumThroughput = 10,
      onStateChange = null,
      onFailure = null,
      onSuccess = null,
      name = 'default',
    } = options;

    this.threshold = threshold;
    this.timeout = timeout;
    this.resetTimeout = resetTimeout;
    this.halfOpenMaxCalls = halfOpenMaxCalls;
    this.monitoringWindow = monitoringWindow;
    this.errorThresholdPercentage = errorThresholdPercentage;
    this.minimumThroughput = minimumThroughput;
    this.onStateChange = onStateChange;
    this.name = name;

    // The success/failure callbacks are deliberately NOT stored as
    // this.onSuccess / this.onFailure: an own property with that name would
    // shadow the prototype methods below that maintain the circuit state.
    this.successCallback = onSuccess;
    this.failureCallback = onFailure;

    // Circuit state per service endpoint
    this.circuits = new Map();

    // Health monitoring
    this.healthChecks = new Map();
    this.monitoringIntervals = new Map();

    // States
    this.STATES = {
      CLOSED: 'CLOSED',
      OPEN: 'OPEN',
      HALF_OPEN: 'HALF_OPEN',
    };
  }

  /**
   * Execute operation with circuit breaker protection
   * @param {string} serviceId - Unique identifier for the service
   * @param {Function} operation - Async operation to execute
   * @param {Object} options - Execution options (`timeout` overrides the default)
   * @returns {Promise} Operation result
   * @throws {CircuitBreakerOpenError} When the circuit is OPEN, or HALF_OPEN
   *   with all probe slots already used
   * @throws Whatever the operation rejects with, or a timeout Error
   */
  async execute(serviceId, operation, options = {}) {
    const circuit = this.getCircuit(serviceId);
    const startTime = Date.now();

    // Check circuit state
    if (circuit.state === this.STATES.OPEN) {
      if (Date.now() < circuit.nextAttempt) {
        const error = new CircuitBreakerOpenError(
          `Circuit breaker is OPEN for service: ${serviceId}`,
          serviceId,
          circuit
        );
        this.recordMetric(serviceId, 'rejected', Date.now() - startTime);
        throw error;
      }
      // Transition to half-open
      this.transitionTo(serviceId, this.STATES.HALF_OPEN);
    }

    if (circuit.state === this.STATES.HALF_OPEN) {
      if (circuit.halfOpenCalls >= this.halfOpenMaxCalls) {
        const error = new CircuitBreakerOpenError(
          `Circuit breaker HALF_OPEN limit exceeded for service: ${serviceId}`,
          serviceId,
          circuit
        );
        this.recordMetric(serviceId, 'rejected', Date.now() - startTime);
        throw error;
      }
      circuit.halfOpenCalls++;
    }

    let result;
    try {
      // Execute operation with timeout
      result = await this.executeWithTimeout(operation, options?.timeout || this.timeout);
    } catch (error) {
      const duration = Date.now() - startTime;
      // Record first so the open/closed decision sees this failure in the window.
      this.recordMetric(serviceId, 'failure', duration, error);
      this.onFailure(serviceId, error, duration);
      throw error;
    }

    // Success bookkeeping lives outside the try block so that an exception
    // thrown by a user callback is not misreported as an operation failure.
    const duration = Date.now() - startTime;
    this.recordMetric(serviceId, 'success', duration);
    this.onSuccess(serviceId, duration);
    return result;
  }

  /**
   * Execute operation with timeout
   * @param {Function} operation - Operation to execute
   * @param {number} timeout - Timeout in milliseconds
   * @returns {Promise} Operation result
   * @throws {Error} `Operation timeout after <timeout>ms` if the timer wins the race
   */
  async executeWithTimeout(operation, timeout) {
    let timer;
    try {
      return await Promise.race([
        operation(),
        new Promise((_, reject) => {
          timer = setTimeout(() => {
            reject(new Error(`Operation timeout after ${timeout}ms`));
          }, timeout);
        }),
      ]);
    } finally {
      // Always release the timer; otherwise every completed call leaves a
      // pending timer behind for up to `timeout` ms.
      clearTimeout(timer);
    }
  }

  /**
   * Handle successful operation: update circuit state, then notify the
   * `onSuccess` callback supplied to the constructor (if any).
   * @param {string} serviceId - Service identifier
   * @param {number} duration - Operation duration
   */
  onSuccess(serviceId, duration) {
    const circuit = this.getCircuit(serviceId);

    if (circuit.state === this.STATES.HALF_OPEN) {
      // Close only after enough probes have actually SUCCEEDED; counting
      // started probes would let concurrent calls close the circuit early.
      circuit.halfOpenSuccesses = (circuit.halfOpenSuccesses ?? 0) + 1;
      if (circuit.halfOpenSuccesses >= this.halfOpenMaxCalls) {
        this.transitionTo(serviceId, this.STATES.CLOSED);
      }
    } else if (circuit.state === this.STATES.CLOSED) {
      // Reset failure count on success
      circuit.failureCount = 0;
    }

    // Call success callback
    if (typeof this.successCallback === 'function') {
      this.successCallback(serviceId, duration);
    }
  }

  /**
   * Handle failed operation: update circuit state, then notify the
   * `onFailure` callback supplied to the constructor (if any).
   * @param {string} serviceId - Service identifier
   * @param {Error} error - Error that occurred
   * @param {number} duration - Operation duration
   */
  onFailure(serviceId, error, duration) {
    const circuit = this.getCircuit(serviceId);
    circuit.failureCount++;
    circuit.lastFailure = error;
    circuit.lastFailureTime = Date.now();

    if (circuit.state === this.STATES.HALF_OPEN) {
      // Failure in half-open state immediately opens the circuit
      this.transitionTo(serviceId, this.STATES.OPEN);
    } else if (circuit.state === this.STATES.CLOSED) {
      // Check if we should open the circuit
      if (this.shouldOpenCircuit(serviceId)) {
        this.transitionTo(serviceId, this.STATES.OPEN);
      }
    }

    // Call failure callback
    if (typeof this.failureCallback === 'function') {
      this.failureCallback(serviceId, error, duration);
    }
  }

  /**
   * Determine if circuit should be opened
   * @param {string} serviceId - Service identifier
   * @returns {boolean} Whether to open circuit
   */
  shouldOpenCircuit(serviceId) {
    const circuit = this.getCircuit(serviceId);

    // Simple threshold check
    if (circuit.failureCount >= this.threshold) {
      return true;
    }

    // Percentage-based check within monitoring window. Only calls that were
    // actually executed count: rejected calls never reached the service and
    // would otherwise dilute the error rate.
    const metrics = this.getServiceMetrics(serviceId);
    const executedCalls = metrics.failures + metrics.successes;
    if (executedCalls > 0 && executedCalls >= this.minimumThroughput) {
      const errorRate = (metrics.failures / executedCalls) * 100;
      return errorRate >= this.errorThresholdPercentage;
    }

    return false;
  }

  /**
   * Transition circuit to new state
   * @param {string} serviceId - Service identifier
   * @param {string} newState - New state
   */
  transitionTo(serviceId, newState) {
    const circuit = this.getCircuit(serviceId);
    const oldState = circuit.state;

    circuit.state = newState;
    circuit.stateChangeTime = Date.now();

    // Probe counters restart on every transition.
    circuit.halfOpenCalls = 0;
    circuit.halfOpenSuccesses = 0;

    if (newState === this.STATES.OPEN) {
      circuit.nextAttempt = Date.now() + this.resetTimeout;
    } else if (newState === this.STATES.CLOSED) {
      circuit.failureCount = 0;
    }

    // Call state change callback
    if (this.onStateChange) {
      this.onStateChange(serviceId, oldState, newState, circuit);
    }

    // Start health monitoring for open circuits
    if (newState === this.STATES.OPEN) {
      this.startHealthMonitoring(serviceId);
    } else {
      this.stopHealthMonitoring(serviceId);
    }
  }

  /**
   * Get or create circuit for service
   * @param {string} serviceId - Service identifier
   * @returns {Object} Circuit state object
   */
  getCircuit(serviceId) {
    if (!this.circuits.has(serviceId)) {
      this.circuits.set(serviceId, {
        state: this.STATES.CLOSED,
        failureCount: 0,
        halfOpenCalls: 0,
        halfOpenSuccesses: 0,
        nextAttempt: 0,
        stateChangeTime: Date.now(),
        lastFailure: null,
        lastFailureTime: 0,
        metrics: {
          totalCalls: 0,
          failures: 0,
          successes: 0,
          rejections: 0,
          timeouts: 0,
          averageResponseTime: 0,
          callsInWindow: [],
        },
      });
    }
    return this.circuits.get(serviceId);
  }

  /**
   * Record operation metrics
   * @param {string} serviceId - Service identifier
   * @param {string} type - Metric type ('success', 'failure', 'rejected')
   * @param {number} duration - Operation duration
   * @param {Error} error - Error if applicable
   */
  recordMetric(serviceId, type, duration, error = null) {
    const { metrics } = this.getCircuit(serviceId);
    const now = Date.now();

    // Operations may reject with non-Error values (strings, plain objects),
    // so never assume `error.message` is a string.
    const errorMessage = error == null ? null : String(error.message ?? error);

    // Record call in sliding window
    metrics.callsInWindow.push({
      timestamp: now,
      type,
      duration,
      error: errorMessage,
    });

    // Clean old entries outside monitoring window
    metrics.callsInWindow = metrics.callsInWindow.filter(
      (call) => now - call.timestamp <= this.monitoringWindow
    );

    // Update counters
    metrics.totalCalls++;
    if (type === 'success') {
      metrics.successes++;
    } else if (type === 'failure') {
      metrics.failures++;
    } else {
      metrics.rejections++;
    }

    if (errorMessage !== null && errorMessage.includes('timeout')) {
      metrics.timeouts++;
    }

    // Update average response time (running mean over executed calls)
    if (type === 'success' || type === 'failure') {
      const executed = metrics.successes + metrics.failures;
      metrics.averageResponseTime =
        (metrics.averageResponseTime * (executed - 1) + duration) / executed;
    }
  }

  /**
   * Get metrics for a specific service
   * @param {string} serviceId - Service identifier
   * @returns {Object} Service metrics; `totalCalls`, `failures`, `successes`
   *   and `errorRate` cover the monitoring window only
   */
  getServiceMetrics(serviceId) {
    const circuit = this.getCircuit(serviceId);
    const now = Date.now();

    // Tally the calls inside the monitoring window in a single pass
    let windowTotal = 0;
    let windowFailures = 0;
    let windowSuccesses = 0;
    for (const call of circuit.metrics.callsInWindow) {
      if (now - call.timestamp > this.monitoringWindow) {
        continue;
      }
      windowTotal++;
      if (call.type === 'failure') {
        windowFailures++;
      } else if (call.type === 'success') {
        windowSuccesses++;
      }
    }

    return {
      state: circuit.state,
      totalCalls: windowTotal,
      failures: windowFailures,
      successes: windowSuccesses,
      errorRate: windowTotal > 0 ? (windowFailures / windowTotal) * 100 : 0,
      averageResponseTime: circuit.metrics.averageResponseTime,
      lastFailure: circuit.lastFailure?.message,
      lastFailureTime: circuit.lastFailureTime,
      nextAttempt: circuit.nextAttempt,
      stateChangeTime: circuit.stateChangeTime,
    };
  }

  /**
   * Start health monitoring for a service
   * @param {string} serviceId - Service identifier
   */
  startHealthMonitoring(serviceId) {
    if (this.monitoringIntervals.has(serviceId)) {
      return; // Already monitoring
    }

    const interval = setInterval(() => {
      this.performHealthCheck(serviceId);
    }, this.resetTimeout / 2); // Check twice per reset timeout

    // In Node.js, do not let a background health check keep the process
    // alive; elsewhere setInterval returns a number and this is a no-op.
    interval.unref?.();

    this.monitoringIntervals.set(serviceId, interval);
  }

  /**
   * Stop health monitoring for a service
   * @param {string} serviceId - Service identifier
   */
  stopHealthMonitoring(serviceId) {
    const interval = this.monitoringIntervals.get(serviceId);
    if (interval) {
      clearInterval(interval);
      this.monitoringIntervals.delete(serviceId);
    }
  }

  /**
   * Perform health check for a service: once `nextAttempt` has passed, an
   * OPEN circuit is moved to HALF_OPEN so the next calls can probe it.
   * @param {string} serviceId - Service identifier
   */
  async performHealthCheck(serviceId) {
    const circuit = this.getCircuit(serviceId);

    if (circuit.state !== this.STATES.OPEN) {
      return;
    }

    if (Date.now() >= circuit.nextAttempt) {
      // Transition to half-open for testing
      this.transitionTo(serviceId, this.STATES.HALF_OPEN);
    }
  }

  /**
   * Manually reset circuit breaker for a service
   * @param {string} serviceId - Service identifier; omit to reset all circuits
   */
  reset(serviceId) {
    if (serviceId) {
      this.transitionTo(serviceId, this.STATES.CLOSED);
    } else {
      // Reset all circuits
      for (const id of this.circuits.keys()) {
        this.transitionTo(id, this.STATES.CLOSED);
      }
    }
  }

  /**
   * Get comprehensive statistics for all services
   * @returns {Object} Statistics object
   */
  getStats() {
    const stats = {
      totalServices: this.circuits.size,
      serviceStates: {},
      globalMetrics: {
        totalCalls: 0,
        totalFailures: 0,
        totalSuccesses: 0,
        totalRejections: 0,
        averageErrorRate: 0,
      },
    };

    let totalErrorRate = 0;
    let servicesWithCalls = 0;

    for (const [serviceId, circuit] of this.circuits.entries()) {
      const metrics = this.getServiceMetrics(serviceId);
      stats.serviceStates[serviceId] = metrics;

      stats.globalMetrics.totalCalls += circuit.metrics.totalCalls;
      stats.globalMetrics.totalFailures += circuit.metrics.failures;
      stats.globalMetrics.totalSuccesses += circuit.metrics.successes;
      stats.globalMetrics.totalRejections += circuit.metrics.rejections;

      if (circuit.metrics.totalCalls > 0) {
        totalErrorRate += metrics.errorRate;
        servicesWithCalls++;
      }
    }

    stats.globalMetrics.averageErrorRate = servicesWithCalls > 0
      ? totalErrorRate / servicesWithCalls
      : 0;

    return stats;
  }

  /**
   * Cleanup resources
   */
  destroy() {
    // Stop all monitoring intervals
    for (const interval of this.monitoringIntervals.values()) {
      clearInterval(interval);
    }
    this.monitoringIntervals.clear();
    this.circuits.clear();
    this.healthChecks.clear();
  }
}
454
+
455
/**
 * Error thrown by the circuit breaker when it refuses to run a call.
 *
 * Besides the standard Error fields, it carries the id of the affected
 * service, the circuit state object that was passed in, and an
 * `isCircuitBreakerError` marker flag set to `true`.
 */
export class CircuitBreakerOpenError extends Error {
  /**
   * @param {string} message - Human-readable description
   * @param {string} serviceId - Service whose circuit refused the call
   * @param {Object} circuit - Circuit state object attached for inspection
   */
  constructor(message, serviceId, circuit) {
    super(message);
    Object.assign(this, {
      name: 'CircuitBreakerOpenError',
      serviceId,
      circuit,
      isCircuitBreakerError: true,
    });
  }
}
467
+
468
/**
 * Build a CircuitBreaker from a named preset.
 *
 * @param {string} [preset='default'] - One of 'default', 'aggressive',
 *   'conservative', 'api' or 'network'. A name that matches no preset
 *   contributes no settings, so only `overrides` reach the constructor.
 * @param {Object} [overrides={}] - Options layered on top of the preset;
 *   on conflict the override wins.
 * @returns {CircuitBreaker} Breaker configured with the merged options
 */
export function createCircuitBreaker(preset = 'default', overrides = {}) {
  const presetTable = {
    default: { threshold: 5, timeout: 30000, resetTimeout: 60000, halfOpenMaxCalls: 3 },
    aggressive: {
      threshold: 3,
      timeout: 15000,
      resetTimeout: 30000,
      halfOpenMaxCalls: 2,
      errorThresholdPercentage: 30,
    },
    conservative: {
      threshold: 10,
      timeout: 60000,
      resetTimeout: 300000,
      halfOpenMaxCalls: 5,
      errorThresholdPercentage: 70,
    },
    api: {
      threshold: 5,
      timeout: 30000,
      resetTimeout: 120000,
      halfOpenMaxCalls: 3,
      errorThresholdPercentage: 50,
      minimumThroughput: 20,
    },
    network: {
      threshold: 3,
      timeout: 10000,
      resetTimeout: 60000,
      halfOpenMaxCalls: 2,
      errorThresholdPercentage: 40,
      minimumThroughput: 10,
    },
  };

  // Preset first, overrides second: later sources win on conflicting keys.
  const selected = presetTable[preset];
  const mergedOptions = Object.assign({}, selected, overrides);
  return new CircuitBreaker(mergedOptions);
}
514
+
515
+ export default CircuitBreaker;