crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,600 @@
1
+ /**
2
+ * Health Check System for CrawlForge MCP Server
3
+ * Provides comprehensive health monitoring for production environments
4
+ */
5
+
6
+ import { EventEmitter } from 'events';
7
+ import os from 'os';
8
+ import { performance } from 'perf_hooks';
9
+ import { config } from '../constants/config.js';
10
+ import { createLogger } from '../utils/Logger.js';
11
+
12
+ const logger = createLogger('HealthCheck');
13
+
14
export class HealthCheckManager extends EventEmitter {
  /**
   * Create a health check manager.
   *
   * Events emitted: `started`, `stopped`, `healthCheck` (with the full status
   * object) and `healthCheckError` (with the error).
   *
   * @param {Object} [options]
   * @param {number} [options.checkInterval=30000] - Milliseconds between periodic checks
   * @param {number} [options.timeout=5000] - Per-check timeout in milliseconds
   * @param {number} [options.memoryThreshold] - Heap-used limit in bytes (default 512MB)
   * @param {number} [options.cpuThreshold] - CPU usage limit in percent (default 90)
   * @param {number} [options.responseTimeThreshold] - Response time limit in ms (default 2000)
   * @param {number} [options.errorRateThreshold] - Error rate limit as a fraction (default 0.05)
   * @param {Object} [options.thresholds] - Partial overrides merged over the threshold defaults
   */
  constructor(options = {}) {
    super();

    // Defaults first, then the flat *Threshold options, then explicit
    // `thresholds` overrides. A partial `options.thresholds` must not wipe
    // out the remaining defaults.
    const thresholds = {
      memoryUsage: options.memoryThreshold || 512 * 1024 * 1024, // 512MB
      cpuUsage: options.cpuThreshold || 90, // 90%
      responseTime: options.responseTimeThreshold || 2000, // 2 seconds
      errorRate: options.errorRateThreshold || 0.05, // 5%
      ...options.thresholds
    };

    // Caller-supplied keys are spread first so the normalized values below
    // always win (the raw `options.thresholds` would otherwise replace the
    // merged object).
    this.options = {
      ...options,
      checkInterval: options.checkInterval || 30000, // 30 seconds
      timeout: options.timeout || 5000, // 5 seconds
      thresholds
    };

    // NOTE: `uptime` holds the start timestamp (ms since epoch); the elapsed
    // uptime is derived from it when a report is generated.
    this.healthStatus = {
      overall: 'healthy',
      lastCheck: Date.now(),
      uptime: Date.now(),
      checks: {}
    };

    this.dependencyChecks = new Map();
    this.performanceMonitor = {
      requestCount: 0,
      errorCount: 0,
      responseTimeSum: 0,
      lastMinuteRequests: [],
      lastMinuteErrors: []
    };

    this.isRunning = false;
    this.checkTimer = null;
  }

  /**
   * Start health monitoring: registers the core checks and schedules
   * `performHealthCheck` every `options.checkInterval` milliseconds.
   * Calling it while already running only logs a warning.
   */
  start() {
    if (this.isRunning) {
      logger.warn('Health monitoring is already running');
      return;
    }

    this.isRunning = true;
    this.healthStatus.uptime = Date.now();

    // Register core health checks
    this.registerCoreChecks();

    // Start periodic health checks. The returned promise is explicitly
    // handled so a failure inside an event listener cannot surface as an
    // unhandled rejection.
    this.checkTimer = setInterval(() => {
      this.performHealthCheck().catch((error) => {
        logger.error('Health check failed', { error: error?.message ?? String(error) });
      });
    }, this.options.checkInterval);

    logger.info('Health monitoring started', {
      interval: this.options.checkInterval,
      thresholds: this.options.thresholds
    });

    this.emit('started');
  }

  /**
   * Stop health monitoring and cancel the periodic timer.
   * Does nothing when monitoring is not running.
   */
  stop() {
    if (!this.isRunning) {
      return;
    }

    this.isRunning = false;

    if (this.checkTimer) {
      clearInterval(this.checkTimer);
      this.checkTimer = null;
    }

    logger.info('Health monitoring stopped');
    this.emit('stopped');
  }

  /**
   * Register the built-in system, application and external dependency checks.
   * A check already registered under one of the core names is left untouched,
   * so custom implementations survive `start()` and restarts.
   */
  registerCoreChecks() {
    const coreChecks = [
      // System resource checks
      ['memory', this.checkMemoryUsage],
      ['cpu', this.checkCpuUsage],
      ['disk', this.checkDiskSpace],
      // Application health checks
      ['cache', this.checkCacheHealth],
      ['queue', this.checkQueueHealth],
      ['workers', this.checkWorkerHealth],
      ['connections', this.checkConnectionHealth],
      // External dependency checks
      ['search_api', this.checkSearchApiHealth],
      ['network', this.checkNetworkConnectivity]
    ];

    for (const [name, method] of coreChecks) {
      if (!this.dependencyChecks.has(name)) {
        this.registerCheck(name, method.bind(this));
      }
    }
  }

  /**
   * Register a health check function
   * @param {string} name - Check name
   * @param {Function} checkFunction - Function (sync or async) that returns
   *   check details, or throws/rejects to mark the check unhealthy. Resolving
   *   with an object whose `status` is `'warning'` marks the check as a warning.
   */
  registerCheck(name, checkFunction) {
    this.dependencyChecks.set(name, checkFunction);
    logger.debug(`Registered health check: ${name}`);
  }

  /**
   * Unregister a health check
   * @param {string} name - Check name
   */
  unregisterCheck(name) {
    this.dependencyChecks.delete(name);
    logger.debug(`Unregistered health check: ${name}`);
  }

  /**
   * Run a single check, racing it against `options.timeout`.
   * Never rejects: failures and timeouts are reported as `unhealthy` results.
   * @param {string} name - Check name
   * @param {Function} checkFunction - Registered check function
   * @returns {Promise<Object>} Result record for this check
   */
  async #runCheck(name, checkFunction) {
    const checkStart = performance.now();
    let timeoutId;

    try {
      const timeout = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error('Health check timeout')),
          this.options.timeout
        );
      });

      const details = await Promise.race([checkFunction(), timeout]);

      return {
        name,
        // Lets a check report a degraded-but-working state; any other
        // resolved value counts as healthy.
        status: details?.status === 'warning' ? 'warning' : 'healthy',
        responseTime: performance.now() - checkStart,
        details,
        timestamp: Date.now()
      };
    } catch (error) {
      return {
        name,
        status: 'unhealthy',
        error: error?.message ?? String(error),
        timestamp: Date.now()
      };
    } finally {
      // Without this, every check leaves a live timer behind until the
      // timeout elapses.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Perform comprehensive health check: runs every registered check in
   * parallel, stores the aggregated result in `this.healthStatus` and emits
   * `healthCheck` (or `healthCheckError` if aggregation itself fails).
   * @returns {Promise<void>}
   */
  async performHealthCheck() {
    const startTime = performance.now();
    const results = {};

    logger.debug('Starting health check');

    try {
      // Run all registered checks in parallel
      const checkResults = await Promise.all(
        Array.from(this.dependencyChecks.entries(), ([name, checkFunction]) =>
          this.#runCheck(name, checkFunction)
        )
      );

      // Process results
      for (const result of checkResults) {
        results[result.name] = result;
      }

      // Determine overall health: any unhealthy check wins over warnings
      const unhealthyChecks = checkResults.filter(r => r.status === 'unhealthy');
      const warningChecks = checkResults.filter(r => r.status === 'warning');

      let overallStatus = 'healthy';
      if (unhealthyChecks.length > 0) {
        overallStatus = 'unhealthy';
      } else if (warningChecks.length > 0) {
        overallStatus = 'warning';
      }

      // Update health status
      this.healthStatus = {
        overall: overallStatus,
        lastCheck: Date.now(),
        uptime: this.healthStatus.uptime,
        checkDuration: performance.now() - startTime,
        checks: results,
        performance: this.getPerformanceMetrics()
      };

      // Emit health status event
      this.emit('healthCheck', this.healthStatus);

      // Log status changes
      if (overallStatus !== 'healthy') {
        logger.warn('Health check completed with issues', {
          status: overallStatus,
          unhealthyChecks: unhealthyChecks.length,
          warningChecks: warningChecks.length
        });
      } else {
        logger.debug('Health check completed successfully');
      }

    } catch (error) {
      logger.error('Health check failed', { error: error.message });

      this.healthStatus = {
        overall: 'unhealthy',
        lastCheck: Date.now(),
        uptime: this.healthStatus.uptime,
        error: error.message,
        checks: results
      };

      this.emit('healthCheckError', error);
    }
  }

  /**
   * Get current health status
   * @returns {Object} Last stored health status with freshly computed
   *   performance metrics
   */
  getHealthStatus() {
    return {
      ...this.healthStatus,
      performance: this.getPerformanceMetrics()
    };
  }

  /**
   * Record a request for performance tracking
   * @param {number} responseTime - Response time in milliseconds
   * @param {boolean} isError - Whether the request resulted in an error
   */
  recordRequest(responseTime, isError = false) {
    const now = Date.now();

    this.performanceMonitor.requestCount++;
    this.performanceMonitor.responseTimeSum += responseTime;

    if (isError) {
      this.performanceMonitor.errorCount++;
    }

    // Track last minute metrics
    this.performanceMonitor.lastMinuteRequests.push({
      timestamp: now,
      responseTime,
      isError
    });

    // Clean up old entries (older than 1 minute)
    const oneMinuteAgo = now - 60000;
    this.performanceMonitor.lastMinuteRequests = this.performanceMonitor.lastMinuteRequests
      .filter(req => req.timestamp > oneMinuteAgo);
  }

  /**
   * Get performance metrics
   * @returns {Object} Lifetime totals plus a `lastMinute` rolling window
   */
  getPerformanceMetrics() {
    const now = Date.now();
    const oneMinuteAgo = now - 60000;

    // Re-filter here because pruning only happens when a request is recorded
    const recentRequests = this.performanceMonitor.lastMinuteRequests
      .filter(req => req.timestamp > oneMinuteAgo);

    const recentErrors = recentRequests.filter(req => req.isError);

    const avgResponseTime = this.performanceMonitor.requestCount > 0
      ? this.performanceMonitor.responseTimeSum / this.performanceMonitor.requestCount
      : 0;

    return {
      totalRequests: this.performanceMonitor.requestCount,
      totalErrors: this.performanceMonitor.errorCount,
      avgResponseTime,
      errorRate: this.performanceMonitor.requestCount > 0
        ? this.performanceMonitor.errorCount / this.performanceMonitor.requestCount
        : 0,
      lastMinute: {
        requests: recentRequests.length,
        errors: recentErrors.length,
        avgResponseTime: recentRequests.length > 0
          ? recentRequests.reduce((sum, req) => sum + req.responseTime, 0) / recentRequests.length
          : 0,
        errorRate: recentRequests.length > 0
          ? recentErrors.length / recentRequests.length
          : 0
      }
    };
  }

  // Individual health check implementations

  /**
   * Check memory usage
   * @returns {Promise<Object>} Process heap and system memory figures (bytes)
   * @throws {Error} When heap used exceeds `thresholds.memoryUsage`
   */
  async checkMemoryUsage() {
    const memUsage = process.memoryUsage();
    const totalMem = os.totalmem();
    const freeMem = os.freemem();
    const usedMem = totalMem - freeMem;

    const heapUsagePercent = (memUsage.heapUsed / memUsage.heapTotal) * 100;
    const systemUsagePercent = (usedMem / totalMem) * 100;

    const result = {
      process: {
        heapUsed: memUsage.heapUsed,
        heapTotal: memUsage.heapTotal,
        heapUsagePercent,
        rss: memUsage.rss,
        external: memUsage.external
      },
      system: {
        total: totalMem,
        free: freeMem,
        used: usedMem,
        usagePercent: systemUsagePercent
      }
    };

    // Check against thresholds
    if (memUsage.heapUsed > this.options.thresholds.memoryUsage) {
      throw new Error(`Memory usage (${memUsage.heapUsed}) exceeds threshold (${this.options.thresholds.memoryUsage})`);
    }

    return result;
  }

  /**
   * Check CPU usage of this process, sampled over a ~100ms window.
   * `usage.percent` is relative to one core, so a process with busy worker
   * threads can report more than 100.
   * @returns {Promise<Object>} Core count, load averages and sampled usage
   * @throws {Error} When the sampled percentage exceeds `thresholds.cpuUsage`
   */
  async checkCpuUsage() {
    const cpus = os.cpus();
    const loadAvg = os.loadavg();

    // Calculate average CPU usage over a short interval
    const startUsage = process.cpuUsage();
    const sampleStart = performance.now();
    await new Promise(resolve => setTimeout(resolve, 100));
    const endUsage = process.cpuUsage(startUsage);

    // process.cpuUsage() reports microseconds; divide by the *measured*
    // wall-clock window (timers can fire late) and scale to a percentage.
    const elapsedMicros = (performance.now() - sampleStart) * 1000;
    const cpuPercent = elapsedMicros > 0
      ? ((endUsage.user + endUsage.system) / elapsedMicros) * 100
      : 0;

    const result = {
      cores: cpus.length,
      loadAverage: {
        '1min': loadAvg[0],
        '5min': loadAvg[1],
        '15min': loadAvg[2]
      },
      usage: {
        user: endUsage.user,
        system: endUsage.system,
        percent: cpuPercent
      }
    };

    // Check against thresholds
    if (cpuPercent > this.options.thresholds.cpuUsage) {
      throw new Error(`CPU usage (${cpuPercent}%) exceeds threshold (${this.options.thresholds.cpuUsage}%)`);
    }

    return result;
  }

  /**
   * Check disk space
   * @returns {Promise<Object>} Placeholder result; no real measurement is made
   */
  async checkDiskSpace() {
    // Note: This is a simplified check. In production, you might want to use statvfs or similar
    const stats = {
      available: true, // Placeholder - implement actual disk space check if needed
      usage: 'Not implemented'
    };

    return stats;
  }

  /**
   * Check cache health
   * @returns {Promise<Object>} Placeholder values until wired to a cache manager
   */
  async checkCacheHealth() {
    // This would integrate with your CacheManager
    return {
      status: 'operational',
      hitRate: 85, // Placeholder
      size: 1024 * 1024 // Placeholder
    };
  }

  /**
   * Check queue health
   * @returns {Promise<Object>} Placeholder values until wired to a queue manager
   */
  async checkQueueHealth() {
    // This would integrate with your QueueManager
    return {
      status: 'operational',
      pending: 0, // Placeholder
      processing: 0 // Placeholder
    };
  }

  /**
   * Check worker health
   * @returns {Promise<Object>} Placeholder values; `totalWorkers` comes from
   *   `config.performance.maxWorkers` (default 4)
   */
  async checkWorkerHealth() {
    // This would integrate with your WorkerPool
    return {
      status: 'operational',
      activeWorkers: 0, // Placeholder
      totalWorkers: config.performance?.maxWorkers || 4
    };
  }

  /**
   * Check connection pool health
   * @returns {Promise<Object>} Placeholder values until wired to a connection pool
   */
  async checkConnectionHealth() {
    // This would integrate with your ConnectionPool
    return {
      status: 'operational',
      activeConnections: 0, // Placeholder
      poolSize: 20 // Placeholder
    };
  }

  /**
   * Check search API health with a HEAD request (3 second abort timeout).
   * @returns {Promise<Object>} Status, HTTP response code and request duration in ms
   * @throws {Error} When the request fails or is aborted
   */
  async checkSearchApiHealth() {
    // Simple connectivity test - you might want to implement actual API health checks
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 3000);

    try {
      const requestStart = performance.now();
      const response = await fetch('https://www.google.com', {
        method: 'HEAD',
        signal: controller.signal
      });

      return {
        status: 'operational',
        responseCode: response.status,
        // Duration of the request, not the time since process start
        responseTime: performance.now() - requestStart
      };
    } catch (error) {
      throw new Error(`Search API health check failed: ${error.message}`);
    } finally {
      // Also release the abort timer when fetch rejects
      clearTimeout(timeoutId);
    }
  }

  /**
   * Check network connectivity by issuing HEAD requests to a fixed set of
   * URLs (2 second abort timeout each).
   * @returns {Promise<Object>} Per-URL results and the fraction that succeeded
   * @throws {Error} When every URL fails
   */
  async checkNetworkConnectivity() {
    try {
      const testUrls = [
        'https://www.google.com',
        'https://www.github.com'
      ];

      const results = await Promise.all(
        testUrls.map(async (url) => {
          const controller = new AbortController();
          const timeoutId = setTimeout(() => controller.abort(), 2000);

          try {
            const start = performance.now();
            const response = await fetch(url, {
              method: 'HEAD',
              signal: controller.signal
            });
            const duration = performance.now() - start;

            return {
              url,
              status: 'ok',
              responseTime: duration,
              statusCode: response.status
            };
          } catch (error) {
            // A single failing URL is reported, not thrown
            return {
              url,
              status: 'error',
              error: error.message
            };
          } finally {
            clearTimeout(timeoutId);
          }
        })
      );

      const failedChecks = results.filter(r => r.status === 'error');

      if (failedChecks.length === results.length) {
        throw new Error('All network connectivity tests failed');
      }

      return {
        results,
        successRate: (results.length - failedChecks.length) / results.length
      };
    } catch (error) {
      throw new Error(`Network connectivity check failed: ${error.message}`);
    }
  }

  /**
   * Generate health report in different formats
   * @param {string} format - Report format ('json', 'text', 'summary')
   * @returns {string|Object} Formatted health report; unknown formats fall
   *   back to the full status object
   */
  generateReport(format = 'json') {
    const status = this.getHealthStatus();

    switch (format) {
      case 'text':
        return this.generateTextReport(status);
      case 'summary':
        return this.generateSummaryReport(status);
      case 'json':
      default:
        return status;
    }
  }

  /**
   * Generate text-based health report
   * @param {Object} status - Status object as returned by `getHealthStatus`
   * @returns {string} Multi-line, human-readable report
   */
  generateTextReport(status) {
    const lines = [];
    lines.push(`CrawlForge MCP Server Health Report`);
    lines.push(`Generated: ${new Date(status.lastCheck).toISOString()}`);
    lines.push(`Overall Status: ${status.overall.toUpperCase()}`);
    lines.push(`Uptime: ${Math.floor((Date.now() - status.uptime) / 1000)}s`);
    lines.push('');

    if (status.performance) {
      lines.push('Performance Metrics:');
      lines.push(`  Requests: ${status.performance.totalRequests}`);
      lines.push(`  Errors: ${status.performance.totalErrors}`);
      lines.push(`  Avg Response Time: ${status.performance.avgResponseTime.toFixed(2)}ms`);
      lines.push(`  Error Rate: ${(status.performance.errorRate * 100).toFixed(2)}%`);
      lines.push('');
    }

    lines.push('Health Checks:');
    for (const [name, check] of Object.entries(status.checks)) {
      lines.push(`  ${name}: ${check.status.toUpperCase()}`);
      if (check.error) {
        lines.push(`    Error: ${check.error}`);
      }
    }

    return lines.join('\n');
  }

  /**
   * Generate summary health report
   * @param {Object} status - Status object as returned by `getHealthStatus`
   * @returns {Object} Compact summary (overall status, uptime in ms,
   *   healthy/total check count, last check time, key performance figures)
   */
  generateSummaryReport(status) {
    const healthyChecks = Object.values(status.checks).filter(c => c.status === 'healthy').length;
    const totalChecks = Object.keys(status.checks).length;

    return {
      status: status.overall,
      uptime: Date.now() - status.uptime,
      checksHealthy: `${healthyChecks}/${totalChecks}`,
      lastCheck: status.lastCheck,
      performance: status.performance ? {
        requestsPerMinute: status.performance.lastMinute.requests,
        avgResponseTime: status.performance.avgResponseTime,
        errorRate: status.performance.errorRate
      } : null
    };
  }
}
599
+
600
+ export default HealthCheckManager;