crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,553 @@
1
+ /**
2
+ * ConnectionPool - HTTP connection pooling with backpressure handling
3
+ * Optimizes concurrent requests through connection reuse and intelligent queueing
4
+ */
5
+
6
+ import { Agent as HttpAgent } from 'http';
7
+ import { Agent as HttpsAgent } from 'https';
8
+ import { EventEmitter } from 'events';
9
+ import { config } from '../../constants/config.js';
10
+
11
export class ConnectionPool extends EventEmitter {
  /**
   * Create a pool that owns one keep-alive HTTP agent and one HTTPS agent,
   * tracks in-flight requests, and applies a delay-based backpressure when
   * the number of active requests approaches `maxSockets`.
   *
   * @param {Object} [options]
   * @param {number} [options.maxSockets=50] - Per-agent socket limit.
   * @param {number} [options.maxFreeSockets=10] - Idle sockets kept per agent.
   * @param {number} [options.timeout=30000] - Socket/request timeout in ms.
   * @param {boolean} [options.keepAlive=true] - Reuse sockets between requests.
   * @param {number} [options.keepAliveMsecs=1000] - TCP keep-alive initial delay.
   * @param {number} [options.maxCachedSessions=100] - TLS session cache size (HTTPS agent only).
   * @param {number} [options.backpressureThreshold=0.8] - activeRequests / maxSockets ratio above which backpressure activates.
   * @param {number} [options.cleanupInterval=60000] - Housekeeping period in ms; 0 or less disables the timer.
   * @param {boolean} [options.enableMetrics=true] - Emit `connectionCreated` events from the agents.
   */
  constructor(options = {}) {
    super();

    const {
      maxSockets = 50,
      maxFreeSockets = 10,
      timeout = 30000,
      keepAlive = true,
      keepAliveMsecs = 1000,
      maxCachedSessions = 100,
      backpressureThreshold = 0.8,
      cleanupInterval = 60000,
      enableMetrics = true
    } = options;

    this.maxSockets = maxSockets;
    this.maxFreeSockets = maxFreeSockets;
    this.timeout = timeout;
    this.keepAlive = keepAlive;
    this.keepAliveMsecs = keepAliveMsecs;
    this.maxCachedSessions = maxCachedSessions;
    this.backpressureThreshold = backpressureThreshold;
    this.cleanupInterval = cleanupInterval;
    this.enableMetrics = enableMetrics;

    // Create HTTP/HTTPS agents
    this.httpAgent = new HttpAgent({
      keepAlive: this.keepAlive,
      keepAliveMsecs: this.keepAliveMsecs,
      maxSockets: this.maxSockets,
      maxFreeSockets: this.maxFreeSockets,
      timeout: this.timeout,
      scheduling: 'fifo'
    });

    this.httpsAgent = new HttpsAgent({
      keepAlive: this.keepAlive,
      keepAliveMsecs: this.keepAliveMsecs,
      maxSockets: this.maxSockets,
      maxFreeSockets: this.maxFreeSockets,
      timeout: this.timeout,
      maxCachedSessions: this.maxCachedSessions,
      scheduling: 'fifo'
    });

    // Connection tracking
    this.activeConnections = new Map();
    this.connectionStats = new Map();
    this.requestQueue = [];

    // Metrics
    this.metrics = {
      totalRequests: 0,
      activeRequests: 0,
      completedRequests: 0,
      failedRequests: 0,
      connectionReuse: 0,
      backpressureEvents: 0,
      avgResponseTime: 0,
      peakConcurrency: 0
    };

    // Backpressure state
    this.backpressureActive = false;
    this.lastBackpressureCheck = Date.now();

    // Start cleanup interval
    this.cleanupTimer = null;
    if (this.cleanupInterval > 0) {
      this.cleanupTimer = setInterval(() => {
        this.cleanup();
      }, this.cleanupInterval);
      // Housekeeping alone must not keep the process alive.
      this.cleanupTimer.unref();
    }

    // Signal handler registered by setupGracefulShutdown(); kept so that
    // shutdown() can unregister it again.
    this.shutdownHandler = null;

    this.setupAgentMonitoring();
    this.setupGracefulShutdown();
  }

  /**
   * Get appropriate agent for URL
   * @param {string} protocol - URL protocol, including the trailing colon (e.g. `'https:'`)
   * @returns {Agent} - HTTPS agent for `'https:'`, HTTP agent for anything else
   */
  getAgent(protocol) {
    return protocol === 'https:' ? this.httpsAgent : this.httpAgent;
  }

  /**
   * Execute HTTP request with connection pooling.
   *
   * Updates the pool metrics, waits out backpressure if it is active, tracks
   * the request while it is in flight, and emits `requestCompleted` or
   * `requestFailed`.
   *
   * @param {Object} options - Request options (see executeRequest)
   * @returns {Promise<Object>} - Request result
   * @throws Rethrows whatever tracking or the underlying fetch throws
   */
  async request(options) {
    const requestId = this.generateRequestId();
    const startTime = Date.now();

    this.metrics.totalRequests++;
    this.metrics.activeRequests++;
    this.metrics.peakConcurrency = Math.max(this.metrics.peakConcurrency, this.metrics.activeRequests);

    try {
      // Check for backpressure
      if (this.shouldApplyBackpressure()) {
        await this.handleBackpressure(requestId);
      }

      // Track connection (throws on an unparsable URL, which is reported as a failure below)
      this.trackConnection(requestId, options, startTime);

      // Execute request
      const result = await this.executeRequest(options, requestId);

      // Update metrics
      const duration = Date.now() - startTime;
      this.updateSuccessMetrics(duration);

      this.emit('requestCompleted', { requestId, duration, options });

      return result;
    } catch (error) {
      this.metrics.failedRequests++;
      this.emit('requestFailed', { requestId, error: error.message, options });
      throw error;
    } finally {
      this.metrics.activeRequests--;
      this.untrackConnection(requestId);
    }
  }

  /**
   * Execute multiple requests in sequential chunks of at most `maxConcurrent`
   * parallel requests.
   *
   * Results are returned in the same order as `requests`. A failed request is
   * represented as `{ error: <message>, options: <original request> }` unless
   * `failFast` is set, in which case the first failure rejects the batch with
   * the original error.
   *
   * @param {Array} requests - Array of request options
   * @param {Object} batchOptions - Batching configuration
   * @param {number} [batchOptions.maxConcurrent] - Chunk size; floored and clamped to at least 1
   * @param {boolean} [batchOptions.failFast=false] - Reject on the first failed request
   * @param {boolean} [batchOptions.retryFailures=false] - Retry failed requests once
   * @param {number} [batchOptions.batchDelay=0] - Delay in ms between chunks
   * @returns {Promise<Array>} - Array of results
   */
  async requestBatch(requests, batchOptions = {}) {
    const {
      maxConcurrent = Math.min(this.maxSockets * 0.8, 20),
      failFast = false,
      retryFailures = false,
      batchDelay = 0
    } = batchOptions;

    // The default (maxSockets * 0.8) can be fractional or below 1, and a chunk
    // size of 0 would make chunkArray() loop forever.
    const chunkSize = Math.max(1, Math.floor(maxConcurrent) || 1);

    const chunks = this.chunkArray(requests, chunkSize);
    const results = [];
    // Positions (in `requests`) of the requests that failed; tracked explicitly
    // instead of guessing from the shape of the result objects.
    const failedIndexes = [];
    let offset = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];

      if (i > 0 && batchDelay > 0) {
        await this.delay(batchDelay);
      }

      const chunkBase = offset;
      const chunkPromises = chunk.map(async (requestOptions, position) => {
        try {
          return await this.request(requestOptions);
        } catch (error) {
          // In fail-fast mode the original error rejects Promise.all below.
          if (failFast) throw error;
          failedIndexes.push(chunkBase + position);
          return { error: error.message, options: requestOptions };
        }
      });

      const chunkResults = await Promise.all(chunkPromises);
      results.push(...chunkResults);
      offset += chunk.length;

      // Check for backpressure between chunks
      if (this.shouldApplyBackpressure()) {
        await this.handleBackpressure(`batch_${i}`);
      }
    }

    // Retry failures if requested
    if (retryFailures && failedIndexes.length > 0) {
      // Failures are recorded in completion order; restore request order.
      failedIndexes.sort((a, b) => a - b);

      const retryResults = await this.requestBatch(
        failedIndexes.map((index) => requests[index]),
        {
          ...batchOptions,
          retryFailures: false // Prevent infinite recursion
        }
      );

      // Merge retry results back
      failedIndexes.forEach((index, retryIndex) => {
        results[index] = retryResults[retryIndex];
      });
    }

    return results;
  }

  /**
   * Execute the actual HTTP request through the pooled agent.
   *
   * @param {Object} options - Request options
   * @param {string} options.url - Absolute URL to fetch
   * @param {string} [options.method='GET'] - HTTP method
   * @param {Object} [options.headers] - Extra headers (override the default User-Agent)
   * @param {*} [options.body] - Request body
   * @param {number} [options.timeout] - Timeout in ms, defaults to the pool timeout
   * @param {string} requestId - Request ID (currently unused here)
   * @returns {Promise<Object>} - `{ status, statusText, headers, url, body, ok }` with the body read as text
   */
  async executeRequest(options, requestId) {
    // node-fetch exposes fetch as the module's default export; a named `fetch`
    // export is preferred when present so either module shape works.
    // NOTE(review): the installed node-fetch version is not visible here —
    // confirm that the `timeout` option below is honoured by it.
    const fetchModule = await import('node-fetch');
    const fetch = typeof fetchModule.fetch === 'function' ? fetchModule.fetch : fetchModule.default;
    if (typeof fetch !== 'function') {
      throw new TypeError('node-fetch did not provide a fetch function');
    }

    const {
      url,
      method = 'GET',
      headers = {},
      body,
      timeout = this.timeout,
      ...otherOptions
    } = options;

    const urlObj = new URL(url);
    const agent = this.getAgent(urlObj.protocol);

    const fetchOptions = {
      method,
      headers: {
        'User-Agent': config.crawling.userAgent,
        ...headers
      },
      body,
      timeout,
      agent,
      ...otherOptions
    };

    const response = await fetch(url, fetchOptions);

    // Check if connection was reused
    if (this.wasConnectionReused(agent, urlObj.hostname)) {
      this.metrics.connectionReuse++;
    }

    return {
      status: response.status,
      statusText: response.statusText,
      headers: Object.fromEntries(response.headers.entries()),
      url: response.url,
      body: await response.text(),
      ok: response.ok
    };
  }

  /**
   * Check if backpressure should be applied.
   *
   * The load ratio is re-evaluated at most once per second; in between, the
   * last computed state is returned. Emits `backpressureActivated` /
   * `backpressureDeactivated` on state changes.
   *
   * @returns {boolean} - Whether to apply backpressure
   */
  shouldApplyBackpressure() {
    const now = Date.now();

    // Only check periodically to avoid overhead
    if (now - this.lastBackpressureCheck < 1000) {
      return this.backpressureActive;
    }

    this.lastBackpressureCheck = now;

    const activeRatio = this.metrics.activeRequests / this.maxSockets;
    const shouldActivate = activeRatio > this.backpressureThreshold;

    if (shouldActivate && !this.backpressureActive) {
      this.backpressureActive = true;
      this.metrics.backpressureEvents++;
      this.emit('backpressureActivated', { activeRequests: this.metrics.activeRequests });
    } else if (!shouldActivate && this.backpressureActive) {
      this.backpressureActive = false;
      this.emit('backpressureDeactivated', { activeRequests: this.metrics.activeRequests });
    }

    return this.backpressureActive;
  }

  /**
   * Handle backpressure by delaying request.
   *
   * The delay grows as 100ms * 1.5^n, capped at 2000ms, where n is the number
   * of backpressure activations so far (capped at 10). That counter is
   * cumulative and never reset, so the delay does not shrink again.
   *
   * @param {string} requestId - Request ID
   */
  async handleBackpressure(requestId) {
    const baseDelay = 100;
    const maxDelay = 2000;
    const backoffMultiplier = Math.min(this.metrics.backpressureEvents, 10);

    const delay = Math.min(baseDelay * Math.pow(1.5, backoffMultiplier), maxDelay);

    this.emit('backpressureDelay', { requestId, delay });
    await this.delay(delay);
  }

  /**
   * Track active connection and update the per-host counters.
   *
   * @param {string} requestId - Request ID
   * @param {Object} options - Request options (`url` must be an absolute URL)
   * @param {number} startTime - Request start time (ms since epoch)
   * @throws {TypeError} If `options.url` cannot be parsed
   */
  trackConnection(requestId, options, startTime) {
    const host = new URL(options.url).hostname;

    this.activeConnections.set(requestId, {
      url: options.url,
      method: options.method || 'GET',
      startTime,
      host
    });

    // Update per-host statistics
    if (!this.connectionStats.has(host)) {
      this.connectionStats.set(host, {
        totalRequests: 0,
        activeRequests: 0,
        completedRequests: 0,
        avgResponseTime: 0,
        lastRequestTime: startTime
      });
    }

    const hostStats = this.connectionStats.get(host);
    hostStats.totalRequests++;
    hostStats.activeRequests++;
    hostStats.lastRequestTime = startTime;
  }

  /**
   * Stop tracking connection and fold its duration into the host's average.
   *
   * @param {string} requestId - Request ID
   */
  untrackConnection(requestId) {
    const connection = this.activeConnections.get(requestId);
    if (!connection) {
      return;
    }

    const hostStats = this.connectionStats.get(connection.host);
    if (hostStats) {
      hostStats.activeRequests--;

      // Average over finished requests only: totalRequests also counts
      // requests that are still in flight and would skew the mean.
      const duration = Date.now() - connection.startTime;
      const completed = (hostStats.completedRequests || 0) + 1;
      hostStats.avgResponseTime += (duration - hostStats.avgResponseTime) / completed;
      hostStats.completedRequests = completed;
    }

    this.activeConnections.delete(requestId);
  }

  /**
   * Check if connection was reused (heuristic).
   *
   * The agent keys its socket maps by connection name
   * (`host:port:localAddress...`), not by bare hostname, so entries are
   * matched on the `hostname:` prefix.
   *
   * @param {Agent} agent - HTTP/HTTPS agent
   * @param {string} hostname - Target hostname
   * @returns {boolean} - Whether connection was likely reused
   */
  wasConnectionReused(agent, hostname) {
    const prefix = `${hostname}:`;
    const countForHost = (socketMap) => {
      const map = socketMap || {};
      return Object.keys(map).reduce((total, name) => {
        const matches = name === hostname || name.startsWith(prefix);
        return matches ? total + map[name].length : total;
      }, 0);
    };

    // If there are free sockets or multiple sockets, connection was likely reused
    return countForHost(agent.freeSockets) > 0 || countForHost(agent.sockets) > 1;
  }

  /**
   * Update success metrics
   * @param {number} duration - Request duration in ms
   */
  updateSuccessMetrics(duration) {
    this.metrics.completedRequests++;

    // Update average response time
    const totalCompleted = this.metrics.completedRequests;
    this.metrics.avgResponseTime = (
      (this.metrics.avgResponseTime * (totalCompleted - 1) + duration) / totalCompleted
    );
  }

  /**
   * Generate unique request ID
   * @returns {string} - Request ID of the form `req_<timestamp>_<random base36>`
   */
  generateRequestId() {
    return `req_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /**
   * Split array into chunks
   * @param {Array} array - Array to chunk
   * @param {number} chunkSize - Size of each chunk (must be a positive number)
   * @returns {Array} - Array of chunks
   */
  chunkArray(array, chunkSize) {
    const chunks = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  }

  /**
   * Delay execution
   * @param {number} ms - Milliseconds to delay
   * @returns {Promise<void>}
   */
  delay(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Setup agent monitoring: wrap each agent's createConnection so that a
   * `connectionCreated` event is emitted whenever a new socket is opened.
   * Does nothing when metrics are disabled.
   */
  setupAgentMonitoring() {
    if (!this.enableMetrics) return;

    const monitorAgent = (agent, protocol) => {
      const originalCreateConnection = agent.createConnection;
      agent.createConnection = (...args) => {
        this.emit('connectionCreated', { protocol });
        return originalCreateConnection.apply(agent, args);
      };
    };

    monitorAgent(this.httpAgent, 'http');
    monitorAgent(this.httpsAgent, 'https');
  }

  /**
   * Count the sockets held in an agent socket map.
   * @param {Object} socketMap - `agent.sockets` or `agent.freeSockets`
   * @returns {number} - Total sockets across all connection names
   */
  countSockets(socketMap) {
    return Object.values(socketMap || {}).reduce((total, list) => total + list.length, 0);
  }

  /**
   * Destroy the idle (free) sockets of an agent, leaving sockets that are
   * serving a request untouched.
   * @param {Agent} agent - HTTP/HTTPS agent
   */
  destroyIdleSockets(agent) {
    for (const idleSockets of Object.values(agent.freeSockets || {})) {
      // Copy first: destroying a socket may remove it from the agent's list.
      for (const socket of [...idleSockets]) {
        socket.destroy();
      }
    }
  }

  /**
   * Cleanup idle connections and stale statistics.
   *
   * Only idle sockets are closed here; calling agent.destroy() would also
   * tear down sockets that are serving in-flight requests.
   */
  cleanup() {
    const now = Date.now();
    const maxIdleTime = 300000; // 5 minutes

    // Cleanup stale host statistics
    for (const [host, stats] of this.connectionStats.entries()) {
      if (now - stats.lastRequestTime > maxIdleTime && stats.activeRequests === 0) {
        this.connectionStats.delete(host);
      }
    }

    // Close idle sockets on agents
    this.destroyIdleSockets(this.httpAgent);
    this.destroyIdleSockets(this.httpsAgent);

    this.emit('cleanup', {
      hostsTracked: this.connectionStats.size,
      activeConnections: this.activeConnections.size
    });
  }

  /**
   * Get connection pool statistics
   * @returns {Object} - Pool metrics plus tracking sizes and socket counts per agent
   */
  getStats() {
    return {
      ...this.metrics,
      activeConnections: this.activeConnections.size,
      hostsTracked: this.connectionStats.size,
      backpressureActive: this.backpressureActive,
      httpSockets: this.countSockets(this.httpAgent.sockets),
      httpFreeSockets: this.countSockets(this.httpAgent.freeSockets),
      httpsSockets: this.countSockets(this.httpsAgent.sockets),
      httpsFreeSockets: this.countSockets(this.httpsAgent.freeSockets)
    };
  }

  /**
   * Get per-host statistics
   * @returns {Map} - Shallow copy of the host statistics map
   */
  getHostStats() {
    return new Map(this.connectionStats);
  }

  /**
   * Get active connections information
   * @returns {Array} - Active connections, each with its id and elapsed duration in ms
   */
  getActiveConnections() {
    return Array.from(this.activeConnections.entries()).map(([id, info]) => ({
      id,
      ...info,
      duration: Date.now() - info.startTime
    }));
  }

  /**
   * Force close all connections, including ones serving in-flight requests,
   * and drop all tracking state.
   */
  async closeAllConnections() {
    // Destroy all agent connections
    this.httpAgent.destroy();
    this.httpsAgent.destroy();

    // Clear tracking
    this.activeConnections.clear();
    this.connectionStats.clear();

    this.emit('allConnectionsClosed');
  }

  /**
   * Remove the SIGTERM/SIGINT listeners installed by setupGracefulShutdown().
   */
  removeSignalHandlers() {
    if (!this.shutdownHandler) {
      return;
    }

    process.removeListener('SIGTERM', this.shutdownHandler);
    process.removeListener('SIGINT', this.shutdownHandler);
    this.shutdownHandler = null;
  }

  /**
   * Graceful shutdown: stop housekeeping, wait up to 30 seconds for tracked
   * requests to finish, then force close whatever is left.
   * @returns {Promise<void>}
   */
  async shutdown() {
    this.emit('shutdown');

    // Unregister signal listeners so discarded pools do not accumulate on `process`
    this.removeSignalHandlers();

    // Clear cleanup timer
    if (this.cleanupTimer) {
      clearInterval(this.cleanupTimer);
      this.cleanupTimer = null;
    }

    // Wait for active requests to complete or timeout
    const shutdownTimeout = 30000; // 30 seconds
    const startTime = Date.now();

    while (this.activeConnections.size > 0 && (Date.now() - startTime) < shutdownTimeout) {
      await this.delay(100);
    }

    // Force close remaining connections
    await this.closeAllConnections();
  }

  /**
   * Setup graceful shutdown handlers for SIGTERM and SIGINT.
   * The handler is stored on the instance so shutdown() can remove it.
   */
  setupGracefulShutdown() {
    // Never register twice for the same pool.
    this.removeSignalHandlers();

    this.shutdownHandler = async () => {
      console.log('ConnectionPool: Graceful shutdown initiated');
      await this.shutdown();
    };

    process.on('SIGTERM', this.shutdownHandler);
    process.on('SIGINT', this.shutdownHandler);
  }
}
552
+
553
+ export default ConnectionPool;