crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,342 @@
1
+ /**
2
+ * ErrorHandlingConfig - Centralized configuration for error handling systems
3
+ * Provides default configurations and factory methods for error handling components
4
+ */
5
+
6
+ import { RetryManager } from './RetryManager.js';
7
+ import { createCircuitBreaker } from './CircuitBreaker.js';
8
+ import { createLogger } from './Logger.js';
9
+
10
/**
 * Preset error-handling configurations, keyed by service type
 * (scraping, api, network, processing, critical).
 *
 * Every preset carries three sections:
 *   - retry:          options handed to RetryManager
 *   - circuitBreaker: options handed to createCircuitBreaker
 *   - logger:         options handed to createLogger
 *
 * NOTE(review): delay/timeout values are presumably milliseconds - confirm
 * against RetryManager and CircuitBreaker.
 */
export const DEFAULT_CONFIGS = (() => {
  // The list builders return a fresh array on every call so that no two
  // presets ever share one mutable list.
  const connectionErrors = (...extra) => [
    'ECONNRESET', 'ENOTFOUND', 'ECONNREFUSED', 'ETIMEDOUT', ...extra
  ];
  const httpStatuses = (...extra) => [408, 429, 500, 502, 503, 504, ...extra];
  const EXTENDED_5XX = [520, 521, 522, 523, 524];

  // Exponential backoff with jitter; only attempt count and delay bounds vary.
  const exponentialBackoff = (maxRetries, baseDelay, maxDelay) => ({
    maxRetries,
    baseDelay,
    maxDelay,
    strategy: 'exponential',
    jitter: true
  });

  // Error tracking is enabled in every preset; the other settings vary.
  const loggerOptions = (level, enableRequestTracking, enablePerformanceTracking) => ({
    level,
    enableRequestTracking,
    enablePerformanceTracking,
    enableErrorTracking: true
  });

  return {
    // Web scraping operations
    scraping: {
      retry: {
        ...exponentialBackoff(3, 1000, 30000),
        retryableErrors: connectionErrors('EPIPE'),
        retryableStatusCodes: httpStatuses(...EXTENDED_5XX)
      },
      circuitBreaker: {
        threshold: 5,
        timeout: 30000,
        resetTimeout: 60000,
        halfOpenMaxCalls: 3,
        errorThresholdPercentage: 50,
        minimumThroughput: 10
      },
      logger: loggerOptions('info', true, true)
    },

    // API operations (Google Search, etc.)
    api: {
      retry: {
        ...exponentialBackoff(4, 1000, 16000),
        retryableErrors: connectionErrors(),
        retryableStatusCodes: httpStatuses(...EXTENDED_5XX)
      },
      circuitBreaker: {
        threshold: 5,
        timeout: 30000,
        resetTimeout: 120000,
        halfOpenMaxCalls: 3,
        errorThresholdPercentage: 50,
        minimumThroughput: 20
      },
      logger: loggerOptions('info', true, true)
    },

    // Network operations (general HTTP requests)
    network: {
      retry: {
        ...exponentialBackoff(3, 1000, 30000),
        retryableErrors: connectionErrors('EPIPE'),
        retryableStatusCodes: httpStatuses()
      },
      circuitBreaker: {
        threshold: 3,
        timeout: 10000,
        resetTimeout: 60000,
        halfOpenMaxCalls: 2,
        errorThresholdPercentage: 40,
        minimumThroughput: 10
      },
      logger: loggerOptions('info', true, false)
    },

    // File processing operations
    processing: {
      retry: {
        maxRetries: 2,
        baseDelay: 2000,
        maxDelay: 60000,
        strategy: 'linear',
        jitter: false,
        retryableErrors: ['EMFILE', 'ENFILE', 'EBUSY', 'ENOENT'],
        retryableStatusCodes: []
      },
      circuitBreaker: {
        threshold: 10,
        timeout: 60000,
        resetTimeout: 300000,
        halfOpenMaxCalls: 5,
        errorThresholdPercentage: 70,
        minimumThroughput: 5
      },
      logger: loggerOptions('info', false, true)
    },

    // Critical operations (require high reliability)
    critical: {
      retry: {
        ...exponentialBackoff(5, 500, 10000),
        retryableErrors: connectionErrors('EPIPE'),
        retryableStatusCodes: httpStatuses(...EXTENDED_5XX)
      },
      circuitBreaker: {
        threshold: 3,
        timeout: 15000,
        resetTimeout: 30000,
        halfOpenMaxCalls: 2,
        errorThresholdPercentage: 30,
        minimumThroughput: 5
      },
      logger: loggerOptions('debug', true, true)
    }
  };
})();
149
+
150
/**
 * Factory for building error-handling components (retry manager, circuit
 * breaker, logger) from the presets in DEFAULT_CONFIGS.
 *
 * Every circuit breaker and logger produced here is created under the fixed
 * name 'default'.
 */
export class ErrorHandlingFactory {
  /**
   * Create a complete error handling suite for a service type.
   *
   * Overrides are merged shallowly over the preset: an override key such as
   * `retry`, `circuitBreaker` or `logger` replaces that whole preset section
   * rather than being merged into it.
   *
   * @param {string} serviceType - Type of service (scraping, api, network, processing, critical)
   * @param {Object} overrides - Configuration overrides
   * @returns {Object} Error handling components: { retryManager, circuitBreaker, logger }
   * @throws {Error} If serviceType is not a key of DEFAULT_CONFIGS
   */
  static createSuite(serviceType = 'scraping', overrides = {}) {
    // Validate before merging: an object spread always yields a (truthy)
    // object, so checking the merged result could never detect an unknown
    // type. The own-property test also rejects inherited names like 'toString'.
    if (!Object.prototype.hasOwnProperty.call(DEFAULT_CONFIGS, serviceType)) {
      throw new Error(`Unknown service type: ${serviceType}. Available types: ${Object.keys(DEFAULT_CONFIGS).join(', ')}`);
    }

    const config = { ...DEFAULT_CONFIGS[serviceType], ...overrides };

    return {
      retryManager: new RetryManager(config.retry),
      circuitBreaker: createCircuitBreaker('default', config.circuitBreaker),
      logger: createLogger('default', config.logger)
    };
  }

  /**
   * Create retry manager with preset configuration.
   * An unknown serviceType contributes no preset values, so only the
   * overrides are passed on.
   *
   * @param {string} serviceType - Service type
   * @param {Object} overrides - Retry options merged over the preset's `retry` section
   * @returns {RetryManager} Configured retry manager
   */
  static createRetryManager(serviceType = 'scraping', overrides = {}) {
    const config = { ...DEFAULT_CONFIGS[serviceType]?.retry, ...overrides };
    return new RetryManager(config);
  }

  /**
   * Create circuit breaker with preset configuration.
   * An unknown serviceType contributes no preset values, so only the
   * overrides are passed on.
   *
   * @param {string} serviceType - Service type
   * @param {Object} overrides - Options merged over the preset's `circuitBreaker` section
   * @returns {CircuitBreaker} Configured circuit breaker
   */
  static createCircuitBreaker(serviceType = 'scraping', overrides = {}) {
    const config = { ...DEFAULT_CONFIGS[serviceType]?.circuitBreaker, ...overrides };
    return createCircuitBreaker('default', config);
  }

  /**
   * Create logger with preset configuration.
   * An unknown serviceType contributes no preset values, so only the
   * overrides are passed on.
   *
   * @param {string} serviceType - Service type
   * @param {Object} overrides - Options merged over the preset's `logger` section
   * @returns {Logger} Configured logger
   */
  static createLogger(serviceType = 'scraping', overrides = {}) {
    const config = { ...DEFAULT_CONFIGS[serviceType]?.logger, ...overrides };
    return createLogger('default', config);
  }
}
207
+
208
/**
 * Registry that hands out one cached error-handling suite per service name,
 * optionally built from a custom configuration registered for that name.
 */
export class ErrorHandlingConfigManager {
  constructor() {
    // serviceName -> custom configuration stored by registerConfig()
    this.configs = new Map();
    // serviceName -> suite already built by getSuite()
    this.instances = new Map();
  }

  /**
   * Register a custom configuration for a service.
   *
   * Only the stored configuration is updated; a suite already cached for
   * this service name keeps being returned by getSuite() until clearCache()
   * is called.
   *
   * @param {string} serviceName - Service name
   * @param {Object} config - Error handling configuration
   */
  registerConfig(serviceName, config) {
    this.configs.set(serviceName, config);
  }

  /**
   * Get error handling suite for a service, building and caching it on the
   * first request. On later calls the cached suite is returned and
   * serviceType is ignored.
   *
   * @param {string} serviceName - Service name
   * @param {string} serviceType - Preset used as the base configuration; a
   *   registered custom configuration is applied on top of it as overrides
   * @returns {Object} Error handling components
   */
  getSuite(serviceName, serviceType = 'scraping') {
    if (this.instances.has(serviceName)) {
      return this.instances.get(serviceName);
    }

    const config = this.configs.get(serviceName);
    // A registered config overrides the caller's preset. The base used to be
    // hard-coded to 'scraping' here, which silently discarded serviceType.
    const suite = config
      ? ErrorHandlingFactory.createSuite(serviceType, config)
      : ErrorHandlingFactory.createSuite(serviceType);

    this.instances.set(serviceName, suite);
    return suite;
  }

  /**
   * Clear cached instances (useful for testing).
   * Circuit breakers exposing destroy() and loggers exposing close() are
   * shut down first. Registered configurations are kept.
   */
  clearCache() {
    for (const suite of this.instances.values()) {
      if (suite.circuitBreaker && typeof suite.circuitBreaker.destroy === 'function') {
        suite.circuitBreaker.destroy();
      }
      if (suite.logger && typeof suite.logger.close === 'function') {
        suite.logger.close();
      }
    }

    this.instances.clear();
  }

  /**
   * Get statistics for every service that currently has a cached suite.
   *
   * @returns {Object} Map of service name to { retry, circuitBreaker, logger }
   *   stats; an entry is undefined when that component is missing
   */
  getAllStats() {
    const stats = {};

    for (const [serviceName, suite] of this.instances.entries()) {
      stats[serviceName] = {
        retry: suite.retryManager?.getStats(),
        circuitBreaker: suite.circuitBreaker?.getStats(),
        logger: suite.logger?.getStats()
      };
    }

    return stats;
  }
}
281
+
282
+ // Global instance: the shared ErrorHandlingConfigManager that getErrorHandling() and registerErrorHandlingConfig() delegate to
283
+ export const errorHandlingConfig = new ErrorHandlingConfigManager();
284
+
285
/**
 * Convenience wrapper that looks up the error handling suite for a service
 * on the shared `errorHandlingConfig` manager.
 *
 * @param {string} serviceName - Service name
 * @param {string} serviceType - Service type forwarded to the manager
 * @returns {Object} Error handling components
 */
export function getErrorHandling(serviceName, serviceType = 'scraping') {
  const suite = errorHandlingConfig.getSuite(serviceName, serviceType);
  return suite;
}
294
+
295
/**
 * Convenience wrapper that stores a custom error handling configuration for
 * a service on the shared `errorHandlingConfig` manager.
 *
 * @param {string} serviceName - Service name
 * @param {Object} config - Configuration object
 */
export function registerErrorHandlingConfig(serviceName, config) {
  const manager = errorHandlingConfig;
  manager.registerConfig(serviceName, config);
}
303
+
304
/**
 * Decorator for adding error handling to class methods.
 *
 * The returned function has the (target, propertyKey, descriptor) method
 * decorator shape. It replaces `descriptor.value` with an async wrapper that
 * logs the start of the request, runs the original method through the
 * suite's retry manager and circuit breaker, and logs success or failure.
 * The suite is resolved once, when the decorator is applied.
 *
 * @param {string} serviceName - Service name for error handling
 * @param {string} serviceType - Service type
 * @returns {Function} Decorator function
 * @throws {TypeError} From the decorator function, when the decorated member
 *   is not a method (descriptor.value is not a function)
 */
export function withErrorHandling(serviceName, serviceType = 'scraping') {
  return function(target, propertyKey, descriptor) {
    const originalMethod = descriptor?.value;

    // Fail at decoration time with a clear message rather than producing an
    // invalid descriptor (accessors) or an obscure error on first call.
    if (typeof originalMethod !== 'function') {
      throw new TypeError(
        `withErrorHandling can only decorate methods; "${String(propertyKey)}" is not a function`
      );
    }

    const suite = getErrorHandling(serviceName, serviceType);

    descriptor.value = async function(...args) {
      const requestId = suite.logger.startRequest({
        method: propertyKey,
        service: serviceName,
        // Only the argument count is logged, not the argument values.
        args: args.length
      });

      try {
        const result = await suite.retryManager.executeWithCircuitBreaker(
          // Preserve the caller's `this` for the wrapped method.
          () => originalMethod.apply(this, args),
          suite.circuitBreaker,
          serviceName,
          { method: propertyKey }
        );

        suite.logger.endRequest(requestId, { success: true });
        return result;
      } catch (error) {
        suite.logger.requestError(requestId, error, { method: propertyKey });
        throw error;
      }
    };

    return descriptor;
  };
}
341
+
342
+ export default ErrorHandlingFactory;