crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,568 @@
1
+ /**
2
+ * Logger - Comprehensive logging system with Winston integration
3
+ * Features: Structured logging, multiple transports, contextual logging, request tracking
4
+ */
5
+
6
+ import winston from 'winston';
7
+ import { fileURLToPath } from 'url';
8
+ import { dirname, join } from 'path';
9
+ import { existsSync, mkdirSync } from 'fs';
10
+
11
// ES modules do not provide the CommonJS `__filename` / `__dirname` globals,
// so both are rebuilt from this module's own URL. `__dirname` is the base for
// the default log directory used by the Logger constructor.
const __filename = fileURLToPath(new URL(import.meta.url));
const __dirname = dirname(__filename);
13
+
14
export class Logger {
  /**
   * Build a logger backed by a Winston instance.
   *
   * @param {Object} [options] - Logger configuration
   * @param {string} [options.level] - Minimum level; defaults to LOG_LEVEL or 'info'
   * @param {string} [options.service] - Service name attached to every entry
   * @param {boolean} [options.enableConsole] - Attach a console transport
   * @param {boolean} [options.enableFile] - Attach file transports under `logDir`
   * @param {boolean} [options.enableJson] - Emit JSON instead of the text layout
   * @param {string} [options.logDir] - Directory for log files
   * @param {number} [options.maxFiles] - Rotated files to keep per log
   * @param {number|string} [options.maxSize] - Max size per file: bytes, or a
   *   string such as '10m' / '512k' / '1g'
   * @param {boolean} [options.enableRequestTracking] - Track start/end of requests
   * @param {boolean} [options.enableErrorTracking] - Emit an extra structured entry per error
   * @param {boolean} [options.enablePerformanceTracking] - Maintain timing metrics
   */
  constructor(options = {}) {
    const {
      level = process.env.LOG_LEVEL || 'info',
      service = 'crawlforge',
      enableConsole = true,
      enableFile = true,
      enableJson = process.env.NODE_ENV === 'production',
      logDir = join(__dirname, '../../logs'),
      maxFiles = 5,
      maxSize = '10m',
      enableRequestTracking = true,
      enableErrorTracking = true,
      enablePerformanceTracking = true
    } = options;

    this.service = service;
    this.enableRequestTracking = enableRequestTracking;
    this.enableErrorTracking = enableErrorTracking;
    this.enablePerformanceTracking = enablePerformanceTracking;
    this.logDir = logDir;

    // Only touch the filesystem when file transports will actually be used;
    // console-only / test configurations should not create a log directory.
    if (enableFile) {
      this.ensureLogDirectory();
    }

    // Request tracking
    this.requests = new Map(); // requestId -> request context
    this.requestCounter = 0;

    // Performance metrics
    this.metrics = {
      requests: 0,            // requests started
      completedRequests: 0,   // requests that contributed to averageResponseTime
      errors: 0,
      warnings: 0,
      averageResponseTime: 0,
      slowRequests: 0
    };

    // Create Winston logger
    this.winston = winston.createLogger({
      level,
      defaultMeta: { service: this.service },
      format: this.createFormat(enableJson),
      transports: this.createTransports(enableConsole, enableFile, maxFiles, maxSize),
      exitOnError: false
    });

    // Error handling for logger itself
    this.winston.on('error', (error) => {
      console.error('Logger error:', error);
    });
  }

  /**
   * Convert a size setting into a byte count.
   * Accepts a positive finite number (already bytes) or a string made of a
   * number plus an optional k/m/g unit (1024-based, case-insensitive, with an
   * optional trailing "b"), e.g. '10m', '512KB', '1.5g'.
   *
   * @param {number|string} size - Size setting
   * @returns {number|undefined} Size in bytes, or undefined when it cannot be parsed
   */
  static parseSize(size) {
    if (typeof size === 'number') {
      return Number.isFinite(size) && size > 0 ? Math.floor(size) : undefined;
    }
    if (typeof size !== 'string') {
      return undefined;
    }

    const match = /^(\d+(?:\.\d+)?)\s*([kmg])?b?$/i.exec(size.trim());
    if (!match) {
      return undefined;
    }

    const multipliers = { k: 1024, m: 1024 ** 2, g: 1024 ** 3 };
    const unit = match[2] ? match[2].toLowerCase() : null;
    const bytes = Math.floor(Number(match[1]) * (unit ? multipliers[unit] : 1));
    return bytes > 0 ? bytes : undefined;
  }

  /**
   * Create Winston format configuration
   * @param {boolean} enableJson - Whether to use JSON format
   * @returns {winston.Format} Winston format
   */
  createFormat(enableJson) {
    const formats = [
      winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss.SSS' }),
      winston.format.errors({ stack: true }),
      winston.format.metadata({ fillExcept: ['message', 'level', 'timestamp', 'service'] })
    ];

    if (enableJson) {
      formats.push(winston.format.json());
    } else {
      // No colorize here: this format feeds the file transports, where ANSI
      // escapes are noise, and upper-casing a colorized level would corrupt the
      // escape sequence. The console transport applies its own colorize.
      formats.push(winston.format.printf((info) => this.formatMessage(info)));
    }

    return winston.format.combine(...formats);
  }

  /**
   * Create Winston transports
   * @param {boolean} enableConsole - Enable console transport
   * @param {boolean} enableFile - Enable file transport
   * @param {number} maxFiles - Maximum log files to keep
   * @param {number|string} maxSize - Maximum size per log file (bytes or e.g. '10m')
   * @returns {Array} Array of Winston transports
   */
  createTransports(enableConsole, enableFile, maxFiles, maxSize) {
    const transports = [];

    if (enableConsole) {
      transports.push(new winston.transports.Console({
        format: winston.format.combine(
          winston.format.colorize(),
          winston.format.simple()
        )
      }));
    }

    if (enableFile) {
      // Winston's File transport takes `maxsize` in bytes; a string such as
      // '10m' would never trigger rotation, so convert it first.
      const maxsize = Logger.parseSize(maxSize);

      // General log file
      transports.push(new winston.transports.File({
        filename: join(this.logDir, 'app.log'),
        maxFiles,
        maxsize,
        tailable: true
      }));

      // Error log file
      transports.push(new winston.transports.File({
        filename: join(this.logDir, 'error.log'),
        level: 'error',
        maxFiles,
        maxsize,
        tailable: true
      }));

      // Performance log file
      // NOTE(review): this transport has no filter, so it receives every entry
      // at 'info' and above, not only those written via performance().
      if (this.enablePerformanceTracking) {
        transports.push(new winston.transports.File({
          filename: join(this.logDir, 'performance.log'),
          level: 'info',
          maxFiles,
          maxsize,
          tailable: true,
          format: winston.format.combine(
            winston.format.timestamp(),
            winston.format.json()
          )
        }));
      }
    }

    return transports;
  }

  /**
   * Format log message for text (non-JSON) output
   * @param {Object} info - Log info object
   * @returns {string} Formatted message
   */
  formatMessage(info) {
    const { timestamp, level, message, service, metadata = {} } = info;
    let formatted = `[${timestamp}] ${String(level).toUpperCase()} [${service}]`;

    // Add request ID if available
    if (metadata.requestId) {
      formatted += ` [${metadata.requestId}]`;
    }

    formatted += `: ${message}`;

    // Add context information
    if (metadata.context && Object.keys(metadata.context).length > 0) {
      let serialized;
      try {
        serialized = JSON.stringify(metadata.context);
      } catch {
        // Circular references or BigInt values must not break logging.
        serialized = '[unserializable context]';
      }
      formatted += ` | Context: ${serialized}`;
    }

    // Add error stack if available. error() stores it under metadata.error.
    const stack = metadata.stack ?? metadata.error?.stack;
    if (stack) {
      formatted += `\n${stack}`;
    }

    return formatted;
  }

  /**
   * Ensure log directory exists
   */
  ensureLogDirectory() {
    if (!existsSync(this.logDir)) {
      mkdirSync(this.logDir, { recursive: true });
    }
  }

  /**
   * Generate unique request ID
   * @returns {string} Request ID
   */
  generateRequestId() {
    return `req_${Date.now()}_${++this.requestCounter}`;
  }

  /**
   * Start request tracking
   * @param {Object} context - Request context
   * @returns {string|null} Request ID, or null when request tracking is disabled
   */
  startRequest(context = {}) {
    if (!this.enableRequestTracking) {
      return null;
    }

    const requestId = this.generateRequestId();
    const startTime = Date.now();

    this.requests.set(requestId, {
      ...context,
      startTime,
      requestId
    });

    this.metrics.requests++;

    // Pass requestId as the dedicated argument so it lands in the top-level
    // metadata that formatMessage reads for the "[requestId]" tag.
    this.info('Request started', {
      requestId,
      context,
      startTime
    }, requestId);

    return requestId;
  }

  /**
   * End request tracking
   * @param {string} requestId - Request ID
   * @param {Object} result - Request result
   */
  endRequest(requestId, result = {}) {
    if (!this.enableRequestTracking || !requestId) {
      return;
    }

    const request = this.requests.get(requestId);
    if (!request) {
      return;
    }

    const endTime = Date.now();
    const duration = endTime - request.startTime;

    // Update performance metrics
    this.updatePerformanceMetrics(duration);

    // Log request completion
    this.info('Request completed', {
      requestId,
      duration: `${duration}ms`,
      result,
      context: request
    }, requestId);

    // Log slow requests
    if (duration > 5000) { // 5 seconds threshold
      this.warn('Slow request detected', {
        requestId,
        duration: `${duration}ms`,
        context: request
      }, requestId);
      this.metrics.slowRequests++;
    }

    this.requests.delete(requestId);
  }

  /**
   * Log request error
   * @param {string} requestId - Request ID
   * @param {Error} error - Error object
   * @param {Object} context - Additional context
   */
  requestError(requestId, error, context = {}) {
    if (!this.enableRequestTracking || !requestId) {
      this.error('Request error', error, context);
      return;
    }

    const request = this.requests.get(requestId);
    const duration = request ? Date.now() - request.startTime : null;

    this.error('Request failed', error, {
      requestId,
      // Compare against null: a 0 ms duration is a valid measurement.
      duration: duration !== null ? `${duration}ms` : 'unknown',
      context: { ...request, ...context }
    }, requestId);

    if (request) {
      this.requests.delete(requestId);
    }
  }

  /**
   * Update performance metrics with the duration of one completed request.
   * The running average is taken over completed requests only, so requests
   * still in flight or ended through requestError() do not skew it.
   * @param {number} duration - Request duration in ms
   */
  updatePerformanceMetrics(duration) {
    if (!this.enablePerformanceTracking || !Number.isFinite(duration)) {
      return;
    }

    const completed = (this.metrics.completedRequests ?? 0) + 1;
    this.metrics.completedRequests = completed;
    // Incremental mean: avg += (x - avg) / n
    this.metrics.averageResponseTime +=
      (duration - this.metrics.averageResponseTime) / completed;
  }

  /**
   * Log debug message
   * @param {string} message - Log message
   * @param {Object} context - Additional context
   * @param {string} requestId - Optional request ID
   */
  debug(message, context = {}, requestId = null) {
    this.winston.debug(message, {
      context,
      requestId,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Log info message
   * @param {string} message - Log message
   * @param {Object} context - Additional context
   * @param {string} requestId - Optional request ID
   */
  info(message, context = {}, requestId = null) {
    this.winston.info(message, {
      context,
      requestId,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Log warning message (also increments the warnings counter)
   * @param {string} message - Log message
   * @param {Object} context - Additional context
   * @param {string} requestId - Optional request ID
   */
  warn(message, context = {}, requestId = null) {
    this.metrics.warnings++;
    this.winston.warn(message, {
      context,
      requestId,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Log error message (also increments the errors counter)
   * @param {string} message - Log message
   * @param {Error} error - Error object (optional)
   * @param {Object} context - Additional context
   * @param {string} requestId - Optional request ID; falls back to
   *   `context.requestId` when omitted
   */
  error(message, error = null, context = {}, requestId = null) {
    this.metrics.errors++;

    // Do not let the default null parameter wipe a requestId supplied in context.
    const effectiveRequestId = requestId ?? context?.requestId ?? null;

    const errorContext = {
      ...context,
      requestId: effectiveRequestId
    };

    if (error) {
      errorContext.error = {
        message: error.message,
        stack: error.stack,
        name: error.name,
        code: error.code
      };
    }

    this.winston.error(message, errorContext);

    // Track error for analysis. Skipped when no error object was given,
    // e.g. logger.error('something failed').
    if (this.enableErrorTracking && error) {
      this.trackError(error, context, effectiveRequestId);
    }
  }

  /**
   * Log performance metrics
   * @param {string} operation - Operation name
   * @param {number} duration - Duration in ms
   * @param {Object} context - Additional context
   * @param {string} requestId - Optional request ID
   */
  performance(operation, duration, context = {}, requestId = null) {
    if (!this.enablePerformanceTracking) {
      return;
    }

    this.info(`Performance: ${operation}`, {
      operation,
      duration: `${duration}ms`,
      ...context
    }, requestId);
  }

  /**
   * Track error for analysis
   * @param {Error} error - Error object; the call is a no-op when missing
   * @param {Object} context - Additional context
   * @param {string} requestId - Request ID
   */
  trackError(error, context, requestId) {
    if (!error) {
      return;
    }

    // Could be extended to send to error tracking service
    // For now, just log structured error data
    this.winston.error('Error tracking', {
      errorTracking: {
        type: error.name,
        message: error.message,
        stack: error.stack,
        context,
        requestId,
        timestamp: new Date().toISOString()
      }
    });
  }

  /**
   * Create child logger with additional context
   * @param {Object} context - Additional context for all logs
   * @returns {ChildLogger} Child logger instance
   */
  child(context = {}) {
    return new ChildLogger(this, context);
  }

  /**
   * Get logger statistics
   * @returns {Object} Counters plus active request count, process uptime and memory usage
   */
  getStats() {
    return {
      ...this.metrics,
      activeRequests: this.requests.size,
      uptime: process.uptime(),
      memoryUsage: process.memoryUsage()
    };
  }

  /**
   * Set log level
   * @param {string} level - Log level
   */
  setLevel(level) {
    this.winston.level = level;
  }

  /**
   * Close logger and cleanup resources.
   * Ends the Winston stream, waits for its 'finish' event, then releases the
   * transports. The original passed a callback to `winston.close()`, which
   * does not appear to accept one, so that promise would not have settled.
   * @returns {Promise<void>} Resolves once the logger has finished
   */
  close() {
    return new Promise((resolve) => {
      // Already ended and flushed (e.g. close() called twice): nothing to wait for.
      if (this.winston.writableFinished) {
        resolve();
        return;
      }

      this.winston.once('finish', () => {
        this.winston.close();
        resolve();
      });
      this.winston.end();
    });
  }
}
463
+
464
/**
 * Child logger that forwards every call to a parent logger, merging a fixed
 * context object into the context passed with each call. Per-call context
 * keys win over the child's own keys.
 */
export class ChildLogger {
  /**
   * @param {Object} parent - Logger that receives all forwarded calls
   * @param {Object} [context] - Context merged into every forwarded call
   */
  constructor(parent, context = {}) {
    this.parent = parent;
    this.context = context;
  }

  /**
   * @param {string} message - Log message
   * @param {Object} additionalContext - Per-call context
   * @param {string} requestId - Optional request ID
   */
  debug(message, additionalContext = {}, requestId = null) {
    this.parent.debug(message, { ...this.context, ...additionalContext }, requestId);
  }

  /**
   * @param {string} message - Log message
   * @param {Object} additionalContext - Per-call context
   * @param {string} requestId - Optional request ID
   */
  info(message, additionalContext = {}, requestId = null) {
    this.parent.info(message, { ...this.context, ...additionalContext }, requestId);
  }

  /**
   * @param {string} message - Log message
   * @param {Object} additionalContext - Per-call context
   * @param {string} requestId - Optional request ID
   */
  warn(message, additionalContext = {}, requestId = null) {
    this.parent.warn(message, { ...this.context, ...additionalContext }, requestId);
  }

  /**
   * @param {string} message - Log message
   * @param {Error} error - Error object (optional)
   * @param {Object} additionalContext - Per-call context
   * @param {string} requestId - Optional request ID
   */
  error(message, error = null, additionalContext = {}, requestId = null) {
    this.parent.error(message, error, { ...this.context, ...additionalContext }, requestId);
  }

  /**
   * @param {string} operation - Operation name
   * @param {number} duration - Duration in ms
   * @param {Object} additionalContext - Per-call context
   * @param {string} requestId - Optional request ID
   */
  performance(operation, duration, additionalContext = {}, requestId = null) {
    this.parent.performance(operation, duration, { ...this.context, ...additionalContext }, requestId);
  }

  /**
   * @param {Object} additionalContext - Per-request context
   * @returns {*} Whatever the parent's startRequest returns
   */
  startRequest(additionalContext = {}) {
    return this.parent.startRequest({ ...this.context, ...additionalContext });
  }

  /**
   * @param {string} requestId - Request ID
   * @param {Object} result - Request result
   */
  endRequest(requestId, result = {}) {
    this.parent.endRequest(requestId, result);
  }

  /**
   * @param {string} requestId - Request ID
   * @param {Error} error - Error object
   * @param {Object} additionalContext - Per-call context
   */
  requestError(requestId, error, additionalContext = {}) {
    this.parent.requestError(requestId, error, { ...this.context, ...additionalContext });
  }

  /**
   * Create a further child bound to the same parent with the merged context.
   * @param {Object} additionalContext - Context added on top of this child's
   * @returns {ChildLogger} New child logger
   */
  child(additionalContext = {}) {
    return new ChildLogger(this.parent, { ...this.context, ...additionalContext });
  }

  /**
   * Statistics of the parent logger, so a child can be used wherever the
   * parent's interface is expected.
   * @returns {Object} Whatever the parent's getStats returns
   */
  getStats() {
    return this.parent.getStats();
  }

  /**
   * Set the log level on the parent. The level is shared: this affects the
   * parent and every other child bound to it.
   * @param {string} level - Log level
   */
  setLevel(level) {
    this.parent.setLevel(level);
  }
}
509
+
510
/**
 * Create a logger instance with preset configurations.
 *
 * Settings are resolved in this order (later wins):
 *   1. the named preset,
 *   2. the LOG_LEVEL environment variable (level only),
 *   3. explicit `overrides`.
 *
 * Every preset sets `level`, so without step 2 the LOG_LEVEL default read by
 * the Logger constructor could never take effect for preset-built loggers.
 * An unknown preset name contributes no settings; the Logger constructor
 * defaults then apply.
 *
 * @param {string} preset - Preset name: 'default', 'development', 'production', 'testing' or 'debug'
 * @param {Object} overrides - Configuration overrides
 * @returns {Logger} Logger instance
 */
export function createLogger(preset = 'default', overrides = {}) {
  const presets = {
    default: {
      level: 'info',
      enableConsole: true,
      enableFile: true,
      enableJson: false
    },
    development: {
      level: 'debug',
      enableConsole: true,
      enableFile: true,
      enableJson: false,
      enableRequestTracking: true,
      enablePerformanceTracking: true
    },
    production: {
      level: 'info',
      enableConsole: false,
      enableFile: true,
      enableJson: true,
      enableRequestTracking: true,
      enableErrorTracking: true,
      enablePerformanceTracking: true
    },
    testing: {
      level: 'error',
      enableConsole: false,
      enableFile: false,
      enableJson: false,
      enableRequestTracking: false
    },
    debug: {
      level: 'debug',
      enableConsole: true,
      enableFile: true,
      enableJson: false,
      enableRequestTracking: true,
      enablePerformanceTracking: true,
      enableErrorTracking: true
    }
  };

  // Own-property lookup so names like 'constructor' never resolve to
  // something inherited from Object.prototype.
  const presetConfig = Object.prototype.hasOwnProperty.call(presets, preset)
    ? presets[preset]
    : {};

  const envLevel = process.env.LOG_LEVEL;
  const envConfig = envLevel ? { level: envLevel } : {};

  const config = { ...presetConfig, ...envConfig, ...overrides };
  return new Logger(config);
}
562
+
563
// Shared module-level logger. The preset follows NODE_ENV: 'production' when
// NODE_ENV is exactly 'production', otherwise 'development'.
export const logger = createLogger(
  process.env.NODE_ENV !== 'production' ? 'development' : 'production'
);

// The same instance is also the module's default export.
export default logger;