crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,503 @@
1
+ /**
2
+ * Process Document MCP Tool
3
+ * Multi-format document processing for PDFs, web pages, and other content types
4
+ */
5
+
6
+ import { z } from 'zod';
7
+ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
8
+ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
9
+ import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
10
+ import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
11
+
12
// Option fields are grouped by concern and spread into the `options` object
// below in the same order, so the resulting schema keys are unchanged.

/** Options forwarded to the PDF processor. */
const pdfOptionFields = {
  extractText: z.boolean().default(true),
  extractMetadata: z.boolean().default(true),
  password: z.string().optional(),
  maxPages: z.number().min(1).max(500).default(100)
};

/** Options that control how web content is fetched and extracted. */
const webOptionFields = {
  useReadability: z.boolean().default(true),
  extractStructuredData: z.boolean().default(true),
  requiresJavaScript: z.boolean().optional(),
  waitForTimeout: z.number().min(0).max(30000).default(5000)
};

/** Options that select which analyses and output shape are produced. */
const analysisOptionFields = {
  assessContentQuality: z.boolean().default(true),
  includeStatistics: z.boolean().default(true),
  outputFormat: z.enum(['text', 'structured', 'full']).default('structured')
};

/** Content filtering options. */
const filterOptionFields = {
  minContentLength: z.number().min(0).default(50),
  removeBoilerplate: z.boolean().default(true)
};

/**
 * Input schema for the `process_document` tool: a non-empty `source`, a
 * `sourceType` (defaults to 'url') and an optional `options` object whose
 * fields all carry defaults or are optional.
 */
const ProcessDocumentSchema = z.object({
  source: z.string().min(1),
  sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).default('url'),
  options: z.object({
    ...pdfOptionFields,
    ...webOptionFields,
    ...analysisOptionFields,
    ...filterOptionFields
  }).optional().default({})
});
38
+
39
// Small factories so every field gets its own schema instance, exactly as
// when each one is written out inline.
const nullableString = () => z.string().nullable();
const optionalNullableString = () => z.string().nullable().optional();

/** Extracted content: plain text plus optional raw / extracted HTML. */
const resultContentSchema = z.object({
  text: z.string(),
  html: z.string().optional(),
  extractedContent: z.string().optional()
});

/** Document metadata: common fields, then PDF-specific, then web-specific. */
const resultMetadataSchema = z.object({
  // Common metadata
  title: nullableString(),
  author: nullableString(),
  description: nullableString(),
  language: nullableString(),

  // PDF-specific metadata
  creator: optionalNullableString(),
  producer: optionalNullableString(),
  creationDate: optionalNullableString(),
  modificationDate: optionalNullableString(),
  format: optionalNullableString(),
  pages: z.number().nullable().optional(),
  encrypted: z.boolean().nullable().optional(),

  // Web-specific metadata
  canonical: optionalNullableString(),
  openGraph: z.record(z.string()).optional(),
  twitterCard: z.record(z.string()).optional()
});

/** Text statistics block. */
const resultStatisticsSchema = z.object({
  characters: z.number(),
  charactersNoSpaces: z.number(),
  words: z.number(),
  sentences: z.number(),
  paragraphs: z.number(),
  readingTime: z.number(),
  pages: z.number().optional()
});

/** Content quality assessment block. */
const resultQualitySchema = z.object({
  isValid: z.boolean(),
  score: z.number(),
  reasons: z.array(z.string()),
  metrics: z.record(z.any())
});

/** Readability score block. */
const resultReadabilitySchema = z.object({
  score: z.number(),
  level: z.string(),
  metrics: z.record(z.any())
});

/** Structured data extracted from web pages. */
const resultStructuredDataSchema = z.object({
  jsonLd: z.array(z.any()),
  microdata: z.array(z.any()),
  schemaOrg: z.array(z.any())
});

/**
 * Shape of a processing result.
 * NOTE(review): nothing in the visible file parses against or exports this
 * schema; it appears to serve as documentation of the result shape.
 */
const ProcessDocumentResult = z.object({
  source: z.string(),
  sourceType: z.string(),
  documentType: z.string(),
  title: nullableString(),
  content: resultContentSchema,
  metadata: resultMetadataSchema.optional(),
  statistics: resultStatisticsSchema.optional(),
  qualityAssessment: resultQualitySchema.optional(),
  readabilityScore: resultReadabilitySchema.optional(),
  structuredData: resultStructuredDataSchema.optional(),
  processedAt: z.string(),
  processingTime: z.number(),
  success: z.boolean(),
  error: z.string().optional()
});
100
+
101
export class ProcessDocumentTool {
  /**
   * Create the tool with one PDF processor, one content processor and one
   * browser processor instance.
   */
  constructor() {
    this.pdfProcessor = new PDFProcessor();
    this.contentProcessor = new ContentProcessor();
    this.browserProcessor = new BrowserProcessor();
  }

  /**
   * Get tool definition for MCP server
   * @returns {Object} Tool definition (name, description, inputSchema)
   */
  getDefinition() {
    return {
      name: 'process_document',
      description: 'Process documents from multiple sources and formats including PDFs, web pages, and local files with comprehensive content extraction and analysis.',
      inputSchema: ProcessDocumentSchema
    };
  }

  /**
   * Execute document processing.
   *
   * Validates `params`, dispatches to the PDF or web pipeline based on
   * `sourceType`, then optionally attaches statistics and a quality
   * assessment. Failures are reported through the returned object
   * (`success: false`, `error`) instead of a rejected promise.
   *
   * @param {Object} params - Processing parameters ({ source, sourceType, options })
   * @returns {Promise<Object>} Processing result
   */
  async execute(params) {
    const startTime = Date.now();

    try {
      const validated = ProcessDocumentSchema.parse(params);
      const { source, sourceType, options } = validated;

      const result = {
        source,
        sourceType,
        processedAt: new Date().toISOString(),
        success: false,
        processingTime: 0
      };

      // Determine document type and processing method
      if (sourceType.includes('pdf')) {
        result.documentType = 'pdf';
        await this.processPDFDocument(result, source, sourceType, options);
      } else {
        result.documentType = 'web';
        await this.processWebDocument(result, source, options);
      }

      // Add statistics if requested
      if (options.includeStatistics && result.content?.text) {
        result.statistics = this.calculateStatistics(result.content.text);
      }

      // Assess content quality if requested
      if (options.assessContentQuality && result.content?.text) {
        result.qualityAssessment = ContentQualityAssessor.assessContentQuality(
          result.content.text,
          { minLength: options.minContentLength }
        );
      }

      result.processingTime = Date.now() - startTime;
      result.success = true;

      return result;
    } catch (error) {
      // `params` may itself be null/undefined (that is one of the ways
      // validation fails), so it must be read defensively here; otherwise the
      // error handler throws and the caller gets a rejection instead of the
      // failure envelope.
      return {
        source: params?.source || 'unknown',
        sourceType: params?.sourceType || 'unknown',
        documentType: 'unknown',
        processedAt: new Date().toISOString(),
        success: false,
        error: `Document processing failed: ${error?.message ?? String(error)}`,
        processingTime: Date.now() - startTime,
        content: { text: '' }
      };
    }
  }

  /**
   * Process PDF document
   * @param {Object} result - Result object to populate (mutated in place)
   * @param {string} source - PDF source
   * @param {string} sourceType - Source type ('pdf_url' or 'pdf_file')
   * @param {Object} options - Processing options
   * @returns {Promise<void>}
   * @throws {Error} When the PDF processor reports a failure
   */
  async processPDFDocument(result, source, sourceType, options) {
    const pdfResult = await this.pdfProcessor.processPDF({
      source,
      // The PDF processor is given the bare kind: 'pdf_url' -> 'url', 'pdf_file' -> 'file'.
      sourceType: sourceType.replace('pdf_', ''),
      options: {
        extractText: options.extractText,
        extractMetadata: options.extractMetadata,
        password: options.password,
        maxPages: options.maxPages
      }
    });

    if (!pdfResult.success) {
      throw new Error(pdfResult.error || 'PDF processing failed');
    }

    // Set content
    result.content = {
      text: pdfResult.text || ''
    };

    // Set title
    result.title = pdfResult.metadata?.title || null;

    // Set metadata
    if (pdfResult.metadata) {
      const meta = pdfResult.metadata;
      result.metadata = {
        // Normalised to null so the fields are always present.
        title: meta.title ?? null,
        author: meta.author ?? null,
        description: null, // PDFs don't typically have descriptions
        language: null,
        creator: meta.creator,
        producer: meta.producer,
        creationDate: meta.creationDate,
        modificationDate: meta.modificationDate,
        format: meta.format,
        pages: meta.pages,
        encrypted: meta.encrypted
      };
    }

    // Calculate readability score for text content
    if (options.assessContentQuality && result.content.text) {
      const readabilityScore = this.calculateReadabilityScore(result.content.text);
      if (readabilityScore) {
        result.readabilityScore = readabilityScore;
      }
    }
  }

  /**
   * Process web document
   *
   * NOTE(review): `sourceType: 'file'` is also routed here, but the plain
   * fetch below cannot read local paths, so such requests currently fail.
   * NOTE(review): `options.removeBoilerplate` is validated by the input
   * schema but never read; `useReadability` drives boilerplate removal.
   *
   * @param {Object} result - Result object to populate (mutated in place)
   * @param {string} source - Web source URL
   * @param {Object} options - Processing options
   * @returns {Promise<void>}
   * @throws {Error} On browser failure, non-2xx HTTP status, or fetch timeout
   */
  async processWebDocument(result, source, options) {
    const FETCH_TIMEOUT_MS = 15000;

    // Step 1: Fetch content (with or without JavaScript rendering)
    let html;
    let pageTitle;
    const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(source);

    if (shouldUseJavaScript) {
      // Diagnostics go to stderr: an MCP server on the stdio transport must
      // keep stdout free for protocol messages (presumably the case here —
      // verify against the server's transport setup).
      console.error('Using browser rendering for JavaScript content...');
      const browserResult = await this.browserProcessor.processURL({
        url: source,
        options: {
          waitForTimeout: options.waitForTimeout,
          enableJavaScript: true,
          enableImages: false,
          captureScreenshot: false
        }
      });

      if (!browserResult.success) {
        throw new Error(`Browser processing failed: ${browserResult.error}`);
      }

      html = browserResult.html;
      pageTitle = browserResult.title;
    } else {
      // Simple HTTP fetch. The standard fetch API has no `timeout` option,
      // so the deadline is enforced with an abort signal instead.
      const response = await fetch(source, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Document-Processor)'
        },
        signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      html = await response.text();
      pageTitle = this.extractTitleFromHTML(html);
    }

    result.title = pageTitle ?? null;

    // Step 2: Process content with ContentProcessor
    const processingResult = await this.contentProcessor.processContent({
      html,
      url: source,
      options: {
        extractStructuredData: options.extractStructuredData,
        calculateReadabilityScore: true,
        removeBoilerplate: options.useReadability,
        preserveImageInfo: false,
        extractMetadata: true
      }
    });

    // Step 3: Extract and format content
    let mainText = '';
    let extractedContent = '';

    if (processingResult.readability) {
      const { textContent, content } = processingResult.readability;
      // Always end up with a string, even when both fields are missing.
      mainText = textContent || content || '';
      extractedContent = content || '';
    } else if (processingResult.fallback_content) {
      mainText = processingResult.fallback_content.content || '';
    } else {
      // Last resort: extract text from HTML
      mainText = HTMLCleaner.extractTextWithFormatting(html, {
        preserveLineBreaks: true,
        preserveParagraphs: true,
        includeLinks: false,
        includeImageAlt: false
      });
    }

    // Set content based on output format
    result.content = { text: mainText };

    const { outputFormat } = options;
    if ((outputFormat === 'structured' || outputFormat === 'full') && extractedContent) {
      result.content.extractedContent = extractedContent;
    }

    if (outputFormat === 'full') {
      result.content.html = html;
    }

    // Step 4: Set metadata
    if (processingResult.metadata) {
      const meta = processingResult.metadata;
      result.metadata = {
        // Normalised to null so the fields are always present.
        title: meta.title ?? null,
        author: meta.author ?? null,
        description: meta.description ?? null,
        language: meta.language ?? null,
        canonical: meta.canonical,
        openGraph: meta.openGraph,
        twitterCard: meta.twitterCard
      };
    }

    // Step 5: Add readability score
    if (processingResult.readability_score) {
      result.readabilityScore = processingResult.readability_score;
    }

    // Step 6: Add structured data
    if (options.extractStructuredData && processingResult.structured_data) {
      result.structuredData = processingResult.structured_data;
    }
  }

  /**
   * Calculate text statistics
   * @param {string} text - Text to analyze
   * @returns {Object} - Counts of characters, words, sentences and paragraphs,
   *   plus reading time in minutes
   */
  calculateStatistics(text) {
    const characters = text.length;
    const charactersNoSpaces = text.replace(/\s/g, '').length;
    const words = text.split(/\s+/).filter(w => w.length > 0);
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
    const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);

    // Estimate reading time (average 200 words per minute)
    const readingTime = Math.ceil(words.length / 200);

    return {
      characters,
      charactersNoSpaces,
      words: words.length,
      sentences: sentences.length,
      paragraphs: paragraphs.length,
      readingTime
    };
  }

  /**
   * Calculate readability score
   *
   * Uses the Flesch Reading Ease formula with the syllables-per-word term
   * approximated from average characters per word. The score is clamped to
   * the range 0-100.
   *
   * @param {string} text - Text to analyze
   * @returns {Object|null} - Readability score, or null when the text has no
   *   words/sentences or the calculation fails
   */
  calculateReadabilityScore(text) {
    try {
      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
      const words = text.split(/\s+/).filter(w => w.length > 0);

      if (sentences.length === 0 || words.length === 0) {
        return null;
      }

      const avgWordsPerSentence = words.length / sentences.length;
      const avgCharsPerWord = text.replace(/\s/g, '').length / words.length;

      // Flesch Reading Ease Score approximation
      const score = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * (avgCharsPerWord / 4.7));
      const clampedScore = Math.max(0, Math.min(100, score));

      return {
        score: Math.round(clampedScore * 100) / 100,
        level: this.getReadabilityLevel(clampedScore),
        metrics: {
          sentences: sentences.length,
          words: words.length,
          avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
          avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100
        }
      };
    } catch (error) {
      // Best effort: a failed readability calculation must not fail the document.
      console.warn('Readability calculation failed:', error.message);
      return null;
    }
  }

  /**
   * Get readability level from score
   * @param {number} score - Readability score
   * @returns {string} - Readability level
   */
  getReadabilityLevel(score) {
    if (score >= 90) return 'Very Easy';
    if (score >= 80) return 'Easy';
    if (score >= 70) return 'Fairly Easy';
    if (score >= 60) return 'Standard';
    if (score >= 50) return 'Fairly Difficult';
    if (score >= 30) return 'Difficult';
    return 'Very Difficult';
  }

  /**
   * Determine if JavaScript rendering is needed
   *
   * Pure URL heuristic (no network access): app-like path segments, any `#`
   * in the URL, or a script file extension at the end of the URL.
   *
   * @param {string} url - URL to analyze
   * @returns {Promise<boolean>} - Whether JavaScript is needed
   */
  async shouldUseJavaScript(url) {
    const jsIndicators = [
      /\/(app|spa|dashboard|admin)/,
      /#/,
      /\.(js|jsx|ts|tsx)$/
    ];

    return jsIndicators.some(pattern => pattern.test(url));
  }

  /**
   * Extract title from HTML using simple parsing
   * @param {string} html - HTML content
   * @returns {string|null} - Trimmed title, or null when the input is not a
   *   string, has no <title> element, or the title is blank
   */
  extractTitleFromHTML(html) {
    if (typeof html !== 'string') {
      return null;
    }

    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    // A whitespace-only title trims to '' and is reported as null.
    return titleMatch?.[1].trim() || null;
  }

  /**
   * Process multiple documents concurrently
   *
   * Sources are processed in consecutive batches of `options.concurrency`
   * (default 3); each batch runs in parallel and results keep input order.
   * `concurrency` is consumed here and not forwarded to `execute`.
   *
   * @param {Array} sources - Array of document sources (strings or
   *   { source, sourceType, options } objects)
   * @param {Object} options - Processing options shared by all documents
   * @returns {Promise<Array>} - Array of processing results
   * @throws {TypeError} When `sources` is not an array
   */
  async processMultipleDocuments(sources, options = {}) {
    if (!Array.isArray(sources)) {
      throw new TypeError('sources must be an array');
    }

    const { concurrency: requestedConcurrency, ...sharedOptions } = options ?? {};

    // A zero, negative or non-numeric batch size would stall or never advance
    // the loop below, so anything that is not a number >= 1 falls back to 3.
    const parsed = Number(requestedConcurrency);
    const concurrency = Number.isFinite(parsed) && parsed >= 1 ? Math.floor(parsed) : 3;

    const results = [];

    for (let i = 0; i < sources.length; i += concurrency) {
      const batch = sources.slice(i, i + concurrency);
      const batchPromises = batch.map(source => {
        const params = typeof source === 'string'
          ? { source, options: sharedOptions }
          : { ...source, options: { ...sharedOptions, ...source?.options } };

        // Defensive: keeps one unexpected rejection from failing the whole batch.
        return this.execute(params).catch(error => ({
          source: params.source,
          success: false,
          error: error.message,
          processedAt: new Date().toISOString(),
          processingTime: 0,
          content: { text: '' }
        }));
      });

      const batchResults = await Promise.all(batchPromises);
      results.push(...batchResults);
    }

    return results;
  }

  /**
   * Clean up resources
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.browserProcessor) {
      await this.browserProcessor.cleanup();
    }
  }
}
502
+
503
+ export default ProcessDocumentTool;