crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,448 @@
1
+ /**
2
+ * PDFProcessor - PDF document processing with text and metadata extraction
3
+ * Handles PDF files from URLs or local paths with comprehensive error handling
4
+ */
5
+
6
+ // Use dynamic import for pdf-parse to avoid initialization issues
7
+ import { z } from 'zod';
8
+ import fs from 'fs/promises';
9
+ import path from 'path';
10
+
11
/**
 * Flags forwarded to the PDF parser. Both have defaults, so an empty
 * object validates to a fully populated one.
 */
const ParseOptionsSchema = z.object({
  normalizeWhitespace: z.boolean().default(true),
  disableCombineTextItems: z.boolean().default(false)
});

/**
 * Per-request processing options.
 * - extractMetadata / extractText: which parts of the result to populate.
 * - password: optional document password.
 * - maxPages: page cap, constrained to the range 1..1000 (default 100).
 * - parseOptions: parser flags (see ParseOptionsSchema).
 */
const ProcessingOptionsSchema = z.object({
  extractMetadata: z.boolean().default(true),
  extractText: z.boolean().default(true),
  password: z.string().optional(),
  maxPages: z.number().min(1).max(1000).default(100),
  parseOptions: ParseOptionsSchema.optional().default({})
});

/**
 * Input contract for PDFProcessor.processPDF.
 *
 * NOTE(review): `source` must be a non-empty string even when `sourceType`
 * is 'buffer', so a Node Buffer instance does not pass validation here.
 * Confirm whether 'buffer' sources are meant to arrive as encoded strings.
 */
const PDFProcessorSchema = z.object({
  source: z.string().min(1),
  sourceType: z.enum(['url', 'file', 'buffer']).default('url'),
  options: ProcessingOptionsSchema.optional().default({})
});
25
+
26
// Reusable field types: the metadata entries are all "value or null".
const nullableText = z.string().nullable();
const nullableFlag = z.boolean().nullable();

/**
 * Document-level metadata as reported in a processing result.
 * Every field may be null when the PDF does not provide it.
 */
const PDFMetadataResult = z.object({
  title: nullableText,
  author: nullableText,
  subject: nullableText,
  creator: nullableText,
  producer: nullableText,
  creationDate: nullableText,
  modificationDate: nullableText,
  format: nullableText,
  pages: z.number().nullable(),
  encrypted: nullableFlag,
  linearized: nullableFlag,
  pdfVersion: nullableText
});

/**
 * Shape of a PDF processing result: `text`, `metadata` and `error` are
 * optional, the remaining fields are required.
 * This schema is declared for reference; no code in the visible part of
 * this file calls it.
 */
const PDFResult = z.object({
  source: z.string(),
  sourceType: z.string(),
  text: z.string().optional(),
  metadata: PDFMetadataResult.optional(),
  pageCount: z.number(),
  extractedAt: z.string(),
  processingTime: z.number(),
  success: z.boolean(),
  error: z.string().optional()
});
50
+
51
/**
 * PDFProcessor - loads a PDF (from a URL, a local file, or in-memory data),
 * parses it with `pdf-parse`, and returns cleaned text plus normalized
 * metadata. Processing methods report failures through the returned result
 * object (`success: false`, `error`) instead of throwing.
 */
export class PDFProcessor {
  constructor() {
    // Fallback options, shallow-merged under the validated request options.
    this.defaultOptions = {
      extractMetadata: true,
      extractText: true,
      maxPages: 100,
      parseOptions: {
        normalizeWhitespace: true,
        disableCombineTextItems: false
      }
    };

    // Upper bound (ms) for a whole PDF download, headers and body included.
    this.downloadTimeoutMs = 30000;
  }

  /**
   * Process PDF document from various sources
   * @param {Object} params - Processing parameters
   * @param {string} params.source - PDF source (URL, file path, or buffer data)
   * @param {string} params.sourceType - Type of source ('url', 'file', 'buffer')
   * @param {Object} params.options - Processing options
   * @returns {Promise<Object>} - Result with `success`, `pageCount`,
   *   `processingTime`, `extractedAt`, and (on success) `text` / `metadata`,
   *   or (on failure) `error`. This method does not reject.
   */
  async processPDF(params) {
    const startTime = Date.now();

    try {
      const validated = PDFProcessorSchema.parse(params);
      const { source, sourceType, options } = validated;
      const processingOptions = { ...this.defaultOptions, ...options };

      // pageCount starts at 0 so that every failure path returns the same
      // shape as the outer catch below.
      const result = {
        source,
        sourceType,
        extractedAt: new Date().toISOString(),
        success: false,
        processingTime: 0,
        pageCount: 0
      };

      // Get PDF buffer based on source type
      let pdfBuffer;
      try {
        pdfBuffer = await this.getPDFBuffer(source, sourceType);
      } catch (error) {
        result.error = `Failed to load PDF: ${error.message}`;
        result.processingTime = Date.now() - startTime;
        return result;
      }

      // Parser options: caller-supplied flags plus the page cap.
      const parseOptions = {
        ...processingOptions.parseOptions,
        max: processingOptions.maxPages
      };

      if (processingOptions.password) {
        parseOptions.password = processingOptions.password;
      }

      let pdfData;
      try {
        // Dynamic import to avoid initialization issues
        const pdfParse = (await import('pdf-parse')).default;
        pdfData = await pdfParse(pdfBuffer, parseOptions);
      } catch (error) {
        result.error = `PDF parsing failed: ${error.message}`;
        result.processingTime = Date.now() - startTime;
        return result;
      }

      if (processingOptions.extractText) {
        result.text = this.cleanPDFText(pdfData.text);
      }

      if (processingOptions.extractMetadata) {
        result.metadata = this.extractPDFMetadata(pdfData);
      }

      result.pageCount = pdfData.numpages || 0;
      result.processingTime = Date.now() - startTime;
      result.success = true;

      return result;
    } catch (error) {
      // Reached for validation errors, among others. `params` may be
      // null/undefined here, so it must not be dereferenced directly.
      return {
        source: params?.source || 'unknown',
        sourceType: params?.sourceType || 'unknown',
        extractedAt: new Date().toISOString(),
        success: false,
        error: `PDF processing failed: ${error.message}`,
        processingTime: Date.now() - startTime,
        pageCount: 0
      };
    }
  }

  /**
   * Get PDF buffer from various sources
   * @param {string} source - PDF source
   * @param {string} sourceType - Source type ('url', 'file', 'buffer')
   * @returns {Promise<Buffer>} - PDF buffer
   * @throws {Error} If the source type is unsupported or loading fails
   */
  async getPDFBuffer(source, sourceType) {
    switch (sourceType) {
      case 'url':
        return await this.downloadPDFFromURL(source);
      case 'file':
        return await this.readPDFFromFile(source);
      case 'buffer':
        // A string is converted with Buffer.from's default (UTF-8) encoding.
        // NOTE(review): confirm whether string input is expected to be
        // base64 or otherwise encoded.
        return Buffer.isBuffer(source) ? source : Buffer.from(source);
      default:
        throw new Error(`Unsupported source type: ${sourceType}`);
    }
  }

  /**
   * Download PDF from URL
   * @param {string} url - PDF URL
   * @returns {Promise<Buffer>} - PDF buffer
   * @throws {Error} On network failure, timeout, or a non-2xx response
   */
  async downloadPDFFromURL(url) {
    // NOTE(review): the URL is fetched as given; any allow-listing of hosts
    // has to happen before this method is called.
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/2.0; PDF-Processor)'
        },
        // fetch() has no `timeout` option; an abort signal is what actually
        // bounds the request (and the body read below).
        signal: AbortSignal.timeout(this.downloadTimeoutMs)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      // A non-PDF content type is only warned about; parsing decides later.
      const contentType = response.headers.get('content-type');
      if (contentType && !contentType.toLowerCase().includes('pdf')) {
        console.warn(`Warning: Content-Type is ${contentType}, expected PDF`);
      }

      const arrayBuffer = await response.arrayBuffer();
      return Buffer.from(arrayBuffer);
    } catch (error) {
      throw new Error(`Failed to download PDF from URL: ${error.message}`, { cause: error });
    }
  }

  /**
   * Read PDF from local file
   * @param {string} filePath - Local file path (resolved against the cwd)
   * @returns {Promise<Buffer>} - PDF buffer
   * @throws {Error} If the path is missing, not a regular file, or unreadable
   */
  async readPDFFromFile(filePath) {
    try {
      const resolvedPath = path.resolve(filePath);
      const stats = await fs.stat(resolvedPath);

      if (!stats.isFile()) {
        throw new Error('Path is not a file');
      }

      // An unexpected extension is only warned about, not rejected.
      const ext = path.extname(resolvedPath).toLowerCase();
      if (ext !== '.pdf') {
        console.warn(`Warning: File extension is ${ext}, expected .pdf`);
      }

      return await fs.readFile(resolvedPath);
    } catch (error) {
      throw new Error(`Failed to read PDF file: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extract and format PDF metadata
   * @param {Object} pdfData - Parsed PDF data from pdf-parse
   * @returns {Object} - Formatted metadata; absent text fields are null
   */
  extractPDFMetadata(pdfData) {
    const info = pdfData.info || {};
    const metadata = pdfData.metadata || {};

    // For each field, the `info` value wins over the `metadata` value.
    return {
      title: this.cleanMetadataValue(info.Title || metadata.title),
      author: this.cleanMetadataValue(info.Author || metadata.author),
      subject: this.cleanMetadataValue(info.Subject || metadata.subject),
      creator: this.cleanMetadataValue(info.Creator || metadata.creator),
      producer: this.cleanMetadataValue(info.Producer || metadata.producer),
      creationDate: this.formatPDFDate(info.CreationDate || metadata.creationDate),
      modificationDate: this.formatPDFDate(info.ModDate || metadata.modificationDate),
      format: this.cleanMetadataValue(info.Format || metadata.format),
      pages: pdfData.numpages || null,
      encrypted: info.IsEncrypted || false,
      linearized: info.IsLinearized || false,
      pdfVersion: this.cleanMetadataValue(info.PDFFormatVersion || metadata.pdfVersion)
    };
  }

  /**
   * Clean metadata value
   * @param {any} value - Raw metadata value
   * @returns {string|null} - Trimmed string, or null when absent or blank
   */
  cleanMetadataValue(value) {
    if (value === undefined || value === null) {
      return null;
    }

    const stringValue = String(value).trim();
    return stringValue.length > 0 ? stringValue : null;
  }

  /**
   * Format PDF date string
   *
   * Accepts the PDF date form `D:YYYYMMDDHHmmSSOHH'mm'`, where `O` is `Z`,
   * `+` or `-`. The UTC offset is applied, so the returned ISO string is the
   * same on every machine. A date without an offset is read as UTC.
   *
   * @param {string} dateString - Raw PDF date
   * @returns {string|null} - ISO 8601 string when parseable, the trimmed
   *   input when not, or null for empty input
   */
  formatPDFDate(dateString) {
    if (!dateString) return null;

    try {
      let cleanDate = dateString.toString().trim();

      // Remove D: prefix if present
      if (cleanDate.startsWith('D:')) {
        cleanDate = cleanDate.substring(2);
      }

      // YYYYMMDDHHmmSS followed by an optional offset: Z | (+|-)HH['mm']
      const match = cleanDate.match(
        /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(?:([Zz])|([+-])(\d{2})'?(\d{2})?'?)?/
      );
      if (match) {
        const [, year, month, day, hour, minute, second, , sign, offsetHours, offsetMinutes] = match;

        // Signed offset of the document's local time from UTC, in minutes.
        let offsetTotalMinutes = 0;
        if (sign) {
          const magnitude =
            Number.parseInt(offsetHours, 10) * 60 + Number.parseInt(offsetMinutes ?? '0', 10);
          offsetTotalMinutes = sign === '-' ? -magnitude : magnitude;
        }

        const utcMillis =
          Date.UTC(
            Number.parseInt(year, 10),
            Number.parseInt(month, 10) - 1, // Month is 0-indexed
            Number.parseInt(day, 10),
            Number.parseInt(hour, 10),
            Number.parseInt(minute, 10),
            Number.parseInt(second, 10)
          ) -
          offsetTotalMinutes * 60000;

        const date = new Date(utcMillis);
        if (!Number.isNaN(date.getTime())) {
          return date.toISOString();
        }
      }

      // Not a PDF-style date: fall back to the engine's own date parsing.
      const date = new Date(cleanDate);
      if (!Number.isNaN(date.getTime())) {
        return date.toISOString();
      }

      return cleanDate; // Return as-is if can't parse
    } catch (error) {
      return dateString; // Return original if parsing fails
    }
  }

  /**
   * Clean and normalize PDF text content
   * @param {string} text - Raw PDF text
   * @returns {string} - Cleaned text ('' for empty or non-string input)
   */
  cleanPDFText(text) {
    if (!text || typeof text !== 'string') {
      return '';
    }

    return text
      // Normalize line breaks (CRLF and lone CR become LF)
      .replace(/\r\n?/g, '\n')
      // Collapse runs of spaces/tabs
      .replace(/[ \t]+/g, ' ')
      // Trim every line first, so whitespace-only lines become empty...
      .split('\n')
      .map((line) => line.trim())
      .join('\n')
      // ...and only then cap consecutive line breaks at two.
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Process multiple PDFs concurrently
   * @param {Array} sources - Array of PDF sources; a string is treated as a
   *   URL, an object as `processPDF` params
   * @param {Object} options - Processing options shared by all sources, plus
   *   `concurrency` (batch size; default 3, also used for invalid values)
   * @returns {Promise<Array>} - Array of processing results, in input order
   */
  async processMultiplePDFs(sources, options = {}) {
    const { concurrency: requestedConcurrency, ...sharedOptions } = options ?? {};

    // A zero, negative, fractional or non-numeric step would stall or skew
    // the batching loop, so only whole numbers >= 1 are accepted.
    const requested = Number(requestedConcurrency);
    const batchSize = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 3;

    const results = [];

    // Process in batches to avoid overwhelming the system
    for (let i = 0; i < sources.length; i += batchSize) {
      const batch = sources.slice(i, i + batchSize);
      const batchPromises = batch.map((source) => {
        // `source?.options` keeps a null entry from throwing here; it is
        // reported as a failed result by processPDF instead.
        const params = typeof source === 'string'
          ? { source, sourceType: 'url', options: sharedOptions }
          : { ...source, options: { ...sharedOptions, ...source?.options } };

        return this.processPDF(params).catch((error) => ({
          source: params.source,
          sourceType: params.sourceType || 'unknown',
          success: false,
          error: error.message,
          extractedAt: new Date().toISOString(),
          processingTime: 0,
          pageCount: 0
        }));
      });

      const batchResults = await Promise.all(batchPromises);
      results.push(...batchResults);
    }

    return results;
  }

  /**
   * Get text statistics from extracted content
   * @param {string} text - Extracted text
   * @returns {Object} - Counts of characters, words, sentences, paragraphs
   *   and lines, plus two averages rounded to two decimals
   */
  getTextStatistics(text) {
    if (!text || typeof text !== 'string') {
      return {
        characters: 0,
        charactersNoSpaces: 0,
        words: 0,
        sentences: 0,
        paragraphs: 0,
        lines: 0,
        averageWordsPerSentence: 0,
        averageCharactersPerWord: 0
      };
    }

    const roundToTwo = (value) => Math.round(value * 100) / 100;

    const characters = text.length;
    const charactersNoSpaces = text.replace(/\s/g, '').length;
    const wordCount = text.split(/\s+/).filter((word) => word.length > 0).length;
    // Sentences are split on runs of '.', '!' or '?'.
    const sentenceCount = text.split(/[.!?]+/).filter((part) => part.trim().length > 0).length;
    // Paragraphs are separated by at least one blank line.
    const paragraphCount = text.split(/\n\s*\n/).filter((part) => part.trim().length > 0).length;
    const lines = text.split('\n').length;

    return {
      characters,
      charactersNoSpaces,
      words: wordCount,
      sentences: sentenceCount,
      paragraphs: paragraphCount,
      lines,
      averageWordsPerSentence: sentenceCount > 0 ? roundToTwo(wordCount / sentenceCount) : 0,
      averageCharactersPerWord: wordCount > 0 ? roundToTwo(charactersNoSpaces / wordCount) : 0
    };
  }

  /**
   * Extract specific pages from PDF
   *
   * Only the upper bound is applied: a valid `endPage` becomes the
   * `maxPages` option that `processPDF` hands to the parser. `startPage` is
   * not applied to the text; a warning is logged when it is greater than 1.
   *
   * @param {Object} params - `processPDF` params plus `startPage` (default 1)
   *   and `endPage`
   * @returns {Promise<Object>} - Processing result for pages 1..endPage
   */
  async extractPDFPages(params) {
    const { startPage = 1, endPage, ...processingParams } = params;

    const options = { ...processingParams.options };

    // processPDF builds the parser's page cap from `maxPages`, so the cap
    // has to be set there. An invalid endPage leaves the caller's value.
    if (Number.isInteger(endPage) && endPage >= 1) {
      options.maxPages = endPage;
    }

    const result = await this.processPDF({
      ...processingParams,
      options
    });

    if (result.success && result.text && startPage > 1) {
      // This is a simplified approach - pdf-parse doesn't provide per-page text
      // For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
      console.warn('Page-specific extraction is limited with current PDF parser');
    }

    return result;
  }
}
447
+
448
+ export default PDFProcessor;