crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,377 @@
1
+ /**
2
+ * PerformanceIntegration - Simple integration layer for existing tools
3
+ * Provides optional performance enhancements without breaking existing functionality
4
+ */
5
+
6
+ import PerformanceManager from '../PerformanceManager.js';
7
+ import WorkerPool from '../workers/WorkerPool.js';
8
+ import ConnectionPool from '../connections/ConnectionPool.js';
9
+ import StreamProcessor from '../processing/StreamProcessor.js';
10
+ import { config } from '../../constants/config.js';
11
+
12
// Module-level singletons shared by the helpers in this file.
// Every slot starts out empty (null) and holds at most one instance.
let performanceManagerInstance = null; // full PerformanceManager, if one was requested
let workerPoolInstance = null; // standalone WorkerPool
let connectionPoolInstance = null; // standalone ConnectionPool
let streamProcessorInstance = null; // standalone StreamProcessor
16
+
17
/**
 * Lazily create the performance components.
 *
 * Resolution order:
 *  1. If a full PerformanceManager already exists, it is returned as-is.
 *  2. If `enableFullManager` is set, a PerformanceManager is built from
 *     `options`, cached and returned.
 *  3. Otherwise each enabled component that does not exist yet is created,
 *     and a plain object with the current component instances is returned
 *     (a slot is null when that component was never created).
 *
 * @param {Object} [options] - Initialization options
 * @param {boolean} [options.enableWorkerPool=true] - Create the worker pool
 * @param {boolean} [options.enableConnectionPool=true] - Create the connection pool
 * @param {boolean} [options.enableStreamProcessor=true] - Create the stream processor
 * @param {boolean} [options.enableFullManager=false] - Create a PerformanceManager instead
 * @param {Object} [options.workerPoolOptions] - Overrides merged into the WorkerPool options
 * @param {Object} [options.connectionPoolOptions] - Overrides merged into the ConnectionPool options
 * @param {Object} [options.streamProcessorOptions] - Overrides merged into the StreamProcessor options
 * @returns {PerformanceManager|{workerPool: ?WorkerPool, connectionPool: ?ConnectionPool, streamProcessor: ?StreamProcessor}}
 *   The full manager, or the bundle of individual components
 */
export function initializePerformance(options = {}) {
  if (performanceManagerInstance) {
    return performanceManagerInstance;
  }

  const {
    enableWorkerPool: wantWorkerPool = true,
    enableConnectionPool: wantConnectionPool = true,
    enableStreamProcessor: wantStreamProcessor = true,
    enableFullManager: wantFullManager = false,
  } = options;

  if (wantFullManager) {
    performanceManagerInstance = new PerformanceManager(options);
    return performanceManagerInstance;
  }

  // Individual components: only build the ones that are enabled and missing.
  if (wantWorkerPool && !workerPoolInstance) {
    const workerSettings = {
      maxWorkers: config.performance.maxWorkers,
      ...options.workerPoolOptions,
    };
    workerPoolInstance = new WorkerPool(workerSettings);
  }

  if (wantConnectionPool && !connectionPoolInstance) {
    const connectionSettings = {
      // Two sockets per configured worker unless the caller overrides it.
      maxSockets: config.performance.maxWorkers * 2,
      ...options.connectionPoolOptions,
    };
    connectionPoolInstance = new ConnectionPool(connectionSettings);
  }

  if (wantStreamProcessor && !streamProcessorInstance) {
    const streamSettings = {
      chunkSize: 1000,
      memoryLimit: 100 * 1024 * 1024,
      ...options.streamProcessorOptions,
    };
    streamProcessorInstance = new StreamProcessor(streamSettings);
  }

  return {
    workerPool: workerPoolInstance,
    connectionPool: connectionPoolInstance,
    streamProcessor: streamProcessorInstance,
  };
}
68
+
69
/**
 * Accessor for the cached full performance manager.
 * @returns {PerformanceManager|null} - The manager, or null when none is currently held
 */
export function getPerformanceManager() {
  return performanceManagerInstance;
}
76
+
77
/**
 * Accessor for the cached standalone worker pool.
 * @returns {WorkerPool|null} - The pool, or null when none is currently held
 */
export function getWorkerPool() {
  return workerPoolInstance;
}
84
+
85
/**
 * Accessor for the cached standalone connection pool.
 * @returns {ConnectionPool|null} - The pool, or null when none is currently held
 */
export function getConnectionPool() {
  return connectionPoolInstance;
}
92
+
93
/**
 * Accessor for the cached standalone stream processor.
 * @returns {StreamProcessor|null} - The processor, or null when none is currently held
 */
export function getStreamProcessor() {
  return streamProcessorInstance;
}
100
+
101
/**
 * Fetch a resource, going through the connection pool when one exists.
 *
 * With a pool: a string `url` is combined with `options` into a single
 * request object; a non-string `url` is handed to the pool unchanged and
 * `options` is not consulted. Without a pool: `node-fetch` is loaded on
 * demand and called with `url` and `options`.
 *
 * @param {string|Object} url - URL or request options
 * @param {Object} options - Fetch options
 * @returns {Promise<Response>} - Fetch response
 */
export async function enhancedFetch(url, options = {}) {
  if (!connectionPoolInstance) {
    // No pool available: fall back to plain node-fetch.
    const nodeFetchModule = await import('node-fetch');
    const plainFetch = nodeFetchModule.default;
    return plainFetch(url, options);
  }

  let pooledRequest = url;
  if (typeof url === 'string') {
    pooledRequest = { url, ...options };
  }
  return connectionPoolInstance.request(pooledRequest);
}
117
+
118
/**
 * Parse HTML, offloading to the worker pool for large documents.
 *
 * Documents longer than 50000 characters are sent to the pool's
 * 'parseHtml' task when a pool exists; everything else is parsed
 * in-process by parseHtmlSync.
 *
 * @param {string} html - HTML content
 * @param {Object} options - Parsing options
 * @returns {Promise<Object>} - Parsed HTML data
 */
export async function enhancedParseHtml(html, options = {}) {
  // The length is only inspected once a pool is known to exist.
  const offloadToWorker = Boolean(workerPoolInstance) && html.length > 50000;

  if (!offloadToWorker) {
    return parseHtmlSync(html, options);
  }
  return workerPoolInstance.execute('parseHtml', { html, options });
}
132
+
133
/**
 * Extract the main content of a page, offloading to the worker pool for
 * large inputs.
 *
 * Inputs longer than 30000 characters are sent to the pool's
 * 'extractContent' task when a pool exists; everything else is handled
 * in-process by extractContentSync.
 *
 * @param {string} html - HTML content
 * @param {string} url - Source URL
 * @param {Object} options - Extraction options
 * @returns {Promise<Object>} - Extracted content
 */
export async function enhancedExtractContent(html, url, options = {}) {
  // The length is only inspected once a pool is known to exist.
  const offloadToWorker = Boolean(workerPoolInstance) && html.length > 30000;

  if (!offloadToWorker) {
    return extractContentSync(html, url, options);
  }
  return workerPoolInstance.execute('extractContent', { html, url, options });
}
148
+
149
/**
 * Process a batch of items with the best strategy that is available.
 *
 * Strategy selection, in order:
 *  1. Streaming through the stream processor, when `useStreaming` is true
 *     (defaults to true for more than 1000 items) and a processor exists.
 *  2. Worker-pool batch of 'processItem' tasks, when `useWorkers` is true,
 *     a pool exists and there are more than 100 items. `processor` is not
 *     passed to the pool on this path.
 *  3. Sequential in-process loop calling `processor(item, index)`.
 *
 * @param {Array} items - Items to process
 * @param {Function} processor - Processing function
 * @param {Object} options - Processing options
 * @returns {Promise<Object>} - Processing results; the sequential path
 *   yields `{ results, processedItems }`
 */
export async function enhancedBatchProcess(items, processor, options = {}) {
  const {
    useStreaming: preferStreaming = items.length > 1000,
    useWorkers: preferWorkers = false,
  } = options;

  if (preferStreaming && streamProcessorInstance) {
    return streamProcessorInstance.processStream(items, processor, options);
  }

  if (preferWorkers && workerPoolInstance && items.length > 100) {
    const workerTasks = items.map((entry) => ({
      taskType: 'processItem',
      data: entry,
      options,
    }));
    return workerPoolInstance.executeBatch(workerTasks, options);
  }

  // Sequential fallback: one item at a time, in order.
  const collected = [];
  for (let index = 0; index < items.length; index += 1) {
    collected.push(await processor(items[index], index));
  }
  return { results: collected, processedItems: items.length };
}
174
+
175
/**
 * Issue several requests concurrently, using the connection pool when one
 * exists.
 *
 * With a pool, the whole batch is delegated to `requestBatch`. Without a
 * pool, `node-fetch` is loaded on demand and every request is started at
 * once; the returned promise rejects as soon as any single request rejects
 * (Promise.all semantics).
 *
 * Each fallback request may be either a URL string or an options object
 * carrying a `url` property. A string is fetched with default options; it
 * is never passed as the `init` argument, which must be an object.
 *
 * @param {Array<string|Object>} requests - Request configurations
 * @param {Object} options - Request options (only used by the pool path)
 * @returns {Promise<Array>} - Request results
 */
export async function enhancedConcurrentRequests(requests, options = {}) {
  if (connectionPoolInstance) {
    return await connectionPoolInstance.requestBatch(requests, options);
  }

  // Fallback to Promise.all with regular fetch
  const { default: fetch } = await import('node-fetch');
  const promises = requests.map((request) => {
    if (typeof request === 'string') {
      // A bare URL has no init options of its own.
      return fetch(request);
    }
    return fetch(request.url || request, request);
  });
  return await Promise.all(promises);
}
191
+
192
/**
 * Report whether a performance component has been created.
 *
 * Any value other than the four known names answers the broader question
 * "has anything been created at all?".
 *
 * @param {string} component - Component name ('worker', 'connection', 'stream', 'full')
 * @returns {boolean} - Whether component is available
 */
export function isPerformanceAvailable(component) {
  if (component === 'worker') {
    return Boolean(workerPoolInstance);
  }
  if (component === 'connection') {
    return Boolean(connectionPoolInstance);
  }
  if (component === 'stream') {
    return Boolean(streamProcessorInstance);
  }
  if (component === 'full') {
    return Boolean(performanceManagerInstance);
  }

  return Boolean(
    workerPoolInstance ||
      connectionPoolInstance ||
      streamProcessorInstance ||
      performanceManagerInstance
  );
}
211
+
212
/**
 * Collect statistics from every component that currently exists.
 *
 * Keys are only present for live components: `full` (manager metrics),
 * `workerPool`, `connectionPool` and `streamProcessor`.
 *
 * @returns {Object} - Performance statistics
 */
export function getPerformanceStats() {
  // Spreading a null/falsy operand adds nothing, so absent components
  // simply leave their key out of the result.
  return {
    ...(performanceManagerInstance && {
      full: performanceManagerInstance.getMetrics(),
    }),
    ...(workerPoolInstance && {
      workerPool: workerPoolInstance.getStats(),
    }),
    ...(connectionPoolInstance && {
      connectionPool: connectionPoolInstance.getStats(),
    }),
    ...(streamProcessorInstance && {
      streamProcessor: streamProcessorInstance.getStats(),
    }),
  };
}
237
+
238
/**
 * Graceful shutdown of all performance components.
 *
 * Every component that currently exists is shut down: the full manager and
 * any standalone worker pool, connection pool or stream processor. The
 * standalone components are handled even when a full manager is present,
 * because both kinds can coexist (components created first, manager
 * requested later) and would otherwise be left running.
 *
 * All module-level references are cleared before any shutdown starts, so a
 * repeated call is a no-op and a failing component cannot leave stale
 * references behind.
 *
 * @returns {Promise<void>} Resolves once every shutdown has completed;
 *   rejects with the first shutdown error
 */
export async function shutdownPerformance() {
  const liveComponents = [
    performanceManagerInstance,
    workerPoolInstance,
    connectionPoolInstance,
    streamProcessorInstance,
  ].filter(Boolean);

  performanceManagerInstance = null;
  workerPoolInstance = null;
  connectionPoolInstance = null;
  streamProcessorInstance = null;

  // The async wrapper turns a synchronous throw from one shutdown() into a
  // rejection, so the remaining components are still asked to shut down.
  await Promise.all(liveComponents.map(async (component) => component.shutdown()));
}
267
+
268
+ // Fallback implementations for when performance components are not available
269
+
270
/**
 * In-process HTML parsing fallback built on cheerio (loaded on demand).
 *
 * The result carries `text` and `title` unless `options.extractText` is
 * exactly false, `links` when `options.extractLinks` is truthy, and
 * `images` when `options.extractImages` is truthy.
 *
 * @param {string} html - HTML content
 * @param {Object} options - Parsing options
 * @returns {Promise<Object>} - Parsed HTML data
 */
async function parseHtmlSync(html, options = {}) {
  const { load } = await import('cheerio');
  const $ = load(html);

  // Missing or empty optional attributes are reported as null.
  const attrOrNull = (node, attrName) => node.attr(attrName) || null;

  const parsed = {};

  if (options.extractText !== false) {
    parsed.text = $('body').text().trim();
    parsed.title = $('title').text().trim();
  }

  if (options.extractLinks) {
    const links = [];
    $('a[href]').each((_, anchor) => {
      const node = $(anchor);
      links.push({
        href: node.attr('href'),
        text: node.text().trim(),
        title: attrOrNull(node, 'title'),
      });
    });
    parsed.links = links;
  }

  if (options.extractImages) {
    const images = [];
    $('img[src]').each((_, image) => {
      const node = $(image);
      images.push({
        src: node.attr('src'),
        alt: attrOrNull(node, 'alt'),
        title: attrOrNull(node, 'title'),
      });
    });
    parsed.images = images;
  }

  return parsed;
}
313
+
314
/**
 * In-process content extraction fallback using Readability on a JSDOM
 * document (both loaded on demand).
 *
 * When Readability yields no article, or a field is empty, that field is
 * reported as null (`length` as 0). `options` is accepted for signature
 * parity with the worker task but is not read here.
 *
 * @param {string} html - HTML content
 * @param {string} url - Source URL
 * @param {Object} options - Extraction options
 * @returns {Promise<Object>} - Extracted content
 */
async function extractContentSync(html, url, options = {}) {
  const readabilityModule = await import('@mozilla/readability');
  const jsdomModule = await import('jsdom');

  const pageDocument = new jsdomModule.JSDOM(html, { url }).window.document;
  const article = new readabilityModule.Readability(pageDocument).parse();

  // Falsy article fields collapse to null.
  const fieldOrNull = (fieldName) => article?.[fieldName] || null;

  return {
    url,
    title: fieldOrNull('title'),
    content: fieldOrNull('content'),
    textContent: fieldOrNull('textContent'),
    length: article?.length || 0,
    excerpt: fieldOrNull('excerpt'),
    byline: fieldOrNull('byline'),
    processed_at: new Date().toISOString(),
  };
}
342
+
343
// Setup graceful shutdown
let shutdownRegistered = false;

/**
 * Install process hooks (SIGTERM, SIGINT, beforeExit) that shut the
 * performance components down. Safe to call repeatedly: the hooks are
 * installed only on the first call.
 *
 * The hook never rejects. A failure inside shutdownPerformance() is caught
 * and logged, because a rejected promise returned from a process event
 * listener would otherwise surface as an unhandled rejection.
 */
function registerShutdown() {
  if (shutdownRegistered) return;
  shutdownRegistered = true;

  const shutdown = async () => {
    // Logged to stderr rather than stdout.
    // NOTE(review): this package is an MCP server; if it runs over the stdio
    // transport, stdout is reserved for protocol messages - confirm.
    console.error('PerformanceIntegration: Graceful shutdown initiated');
    try {
      await shutdownPerformance();
    } catch (error) {
      console.error('PerformanceIntegration: Graceful shutdown failed', error);
    }
  };

  process.on('SIGTERM', shutdown);
  process.on('SIGINT', shutdown);
  process.on('beforeExit', shutdown);
}

// Auto-register shutdown handlers
registerShutdown();
362
+
363
// Default export: the same public helpers that are available as named
// exports, bundled into a single object.
const performanceIntegrationApi = {
  initializePerformance,
  getPerformanceManager,
  getWorkerPool,
  getConnectionPool,
  getStreamProcessor,
  enhancedFetch,
  enhancedParseHtml,
  enhancedExtractContent,
  enhancedBatchProcess,
  enhancedConcurrentRequests,
  isPerformanceAvailable,
  getPerformanceStats,
  shutdownPerformance,
};

export default performanceIntegrationApi;
@@ -0,0 +1,135 @@
1
+ import { LLMProvider } from './LLMProvider.js';
2
+
3
/**
 * Anthropic Claude API Provider
 * Implements LLM operations using Anthropic's Claude models
 */
export class AnthropicProvider extends LLMProvider {
  /**
   * @param {Object} [options] - Provider options (also passed to the base class)
   * @param {string} [options.apiKey] - API key; falls back to the ANTHROPIC_API_KEY environment variable
   * @param {string} [options.baseURL='https://api.anthropic.com/v1'] - API base URL
   * @param {string} [options.model='claude-3-haiku-20240307'] - Model identifier
   * @param {number} [options.timeout=30000] - Request timeout in milliseconds
   * @param {string} [options.version='2023-06-01'] - Value sent in the `anthropic-version` header
   */
  constructor(options = {}) {
    super(options);

    this.apiKey = options.apiKey || process.env.ANTHROPIC_API_KEY;
    this.baseURL = options.baseURL || 'https://api.anthropic.com/v1';
    this.model = options.model || 'claude-3-haiku-20240307';
    this.timeout = options.timeout || 30000;
    this.version = options.version || '2023-06-01';

    if (!this.apiKey) {
      this.logger.warn('Anthropic API key not configured');
    }
  }

  /**
   * Request a single completion from the Messages endpoint.
   *
   * @param {string} prompt - User message content
   * @param {Object} [options] - Generation options
   * @param {number} [options.maxTokens=1000] - Sent as `max_tokens`
   * @param {number} [options.temperature=0.7] - Sampling temperature
   * @param {?string} [options.systemPrompt=null] - Sent as `system` when truthy
   * @param {?Array<string>} [options.stopSequences=null] - Sent as `stop_sequences` when truthy
   * @returns {Promise<string>} Trimmed text of the first text block in the response
   * @throws {Error} When no API key is configured, the API answers with a
   *   non-2xx status, the request times out, or the response has no text
   */
  async generateCompletion(prompt, options = {}) {
    if (!this.apiKey) {
      throw new Error('Anthropic API key not configured');
    }

    const {
      maxTokens = 1000,
      temperature = 0.7,
      systemPrompt = null,
      stopSequences = null,
    } = options;

    try {
      const requestBody = {
        model: this.model,
        max_tokens: maxTokens,
        temperature,
        messages: [{ role: 'user', content: prompt }],
      };

      if (systemPrompt) {
        requestBody.system = systemPrompt;
      }

      if (stopSequences) {
        requestBody.stop_sequences = stopSequences;
      }

      const response = await fetch(`${this.baseURL}/messages`, {
        method: 'POST',
        headers: {
          // The Anthropic API authenticates API keys through `x-api-key`,
          // not through an `Authorization: Bearer` header.
          'x-api-key': this.apiKey,
          'Content-Type': 'application/json',
          'anthropic-version': this.version,
        },
        body: JSON.stringify(requestBody),
        signal: AbortSignal.timeout(this.timeout),
      });

      if (!response.ok) {
        // The error body is optional; fall back to the HTTP status text.
        const errorData = await response.json().catch(() => ({}));
        throw new Error(
          `Anthropic API error: ${response.status} - ${errorData.error?.message || response.statusText}`
        );
      }

      const data = await response.json();

      // `content` is a list of blocks; use the first one that carries text
      // so a leading non-text block cannot cause a TypeError.
      const textBlock = Array.isArray(data.content)
        ? data.content.find((block) => typeof block?.text === 'string')
        : undefined;

      if (!textBlock) {
        throw new Error('No completion generated');
      }

      return textBlock.text.trim();
    } catch (error) {
      this.logger.error('Anthropic completion failed', { error: error.message });
      throw error;
    }
  }

  /**
   * Produce an embedding for `text`.
   *
   * No embeddings endpoint is called; the local hash-based fallback is used
   * and a warning is logged on every call.
   *
   * @param {string} text - Text to embed
   * @returns {Promise<number[]>} 100-dimensional vector
   */
  async generateEmbedding(text) {
    // Anthropic doesn't provide embeddings API
    // Fallback to simple text similarity
    this.logger.warn('Anthropic does not provide embeddings API, using fallback similarity');
    return this.generateSimpleEmbedding(text);
  }

  /**
   * Generate a simple embedding based on text characteristics
   * This is a fallback when embeddings API is not available
   *
   * Each whitespace-separated, lower-cased token is hashed into one of 100
   * buckets; bucket weights are token frequencies, and the vector is scaled
   * to unit length when it is non-zero.
   *
   * @param {string} text - Text to embed
   * @returns {number[]} 100-dimensional vector
   */
  generateSimpleEmbedding(text) {
    const words = text.toLowerCase().split(/\s+/);
    const embedding = new Array(100).fill(0); // 100-dimensional vector

    // Simple hash-based embedding
    for (const word of words) {
      let hash = 0;
      for (let j = 0; j < word.length; j++) {
        // 31 * hash + charCode, kept within 32 bits
        hash = ((hash << 5) - hash + word.charCodeAt(j)) & 0xffffffff;
      }
      const index = Math.abs(hash) % embedding.length;
      embedding[index] += 1 / words.length;
    }

    // Normalize
    const magnitude = Math.sqrt(embedding.reduce((sum, val) => sum + val * val, 0));
    return magnitude > 0 ? embedding.map((val) => val / magnitude) : embedding;
  }

  /**
   * Jaccard similarity of the two texts' lower-cased, whitespace-separated
   * token sets (size of intersection divided by size of union).
   *
   * @param {string} text1 - First text
   * @param {string} text2 - Second text
   * @returns {Promise<number>} Similarity in the range [0, 1]
   */
  async calculateSimilarity(text1, text2) {
    // Simple Jaccard similarity for fallback
    const words1 = new Set(text1.toLowerCase().split(/\s+/));
    const words2 = new Set(text2.toLowerCase().split(/\s+/));

    const intersection = new Set([...words1].filter((x) => words2.has(x)));
    const union = new Set([...words1, ...words2]);

    return union.size > 0 ? intersection.size / union.size : 0;
  }

  /**
   * Describe this provider: the base-class metadata extended with the
   * provider name, the configured model and a capability map.
   *
   * @returns {Object} Provider metadata
   */
  getMetadata() {
    return {
      ...super.getMetadata(),
      name: 'Anthropic',
      model: this.model,
      capabilities: {
        completion: true,
        embedding: false, // Uses fallback
        similarity: true, // Uses fallback
      },
    };
  }
}