crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,329 @@
1
+ /**
2
+ * Extract Content MCP Tool
3
+ * Enhanced content extraction with main content detection and readability features
4
+ */
5
+
6
+ import { z } from 'zod';
7
+ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
8
+ import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
9
+ import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
10
+
11
// Field definitions for the optional `options` object, grouped by concern.
// Every field either carries a default or is optional, so callers may send
// an empty object (or omit `options` entirely).
const extractContentOptionShape = {
  // --- Content extraction ---
  useReadability: z.boolean().default(true),
  extractStructuredData: z.boolean().default(true),
  calculateReadabilityScore: z.boolean().default(true),
  preserveImageInfo: z.boolean().default(true),
  extractMetadata: z.boolean().default(true),

  // --- Browser rendering ---
  requiresJavaScript: z.boolean().optional(),
  waitForSelector: z.string().optional(),
  waitForTimeout: z.number().min(0).max(30000).default(5000),

  // --- Quality assessment ---
  assessContentQuality: z.boolean().default(true),
  minContentLength: z.number().min(0).default(100),

  // --- Output ---
  includeRawHTML: z.boolean().default(false),
  includeCleanedHTML: z.boolean().default(false),
  outputFormat: z.enum(['text', 'markdown', 'structured']).default('structured')
};

/**
 * Input contract for the `extract_content` tool: a mandatory, well-formed
 * `url` plus an optional bag of extraction options.
 */
const ExtractContentSchema = z.object({
  url: z.string().url(),
  options: z.object(extractContentOptionShape).optional().default({})
});
36
+
37
/**
 * Shape of the object returned by `ExtractContentTool.execute()`.
 *
 * Two fields are declared more leniently than a strict reading of the happy
 * path would suggest, so that the schema matches what `execute()` emits:
 *  - `title` is optional as well as nullable, because the failure result
 *    returned from `execute()`'s catch block carries no `title` key.
 *  - `content.cleanedHTML` is declared, because `execute()` attaches it when
 *    `includeCleanedHTML` is requested; an undeclared key would be stripped
 *    by zod's default object parsing.
 */
const ExtractContentResult = z.object({
  url: z.string(),
  title: z.string().nullable().optional(),
  content: z.object({
    text: z.string(),
    html: z.string().optional(),
    cleanedHTML: z.string().optional(),
    markdown: z.string().optional()
  }),
  // Page-level metadata copied from the content processor's `metadata` output.
  metadata: z.object({
    title: z.string().nullable(),
    description: z.string().nullable(),
    author: z.string().nullable(),
    published: z.string().nullable(),
    language: z.string().nullable(),
    canonical: z.string().nullable(),
    openGraph: z.record(z.string()).optional(),
    twitterCard: z.record(z.string()).optional()
  }).optional(),
  // Main-content extraction copied from the content processor's `readability` output.
  readability: z.object({
    title: z.string().nullable(),
    content: z.string(),
    textContent: z.string(),
    length: z.number(),
    excerpt: z.string().nullable(),
    byline: z.string().nullable(),
    siteName: z.string().nullable(),
    lang: z.string().nullable()
  }).optional(),
  // Copied from the content processor's `readability_score` output.
  readabilityScore: z.object({
    score: z.number(),
    level: z.string(),
    sentences: z.number(),
    words: z.number(),
    characters: z.number(),
    avgWordsPerSentence: z.number(),
    avgCharsPerWord: z.number()
  }).optional(),
  // Copied from the content processor's `structured_data` output.
  structuredData: z.object({
    jsonLd: z.array(z.any()),
    microdata: z.array(z.any()),
    schemaOrg: z.array(z.any())
  }).optional(),
  images: z.array(z.object({
    src: z.string(),
    alt: z.string().nullable(),
    title: z.string().nullable(),
    width: z.string().nullable(),
    height: z.string().nullable()
  })).optional(),
  // Present only when `assessContentQuality` is enabled.
  qualityAssessment: z.object({
    isValid: z.boolean(),
    score: z.number(),
    reasons: z.array(z.string()),
    metrics: z.record(z.any())
  }).optional(),
  extractedAt: z.string(),
  processingTime: z.number(),
  success: z.boolean(),
  // Set only on failure results.
  error: z.string().optional()
});
97
+
98
/**
 * MCP tool that fetches a web page (via plain HTTP or a rendering browser),
 * runs it through the ContentProcessor and returns the main content together
 * with optional metadata, structured data, images, readability score and a
 * content-quality assessment.
 */
export class ExtractContentTool {
  constructor() {
    this.contentProcessor = new ContentProcessor();
    this.browserProcessor = new BrowserProcessor();
  }

  /**
   * Get tool definition for MCP server
   * @returns {Object} Tool definition (name, description, zod input schema)
   */
  getDefinition() {
    return {
      name: 'extract_content',
      description: 'Extract and analyze main content from web pages with enhanced readability detection, structured data extraction, and content quality assessment.',
      inputSchema: ExtractContentSchema
    };
  }

  /**
   * Execute content extraction.
   *
   * Never rejects: every failure (invalid params, network error, timeout,
   * processing error) is reported as a result object with `success: false`
   * and an `error` message.
   *
   * @param {Object} params - Extraction parameters (`url` and optional `options`)
   * @returns {Promise<Object>} Extraction result
   */
  async execute(params) {
    const startTime = Date.now();

    try {
      const { url, options } = ExtractContentSchema.parse(params);

      const result = {
        url,
        extractedAt: new Date().toISOString(),
        success: false,
        processingTime: 0
      };

      // Step 1: Fetch content (with or without JavaScript rendering)
      const page = await this._fetchPage(url, options);
      result.title = page.title;

      // Step 2: Process content with ContentProcessor
      const processingResult = await this.contentProcessor.processContent({
        html: page.html,
        url,
        options: {
          extractStructuredData: options.extractStructuredData,
          calculateReadabilityScore: options.calculateReadabilityScore,
          removeBoilerplate: options.useReadability,
          preserveImageInfo: options.preserveImageInfo,
          extractMetadata: options.extractMetadata
        }
      });

      // Step 3: Extract and format content
      if (processingResult.readability) {
        result.readability = processingResult.readability;
      }
      result.content = this._buildContent(page.html, processingResult, options);

      // Steps 4-7: copy optional processor outputs onto the result
      if (processingResult.readability_score) {
        result.readabilityScore = processingResult.readability_score;
      }
      if (processingResult.metadata) {
        result.metadata = processingResult.metadata;
      }
      if (processingResult.structured_data) {
        result.structuredData = processingResult.structured_data;
      }
      if (processingResult.images) {
        result.images = processingResult.images;
      }

      // Step 8: Assess content quality
      if (options.assessContentQuality) {
        result.qualityAssessment = ContentQualityAssessor.assessContentQuality(
          result.content.text,
          { minLength: options.minContentLength }
        );
      }

      result.processingTime = Date.now() - startTime;
      result.success = true;

      return result;
    } catch (error) {
      return {
        // `params` itself may be null/undefined; reading `.url` directly would
        // throw from inside the catch block and reject the promise.
        url: params?.url || 'unknown',
        extractedAt: new Date().toISOString(),
        success: false,
        error: `Content extraction failed: ${error?.message ?? String(error)}`,
        processingTime: Date.now() - startTime,
        content: { text: '' }
      };
    }
  }

  /**
   * Download the page, either through the browser processor or a plain fetch.
   * @param {string} url - Page URL
   * @param {Object} options - Validated extraction options
   * @returns {Promise<{html: string, title: (string|null|undefined)}>} Raw HTML and page title
   * @throws {Error} When browser processing fails, the HTTP status is not OK,
   *   or the request exceeds the 15 s timeout
   */
  async _fetchPage(url, options) {
    const fetchTimeoutMs = 15000;

    // An explicit true/false from the caller wins; the URL heuristic is only
    // consulted when the option was left unset.
    const needsBrowser = options.requiresJavaScript ?? await this.shouldUseJavaScript(url);

    if (needsBrowser) {
      // NOTE(review): if this server speaks MCP over stdio, writing to stdout may
      // interfere with the protocol stream; consider console.error. Confirm transport.
      console.log('Using browser rendering for JavaScript content...');
      const browserResult = await this.browserProcessor.processURL({
        url,
        options: {
          waitForSelector: options.waitForSelector,
          waitForTimeout: options.waitForTimeout,
          enableJavaScript: true,
          enableImages: options.preserveImageInfo,
          captureScreenshot: false
        }
      });

      if (!browserResult.success) {
        throw new Error(`Browser processing failed: ${browserResult.error}`);
      }

      return { html: browserResult.html, title: browserResult.title };
    }

    // Simple HTTP fetch. The standard fetch API has no `timeout` option, so the
    // deadline is enforced through an abort signal.
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
      },
      signal: AbortSignal.timeout(fetchTimeoutMs)
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    return { html, title: this.extractTitleFromHTML(html) };
  }

  /**
   * Build the `content` part of the result from the processor output.
   * Preference order: readability output, then `fallback_content`, then a
   * plain text extraction of the raw HTML.
   * @param {string} html - Raw page HTML
   * @param {Object} processingResult - Output of ContentProcessor.processContent
   * @param {Object} options - Validated extraction options
   * @returns {Object} Content object with `text` and optional `markdown`, `html`, `cleanedHTML`
   */
  _buildContent(html, processingResult, options) {
    const { readability, fallback_content: fallbackContent } = processingResult;
    let content;

    if (readability) {
      content = { text: readability.textContent || readability.content };

      // Convert to markdown if requested
      if (options.outputFormat === 'markdown') {
        content.markdown = this.convertToMarkdown(readability.content);
      }
    } else if (fallbackContent) {
      content = { text: fallbackContent.content };
    } else {
      // Last resort: extract text from HTML
      content = {
        text: HTMLCleaner.extractTextWithFormatting(html, {
          preserveLineBreaks: true,
          preserveParagraphs: true,
          includeLinks: false,
          includeImageAlt: true
        })
      };
    }

    // Include HTML if requested
    if (options.includeRawHTML) {
      content.html = html;
    }
    if (options.includeCleanedHTML && readability) {
      content.cleanedHTML = readability.content;
    }

    return content;
  }

  /**
   * Determine if JavaScript rendering is needed, using URL-only heuristics:
   * an app-like path segment, a `#` anywhere in the URL, or a script file
   * extension at the end of the URL.
   * @param {string} url - URL to analyze
   * @returns {Promise<boolean>} - Whether JavaScript is needed
   */
  async shouldUseJavaScript(url) {
    const jsIndicators = [
      /\/(app|spa|dashboard|admin)/,
      /#/,
      /\.(js|jsx|ts|tsx)$/
    ];

    return jsIndicators.some(pattern => pattern.test(url));
  }

  /**
   * Extract title from HTML using simple parsing
   * @param {string} html - HTML content
   * @returns {string|null} - Trimmed text of the first `<title>` element, or
   *   null when there is none (or the input is not a string)
   */
  extractTitleFromHTML(html) {
    if (typeof html !== 'string') {
      return null;
    }
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    return titleMatch ? titleMatch[1].trim() : null;
  }

  /**
   * Convert HTML content to Markdown.
   *
   * Regex-based and intentionally simple: handles headings, paragraphs,
   * strong/em, links and line breaks; every other tag is dropped. Element
   * bodies may span several lines, and tag names are matched on a word
   * boundary so that e.g. `<pre>` is not mistaken for `<p>`.
   * @param {string} html - HTML content
   * @returns {string} - Markdown content ('' for empty or non-string input)
   */
  convertToMarkdown(html) {
    if (typeof html !== 'string' || html.length === 0) {
      return '';
    }

    return html
      // `\1` pairs the closing tag with the opening level; heading text is
      // collapsed onto one line so the `#` prefix stays attached to it.
      .replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (match, level, text) => {
        const hashes = '#'.repeat(Number(level));
        return `\n${hashes} ${text.replace(/\s+/g, ' ').trim()}\n`;
      })
      .replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '\n$1\n')
      .replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**')
      .replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, '*$1*')
      // Accept both double- and single-quoted href values.
      .replace(/<a\b[^>]*?\shref\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, '[$3]($2)')
      .replace(/<br\b[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '') // Remove remaining HTML tags
      .replace(/\n{3,}/g, '\n\n') // Normalize line breaks
      .trim();
  }

  /**
   * Clean up resources held by the browser processor.
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.browserProcessor) {
      await this.browserProcessor.cleanup();
    }
  }
}
328
+
329
+ export default ExtractContentTool;