crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract Content MCP Tool
|
|
3
|
+
* Enhanced content extraction with main content detection and readability features
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
|
|
8
|
+
import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
|
|
9
|
+
import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
|
|
10
|
+
|
|
11
|
+
// Shape of the optional `options` object accepted by the extract_content tool.
// Kept as a plain object so the field list can be read on its own; key order
// matches the order in which the parsed options object is populated.
const extractContentOptionsShape = {
  // --- Content extraction ---
  useReadability: z.boolean().default(true),
  extractStructuredData: z.boolean().default(true),
  calculateReadabilityScore: z.boolean().default(true),
  preserveImageInfo: z.boolean().default(true),
  extractMetadata: z.boolean().default(true),

  // --- Browser rendering ---
  // No default on requiresJavaScript: it may be left undefined by the caller.
  requiresJavaScript: z.boolean().optional(),
  waitForSelector: z.string().optional(),
  // Bounded to 0..30000 (presumably milliseconds — confirm against BrowserProcessor).
  waitForTimeout: z.number().min(0).max(30000).default(5000),

  // --- Quality assessment ---
  assessContentQuality: z.boolean().default(true),
  minContentLength: z.number().min(0).default(100),

  // --- Output ---
  includeRawHTML: z.boolean().default(false),
  includeCleanedHTML: z.boolean().default(false),
  outputFormat: z.enum(['text', 'markdown', 'structured']).default('structured')
};

/**
 * Input schema for the extract_content tool: a required absolute `url` plus an
 * optional `options` object that falls back to `{}` when omitted.
 */
const ExtractContentSchema = z.object({
  url: z.string().url(),
  options: z.object(extractContentOptionsShape).optional().default({})
});
|
|
36
|
+
|
|
37
|
+
// Leaf schemas reused by several sections of the result shape below.
const resultNullableString = z.string().nullable();
const resultAnyArray = z.array(z.any());

// Extracted body in its available representations; only `text` is required.
const resultContentSection = z.object({
  text: z.string(),
  html: z.string().optional(),
  markdown: z.string().optional()
});

// Page-level metadata (document head information).
const resultMetadataSection = z.object({
  title: resultNullableString,
  description: resultNullableString,
  author: resultNullableString,
  published: resultNullableString,
  language: resultNullableString,
  canonical: resultNullableString,
  openGraph: z.record(z.string()).optional(),
  twitterCard: z.record(z.string()).optional()
});

// Main-article extraction output.
const resultReadabilitySection = z.object({
  title: resultNullableString,
  content: z.string(),
  textContent: z.string(),
  length: z.number(),
  excerpt: resultNullableString,
  byline: resultNullableString,
  siteName: resultNullableString,
  lang: resultNullableString
});

// Readability scoring statistics.
const resultReadabilityScoreSection = z.object({
  score: z.number(),
  level: z.string(),
  sentences: z.number(),
  words: z.number(),
  characters: z.number(),
  avgWordsPerSentence: z.number(),
  avgCharsPerWord: z.number()
});

// Structured data buckets; item shapes are intentionally unconstrained.
const resultStructuredDataSection = z.object({
  jsonLd: resultAnyArray,
  microdata: resultAnyArray,
  schemaOrg: resultAnyArray
});

// One entry per image found in the content.
const resultImageEntry = z.object({
  src: z.string(),
  alt: resultNullableString,
  title: resultNullableString,
  width: resultNullableString,
  height: resultNullableString
});

// Outcome of the content quality check.
const resultQualitySection = z.object({
  isValid: z.boolean(),
  score: z.number(),
  reasons: z.array(z.string()),
  metrics: z.record(z.any())
});

/**
 * Shape of the object produced by the extract_content tool.
 * NOTE(review): nothing in this file parses a result against this schema; it
 * appears to serve as documentation of the output contract.
 */
const ExtractContentResult = z.object({
  url: z.string(),
  title: resultNullableString,
  content: resultContentSection,
  metadata: resultMetadataSection.optional(),
  readability: resultReadabilitySection.optional(),
  readabilityScore: resultReadabilityScoreSection.optional(),
  structuredData: resultStructuredDataSection.optional(),
  images: z.array(resultImageEntry).optional(),
  qualityAssessment: resultQualitySection.optional(),
  extractedAt: z.string(),
  processingTime: z.number(),
  success: z.boolean(),
  error: z.string().optional()
});
|
|
97
|
+
|
|
98
|
+
/**
 * MCP tool that fetches a page (plain HTTP or browser-rendered), runs it
 * through ContentProcessor and returns the extracted content together with
 * optional metadata, structured data, images and a quality assessment.
 */
export class ExtractContentTool {
  /**
   * Create the tool with its own ContentProcessor and BrowserProcessor.
   */
  constructor() {
    this.contentProcessor = new ContentProcessor();
    this.browserProcessor = new BrowserProcessor();
  }

  /**
   * Get tool definition for MCP server
   * @returns {Object} Tool definition (name, description, inputSchema)
   */
  getDefinition() {
    return {
      name: 'extract_content',
      description: 'Extract and analyze main content from web pages with enhanced readability detection, structured data extraction, and content quality assessment.',
      inputSchema: ExtractContentSchema
    };
  }

  /**
   * Execute content extraction.
   *
   * Never rejects: any failure (invalid params, network error, processing
   * error) is reported through a result object with `success: false` and an
   * `error` message.
   *
   * @param {Object} params - Extraction parameters, validated against ExtractContentSchema
   * @returns {Promise<Object>} Extraction result
   */
  async execute(params) {
    const startTime = Date.now();
    // Upper bound for the plain HTTP fetch, in milliseconds.
    const FETCH_TIMEOUT_MS = 15000;

    try {
      const validated = ExtractContentSchema.parse(params);
      const { url, options } = validated;

      const result = {
        url,
        extractedAt: new Date().toISOString(),
        success: false,
        processingTime: 0
      };

      // Step 1: Fetch content (with or without JavaScript rendering)
      let html;
      let pageTitle;
      // An explicit true/false from the caller wins; the URL heuristic is only
      // consulted when the option was left out.
      const shouldUseJavaScript =
        options.requiresJavaScript ?? (await this.shouldUseJavaScript(url));

      if (shouldUseJavaScript) {
        // Diagnostics go to stderr: if this server runs over the MCP stdio
        // transport (TODO confirm against the server entry point), stdout is
        // reserved for protocol messages.
        console.error('Using browser rendering for JavaScript content...');
        const browserResult = await this.browserProcessor.processURL({
          url,
          options: {
            waitForSelector: options.waitForSelector,
            waitForTimeout: options.waitForTimeout,
            enableJavaScript: true,
            enableImages: options.preserveImageInfo,
            captureScreenshot: false
          }
        });

        if (!browserResult.success) {
          throw new Error(`Browser processing failed: ${browserResult.error}`);
        }

        html = browserResult.html;
        pageTitle = browserResult.title;
      } else {
        // Simple HTTP fetch. The standard fetch API has no `timeout` option,
        // so the deadline is enforced with an abort signal instead.
        const response = await fetch(url, {
          headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
          },
          signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
        });

        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }

        html = await response.text();
        pageTitle = this.extractTitleFromHTML(html);
      }

      result.title = pageTitle;

      // Step 2: Process content with ContentProcessor
      const processingResult = await this.contentProcessor.processContent({
        html,
        url,
        options: {
          extractStructuredData: options.extractStructuredData,
          calculateReadabilityScore: options.calculateReadabilityScore,
          removeBoilerplate: options.useReadability,
          preserveImageInfo: options.preserveImageInfo,
          extractMetadata: options.extractMetadata
        }
      });

      // Step 3: Extract and format content. Preference order: readability
      // output, then the processor's fallback content, then a raw text strip.
      if (processingResult.readability) {
        result.readability = processingResult.readability;
        result.content = {
          text: processingResult.readability.textContent || processingResult.readability.content
        };

        // Convert to markdown if requested
        if (options.outputFormat === 'markdown') {
          result.content.markdown = this.convertToMarkdown(processingResult.readability.content);
        }
      } else if (processingResult.fallback_content) {
        result.content = {
          text: processingResult.fallback_content.content
        };
      } else {
        // Last resort: extract text from HTML
        result.content = {
          text: HTMLCleaner.extractTextWithFormatting(html, {
            preserveLineBreaks: true,
            preserveParagraphs: true,
            includeLinks: false,
            includeImageAlt: true
          })
        };
      }

      // Include HTML if requested
      if (options.includeRawHTML) {
        result.content.html = html;
      }
      if (options.includeCleanedHTML && processingResult.readability) {
        result.content.cleanedHTML = processingResult.readability.content;
      }

      // Step 4: Add readability score
      if (processingResult.readability_score) {
        result.readabilityScore = processingResult.readability_score;
      }

      // Step 5: Add metadata
      if (processingResult.metadata) {
        result.metadata = processingResult.metadata;
      }

      // Step 6: Add structured data
      if (processingResult.structured_data) {
        result.structuredData = processingResult.structured_data;
      }

      // Step 7: Add image information
      if (processingResult.images) {
        result.images = processingResult.images;
      }

      // Step 8: Assess content quality
      if (options.assessContentQuality) {
        result.qualityAssessment = ContentQualityAssessor.assessContentQuality(
          result.content.text,
          { minLength: options.minContentLength }
        );
      }

      result.processingTime = Date.now() - startTime;
      result.success = true;

      return result;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        // `params` itself may be null/undefined when validation is what failed.
        url: params?.url || 'unknown',
        extractedAt: new Date().toISOString(),
        success: false,
        error: `Content extraction failed: ${reason}`,
        processingTime: Date.now() - startTime,
        content: { text: '' }
      };
    }
  }

  /**
   * Determine if JavaScript rendering is needed, using URL-only heuristics.
   *
   * NOTE(review): the patterns are unanchored and tested against the whole URL
   * string, so they also match e.g. a host that starts with "app" right after
   * "//", a path such as "/apple", or any URL carrying a "#fragment".
   *
   * @param {string} url - URL to analyze
   * @returns {Promise<boolean>} - Whether JavaScript is needed
   */
  async shouldUseJavaScript(url) {
    // Simple heuristics for determining if JavaScript is needed
    const jsIndicators = [
      /\/(app|spa|dashboard|admin)/,
      /#/,
      /\.(js|jsx|ts|tsx)$/
    ];

    return jsIndicators.some((pattern) => pattern.test(url));
  }

  /**
   * Extract title from HTML using simple parsing
   * @param {string} html - HTML content
   * @returns {string|null} - Trimmed title, or null when absent or blank
   */
  extractTitleFromHTML(html) {
    if (typeof html !== 'string') {
      return null;
    }
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    if (!titleMatch) {
      return null;
    }
    // A whitespace-only <title> carries no information; report it as missing.
    return titleMatch[1].trim() || null;
  }

  /**
   * Convert HTML content to Markdown.
   *
   * Regex-based and intentionally small: handles headings, paragraphs,
   * strong/em, links and line breaks, then strips every remaining tag.
   * HTML entities are left as they are.
   *
   * @param {string} html - HTML content
   * @returns {string} - Markdown content ('' for non-string or empty input)
   */
  convertToMarkdown(html) {
    if (typeof html !== 'string' || html.length === 0) {
      return '';
    }

    // [\s\S] is used instead of "." so elements spanning several lines are
    // converted too; \b after the tag name keeps <p> from matching <pre>,
    // <em> from matching <embed>, and <a> from matching <area>/<abbr>.
    return html
      .replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (match, level, text) => {
        const hashes = '#'.repeat(Number.parseInt(level, 10));
        // A Markdown heading must stay on one line.
        const headingText = text.replace(/\s+/g, ' ').trim();
        return `\n${hashes} ${headingText}\n`;
      })
      .replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '\n$1\n')
      .replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**')
      .replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, '*$1*')
      .replace(
        /<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
        (match, doubleQuoted, singleQuoted, label) => `[${label}](${doubleQuoted ?? singleQuoted})`
      )
      .replace(/<br\b[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '') // Remove remaining HTML tags
      .replace(/\n{3,}/g, '\n\n') // Normalize line breaks
      .trim();
  }

  /**
   * Clean up resources held by the browser processor.
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.browserProcessor) {
      await this.browserProcessor.cleanup();
    }
  }
}
|
|
328
|
+
|
|
329
|
+
// Default export of the same class that is also exported by name.
export default ExtractContentTool;
|