crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,505 @@
1
+ /**
2
+ * ContentProcessor - Enhanced content extraction with Mozilla Readability
3
+ * Provides main content detection, boilerplate removal, and structured data extraction
4
+ */
5
+
6
+ import { Readability } from '@mozilla/readability';
7
+ import { JSDOM } from 'jsdom';
8
+ import * as cheerio from 'cheerio';
9
+ import { z } from 'zod';
10
+
11
/**
 * Validation schema for the argument of ContentProcessor#processContent.
 *
 * - `html` is a required string.
 * - `url` is an optional string that must be a valid URL.
 * - `options` is an optional object of boolean feature flags; each flag
 *   defaults to `true`, and the object itself defaults to `{}` when omitted.
 */
const ContentProcessorSchema = (() => {
  // Every feature flag is a boolean that is switched on unless stated otherwise.
  const enabledFlag = () => z.boolean().default(true);

  const optionsSchema = z.object({
    extractStructuredData: enabledFlag(),
    calculateReadabilityScore: enabledFlag(),
    removeBoilerplate: enabledFlag(),
    preserveImageInfo: enabledFlag(),
    extractMetadata: enabledFlag()
  });

  return z.object({
    html: z.string(),
    url: z.string().url().optional(),
    options: optionsSchema.optional().default({})
  });
})();
22
+
23
/**
 * Schema describing the article fields copied into `result.readability`:
 * `content`, `textContent` and `length` are required, every other field is a
 * string that may be `null`.
 *
 * NOTE(review): not referenced by the code visible in this file.
 */
const ReadabilityResult = (() => {
  // Fields that may be absent from an article are modelled as string-or-null.
  const nullableText = () => z.string().nullable();

  return z.object({
    title: nullableText(),
    content: z.string(),
    textContent: z.string(),
    length: z.number(),
    excerpt: nullableText(),
    byline: nullableText(),
    dir: nullableText(),
    siteName: nullableText(),
    lang: nullableText()
  });
})();
34
+
35
/**
 * ContentProcessor - turns an HTML string into a structured result containing
 * the readable article (via Mozilla Readability), a readability score,
 * structured data (JSON-LD, microdata, RDFa-style attributes), page metadata
 * and image information, with a selector-based fallback when Readability
 * yields nothing.
 */
export class ContentProcessor {
  constructor() {
    // Used as the base of the option merge in processContent().
    this.defaultOptions = {
      extractStructuredData: true,
      calculateReadabilityScore: true,
      removeBoilerplate: true,
      preserveImageInfo: true,
      extractMetadata: true
    };
  }

  /**
   * Process HTML content with enhanced extraction capabilities
   * @param {Object} params - Processing parameters
   * @param {string} params.html - HTML content to process
   * @param {string} [params.url] - Source URL (optional)
   * @param {Object} [params.options] - Processing options (boolean flags, see defaultOptions)
   * @returns {Promise<Object>} - Processed content with metadata. Always holds
   *   `url`, `processed_at` and `processing_options`; the keys `readability`,
   *   `readability_score`, `structured_data`, `metadata`, `images` and
   *   `fallback_content` are present only when the matching step ran.
   * @throws {Error} "Content processing failed: ..." wrapping any validation or
   *   extraction error; the original error is attached as `cause`.
   */
  async processContent(params) {
    try {
      const validated = ContentProcessorSchema.parse(params);
      const { html, url, options } = validated;
      const processingOptions = { ...this.defaultOptions, ...options };

      const result = {
        url,
        processed_at: new Date().toISOString(),
        processing_options: processingOptions
      };

      // Extract main content using Mozilla Readability
      if (processingOptions.removeBoilerplate) {
        const article = this.parseReadableArticle(html, url);
        if (article) {
          result.readability = {
            title: article.title,
            content: article.content,
            textContent: article.textContent,
            length: article.length,
            excerpt: article.excerpt,
            byline: article.byline,
            dir: article.dir,
            siteName: article.siteName,
            lang: article.lang
          };

          // Calculate readability score
          if (processingOptions.calculateReadabilityScore) {
            result.readability_score = this.calculateReadabilityScore(article.textContent);
          }
        }
      }

      // Extract structured data
      if (processingOptions.extractStructuredData) {
        result.structured_data = this.extractStructuredData(html);
      }

      // Extract additional metadata
      if (processingOptions.extractMetadata) {
        result.metadata = this.extractMetadata(html);
      }

      // Preserve image information
      if (processingOptions.preserveImageInfo) {
        result.images = this.extractImageInfo(html);
      }

      // Fallback content extraction if Readability was skipped or found nothing
      if (!result.readability) {
        result.fallback_content = this.extractFallbackContent(html);
      }

      return result;
    } catch (error) {
      // Keep the historical message format, but preserve the original error.
      throw new Error(`Content processing failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Run Mozilla Readability over a throw-away JSDOM document.
   * The JSDOM window is always closed afterwards so the DOM can be released.
   * @param {string} html - HTML content
   * @param {string} [url] - Source URL handed to JSDOM as the document URL
   * @returns {Object|null} - Whatever Readability's parse() returns (null when no article is found)
   */
  parseReadableArticle(html, url) {
    const dom = new JSDOM(html, { url });
    try {
      const reader = new Readability(dom.window.document, {
        debug: false,
        maxElemsToDivide: 300,
        charThreshold: 500
      });
      return reader.parse();
    } finally {
      dom.window.close();
    }
  }

  /**
   * Extract structured data from HTML (JSON-LD, microdata, schema.org)
   * @param {string} html - HTML content
   * @returns {Object} - `{ jsonLd: [], microdata: [], schemaOrg: [] }`; on an
   *   unexpected error a warning is logged and whatever was collected so far is returned
   */
  extractStructuredData(html) {
    const $ = cheerio.load(html);
    const structuredData = {
      jsonLd: [],
      microdata: [],
      schemaOrg: []
    };

    try {
      // Extract JSON-LD
      $('script[type="application/ld+json"]').each((_, element) => {
        try {
          const jsonText = $(element).html();
          if (jsonText) {
            structuredData.jsonLd.push(JSON.parse(jsonText));
          }
        } catch (err) {
          // Best effort: one malformed JSON-LD script must not discard the others.
        }
      });

      // Extract microdata (items without an itemtype are skipped)
      $('[itemscope]').each((_, element) => {
        const item = this.extractMicrodataItem($, element);
        if (item) {
          structuredData.microdata.push(item);
        }
      });

      // Extract schema.org markup expressed through typeof/property/vocab attributes
      $('[typeof], [property], [vocab]').each((_, element) => {
        const schemaItem = this.extractSchemaOrgItem($, element);
        if (schemaItem) {
          structuredData.schemaOrg.push(schemaItem);
        }
      });
    } catch (error) {
      console.warn('Error extracting structured data:', error.message);
    }

    return structuredData;
  }

  /**
   * Extract microdata item from element
   * @param {Object} $ - Cheerio instance
   * @param {Object} element - DOM element carrying `itemscope`
   * @returns {Object|null} - `{ type, properties, id? }`, or null when the
   *   element has no `itemtype`. Every property maps to an array of values.
   */
  extractMicrodataItem($, element) {
    const $element = $(element);
    const itemType = $element.attr('itemtype');
    const itemId = $element.attr('itemid');

    if (!itemType) return null;

    const item = {
      type: itemType,
      properties: {}
    };

    if (itemId) {
      item.id = itemId;
    }

    const itemNode = $element.get(0);

    // Extract properties
    $element.find('[itemprop]').each((_, propElement) => {
      const $prop = $(propElement);

      // A property belongs to its nearest enclosing item. Without this check
      // the properties of nested items would also be flattened into this one.
      const owner = $prop.parent().closest('[itemscope]').get(0);
      if (owner !== itemNode) return;

      const propName = $prop.attr('itemprop');
      const propValue = this.extractMicrodataValue($, propElement);

      if (propName && propValue !== null) {
        if (!item.properties[propName]) {
          item.properties[propName] = [];
        }
        item.properties[propName].push(propValue);
      }
    });

    return item;
  }

  /**
   * Extract microdata property value
   * @param {Object} $ - Cheerio instance
   * @param {Object} element - DOM element carrying `itemprop`
   * @returns {string|Object|null} - Nested item object, attribute/text value, or null when empty
   */
  extractMicrodataValue($, element) {
    const $element = $(element);
    const tagName = $element.get(0).tagName.toLowerCase();

    // Check for nested itemscope. `itemscope` is normally written as a bare
    // attribute whose value is the empty string, so test for presence rather
    // than truthiness.
    if ($element.attr('itemscope') !== undefined) {
      const nestedItem = this.extractMicrodataItem($, element);
      if (nestedItem) {
        return nestedItem;
      }
      // Untyped nested item: fall through and use the element's own value.
    }

    // Extract value based on element type
    switch (tagName) {
      case 'meta':
        return $element.attr('content') || null;
      case 'a':
      case 'area':
      case 'link':
        return $element.attr('href') || null;
      case 'img':
      case 'audio':
      case 'embed':
      case 'iframe':
      case 'source':
      case 'track':
      case 'video':
        return $element.attr('src') || null;
      case 'object':
        return $element.attr('data') || null;
      case 'time':
        return $element.attr('datetime') || $element.text().trim() || null;
      default:
        return $element.text().trim() || null;
    }
  }

  /**
   * Extract schema.org item from element
   * @param {Object} $ - Cheerio instance
   * @param {Object} element - DOM element
   * @returns {Object|null} - Object holding whichever of `typeof`, `property`,
   *   `vocab` and `content` are present, or null when none of the three
   *   attributes is set
   */
  extractSchemaOrgItem($, element) {
    const $element = $(element);
    const typeOf = $element.attr('typeof');
    const property = $element.attr('property');
    const vocab = $element.attr('vocab');

    if (!typeOf && !property && !vocab) return null;

    const item = {};

    if (typeOf) item.typeof = typeOf;
    if (property) item.property = property;
    if (vocab) item.vocab = vocab;

    // Prefer the explicit content attribute, otherwise use the element text.
    const content = $element.attr('content') || $element.text().trim();
    if (content) {
      item.content = content;
    }

    return item;
  }

  /**
   * Extract metadata from HTML
   * @param {string} html - HTML content
   * @returns {Object} - title, description, keywords (array), author,
   *   published, modified, openGraph, twitterCard, canonical and language;
   *   fields that are not found stay null (or `{}` for the two maps)
   */
  extractMetadata(html) {
    const $ = cheerio.load(html);
    const metadata = {
      title: $('title').text().trim() || null,
      description: null,
      keywords: null,
      author: null,
      published: null,
      modified: null,
      openGraph: {},
      twitterCard: {},
      canonical: null,
      language: null
    };

    // Basic meta tags
    $('meta').each((_, element) => {
      const $meta = $(element);
      const name = $meta.attr('name') || $meta.attr('property') || $meta.attr('http-equiv');
      const content = $meta.attr('content');

      if (!name || !content) return;

      const nameLower = name.toLowerCase();

      // Standard meta tags
      if (nameLower === 'description') {
        metadata.description = content;
      } else if (nameLower === 'keywords') {
        metadata.keywords = content.split(',').map((keyword) => keyword.trim());
      } else if (nameLower === 'author') {
        metadata.author = content;
      } else if (nameLower.includes('modified') || nameLower.includes('updated')) {
        // Must be tested before the 'date' check: "updated" contains "date".
        metadata.modified = content;
      } else if (nameLower.includes('published') || nameLower.includes('date')) {
        metadata.published = content;
      }

      // Open Graph
      if (nameLower.startsWith('og:')) {
        metadata.openGraph[name.substring(3)] = content;
      }

      // Twitter Cards
      if (nameLower.startsWith('twitter:')) {
        metadata.twitterCard[name.substring(8)] = content;
      }
    });

    // Canonical URL
    const canonical = $('link[rel="canonical"]').attr('href');
    if (canonical) {
      metadata.canonical = canonical;
    }

    // Language
    const htmlLang = $('html').attr('lang');
    if (htmlLang) {
      metadata.language = htmlLang;
    }

    return metadata;
  }

  /**
   * Extract image information from HTML
   * @param {string} html - HTML content
   * @returns {Array} - One entry per `<img>` that has a `src`, with alt, title,
   *   width, height, loading and srcset (null when the attribute is missing)
   */
  extractImageInfo(html) {
    const $ = cheerio.load(html);
    const images = [];

    $('img').each((_, element) => {
      const $img = $(element);
      const imageInfo = {
        src: $img.attr('src'),
        alt: $img.attr('alt') || null,
        title: $img.attr('title') || null,
        width: $img.attr('width') || null,
        height: $img.attr('height') || null,
        loading: $img.attr('loading') || null,
        srcset: $img.attr('srcset') || null
      };

      if (imageInfo.src) {
        images.push(imageInfo);
      }
    });

    return images;
  }

  /**
   * Estimate the number of syllables in a word with a vowel-group heuristic.
   * Only the letters a-z are considered; any token counts as at least one syllable.
   * @param {string} word - Single word (may carry punctuation)
   * @returns {number} - Estimated syllable count (>= 1)
   */
  estimateSyllables(word) {
    const letters = String(word).toLowerCase().replace(/[^a-z]/g, '');
    if (letters.length <= 3) {
      return 1;
    }

    // Drop common silent endings and a leading "y" before counting vowel groups.
    const trimmed = letters
      .replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
      .replace(/^y/, '');
    const vowelGroups = trimmed.match(/[aeiouy]{1,2}/g);

    return vowelGroups ? vowelGroups.length : 1;
  }

  /**
   * Calculate readability metrics, including the Flesch Reading Ease score:
   *   206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
   * Higher scores mean easier text, matching getReadabilityLevel().
   * Syllables are estimated heuristically, so the score is approximate.
   * @param {string} text - Text content
   * @returns {Object|null} - Readability metrics, or null when the text is not
   *   a non-empty string or contains no sentences/words
   */
  calculateReadabilityScore(text) {
    if (!text || typeof text !== 'string') {
      return null;
    }

    const sentences = text.split(/[.!?]+/).filter((sentence) => sentence.trim().length > 0);
    const words = text.split(/\s+/).filter((word) => word.length > 0);
    const characters = text.length;
    const charactersNoSpaces = text.replace(/\s/g, '').length;

    if (sentences.length === 0 || words.length === 0) {
      return null;
    }

    const avgWordsPerSentence = words.length / sentences.length;
    const avgCharsPerWord = charactersNoSpaces / words.length;

    const syllables = words.reduce((total, word) => total + this.estimateSyllables(word), 0);
    const avgSyllablesPerWord = syllables / words.length;

    // Flesch Reading Ease (higher is easier)
    const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);

    return {
      sentences: sentences.length,
      words: words.length,
      characters,
      charactersNoSpaces,
      avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
      avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
      readabilityScore: Math.round(readabilityScore * 100) / 100,
      readabilityLevel: this.getReadabilityLevel(readabilityScore)
    };
  }

  /**
   * Get readability level based on score
   * @param {number} score - Readability score (higher is easier)
   * @returns {string} - Readability level, from 'Very Easy' (>= 90) down to
   *   'Very Difficult' (< 30)
   */
  getReadabilityLevel(score) {
    if (score >= 90) return 'Very Easy';
    if (score >= 80) return 'Easy';
    if (score >= 70) return 'Fairly Easy';
    if (score >= 60) return 'Standard';
    if (score >= 50) return 'Fairly Difficult';
    if (score >= 30) return 'Difficult';
    return 'Very Difficult';
  }

  /**
   * Extract fallback content when Readability fails
   * @param {string} html - HTML content
   * @returns {Object} - `{ content, title, headings, length }` built from the
   *   first content container that has text, or from `<body>` otherwise
   */
  extractFallbackContent(html) {
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $('script, style, nav, header, footer, aside, .advertisement, .ads, .social-share').remove();

    // Find main content candidates, most specific first
    const contentSelectors = [
      'main',
      'article',
      '[role="main"]',
      '.main-content',
      '.content',
      '.post-content',
      '.entry-content',
      '#content',
      '#main'
    ];

    let mainContent = null;
    for (const selector of contentSelectors) {
      const candidate = $(selector).first();
      if (candidate.length === 0) continue;

      // An empty container should not stop the search for a better candidate.
      const candidateText = candidate.text().trim();
      if (candidateText) {
        mainContent = candidateText;
        break;
      }
    }

    // Fallback to body content
    if (!mainContent) {
      mainContent = $('body').text().trim();
    }

    return {
      content: mainContent,
      title: $('title').text().trim() || null,
      headings: this.extractHeadings($),
      length: mainContent ? mainContent.length : 0
    };
  }

  /**
   * Extract headings from content
   * @param {Object} $ - Cheerio instance
   * @returns {Array} - `{ level, text, id }` for every h1-h6 with non-empty text, in document order
   */
  extractHeadings($) {
    const headings = [];
    $('h1, h2, h3, h4, h5, h6').each((_, element) => {
      const $heading = $(element);
      // Tag names look like "h2"; the digit after the "h" is the level.
      const level = Number.parseInt(element.tagName.substring(1), 10);
      const text = $heading.text().trim();

      if (text) {
        headings.push({
          level,
          text,
          id: $heading.attr('id') || null
        });
      }
    });
    return headings;
  }
}
504
+
505
+ export default ContentProcessor;