npm - crawlforge-mcp-server - Versions diffs - 3.0.0 - Mend

crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/CLAUDE.md +315 -0
package/LICENSE +21 -0
package/README.md +181 -0
package/package.json +115 -0
package/server.js +1963 -0
package/setup.js +112 -0
package/src/constants/config.js +615 -0
package/src/core/ActionExecutor.js +1104 -0
package/src/core/AlertNotificationSystem.js +601 -0
package/src/core/AuthManager.js +315 -0
package/src/core/ChangeTracker.js +2306 -0
package/src/core/JobManager.js +687 -0
package/src/core/LLMsTxtAnalyzer.js +753 -0
package/src/core/LocalizationManager.js +1615 -0
package/src/core/PerformanceManager.js +828 -0
package/src/core/ResearchOrchestrator.js +1327 -0
package/src/core/SnapshotManager.js +1037 -0
package/src/core/StealthBrowserManager.js +1795 -0
package/src/core/WebhookDispatcher.js +745 -0
package/src/core/analysis/ContentAnalyzer.js +749 -0
package/src/core/analysis/LinkAnalyzer.js +972 -0
package/src/core/cache/CacheManager.js +821 -0
package/src/core/connections/ConnectionPool.js +553 -0
package/src/core/crawlers/BFSCrawler.js +845 -0
package/src/core/integrations/PerformanceIntegration.js +377 -0
package/src/core/llm/AnthropicProvider.js +135 -0
package/src/core/llm/LLMManager.js +415 -0
package/src/core/llm/LLMProvider.js +97 -0
package/src/core/llm/OpenAIProvider.js +127 -0
package/src/core/processing/BrowserProcessor.js +986 -0
package/src/core/processing/ContentProcessor.js +505 -0
package/src/core/processing/PDFProcessor.js +448 -0
package/src/core/processing/StreamProcessor.js +673 -0
package/src/core/queue/QueueManager.js +98 -0
package/src/core/workers/WorkerPool.js +585 -0
package/src/core/workers/worker.js +743 -0
package/src/monitoring/healthCheck.js +600 -0
package/src/monitoring/metrics.js +761 -0
package/src/optimization/wave3-optimizations.js +932 -0
package/src/security/security-patches.js +120 -0
package/src/security/security-tests.js +355 -0
package/src/security/wave3-security.js +652 -0
package/src/tools/advanced/BatchScrapeTool.js +1089 -0
package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
package/src/tools/crawl/crawlDeep.js +449 -0
package/src/tools/crawl/mapSite.js +400 -0
package/src/tools/extract/analyzeContent.js +624 -0
package/src/tools/extract/extractContent.js +329 -0
package/src/tools/extract/processDocument.js +503 -0
package/src/tools/extract/summarizeContent.js +376 -0
package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
package/src/tools/research/deepResearch.js +706 -0
package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
package/src/tools/search/adapters/googleSearch.js +236 -0
package/src/tools/search/adapters/searchProviderFactory.js +96 -0
package/src/tools/search/queryExpander.js +543 -0
package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
package/src/tools/search/ranking/ResultRanker.js +497 -0
package/src/tools/search/searchWeb.js +482 -0
package/src/tools/tracking/trackChanges.js +1355 -0
package/src/utils/CircuitBreaker.js +515 -0
package/src/utils/ErrorHandlingConfig.js +342 -0
package/src/utils/HumanBehaviorSimulator.js +569 -0
package/src/utils/Logger.js +568 -0
package/src/utils/MemoryMonitor.js +173 -0
package/src/utils/RetryManager.js +386 -0
package/src/utils/contentUtils.js +588 -0
package/src/utils/domainFilter.js +612 -0
package/src/utils/inputValidation.js +766 -0
package/src/utils/rateLimiter.js +196 -0
package/src/utils/robotsChecker.js +91 -0
package/src/utils/securityMiddleware.js +416 -0
package/src/utils/sitemapParser.js +678 -0
package/src/utils/ssrfProtection.js +640 -0
package/src/utils/urlNormalizer.js +168 -0

package/src/utils/contentUtils.js ADDED Viewed

@@ -0,0 +1,588 @@
+/**
+ * Content Processing Utilities
+ * Supporting functions for content extraction, cleaning, and quality assessment
+ */
+import * as cheerio from 'cheerio';
+import { z } from 'zod';
+/**
+ * HTML cleaning utilities
+ */
+export class HTMLCleaner {
+  /**
+   * Clean HTML content by removing unwanted elements and attributes
+   * @param {string} html - HTML content to clean
+   * @param {Object} options - Cleaning options
+   * @returns {string} - Cleaned HTML
+   */
+  static cleanHTML(html, options = {}) {
+    const defaultOptions = {
+      removeScripts: true,
+      removeStyles: true,
+      removeComments: true,
+      removeEmpty: true,
+      allowedTags: ['p', 'div', 'span', 'a', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'strong', 'em', 'b', 'i', 'blockquote', 'code', 'pre'],
+      allowedAttributes: ['href', 'src', 'alt', 'title', 'class', 'id']
+    };
+    const cleaningOptions = { ...defaultOptions, ...options };
+    const $ = cheerio.load(html);
+    // Remove scripts and styles
+    if (cleaningOptions.removeScripts) {
+      $('script, noscript').remove();
+    }
+    if (cleaningOptions.removeStyles) {
+      $('style, link[rel="stylesheet"]').remove();
+    }
+    // Remove comments
+    if (cleaningOptions.removeComments) {
+      $('*').contents().filter((_, node) => node.type === 'comment').remove();
+    }
+    // Remove unwanted elements
+    $('nav, header, footer, aside, .advertisement, .ads, .social-share, .popup, .modal').remove();
+    // Clean attributes
+    if (cleaningOptions.allowedAttributes) {
+      $('*').each((_, element) => {
+        const $element = $(element);
+        const attributes = element.attribs || {};
+        Object.keys(attributes).forEach(attr => {
+          if (!cleaningOptions.allowedAttributes.includes(attr)) {
+            $element.removeAttr(attr);
+          }
+        });
+      });
+    }
+    // Remove empty elements
+    if (cleaningOptions.removeEmpty) {
+      $('*').filter((_, element) => {
+        const $element = $(element);
+        return $element.text().trim() === '' &&
+               $element.find('img, video, audio, iframe').length === 0;
+      }).remove();
+    }
+    return $.html();
+  }
+  /**
+   * Extract text content with preserved formatting
+   * @param {string} html - HTML content
+   * @param {Object} options - Extraction options
+   * @returns {string} - Extracted text
+   */
+  static extractTextWithFormatting(html, options = {}) {
+    const defaultOptions = {
+      preserveLineBreaks: true,
+      preserveParagraphs: true,
+      includeLinks: false,
+      includeImageAlt: true
+    };
+    const extractOptions = { ...defaultOptions, ...options };
+    const $ = cheerio.load(html);
+    // Remove unwanted elements
+    $('script, style, nav, header, footer, aside').remove();
+    let text = '';
+    $('body').find('*').each((_, element) => {
+      const $element = $(element);
+      const tagName = element.tagName.toLowerCase();
+      switch (tagName) {
+        case 'p':
+        case 'div':
+          if (extractOptions.preserveParagraphs) {
+            text += '\n\n' + $element.text().trim();
+          } else {
+            text += ' ' + $element.text().trim();
+          }
+          break;
+        case 'br':
+          if (extractOptions.preserveLineBreaks) {
+            text += '\n';
+          }
+          break;
+        case 'h1':
+        case 'h2':
+        case 'h3':
+        case 'h4':
+        case 'h5':
+        case 'h6':
+          text += '\n\n' + $element.text().trim().toUpperCase() + '\n';
+          break;
+        case 'a':
+          if (extractOptions.includeLinks) {
+            const href = $element.attr('href');
+            const linkText = $element.text().trim();
+            text += ` ${linkText}${href ? ` (${href})` : ''}`;
+          } else {
+            text += ' ' + $element.text().trim();
+          }
+          break;
+        case 'img':
+          if (extractOptions.includeImageAlt) {
+            const alt = $element.attr('alt');
+            if (alt) {
+              text += ` [Image: ${alt}]`;
+            }
+          }
+          break;
+        case 'li':
+          text += '\n• ' + $element.text().trim();
+          break;
+        default:
+          // For other elements, just extract text
+          if ($element.children().length === 0) {
+            text += ' ' + $element.text().trim();
+          }
+      }
+    });
+    return text.replace(/\s+/g, ' ').replace(/\n\s+/g, '\n').trim();
+  }
+}
+/**
+ * Content quality assessment utilities
+ */
+export class ContentQualityAssessor {
+  /**
+   * Assess content quality based on various metrics
+   * @param {string} content - Text content to assess
+   * @param {Object} options - Assessment options
+   * @returns {Object} - Quality assessment results
+   */
+  static assessContentQuality(content, options = {}) {
+    const defaultOptions = {
+      minLength: 100,
+      maxLength: 50000,
+      minWords: 20,
+      assessReadability: true,
+      checkForBoilerplate: true
+    };
+    const assessmentOptions = { ...defaultOptions, ...options };
+    if (!content || typeof content !== 'string') {
+      return {
+        isValid: false,
+        score: 0,
+        reasons: ['Invalid or empty content']
+      };
+    }
+    const assessment = {
+      isValid: true,
+      score: 100,
+      reasons: [],
+      metrics: {}
+    };
+    // Basic metrics
+    const words = content.split(/\s+/).filter(w => w.length > 0);
+    const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 0);
+    const paragraphs = content.split(/\n\s*\n/).filter(p => p.trim().length > 0);
+    assessment.metrics = {
+      length: content.length,
+      words: words.length,
+      sentences: sentences.length,
+      paragraphs: paragraphs.length,
+      avgWordsPerSentence: words.length / Math.max(sentences.length, 1),
+      avgSentencesPerParagraph: sentences.length / Math.max(paragraphs.length, 1)
+    };
+    // Length assessment
+    if (content.length < assessmentOptions.minLength) {
+      assessment.score -= 30;
+      assessment.reasons.push(`Content too short (${content.length} chars)`);
+    }
+    if (content.length > assessmentOptions.maxLength) {
+      assessment.score -= 10;
+      assessment.reasons.push(`Content very long (${content.length} chars)`);
+    }
+    // Word count assessment
+    if (words.length < assessmentOptions.minWords) {
+      assessment.score -= 25;
+      assessment.reasons.push(`Too few words (${words.length})`);
+    }
+    // Sentence structure assessment
+    if (assessment.metrics.avgWordsPerSentence < 5) {
+      assessment.score -= 15;
+      assessment.reasons.push('Very short sentences detected');
+    }
+    if (assessment.metrics.avgWordsPerSentence > 30) {
+      assessment.score -= 10;
+      assessment.reasons.push('Very long sentences detected');
+    }
+    // Boilerplate detection
+    if (assessmentOptions.checkForBoilerplate) {
+      const boilerplateScore = this.detectBoilerplate(content);
+      if (boilerplateScore > 0.3) {
+        assessment.score -= Math.round(boilerplateScore * 50);
+        assessment.reasons.push('Potential boilerplate content detected');
+      }
+      assessment.metrics.boilerplateScore = boilerplateScore;
+    }
+    // Readability assessment
+    if (assessmentOptions.assessReadability) {
+      const readability = this.calculateSimpleReadability(content);
+      assessment.metrics.readability = readability;
+      if (readability.score < 30 || readability.score > 100) {
+        assessment.score -= 10;
+        assessment.reasons.push('Poor readability score');
+      }
+    }
+    // Final validation
+    if (assessment.score < 50) {
+      assessment.isValid = false;
+    }
+    assessment.score = Math.max(0, Math.min(100, assessment.score));
+    return assessment;
+  }
+  /**
+   * Detect boilerplate content patterns
+   * @param {string} content - Content to analyze
+   * @returns {number} - Boilerplate score (0-1)
+   */
+  static detectBoilerplate(content) {
+    const boilerplatePatterns = [
+      /cookie/gi,
+      /privacy policy/gi,
+      /terms of service/gi,
+      /subscribe to/gi,
+      /newsletter/gi,
+      /follow us/gi,
+      /share this/gi,
+      /related articles/gi,
+      /read more/gi,
+      /advertisement/gi,
+      /sponsored/gi,
+      /copyright/gi,
+      /all rights reserved/gi
+    ];
+    let matches = 0;
+    let totalLength = 0;
+    boilerplatePatterns.forEach(pattern => {
+      const patternMatches = content.match(pattern);
+      if (patternMatches) {
+        matches += patternMatches.length;
+        totalLength += patternMatches.join('').length;
+      }
+    });
+    // Calculate score based on frequency and length of matches
+    const frequency = matches / Math.max(content.split(/\s+/).length, 1);
+    const lengthRatio = totalLength / Math.max(content.length, 1);
+    return Math.min(1, frequency * 10 + lengthRatio * 5);
+  }
+  /**
+   * Calculate simple readability metrics
+   * @param {string} text - Text to analyze
+   * @returns {Object} - Readability metrics
+   */
+  static calculateSimpleReadability(text) {
+    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
+    const words = text.split(/\s+/).filter(w => w.length > 0);
+    const syllables = words.reduce((count, word) => count + this.countSyllables(word), 0);
+    if (sentences.length === 0 || words.length === 0) {
+      return { score: 0, level: 'Unknown' };
+    }
+    const avgWordsPerSentence = words.length / sentences.length;
+    const avgSyllablesPerWord = syllables / words.length;
+    // Flesch Reading Ease Score
+    const score = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
+    return {
+      score: Math.round(score * 100) / 100,
+      level: this.getReadabilityLevel(score),
+      avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
+      avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100
+    };
+  }
+  /**
+   * Count syllables in a word (simple approximation)
+   * @param {string} word - Word to count syllables for
+   * @returns {number} - Syllable count
+   */
+  static countSyllables(word) {
+    if (!word || word.length <= 3) return 1;
+    const vowels = 'aeiouy';
+    let count = 0;
+    let prevIsVowel = false;
+    for (let i = 0; i < word.length; i++) {
+      const isVowel = vowels.includes(word[i].toLowerCase());
+      if (isVowel && !prevIsVowel) {
+        count++;
+      }
+      prevIsVowel = isVowel;
+    }
+    // Adjust for silent 'e'
+    if (word.toLowerCase().endsWith('e')) {
+      count--;
+    }
+    return Math.max(1, count);
+  }
+  /**
+   * Get readability level from score
+   * @param {number} score - Readability score
+   * @returns {string} - Readability level
+   */
+  static getReadabilityLevel(score) {
+    if (score >= 90) return 'Very Easy';
+    if (score >= 80) return 'Easy';
+    if (score >= 70) return 'Fairly Easy';
+    if (score >= 60) return 'Standard';
+    if (score >= 50) return 'Fairly Difficult';
+    if (score >= 30) return 'Difficult';
+    return 'Very Difficult';
+  }
+}
+/**
+ * Structured data parsing utilities
+ */
+export class StructuredDataParser {
+  /**
+   * Parse and validate JSON-LD data
+   * @param {Array} jsonLdArray - Array of JSON-LD objects
+   * @returns {Array} - Validated and parsed JSON-LD data
+   */
+  static parseJsonLD(jsonLdArray) {
+    if (!Array.isArray(jsonLdArray)) {
+      return [];
+    }
+    return jsonLdArray.map(item => {
+      try {
+        // If item is a string, parse it
+        const parsed = typeof item === 'string' ? JSON.parse(item) : item;
+        // Validate basic JSON-LD structure
+        if (parsed && typeof parsed === 'object') {
+          return {
+            type: parsed['@type'] || 'Unknown',
+            context: parsed['@context'] || null,
+            data: parsed,
+            isValid: true
+          };
+        }
+        return null;
+      } catch (error) {
+        return {
+          type: 'Invalid',
+          context: null,
+          data: item,
+          isValid: false,
+          error: error.message
+        };
+      }
+    }).filter(item => item !== null);
+  }
+  /**
+   * Extract common schema.org types from structured data
+   * @param {Object} structuredData - Structured data object
+   * @returns {Object} - Extracted common types
+   */
+  static extractCommonSchemaTypes(structuredData) {
+    const commonTypes = {
+      article: null,
+      organization: null,
+      person: null,
+      product: null,
+      event: null,
+      place: null,
+      website: null
+    };
+    // Process JSON-LD data
+    if (structuredData.jsonLd && Array.isArray(structuredData.jsonLd)) {
+      structuredData.jsonLd.forEach(item => {
+        if (!item || typeof item !== 'object') return;
+        const type = (item['@type'] || '').toLowerCase();
+        if (type.includes('article') || type.includes('blogposting') || type.includes('newsarticle')) {
+          commonTypes.article = this.extractArticleData(item);
+        } else if (type.includes('organization')) {
+          commonTypes.organization = this.extractOrganizationData(item);
+        } else if (type.includes('person')) {
+          commonTypes.person = this.extractPersonData(item);
+        } else if (type.includes('product')) {
+          commonTypes.product = this.extractProductData(item);
+        } else if (type.includes('event')) {
+          commonTypes.event = this.extractEventData(item);
+        } else if (type.includes('place')) {
+          commonTypes.place = this.extractPlaceData(item);
+        } else if (type.includes('website')) {
+          commonTypes.website = this.extractWebsiteData(item);
+        }
+      });
+    }
+    return commonTypes;
+  }
+  /**
+   * Extract article data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted article data
+   */
+  static extractArticleData(data) {
+    return {
+      headline: data.headline || data.name || null,
+      author: data.author ? (typeof data.author === 'string' ? data.author : data.author.name) : null,
+      datePublished: data.datePublished || null,
+      dateModified: data.dateModified || null,
+      description: data.description || null,
+      image: data.image || null,
+      publisher: data.publisher ? (typeof data.publisher === 'string' ? data.publisher : data.publisher.name) : null,
+      wordCount: data.wordCount || null,
+      articleSection: data.articleSection || null
+    };
+  }
+  /**
+   * Extract organization data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted organization data
+   */
+  static extractOrganizationData(data) {
+    return {
+      name: data.name || null,
+      url: data.url || null,
+      logo: data.logo || null,
+      description: data.description || null,
+      address: data.address || null,
+      telephone: data.telephone || null,
+      email: data.email || null,
+      foundingDate: data.foundingDate || null
+    };
+  }
+  /**
+   * Extract person data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted person data
+   */
+  static extractPersonData(data) {
+    return {
+      name: data.name || null,
+      givenName: data.givenName || null,
+      familyName: data.familyName || null,
+      jobTitle: data.jobTitle || null,
+      worksFor: data.worksFor ? (typeof data.worksFor === 'string' ? data.worksFor : data.worksFor.name) : null,
+      url: data.url || null,
+      image: data.image || null,
+      description: data.description || null
+    };
+  }
+  /**
+   * Extract product data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted product data
+   */
+  static extractProductData(data) {
+    return {
+      name: data.name || null,
+      description: data.description || null,
+      image: data.image || null,
+      brand: data.brand ? (typeof data.brand === 'string' ? data.brand : data.brand.name) : null,
+      price: data.offers ? data.offers.price : null,
+      currency: data.offers ? data.offers.priceCurrency : null,
+      availability: data.offers ? data.offers.availability : null,
+      sku: data.sku || null,
+      gtin: data.gtin || data.gtin13 || data.gtin12 || data.gtin8 || null
+    };
+  }
+  /**
+   * Extract event data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted event data
+   */
+  static extractEventData(data) {
+    return {
+      name: data.name || null,
+      description: data.description || null,
+      startDate: data.startDate || null,
+      endDate: data.endDate || null,
+      location: data.location ? (typeof data.location === 'string' ? data.location : data.location.name) : null,
+      organizer: data.organizer ? (typeof data.organizer === 'string' ? data.organizer : data.organizer.name) : null,
+      price: data.offers ? data.offers.price : null,
+      url: data.url || null
+    };
+  }
+  /**
+   * Extract place data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted place data
+   */
+  static extractPlaceData(data) {
+    return {
+      name: data.name || null,
+      address: data.address || null,
+      telephone: data.telephone || null,
+      url: data.url || null,
+      description: data.description || null,
+      geo: data.geo || null,
+      openingHours: data.openingHours || null
+    };
+  }
+  /**
+   * Extract website data from structured data
+   * @param {Object} data - Structured data item
+   * @returns {Object} - Extracted website data
+   */
+  static extractWebsiteData(data) {
+    return {
+      name: data.name || null,
+      url: data.url || null,
+      description: data.description || null,
+      publisher: data.publisher ? (typeof data.publisher === 'string' ? data.publisher : data.publisher.name) : null,
+      inLanguage: data.inLanguage || null,
+      potentialAction: data.potentialAction || null
+    };
+  }
+}
+export default {
+  HTMLCleaner,
+  ContentQualityAssessor,
+  StructuredDataParser
+};