npm - crawlforge-mcp-server - Versions diffs - 3.0.12 → 3.0.13 - Mend

crawlforge-mcp-server 3.0.12 → 3.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/CLAUDE.md +103 -324
package/package.json +2 -1
package/server.js +332 -169
package/src/core/AuthManager.js +5 -2
package/src/core/ChangeTracker.js +1 -1
package/src/core/ResearchOrchestrator.js +43 -5
package/src/core/analysis/ContentAnalyzer.js +70 -17
package/src/core/analysis/sentenceUtils.js +73 -0
package/src/core/creatorMode.js +47 -0
package/src/core/llm/LLMManager.js +120 -0
package/src/core/processing/BrowserProcessor.js +1 -1
package/src/tools/extract/extractStructured.js +280 -0
package/src/tools/extract/summarizeContent.js +3 -2
package/src/tools/search/ranking/ResultDeduplicator.js +21 -21
package/src/tools/search/searchWeb.js +1 -1

package/src/core/AuthManager.js CHANGED Viewed

@@ -6,7 +6,7 @@
 // Using native fetch (Node.js 18+)
 import fs from 'fs/promises';
 import path from 'path';
-import { isCreatorModeVerified } from '../../server.js';
+import { isCreatorModeVerified } from './creatorMode.js';
 class AuthManager {
   constructor() {
@@ -284,7 +284,10 @@ class AuthManager {
       scrape_with_actions: 5,
       generate_llms_txt: 3,
       localization: 5,
-      track_changes: 3
+      track_changes: 3,
+      // Phase 1: LLM-Powered Structured Extraction
+      extract_structured: 4
     };
     return costs[tool] || 1;

package/src/core/ChangeTracker.js CHANGED Viewed

@@ -1113,7 +1113,7 @@ export class ChangeTracker extends EventEmitter {
   /**
    * Detect changes against the latest snapshot
    */
-  async detectChanges(url, currentContent) {
+  async detectChangesFromSnapshot(url, currentContent) {
     // Validate URL format
     try {
       new URL(url);

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -462,11 +462,49 @@ export class ResearchOrchestrator extends EventEmitter {
             this.researchState.visitedUrls.add(source.link);
             this.metrics.urlsProcessed++;
-            // Extract detailed content
-            const contentData = await this.extractTool.execute({
-              url: source.link,
-              options: { includeMetadata: true, includeStructuredData: true }
-            });
+            // Extract detailed content (with fallback to fetch_url + text extraction)
+            let contentData;
+            try {
+              contentData = await this.extractTool.execute({
+                url: source.link,
+                options: { includeMetadata: true, includeStructuredData: true }
+              });
+            } catch (extractError) {
+              this.logger.warn('Primary extraction failed, trying fallback', {
+                url: source.link,
+                error: extractError.message
+              });
+              // Fallback: use fetch + basic text extraction
+              try {
+                const fetchResponse = await fetch(source.link, {
+                  headers: { 'User-Agent': 'CrawlForge-Research/1.0' },
+                  signal: AbortSignal.timeout(10000)
+                });
+                if (fetchResponse.ok) {
+                  const html = await fetchResponse.text();
+                  // Strip HTML tags for basic text content
+                  const textContent = html
+                    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
+                    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
+                    .replace(/<[^>]+>/g, ' ')
+                    .replace(/\s+/g, ' ')
+                    .trim();
+                  if (textContent.length > 50) {
+                    contentData = {
+                      content: textContent.slice(0, 5000),
+                      metadata: { title: source.title || '' },
+                      structuredData: {},
+                      fallback: true
+                    };
+                  }
+                }
+              } catch (fallbackError) {
+                this.logger.warn('Fallback extraction also failed', {
+                  url: source.link,
+                  error: fallbackError.message
+                });
+              }
+            }
             if (contentData && contentData.content) {
               this.metrics.contentExtracted++;

package/src/core/analysis/ContentAnalyzer.js CHANGED Viewed

@@ -7,6 +7,7 @@ import { SummarizerManager } from 'node-summarizer';
 import { franc } from 'franc';
 import nlp from 'compromise';
 import { z } from 'zod';
+import { splitSentences } from './sentenceUtils.js';
 const ContentAnalyzerSchema = z.object({
   text: z.string().min(1),
@@ -290,11 +291,29 @@ export class ContentAnalyzer {
       });
       if (detected === 'und') {
-        return null; // Undetermined language
+        // Fallback: check if text is predominantly ASCII Latin characters (likely English)
+        const latinChars = (text.match(/[a-zA-Z]/g) || []).length;
+        const totalChars = text.replace(/\s/g, '').length;
+        if (totalChars > 0 && latinChars / totalChars > 0.7) {
+          // Check for common English words as a heuristic
+          const lower = text.toLowerCase();
+          const englishMarkers = ['the ', 'is ', 'are ', 'was ', 'and ', 'for ', 'that ', 'with ', 'this ', 'from '];
+          const matchCount = englishMarkers.filter(w => lower.includes(w)).length;
+          if (matchCount >= 2) {
+            return {
+              code: 'eng',
+              name: 'English',
+              confidence: 0.6,
+              alternative: [],
+              detectionMethod: 'heuristic'
+            };
+          }
+        }
+        return null; // Truly undetermined language
       }
-      // Get confidence score (simplified approach)
-      const confidence = Math.min(1, text.length / 100 * 0.01 + 0.5);
+      // Get confidence score based on text length and detection certainty
+      const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
       // Get alternative languages using franc.all
       const alternatives = franc.all(text, {
@@ -329,7 +348,7 @@ export class ContentAnalyzer {
    */
   async summarizeText(text, options = {}) {
     try {
-      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
+      const sentences = splitSentences(text);
       if (sentences.length < 3) {
         return {
@@ -364,7 +383,7 @@ export class ContentAnalyzer {
       if (options.summaryType === 'extractive') {
         // Use node-summarizer for extractive summarization
         const summary = await this.summarizer.getSummaryByRanking(text, targetSentences);
-        summarySentences = summary.split(/[.!?]+/).filter(s => s.trim().length > 0);
+        summarySentences = splitSentences(summary);
       } else {
         // Simple abstractive approach (for demonstration)
         summarySentences = await this.createAbstractiveSummary(text, targetSentences);
@@ -385,7 +404,7 @@ export class ContentAnalyzer {
       console.warn('Text summarization failed:', error.message);
       // Fallback: return first few sentences
-      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
+      const sentences = splitSentences(text);
       const fallbackSentences = sentences.slice(0, 2);
       return {
@@ -479,13 +498,45 @@ export class ContentAnalyzer {
     try {
       const doc = nlp(text);
+      const people = doc.people().out('array');
+      const places = doc.places().out('array');
+      const organizations = doc.organizations().out('array');
+      const dates = doc.dates().out('array');
+      const money = doc.money().out('array');
+      let other = doc.topics().out('array').slice(0, 10);
+      // Supplement with capitalized proper nouns that compromise may miss
+      // (technology names, product names, etc.)
+      const existingEntities = new Set([
+        ...people, ...places, ...organizations, ...other
+      ].map(e => e.toLowerCase()));
+      const properNouns = text.match(/\b[A-Z][a-zA-Z.]+(?:\s+[A-Z][a-zA-Z.]+)*/g) || [];
+      const supplemental = [...new Set(properNouns)]
+        .filter(n => !existingEntities.has(n.toLowerCase()) && n.length > 1)
+        .slice(0, 10);
+      if (supplemental.length > 0) {
+        other = [...other, ...supplemental].slice(0, 15);
+      }
+      const allEntities = [...people, ...places, ...organizations, ...dates, ...money, ...other];
+      const uniqueEntities = new Set(allEntities.map(e => e.toLowerCase()));
       return {
-        people: doc.people().out('array'),
-        places: doc.places().out('array'),
-        organizations: doc.organizations().out('array'),
-        dates: doc.dates().out('array'),
-        money: doc.money().out('array'),
-        other: doc.topics().out('array').slice(0, 10) // Limit other entities
+        people,
+        places,
+        organizations,
+        dates,
+        money,
+        other,
+        summary: {
+          totalEntities: allEntities.length,
+          uniqueEntities: uniqueEntities.size,
+          entityDensity: text.split(/\s+/).length > 0
+            ? uniqueEntities.size / text.split(/\s+/).length
+            : 0
+        }
       };
     } catch (error) {
@@ -496,7 +547,8 @@ export class ContentAnalyzer {
         organizations: [],
         dates: [],
         money: [],
-        other: []
+        other: [],
+        summary: { totalEntities: 0, uniqueEntities: 0, entityDensity: 0 }
       };
     }
   }
@@ -521,10 +573,11 @@ export class ContentAnalyzer {
       const termTypes = {};
       [...nouns, ...verbs, ...adjectives].forEach(term => {
-        const cleaned = term.toLowerCase().trim();
+        // Strip leading/trailing punctuation but preserve internal periods (e.g. Node.js)
+        const cleaned = term.toLowerCase().trim().replace(/^[^a-z0-9]+|[^a-z0-9.]+$/gi, '').replace(/\.+$/, '');
         if (cleaned.length > 2 && !this.isStopWord(cleaned)) {
           termFreq[cleaned] = (termFreq[cleaned] || 0) + 1;
           if (!termTypes[cleaned]) {
             if (nouns.includes(term)) termTypes[cleaned] = 'noun';
             else if (verbs.includes(term)) termTypes[cleaned] = 'verb';
@@ -561,7 +614,7 @@ export class ContentAnalyzer {
    */
   async calculateReadability(text) {
     try {
-      const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
+      const sentences = splitSentences(text);
       const words = text.split(/\s+/).filter(w => w.length > 0);
       const characters = text.length;
       const charactersNoSpaces = text.replace(/\s/g, '').length;
@@ -669,7 +722,7 @@ export class ContentAnalyzer {
     const characters = text.length;
     const charactersNoSpaces = text.replace(/\s/g, '').length;
     const words = text.split(/\s+/).filter(w => w.length > 0);
-    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
+    const sentences = splitSentences(text);
     const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0);
     // Estimate reading time (average 200 words per minute)

package/src/core/analysis/sentenceUtils.js ADDED Viewed

@@ -0,0 +1,73 @@
+/**
+ * Sentence splitting utility that handles abbreviations, decimal numbers,
+ * domain names, and other common patterns that contain periods.
+ */
+// Common abbreviations that should not trigger sentence splits.
+// Entries are lowercase letters only: the candidate word is lowercased and
+// stripped of every non-letter character before it is looked up here.
+const ABBREVIATIONS = new Set([
+  'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'jr', 'st', 'ave', 'blvd',
+  'vs', 'etc', 'inc', 'ltd', 'corp', 'dept', 'univ', 'assn',
+  'approx', 'appt', 'apt', 'est', 'min', 'max',
+  'govt', 'lib', 'misc', 'natl', 'intl',
+  'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
+  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
+  'fig', 'eq', 'ref', 'vol', 'no', 'pp', 'ed', 'rev',
+  'e', 'i',  // lone "e." / "i." fragments; full "e.g." / "i.e." are caught by the internal-period check
+]);
+// Candidate boundaries: whitespace that follows a terminator (. ! ?)
+const BOUNDARY_SPLIT = /(?<=[.!?])\s+/;
+// Captures the final terminator of a chunk (ignoring trailing whitespace)
+const TRAILING_TERMINATOR = /([.!?])\s*$/;
+/**
+ * Decide whether a period that follows `word` is part of the word
+ * (abbreviation, dotted name, decimal, initial) rather than a sentence end.
+ * @param {string} word - Last whitespace-delimited word, without its final period
+ * @returns {boolean} - True when the period should NOT end the sentence
+ */
+function isNonTerminalPeriod(word) {
+  const lettersOnly = word.toLowerCase().replace(/[^a-z]/g, '');
+  if (ABBREVIATIONS.has(lettersOnly)) return true;
+  // e.g., i.e., U.S., Node.js - words with internal periods
+  if (/\w\.\w/.test(word)) return true;
+  // Numbers like 3.14, v2.0
+  if (/\d\.\d/.test(word)) return true;
+  // Single capital letter (initials like "A. Smith"). The trailing period has
+  // already been removed from `word`, so the pattern must not expect one.
+  return /^[A-Z]$/.test(word);
+}
+/**
+ * Split text into sentences, handling abbreviations and technical terms.
+ * @param {string} text - Text to split
+ * @returns {string[]} - Array of trimmed sentence strings; empty for
+ *   non-string, empty, or whitespace-only input
+ */
+export function splitSentences(text) {
+  if (!text || typeof text !== 'string') return [];
+  const sentences = [];
+  let current = '';
+  for (const token of text.split(BOUNDARY_SPLIT)) {
+    const combined = current ? `${current} ${token}` : token;
+    const terminator = TRAILING_TERMINATOR.exec(combined);
+    if (terminator === null) {
+      // No terminator yet (only possible for the final chunk) — keep accumulating
+      current = combined;
+      continue;
+    }
+    // Only a period can belong to an abbreviation/decimal/initial;
+    // "!" and "?" always end the sentence.
+    if (terminator[1] === '.') {
+      const beforePeriod = combined.slice(0, terminator.index);
+      const lastWord = beforePeriod.split(/\s+/).pop() || '';
+      if (isNonTerminalPeriod(lastWord)) {
+        // Not a real sentence boundary — accumulate
+        current = combined;
+        continue;
+      }
+    }
+    // Real sentence boundary
+    const trimmed = combined.trim();
+    if (trimmed.length > 0) {
+      sentences.push(trimmed);
+    }
+    current = '';
+  }
+  // Don't forget the last chunk
+  if (current.trim().length > 0) {
+    sentences.push(current.trim());
+  }
+  // Whitespace-only input yields no sentences (never an array holding '')
+  return sentences;
+}

package/src/core/creatorMode.js ADDED Viewed

@@ -0,0 +1,47 @@
+/**
+ * Creator Mode Authentication
+ * Extracted from server.js to allow tool classes to be imported independently
+ * without triggering the full MCP server startup sequence.
+ *
+ * All verification happens once, as a side effect of importing this module:
+ * the secret is read from the environment, hashed, compared, and then removed
+ * from process.env. Afterwards only isCreatorModeVerified() exposes the result.
+ *
+ * SECURITY: The creator secret hash is safe to commit — one-way SHA-256.
+ * The actual secret is never stored. Only the package maintainer has it.
+ */
+import crypto from 'crypto';
+import dotenv from 'dotenv';
+// Load .env file early to check for creator secret
+// (path is relative, so it is resolved against the process working directory)
+dotenv.config({ path: '.env', quiet: true });
+// SECURITY: Clear any externally-set creator mode env var to prevent bypass
+// (runs after dotenv.config, so a value coming from .env is removed as well)
+delete process.env.CRAWLFORGE_CREATOR_MODE;
+// Hex-encoded SHA-256 digest that the hashed CRAWLFORGE_CREATOR_SECRET must equal
+const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b';
+// Module-scoped flag — cannot be set externally
+let _creatorModeVerified = false;
+// An unset or empty secret skips verification entirely and leaves the flag false
+if (process.env.CRAWLFORGE_CREATOR_SECRET) {
+  const providedHash = crypto
+    .createHash('sha256')
+    .update(process.env.CRAWLFORGE_CREATOR_SECRET)
+    .digest('hex');
+  // Both operands decode to 32-byte buffers (a SHA-256 digest and the
+  // 64-hex-character constant), which satisfies timingSafeEqual's
+  // equal-length requirement.
+  if (crypto.timingSafeEqual(Buffer.from(providedHash, 'hex'), Buffer.from(CREATOR_SECRET_HASH, 'hex'))) {
+    _creatorModeVerified = true;
+    // NOTE(review): console.log writes to stdout; if the server speaks MCP over
+    // stdio this line could interleave with protocol output — confirm the transport.
+    console.log('Creator Mode Enabled - Unlimited Access');
+  } else {
+    console.warn('Invalid creator secret provided');
+  }
+  // Clean up the secret from environment
+  // (done whether or not the secret matched)
+  delete process.env.CRAWLFORGE_CREATOR_SECRET;
+}
+/**
+ * Returns true only when the package maintainer has provided the correct secret.
+ * This flag is module-scoped and cannot be set via environment variables after
+ * the module has loaded.
+ * @returns {boolean} Result of the one-time check performed at module load
+ */
+export function isCreatorModeVerified() {
+  return _creatorModeVerified;
+}

package/src/core/llm/LLMManager.js CHANGED Viewed

@@ -319,6 +319,126 @@ Synthesize these findings into a comprehensive analysis:`;
     }
   }
+  /**
+   * Extract structured data from content using LLM and a JSON Schema
+   * Follows the same pattern as analyzeRelevance()
+   *
+   * @param {string} content - Text to extract from; truncated to options.maxContentLength
+   * @param {object} schema - JSON Schema describing the object to extract
+   * @param {object} [options]
+   * @param {number} [options.maxContentLength=6000] - Max characters of content sent to the LLM
+   * @param {string} [options.prompt=''] - Optional extra extraction guidance
+   * @param {number} [options.maxTokens=1000] - Minimum completion budget (scaled up with schema size, capped at 2000)
+   * @returns {Promise<{data: *, valid: boolean, validationErrors: string[]}>}
+   *   The parsed LLM output with its validation result, or the result of
+   *   fallbackStructuredExtraction() when the completion or JSON parsing fails
+   */
+  async extractStructured(content, schema, options = {}) {
+    const { maxContentLength = 6000, prompt: userPrompt = '', maxTokens = 1000 } = options;
+    const truncatedContent = content.length > maxContentLength
+      ? content.substring(0, maxContentLength) + '...'
+      : content;
+    // Scale maxTokens with schema complexity
+    const schemaFields = Object.keys(schema.properties || {}).length;
+    const scaledTokens = Math.min(2000, Math.max(maxTokens, schemaFields * 100 + 500));
+    const systemPrompt = `You are a structured data extraction expert. Extract data from the provided content and return ONLY valid JSON that conforms to the given JSON Schema. Do not include any explanation or markdown — only the raw JSON object.`;
+    const schemaStr = JSON.stringify(schema, null, 2);
+    const guidance = userPrompt ? `\n\nExtraction guidance: ${userPrompt}` : '';
+    const extractionPrompt = `JSON Schema to extract:
+${schemaStr}${guidance}
+Content to extract from:
+${truncatedContent}
+Extract the data and return valid JSON:`;
+    try {
+      const response = await this.generateCompletion(extractionPrompt, {
+        systemPrompt,
+        maxTokens: scaledTokens,
+        temperature: 0.1
+      });
+      // Strip markdown code fences if present. Trim FIRST: the fence patterns
+      // are anchored to the very start/end of the string, so surrounding
+      // whitespace (e.g. a trailing newline after the closing fence) would
+      // otherwise leave the fences in place and make JSON.parse fail.
+      const cleaned = response
+        .trim()
+        .replace(/^```(?:json)?\s*/i, '')
+        .replace(/\s*```$/, '')
+        .trim();
+      const parsed = JSON.parse(cleaned);
+      // Lightweight validation
+      const validation = this.validateAgainstSchema(parsed, schema);
+      return {
+        data: parsed,
+        valid: validation.valid,
+        validationErrors: validation.errors
+      };
+    } catch (error) {
+      // Covers completion failures, non-JSON responses, and validation errors thrown above
+      this.logger.warn('LLM structured extraction failed, using fallback', { error: error.message });
+      return this.fallbackStructuredExtraction(content, schema);
+    }
+  }
+  /**
+   * Validate a parsed object against a simple JSON Schema
+   *
+   * Checks only top-level `required`, `properties[*].type` and
+   * `properties[*].enum`; nested schemas are not descended into.
+   *
+   * @param {*} data - Parsed JSON value to check (expected to be an object)
+   * @param {object} schema - JSON Schema with optional `properties` / `required`
+   * @returns {{valid: boolean, errors: string[]}} `valid` is true when no errors were collected
+   */
+  validateAgainstSchema(data, schema) {
+    const errors = [];
+    // Map a JS value to its JSON Schema type name; plain typeof reports both
+    // null and arrays as 'object'.
+    const jsonTypeOf = (value) => {
+      if (value === null) return 'null';
+      if (Array.isArray(value)) return 'array';
+      return typeof value;
+    };
+    // The `in`/property checks below need an object; report instead of throwing
+    if (data === null || typeof data !== 'object') {
+      errors.push(`Expected a JSON object, got ${jsonTypeOf(data)}`);
+      return { valid: false, errors };
+    }
+    const properties = (schema && schema.properties) || {};
+    const required = (schema && schema.required) || [];
+    // Own-property check so inherited names (e.g. "constructor") never count as present
+    const hasField = (name) => Object.prototype.hasOwnProperty.call(data, name);
+    // 'integer' is a JSON Schema type with no typeof equivalent
+    const matchesType = (value, expected) =>
+      expected === 'integer' ? Number.isInteger(value) : jsonTypeOf(value) === expected;
+    for (const field of required) {
+      if (!hasField(field)) {
+        errors.push(`Missing required field: ${field}`);
+      }
+    }
+    for (const [key, fieldSchema] of Object.entries(properties)) {
+      if (!hasField(key) || !fieldSchema || typeof fieldSchema !== 'object') {
+        continue;
+      }
+      const value = data[key];
+      const expectedType = fieldSchema.type;
+      if (expectedType) {
+        // JSON Schema allows `type` to be a single name or a list of names
+        const allowedTypes = Array.isArray(expectedType) ? expectedType : [expectedType];
+        if (!allowedTypes.some((allowed) => matchesType(value, allowed))) {
+          errors.push(`Field "${key}": expected ${allowedTypes.join('|')}, got ${jsonTypeOf(value)}`);
+        }
+      }
+      if (fieldSchema.enum && !fieldSchema.enum.includes(value)) {
+        errors.push(`Field "${key}": value "${value}" not in enum ${JSON.stringify(fieldSchema.enum)}`);
+      }
+    }
+    return { valid: errors.length === 0, errors };
+  }
+  /**
+   * Fallback structured extraction without LLM — keyword/regex matching for primitives
+   *
+   * For each schema property, finds the first content line mentioning the
+   * property name (underscores also match whitespace or hyphens) and takes the
+   * text after the first colon on that line as the value.
+   *
+   * @param {string} content - Raw text to scan line by line
+   * @param {object} schema - JSON Schema whose `properties` name the fields to look for
+   * @returns {{data: object, valid: boolean, validationErrors: string[]}}
+   *   Best-effort fields; always reported as not valid
+   */
+  fallbackStructuredExtraction(content, schema) {
+    const extracted = {};
+    const properties = (schema && schema.properties) || {};
+    // Split once up front instead of once per property
+    const lines = (content == null ? '' : String(content)).split('\n');
+    for (const [key, fieldSchema] of Object.entries(properties)) {
+      // Escape regex metacharacters so keys like "c++" or "price(usd" cannot
+      // throw or change the pattern's meaning, then let "_" match space/_/-
+      const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+      const keyPattern = new RegExp(escapedKey.replace(/_/g, '[\\s_-]'), 'i');
+      const lineMatch = lines.find((line) => keyPattern.test(line));
+      if (!lineMatch) {
+        continue;
+      }
+      const valueMatch = lineMatch.match(/:\s*(.+)$/);
+      const rawValue = valueMatch ? valueMatch[1].trim() : '';
+      if (!rawValue) {
+        continue;
+      }
+      const fieldType = fieldSchema ? fieldSchema.type : undefined;
+      if (fieldType === 'number' || fieldType === 'integer') {
+        // Drop currency symbols, thousands separators, units, etc.
+        const num = Number.parseFloat(rawValue.replace(/[^0-9.-]/g, ''));
+        if (!Number.isNaN(num)) {
+          extracted[key] = fieldType === 'integer' ? Math.trunc(num) : num;
+        }
+      } else if (fieldType === 'boolean') {
+        // Whole-word match so values like "10" or "untrue" are not read as true
+        extracted[key] = /\b(?:true|yes|1)\b/i.test(rawValue);
+      } else {
+        extracted[key] = rawValue;
+      }
+    }
+    return {
+      data: extracted,
+      valid: false,
+      validationErrors: ['Used fallback extraction — no LLM provider available']
+    };
+  }
   /**
    * Fallback query expansion without LLM
    */

package/src/core/processing/BrowserProcessor.js CHANGED Viewed

@@ -668,7 +668,7 @@ export class BrowserProcessor {
       // Check for dynamic content indicators
       const dynamicIndicators = document.querySelectorAll(
-        '[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [*ngFor], [*ngIf]'
+        '[data-bind], [v-if], [v-for], [ng-if], [ng-repeat], [ngFor], [ngIf]'
       );
       analysis.hasDynamicContent = dynamicIndicators.length > 0 || analysis.detectedFrameworks.length > 0;