npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.5.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.5.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (40)

package/package.json +2 -1
package/server.js +138 -20
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +61 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +24 -51
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/src/tools/llmstxt/generateLLMsTxt.js CHANGED Viewed

@@ -13,7 +13,8 @@ const GenerateLLMsTxtSchema = z.object({
     maxPages: z.number().min(10).max(500).optional().default(100).describe('Maximum pages to analyze'),
     detectAPIs: z.boolean().optional().default(true).describe('Whether to detect API endpoints'),
     analyzeContent: z.boolean().optional().default(true).describe('Whether to analyze content types'),
-    checkSecurity: z.boolean().optional().default(true).describe('Whether to check security boundaries'),
+    checkSecurity: z.boolean().optional().default(false).describe('Whether to probe security-sensitive paths (opt-in; sends requests to /admin, /login, etc.)'),
+    probeRateLimit: z.boolean().optional().default(false).describe('Whether to send repeated probe requests to estimate rate limits (opt-in; fires ~5 requests)'),
     respectRobots: z.boolean().optional().default(true).describe('Whether to respect robots.txt')
   }).optional().default({}),
@@ -23,7 +24,8 @@ const GenerateLLMsTxtSchema = z.object({
     contactEmail: z.string().email().optional().describe('Contact email for the LLMs.txt'),
     organizationName: z.string().optional().describe('Organization name'),
     customGuidelines: z.array(z.string()).optional().describe('Additional custom guidelines'),
-    customRestrictions: z.array(z.string()).optional().describe('Additional restrictions')
+    customRestrictions: z.array(z.string()).optional().describe('Additional restrictions'),
+    robotsStyle: z.boolean().optional().default(false).describe('Emit legacy robots.txt-style directives instead of spec-compliant llmstxt.org markdown')
   }).optional().default({}),
   complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard').describe('Compliance level for generated guidelines'),
@@ -120,9 +122,128 @@ export class GenerateLLMsTxtTool {
   }
   /**
-   * Generate standard LLMs.txt content
+   * Generate LLMs.txt content. By default emits spec-compliant llmstxt.org markdown;
+   * set outputOptions.robotsStyle=true for the legacy robots.txt-style directives.
    */
   generateLLMsTxt(analysis, outputOptions, complianceLevel) {
+    if (outputOptions.robotsStyle) {
+      return this.generateRobotsStyleTxt(analysis, outputOptions, complianceLevel);
+    }
+    return this.generateSpecLLMsTxt(analysis, outputOptions);
+  }
+  /**
+   * Generate spec-compliant llms.txt per https://llmstxt.org/ :
+   *   # Title
+   *   > one-line summary (blockquote)
+   *   optional detail paragraph(s)
+   *   ## Section
+   *   - [name](url): optional notes
+   */
+  generateSpecLLMsTxt(analysis, outputOptions) {
+    const lines = [];
+    const baseUrl = analysis.metadata.baseUrl;
+    let host = baseUrl;
+    try { host = new URL(baseUrl).hostname; } catch { /* keep baseUrl */ }
+    // H1 title (required)
+    const title = outputOptions.organizationName || host;
+    lines.push(`# ${title}`);
+    lines.push('');
+    // Blockquote summary (required by spec)
+    const summary = `Site map and key resources for ${baseUrl}, generated to help LLMs locate relevant content.`;
+    lines.push(`> ${summary}`);
+    lines.push('');
+    // Optional detail paragraph(s)
+    const details = [];
+    if (analysis.structure?.totalPages) {
+      details.push(`This site has approximately ${analysis.structure.totalPages} discoverable pages.`);
+    }
+    if (outputOptions.contactEmail) {
+      details.push(`Contact: ${outputOptions.contactEmail}.`);
+    }
+    if (Array.isArray(outputOptions.customGuidelines) && outputOptions.customGuidelines.length > 0) {
+      details.push(...outputOptions.customGuidelines);
+    }
+    if (details.length > 0) {
+      lines.push(details.join(' '));
+      lines.push('');
+    }
+    // Helper: emit a "## Section" with a list of [name](url) links.
+    const linkLabel = (u) => {
+      try {
+        const p = new URL(u).pathname.replace(/\/+$/, '');
+        if (!p || p === '') return 'Home';
+        const seg = p.split('/').filter(Boolean).pop() || p;
+        return seg.replace(/[-_]/g, ' ').replace(/\.[a-z0-9]+$/i, '').trim() || p;
+      } catch {
+        return u;
+      }
+    };
+    const emitSection = (heading, urls) => {
+      // Coerce to an array: sitemap/sections may arrive as a flat array, a
+      // grouped object ({path: [...]}), or a single value.
+      let arr = [];
+      if (Array.isArray(urls)) {
+        arr = urls;
+      } else if (urls && typeof urls === 'object') {
+        arr = Object.values(urls).flat();
+      }
+      const list = arr
+        .map((u) => (typeof u === 'string' ? u : (u?.url || u?.loc)))
+        .filter(Boolean)
+        .slice(0, 25);
+      if (list.length === 0) return;
+      lines.push(`## ${heading}`);
+      lines.push('');
+      for (const u of list) {
+        lines.push(`- [${linkLabel(u)}](${u})`);
+      }
+      lines.push('');
+    };
+    // Sections derived from the categorized site structure.
+    const sections = analysis.structure?.sections || {};
+    const flatten = (cat) => {
+      const v = sections[cat];
+      if (!Array.isArray(v)) return [];
+      // categorizeSections may produce either flat URLs or {path, urls:[...]} groups.
+      return v.flatMap((entry) =>
+        typeof entry === 'string' ? [entry] : (Array.isArray(entry?.urls) ? entry.urls : []));
+    };
+    emitSection('Documentation', flatten('documentation'));
+    emitSection('Content', flatten('content'));
+    emitSection('Tools', flatten('tools'));
+    emitSection('Navigation', flatten('navigation'));
+    // APIs as their own section.
+    if (Array.isArray(analysis.apis) && analysis.apis.length > 0) {
+      lines.push('## APIs');
+      lines.push('');
+      for (const api of analysis.apis.slice(0, 25)) {
+        const note = api.type ? `: ${api.type}` : '';
+        lines.push(`- [${linkLabel(api.url)}](${api.url})${note}`);
+      }
+      lines.push('');
+    }
+    // Fallback: if no categorized sections produced output, list the raw sitemap.
+    const hasBody = lines.some((l) => l.startsWith('## '));
+    if (!hasBody) {
+      emitSection('Pages', analysis.structure?.sitemap || []);
+    }
+    return lines.join('\n').replace(/\n{3,}/g, '\n\n').trimEnd() + '\n';
+  }
+  /**
+   * Generate legacy robots.txt-style content (opt-in via outputOptions.robotsStyle).
+   */
+  generateRobotsStyleTxt(analysis, outputOptions, complianceLevel) {
     const lines = [];
     const baseUrl = analysis.metadata.baseUrl;

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -379,18 +379,57 @@ export class DeepResearchTool {
    * Format research results according to output preferences
    */
   formatResults(results, params) {
-    // Raw evidence mode (no LLM configured): pass through the clean shape
-    // designed for the calling LLM to synthesize.
+    // Raw evidence mode (no LLM configured): apply lightweight formatting so
+    // outputFormat is not silently ignored, and rank sources by credibility.
     if (results.synthesisMode === 'raw_evidence') {
-      return {
+      const rankedSources = (results.sources || [])
+        .slice()
+        .sort((a, b) => (b.credibility || 0) - (a.credibility || 0));
+      const base = {
         synthesisMode: 'raw_evidence',
         note: results.note,
-        sources: results.sources,
         researchSummary: results.researchSummary,
         metadata: results.metadata,
         performance: results.performance,
         activityLog: params.includeActivityLog ? results.activityLog : undefined
       };
+      switch (params.outputFormat) {
+        case 'summary':
+          return {
+            ...base,
+            sources: rankedSources.slice(0, 5)
+          };
+        case 'citations_only':
+          return {
+            ...base,
+            sources: rankedSources.map(s => ({
+              title: s.title,
+              url: s.url,
+              credibility: s.credibility
+            })),
+            citationCount: rankedSources.length,
+            citationSummary: this.generateCitationSummary(rankedSources)
+          };
+        case 'conflicts_focus':
+          // Without LLM there is no conflict detection; return ranked sources
+          // with a note so the caller knows what happened.
+          return {
+            ...base,
+            sources: rankedSources,
+            conflictsNote: 'Conflict detection requires an LLM (OPENAI_API_KEY or ANTHROPIC_API_KEY). Sources are ranked by credibility for manual review.'
+          };
+        case 'comprehensive':
+        default:
+          return {
+            ...base,
+            sources: rankedSources
+          };
+      }
     }
     const formatted = {

package/src/tools/search/providers/searxng.js CHANGED Viewed

@@ -117,8 +117,8 @@ export async function searchViaSearxng(opts = {}) {
   return {
     items,
     searchInformation: {
-      totalResults: String(rawResults.length),
-      searchTime: data.answers ? 0 : 0
+      totalResults: rawResults.length,
+      searchTime: 0
     },
     queries: {},
     context: {}

package/src/tools/search/ranking/ResultDeduplicator.js CHANGED Viewed

@@ -545,27 +545,50 @@ export class ResultDeduplicator {
   }
   /**
-   * SimHash implementation for content similarity
+   * SimHash implementation for content similarity.
+   * Uses two independent 32-bit FNV-1a hashes (seeded differently) to produce
+   * independent high/low 32-bit words, giving a true 64-bit fingerprint so that
+   * bits 32-63 are not a duplicate of bits 0-31.
    */
   simHash(text, bits = 64) {
     const tokens = text.split(/\s+/);
     const hashBits = new Array(bits).fill(0);
     for (const token of tokens) {
-      const hash = this.stringHash(token);
-      for (let i = 0; i < bits; i++) {
-        const bit = (hash >> i) & 1;
-        hashBits[i] += bit ? 1 : -1;
+      const lo = this._fnv1a32(token, 0x811c9dc5);
+      const hi = this._fnv1a32(token, 0x84222325); // different seed
+      for (let i = 0; i < 32; i++) {
+        hashBits[i]      += ((lo >>> i) & 1) ? 1 : -1;
+      }
+      for (let i = 0; i < 32; i++) {
+        hashBits[32 + i] += ((hi >>> i) & 1) ? 1 : -1;
       }
     }
     // Convert to binary string
     return hashBits.map(bit => bit > 0 ? '1' : '0').join('');
   }
   /**
-   * String hash function
+   * FNV-1a 32-bit hash with a configurable offset basis (seed).
+   * @param {string} str
+   * @param {number} seed - 32-bit unsigned offset basis
+   * @returns {number} 32-bit unsigned integer
+   */
+  _fnv1a32(str, seed) {
+    const FNV_PRIME = 0x01000193;
+    let hash = seed >>> 0;
+    for (let i = 0; i < str.length; i++) {
+      hash ^= str.charCodeAt(i);
+      // Multiply by FNV prime using 32-bit arithmetic
+      hash = Math.imul(hash, FNV_PRIME) >>> 0;
+    }
+    return hash;
+  }
+  /**
+   * String hash function (kept for hashResults / cache-key use)
    */
   stringHash(str) {
     let hash = 0;

package/src/tools/search/ranking/ResultRanker.js CHANGED Viewed

@@ -191,18 +191,27 @@ export class ResultRanker {
     // Calculate term frequencies
     const termFreqs = this.getTermFrequencies(contentTerms);
+    // Build per-term document frequency across all results
+    const docFreqs = {};
+    for (const r of allResults) {
+      const rContent = [r.title || '', r.snippet || '', r.htmlSnippet || ''].join(' ');
+      const rTerms = new Set(this.tokenize(rContent.toLowerCase()));
+      for (const t of rTerms) {
+        docFreqs[t] = (docFreqs[t] || 0) + 1;
+      }
+    }
     let score = 0;
     for (const term of queryTerms) {
       const tf = termFreqs[term] || 0;
       if (tf > 0) {
-        // Document frequency (simplified - assume term appears in some docs)
-        const df = Math.min(allResults.length * 0.1, 1); // Conservative estimate
+        const df = docFreqs[term] || 1;
         const idf = Math.log((allResults.length - df + 0.5) / (df + 0.5));
         // BM25 formula
         const numerator = tf * (k1 + 1);
         const denominator = tf + k1 * (1 - b + b * (contentLength / avgDocLength));
         score += idf * (numerator / denominator);
       }
     }

package/src/tools/search/searchWeb.js CHANGED Viewed

@@ -280,14 +280,14 @@ export class SearchWebTool {
       // Clean up results based on detail level requested
       if (!validated.include_ranking_details) {
         processedResults = processedResults.map(result => {
-          const { rankingDetails, ...cleanResult } = result;
+          const { rankingDetails, finalScore, originalIndex, scores, ...cleanResult } = result;
           return cleanResult;
         });
       }
       if (!validated.include_deduplication_details) {
         processedResults = processedResults.map(result => {
-          const { deduplicationInfo, ...cleanResult } = result;
+          const { deduplicationInfo, contentHash, normalizedUrl, titleTokens, ...cleanResult } = result;
           return cleanResult;
         });
       }
@@ -407,10 +407,10 @@ export class SearchWebTool {
     }
     if (!validated.include_ranking_details) {
-      processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
+      processedResults = processedResults.map(({ rankingDetails, finalScore, originalIndex, scores, ...r }) => r);
     }
     if (!validated.include_deduplication_details) {
-      processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
+      processedResults = processedResults.map(({ deduplicationInfo: _d, contentHash, normalizedUrl, titleTokens, ...r }) => r);
     }
     return {

package/src/tools/templates/TemplateRegistry.js CHANGED Viewed

@@ -162,8 +162,9 @@ const TEMPLATES = [
       const stories = [];
       $('tr.athing').each((_, el) => {
         const $row = $(el);
-        const $score = $row.next('.spacer').find('.score');
-        const $subtext = $row.next('.spacer').find('.subtext');
+        // The metadata row (".subtext") is the sibling row immediately after tr.athing.
+        const $subtext = $row.next('tr').find('.subtext');
+        const $score = $subtext.find('.score');
         const $titleLink = $row.find('.titleline > a');
         stories.push({
           id: $row.attr('id'),

package/src/tools/tracking/trackChanges/differ.js CHANGED Viewed

@@ -3,6 +3,38 @@
  * URL content fetching and history/stat helper functions.
  */
+/**
+ * Default Jaccard similarity threshold below which a change is considered
+ * meaningful (i.e. worth flagging). 0.85 means content must be at least 85 %
+ * similar to be treated as "no significant change".
+ */
+export const DEFAULT_CHANGE_THRESHOLD = 0.85;
+/**
+ * Compute token-based Jaccard similarity between two text strings.
+ * Tokenises on whitespace; returns a value in [0, 1] where 1 is identical.
+ *
+ * @param {string} text1
+ * @param {string} text2
+ * @returns {number}
+ */
+export function calculateSimilarity(text1, text2) {
+  if (!text1 && !text2) return 1;
+  if (!text1 || !text2) return 0;
+  const tokenise = (str) => new Set(str.toLowerCase().split(/\s+/).filter(Boolean));
+  const setA = tokenise(text1);
+  const setB = tokenise(text2);
+  let intersection = 0;
+  for (const token of setA) {
+    if (setB.has(token)) intersection++;
+  }
+  const union = setA.size + setB.size - intersection;
+  return union === 0 ? 1 : intersection / union;
+}
 /**
  * Fetch the HTML/text content of a URL with change-tracking headers.
  * @param {string} url
@@ -18,7 +50,7 @@ export async function fetchContent(url) {
         'Accept-Encoding': 'gzip, deflate',
         'Cache-Control': 'no-cache'
       },
-      timeout: 30000
+      signal: AbortSignal.timeout(30000)
     });
     if (!response.ok) {

package/src/utils/htmlToMarkdown.js CHANGED Viewed

@@ -11,10 +11,11 @@
  *     headingStyle: 'atx'       -> # H1 / ## H2 instead of underline style
  *     codeBlockStyle: 'fenced'  -> triple-backtick fences
  *     bulletListMarker: '-'
- * - Tables fall back to prose (no GFM plugin loaded by default).
+ * - GFM plugin enabled for table support (turndown-plugin-gfm).
  */
 import TurndownService from 'turndown';
+import { gfm } from 'turndown-plugin-gfm';
 let _td = null;
@@ -30,6 +31,9 @@ function getTurndown() {
       linkStyle: 'inlined'
     });
+    // Enable GFM extensions (tables, strikethrough, task lists)
+    _td.use(gfm);
     // Remove boilerplate elements before converting
     _td.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
   }