npm - crawlforge-mcp-server - Versions diffs - 3.0.18 → 3.3.1 - Mend

crawlforge-mcp-server 3.0.18 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/package.json +5 -2
package/server.js +192 -1277
package/src/core/ActionExecutor.js +2 -43
package/src/core/AuthManager.js +127 -14
package/src/core/BrowserContextPool.js +187 -0
package/src/core/JobManager.js +7 -5
package/src/core/LocalizationManager.js +14 -125
package/src/core/StealthBrowserManager.js +26 -18
package/src/core/cache/CacheManager.js +4 -1
package/src/core/crawlers/BFSCrawler.js +19 -5
package/src/observability/metrics.js +137 -0
package/src/observability/tracing.js +74 -0
package/src/server/auth/oauth.js +388 -0
package/src/server/registerTool.js +41 -0
package/src/server/schemas/common.js +29 -0
package/src/server/transports/http.js +22 -0
package/src/server/transports/stdio.js +16 -0
package/src/server/transports/streamableHttp.js +226 -0
package/src/server/withAuth.js +121 -0
package/src/tools/advanced/BatchScrapeTool.js +12 -1086
package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
package/src/tools/advanced/batchScrape/index.js +328 -0
package/src/tools/advanced/batchScrape/queue.js +91 -0
package/src/tools/advanced/batchScrape/reporter.js +26 -0
package/src/tools/advanced/batchScrape/schema.js +37 -0
package/src/tools/advanced/batchScrape/worker.js +179 -0
package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
package/src/tools/basic/_fetch.js +35 -0
package/src/tools/basic/extractLinks.js +74 -0
package/src/tools/basic/extractMetadata.js +74 -0
package/src/tools/basic/extractText.js +46 -0
package/src/tools/basic/fetchUrl.js +44 -0
package/src/tools/basic/scrapeStructured.js +58 -0
package/src/tools/crawl/_sessionContext.js +234 -0
package/src/tools/crawl/crawlDeep.js +55 -5
package/src/tools/crawl/mapSite.js +23 -2
package/src/tools/extract/_fetchAndParse.js +57 -0
package/src/tools/extract/extractStructured.js +3 -19
package/src/tools/extract/extractWithLlm.js +295 -0
package/src/tools/search/providers/searxng.js +126 -0
package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
package/src/tools/search/ranking/ResultRanker.js +17 -10
package/src/tools/search/ranking/SearchResultCache.js +52 -0
package/src/tools/search/searchWeb.js +112 -6
package/src/tools/tracking/trackChanges/differ.js +98 -0
package/src/tools/tracking/trackChanges/index.js +432 -0
package/src/tools/tracking/trackChanges/monitor.js +93 -0
package/src/tools/tracking/trackChanges/notifier.js +105 -0
package/src/tools/tracking/trackChanges/schema.js +127 -0
package/src/tools/tracking/trackChanges.js +12 -1374

package/src/tools/extract/extractWithLlm.js ADDED Viewed

@@ -0,0 +1,295 @@
+/**
+ * Extract With LLM MCP Tool
+ * Natural-language extraction powered by OpenAI or Anthropic.
+ * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
+ *
+ * Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
+ * Gate: execute() returns { success: false, error } with a clear message when neither key is present.
+ */
+import { fetchAndParse } from './_fetchAndParse.js';
+// ── Constants ─────────────────────────────────────────────────────────────────
+// Upper bound, in characters, on the page text embedded in the LLM prompt;
+// buildUserMessage() cuts anything longer and appends a truncation marker.
+const MAX_INPUT_CHARS = 50_000;
+// Model used for the OpenAI provider when the caller does not pass `model`.
+const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
+// Model used for the Anthropic provider when the caller does not pass `model`.
+const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
+// Base URLs can be overridden through the environment so the test suite can
+// point the tool at a stub server instead of the real API.
+/**
+ * Resolve the OpenAI API base URL.
+ * @returns {string} OPENAI_BASE_URL when set and non-empty, otherwise the
+ *   public endpoint; a single trailing slash is removed either way.
+ */
+function openaiBaseUrl() {
+  const configured = process.env.OPENAI_BASE_URL;
+  const base = configured ? configured : 'https://api.openai.com';
+  return base.replace(/\/$/, '');
+}
+/**
+ * Resolve the Anthropic API base URL.
+ * @returns {string} ANTHROPIC_BASE_URL when set and non-empty, otherwise the
+ *   public endpoint; a single trailing slash is removed either way.
+ */
+function anthropicBaseUrl() {
+  let base = process.env.ANTHROPIC_BASE_URL;
+  if (!base) {
+    base = 'https://api.anthropic.com';
+  }
+  return base.replace(/\/$/, '');
+}
+// ── Helpers ───────────────────────────────────────────────────────────────────
+/**
+ * Pick the LLM provider and its API key from the environment.
+ *
+ * With 'auto', Anthropic is chosen when ANTHROPIC_API_KEY is set and OpenAI is
+ * the fallback. An explicitly named provider must have its own key set.
+ *
+ * @param {'openai'|'anthropic'|'auto'} provider
+ * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
+ * @throws {Error} When the required key is missing or the provider is unknown.
+ */
+function resolveProvider(provider) {
+  const keys = {
+    anthropic: process.env.ANTHROPIC_API_KEY,
+    openai: process.env.OPENAI_API_KEY
+  };
+  switch (provider) {
+    case 'auto': {
+      // Order encodes the preference: Anthropic first, then OpenAI.
+      const chosen = ['anthropic', 'openai'].find((name) => keys[name]);
+      if (!chosen) {
+        throw new Error(
+          'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
+        );
+      }
+      return { provider: chosen, apiKey: keys[chosen] };
+    }
+    case 'anthropic':
+      if (!keys.anthropic) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
+      return { provider: 'anthropic', apiKey: keys.anthropic };
+    case 'openai':
+      if (!keys.openai) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
+      return { provider: 'openai', apiKey: keys.openai };
+    default:
+      throw new Error(`extract_with_llm: unknown provider "${provider}"`);
+  }
+}
+/**
+ * Build the user message text that goes to the LLM.
+ *
+ * @param {string} userPrompt - Natural-language extraction instruction.
+ * @param {string|null|undefined} text - Page text. Anything beyond
+ *   MAX_INPUT_CHARS characters is cut and replaced by a truncation marker.
+ *   Nullish input is treated as empty text instead of raising a TypeError.
+ * @param {Object} [schema] - Optional output schema hint; included only when
+ *   it has at least one own enumerable key.
+ * @returns {string} The assembled prompt.
+ */
+function buildUserMessage(userPrompt, text, schema) {
+  // A missing body (e.g. a page that yielded no text) must not crash prompt
+  // assembly; every non-nullish value keeps its previous handling.
+  const source = text ?? '';
+  const truncated = source.length > MAX_INPUT_CHARS
+    ? `${source.slice(0, MAX_INPUT_CHARS)}\n[...truncated]`
+    : source;
+  const parts = [`Extraction instruction: ${userPrompt}\n\n`];
+  if (schema && Object.keys(schema).length > 0) {
+    parts.push(`Output schema hint:\n${JSON.stringify(schema, null, 2)}\n\n`);
+  }
+  parts.push(`Web page content:\n${truncated}\n\nReturn only valid JSON.`);
+  return parts.join('');
+}
+/**
+ * Parse JSON from an LLM response string defensively.
+ * Strips one leading and one trailing markdown code fence if present.
+ *
+ * @param {string} raw - Raw model output.
+ * @returns {*} The parsed JSON value.
+ * @throws {SyntaxError} When the remaining text is not valid JSON.
+ */
+function parseJson(raw) {
+  // Trim BEFORE stripping fences: the opening-fence pattern is anchored at the
+  // start of the string, so a reply that begins with a newline or spaces would
+  // otherwise keep its fence and fail to parse.
+  const stripped = raw
+    .trim()
+    .replace(/^```(?:json)?\s*/i, '')
+    .replace(/\s*```\s*$/, '')
+    .trim();
+  return JSON.parse(stripped);
+}
+// ── OpenAI call ───────────────────────────────────────────────────────────────
+/**
+ * Send one chat-completions request to OpenAI and return the reply text.
+ *
+ * The request asks for `json_object` output and is aborted after 120 seconds.
+ *
+ * @param {Object} args
+ * @param {string} args.apiKey        - Sent as a Bearer token.
+ * @param {string} args.model         - Model identifier to request.
+ * @param {string} args.systemMessage - Content of the system message.
+ * @param {string} args.userMessage   - Content of the user message.
+ * @param {number} args.maxTokens     - Forwarded as `max_tokens`.
+ * @returns {Promise<{rawText: string, usage: {input_tokens: number, output_tokens: number}, model: string}>}
+ * @throws {Error} On a non-2xx response (status plus the first 200 characters
+ *   of the response body).
+ */
+async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens }) {
+  const endpoint = `${openaiBaseUrl()}/v1/chat/completions`;
+  const payload = JSON.stringify({
+    model,
+    messages: [
+      { role: 'system', content: systemMessage },
+      { role: 'user', content: userMessage }
+    ],
+    max_tokens: maxTokens,
+    response_format: { type: 'json_object' }
+  });
+  const response = await fetch(endpoint, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${apiKey}`
+    },
+    body: payload,
+    signal: AbortSignal.timeout(120_000)
+  });
+  if (!response.ok) {
+    // The body is best-effort diagnostics only; a read failure yields ''.
+    const detail = await response.text().catch(() => '');
+    throw new Error(`OpenAI API error ${response.status}: ${detail.slice(0, 200)}`);
+  }
+  const reply = await response.json();
+  return {
+    rawText: reply.choices?.[0]?.message?.content ?? '',
+    usage: {
+      input_tokens: reply.usage?.prompt_tokens ?? 0,
+      output_tokens: reply.usage?.completion_tokens ?? 0
+    },
+    model: reply.model || model
+  };
+}
+// ── Anthropic call ────────────────────────────────────────────────────────────
+/**
+ * Send one Messages API request to Anthropic and return the reply text.
+ *
+ * The request is aborted after 120 seconds.
+ *
+ * @param {Object} args
+ * @param {string} args.apiKey        - Sent in the `x-api-key` header.
+ * @param {string} args.model         - Model identifier to request.
+ * @param {string} args.systemMessage - Sent as the top-level `system` field.
+ * @param {string} args.userMessage   - Content of the single user message.
+ * @param {number} args.maxTokens     - Forwarded as `max_tokens`.
+ * @returns {Promise<{rawText: string, usage: {input_tokens: number, output_tokens: number}, model: string}>}
+ * @throws {Error} On a non-2xx response (status plus the first 200 characters
+ *   of the response body).
+ */
+async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
+  const url = `${anthropicBaseUrl()}/v1/messages`;
+  const body = {
+    model,
+    system: systemMessage,
+    messages: [{ role: 'user', content: userMessage }],
+    max_tokens: maxTokens
+  };
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'x-api-key': apiKey,
+      'anthropic-version': '2023-06-01'
+    },
+    body: JSON.stringify(body),
+    signal: AbortSignal.timeout(120_000)
+  });
+  if (!response.ok) {
+    // The body is best-effort diagnostics only; a read failure yields ''.
+    const errText = await response.text().catch(() => '');
+    throw new Error(`Anthropic API error ${response.status}: ${errText.slice(0, 200)}`);
+  }
+  const json = await response.json();
+  // `content` is a list of blocks. Reading only the first one drops the answer
+  // when a non-text block comes first or the text spans several blocks, so
+  // concatenate every block that carries a string `text`.
+  const blocks = Array.isArray(json.content) ? json.content : [];
+  const content = blocks
+    .filter((block) => typeof block?.text === 'string')
+    .map((block) => block.text)
+    .join('');
+  const usage = {
+    input_tokens: json.usage?.input_tokens ?? 0,
+    output_tokens: json.usage?.output_tokens ?? 0
+  };
+  return { rawText: content, usage, model: json.model || model };
+}
+// ── LLM dispatch ─────────────────────────────────────────────────────────────
+/**
+ * Route a request to the provider-specific caller.
+ * 'openai' goes to callOpenAI; every other provider value goes to callAnthropic.
+ *
+ * @param {Object} args - `provider` plus the fields forwarded unchanged to the
+ *   caller: apiKey, model, systemMessage, userMessage, maxTokens.
+ * @returns {Promise<Object>} Whatever the selected caller resolves to.
+ */
+async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
+  const request = { apiKey, model, systemMessage, userMessage, maxTokens };
+  const send = provider === 'openai' ? callOpenAI : callAnthropic;
+  return send(request);
+}
+// ── Tool class ────────────────────────────────────────────────────────────────
+/**
+ * MCP tool that turns a URL or raw text plus a natural-language instruction
+ * into structured JSON by calling an LLM provider.
+ */
+export class ExtractWithLlm {
+  /**
+   * @param {Object} [config] - Stored on the instance; execute() does not read it.
+   */
+  constructor(config = {}) {
+    this.config = config;
+  }
+  /**
+   * Execute LLM-powered extraction.
+   *
+   * Handled failures (bad input, missing API key, fetch error, LLM error,
+   * unparseable reply) are reported as `{ success: false, error }` rather than
+   * thrown. If the first reply is not valid JSON, the LLM is called once more
+   * with a stricter instruction and the token usage of both calls is summed.
+   *
+   * @param {Object} params
+   * @param {string}  [params.url]       - URL to fetch (at least one of url/content required; url wins when both are given)
+   * @param {string}  [params.content]   - Pre-fetched text content
+   * @param {string}   params.prompt     - Natural-language extraction instruction
+   * @param {Object}  [params.schema]    - Optional JSON-schema-like output hint
+   * @param {string}  [params.provider]  - 'openai' | 'anthropic' | 'auto'
+   * @param {string}  [params.model]     - Override default model
+   * @param {number}  [params.maxTokens] - Max output tokens (default 4096)
+   * @returns {Promise<Object>} `{ success: true, data, provider, model, usage }`
+   *   or `{ success: false, error, raw? }`.
+   */
+  async execute(params = {}) {
+    const {
+      url,
+      content,
+      prompt,
+      schema,
+      provider: providerParam = 'auto',
+      model: modelParam,
+      maxTokens = 4096
+    } = params;
+    // Validate: at least one of url or content must be provided.
+    if (!url && !content) {
+      return {
+        success: false,
+        error: 'extract_with_llm: either "url" or "content" must be provided'
+      };
+    }
+    if (!prompt) {
+      return { success: false, error: 'extract_with_llm: "prompt" is required' };
+    }
+    // Resolve provider + API key; a missing key becomes a structured failure.
+    let resolved;
+    try {
+      resolved = resolveProvider(providerParam);
+    } catch (err) {
+      return { success: false, error: err.message };
+    }
+    const { provider, apiKey } = resolved;
+    const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
+    const model = modelParam || defaultModel;
+    // Step 1: Get text to extract from
+    let text;
+    try {
+      if (url) {
+        const { textContent } = await fetchAndParse(url);
+        text = textContent;
+      } else {
+        text = content;
+      }
+    } catch (fetchErr) {
+      return { success: false, error: `Failed to fetch content: ${fetchErr.message}` };
+    }
+    // A page that yields no usable text used to escape the structured-error
+    // contract (TypeError while building the prompt) or burn an LLM call on
+    // nothing. Empty `content` is already rejected above; treat an empty
+    // fetch result the same way.
+    if (typeof text !== 'string' || text.length === 0) {
+      return {
+        success: false,
+        error: 'extract_with_llm: no text content available to extract from'
+      };
+    }
+    const systemMessage =
+      'You extract structured data from web content per the user\'s instructions. Return JSON only.';
+    const userMessage = buildUserMessage(prompt, text, schema);
+    // Step 2: First LLM call
+    let rawText, usage;
+    try {
+      ({ rawText, usage } = await callLLM({
+        provider, apiKey, model, systemMessage, userMessage, maxTokens
+      }));
+    } catch (llmErr) {
+      return { success: false, error: `LLM call failed: ${llmErr.message}` };
+    }
+    // Step 3: Parse JSON; retry once with stricter prompt if it fails
+    let parsed;
+    try {
+      parsed = parseJson(rawText);
+    } catch (_parseErr) {
+      // Retry with stricter instruction
+      const retryUserMessage =
+        `${userMessage}\n\nIMPORTANT: Your previous response was not valid JSON. ` +
+        'Respond with ONLY a JSON object or array. No explanation, no markdown fences.';
+      let retryRaw, retryUsage;
+      try {
+        ({ rawText: retryRaw, usage: retryUsage } = await callLLM({
+          provider, apiKey, model, systemMessage,
+          userMessage: retryUserMessage, maxTokens
+        }));
+        // Merge usage so the caller sees the cost of both attempts.
+        usage = {
+          input_tokens: usage.input_tokens + retryUsage.input_tokens,
+          output_tokens: usage.output_tokens + retryUsage.output_tokens
+        };
+      } catch (retryLlmErr) {
+        return { success: false, error: `LLM retry call failed: ${retryLlmErr.message}` };
+      }
+      try {
+        parsed = parseJson(retryRaw);
+      } catch (_retryParseErr) {
+        return {
+          success: false,
+          error: 'LLM did not return valid JSON after retry',
+          raw: retryRaw.slice(0, 500)
+        };
+      }
+    }
+    return {
+      success: true,
+      data: parsed,
+      provider,
+      model,
+      usage
+    };
+  }
+}
+export default ExtractWithLlm;

package/src/tools/search/providers/searxng.js ADDED Viewed

@@ -0,0 +1,126 @@
+/**
+ * SearXNG Search Provider
+ *
+ * Executes searches against a self-hosted SearXNG instance via its JSON API.
+ * Instance URL is read from the CRAWLFORGE_SEARXNG_URL environment variable.
+ *
+ * SearXNG JSON API reference:
+ *   https://docs.searxng.org/dev/search_api.html
+ *
+ * Result shape is normalised to match the CrawlForge/Google adapter format so
+ * the rest of the search pipeline (ranking, deduplication, caching) is unaffected.
+ */
+/**
+ * Convert one raw SearXNG result into the item shape the search pipeline uses.
+ *
+ * Field mapping (SearXNG → internal):
+ *   title   → title
+ *   url     → link, formattedUrl, and (hostname only) displayLink
+ *   content → snippet, htmlSnippet
+ * Every other SearXNG field is ignored; pagemap and metadata get fixed defaults.
+ * Missing fields become empty strings. When the URL cannot be parsed,
+ * displayLink falls back to the raw URL string.
+ *
+ * @param {Object} result - Raw SearXNG result entry
+ * @returns {Object} Normalised item
+ */
+export function normalizeSearxngResult(result) {
+  const link = result.url || '';
+  const hostOrRaw = (value) => {
+    try {
+      return new URL(value).hostname;
+    } catch {
+      return value;
+    }
+  };
+  const displayLink = hostOrRaw(link);
+  const title = result.title || '';
+  const snippet = result.content || '';
+  const htmlSnippet = result.content || '';
+  return {
+    title,
+    link,
+    snippet,
+    displayLink,
+    formattedUrl: link,
+    htmlSnippet,
+    pagemap: {},
+    metadata: { mime: null, fileFormat: null, cacheId: null }
+  };
+}
+/**
+ * Fetch search results from a SearXNG instance.
+ *
+ * The search endpoint is resolved relative to the instance URL's own path, so
+ * instances served under a sub-path (e.g. https://host/searxng) work. An
+ * instance URL that already ends in /search is used as-is.
+ *
+ * @param {Object} opts
+ * @param {string}  opts.query       - Search query string (must be non-empty)
+ * @param {number}  [opts.limit=10]  - Maximum number of results to return
+ * @param {number}  [opts.page=1]    - Page number (1-based)
+ * @param {boolean} [opts.safeSearch=true] - Whether safe search is enabled
+ * @param {string}  [opts.language='en']   - Language code (e.g. 'en', 'de')
+ * @param {string}  [opts.instanceUrl]     - Override for CRAWLFORGE_SEARXNG_URL
+ * @param {number}  [opts.timeoutMs=30000] - Abort the HTTP request after this many milliseconds
+ * @returns {Promise<Object>} Results in the internal adapter format
+ *   { items: Array, searchInformation: { totalResults, searchTime }, queries: {}, context: {} }
+ * @throws {Error} When no instance URL is configured, the query is empty, the
+ *   request fails or times out, the response status is not 2xx, or the body is
+ *   not valid JSON.
+ */
+export async function searchViaSearxng(opts = {}) {
+  const instanceUrl = opts.instanceUrl || process.env.CRAWLFORGE_SEARXNG_URL;
+  if (!instanceUrl) {
+    throw new Error(
+      "provider 'searxng' requires CRAWLFORGE_SEARXNG_URL in environment"
+    );
+  }
+  const {
+    query,
+    limit = 10,
+    page = 1,
+    safeSearch = true,
+    language = 'en',
+    timeoutMs = 30000
+  } = opts;
+  // Without this check a missing query is sent as the literal text "undefined".
+  if (typeof query !== 'string' || query.trim() === '') {
+    throw new Error('SearXNG search requires a non-empty query string');
+  }
+  // SearXNG safesearch: 0=off, 1=moderate, 2=strict
+  const safesearch = safeSearch ? 1 : 0;
+  // Append /search to the instance's own path instead of resolving the
+  // absolute path "/search", which would discard a sub-path prefix.
+  const url = new URL(instanceUrl);
+  const basePath = url.pathname.replace(/\/+$/, '');
+  url.pathname = basePath.endsWith('/search') ? basePath : `${basePath}/search`;
+  url.search = '';
+  url.hash = '';
+  url.searchParams.set('q', query);
+  url.searchParams.set('format', 'json');
+  url.searchParams.set('pageno', String(page));
+  url.searchParams.set('safesearch', String(safesearch));
+  url.searchParams.set('language', language);
+  let response;
+  try {
+    response = await fetch(url.toString(), {
+      headers: { Accept: 'application/json' },
+      // Bound the wait so an unresponsive instance cannot stall the pipeline.
+      signal: AbortSignal.timeout(timeoutMs)
+    });
+  } catch (err) {
+    throw new Error(`SearXNG request failed: ${err.message}`, { cause: err });
+  }
+  if (!response.ok) {
+    throw new Error(
+      `SearXNG returned HTTP ${response.status}: ${response.statusText}`
+    );
+  }
+  let data;
+  try {
+    data = await response.json();
+  } catch {
+    throw new Error('SearXNG returned invalid JSON');
+  }
+  const rawResults = Array.isArray(data?.results) ? data.results : [];
+  // A negative limit would make slice() count from the end; clamp it to 0.
+  const cap = Math.max(0, Math.trunc(limit) || 0);
+  const items = rawResults.slice(0, cap).map((entry) => normalizeSearxngResult(entry));
+  return {
+    items,
+    searchInformation: {
+      totalResults: String(rawResults.length),
+      // The JSON response is not read for timing data, so this is always 0.
+      searchTime: 0
+    },
+    queries: {},
+    context: {}
+  };
+}

package/src/tools/search/ranking/ResultDeduplicator.js CHANGED Viewed

@@ -1,10 +1,13 @@
 import { CacheManager } from '../../../core/cache/CacheManager.js';
 /**
- * Advanced search result deduplication system using multiple similarity algorithms
+ * Advanced search result deduplication system using multiple similarity algorithms.
+ * Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
+ * creating a duplicate CacheManager when used alongside ResultRanker.
  */
 export class ResultDeduplicator {
   constructor(options = {}) {
+    const { sharedCache, ...serializableOptions } = options;
     this.options = {
       // Similarity thresholds
       thresholds: {
@@ -13,7 +16,7 @@ export class ResultDeduplicator {
         content: 0.85,      // Content similarity threshold
         combined: 0.8       // Combined similarity threshold for final decision
       },
       // Deduplication strategies
       strategies: {
         urlNormalization: true,    // Normalize URLs for comparison
@@ -21,7 +24,7 @@ export class ResultDeduplicator {
         contentSimhash: true,      // Use SimHash for content comparison
         domainClustering: true     // Cluster results by domain
       },
       // URL normalization options
       urlNormalization: {
         removeProtocol: true,      // Remove http/https difference
@@ -32,7 +35,7 @@ export class ResultDeduplicator {
         removeEmptyParams: true,   // Remove empty query parameters
         lowercaseDomain: true      // Convert domain to lowercase
       },
       // Content similarity options
       contentSimilarity: {
         minLength: 10,             // Minimum content length to compare
@@ -40,7 +43,7 @@ export class ResultDeduplicator {
         simhashBits: 64,           // SimHash bit size
         hammingThreshold: 16       // Hamming distance threshold for SimHash
       },
       // Merge strategy
       mergeStrategy: {
         preserveBestRank: true,    // Keep the best ranking result as primary
@@ -48,17 +51,21 @@ export class ResultDeduplicator {
         preferHttps: true,         // Prefer HTTPS URLs when merging
         preferShorterUrl: true     // Prefer shorter, cleaner URLs
       },
       // Performance options
       cacheEnabled: true,
       cacheTTL: 3600000,          // 1 hour
-      ...options
+      ...serializableOptions
     };
-    // Initialize cache for deduplication computation
-    this.cache = this.options.cacheEnabled ?
-      new CacheManager({ ttl: this.options.cacheTTL }) : null;
+    // Use shared cache if provided, otherwise create own CacheManager instance.
+    // sharedCache is held separately — never in this.options — because it holds
+    // a setInterval Timer that would create a circular reference when the
+    // options object is JSON.stringify'd to build a cache key (see generateKey).
+    this.cache = sharedCache || (this.options.cacheEnabled
+      ? new CacheManager({ ttl: this.options.cacheTTL })
+      : null);
     // Statistics tracking
     this.stats = {
       totalProcessed: 0,

package/src/tools/search/ranking/ResultRanker.js CHANGED Viewed

@@ -1,10 +1,13 @@
 import { CacheManager } from '../../../core/cache/CacheManager.js';
 /**
- * Advanced search result ranking system with multiple scoring algorithms
+ * Advanced search result ranking system with multiple scoring algorithms.
+ * Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
+ * creating a duplicate CacheManager when used alongside ResultDeduplicator.
  */
 export class ResultRanker {
   constructor(options = {}) {
+    const { sharedCache, ...serializableOptions } = options;
     this.options = {
       // Ranking weight configuration
       weights: {
@@ -13,13 +16,13 @@ export class ResultRanker {
         authority: 0.2,      // URL/domain authority
         freshness: 0.1       // Content freshness
       },
       // BM25 parameters
       bm25: {
         k1: 1.5,             // Term frequency saturation parameter
         b: 0.75              // Length normalization parameter
       },
       // Authority scoring parameters
       authority: {
         domainBoosts: {      // Domain authority boosts
@@ -32,23 +35,27 @@ export class ResultRanker {
         httpsBoost: 0.1,     // HTTPS boost
         pathDepthPenalty: 0.02 // Penalty per path segment
       },
       // Freshness parameters
       freshness: {
         maxAgeMonths: 24,    // Content older than this gets 0 freshness score
         decayRate: 0.1       // Exponential decay rate per month
       },
       // Performance options
       cacheEnabled: true,
       cacheTTL: 3600000,     // 1 hour
-      ...options
+      ...serializableOptions
     };
-    // Initialize cache for score computation
-    this.cache = this.options.cacheEnabled ?
-      new CacheManager({ ttl: this.options.cacheTTL }) : null;
+    // Use shared cache if provided, otherwise create own CacheManager instance.
+    // sharedCache is held separately — never in this.options — because it holds
+    // a setInterval Timer that would create a circular reference when the
+    // options object is JSON.stringify'd to build a cache key (see generateKey).
+    this.cache = sharedCache || (this.options.cacheEnabled
+      ? new CacheManager({ ttl: this.options.cacheTTL })
+      : null);
     // Precompute domain authority scores
     this.domainAuthorityMap = new Map();
     this.initializeDomainAuthority();

package/src/tools/search/ranking/SearchResultCache.js ADDED Viewed

@@ -0,0 +1,52 @@
+/**
+ * SearchResultCache — unified cache layer for search ranking and deduplication.
+ *
+ * Both ResultRanker and ResultDeduplicator previously held separate CacheManager
+ * instances with identical TTL configuration. This module provides a single
+ * shared cache they can both use, halving the number of LRU cache instances
+ * created per SearchWebTool instantiation.
+ *
+ * Usage:
+ *   const cache = new SearchResultCache({ ttl: 3600000 });
+ *   // pass to ResultRanker and ResultDeduplicator via options.sharedCache
+ */
+import { CacheManager } from '../../../core/cache/CacheManager.js';
+/**
+ * Thin wrapper around a single CacheManager so that several consumers can
+ * share one cache. When constructed with `enabled: false` no CacheManager is
+ * created and every operation is a no-op.
+ */
+export class SearchResultCache {
+  /**
+   * @param {Object} [options]
+   * @param {number} [options.ttl=3600000]  — cache TTL in milliseconds
+   * @param {boolean} [options.enabled=true] — disable to skip caching
+   */
+  constructor(options = {}) {
+    const ttl = options.ttl === undefined ? 3600000 : options.ttl;
+    const enabled = options.enabled === undefined ? true : options.enabled;
+    this.enabled = enabled;
+    this._cache = null;
+    if (enabled) {
+      this._cache = new CacheManager({ ttl });
+    }
+  }
+  /**
+   * Look up a cached value.
+   * @param {string} key
+   * @returns {Promise<*>} The stored value; undefined on a miss or when disabled.
+   */
+  async get(key) {
+    const usable = this.enabled && this._cache;
+    return usable ? this._cache.get(key) : undefined;
+  }
+  /**
+   * Store a value under the given key; does nothing when disabled.
+   * @param {string} key
+   * @param {*} value
+   * @returns {Promise<*>} Whatever the underlying cache's set() returns, or
+   *   undefined when disabled.
+   */
+  async set(key, value) {
+    const usable = this.enabled && this._cache;
+    if (usable) {
+      return this._cache.set(key, value);
+    }
+  }
+  /**
+   * Build a cache key by delegating to the underlying cache.
+   * @param {string} namespace
+   * @param {Object} descriptor
+   * @returns {*} The generated key, or null when no underlying cache exists.
+   */
+  generateKey(namespace, descriptor) {
+    return this._cache ? this._cache.generateKey(namespace, descriptor) : null;
+  }
+  /**
+   * @returns {*} Stats from the underlying cache, or null when none exists.
+   */
+  getStats() {
+    if (this._cache) {
+      return this._cache.getStats();
+    }
+    return null;
+  }
+}
+export default SearchResultCache;