npm - crawlforge-mcp-server - Versions diffs - 3.0.17 → 3.3.1 - Mend

crawlforge-mcp-server 3.0.17 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/CLAUDE.md +2 -0
package/README.md +1 -0
package/package.json +6 -2
package/server.js +192 -1277
package/src/constants/config.js +2 -1
package/src/core/ActionExecutor.js +2 -43
package/src/core/AuthManager.js +230 -32
package/src/core/BrowserContextPool.js +187 -0
package/src/core/JobManager.js +7 -5
package/src/core/LocalizationManager.js +14 -125
package/src/core/ResearchOrchestrator.js +86 -5
package/src/core/StealthBrowserManager.js +26 -18
package/src/core/cache/CacheManager.js +4 -1
package/src/core/crawlers/BFSCrawler.js +19 -5
package/src/core/endpointGuard.js +37 -0
package/src/observability/metrics.js +137 -0
package/src/observability/tracing.js +74 -0
package/src/server/auth/oauth.js +388 -0
package/src/server/registerTool.js +41 -0
package/src/server/schemas/common.js +29 -0
package/src/server/transports/http.js +22 -0
package/src/server/transports/stdio.js +16 -0
package/src/server/transports/streamableHttp.js +226 -0
package/src/server/withAuth.js +121 -0
package/src/tools/advanced/BatchScrapeTool.js +12 -1086
package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
package/src/tools/advanced/batchScrape/index.js +328 -0
package/src/tools/advanced/batchScrape/queue.js +91 -0
package/src/tools/advanced/batchScrape/reporter.js +26 -0
package/src/tools/advanced/batchScrape/schema.js +37 -0
package/src/tools/advanced/batchScrape/worker.js +179 -0
package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
package/src/tools/basic/_fetch.js +35 -0
package/src/tools/basic/extractLinks.js +74 -0
package/src/tools/basic/extractMetadata.js +74 -0
package/src/tools/basic/extractText.js +46 -0
package/src/tools/basic/fetchUrl.js +44 -0
package/src/tools/basic/scrapeStructured.js +58 -0
package/src/tools/crawl/_sessionContext.js +234 -0
package/src/tools/crawl/crawlDeep.js +55 -5
package/src/tools/crawl/mapSite.js +23 -2
package/src/tools/extract/_fetchAndParse.js +57 -0
package/src/tools/extract/extractStructured.js +3 -19
package/src/tools/extract/extractWithLlm.js +295 -0
package/src/tools/research/deepResearch.js +33 -8
package/src/tools/search/providers/searxng.js +126 -0
package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
package/src/tools/search/ranking/ResultRanker.js +17 -10
package/src/tools/search/ranking/SearchResultCache.js +52 -0
package/src/tools/search/searchWeb.js +112 -6
package/src/tools/tracking/trackChanges/differ.js +98 -0
package/src/tools/tracking/trackChanges/index.js +432 -0
package/src/tools/tracking/trackChanges/monitor.js +93 -0
package/src/tools/tracking/trackChanges/notifier.js +105 -0
package/src/tools/tracking/trackChanges/schema.js +127 -0
package/src/tools/tracking/trackChanges.js +12 -1374

package/src/tools/crawl/_sessionContext.js ADDED Viewed

@@ -0,0 +1,234 @@
+/**
+ * _sessionContext.js
+ *
+ * Lightweight in-memory cookie jar for crawl session reuse.
+ * Zero external runtime dependencies — Set-Cookie headers are parsed
+ * with a minimal hand-rolled implementation that handles the attributes
+ * needed for a single-host crawl session (name, value, path, domain,
+ * secure, httponly, max-age, expires).
+ *
+ * Rationale for not using set-cookie-parser / tough-cookie:
+ *   - We only need same-origin cookie persistence within one crawl run.
+ *   - The crawl never spans multiple registered domains in a way that
+ *     requires full RFC 6265 compliance (partitioned jars, public suffix
+ *     list, etc.).
+ *   - Keeping zero new runtime deps satisfies the project constraint.
+ */
+/**
+ * Parse a single Set-Cookie header value into a cookie object.
+ *
+ * Only the attributes needed for a crawl session are interpreted: Domain,
+ * Path, Secure, Max-Age and Expires. Every other attribute (HttpOnly,
+ * SameSite, ...) is ignored.
+ *
+ * Returns null when:
+ *   - the header is empty, not a string, or has no `name=value` pair;
+ *   - `requestUrl` cannot be parsed as a URL;
+ *   - the Domain attribute names a host that the request host neither
+ *     equals nor is a subdomain of (a response must not be able to set
+ *     cookies for an unrelated site).
+ *
+ * No public-suffix check is performed, so `Domain=com` sent by
+ * `www.example.com` is still accepted.
+ *
+ * @param {string} header - Raw Set-Cookie header value
+ * @param {string} requestUrl - URL that issued the Set-Cookie response
+ * @returns {{ name: string, value: string, domain: string, path: string,
+ *             secure: boolean, expires: number|null }|null}
+ *   `domain` is lowercase with any leading dot removed; `expires` is an
+ *   epoch-millisecond timestamp, 0 for an explicit deletion, or null for a
+ *   session cookie.
+ */
+function parseSetCookie(header, requestUrl) {
+  if (typeof header !== 'string' || header === '') return null;
+  const parts = header.split(';').map(s => s.trim());
+  const eqIdx = parts[0].indexOf('=');
+  if (eqIdx === -1) return null;
+  const name = parts[0].slice(0, eqIdx).trim();
+  const value = parts[0].slice(eqIdx + 1).trim();
+  if (!name) return null;
+  let requestHost;
+  try {
+    // URL normalises the hostname to lowercase
+    requestHost = new URL(requestUrl).hostname;
+  } catch {
+    return null;
+  }
+  // Defaults derived from the request URL
+  let domain = requestHost;
+  let path = '/';
+  let secure = false;
+  let expires = null; // null = session cookie (lives until crawl ends)
+  for (const part of parts.slice(1)) {
+    const lower = part.toLowerCase();
+    if (lower.startsWith('domain=')) {
+      // Domains are case-insensitive: take the lowercased form so it can be
+      // compared with URL hostnames. Strip the leading dot; an empty value
+      // is ignored and the request host stays in effect.
+      const attr = lower.slice('domain='.length).trim().replace(/^\./, '');
+      if (attr) domain = attr;
+    } else if (lower.startsWith('path=')) {
+      // A path that is empty or does not start with '/' falls back to '/'
+      const attr = part.slice('path='.length).trim();
+      path = attr.startsWith('/') ? attr : '/';
+    } else if (lower === 'secure') {
+      secure = true;
+    } else if (lower.startsWith('max-age=')) {
+      const maxAge = Number.parseInt(part.slice('max-age='.length), 10);
+      if (!Number.isNaN(maxAge)) {
+        // Always overrides a value that came from Expires
+        expires = maxAge <= 0 ? 0 : Date.now() + maxAge * 1000;
+      }
+    } else if (lower.startsWith('expires=')) {
+      const ts = Date.parse(part.slice('expires='.length).trim());
+      if (!Number.isNaN(ts) && expires === null) {
+        // max-age takes precedence over expires
+        expires = ts;
+      }
+    }
+    // httponly is intentionally ignored — not relevant for a server-side crawler
+  }
+  // Reject cookies scoped to a domain the responding host does not belong to
+  if (domain !== requestHost && !requestHost.endsWith(`.${domain}`)) return null;
+  return { name, value, domain, path, secure, expires };
+}
+/**
+ * Determine whether a stored cookie should be sent for the given URL.
+ *
+ * A cookie is sent only when all of the following hold:
+ *   - it has not expired (`expires` is null or still in the future);
+ *   - the request host equals the cookie domain or is a subdomain of it;
+ *   - the request uses https when the cookie is flagged secure;
+ *   - the request path equals the cookie path, or lies beneath it on a
+ *     '/' boundary (cookie path `/foo` matches `/foo` and `/foo/bar`,
+ *     but not `/foobar`).
+ *
+ * @param {object} cookie - Stored cookie object
+ * @param {URL} urlObj - Parsed URL of the outgoing request
+ * @returns {boolean}
+ */
+function cookieMatchesUrl(cookie, urlObj) {
+  // Honour expiry
+  if (cookie.expires !== null && Date.now() > cookie.expires) return false;
+  // Domain: exact match or subdomain match (cookie.domain is already dot-stripped)
+  const host = urlObj.hostname;
+  if (host !== cookie.domain && !host.endsWith(`.${cookie.domain}`)) return false;
+  // Secure flag
+  if (cookie.secure && urlObj.protocol !== 'https:') return false;
+  // Path: a plain prefix test is not enough, the match must end on a
+  // path-segment boundary
+  const reqPath = urlObj.pathname || '/';
+  const cookiePath = cookie.path;
+  if (reqPath === cookiePath) return true;
+  if (!reqPath.startsWith(cookiePath)) return false;
+  return cookiePath.endsWith('/') || reqPath[cookiePath.length] === '/';
+}
+/**
+ * SessionContext — holds the shared cookie jar and custom headers for one
+ * crawl session. Passed into BFSCrawler so every page fetch participates
+ * in the same session.
+ */
+export class SessionContext {
+  /**
+   * @param {object} [options]
+   * @param {boolean} [options.persistCookies=true] - When false,
+   *   recordCookies() stores nothing and the jar stays empty.
+   * @param {Record<string,string>} [options.headers={}] - Headers merged
+   *   into every request built through applyToHeaders().
+   */
+  constructor(options = {}) {
+    this.persistCookies = options.persistCookies !== false; // default true
+    this.headers = options.headers || {};
+    /** @type {Array<{name,value,domain,path,secure,expires}>} */
+    this._jar = [];
+  }
+  /**
+   * Record cookies from a fetch Response into the jar.
+   *
+   * Uses `headers.getSetCookie()` when the runtime provides it, which
+   * yields one string per Set-Cookie header. Otherwise falls back to
+   * `headers.get('set-cookie')`, whose single string is parsed as one
+   * cookie.
+   *
+   * A cookie with the same name, domain and path as a stored one replaces
+   * it; a cookie that is already expired removes the stored one instead.
+   *
+   * @param {Response} response - Native fetch Response
+   * @param {string} requestUrl - URL the response came from
+   */
+  recordCookies(response, requestUrl) {
+    if (!this.persistCookies) return;
+    const rawHeaders = typeof response.headers.getSetCookie === 'function'
+      ? response.headers.getSetCookie()
+      : [response.headers.get('set-cookie')].filter(Boolean);
+    for (const raw of rawHeaders) {
+      const cookie = parseSetCookie(raw, requestUrl);
+      if (!cookie) continue;
+      // Upsert: replace any existing cookie with same name+domain+path
+      const idx = this._jar.findIndex(
+        c => c.name === cookie.name && c.domain === cookie.domain && c.path === cookie.path
+      );
+      if (cookie.expires !== null && Date.now() > cookie.expires) {
+        // Explicit deletion (max-age=0 or past expires)
+        if (idx !== -1) this._jar.splice(idx, 1);
+      } else if (idx !== -1) {
+        this._jar[idx] = cookie;
+      } else {
+        this._jar.push(cookie);
+      }
+    }
+  }
+  /**
+   * Build the `Cookie` header string for outgoing requests to the given URL.
+   *
+   * @param {string} url
+   * @returns {string} Cookie header value, or empty string when no stored
+   *   cookie applies or `url` cannot be parsed
+   */
+  getCookieHeader(url) {
+    let urlObj;
+    try {
+      urlObj = new URL(url);
+    } catch {
+      return '';
+    }
+    const matching = this._jar.filter(c => cookieMatchesUrl(c, urlObj));
+    return matching.map(c => `${c.name}=${c.value}`).join('; ');
+  }
+  /**
+   * Merge session headers + cookie header into a headers object.
+   * The caller's own headers take priority over session headers.
+   *
+   * When the jar has cookies for `url`, every existing cookie header —
+   * whatever the casing of its key — is folded into a single `Cookie`
+   * entry, followed by the jar cookies. Header names are case-insensitive,
+   * so leaving both `cookie` and `Cookie` keys would send two conflicting
+   * headers.
+   *
+   * @param {string} url
+   * @param {Record<string,string>} baseHeaders - Headers already built by the caller
+   * @returns {Record<string,string>} A new object; neither input is mutated
+   */
+  applyToHeaders(url, baseHeaders) {
+    const merged = { ...this.headers, ...baseHeaders };
+    const cookieHeader = this.getCookieHeader(url);
+    if (cookieHeader) {
+      const existing = [];
+      for (const key of Object.keys(merged)) {
+        if (key.toLowerCase() !== 'cookie') continue;
+        if (merged[key]) existing.push(merged[key]);
+        delete merged[key];
+      }
+      // Append to any existing Cookie header rather than clobber
+      merged['Cookie'] = [...existing, cookieHeader].join('; ');
+    }
+    return merged;
+  }
+  /**
+   * Return a copy of `headers` without the entries whose lowercased name
+   * is in `names`.
+   *
+   * @param {Record<string,string>} headers
+   * @param {Set<string>} names - Lowercase header names to drop
+   * @returns {Record<string,string>}
+   */
+  _withoutHeaders(headers, names) {
+    const kept = {};
+    for (const [key, val] of Object.entries(headers)) {
+      if (!names.has(key.toLowerCase())) kept[key] = val;
+    }
+    return kept;
+  }
+  /**
+   * Perform an optional "initial request" (e.g. a login POST) and capture
+   * any cookies it sets into the jar. Returns the response body text.
+   *
+   * Redirects are followed manually (up to 20 hops) so that cookies set on
+   * the redirect responses themselves are recorded. A login endpoint
+   * typically answers with `302` + `Set-Cookie`, and with
+   * `redirect: 'follow'` only the final response's headers are visible.
+   * This relies on the runtime's fetch exposing the 3xx response and its
+   * headers under `redirect: 'manual'`, as Node's built-in fetch does.
+   *
+   * Redirect handling follows the Fetch standard's rules: a POST that is
+   * redirected with 301/302, or any non-GET/HEAD request redirected with
+   * 303, continues as a body-less GET; 307/308 keep method and body.
+   * `Authorization` and `Proxy-Authorization` are dropped once a redirect
+   * crosses to another origin.
+   *
+   * @param {{ url: string, method?: string, headers?: Record<string,string>, body?: string }} req
+   * @returns {Promise<{ status: number, body: string }>} Status and body of
+   *   the final (non-redirect) response
+   * @throws {Error} When more than 20 redirects are encountered
+   */
+  async performInitialRequest(req) {
+    const { url, method = 'GET', headers: extraHeaders = {}, body } = req;
+    const maxRedirects = 20;
+    const redirectStatuses = [301, 302, 303, 307, 308];
+    // Lowercase names of headers that must no longer be sent on later hops
+    const dropped = new Set();
+    let currentUrl = url;
+    let currentMethod = method;
+    let currentBody = body;
+    for (let hop = 0; ; hop++) {
+      // Rebuilt on every hop so cookies captured so far are included
+      let requestHeaders = this.applyToHeaders(currentUrl, {
+        'User-Agent': 'MCP-WebScraper/1.0',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        ...extraHeaders
+      });
+      if (dropped.size > 0) {
+        requestHeaders = this._withoutHeaders(requestHeaders, dropped);
+      }
+      const fetchOpts = {
+        method: currentMethod,
+        headers: requestHeaders,
+        redirect: 'manual'
+      };
+      if (currentBody) {
+        fetchOpts.body = currentBody;
+      }
+      const response = await fetch(currentUrl, fetchOpts);
+      this.recordCookies(response, currentUrl);
+      const location = response.headers.get('location');
+      if (!redirectStatuses.includes(response.status) || !location) {
+        const text = await response.text().catch(() => '');
+        return { status: response.status, body: text };
+      }
+      if (hop >= maxRedirects) {
+        throw new Error(`Too many redirects (more than ${maxRedirects}) for ${url}`);
+      }
+      // Best-effort release of the unread redirect body; a failure here
+      // does not affect the next request
+      if (response.body) {
+        await response.body.cancel().catch(() => {});
+      }
+      const verb = currentMethod.toUpperCase();
+      const becomesGet =
+        (response.status === 303 && verb !== 'GET' && verb !== 'HEAD') ||
+        ((response.status === 301 || response.status === 302) && verb === 'POST');
+      if (becomesGet) {
+        currentMethod = 'GET';
+        currentBody = undefined;
+        // Headers describing the discarded body no longer apply
+        for (const h of ['content-type', 'content-length', 'content-encoding', 'content-language', 'content-location']) {
+          dropped.add(h);
+        }
+      }
+      const nextUrl = new URL(location, currentUrl);
+      if (nextUrl.origin !== new URL(currentUrl).origin) {
+        dropped.add('authorization');
+        dropped.add('proxy-authorization');
+      }
+      currentUrl = nextUrl.href;
+    }
+  }
+  /** Number of cookies currently held in the jar (for diagnostics). */
+  get cookieCount() {
+    return this._jar.length;
+  }
+}

package/src/tools/crawl/crawlDeep.js CHANGED Viewed

@@ -1,6 +1,8 @@
 import { z } from 'zod';
 import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
 import { DomainFilter } from '../../utils/domainFilter.js';
+import { CacheManager } from '../../core/cache/CacheManager.js';
+import { SessionContext } from './_sessionContext.js';
 const CrawlDeepSchema = z.object({
   url: z.string().url(),
@@ -56,24 +58,48 @@ const CrawlDeepSchema = z.object({
       concurrency: z.number().optional()
     })).optional().default({})
   }).optional(),
-  import_filter_config: z.string().optional() // JSON string of exported config
+  import_filter_config: z.string().optional(), // JSON string of exported config
+  // Session reuse: when enabled, all page fetches share a cookie jar and
+  // consistent headers — enabling login-then-crawl workflows.
+  session: z.object({
+    enabled: z.boolean(),
+    persistCookies: z.boolean().optional().default(true),
+    headers: z.record(z.string()).optional().default({}),
+    initialRequest: z.object({
+      url: z.string().url(),
+      method: z.string().optional().default('GET'),
+      headers: z.record(z.string()).optional().default({}),
+      body: z.string().optional()
+    }).optional()
+  }).optional()
 });
 export class CrawlDeepTool {
   constructor(options = {}) {
     const {
       userAgent = 'MCP-WebScraper/1.0',
-      timeout = 30000
+      timeout = 30000,
+      cacheEnabled = true,
+      cacheTTL = 3600000
     } = options;
     this.userAgent = userAgent;
     this.timeout = timeout;
+    // Per-session result cache: avoids redundant crawls of the same root URL
+    this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
   }
   async execute(params) {
     try {
       const validated = CrawlDeepSchema.parse(params);
+      // Cache dedup: skip re-crawling the same root URL within the TTL window
+      if (this.cache) {
+        const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
+        const cached = await this.cache.get(cacheKey);
+        if (cached) return cached;
+      }
       // Create domain filter if configuration provided
       let domainFilter = null;
       if (validated.import_filter_config) {
@@ -117,6 +143,20 @@ export class CrawlDeepTool {
         }
       }
+      // Set up session context when requested
+      let sessionContext = null;
+      if (validated.session?.enabled) {
+        sessionContext = new SessionContext({
+          persistCookies: validated.session.persistCookies,
+          headers: validated.session.headers || {}
+        });
+        // Perform optional login / pre-crawl request
+        if (validated.session.initialRequest) {
+          await sessionContext.performInitialRequest(validated.session.initialRequest);
+        }
+      }
       // Create crawler instance
       const crawler = new BFSCrawler({
         maxDepth: validated.max_depth,
@@ -128,7 +168,8 @@ export class CrawlDeepTool {
         concurrency: validated.concurrency,
         domainFilter: domainFilter,
         enableLinkAnalysis: validated.enable_link_analysis,
-        linkAnalyzerOptions: validated.link_analysis_options
+        linkAnalyzerOptions: validated.link_analysis_options,
+        sessionContext
       });
       // Start crawling
@@ -154,9 +195,18 @@ export class CrawlDeepTool {
         stats: results.stats,
         site_structure: this.analyzeSiteStructure(results.urls),
         domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
-        link_analysis: results.linkAnalysis
+        link_analysis: results.linkAnalysis,
+        session: sessionContext
+          ? { enabled: true, cookies_captured: sessionContext.cookieCount }
+          : { enabled: false }
       };
+      // Store in cache before returning
+      if (this.cache) {
+        const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
+        await this.cache.set(cacheKey, response);
+      }
       return response;
     } catch (error) {
       throw new Error(`Crawl failed: ${error.message}`);

package/src/tools/crawl/mapSite.js CHANGED Viewed

@@ -2,6 +2,7 @@ import { z } from 'zod';
 import { load } from 'cheerio';
 import { DomainFilter } from '../../utils/domainFilter.js';
 import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
+import { CacheManager } from '../../core/cache/CacheManager.js';
 const MapSiteSchema = z.object({
   url: z.string().url(),
@@ -23,16 +24,28 @@ export class MapSiteTool {
   constructor(options = {}) {
     const {
       userAgent = 'MCP-WebScraper/1.0',
-      timeout = 10000
+      timeout = 10000,
+      cacheEnabled = true,
+      cacheTTL = 3600000
     } = options;
     this.userAgent = userAgent;
     this.timeout = timeout;
+    // Per-session result cache: avoids redundant site maps for the same root URL
+    this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
   }
   async execute(params) {
     try {
       const validated = MapSiteSchema.parse(params);
+      // Cache dedup: skip re-mapping the same site within the TTL window
+      if (this.cache) {
+        const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
+        const cached = await this.cache.get(cacheKey);
+        if (cached) return cached;
+      }
       const baseUrl = getBaseUrl(validated.url);
       const urls = new Set();
       const metadata = new Map();
@@ -94,7 +107,7 @@ export class MapSiteTool {
         ? this.groupByPath(urlArray)
         : urlArray;
-      return {
+      const result = {
         base_url: baseUrl,
         total_urls: urlArray.length,
         urls: organized,
@@ -104,6 +117,14 @@ export class MapSiteTool {
         domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
         filter_stats: domainFilter ? domainFilter.getStats() : null
       };
+      // Store in cache before returning
+      if (this.cache) {
+        const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
+        await this.cache.set(cacheKey, result);
+      }
+      return result;
     } catch (error) {
       throw new Error(`Site mapping failed: ${error.message}`);
     }

package/src/tools/extract/_fetchAndParse.js ADDED Viewed

@@ -0,0 +1,57 @@
+/**
+ * _fetchAndParse.js — shared fetch + HTML parse helper for extract tools.
+ *
+ * Used by:
+ *   extractStructured.js
+ *   extractContent.js      (uses native fetch directly but can adopt this)
+ *   processDocument.js     (URL sources)
+ *
+ * Returns { html, $, textContent, finalUrl } so callers don't repeat
+ * the same fetch/cheerio/cleanup boilerplate.
+ */
+import { load } from 'cheerio';
+// User-Agent header sent by fetchAndParse when the caller does not pass options.userAgent.
+const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0)';
+// Request timeout in milliseconds used by fetchAndParse when options.timeoutMs is not passed.
+const DEFAULT_TIMEOUT_MS = 15000;
+/**
+ * Download a page and hand back both the raw markup and a Cheerio document.
+ *
+ * The request is aborted after `timeoutMs`. A non-2xx response is turned
+ * into an Error of the form `HTTP <status>: <statusText>`. The tags listed
+ * in `stripTags` are removed from the Cheerio document before the body
+ * text is collected, so `$` and `textContent` reflect the stripped
+ * document while `html` is the untouched response text.
+ *
+ * @param {string} url
+ * @param {Object} [options]
+ * @param {string}   [options.userAgent]   — User-Agent header value
+ * @param {number}   [options.timeoutMs]   — abort timeout in milliseconds
+ * @param {string[]} [options.stripTags]   — tags to remove (default: script, style, noscript, iframe, svg)
+ * @returns {Promise<{ html: string, $: import('cheerio').CheerioAPI, textContent: string, finalUrl: string }>}
+ *   `textContent` is the body text with whitespace runs collapsed to single
+ *   spaces; `finalUrl` is the URL reported by the response.
+ * @throws {Error} When the response status is not ok
+ */
+export async function fetchAndParse(url, options = {}) {
+  const {
+    userAgent: agent = DEFAULT_USER_AGENT,
+    timeoutMs: limitMs = DEFAULT_TIMEOUT_MS,
+    stripTags: unwantedTags = ['script', 'style', 'noscript', 'iframe', 'svg']
+  } = options;
+  const requestInit = {
+    headers: {
+      'User-Agent': agent,
+      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+    },
+    signal: AbortSignal.timeout(limitMs)
+  };
+  const res = await fetch(url, requestInit);
+  if (!res.ok) {
+    throw new Error(`HTTP ${res.status}: ${res.statusText}`);
+  }
+  const html = await res.text();
+  const $ = load(html);
+  // Drop non-content elements before reading the body text
+  if (unwantedTags.length > 0) {
+    const selector = unwantedTags.join(', ');
+    $(selector).remove();
+  }
+  const bodyText = $('body').text();
+  const textContent = bodyText.replace(/\s+/g, ' ').trim();
+  return { html, $, textContent, finalUrl: res.url };
+}

package/src/tools/extract/extractStructured.js CHANGED Viewed

@@ -7,6 +7,7 @@
 import { z } from 'zod';
 import { load } from 'cheerio';
 import { LLMManager } from '../../core/llm/LLMManager.js';
+import { fetchAndParse } from './_fetchAndParse.js';
 const ExtractStructuredSchema = z.object({
   url: z.string().url(),
@@ -73,25 +74,8 @@ export class ExtractStructuredTool {
       const validated = ExtractStructuredSchema.parse(params);
       const { url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints } = validated;
-      // Step 1: Fetch URL
-      const response = await fetch(url, {
-        headers: {
-          'User-Agent': this.userAgent,
-          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
-        },
-        signal: AbortSignal.timeout(15000)
-      });
-      if (!response.ok) {
-        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-      }
-      const html = await response.text();
-      // Step 2: Parse HTML with Cheerio, strip scripts/styles
-      const $ = load(html);
-      $('script, style, noscript, iframe, svg').remove();
-      const textContent = $('body').text().replace(/\s+/g, ' ').trim();
+      // Step 1: Fetch and parse — shared helper strips scripts/styles/iframes/svgs
+      const { html, $, textContent } = await fetchAndParse(url, { userAgent: this.userAgent });
       // Step 3: Try LLM extraction first
       let extractionResult = null;