npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.6.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/CLAUDE.md +19 -7
package/README.md +11 -3
package/package.json +3 -2
package/server.js +195 -22
package/src/cli/commands/init.js +107 -0
package/src/cli/index.js +2 -0
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/AgentOrchestrator.js +300 -0
package/src/core/AuthManager.js +21 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/agent/agent.js +71 -0
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +73 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +53 -52
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/scrape/unifiedScrape.js +314 -0
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/src/core/ActionExecutor.js CHANGED Viewed

@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
         // Execute chain with potential retries
         chainResult = await this.executeChainWithRetries(executionContext);
+        // Capture the LIVE post-action page state before the page is closed,
+        // so callers can extract final content reflecting all actions
+        // (instead of re-fetching the original URL).
+        try {
+          executionContext.finalHtml = await page.content();
+          executionContext.finalUrl = page.url();
+        } catch (captureErr) {
+          this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
+        }
         this.stats.successfulChains++;
         executionContext.success = true;
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
         success: true,
         chainId,
         url,
+        finalUrl: executionContext.finalUrl || url,
+        finalHtml: executionContext.finalHtml,
         executionTime: Date.now() - startTime,
         results: executionContext.results,
         screenshots: executionContext.screenshots,

package/src/core/AgentOrchestrator.js ADDED Viewed

@@ -0,0 +1,300 @@
+/**
+ * AgentOrchestrator — autonomous NL-prompt → search/navigate/extract → answer.
+ *
+ * Design: hardcoded 3-action state machine.
+ *   PLAN   — one SamplingClient call to decompose prompt into search queries
+ *   GATHER — search_web (≤maxUrls results total)
+ *   ACT    — fetchAndParse + relevance gate per URL
+ *   DECIDE — loop or answer (step/URL/time hard stops; never LLM-trusted)
+ *   SHAPE  — schema given → ExtractWithLlm (structured); otherwise → prose synthesis via SamplingClient
+ *
+ * Hard stops (enforced here, not by the LLM):
+ *   1. maxSteps iterations of the ACT loop
+ *   2. maxUrls total URLs fetched
+ *   3. wallClockMs wall-clock milliseconds (default 120 000)
+ *
+ * No-LLM-key path: if all LLM calls fail, return collected evidence + {degraded:true}.
+ * pro model: delegates to ResearchOrchestrator.conductResearch() for richer synthesis.
+ */
+import { fetchAndParse } from '../tools/extract/_fetchAndParse.js';
+import { SamplingClient } from './SamplingClient.js';
+const DEFAULT_WALL_CLOCK_MS = 120_000; // default run() wall-clock budget in ms; also the timeLimit handed to ResearchOrchestrator
+const DEFAULT_MAX_STEPS = 5; // default maxSteps for run() when the caller omits it
+const DEFAULT_MAX_URLS = 10; // default maxUrls for run() when the caller omits it
+// ── Helpers ───────────────────────────────────────────────────────────────────
+/**
+ * Naive relevance gate: does the fetched text contain any significant query term?
+ * Avoids an LLM call for an obviously irrelevant page.
+ *
+ * A term is "significant" when it is longer than 3 characters after leading and
+ * trailing punctuation has been stripped (so "pricing?" matches "pricing").
+ * The gate fails open: it returns true when text or query is empty, and also
+ * when the query contains no significant term to test against.
+ *
+ * @param {string} text  - Fetched page text
+ * @param {string} query - Natural-language prompt or search query
+ * @returns {boolean} false only when at least one significant term exists and none occurs in text
+ */
+function isRelevant(text, query) {
+  if (!text || !query) return true; // fail-open
+  const haystack = text.toLowerCase();
+  const terms = query
+    .toLowerCase()
+    .split(/\s+/)
+    // Drop surrounding punctuation; keep letters and digits of any script.
+    .map(term => term.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
+    .filter(term => term.length > 3);
+  // Nothing usable to match on (e.g. "GPT 4 API") — fail-open rather than
+  // rejecting every page.
+  if (terms.length === 0) return true;
+  return terms.some(term => haystack.includes(term));
+}
+/**
+ * Truncate text to a safe token budget (~8 000 chars ≈ ~2 000 tokens).
+ *
+ * Falsy input and text no longer than maxChars are returned unchanged.
+ * Otherwise the text is cut at maxChars UTF-16 code units and a
+ * "[...truncated]" marker is appended on a new line. If the cut would fall
+ * between the two halves of a surrogate pair, the dangling high surrogate is
+ * dropped so the result stays well-formed.
+ *
+ * @param {string} text
+ * @param {number} [maxChars=8000]
+ * @returns {string} the original text, or the truncated text plus marker
+ */
+function truncate(text, maxChars = 8000) {
+  if (!text || text.length <= maxChars) return text;
+  let end = maxChars;
+  const lastUnit = text.charCodeAt(end - 1);
+  // 0xD800–0xDBFF is the high-surrogate range; keeping one without its low
+  // half would leave a lone surrogate in the output.
+  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
+  return text.slice(0, end) + '\n[...truncated]';
+}
+// ── Orchestrator ──────────────────────────────────────────────────────────────
+export class AgentOrchestrator {
+  /**
+   * Store configuration only; every collaborator is created lazily on first use.
+   *
+   * @param {object} options
+   * @param {object|null} options.mcpServer  - McpServer instance (for SamplingClient)
+   * @param {object}      options.searchConfig - passed to SearchWebTool constructor
+   * @param {object}      options.llmConfig    - passed to ExtractWithLlm constructor
+   */
+  constructor(options = {}) {
+    this._mcpServer = options.mcpServer || null;
+    this._searchConfig = options.searchConfig || {};
+    this._llmConfig = options.llmConfig || {};
+    this._samplingClient = null;
+    this._searchTool = null;
+    this._extractWithLlm = null;
+    this._researchOrchestrator = null;
+  }
+  /** Set MCP server (called by agent.js after construction). */
+  setMcpServer(mcpServer) {
+    this._mcpServer = mcpServer;
+    this._samplingClient = null; // reset so it is rebuilt with the new server
+  }
+  // ── Lazy accessors ──────────────────────────────────────────────────────────
+  /** Build (once) and return the SamplingClient bound to the current MCP server. */
+  _getSamplingClient() {
+    if (!this._samplingClient) {
+      this._samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
+    }
+    return this._samplingClient;
+  }
+  /** Dynamically import and cache the SearchWebTool. */
+  async _getSearchTool() {
+    if (!this._searchTool) {
+      const { SearchWebTool } = await import('../tools/search/searchWeb.js');
+      this._searchTool = new SearchWebTool(this._searchConfig);
+    }
+    return this._searchTool;
+  }
+  /** Dynamically import and cache the ExtractWithLlm tool. */
+  async _getExtractWithLlm() {
+    if (!this._extractWithLlm) {
+      const { ExtractWithLlm } = await import('../tools/extract/extractWithLlm.js');
+      this._extractWithLlm = new ExtractWithLlm(this._llmConfig);
+    }
+    return this._extractWithLlm;
+  }
+  /** Dynamically import and cache the ResearchOrchestrator used by the 'pro' model. */
+  async _getResearchOrchestrator() {
+    if (!this._researchOrchestrator) {
+      const { ResearchOrchestrator } = await import('./ResearchOrchestrator.js');
+      this._researchOrchestrator = new ResearchOrchestrator({
+        maxUrls: 50,
+        timeLimit: DEFAULT_WALL_CLOCK_MS
+      });
+    }
+    return this._researchOrchestrator;
+  }
+  // ── Main entry ──────────────────────────────────────────────────────────────
+  /**
+   * Run the agent loop.
+   *
+   * Limits that are not finite numbers (NaN, Infinity, non-numeric values)
+   * fall back to their defaults so the hard stops can never be disabled.
+   *
+   * @param {object} params
+   * @param {string}    params.prompt      - Natural-language task
+   * @param {string[]}  [params.urls]      - Seed URLs (skips search for those)
+   * @param {object}    [params.schema]    - JSON schema for structured output
+   * @param {string}    [params.model]     - 'default' | 'pro'
+   * @param {number}    [params.maxSteps]  - Max ACT iterations (≤10)
+   * @param {number}    [params.maxUrls]   - Max URLs to fetch (≤20)
+   * @param {number}    [params.wallClockMs] - Wall-clock budget in ms
+   * @returns {Promise<object>} Result with `success`, `answer` and `degraded`;
+   *   the default path also reports `search_results`, `evidence`, `steps`
+   *   and `urls_fetched`, plus `reason` when degraded.
+   */
+  async run(params) {
+    const {
+      prompt,
+      urls: seedUrls = [],
+      schema,
+      model = 'default',
+      maxSteps = DEFAULT_MAX_STEPS,
+      maxUrls = DEFAULT_MAX_URLS,
+      wallClockMs = DEFAULT_WALL_CLOCK_MS
+    } = params;
+    // A NaN limit makes every `>=` comparison below false, which would
+    // silently switch the corresponding hard stop off.
+    const finiteOr = (value, fallback) => {
+      const n = Number(value);
+      return Number.isFinite(n) ? n : fallback;
+    };
+    const budgetMs = finiteOr(wallClockMs, DEFAULT_WALL_CLOCK_MS);
+    const startTime = Date.now();
+    const deadline = () => (Date.now() - startTime) >= budgetMs;
+    // Hard-cap params regardless of what caller sends
+    const capSteps = Math.min(finiteOr(maxSteps, DEFAULT_MAX_STEPS), 10);
+    const capUrls = Math.min(finiteOr(maxUrls, DEFAULT_MAX_URLS), 20);
+    // pro model: delegate to ResearchOrchestrator
+    if (model === 'pro') {
+      try {
+        const orchestrator = await this._getResearchOrchestrator();
+        const result = await orchestrator.conductResearch(prompt, {
+          maxUrls: capUrls,
+          timeLimit: budgetMs,
+          researchApproach: 'focused'
+        });
+        return { success: true, answer: result, model: 'pro', degraded: false };
+      } catch (err) {
+        // A pro failure is reported as a degraded result; the default path is
+        // not attempted.
+        return {
+          success: false,
+          degraded: true,
+          reason: `pro research failed: ${err.message}`,
+          answer: null
+        };
+      }
+    }
+    // ── PLAN ──────────────────────────────────────────────────────────────────
+    let searchQueries = [prompt]; // fallback: use raw prompt as query
+    try {
+      const planPrompt =
+        `Decompose this research task into 1-3 concise web search queries (one per line, no bullets):\n\n${prompt}`;
+      const { text } = await this._getSamplingClient().complete(planPrompt, { maxTokens: 200 });
+      // Strip any list markers the model added despite the instruction.
+      const lines = text.split('\n').map(l => l.replace(/^[-*\d.)\s]+/, '').trim()).filter(Boolean);
+      if (lines.length > 0) searchQueries = lines.slice(0, 3);
+    } catch {
+      // Sampling unavailable — use raw prompt
+    }
+    // ── GATHER (search) ───────────────────────────────────────────────────────
+    const urlQueue = [...seedUrls]; // start with any user-provided seeds
+    const searchResults = [];
+    if (urlQueue.length < capUrls) {
+      try {
+        const searchTool = await this._getSearchTool();
+        for (const q of searchQueries) {
+          if (deadline()) break;
+          try {
+            const sr = await searchTool.execute({ query: q, limit: Math.ceil(capUrls / searchQueries.length) });
+            const parsed = sr?.content?.[0]?.text ? JSON.parse(sr.content[0].text) : null;
+            if (parsed?.results) {
+              for (const r of parsed.results) {
+                if (r.link && !urlQueue.includes(r.link)) urlQueue.push(r.link);
+                searchResults.push({ query: q, title: r.title || '', url: r.link || '', snippet: r.snippet || '' });
+              }
+            }
+          } catch { /* skip failed search */ }
+        }
+      } catch { /* search tool init failed */ }
+    }
+    // ── ACT loop ──────────────────────────────────────────────────────────────
+    const evidence = [];
+    let urlsFetched = 0;
+    let step = 0;
+    for (const url of urlQueue) {
+      if (step >= capSteps || urlsFetched >= capUrls || deadline()) break;
+      // Both counters advance per attempt, so failed fetches count too.
+      step++;
+      urlsFetched++;
+      try {
+        const { textContent, finalUrl } = await fetchAndParse(url, { timeoutMs: 10000 });
+        // An empty page would pass the fail-open relevance gate and be sent on
+        // as an empty (or literal "undefined") source, bypassing the
+        // no-content return below.
+        if (typeof textContent !== 'string' || !textContent.trim()) continue;
+        if (!isRelevant(textContent, prompt)) continue;
+        evidence.push({
+          url: finalUrl || url,
+          text: truncate(textContent),
+          step
+        });
+      } catch { /* skip unreachable URL */ }
+    }
+    // ── SHAPE ─────────────────────────────────────────────────────────────────
+    const combinedText = evidence.map(e => `--- Source: ${e.url} ---\n${e.text}`).join('\n\n');
+    if (!combinedText.trim()) {
+      return {
+        success: true,
+        degraded: true,
+        reason: 'No content could be fetched for the given prompt.',
+        search_results: searchResults,
+        evidence: [],
+        answer: null,
+        steps: step,
+        urls_fetched: urlsFetched
+      };
+    }
+    // Schema path: use ExtractWithLlm for structured output
+    if (schema && Object.keys(schema).length > 0) {
+      try {
+        const extractWithLlm = await this._getExtractWithLlm();
+        const result = await extractWithLlm.execute({
+          content: combinedText,
+          prompt: `From the following research sources, answer this task and extract structured data:\n${prompt}`,
+          schema,
+          provider: 'auto'
+        });
+        return {
+          success: result.success,
+          answer: result.success ? result.data : null,
+          structured: true,
+          search_results: searchResults,
+          evidence: evidence.map(e => ({ url: e.url })),
+          degraded: !result.success,
+          reason: result.success ? undefined : result.error,
+          steps: step,
+          urls_fetched: urlsFetched
+        };
+      } catch {
+        // Structured extraction threw — fall through to prose synthesis
+      }
+    }
+    // Prose synthesis via SamplingClient
+    let answer = null;
+    let degraded = false;
+    let degradedReason;
+    try {
+      const synthesisPrompt =
+        `You are a research assistant. Based on the sources below, answer this task:\n\n` +
+        `Task: ${prompt}\n\n` +
+        `${truncate(combinedText, 12000)}\n\n` +
+        `Provide a clear, concise answer.`;
+      const { text } = await this._getSamplingClient().complete(synthesisPrompt, { maxTokens: 1024 });
+      answer = text;
+    } catch (err) {
+      degraded = true;
+      degradedReason = `LLM synthesis unavailable: ${err.message}`;
+      // Return raw evidence so the host LLM can synthesize
+      answer = null;
+    }
+    return {
+      success: true,
+      answer,
+      search_results: searchResults,
+      // Full evidence text is only returned when no answer could be synthesized.
+      evidence: degraded ? evidence : evidence.map(e => ({ url: e.url })),
+      degraded,
+      reason: degradedReason,
+      steps: step,
+      urls_fetched: urlsFetched
+    };
+  }
+  /** Release the cached ResearchOrchestrator, if one was created and exposes destroy(). */
+  async destroy() {
+    if (this._researchOrchestrator && typeof this._researchOrchestrator.destroy === 'function') {
+      await this._researchOrchestrator.destroy();
+    }
+  }
+}
+export default AgentOrchestrator;

package/src/core/AuthManager.js CHANGED Viewed

@@ -538,7 +538,13 @@ class AuthManager {
       extract_with_llm: 5,
       // D3.3: Pre-built site templates (1 credit per template scrape)
-      scrape_template: 1
+      scrape_template: 1,
+      // Phase D (v4.6.0)
+      // scrape: base 2; projectCost() scales with format count
+      scrape: 2,
+      // agent: base 8; projectCost() scales with maxUrls
+      agent: 8
     };
     return costs[tool] || 1;
@@ -585,6 +591,20 @@ class AuthManager {
       case 'extract_with_llm':
         note = 'Includes external LLM API call cost (not billed in credits, billed by your LLM provider).';
         break;
+      case 'scrape': {
+        // Base 2 + 1 per format beyond the first
+        const fmtCount = Array.isArray(params?.formats) ? params.formats.length : 1;
+        projected = Math.max(base, base + Math.max(0, fmtCount - 1));
+        note = `Estimated from ${fmtCount} format(s). json format may incur external LLM cost.`;
+        break;
+      }
+      case 'agent': {
+        const agentUrls = params?.maxUrls || 10;
+        const isPro = params?.model === 'pro';
+        projected = Math.max(base, base + Math.ceil(agentUrls / 5) + (isPro ? 5 : 0));
+        note = `Lower-bound estimate. Scales with maxUrls (${agentUrls}).${isPro ? ' pro model adds deep-research cost.' : ''} External LLM billed separately.`;
+        break;
+      }
       default:
         note = 'Fixed cost per invocation.';
     }

package/src/core/ChangeTracker.js CHANGED Viewed

@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
    */
   async compareWithBaseline(url, currentContent, options = {}) {
     const startTime = Date.now();
+    // Expected no-baseline case: return a clean error WITHOUT emitting an
+    // unhandled 'error' event (which would crash callers with no 'error' listener).
+    if (!this.snapshots.has(url)) {
+      throw new Error(`No baseline found for ${url} — run create_baseline first`);
+    }
     try {
-      if (!this.snapshots.has(url)) {
-        throw new Error(`No baseline found for URL: ${url}`);
-      }
       const snapshots = this.snapshots.get(url);
       const baseline = snapshots[snapshots.length - 1]; // Get latest baseline

package/src/core/LLMsTxtAnalyzer.js CHANGED Viewed

@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
       respectRobots: options.respectRobots !== false,
       detectAPIs: options.detectAPIs !== false,
       analyzeContent: options.analyzeContent !== false,
-      checkSecurity: options.checkSecurity !== false,
+      // C1: intrusive probing is now opt-in (default false) to avoid hammering
+      // security-sensitive and rate-probe paths on every generation run.
+      checkSecurity: options.checkSecurity === true,
+      probeRateLimit: options.probeRateLimit === true,
       ...options
     };
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
         analysisOptions: { ...this.options, ...options }
       };
-      // Phase 1: Site Structure Analysis
+      // Phase 1: Site Structure Analysis (must run first — subsequent phases
+      // depend on the URL list it produces)
       await this.analyzeSiteStructure(url, options);
-      // Phase 2: API Detection
+      // Phases 2-5 run in parallel where they are independent of each other.
+      // detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
+      // fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
+      // analyzeRateLimiting is only executed when the caller opts in via
+      // probeRateLimit:true — its 5 sequential requests are intrusive.
+      const parallelTasks = [];
       if (this.options.detectAPIs) {
-        await this.detectAPIEndpoints(url);
+        parallelTasks.push(this.detectAPIEndpoints(url));
       }
-      // Phase 3: Content Classification
       if (this.options.analyzeContent) {
-        await this.classifyContent();
+        parallelTasks.push(this.classifyContent());
       }
-      // Phase 4: Security Analysis
       if (this.options.checkSecurity) {
-        await this.analyzeSecurity(url);
+        parallelTasks.push(this.analyzeSecurity(url));
+      }
+      if (this.options.probeRateLimit) {
+        parallelTasks.push(this.analyzeRateLimiting(url));
       }
-      // Phase 5: Rate Limiting Analysis
-      await this.analyzeRateLimiting(url);
+      await Promise.all(parallelTasks);
       // Phase 6: Generate Guidelines
       await this.generateUsageGuidelines();
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
   /**
    * Detect API endpoints and data sources
+   * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
    */
   async detectAPIEndpoints(baseUrl) {
     logger.info('Detecting API endpoints...');
+    const PROBE_CONCURRENCY = 6;
     try {
-      const apis = [];
       const commonPaths = [
         '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
         '/data', '/feed', '/json', '/xml', '/rss',
         '/.well-known', '/openapi', '/swagger'
       ];
-      // Check common API paths
-      for (const path of commonPaths) {
-        const apiUrl = `${baseUrl}${path}`;
-        try {
-          const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
-          if (response.ok) {
-            const contentType = response.headers.get('content-type') || '';
-            apis.push({
-              url: apiUrl,
-              type: this.determineAPIType(apiUrl, contentType),
-              status: response.status,
-              contentType,
-              accessible: true
-            });
-          }
-        } catch {
-          // API endpoint not accessible or doesn't exist
+      // Run path probes in parallel batches
+      const apis = [];
+      for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
+        const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
+        const results = await Promise.allSettled(
+          batch.map(async (path) => {
+            const apiUrl = `${baseUrl}${path}`;
+            const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
+            if (response.ok) {
+              const contentType = response.headers.get('content-type') || '';
+              return {
+                url: apiUrl,
+                type: this.determineAPIType(apiUrl, contentType),
+                status: response.status,
+                contentType,
+                accessible: true
+              };
+            }
+            return null;
+          })
+        );
+        for (const r of results) {
+          if (r.status === 'fulfilled' && r.value) apis.push(r.value);
         }
       }
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
   /**
    * Analyze security boundaries and sensitive areas
+   * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
    */
   async analyzeSecurity(baseUrl) {
     logger.info('Analyzing security boundaries...');
-    try {
-      const securityAreas = [];
+    const PROBE_CONCURRENCY = 6;
+    try {
       // Check for common sensitive paths
       const sensitivePaths = [
         '/admin', '/administrator', '/wp-admin', '/cms',
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
         '/config', '/settings', '/env'
       ];
-      for (const path of sensitivePaths) {
-        const testUrl = `${baseUrl}${path}`;
-        try {
-          const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
-          if (response.status === 200 || response.status === 302 || response.status === 401) {
-            securityAreas.push({
-              path,
-              url: testUrl,
-              status: response.status,
-              type: this.classifySecurityArea(path),
-              recommendation: 'restrict'
-            });
-          }
-        } catch {
-          // Area not accessible
+      // Run path probes in parallel batches
+      const securityAreas = [];
+      for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
+        const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
+        const results = await Promise.allSettled(
+          batch.map(async (path) => {
+            const testUrl = `${baseUrl}${path}`;
+            const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
+            if (response.status === 200 || response.status === 302 || response.status === 401) {
+              return {
+                path,
+                url: testUrl,
+                status: response.status,
+                type: this.classifySecurityArea(path),
+                recommendation: 'restrict'
+              };
+            }
+            return null;
+          })
+        );
+        for (const r of results) {
+          if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
         }
       }

package/src/core/LocalizationManager.js CHANGED Viewed

@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
   }
   /**
-   * Detect and handle geo-blocked content
+   * Detect geo-blocked content and return suggestions.
+   * C3: renamed from handleGeoBlocking — no bypass is actually applied here;
+   * the returned bypassStrategies are recommendations only.
    * @param {string} url - URL to check
    * @param {Object} response - HTTP response object
-   * @returns {Object} - Analysis and bypass suggestions
+   * @returns {Object} - Detection result and bypass suggestions
    */
-  async handleGeoBlocking(url, response) {
+  async detectGeoBlocking(url, response) {
     const geoBlockingIndicators = [
       /not available in your country/i,
       /access denied/i,
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
     }
     // Phone number pattern analysis
+    // C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
     const phonePatterns = {
-      'US': /\+1[\s.-]?\(?\\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
+      'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
       'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
       'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
       'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
               }
             }
-            if (contentData && contentData.content) {
+            // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
+            const contentText = contentData && contentData.content
+              ? (typeof contentData.content === 'string'
+                  ? contentData.content
+                  : (contentData.content.text || ''))
+              : '';
+            // Only count and enhance sources that actually produced non-empty content.
+            // Skip failed extractions and empty {text:""} results.
+            if (contentData && contentData.success !== false && contentText.trim().length > 0) {
               this.metrics.contentExtracted++;
-              // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
-              const contentText = typeof contentData.content === 'string'
-                ? contentData.content
-                : (contentData.content.text || JSON.stringify(contentData.content));
               // Enhance source with extracted content
               let enhancedSource = {
                 ...source,