crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CLAUDE.md +19 -7
  2. package/README.md +11 -3
  3. package/package.json +3 -2
  4. package/server.js +195 -22
  5. package/src/cli/commands/init.js +107 -0
  6. package/src/cli/index.js +2 -0
  7. package/src/constants/config.js +5 -0
  8. package/src/core/ActionExecutor.js +13 -1
  9. package/src/core/AgentOrchestrator.js +300 -0
  10. package/src/core/AuthManager.js +21 -1
  11. package/src/core/ChangeTracker.js +8 -5
  12. package/src/core/LLMsTxtAnalyzer.js +71 -47
  13. package/src/core/LocalizationManager.js +7 -4
  14. package/src/core/ResearchOrchestrator.js +10 -6
  15. package/src/core/StealthBrowserManager.js +52 -13
  16. package/src/core/analysis/ContentAnalyzer.js +2 -2
  17. package/src/core/crawlers/BFSCrawler.js +23 -12
  18. package/src/core/processing/ContentProcessor.js +19 -3
  19. package/src/core/processing/PDFProcessor.js +72 -23
  20. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  21. package/src/tools/advanced/batchScrape/index.js +3 -1
  22. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  23. package/src/tools/advanced/batchScrape/worker.js +6 -1
  24. package/src/tools/agent/agent.js +71 -0
  25. package/src/tools/basic/_fetch.js +78 -5
  26. package/src/tools/basic/extractLinks.js +1 -1
  27. package/src/tools/basic/extractMetadata.js +65 -1
  28. package/src/tools/basic/extractText.js +73 -5
  29. package/src/tools/basic/scrapeStructured.js +48 -10
  30. package/src/tools/crawl/crawlDeep.js +13 -5
  31. package/src/tools/crawl/mapSite.js +53 -52
  32. package/src/tools/extract/analyzeContent.js +11 -6
  33. package/src/tools/extract/extractContent.js +23 -5
  34. package/src/tools/extract/extractStructured.js +65 -16
  35. package/src/tools/extract/extractWithLlm.js +192 -11
  36. package/src/tools/extract/listOllamaModels.js +19 -8
  37. package/src/tools/extract/processDocument.js +10 -4
  38. package/src/tools/extract/summarizeContent.js +58 -1
  39. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  40. package/src/tools/research/deepResearch.js +43 -4
  41. package/src/tools/scrape/unifiedScrape.js +314 -0
  42. package/src/tools/search/providers/searxng.js +2 -2
  43. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  44. package/src/tools/search/ranking/ResultRanker.js +13 -4
  45. package/src/tools/search/searchWeb.js +5 -5
  46. package/src/tools/templates/TemplateRegistry.js +3 -2
  47. package/src/tools/tracking/trackChanges/differ.js +33 -1
  48. package/src/utils/htmlToMarkdown.js +5 -1
@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
213
213
 
214
214
  // Execute chain with potential retries
215
215
  chainResult = await this.executeChainWithRetries(executionContext);
216
-
216
+
217
+ // Capture the LIVE post-action page state before the page is closed,
218
+ // so callers can extract final content reflecting all actions
219
+ // (instead of re-fetching the original URL).
220
+ try {
221
+ executionContext.finalHtml = await page.content();
222
+ executionContext.finalUrl = page.url();
223
+ } catch (captureErr) {
224
+ this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
225
+ }
226
+
217
227
  this.stats.successfulChains++;
218
228
  executionContext.success = true;
219
229
 
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
268
278
  success: true,
269
279
  chainId,
270
280
  url,
281
+ finalUrl: executionContext.finalUrl || url,
282
+ finalHtml: executionContext.finalHtml,
271
283
  executionTime: Date.now() - startTime,
272
284
  results: executionContext.results,
273
285
  screenshots: executionContext.screenshots,
@@ -0,0 +1,300 @@
1
+ /**
2
+ * AgentOrchestrator — autonomous NL-prompt → search/navigate/extract → answer.
3
+ *
4
+ * Design: hardcoded 3-action state machine.
5
+ * PLAN — one SamplingClient call to decompose prompt into search queries
6
+ * GATHER — search_web (≤maxUrls results total)
7
+ * ACT — fetchAndParse + relevance gate per URL
8
+ * DECIDE — loop or answer (step/URL/time hard stops; never LLM-trusted)
9
+ * SHAPE — schema given → ExtractWithLlm (structured output); otherwise, or if that call throws → prose synthesis via SamplingClient
10
+ *
11
+ * Hard stops (enforced here, not by the LLM):
12
+ * 1. maxSteps iterations of the ACT loop
13
+ * 2. maxUrls total URLs fetched
14
+ * 3. wallClockMs wall-clock milliseconds (default 120 000)
15
+ *
16
+ * No-LLM-key path: if all LLM calls fail, return collected evidence + {degraded:true}.
17
+ * pro model: delegates to ResearchOrchestrator.conductResearch() for richer synthesis.
18
+ */
19
+
20
+ import { fetchAndParse } from '../tools/extract/_fetchAndParse.js';
21
+ import { SamplingClient } from './SamplingClient.js';
22
+
23
+ const DEFAULT_WALL_CLOCK_MS = 120_000;
24
+ const DEFAULT_MAX_STEPS = 5;
25
+ const DEFAULT_MAX_URLS = 10;
26
+
27
+ // ── Helpers ───────────────────────────────────────────────────────────────────
28
+
29
/**
 * Naive relevance gate: does the fetched text contain any significant query term?
 * Avoids an LLM call for an obviously irrelevant page.
 *
 * The gate is deliberately fail-open: whenever it cannot make a meaningful
 * decision (missing/non-string input, or a query with no term longer than
 * three characters such as "AI vs ML") it returns true so the page is kept.
 *
 * @param {string} text - Page text to inspect
 * @param {string} query - Natural-language query / prompt
 * @returns {boolean} true when the page should be kept as evidence
 */
function isRelevant(text, query) {
  if (typeof text !== 'string' || typeof query !== 'string') return true; // fail-open
  if (!text || !query) return true; // fail-open

  const haystack = text.toLowerCase();

  // Strip punctuation glued to either end of a term ("pricing?" -> "pricing")
  // but keep interior punctuation so tokens like "node.js" still match verbatim.
  const terms = query
    .toLowerCase()
    .split(/\s+/)
    .map((term) => term.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
    .filter((term) => term.length > 3);

  // No significant terms to test against: keep the page rather than reject everything.
  if (terms.length === 0) return true;

  return terms.some((term) => haystack.includes(term));
}
38
+
39
/**
 * Cap text at a character budget so it stays within a safe token budget
 * (the default of 8 000 chars is roughly 2 000 tokens).
 *
 * Falsy input and text already within the budget are returned unchanged;
 * longer text is cut at `maxChars` and a truncation marker is appended.
 *
 * @param {string} text - Text to cap
 * @param {number} [maxChars=8000] - Maximum number of characters to keep
 * @returns {string} The original or shortened text
 */
function truncate(text, maxChars = 8000) {
  const fitsBudget = !text || text.length <= maxChars;
  return fitsBudget ? text : `${text.slice(0, maxChars)}\n[...truncated]`;
}
46
+
47
+ // ── Orchestrator ──────────────────────────────────────────────────────────────
48
+
49
export class AgentOrchestrator {
  /**
   * @param {object} [options]
   * @param {object|null} [options.mcpServer] - McpServer instance (for SamplingClient)
   * @param {object} [options.searchConfig] - passed to SearchWebTool constructor
   * @param {object} [options.llmConfig] - passed to ExtractWithLlm constructor
   */
  constructor(options = {}) {
    this._mcpServer = options.mcpServer || null;
    this._searchConfig = options.searchConfig || {};
    this._llmConfig = options.llmConfig || {};
    // Collaborators are created lazily on first use (see the _get* accessors).
    this._samplingClient = null;
    this._searchTool = null;
    this._extractWithLlm = null;
    this._researchOrchestrator = null;
  }

  /**
   * Coerce a caller-supplied budget to a number and apply an upper bound.
   *
   * Values that do not coerce to a number (NaN, undefined, arbitrary objects)
   * fall back to the default. Without this, Math.min(NaN, max) is NaN and every
   * `x >= NaN` comparison is false, which silently disables the hard stop.
   *
   * @param {*} value - Caller-supplied budget
   * @param {number} fallback - Value to use when `value` is not numeric
   * @param {number} [max=Infinity] - Upper bound applied to numeric values
   * @returns {number}
   */
  static _clampBudget(value, fallback, max = Infinity) {
    const numeric = Number(value);
    return Number.isNaN(numeric) ? fallback : Math.min(numeric, max);
  }

  /** Set MCP server (called by agent.js after construction). */
  setMcpServer(mcpServer) {
    this._mcpServer = mcpServer;
    this._samplingClient = null; // reset so it is rebuilt with the new server
  }

  // ── Lazy accessors ──────────────────────────────────────────────────────────

  /** @returns {SamplingClient} Cached client bound to the current MCP server. */
  _getSamplingClient() {
    if (!this._samplingClient) {
      this._samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
    }
    return this._samplingClient;
  }

  /** @returns {Promise<object>} Cached SearchWebTool, imported on first use. */
  async _getSearchTool() {
    if (!this._searchTool) {
      const { SearchWebTool } = await import('../tools/search/searchWeb.js');
      this._searchTool = new SearchWebTool(this._searchConfig);
    }
    return this._searchTool;
  }

  /** @returns {Promise<object>} Cached ExtractWithLlm, imported on first use. */
  async _getExtractWithLlm() {
    if (!this._extractWithLlm) {
      const { ExtractWithLlm } = await import('../tools/extract/extractWithLlm.js');
      this._extractWithLlm = new ExtractWithLlm(this._llmConfig);
    }
    return this._extractWithLlm;
  }

  /** @returns {Promise<object>} Cached ResearchOrchestrator, imported on first use. */
  async _getResearchOrchestrator() {
    if (!this._researchOrchestrator) {
      const { ResearchOrchestrator } = await import('./ResearchOrchestrator.js');
      this._researchOrchestrator = new ResearchOrchestrator({
        maxUrls: 50,
        timeLimit: DEFAULT_WALL_CLOCK_MS
      });
    }
    return this._researchOrchestrator;
  }

  // ── Main entry ──────────────────────────────────────────────────────────────

  /**
   * Run the agent loop.
   *
   * @param {object} params
   * @param {string} params.prompt - Natural-language task
   * @param {string[]} [params.urls] - Seed URLs (fetched before any search results)
   * @param {object} [params.schema] - JSON schema for structured output
   * @param {string} [params.model] - 'default' | 'pro'
   * @param {number} [params.maxSteps] - Max ACT iterations (≤10)
   * @param {number} [params.maxUrls] - Max URLs to fetch (≤20)
   * @param {number} [params.wallClockMs] - Wall-clock budget in ms
   * @returns {Promise<object>} Result object with `success`, `answer` and
   *   `degraded`; the default path also reports `search_results`, `evidence`,
   *   `steps`, `urls_fetched` and (when degraded) `reason`.
   */
  async run(params) {
    const {
      prompt,
      urls: seedUrls = [],
      schema,
      model = 'default',
      maxSteps = DEFAULT_MAX_STEPS,
      maxUrls = DEFAULT_MAX_URLS,
      wallClockMs = DEFAULT_WALL_CLOCK_MS
    } = params;

    const startTime = Date.now();

    // Hard-cap params regardless of what caller sends. Non-numeric values fall
    // back to the defaults so a NaN can never switch a hard stop off.
    const budgetMs = AgentOrchestrator._clampBudget(wallClockMs, DEFAULT_WALL_CLOCK_MS);
    const capSteps = AgentOrchestrator._clampBudget(maxSteps, DEFAULT_MAX_STEPS, 10);
    const capUrls = AgentOrchestrator._clampBudget(maxUrls, DEFAULT_MAX_URLS, 20);

    const deadline = () => (Date.now() - startTime) >= budgetMs;

    // pro model: delegate to ResearchOrchestrator
    if (model === 'pro') {
      try {
        const orchestrator = await this._getResearchOrchestrator();
        const result = await orchestrator.conductResearch(prompt, {
          maxUrls: capUrls,
          timeLimit: budgetMs,
          researchApproach: 'focused'
        });
        return { success: true, answer: result, model: 'pro', degraded: false };
      } catch (err) {
        // pro failure is reported to the caller; it does NOT fall back to the default path.
        return {
          success: false,
          degraded: true,
          reason: `pro research failed: ${err.message}`,
          answer: null
        };
      }
    }

    // ── PLAN ──────────────────────────────────────────────────────────────────
    let searchQueries = [prompt]; // fallback: use raw prompt as query
    try {
      const planPrompt =
        `Decompose this research task into 1-3 concise web search queries (one per line, no bullets):\n\n${prompt}`;
      const { text } = await this._getSamplingClient().complete(planPrompt, { maxTokens: 200 });
      // Strip any bullet / numbering the model added despite the instruction.
      const lines = text.split('\n').map(l => l.replace(/^[-*\d.)\s]+/, '').trim()).filter(Boolean);
      if (lines.length > 0) searchQueries = lines.slice(0, 3);
    } catch {
      // Sampling unavailable — use raw prompt
    }

    // ── GATHER (search) ───────────────────────────────────────────────────────
    // urlQueue keeps fetch order; queuedUrls gives O(1) de-duplication so a URL
    // listed twice (in seeds or across searches) is never fetched twice.
    const urlQueue = [];
    const queuedUrls = new Set();
    const enqueue = (candidate) => {
      if (!candidate || queuedUrls.has(candidate)) return;
      queuedUrls.add(candidate);
      urlQueue.push(candidate);
    };

    // Start with any user-provided seeds (a non-array value is ignored).
    for (const seed of Array.isArray(seedUrls) ? seedUrls : []) enqueue(seed);

    const searchResults = [];

    if (urlQueue.length < capUrls) {
      try {
        const searchTool = await this._getSearchTool();
        for (const q of searchQueries) {
          if (deadline()) break;
          try {
            const sr = await searchTool.execute({ query: q, limit: Math.ceil(capUrls / searchQueries.length) });
            const parsed = sr?.content?.[0]?.text ? JSON.parse(sr.content[0].text) : null;
            if (parsed?.results) {
              for (const r of parsed.results) {
                enqueue(r.link);
                searchResults.push({ query: q, title: r.title || '', url: r.link || '', snippet: r.snippet || '' });
              }
            }
          } catch { /* skip failed search */ }
        }
      } catch { /* search tool init failed */ }
    }

    // ── ACT loop ──────────────────────────────────────────────────────────────
    const evidence = [];
    let urlsFetched = 0;
    let step = 0;

    for (const url of urlQueue) {
      // DECIDE: hard stops are enforced here and never delegated to the LLM.
      if (step >= capSteps || urlsFetched >= capUrls || deadline()) break;
      // Attempts count against the budget even when the fetch fails.
      step++;
      urlsFetched++;

      try {
        const { textContent, finalUrl } = await fetchAndParse(url, { timeoutMs: 10000 });
        if (!isRelevant(textContent, prompt)) continue;
        evidence.push({
          // Fall back to the requested URL when no post-redirect URL is reported.
          url: finalUrl || url,
          text: truncate(textContent),
          step
        });
      } catch { /* skip unreachable URL */ }
    }

    // ── SHAPE ─────────────────────────────────────────────────────────────────
    const combinedText = evidence.map(e => `--- Source: ${e.url} ---\n${e.text}`).join('\n\n');

    if (!combinedText.trim()) {
      return {
        success: true,
        degraded: true,
        reason: 'No content could be fetched for the given prompt.',
        search_results: searchResults,
        evidence: [],
        answer: null,
        steps: step,
        urls_fetched: urlsFetched
      };
    }

    // Schema path: use ExtractWithLlm for structured output
    if (schema && Object.keys(schema).length > 0) {
      try {
        const extractWithLlm = await this._getExtractWithLlm();
        const result = await extractWithLlm.execute({
          content: combinedText,
          prompt: `From the following research sources, answer this task and extract structured data:\n${prompt}`,
          schema,
          provider: 'auto'
        });
        return {
          success: result.success,
          answer: result.success ? result.data : null,
          structured: true,
          search_results: searchResults,
          evidence: evidence.map(e => ({ url: e.url })),
          degraded: !result.success,
          reason: result.success ? undefined : result.error,
          steps: step,
          urls_fetched: urlsFetched
        };
      } catch {
        // Extraction threw (e.g. tool could not be loaded) — fall through to prose synthesis
      }
    }

    // Prose synthesis via SamplingClient
    let answer = null;
    let degraded = false;
    let degradedReason;

    try {
      const synthesisPrompt =
        `You are a research assistant. Based on the sources below, answer this task:\n\n` +
        `Task: ${prompt}\n\n` +
        `${truncate(combinedText, 12000)}\n\n` +
        `Provide a clear, concise answer.`;

      const { text } = await this._getSamplingClient().complete(synthesisPrompt, { maxTokens: 1024 });
      answer = text;
    } catch (err) {
      degraded = true;
      degradedReason = `LLM synthesis unavailable: ${err.message}`;
      // Return raw evidence so the host LLM can synthesize
      answer = null;
    }

    return {
      success: true,
      answer,
      search_results: searchResults,
      // Full evidence text is only returned when the caller has to synthesize itself.
      evidence: degraded ? evidence : evidence.map(e => ({ url: e.url })),
      degraded,
      reason: degradedReason,
      steps: step,
      urls_fetched: urlsFetched
    };
  }

  /** Release the delegated ResearchOrchestrator, if one was created and supports destroy(). */
  async destroy() {
    if (this._researchOrchestrator && typeof this._researchOrchestrator.destroy === 'function') {
      await this._researchOrchestrator.destroy();
    }
  }
}
299
+
300
+ export default AgentOrchestrator;
@@ -538,7 +538,13 @@ class AuthManager {
538
538
  extract_with_llm: 5,
539
539
 
540
540
  // D3.3: Pre-built site templates (1 credit per template scrape)
541
- scrape_template: 1
541
+ scrape_template: 1,
542
+
543
+ // Phase D (v4.6.0)
544
+ // scrape: base 2; projectCost() scales with format count
545
+ scrape: 2,
546
+ // agent: base 8; projectCost() scales with maxUrls
547
+ agent: 8
542
548
  };
543
549
 
544
550
  return costs[tool] || 1;
@@ -585,6 +591,20 @@ class AuthManager {
585
591
  case 'extract_with_llm':
586
592
  note = 'Includes external LLM API call cost (not billed in credits, billed by your LLM provider).';
587
593
  break;
594
+ case 'scrape': {
595
+ // Base 2 + 1 per format beyond the first
596
+ const fmtCount = Array.isArray(params?.formats) ? params.formats.length : 1;
597
+ projected = Math.max(base, base + Math.max(0, fmtCount - 1));
598
+ note = `Estimated from ${fmtCount} format(s). json format may incur external LLM cost.`;
599
+ break;
600
+ }
601
+ case 'agent': {
602
+ const agentUrls = params?.maxUrls || 10;
603
+ const isPro = params?.model === 'pro';
604
+ projected = Math.max(base, base + Math.ceil(agentUrls / 5) + (isPro ? 5 : 0));
605
+ note = `Lower-bound estimate. Scales with maxUrls (${agentUrls}).${isPro ? ' pro model adds deep-research cost.' : ''} External LLM billed separately.`;
606
+ break;
607
+ }
588
608
  default:
589
609
  note = 'Fixed cost per invocation.';
590
610
  }
@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
173
173
  */
174
174
  async compareWithBaseline(url, currentContent, options = {}) {
175
175
  const startTime = Date.now();
176
-
176
+
177
+ // Expected no-baseline case: return a clean error WITHOUT emitting an
178
+ // unhandled 'error' event (which would crash callers with no 'error' listener).
179
+ if (!this.snapshots.has(url)) {
180
+ throw new Error(`No baseline found for ${url} — run create_baseline first`);
181
+ }
182
+
177
183
  try {
178
- if (!this.snapshots.has(url)) {
179
- throw new Error(`No baseline found for URL: ${url}`);
180
- }
181
-
184
+
182
185
  const snapshots = this.snapshots.get(url);
183
186
  const baseline = snapshots[snapshots.length - 1]; // Get latest baseline
184
187
 
@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
28
28
  respectRobots: options.respectRobots !== false,
29
29
  detectAPIs: options.detectAPIs !== false,
30
30
  analyzeContent: options.analyzeContent !== false,
31
- checkSecurity: options.checkSecurity !== false,
31
+ // C1: intrusive probing is now opt-in (default false) to avoid hammering
32
+ // security-sensitive and rate-probe paths on every generation run.
33
+ checkSecurity: options.checkSecurity === true,
34
+ probeRateLimit: options.probeRateLimit === true,
32
35
  ...options
33
36
  };
34
37
 
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
70
73
  analysisOptions: { ...this.options, ...options }
71
74
  };
72
75
 
73
- // Phase 1: Site Structure Analysis
76
+ // Phase 1: Site Structure Analysis (must run first — subsequent phases
77
+ // depend on the URL list it produces)
74
78
  await this.analyzeSiteStructure(url, options);
75
79
 
76
- // Phase 2: API Detection
80
+ // Phases 2-5 run in parallel where they are independent of each other.
81
+ // detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
82
+ // fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
83
+ // analyzeRateLimiting is only executed when the caller opts in via
84
+ // probeRateLimit:true — its 5 sequential requests are intrusive.
85
+ const parallelTasks = [];
86
+
77
87
  if (this.options.detectAPIs) {
78
- await this.detectAPIEndpoints(url);
88
+ parallelTasks.push(this.detectAPIEndpoints(url));
79
89
  }
80
-
81
- // Phase 3: Content Classification
82
90
  if (this.options.analyzeContent) {
83
- await this.classifyContent();
91
+ parallelTasks.push(this.classifyContent());
84
92
  }
85
-
86
- // Phase 4: Security Analysis
87
93
  if (this.options.checkSecurity) {
88
- await this.analyzeSecurity(url);
94
+ parallelTasks.push(this.analyzeSecurity(url));
95
+ }
96
+ if (this.options.probeRateLimit) {
97
+ parallelTasks.push(this.analyzeRateLimiting(url));
89
98
  }
90
99
 
91
- // Phase 5: Rate Limiting Analysis
92
- await this.analyzeRateLimiting(url);
100
+ await Promise.all(parallelTasks);
93
101
 
94
102
  // Phase 6: Generate Guidelines
95
103
  await this.generateUsageGuidelines();
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
160
168
 
161
169
  /**
162
170
  * Detect API endpoints and data sources
171
+ * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
163
172
  */
164
173
  async detectAPIEndpoints(baseUrl) {
165
174
  logger.info('Detecting API endpoints...');
166
175
 
176
+ const PROBE_CONCURRENCY = 6;
177
+
167
178
  try {
168
- const apis = [];
169
179
  const commonPaths = [
170
180
  '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
171
181
  '/data', '/feed', '/json', '/xml', '/rss',
172
182
  '/.well-known', '/openapi', '/swagger'
173
183
  ];
174
184
 
175
- // Check common API paths
176
- for (const path of commonPaths) {
177
- const apiUrl = `${baseUrl}${path}`;
178
- try {
179
- const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
180
- if (response.ok) {
181
- const contentType = response.headers.get('content-type') || '';
182
- apis.push({
183
- url: apiUrl,
184
- type: this.determineAPIType(apiUrl, contentType),
185
- status: response.status,
186
- contentType,
187
- accessible: true
188
- });
189
- }
190
- } catch {
191
- // API endpoint not accessible or doesn't exist
185
+ // Run path probes in parallel batches
186
+ const apis = [];
187
+ for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
188
+ const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
189
+ const results = await Promise.allSettled(
190
+ batch.map(async (path) => {
191
+ const apiUrl = `${baseUrl}${path}`;
192
+ const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
193
+ if (response.ok) {
194
+ const contentType = response.headers.get('content-type') || '';
195
+ return {
196
+ url: apiUrl,
197
+ type: this.determineAPIType(apiUrl, contentType),
198
+ status: response.status,
199
+ contentType,
200
+ accessible: true
201
+ };
202
+ }
203
+ return null;
204
+ })
205
+ );
206
+ for (const r of results) {
207
+ if (r.status === 'fulfilled' && r.value) apis.push(r.value);
192
208
  }
193
209
  }
194
210
 
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
278
294
 
279
295
  /**
280
296
  * Analyze security boundaries and sensitive areas
297
+ * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
281
298
  */
282
299
  async analyzeSecurity(baseUrl) {
283
300
  logger.info('Analyzing security boundaries...');
284
301
 
285
- try {
286
- const securityAreas = [];
302
+ const PROBE_CONCURRENCY = 6;
287
303
 
304
+ try {
288
305
  // Check for common sensitive paths
289
306
  const sensitivePaths = [
290
307
  '/admin', '/administrator', '/wp-admin', '/cms',
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
294
311
  '/config', '/settings', '/env'
295
312
  ];
296
313
 
297
- for (const path of sensitivePaths) {
298
- const testUrl = `${baseUrl}${path}`;
299
- try {
300
- const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
301
- if (response.status === 200 || response.status === 302 || response.status === 401) {
302
- securityAreas.push({
303
- path,
304
- url: testUrl,
305
- status: response.status,
306
- type: this.classifySecurityArea(path),
307
- recommendation: 'restrict'
308
- });
309
- }
310
- } catch {
311
- // Area not accessible
314
+ // Run path probes in parallel batches
315
+ const securityAreas = [];
316
+ for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
317
+ const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
318
+ const results = await Promise.allSettled(
319
+ batch.map(async (path) => {
320
+ const testUrl = `${baseUrl}${path}`;
321
+ const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
322
+ if (response.status === 200 || response.status === 302 || response.status === 401) {
323
+ return {
324
+ path,
325
+ url: testUrl,
326
+ status: response.status,
327
+ type: this.classifySecurityArea(path),
328
+ recommendation: 'restrict'
329
+ };
330
+ }
331
+ return null;
332
+ })
333
+ );
334
+ for (const r of results) {
335
+ if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
312
336
  }
313
337
  }
314
338
 
@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
499
499
  }
500
500
 
501
501
  /**
502
- * Detect and handle geo-blocked content
502
+ * Detect geo-blocked content and return suggestions.
503
+ * C3: renamed from handleGeoBlocking — no bypass is actually applied here;
504
+ * the returned bypassStrategies are recommendations only.
503
505
  * @param {string} url - URL to check
504
506
  * @param {Object} response - HTTP response object
505
- * @returns {Object} - Analysis and bypass suggestions
507
+ * @returns {Object} - Detection result and bypass suggestions
506
508
  */
507
- async handleGeoBlocking(url, response) {
509
+ async detectGeoBlocking(url, response) {
508
510
  const geoBlockingIndicators = [
509
511
  /not available in your country/i,
510
512
  /access denied/i,
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
1386
1388
  }
1387
1389
 
1388
1390
  // Phone number pattern analysis
1391
+ // C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
1389
1392
  const phonePatterns = {
1390
- 'US': /\+1[\s.-]?\(?\\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
1393
+ 'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
1391
1394
  'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
1392
1395
  'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
1393
1396
  'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/
@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
519
519
  }
520
520
  }
521
521
 
522
- if (contentData && contentData.content) {
522
+ // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
523
+ const contentText = contentData && contentData.content
524
+ ? (typeof contentData.content === 'string'
525
+ ? contentData.content
526
+ : (contentData.content.text || ''))
527
+ : '';
528
+
529
+ // Only count and enhance sources that actually produced non-empty content.
530
+ // Skip failed extractions and empty {text:""} results.
531
+ if (contentData && contentData.success !== false && contentText.trim().length > 0) {
523
532
  this.metrics.contentExtracted++;
524
533
 
525
- // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
526
- const contentText = typeof contentData.content === 'string'
527
- ? contentData.content
528
- : (contentData.content.text || JSON.stringify(contentData.content));
529
-
530
534
  // Enhance source with extracted content
531
535
  let enhancedSource = {
532
536
  ...source,