crawlforge-mcp-server 4.2.12 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +19 -7
- package/README.md +11 -3
- package/package.json +3 -2
- package/server.js +195 -22
- package/src/cli/commands/init.js +107 -0
- package/src/cli/index.js +2 -0
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/AgentOrchestrator.js +300 -0
- package/src/core/AuthManager.js +21 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/agent/agent.js +71 -0
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +73 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +53 -52
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/scrape/unifiedScrape.js +314 -0
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
|
|
|
213
213
|
|
|
214
214
|
// Execute chain with potential retries
|
|
215
215
|
chainResult = await this.executeChainWithRetries(executionContext);
|
|
216
|
-
|
|
216
|
+
|
|
217
|
+
// Capture the LIVE post-action page state before the page is closed,
|
|
218
|
+
// so callers can extract final content reflecting all actions
|
|
219
|
+
// (instead of re-fetching the original URL).
|
|
220
|
+
try {
|
|
221
|
+
executionContext.finalHtml = await page.content();
|
|
222
|
+
executionContext.finalUrl = page.url();
|
|
223
|
+
} catch (captureErr) {
|
|
224
|
+
this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
|
|
225
|
+
}
|
|
226
|
+
|
|
217
227
|
this.stats.successfulChains++;
|
|
218
228
|
executionContext.success = true;
|
|
219
229
|
|
|
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
|
|
|
268
278
|
success: true,
|
|
269
279
|
chainId,
|
|
270
280
|
url,
|
|
281
|
+
finalUrl: executionContext.finalUrl || url,
|
|
282
|
+
finalHtml: executionContext.finalHtml,
|
|
271
283
|
executionTime: Date.now() - startTime,
|
|
272
284
|
results: executionContext.results,
|
|
273
285
|
screenshots: executionContext.screenshots,
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AgentOrchestrator — autonomous NL-prompt → search/navigate/extract → answer.
|
|
3
|
+
*
|
|
4
|
+
* Design: hardcoded 3-action state machine.
|
|
5
|
+
* PLAN — one SamplingClient call to decompose prompt into search queries
|
|
6
|
+
* GATHER — search_web (≤maxUrls results total)
|
|
7
|
+
* ACT — fetchAndParse + relevance gate per URL
|
|
8
|
+
* DECIDE — loop or answer (step/URL/time hard stops; never LLM-trusted)
|
|
9
|
+
* SHAPE — with a schema: structured extraction via ExtractWithLlm; otherwise (or if that throws): prose synthesis via SamplingClient
|
|
10
|
+
*
|
|
11
|
+
* Hard stops (enforced here, not by the LLM):
|
|
12
|
+
* 1. maxSteps iterations of the ACT loop
|
|
13
|
+
* 2. maxUrls total URLs fetched
|
|
14
|
+
* 3. wallClockMs wall-clock milliseconds (default 120 000)
|
|
15
|
+
*
|
|
16
|
+
* No-LLM-key path: if all LLM calls fail, return collected evidence + {degraded:true}.
|
|
17
|
+
* pro model: delegates to ResearchOrchestrator.conductResearch() for richer synthesis.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { fetchAndParse } from '../tools/extract/_fetchAndParse.js';
|
|
21
|
+
import { SamplingClient } from './SamplingClient.js';
|
|
22
|
+
|
|
23
|
+
const DEFAULT_WALL_CLOCK_MS = 120_000; // default wall-clock budget (ms) for run(); also the timeLimit given to ResearchOrchestrator
|
|
24
|
+
const DEFAULT_MAX_STEPS = 5; // default maxSteps for run(); run() additionally hard-caps the value at 10
|
|
25
|
+
const DEFAULT_MAX_URLS = 10; // default maxUrls for run(); run() additionally hard-caps the value at 20
|
|
26
|
+
|
|
27
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Naive relevance gate: does the fetched text contain any significant query term?
 * Avoids an LLM call for an obviously irrelevant page.
 *
 * A term is "significant" when it is longer than 3 characters after leading and
 * trailing punctuation is stripped (so "javascript?" matches "javascript").
 * The gate fails open: it returns true when there is nothing to compare, which
 * includes queries made only of short terms such as "AI vs ML".
 *
 * @param {string} text - Page text to inspect.
 * @param {string} query - Natural-language prompt or search query.
 * @returns {boolean} true when the page should be kept.
 */
function isRelevant(text, query) {
  if (!text || !query) return true; // fail-open

  const haystack = text.toLowerCase();
  const terms = query
    .toLowerCase()
    .split(/\s+/)
    // Strip punctuation hugging the word, but keep inner punctuation ("node.js").
    .map(term => term.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
    .filter(term => term.length > 3);

  // No term is long enough to test: keep the page rather than reject everything.
  if (terms.length === 0) return true;

  return terms.some(term => haystack.includes(term));
}
|
|
38
|
+
|
|
39
|
+
/**
 * Truncate text to a safe token budget (~8 000 chars ≈ ~2 000 tokens).
 *
 * Falsy input and text no longer than maxChars are returned unchanged.
 * Otherwise the text is cut at maxChars UTF-16 code units and a
 * "[...truncated]" marker is appended on a new line. The cut is moved back by
 * one unit when it would split a surrogate pair, so the result never ends in
 * a lone high surrogate.
 *
 * @param {string} text - Text to truncate.
 * @param {number} [maxChars=8000] - Maximum number of UTF-16 code units to keep.
 * @returns {string} The original text, or the truncated text plus marker.
 */
function truncate(text, maxChars = 8000) {
  if (!text || text.length <= maxChars) return text;

  let end = maxChars;
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    // 0xD800–0xDBFF is the high-surrogate range: its partner sits past the cut.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }

  return text.slice(0, end) + '\n[...truncated]';
}
|
|
46
|
+
|
|
47
|
+
// ── Orchestrator ──────────────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Normalize a caller-supplied numeric limit.
 *
 * Values that coerce to NaN (e.g. "abc", NaN, an object) fall back to `fallback`
 * instead of producing a NaN cap, because every `x >= NaN` comparison is false
 * and would silently disable the hard stop. Negative values are clamped to 0.
 * The result never exceeds `ceiling`.
 *
 * @param {*} value - Raw limit from the caller.
 * @param {number} fallback - Value used when `value` is not numeric.
 * @param {number} [ceiling=Infinity] - Upper bound for the result.
 * @returns {number}
 */
function sanitizeLimit(value, fallback, ceiling = Infinity) {
  const n = Number(value);
  const base = Number.isNaN(n) ? fallback : Math.max(n, 0);
  return Math.min(base, ceiling);
}

/**
 * Runs a bounded PLAN → GATHER → ACT → SHAPE pipeline for a natural-language
 * prompt. All limits (steps, URLs, wall clock) are enforced in this class.
 */
export class AgentOrchestrator {
  /**
   * @param {object} options
   * @param {object|null} options.mcpServer - McpServer instance (for SamplingClient)
   * @param {object} options.searchConfig - passed to SearchWebTool constructor
   * @param {object} options.llmConfig - passed to ExtractWithLlm constructor
   */
  constructor(options = {}) {
    this._mcpServer = options.mcpServer || null;
    this._searchConfig = options.searchConfig || {};
    this._llmConfig = options.llmConfig || {};
    // Collaborators are created lazily by the _get* accessors below.
    this._samplingClient = null;
    this._searchTool = null;
    this._extractWithLlm = null;
    this._researchOrchestrator = null;
  }

  /** Set MCP server (called by agent.js after construction). */
  setMcpServer(mcpServer) {
    this._mcpServer = mcpServer;
    this._samplingClient = null; // reset so it is rebuilt with the new server
  }

  // ── Lazy accessors ──────────────────────────────────────────────────────────

  /** @returns {SamplingClient} cached client bound to the current MCP server. */
  _getSamplingClient() {
    if (!this._samplingClient) {
      this._samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
    }
    return this._samplingClient;
  }

  /** @returns {Promise<object>} cached SearchWebTool, imported on first use. */
  async _getSearchTool() {
    if (!this._searchTool) {
      const { SearchWebTool } = await import('../tools/search/searchWeb.js');
      this._searchTool = new SearchWebTool(this._searchConfig);
    }
    return this._searchTool;
  }

  /** @returns {Promise<object>} cached ExtractWithLlm, imported on first use. */
  async _getExtractWithLlm() {
    if (!this._extractWithLlm) {
      const { ExtractWithLlm } = await import('../tools/extract/extractWithLlm.js');
      this._extractWithLlm = new ExtractWithLlm(this._llmConfig);
    }
    return this._extractWithLlm;
  }

  /** @returns {Promise<object>} cached ResearchOrchestrator, imported on first use. */
  async _getResearchOrchestrator() {
    if (!this._researchOrchestrator) {
      const { ResearchOrchestrator } = await import('./ResearchOrchestrator.js');
      this._researchOrchestrator = new ResearchOrchestrator({
        maxUrls: 50,
        timeLimit: DEFAULT_WALL_CLOCK_MS
      });
    }
    return this._researchOrchestrator;
  }

  // ── Phase helpers ───────────────────────────────────────────────────────────

  /**
   * PLAN: ask the sampling client for 1-3 search queries.
   * Falls back to the raw prompt when sampling fails or yields nothing usable.
   *
   * @param {string} prompt
   * @returns {Promise<string[]>} between one and three queries
   */
  async _planQueries(prompt) {
    try {
      const planPrompt =
        `Decompose this research task into 1-3 concise web search queries (one per line, no bullets):\n\n${prompt}`;
      const { text } = await this._getSamplingClient().complete(planPrompt, { maxTokens: 200 });
      const lines = text
        .split('\n')
        // Strip list markers only ("- ", "* ", "1. ", "2) "); a leading number
        // that is part of the query itself ("2024 election") is kept.
        .map(l => l.replace(/^\s*(?:[-*•]+\s*|\d+[.)]\s+)/, '').trim())
        .filter(Boolean);
      if (lines.length > 0) return lines.slice(0, 3);
    } catch {
      // Sampling unavailable — use raw prompt
    }
    return [prompt];
  }

  /**
   * GATHER: build the URL queue from seed URLs plus search results.
   * Search is skipped when the seeds already fill the URL budget. Search
   * failures are tolerated; the queue then contains whatever was collected.
   *
   * @param {*} seedUrls - caller-provided seeds (non-arrays are ignored)
   * @param {string[]} searchQueries
   * @param {number} capUrls
   * @param {() => boolean} deadline - returns true once the time budget is spent
   * @returns {Promise<{urlQueue: string[], searchResults: object[]}>}
   */
  async _gatherUrls(seedUrls, searchQueries, capUrls, deadline) {
    const seen = new Set();
    const urlQueue = [];
    const enqueue = (candidate) => {
      if (typeof candidate !== 'string' || !candidate || seen.has(candidate)) return;
      seen.add(candidate);
      urlQueue.push(candidate);
    };

    if (Array.isArray(seedUrls)) seedUrls.forEach(enqueue);

    const searchResults = [];
    if (urlQueue.length >= capUrls) return { urlQueue, searchResults };

    try {
      const searchTool = await this._getSearchTool();
      for (const q of searchQueries) {
        if (deadline()) break;
        try {
          const sr = await searchTool.execute({ query: q, limit: Math.ceil(capUrls / searchQueries.length) });
          const parsed = sr?.content?.[0]?.text ? JSON.parse(sr.content[0].text) : null;
          if (!Array.isArray(parsed?.results)) continue;
          for (const r of parsed.results) {
            if (!r || typeof r !== 'object') continue;
            enqueue(r.link);
            searchResults.push({ query: q, title: r.title || '', url: r.link || '', snippet: r.snippet || '' });
          }
        } catch { /* skip failed search */ }
      }
    } catch { /* search tool init failed */ }

    return { urlQueue, searchResults };
  }

  /**
   * ACT: fetch queued URLs until a hard stop (steps, URLs, or deadline) is hit.
   * Unreachable, text-less, and irrelevant pages are skipped but still count
   * toward the step and URL budgets.
   *
   * @returns {Promise<{evidence: object[], step: number, urlsFetched: number}>}
   */
  async _collectEvidence(urlQueue, prompt, capSteps, capUrls, deadline) {
    const evidence = [];
    let urlsFetched = 0;
    let step = 0;

    for (const url of urlQueue) {
      if (step >= capSteps || urlsFetched >= capUrls || deadline()) break;
      step++;
      urlsFetched++;

      try {
        const { textContent, finalUrl } = await fetchAndParse(url, { timeoutMs: 10000 });
        // A page without text contributes nothing to synthesis.
        if (typeof textContent !== 'string' || !textContent.trim()) continue;
        if (!isRelevant(textContent, prompt)) continue;
        evidence.push({
          url: finalUrl || url, // keep the requested URL if no final URL is reported
          text: truncate(textContent),
          step
        });
      } catch { /* skip unreachable URL */ }
    }

    return { evidence, step, urlsFetched };
  }

  // ── Main entry ──────────────────────────────────────────────────────────────

  /**
   * Run the agent loop.
   *
   * @param {object} params
   * @param {string}   params.prompt       - Natural-language task
   * @param {string[]} [params.urls]       - Seed URLs (skips search for those)
   * @param {object}   [params.schema]     - JSON schema for structured output
   * @param {string}   [params.model]      - 'default' | 'pro'
   * @param {number}   [params.maxSteps]   - Max ACT iterations (≤10)
   * @param {number}   [params.maxUrls]    - Max URLs to fetch (≤20)
   * @param {number}   [params.wallClockMs] - Wall-clock budget in ms
   * @returns {Promise<object>} Result object with `success`, `answer` and
   *   `degraded`; the default path also reports `search_results`, `evidence`,
   *   `steps` and `urls_fetched`.
   */
  async run(params) {
    const {
      prompt,
      urls: seedUrls = [],
      schema,
      model = 'default',
      maxSteps = DEFAULT_MAX_STEPS,
      maxUrls = DEFAULT_MAX_URLS,
      wallClockMs = DEFAULT_WALL_CLOCK_MS
    } = params;

    const startTime = Date.now();

    // Hard-cap params regardless of what caller sends. Non-numeric values fall
    // back to the defaults so a NaN can never switch a hard stop off.
    const budgetMs = sanitizeLimit(wallClockMs, DEFAULT_WALL_CLOCK_MS);
    const capSteps = sanitizeLimit(maxSteps, DEFAULT_MAX_STEPS, 10);
    const capUrls = sanitizeLimit(maxUrls, DEFAULT_MAX_URLS, 20);
    const deadline = () => (Date.now() - startTime) >= budgetMs;

    // pro model: delegate to ResearchOrchestrator
    if (model === 'pro') {
      try {
        const orchestrator = await this._getResearchOrchestrator();
        const result = await orchestrator.conductResearch(prompt, {
          maxUrls: capUrls,
          timeLimit: budgetMs,
          researchApproach: 'focused'
        });
        return { success: true, answer: result, model: 'pro', degraded: false };
      } catch (err) {
        // A pro failure is reported to the caller; the default path is NOT tried.
        return {
          success: false,
          degraded: true,
          reason: `pro research failed: ${err.message}`,
          answer: null
        };
      }
    }

    // ── PLAN ──────────────────────────────────────────────────────────────────
    const searchQueries = await this._planQueries(prompt);

    // ── GATHER (search) ───────────────────────────────────────────────────────
    const { urlQueue, searchResults } = await this._gatherUrls(seedUrls, searchQueries, capUrls, deadline);

    // ── ACT loop ──────────────────────────────────────────────────────────────
    const { evidence, step, urlsFetched } =
      await this._collectEvidence(urlQueue, prompt, capSteps, capUrls, deadline);

    // ── SHAPE ─────────────────────────────────────────────────────────────────
    const combinedText = evidence.map(e => `--- Source: ${e.url} ---\n${e.text}`).join('\n\n');

    if (!combinedText.trim()) {
      return {
        success: true,
        degraded: true,
        reason: 'No content could be fetched for the given prompt.',
        search_results: searchResults,
        evidence: [],
        answer: null,
        steps: step,
        urls_fetched: urlsFetched
      };
    }

    // Schema path: use ExtractWithLlm for structured output
    if (schema && Object.keys(schema).length > 0) {
      try {
        const extractWithLlm = await this._getExtractWithLlm();
        const result = await extractWithLlm.execute({
          content: combinedText,
          prompt: `From the following research sources, answer this task and extract structured data:\n${prompt}`,
          schema,
          provider: 'auto'
        });
        return {
          success: result.success,
          answer: result.success ? result.data : null,
          structured: true,
          search_results: searchResults,
          evidence: evidence.map(e => ({ url: e.url })),
          degraded: !result.success,
          reason: result.success ? undefined : result.error,
          steps: step,
          urls_fetched: urlsFetched
        };
      } catch {
        // Extraction threw (as opposed to reporting failure): fall through to
        // prose synthesis below.
      }
    }

    // Prose synthesis via SamplingClient
    let answer = null;
    let degraded = false;
    let degradedReason;

    try {
      const synthesisPrompt =
        `You are a research assistant. Based on the sources below, answer this task:\n\n` +
        `Task: ${prompt}\n\n` +
        `${truncate(combinedText, 12000)}\n\n` +
        `Provide a clear, concise answer.`;

      const { text } = await this._getSamplingClient().complete(synthesisPrompt, { maxTokens: 1024 });
      answer = text;
    } catch (err) {
      degraded = true;
      degradedReason = `LLM synthesis unavailable: ${err.message}`;
      // Return raw evidence so the host LLM can synthesize
      answer = null;
    }

    return {
      success: true,
      answer,
      search_results: searchResults,
      // Full evidence text is only returned when no answer could be synthesized.
      evidence: degraded ? evidence : evidence.map(e => ({ url: e.url })),
      degraded,
      reason: degradedReason,
      steps: step,
      urls_fetched: urlsFetched
    };
  }

  /** Release the ResearchOrchestrator, if one was created and exposes destroy(). */
  async destroy() {
    if (this._researchOrchestrator && typeof this._researchOrchestrator.destroy === 'function') {
      await this._researchOrchestrator.destroy();
    }
  }
}
|
|
299
|
+
|
|
300
|
+
export default AgentOrchestrator;
|
package/src/core/AuthManager.js
CHANGED
|
@@ -538,7 +538,13 @@ class AuthManager {
|
|
|
538
538
|
extract_with_llm: 5,
|
|
539
539
|
|
|
540
540
|
// D3.3: Pre-built site templates (1 credit per template scrape)
|
|
541
|
-
scrape_template: 1
|
|
541
|
+
scrape_template: 1,
|
|
542
|
+
|
|
543
|
+
// Phase D (v4.6.0)
|
|
544
|
+
// scrape: base 2; projectCost() scales with format count
|
|
545
|
+
scrape: 2,
|
|
546
|
+
// agent: base 8; projectCost() scales with maxUrls
|
|
547
|
+
agent: 8
|
|
542
548
|
};
|
|
543
549
|
|
|
544
550
|
return costs[tool] || 1;
|
|
@@ -585,6 +591,20 @@ class AuthManager {
|
|
|
585
591
|
case 'extract_with_llm':
|
|
586
592
|
note = 'Includes external LLM API call cost (not billed in credits, billed by your LLM provider).';
|
|
587
593
|
break;
|
|
594
|
+
case 'scrape': {
|
|
595
|
+
// Base 2 + 1 per format beyond the first
|
|
596
|
+
const fmtCount = Array.isArray(params?.formats) ? params.formats.length : 1;
|
|
597
|
+
projected = Math.max(base, base + Math.max(0, fmtCount - 1));
|
|
598
|
+
note = `Estimated from ${fmtCount} format(s). json format may incur external LLM cost.`;
|
|
599
|
+
break;
|
|
600
|
+
}
|
|
601
|
+
case 'agent': {
|
|
602
|
+
const agentUrls = params?.maxUrls || 10;
|
|
603
|
+
const isPro = params?.model === 'pro';
|
|
604
|
+
projected = Math.max(base, base + Math.ceil(agentUrls / 5) + (isPro ? 5 : 0));
|
|
605
|
+
note = `Lower-bound estimate. Scales with maxUrls (${agentUrls}).${isPro ? ' pro model adds deep-research cost.' : ''} External LLM billed separately.`;
|
|
606
|
+
break;
|
|
607
|
+
}
|
|
588
608
|
default:
|
|
589
609
|
note = 'Fixed cost per invocation.';
|
|
590
610
|
}
|
|
@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
|
|
|
173
173
|
*/
|
|
174
174
|
async compareWithBaseline(url, currentContent, options = {}) {
|
|
175
175
|
const startTime = Date.now();
|
|
176
|
-
|
|
176
|
+
|
|
177
|
+
// Expected no-baseline case: return a clean error WITHOUT emitting an
|
|
178
|
+
// unhandled 'error' event (which would crash callers with no 'error' listener).
|
|
179
|
+
if (!this.snapshots.has(url)) {
|
|
180
|
+
throw new Error(`No baseline found for ${url} — run create_baseline first`);
|
|
181
|
+
}
|
|
182
|
+
|
|
177
183
|
try {
|
|
178
|
-
|
|
179
|
-
throw new Error(`No baseline found for URL: ${url}`);
|
|
180
|
-
}
|
|
181
|
-
|
|
184
|
+
|
|
182
185
|
const snapshots = this.snapshots.get(url);
|
|
183
186
|
const baseline = snapshots[snapshots.length - 1]; // Get latest baseline
|
|
184
187
|
|
|
@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
|
|
|
28
28
|
respectRobots: options.respectRobots !== false,
|
|
29
29
|
detectAPIs: options.detectAPIs !== false,
|
|
30
30
|
analyzeContent: options.analyzeContent !== false,
|
|
31
|
-
|
|
31
|
+
// C1: intrusive probing is now opt-in (default false) to avoid hammering
|
|
32
|
+
// security-sensitive and rate-probe paths on every generation run.
|
|
33
|
+
checkSecurity: options.checkSecurity === true,
|
|
34
|
+
probeRateLimit: options.probeRateLimit === true,
|
|
32
35
|
...options
|
|
33
36
|
};
|
|
34
37
|
|
|
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
|
|
|
70
73
|
analysisOptions: { ...this.options, ...options }
|
|
71
74
|
};
|
|
72
75
|
|
|
73
|
-
// Phase 1: Site Structure Analysis
|
|
76
|
+
// Phase 1: Site Structure Analysis (must run first — subsequent phases
|
|
77
|
+
// depend on the URL list it produces)
|
|
74
78
|
await this.analyzeSiteStructure(url, options);
|
|
75
79
|
|
|
76
|
-
//
|
|
80
|
+
// Phases 2-5 run in parallel where they are independent of each other.
|
|
81
|
+
// detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
|
|
82
|
+
// fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
|
|
83
|
+
// analyzeRateLimiting is only executed when the caller opts in via
|
|
84
|
+
// probeRateLimit:true — its 5 sequential requests are intrusive.
|
|
85
|
+
const parallelTasks = [];
|
|
86
|
+
|
|
77
87
|
if (this.options.detectAPIs) {
|
|
78
|
-
|
|
88
|
+
parallelTasks.push(this.detectAPIEndpoints(url));
|
|
79
89
|
}
|
|
80
|
-
|
|
81
|
-
// Phase 3: Content Classification
|
|
82
90
|
if (this.options.analyzeContent) {
|
|
83
|
-
|
|
91
|
+
parallelTasks.push(this.classifyContent());
|
|
84
92
|
}
|
|
85
|
-
|
|
86
|
-
// Phase 4: Security Analysis
|
|
87
93
|
if (this.options.checkSecurity) {
|
|
88
|
-
|
|
94
|
+
parallelTasks.push(this.analyzeSecurity(url));
|
|
95
|
+
}
|
|
96
|
+
if (this.options.probeRateLimit) {
|
|
97
|
+
parallelTasks.push(this.analyzeRateLimiting(url));
|
|
89
98
|
}
|
|
90
99
|
|
|
91
|
-
|
|
92
|
-
await this.analyzeRateLimiting(url);
|
|
100
|
+
await Promise.all(parallelTasks);
|
|
93
101
|
|
|
94
102
|
// Phase 6: Generate Guidelines
|
|
95
103
|
await this.generateUsageGuidelines();
|
|
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
|
|
|
160
168
|
|
|
161
169
|
/**
|
|
162
170
|
* Detect API endpoints and data sources
|
|
171
|
+
* C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
|
|
163
172
|
*/
|
|
164
173
|
async detectAPIEndpoints(baseUrl) {
|
|
165
174
|
logger.info('Detecting API endpoints...');
|
|
166
175
|
|
|
176
|
+
const PROBE_CONCURRENCY = 6;
|
|
177
|
+
|
|
167
178
|
try {
|
|
168
|
-
const apis = [];
|
|
169
179
|
const commonPaths = [
|
|
170
180
|
'/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
|
|
171
181
|
'/data', '/feed', '/json', '/xml', '/rss',
|
|
172
182
|
'/.well-known', '/openapi', '/swagger'
|
|
173
183
|
];
|
|
174
184
|
|
|
175
|
-
//
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
185
|
+
// Run path probes in parallel batches
|
|
186
|
+
const apis = [];
|
|
187
|
+
for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
|
|
188
|
+
const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
|
|
189
|
+
const results = await Promise.allSettled(
|
|
190
|
+
batch.map(async (path) => {
|
|
191
|
+
const apiUrl = `${baseUrl}${path}`;
|
|
192
|
+
const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
|
|
193
|
+
if (response.ok) {
|
|
194
|
+
const contentType = response.headers.get('content-type') || '';
|
|
195
|
+
return {
|
|
196
|
+
url: apiUrl,
|
|
197
|
+
type: this.determineAPIType(apiUrl, contentType),
|
|
198
|
+
status: response.status,
|
|
199
|
+
contentType,
|
|
200
|
+
accessible: true
|
|
201
|
+
};
|
|
202
|
+
}
|
|
203
|
+
return null;
|
|
204
|
+
})
|
|
205
|
+
);
|
|
206
|
+
for (const r of results) {
|
|
207
|
+
if (r.status === 'fulfilled' && r.value) apis.push(r.value);
|
|
192
208
|
}
|
|
193
209
|
}
|
|
194
210
|
|
|
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
|
|
|
278
294
|
|
|
279
295
|
/**
|
|
280
296
|
* Analyze security boundaries and sensitive areas
|
|
297
|
+
* C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
|
|
281
298
|
*/
|
|
282
299
|
async analyzeSecurity(baseUrl) {
|
|
283
300
|
logger.info('Analyzing security boundaries...');
|
|
284
301
|
|
|
285
|
-
|
|
286
|
-
const securityAreas = [];
|
|
302
|
+
const PROBE_CONCURRENCY = 6;
|
|
287
303
|
|
|
304
|
+
try {
|
|
288
305
|
// Check for common sensitive paths
|
|
289
306
|
const sensitivePaths = [
|
|
290
307
|
'/admin', '/administrator', '/wp-admin', '/cms',
|
|
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
|
|
|
294
311
|
'/config', '/settings', '/env'
|
|
295
312
|
];
|
|
296
313
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
314
|
+
// Run path probes in parallel batches
|
|
315
|
+
const securityAreas = [];
|
|
316
|
+
for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
|
|
317
|
+
const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
|
|
318
|
+
const results = await Promise.allSettled(
|
|
319
|
+
batch.map(async (path) => {
|
|
320
|
+
const testUrl = `${baseUrl}${path}`;
|
|
321
|
+
const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
|
|
322
|
+
if (response.status === 200 || response.status === 302 || response.status === 401) {
|
|
323
|
+
return {
|
|
324
|
+
path,
|
|
325
|
+
url: testUrl,
|
|
326
|
+
status: response.status,
|
|
327
|
+
type: this.classifySecurityArea(path),
|
|
328
|
+
recommendation: 'restrict'
|
|
329
|
+
};
|
|
330
|
+
}
|
|
331
|
+
return null;
|
|
332
|
+
})
|
|
333
|
+
);
|
|
334
|
+
for (const r of results) {
|
|
335
|
+
if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
|
|
312
336
|
}
|
|
313
337
|
}
|
|
314
338
|
|
|
@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
|
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
/**
|
|
502
|
-
* Detect
|
|
502
|
+
* Detect geo-blocked content and return suggestions.
|
|
503
|
+
* C3: renamed from handleGeoBlocking — no bypass is actually applied here;
|
|
504
|
+
* the returned bypassStrategies are recommendations only.
|
|
503
505
|
* @param {string} url - URL to check
|
|
504
506
|
* @param {Object} response - HTTP response object
|
|
505
|
-
* @returns {Object} -
|
|
507
|
+
* @returns {Object} - Detection result and bypass suggestions
|
|
506
508
|
*/
|
|
507
|
-
async
|
|
509
|
+
async detectGeoBlocking(url, response) {
|
|
508
510
|
const geoBlockingIndicators = [
|
|
509
511
|
/not available in your country/i,
|
|
510
512
|
/access denied/i,
|
|
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
|
|
|
1386
1388
|
}
|
|
1387
1389
|
|
|
1388
1390
|
// Phone number pattern analysis
|
|
1391
|
+
// C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
|
|
1389
1392
|
const phonePatterns = {
|
|
1390
|
-
'US': /\+1[\s.-]?\(
|
|
1393
|
+
'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
|
|
1391
1394
|
'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
|
|
1392
1395
|
'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
|
|
1393
1396
|
'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/
|
|
@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
519
519
|
}
|
|
520
520
|
}
|
|
521
521
|
|
|
522
|
-
|
|
522
|
+
// Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
|
|
523
|
+
const contentText = contentData && contentData.content
|
|
524
|
+
? (typeof contentData.content === 'string'
|
|
525
|
+
? contentData.content
|
|
526
|
+
: (contentData.content.text || ''))
|
|
527
|
+
: '';
|
|
528
|
+
|
|
529
|
+
// Only count and enhance sources that actually produced non-empty content.
|
|
530
|
+
// Skip failed extractions and empty {text:""} results.
|
|
531
|
+
if (contentData && contentData.success !== false && contentText.trim().length > 0) {
|
|
523
532
|
this.metrics.contentExtracted++;
|
|
524
533
|
|
|
525
|
-
// Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
|
|
526
|
-
const contentText = typeof contentData.content === 'string'
|
|
527
|
-
? contentData.content
|
|
528
|
-
: (contentData.content.text || JSON.stringify(contentData.content));
|
|
529
|
-
|
|
530
534
|
// Enhance source with extracted content
|
|
531
535
|
let enhancedSource = {
|
|
532
536
|
...source,
|