crawlforge-mcp-server 4.6.3 → 4.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -39
- package/package.json +2 -2
- package/server.js +10 -18
- package/src/core/AgentOrchestrator.js +13 -1
- package/src/core/AuthManager.js +57 -49
- package/src/core/ResearchOrchestrator.js +71 -29
- package/src/server/withAuth.js +10 -5
- package/src/tools/research/deepResearch.js +32 -2
- package/src/tools/search/searchWeb.js +21 -12
package/README.md
CHANGED
|
@@ -67,7 +67,9 @@
|
|
|
67
67
|
npm install -g crawlforge-mcp-server
|
|
68
68
|
```
|
|
69
69
|
|
|
70
|
-
### 2. Setup Your API Key
|
|
70
|
+
### 2. Setup Your API Key (optional for the free local tools)
|
|
71
|
+
|
|
72
|
+
The 15 free local tools work immediately with **no API key at all** — skip straight to step 3 if that's all you need. To unlock the metered premium tools (`search_web`, `crawl_deep`, `stealth_mode`, `agent`, …):
|
|
71
73
|
|
|
72
74
|
```bash
|
|
73
75
|
npx crawlforge-setup
|
|
@@ -148,7 +150,9 @@ Restart Cursor to activate.
|
|
|
148
150
|
|
|
149
151
|
## 📊 Available Tools
|
|
150
152
|
|
|
151
|
-
**
|
|
153
|
+
CrawlForge is **open-core**: 15 tools run locally on your machine and are **completely free — no API key required**. The metered premium tools cover real infrastructure (search fees, proxies, browser farms) and need an API key.
|
|
154
|
+
|
|
155
|
+
**Free Local Tools** (0 credits, no API key needed)
|
|
152
156
|
|
|
153
157
|
| Tool | What it does |
|
|
154
158
|
|------|--------------|
|
|
@@ -156,43 +160,33 @@ Restart Cursor to activate.
|
|
|
156
160
|
| `extract_text` | Extract clean text from web pages |
|
|
157
161
|
| `extract_links` | Get all links from a page |
|
|
158
162
|
| `extract_metadata` | Extract page metadata (title, OG tags, schema.org) |
|
|
159
|
-
| `
|
|
160
|
-
|
|
161
|
-
**Advanced Tools** (2–3 credits)
|
|
162
|
-
|
|
163
|
-
| Tool | What it does |
|
|
164
|
-
|------|--------------|
|
|
165
|
-
| `scrape` | **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings |
|
|
163
|
+
| `scrape` | **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings. *The `screenshot` format is the one metered exception (2 credits — needs a server browser)* |
|
|
166
164
|
| `scrape_structured` | Extract structured data with CSS selectors |
|
|
167
|
-
| `
|
|
165
|
+
| `scrape_template` | Structured data from well-known sites (Amazon, GitHub, LinkedIn, YouTube, Reddit, Hacker News, npm, and more) without writing selectors |
|
|
166
|
+
| `extract_content` | Enhanced content extraction |
|
|
168
167
|
| `summarize_content` | Generate intelligent summaries |
|
|
169
168
|
| `analyze_content` | Comprehensive content analysis |
|
|
170
|
-
| `extract_structured` | LLM-powered schema-driven extraction |
|
|
169
|
+
| `extract_structured` | LLM-powered schema-driven extraction (your own LLM key or local Ollama) |
|
|
171
170
|
| `extract_with_llm` | Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" \| "anthropic"` with the matching key for cloud models |
|
|
172
|
-
| `
|
|
173
|
-
| `
|
|
171
|
+
| `process_document` | Multi-format document processing |
|
|
172
|
+
| `list_ollama_models` | List the Ollama models installed locally (helps you pick a `model` for `extract_with_llm`) |
|
|
174
173
|
| `get_batch_results` | Retrieve paginated results for a `batch_scrape` job by `batchId` |
|
|
175
174
|
|
|
176
|
-
**Premium Tools** (
|
|
177
|
-
|
|
178
|
-
| Tool | What it does |
|
|
179
|
-
|
|
180
|
-
| `
|
|
181
|
-
| `
|
|
182
|
-
| `
|
|
183
|
-
| `
|
|
184
|
-
| `
|
|
185
|
-
| `
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
|
190
|
-
|
|
191
|
-
| `process_document` | Multi-format document processing |
|
|
192
|
-
| `extract_content` | Enhanced content extraction |
|
|
193
|
-
| `scrape_with_actions` | Browser automation chains |
|
|
194
|
-
| `generate_llms_txt` | Generate AI interaction guidelines |
|
|
195
|
-
| `localization` | Multi-language and geo-location management |
|
|
175
|
+
**Metered Premium Tools** (3–10 credits, API key required)
|
|
176
|
+
|
|
177
|
+
| Tool | Credits | What it does |
|
|
178
|
+
|------|---------|--------------|
|
|
179
|
+
| `map_site` | 3 | Discover and map website structure (optional `search=` ranks the discovered URLs) |
|
|
180
|
+
| `track_changes` | 3 | Monitor content changes over time |
|
|
181
|
+
| `search_web` | 5 | Search the web using the Google Search API |
|
|
182
|
+
| `crawl_deep` | 5 | Deep crawl entire websites |
|
|
183
|
+
| `batch_scrape` | 5 | Process multiple URLs simultaneously |
|
|
184
|
+
| `scrape_with_actions` | 5 | Browser automation chains |
|
|
185
|
+
| `generate_llms_txt` | 5 | Generate AI interaction guidelines |
|
|
186
|
+
| `localization` | 5 | Multi-language and geo-location management |
|
|
187
|
+
| `agent` | 8 | **Autonomous research/extraction from a natural-language prompt — no URLs required.** Plans, gathers, and shapes an answer under hard safety stops (max steps/URLs/wall-clock enforced by the orchestrator, never the LLM) |
|
|
188
|
+
| `deep_research` | 10 | Multi-stage research with source verification |
|
|
189
|
+
| `stealth_mode` | 10 | Anti-detection browser management |
|
|
196
190
|
|
|
197
191
|
For the full canonical capabilities reference (all tools, CLI commands, stealth engines, research workflow), see [SKILL.md](SKILL.md).
|
|
198
192
|
|
|
@@ -200,15 +194,17 @@ For the full canonical capabilities reference (all tools, CLI commands, stealth
|
|
|
200
194
|
|
|
201
195
|
## 💳 Pricing
|
|
202
196
|
|
|
197
|
+
**15 local tools are free forever — no API key, no credit card.** Credits only meter the premium tools that run on CrawlForge infrastructure.
|
|
198
|
+
|
|
203
199
|
| Plan | Credits/Month | Best For |
|
|
204
200
|
|------|---------------|----------|
|
|
205
201
|
| **Free** | 1,000 | Testing & personal projects |
|
|
206
|
-
| **
|
|
207
|
-
| **Professional** | 50,000 | Professional use & production |
|
|
208
|
-
| **
|
|
202
|
+
| **Hobby** ($19) | 5,000 | Small projects & development |
|
|
203
|
+
| **Professional** ($99) | 50,000 | Professional use & production |
|
|
204
|
+
| **Business** ($399) | 250,000 | Large scale operations |
|
|
209
205
|
|
|
210
206
|
**All plans include:**
|
|
211
|
-
- Access to all 26 tools
|
|
207
|
+
- Access to all 26 tools (the 15 local tools never consume credits, apart from the metered `screenshot` format of `scrape`)
|
|
212
208
|
- Credits never expire and roll over month-to-month
|
|
213
209
|
- API access and webhook notifications
|
|
214
210
|
|
|
@@ -277,7 +273,7 @@ Once configured, use these tools in your AI assistant:
|
|
|
277
273
|
|
|
278
274
|
## 🔒 Security & Privacy
|
|
279
275
|
|
|
280
|
-
- **Secure Authentication**: API keys required for all
|
|
276
|
+
- **Secure Authentication**: API keys required for all metered premium tools (the 15 free local tools run without one)
|
|
281
277
|
- **Local Storage**: API keys stored securely at `~/.crawlforge/config.json`
|
|
282
278
|
- **HTTPS Only**: All connections use encrypted HTTPS
|
|
283
279
|
- **No Data Retention**: We don't store scraped data, only usage logs
|
|
@@ -291,7 +287,7 @@ Once configured, use these tools in your AI assistant:
|
|
|
291
287
|
- **Action allowlist**: `scrape_with_actions` accepts only 7 action types (`wait`, `click`, `type`, `press`, `scroll`, `screenshot`, `executeJavaScript`). No download, file-write, or arbitrary cross-page navigation primitives exist.
|
|
292
288
|
- **JavaScript gate**: The `executeJavaScript` action throws by default. Set `ALLOW_JAVASCRIPT_EXECUTION=true` at deploy time to enable (not recommended in production).
|
|
293
289
|
- **MCP Elicitation** (v3.6.0): Four tools request user confirmation before executing expensive operations — `deep_research` (>50 URLs), `batch_scrape` (sync mode, >25 URLs), `crawl_deep` (projected >500 pages), `extract_structured` (schema has >3 required fields with no LLM configured). Credit-low situations also elicit. Confirmation is best-effort: if the MCP client does not support elicitation the tool proceeds (fail-open).
|
|
294
|
-
- **Per-tool credit gating**: Every tool is wrapped with `withAuth()
|
|
290
|
+
- **Per-tool credit gating**: Every tool is wrapped with `withAuth()`; metered tools check and deduct credits before execution (fail-closed since v3.0.18). Free local tools (cost 0) skip the credit path entirely.
|
|
295
291
|
|
|
296
292
|
See [docs/sandboxing-and-approvals.md](docs/sandboxing-and-approvals.md) for the full reference.
|
|
297
293
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.6.
|
|
3
|
+
"version": "4.6.5",
|
|
4
4
|
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
5
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
6
6
|
"main": "server.js",
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
"test": "node tests/integration/mcp-protocol-compliance.test.js",
|
|
19
19
|
"test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
|
|
20
20
|
"test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
|
|
21
|
-
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
21
|
+
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
22
22
|
"test:tools": "node test-tools.js",
|
|
23
23
|
"test:real-world": "node test-real-world.js",
|
|
24
24
|
"test:all": "bash run-all-tests.sh",
|
package/server.js
CHANGED
|
@@ -68,23 +68,15 @@ if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) {
|
|
|
68
68
|
process.exit(1);
|
|
69
69
|
}
|
|
70
70
|
} else {
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
console.
|
|
76
|
-
console.
|
|
77
|
-
console.
|
|
78
|
-
console.
|
|
79
|
-
console.
|
|
80
|
-
console.log('');
|
|
81
|
-
console.log('Or set your API key via environment variable:');
|
|
82
|
-
console.log(' export CRAWLFORGE_API_KEY="your_api_key_here"');
|
|
83
|
-
console.log('');
|
|
84
|
-
console.log('Get your free API key at: https://www.crawlforge.dev/signup');
|
|
85
|
-
console.log('(Includes 1,000 free credits!)');
|
|
86
|
-
console.log('');
|
|
87
|
-
process.exit(0);
|
|
71
|
+
// Open-core Phase 2: no API key is fine — start in free-tier mode.
|
|
72
|
+
// Tier-0 tools (cost 0) run locally without a key; Tier-1 metered tools
|
|
73
|
+
// return a "not configured" error until a key is set.
|
|
74
|
+
// Status → stderr; stdout is reserved for the MCP JSON-RPC stream.
|
|
75
|
+
console.error('ℹ️ CrawlForge running in free-tier mode (no API key configured).');
|
|
76
|
+
console.error(' Free local tools work out of the box. Premium tools (search_web,');
|
|
77
|
+
console.error(' crawl_deep, stealth_mode, agent, deep_research, …) need an API key:');
|
|
78
|
+
// Point at the published setup binary: `npm run setup` only resolves from inside the package directory.
console.error(' get one at https://www.crawlforge.dev/signup, then run `npx crawlforge-setup`');
|
|
79
|
+
console.error(' or set CRAWLFORGE_API_KEY.');
|
|
88
80
|
}
|
|
89
81
|
}
|
|
90
82
|
|
|
@@ -98,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
98
90
|
// Create the server
|
|
99
91
|
const server = new McpServer({
|
|
100
92
|
name: "crawlforge",
|
|
101
|
-
version: "4.6.
|
|
93
|
+
version: "4.6.5",
|
|
102
94
|
description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
|
|
103
95
|
homepage: "https://www.crawlforge.dev",
|
|
104
96
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
@@ -99,7 +99,10 @@ export class AgentOrchestrator {
|
|
|
99
99
|
const { ResearchOrchestrator } = await import('./ResearchOrchestrator.js');
|
|
100
100
|
this._researchOrchestrator = new ResearchOrchestrator({
|
|
101
101
|
maxUrls: 50,
|
|
102
|
-
timeLimit: DEFAULT_WALL_CLOCK_MS
|
|
102
|
+
timeLimit: DEFAULT_WALL_CLOCK_MS,
|
|
103
|
+
// Without this the orchestrator builds a keyless SearchWebTool and
|
|
104
|
+
// every pro-model search silently fails (zero sources).
|
|
105
|
+
searchConfig: this._searchConfig
|
|
103
106
|
});
|
|
104
107
|
}
|
|
105
108
|
return this._researchOrchestrator;
|
|
@@ -147,6 +150,15 @@ export class AgentOrchestrator {
|
|
|
147
150
|
timeLimit: wallClockMs,
|
|
148
151
|
researchApproach: 'focused'
|
|
149
152
|
});
|
|
153
|
+
// conductResearch never rejects — failures come back as an error payload
|
|
154
|
+
if (result?.error) {
|
|
155
|
+
return {
|
|
156
|
+
success: false,
|
|
157
|
+
degraded: true,
|
|
158
|
+
reason: `pro research failed: ${result.error}`,
|
|
159
|
+
answer: null
|
|
160
|
+
};
|
|
161
|
+
}
|
|
150
162
|
return { success: true, answer: result, model: 'pro', degraded: false };
|
|
151
163
|
} catch (err) {
|
|
152
164
|
// Fall through to default path on pro failure
|
package/src/core/AuthManager.js
CHANGED
|
@@ -238,7 +238,12 @@ class AuthManager {
|
|
|
238
238
|
if (this.isCreatorMode()) {
|
|
239
239
|
return true;
|
|
240
240
|
}
|
|
241
|
-
|
|
241
|
+
|
|
242
|
+
// Open-core Phase 2: Tier-0 tools cost 0 and run without an API key
|
|
243
|
+
if (estimatedCredits === 0) {
|
|
244
|
+
return true;
|
|
245
|
+
}
|
|
246
|
+
|
|
242
247
|
if (!this.config) {
|
|
243
248
|
throw new Error('CrawlForge not configured. Run setup first.');
|
|
244
249
|
}
|
|
@@ -500,54 +505,56 @@ class AuthManager {
|
|
|
500
505
|
}
|
|
501
506
|
|
|
502
507
|
/**
|
|
503
|
-
* Get credit cost for a tool
|
|
508
|
+
* Get credit cost for a tool.
|
|
509
|
+
*
|
|
510
|
+
* Open-core Phase 1 (docs/tier-map.md): this table is the single source of
|
|
511
|
+
* truth shared with the backend (crawlforge-website/src/lib/credits.ts).
|
|
512
|
+
* Tier 0 tools run locally on the user's machine and cost 0; Tier 1 tools
|
|
513
|
+
* are metered per COGS.
|
|
514
|
+
*
|
|
515
|
+
* @param {string} tool
|
|
516
|
+
* @param {object} [params] — invocation params; only used for per-call
|
|
517
|
+
* exceptions (scrape's screenshot format needs a server browser).
|
|
504
518
|
*/
|
|
505
|
-
getToolCost(tool) {
|
|
519
|
+
getToolCost(tool, params) {
|
|
520
|
+
// Tier-0 exception: the screenshot format of `scrape` is browser-backed
|
|
521
|
+
if (tool === 'scrape' && Array.isArray(params?.formats) && params.formats.includes('screenshot')) {
|
|
522
|
+
return 2;
|
|
523
|
+
}
|
|
524
|
+
|
|
506
525
|
const costs = {
|
|
507
|
-
//
|
|
508
|
-
fetch_url:
|
|
509
|
-
extract_text:
|
|
510
|
-
extract_links:
|
|
511
|
-
extract_metadata:
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
summarize_content:
|
|
517
|
-
analyze_content:
|
|
518
|
-
|
|
519
|
-
|
|
526
|
+
// Tier 0 — free, local (key optional)
|
|
527
|
+
fetch_url: 0,
|
|
528
|
+
extract_text: 0,
|
|
529
|
+
extract_links: 0,
|
|
530
|
+
extract_metadata: 0,
|
|
531
|
+
scrape_structured: 0,
|
|
532
|
+
scrape_template: 0,
|
|
533
|
+
extract_content: 0,
|
|
534
|
+
scrape: 0, // 2 if formats includes 'screenshot' (handled above)
|
|
535
|
+
summarize_content: 0,
|
|
536
|
+
analyze_content: 0,
|
|
537
|
+
extract_with_llm: 0,
|
|
538
|
+
extract_structured: 0,
|
|
539
|
+
process_document: 0,
|
|
540
|
+
list_ollama_models: 0,
|
|
541
|
+
get_batch_results: 0, // retrieval of an already-paid batch job
|
|
542
|
+
|
|
543
|
+
// Tier 1 — metered (costs reflect COGS)
|
|
544
|
+
map_site: 3,
|
|
545
|
+
track_changes: 3,
|
|
546
|
+
generate_llms_txt: 5,
|
|
547
|
+
search_web: 5,
|
|
520
548
|
crawl_deep: 5,
|
|
521
|
-
map_site: 5,
|
|
522
549
|
batch_scrape: 5,
|
|
523
|
-
deep_research: 10,
|
|
524
|
-
stealth_mode: 10,
|
|
525
|
-
|
|
526
|
-
// Heavy processing (3-5 credits)
|
|
527
|
-
process_document: 3,
|
|
528
|
-
extract_content: 3,
|
|
529
550
|
scrape_with_actions: 5,
|
|
530
|
-
generate_llms_txt: 3,
|
|
531
551
|
localization: 5,
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
extract_structured: 4,
|
|
536
|
-
|
|
537
|
-
// D3.3: Pre-built site templates (1 credit — same as fetch_url)
|
|
538
|
-
extract_with_llm: 5,
|
|
539
|
-
|
|
540
|
-
// D3.3: Pre-built site templates (1 credit per template scrape)
|
|
541
|
-
scrape_template: 1,
|
|
542
|
-
|
|
543
|
-
// Phase D (v4.6.0)
|
|
544
|
-
// scrape: base 2; projectCost() scales with format count
|
|
545
|
-
scrape: 2,
|
|
546
|
-
// agent: base 8; projectCost() scales with maxUrls
|
|
547
|
-
agent: 8
|
|
552
|
+
agent: 8, // projectCost() scales with maxUrls
|
|
553
|
+
deep_research: 10,
|
|
554
|
+
stealth_mode: 10
|
|
548
555
|
};
|
|
549
556
|
|
|
550
|
-
return costs[tool]
|
|
557
|
+
return costs[tool] ?? 1;
|
|
551
558
|
}
|
|
552
559
|
|
|
553
560
|
/**
|
|
@@ -563,11 +570,11 @@ class AuthManager {
|
|
|
563
570
|
* @returns {{ projected: number, note: string }}
|
|
564
571
|
*/
|
|
565
572
|
projectCost(toolName, params) {
|
|
566
|
-
const base = this.getToolCost(toolName);
|
|
573
|
+
const base = this.getToolCost(toolName, params);
|
|
567
574
|
|
|
568
575
|
// Override for tools whose cost scales with params
|
|
569
576
|
let projected = base;
|
|
570
|
-
let note = 'Fixed cost per invocation.';
|
|
577
|
+
let note = base === 0 ? 'Free local tool — no credits charged.' : 'Fixed cost per invocation.';
|
|
571
578
|
|
|
572
579
|
switch (toolName) {
|
|
573
580
|
case 'batch_scrape': {
|
|
@@ -589,13 +596,14 @@ class AuthManager {
|
|
|
589
596
|
break;
|
|
590
597
|
}
|
|
591
598
|
case 'extract_with_llm':
|
|
592
|
-
note = '
|
|
599
|
+
note = 'Free local tool. External LLM API call billed by your LLM provider, not in credits.';
|
|
593
600
|
break;
|
|
594
601
|
case 'scrape': {
|
|
595
|
-
//
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
602
|
+
// Free local tool; only the browser-backed screenshot format is metered
|
|
603
|
+
projected = base;
|
|
604
|
+
note = base > 0
|
|
605
|
+
? 'screenshot format requires a server browser (2 credits). Other formats are free.'
|
|
606
|
+
: 'Free local tool — no credits charged. json format may incur external LLM cost.';
|
|
599
607
|
break;
|
|
600
608
|
}
|
|
601
609
|
case 'agent': {
|
|
@@ -606,7 +614,7 @@ class AuthManager {
|
|
|
606
614
|
break;
|
|
607
615
|
}
|
|
608
616
|
default:
|
|
609
|
-
note = 'Fixed cost per invocation.';
|
|
617
|
+
note = base === 0 ? 'Free local tool — no credits charged.' : 'Fixed cost per invocation.';
|
|
610
618
|
}
|
|
611
619
|
|
|
612
620
|
return { projected, note };
|
|
@@ -34,12 +34,14 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
34
34
|
enableConflictDetection = true,
|
|
35
35
|
cacheEnabled = true,
|
|
36
36
|
cacheTTL = 1800000, // 30 minutes
|
|
37
|
+
researchApproach = 'broad',
|
|
37
38
|
searchConfig = {},
|
|
38
39
|
crawlConfig = {},
|
|
39
40
|
extractConfig = {},
|
|
40
41
|
summarizeConfig = {}
|
|
41
42
|
} = options;
|
|
42
43
|
|
|
44
|
+
this.researchApproach = researchApproach;
|
|
43
45
|
this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
|
|
44
46
|
this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
|
|
45
47
|
this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
|
|
@@ -269,32 +271,50 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
269
271
|
}
|
|
270
272
|
|
|
271
273
|
/**
|
|
272
|
-
* Generate research-specific query variations
|
|
274
|
+
* Generate research-specific query variations, tuned to the research approach.
|
|
275
|
+
*
|
|
276
|
+
* Academic/scientific suffixes ("peer reviewed", "research paper", "what is")
|
|
277
|
+
* only help when the caller actually asked for an academic search. Appending
|
|
278
|
+
* them to commercial or comparative topics dragged web search toward
|
|
279
|
+
* irrelevant government/academic PDFs and long-tail noise — the cause of
|
|
280
|
+
* near-empty research runs on niche commercial topics.
|
|
273
281
|
*/
|
|
274
282
|
generateResearchVariations(topic) {
|
|
275
|
-
const
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
283
|
+
const approach = this.researchApproach || 'broad';
|
|
284
|
+
|
|
285
|
+
if (approach === 'academic') {
|
|
286
|
+
return [
|
|
287
|
+
`${topic} research`,
|
|
288
|
+
`${topic} study`,
|
|
289
|
+
`${topic} analysis`,
|
|
290
|
+
`${topic} academic`,
|
|
291
|
+
`${topic} scientific`,
|
|
292
|
+
`${topic} research paper`,
|
|
293
|
+
`${topic} peer reviewed`,
|
|
294
|
+
`${topic} explained`
|
|
295
|
+
];
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (approach === 'current_events') {
|
|
299
|
+
return [
|
|
300
|
+
`latest ${topic}`,
|
|
301
|
+
`${topic} news`,
|
|
302
|
+
`recent ${topic}`,
|
|
303
|
+
`${topic} update`,
|
|
304
|
+
`${topic} announcement`
|
|
305
|
+
];
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// broad / focused / comparative — commercial & general intent
|
|
309
|
+
return [
|
|
310
|
+
`${topic} review`,
|
|
311
|
+
`${topic} reviews`,
|
|
312
|
+
`${topic} comparison`,
|
|
313
|
+
`${topic} vs alternatives`,
|
|
314
|
+
`${topic} pricing`,
|
|
315
|
+
`best ${topic}`,
|
|
316
|
+
`${topic} company`
|
|
317
|
+
];
|
|
298
318
|
}
|
|
299
319
|
|
|
300
320
|
/**
|
|
@@ -409,18 +429,20 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
409
429
|
*/
|
|
410
430
|
async gatherInitialSources(queries, options) {
|
|
411
431
|
const allSources = [];
|
|
432
|
+
const searchErrors = [];
|
|
433
|
+
const attemptedQueries = queries.slice(0, 5);
|
|
412
434
|
const maxSourcesPerQuery = Math.ceil(this.maxUrls / queries.length);
|
|
413
|
-
|
|
435
|
+
|
|
414
436
|
await this.processWithTimeLimit(async () => {
|
|
415
|
-
const searchPromises =
|
|
437
|
+
const searchPromises = attemptedQueries.map(async (query) => {
|
|
416
438
|
try {
|
|
417
|
-
this.metrics.searchQueries++;
|
|
418
439
|
const searchResults = await this.searchTool.execute({
|
|
419
440
|
query,
|
|
420
441
|
limit: maxSourcesPerQuery,
|
|
421
442
|
enable_ranking: true,
|
|
422
443
|
enable_deduplication: true
|
|
423
444
|
});
|
|
445
|
+
this.metrics.searchQueries++;
|
|
424
446
|
|
|
425
447
|
if (searchResults.results && searchResults.results.length > 0) {
|
|
426
448
|
const processedResults = searchResults.results.map(result => ({
|
|
@@ -437,6 +459,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
437
459
|
return [];
|
|
438
460
|
} catch (error) {
|
|
439
461
|
this.logger.warn('Search failed for query', { query, error: error.message });
|
|
462
|
+
searchErrors.push({ query, error: error.message });
|
|
440
463
|
return [];
|
|
441
464
|
}
|
|
442
465
|
});
|
|
@@ -445,6 +468,14 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
445
468
|
results.forEach(sources => allSources.push(...sources));
|
|
446
469
|
});
|
|
447
470
|
|
|
471
|
+
// Fail loudly when every search threw (e.g. missing API key) instead of
|
|
472
|
+
// reporting a successful research run with zero sources.
|
|
473
|
+
if (searchErrors.length === attemptedQueries.length && searchErrors.length > 0) {
|
|
474
|
+
throw new Error(
|
|
475
|
+
`All ${searchErrors.length} search queries failed — first error: ${searchErrors[0].error}`
|
|
476
|
+
);
|
|
477
|
+
}
|
|
478
|
+
|
|
448
479
|
// Deduplicate and rank sources
|
|
449
480
|
const uniqueSources = this.deduplicateSources(allSources);
|
|
450
481
|
const rankedSources = await this.rankSourcesByResearchValue(uniqueSources);
|
|
@@ -633,8 +664,19 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
633
664
|
citationPotential: this.assessCitationPotential(source)
|
|
634
665
|
};
|
|
635
666
|
|
|
636
|
-
|
|
637
|
-
|
|
667
|
+
let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
|
|
668
|
+
|
|
669
|
+
// Down-weight topically-irrelevant sources so high-authority but
|
|
670
|
+
// off-topic pages (e.g. a .gov PDF unrelated to the query) don't
|
|
671
|
+
// dominate the results. relevanceScore is keyword-based here (no LLM):
|
|
672
|
+
// ~1 when the topic appears in the content, ~0 when it doesn't.
|
|
673
|
+
const relevance = typeof source.relevanceScore === 'number'
|
|
674
|
+
? source.relevanceScore
|
|
675
|
+
: null;
|
|
676
|
+
if (relevance !== null) {
|
|
677
|
+
overallCredibility *= (0.4 + 0.6 * relevance);
|
|
678
|
+
}
|
|
679
|
+
|
|
638
680
|
// Only include sources that meet minimum credibility threshold
|
|
639
681
|
if (overallCredibility >= 0.3) {
|
|
640
682
|
verifiedSources.push({
|
package/src/server/withAuth.js
CHANGED
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
* (OpenTelemetry spans + Prometheus counters) added in v3.2.0.
|
|
5
5
|
*
|
|
6
6
|
* Contract:
|
|
7
|
-
* - resolves toolCost once per call
|
|
7
|
+
* - resolves toolCost once per call (params-aware; 0-cost Tier-0 tools skip
|
|
8
|
+
* the credit check and usage reports entirely — open-core Phase 2)
|
|
8
9
|
* - try/finally guarantees a single `tool invocation` log line per call
|
|
9
10
|
* - log payload: { toolName, paramHash, durationMs, outcome, creditCost, creatorMode }
|
|
10
11
|
* - outcome ∈ { 'success' | 'error' | 'insufficient_credits' }
|
|
@@ -35,12 +36,16 @@ export function makeWithAuth({ authManager, logger, metrics = null }) {
|
|
|
35
36
|
const startTime = Date.now();
|
|
36
37
|
const paramHash = hashParams(params);
|
|
37
38
|
const creatorMode = authManager.isCreatorMode();
|
|
38
|
-
|
|
39
|
+
// Params-aware: scrape's screenshot format is metered, other formats free
|
|
40
|
+
const creditCost = creatorMode ? 0 : authManager.getToolCost(toolName, params);
|
|
41
|
+
// Open-core Phase 2: Tier-0 tools (cost 0) run locally for free — no
|
|
42
|
+
// credit check, no usage report, and no API key required.
|
|
43
|
+
const freeTier = creditCost === 0;
|
|
39
44
|
let outcome = 'pending';
|
|
40
45
|
let thrown = null;
|
|
41
46
|
|
|
42
47
|
try {
|
|
43
|
-
if (!creatorMode) {
|
|
48
|
+
if (!creatorMode && !freeTier) {
|
|
44
49
|
const hasCredits = await authManager.checkCredits(creditCost);
|
|
45
50
|
if (!hasCredits) {
|
|
46
51
|
outcome = 'insufficient_credits';
|
|
@@ -85,7 +90,7 @@ export function makeWithAuth({ authManager, logger, metrics = null }) {
|
|
|
85
90
|
// Cost injection must never break the request path
|
|
86
91
|
}
|
|
87
92
|
|
|
88
|
-
if (!creatorMode) {
|
|
93
|
+
if (!creatorMode && !freeTier) {
|
|
89
94
|
await authManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
|
|
90
95
|
}
|
|
91
96
|
|
|
@@ -93,7 +98,7 @@ export function makeWithAuth({ authManager, logger, metrics = null }) {
|
|
|
93
98
|
} catch (error) {
|
|
94
99
|
outcome = 'error';
|
|
95
100
|
thrown = error;
|
|
96
|
-
if (!creatorMode) {
|
|
101
|
+
if (!creatorMode && !freeTier) {
|
|
97
102
|
await authManager.reportUsage(
|
|
98
103
|
toolName,
|
|
99
104
|
Math.max(1, Math.floor(creditCost * 0.5)),
|
|
@@ -2,6 +2,7 @@ import { z } from 'zod';
|
|
|
2
2
|
// D1.4: Elicitation helper (injected from server.js or can be used standalone)
|
|
3
3
|
import { ElicitationHelper } from '../../core/ElicitationHelper.js';
|
|
4
4
|
import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
5
6
|
import { Logger } from '../../utils/Logger.js';
|
|
6
7
|
|
|
7
8
|
/**
|
|
@@ -172,6 +173,20 @@ export class DeepResearchTool {
|
|
|
172
173
|
this.buildResearchOptions(validated)
|
|
173
174
|
);
|
|
174
175
|
|
|
176
|
+
// conductResearch never rejects — orchestrator failures come back as a
|
|
177
|
+
// handleResearchError() payload. Surface them as a failed run instead
|
|
178
|
+
// of formatting them into a success-shaped result.
|
|
179
|
+
if (researchResults?.error) {
|
|
180
|
+
this.activeSessions.delete(sessionId);
|
|
181
|
+
return {
|
|
182
|
+
success: false,
|
|
183
|
+
sessionId,
|
|
184
|
+
error: researchResults.error,
|
|
185
|
+
partialResults: validated.includeRawData ? researchResults.partialResults : undefined,
|
|
186
|
+
recommendations: researchResults.recommendations
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
|
|
175
190
|
// Format results according to output preference
|
|
176
191
|
const formattedResults = this.formatResults(researchResults, validated);
|
|
177
192
|
|
|
@@ -236,7 +251,15 @@ export class DeepResearchTool {
|
|
|
236
251
|
*/
|
|
237
252
|
buildOrchestratorConfig(params) {
|
|
238
253
|
const baseConfig = { ...this.defaultOrchestratorConfig };
|
|
239
|
-
|
|
254
|
+
|
|
255
|
+
// The orchestrator constructs its own SearchWebTool, so it needs the same
|
|
256
|
+
// config (apiKey/apiBaseUrl) as the registered search_web tool — without
|
|
257
|
+
// it every internal search throws and research returns zero sources.
|
|
258
|
+
baseConfig.searchConfig = {
|
|
259
|
+
...getToolConfig('search_web'),
|
|
260
|
+
...baseConfig.searchConfig
|
|
261
|
+
};
|
|
262
|
+
|
|
240
263
|
// Add LLM configuration if provided
|
|
241
264
|
if (params.llmConfig) {
|
|
242
265
|
baseConfig.llmConfig = params.llmConfig;
|
|
@@ -248,7 +271,11 @@ export class DeepResearchTool {
|
|
|
248
271
|
const scopeConfig = {
|
|
249
272
|
maxUrls: params.maxUrls,
|
|
250
273
|
timeLimit: params.timeLimit,
|
|
251
|
-
concurrency: params.concurrency
|
|
274
|
+
concurrency: params.concurrency,
|
|
275
|
+
// The orchestrator tunes its query expansion to the approach (commercial
|
|
276
|
+
// vs academic vs current-events); without this it always used academic
|
|
277
|
+
// variations, which poisoned commercial/comparative searches.
|
|
278
|
+
researchApproach: params.researchApproach
|
|
252
279
|
};
|
|
253
280
|
|
|
254
281
|
switch (params.researchApproach) {
|
|
@@ -259,6 +286,7 @@ export class DeepResearchTool {
|
|
|
259
286
|
maxDepth: Math.min(params.maxDepth, 8),
|
|
260
287
|
enableSourceVerification: true,
|
|
261
288
|
searchConfig: {
|
|
289
|
+
...baseConfig.searchConfig,
|
|
262
290
|
enableRanking: true,
|
|
263
291
|
rankingWeights: {
|
|
264
292
|
authority: 0.4, // Higher weight for academic sources
|
|
@@ -275,6 +303,7 @@ export class DeepResearchTool {
|
|
|
275
303
|
...scopeConfig,
|
|
276
304
|
maxDepth: Math.min(params.maxDepth, 6),
|
|
277
305
|
searchConfig: {
|
|
306
|
+
...baseConfig.searchConfig,
|
|
278
307
|
enableRanking: true,
|
|
279
308
|
rankingWeights: {
|
|
280
309
|
freshness: 0.4, // Prioritize recent content
|
|
@@ -301,6 +330,7 @@ export class DeepResearchTool {
|
|
|
301
330
|
enableConflictDetection: true,
|
|
302
331
|
maxDepth: params.maxDepth,
|
|
303
332
|
searchConfig: {
|
|
333
|
+
...baseConfig.searchConfig,
|
|
304
334
|
enableDeduplication: true,
|
|
305
335
|
deduplicationThresholds: {
|
|
306
336
|
url: 0.9,
|
|
@@ -79,19 +79,23 @@ export class SearchWebTool {
|
|
|
79
79
|
// Check for Creator Mode - allows search without API key for development/testing
|
|
80
80
|
const isCreatorMode = isCreatorModeVerified();
|
|
81
81
|
|
|
82
|
+
// Open-core Phase 2: no API key is allowed at construction time (the server
|
|
83
|
+
// now starts in free-tier mode without one). The key requirement is
|
|
84
|
+
// enforced at execute() time instead, so Tier-0 tools keep working.
|
|
82
85
|
if (!apiKey && !isCreatorMode) {
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
86
|
+
this.searchAdapter = null;
|
|
87
|
+
this.isCreatorModeFallback = false;
|
|
88
|
+
} else {
|
|
89
|
+
// Create the search adapter (CrawlForge API proxy or Google Search API direct in Creator Mode)
|
|
90
|
+
try {
|
|
91
|
+
this.searchAdapter = SearchProviderFactory.createAdapter(apiKey, {
|
|
92
|
+
apiBaseUrl,
|
|
93
|
+
creatorMode: isCreatorMode
|
|
94
|
+
});
|
|
95
|
+
this.isCreatorModeFallback = !apiKey && isCreatorMode;
|
|
96
|
+
} catch (error) {
|
|
97
|
+
throw new Error(`Failed to initialize search adapter: ${error.message}`);
|
|
98
|
+
}
|
|
95
99
|
}
|
|
96
100
|
|
|
97
101
|
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
@@ -123,6 +127,11 @@ export class SearchWebTool {
|
|
|
123
127
|
}
|
|
124
128
|
// --- end SearXNG short-circuit ---
|
|
125
129
|
|
|
130
|
+
// Free-tier mode: search via the CrawlForge proxy needs an API key
|
|
131
|
+
if (!this.searchAdapter) {
|
|
132
|
+
throw new Error('CrawlForge API key is required for search functionality. Get one at https://www.crawlforge.dev/signup');
|
|
133
|
+
}
|
|
134
|
+
|
|
126
135
|
// Apply localization if specified
|
|
127
136
|
let localizedParams = validated;
|
|
128
137
|
if (validated.localization) {
|