crawlforge-mcp-server 4.2.2 → 4.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +41 -8
- package/README.md +13 -2
- package/package.json +2 -2
- package/server.js +3 -2
- package/src/cli/commands/analyze.js +26 -1
- package/src/cli/commands/localize.js +45 -10
- package/src/cli/commands/monitor.js +2 -1
- package/src/cli/commands/template.js +8 -3
- package/src/cli/index.js +17 -0
- package/src/cli/lib/runTool.js +11 -2
- package/src/core/ActionExecutor.js +2 -1
- package/src/core/AuthManager.js +3 -2
- package/src/core/PerformanceManager.js +3 -0
- package/src/core/creatorMode.js +2 -1
- package/src/tools/advanced/batchScrape/index.js +2 -1
- package/src/tools/search/adapters/searchProviderFactory.js +2 -1
- package/src/utils/Logger.js +3 -0
package/CLAUDE.md
CHANGED
|
@@ -60,9 +60,9 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
|
|
|
60
60
|
|
|
61
61
|
## Project Overview
|
|
62
62
|
|
|
63
|
-
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing
|
|
63
|
+
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 23 web scraping, crawling, and content processing tools (5 inline + 18 advanced).
|
|
64
64
|
|
|
65
|
-
**Current Version:**
|
|
65
|
+
**Current Version:** 4.2.2
|
|
66
66
|
|
|
67
67
|
## Development Commands
|
|
68
68
|
|
|
@@ -92,6 +92,12 @@ npm run dev
|
|
|
92
92
|
# Test MCP protocol compliance
|
|
93
93
|
npm test
|
|
94
94
|
|
|
95
|
+
# Unit tests (131 tests across 17 tools, no live network)
|
|
96
|
+
npm run test:unit
|
|
97
|
+
|
|
98
|
+
# Integration tests
|
|
99
|
+
npm run test:integration
|
|
100
|
+
|
|
95
101
|
# Functional tests
|
|
96
102
|
node test-tools.js # Test all tools
|
|
97
103
|
node test-real-world.js # Test real-world usage scenarios
|
|
@@ -99,6 +105,13 @@ node test-real-world.js # Test real-world usage scenarios
|
|
|
99
105
|
# MCP Protocol tests
|
|
100
106
|
node tests/integration/mcp-protocol-compliance.test.js
|
|
101
107
|
|
|
108
|
+
# CLI (v4.1.0+, requires global install or npx)
|
|
109
|
+
crawlforge --help # Show all 15 subcommands
|
|
110
|
+
crawlforge scrape https://example.com
|
|
111
|
+
crawlforge batch --urls urls.txt --format markdown
|
|
112
|
+
crawlforge install-skills --target claude-code
|
|
113
|
+
# See docs/cli-guide.md for full reference
|
|
114
|
+
|
|
102
115
|
# Docker
|
|
103
116
|
npm run docker:build # Build Docker image
|
|
104
117
|
npm run docker:dev # Run development container
|
|
@@ -124,30 +137,37 @@ npm run docker:prod # Run production container
|
|
|
124
137
|
- **WebhookDispatcher**: Event notification system for job completion callbacks
|
|
125
138
|
- **ActionExecutor**: Browser automation engine (Playwright-based)
|
|
126
139
|
- **ResearchOrchestrator**: Multi-stage research with query expansion and synthesis
|
|
127
|
-
- **StealthBrowserManager**: Stealth mode scraping with anti-detection
|
|
140
|
+
- **StealthBrowserManager**: Stealth mode scraping with anti-detection; Camoufox (Firefox) engine added in v4.0.0
|
|
128
141
|
- **LocalizationManager**: Multi-language content and localization
|
|
129
142
|
- **ChangeTracker**: Content change tracking over time
|
|
130
143
|
- **SnapshotManager**: Website snapshots and version history
|
|
144
|
+
- **ResourceRegistry**: MCP Resources (crawlforge:// URI scheme, 5 resource types) — D1.1, v3.6.0
|
|
145
|
+
- **PromptRegistry** (`src/prompts/`): 5 workflow prompts — D1.2, v3.6.0
|
|
146
|
+
- **SamplingClient**: MCP Sampling with Ollama-API fallback chain — D1.3, v3.6.0
|
|
147
|
+
- **ElicitationHelper**: MCP Elicitation for user confirmation on expensive operations — D1.4, v3.6.0
|
|
148
|
+
- **endpointGuard**: Allow-list guard for server's own backend calls — v3.0.18
|
|
131
149
|
|
|
132
150
|
### Tool Layer (`src/tools/`)
|
|
133
151
|
|
|
134
152
|
Tools are organized in subdirectories by category:
|
|
135
153
|
|
|
136
154
|
- `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
|
|
155
|
+
- `basic/` - fetchUrl, extractText, extractLinks, extractMetadata, scrapeStructured
|
|
137
156
|
- `crawl/` - crawlDeep, mapSite
|
|
138
|
-
- `extract/` - analyzeContent, extractContent, processDocument, summarizeContent
|
|
157
|
+
- `extract/` - analyzeContent, extractContent, extractStructured, extractWithLlm, listOllamaModels, processDocument, summarizeContent
|
|
139
158
|
- `research/` - deepResearch
|
|
140
159
|
- `search/` - searchWeb (proxied through CrawlForge.dev API)
|
|
160
|
+
- `templates/` - ScrapeTemplateTool (10 pre-built site templates, v4.0.0)
|
|
141
161
|
- `tracking/` - trackChanges
|
|
142
162
|
- `llmstxt/` - generateLLMsTxt
|
|
143
163
|
|
|
144
|
-
### Available MCP Tools (
|
|
164
|
+
### Available MCP Tools (23 total)
|
|
145
165
|
|
|
146
|
-
**Basic Tools (server.js inline):**
|
|
166
|
+
**Basic Tools (server.js inline, 5):**
|
|
147
167
|
fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
|
|
148
168
|
|
|
149
|
-
**Advanced Tools:**
|
|
150
|
-
search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization
|
|
169
|
+
**Advanced Tools (18):**
|
|
170
|
+
search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template
|
|
151
171
|
|
|
152
172
|
### MCP Server Entry Point
|
|
153
173
|
|
|
@@ -202,10 +222,23 @@ When adding a new tool to server.js:
|
|
|
202
222
|
5. Add to cleanup array in gracefulShutdown if it has `destroy()` or `cleanup()` methods
|
|
203
223
|
6. Update tool count in console log at server startup
|
|
204
224
|
|
|
225
|
+
## Sandboxing & Approvals
|
|
226
|
+
|
|
227
|
+
Key mechanisms for security-conscious future sessions:
|
|
228
|
+
|
|
229
|
+
- **SSRF** (`src/utils/ssrfProtection.js`): Every scraped URL validated — http/https only; blocks loopback, RFC1918, IPv6 ULA/link-local, cloud metadata endpoints; blocks dangerous ports (22, 25, 53, 445, 3306, 5432, 6379, 27017, etc.); redirects re-validated per hop, capped at 5; pre-parse path-traversal rejection. Blocklist-based — no per-deployment outbound allowlist.
|
|
230
|
+
- **endpointGuard** (`src/core/endpointGuard.js`): Hard allow-list of `{crawlforge.dev, www.crawlforge.dev, api.crawlforge.dev}` for the server's own backend calls; HTTPS required; fail-closed. Localhost only in creator mode (v3.0.18).
|
|
231
|
+
- **Action allowlist** (`src/core/ActionExecutor.js`): `scrape_with_actions` accepts only 7 action types: `wait`, `click`, `type`, `press`, `scroll`, `screenshot`, `executeJavaScript`. `executeJavaScript` throws unless `ALLOW_JAVASCRIPT_EXECUTION=true` is set at deploy time (off by default).
|
|
232
|
+
- **Elicitation** (`src/core/ElicitationHelper.js`): User confirmation requested for `deep_research` (>50 URLs), `batch_scrape` (sync, >25 URLs), `crawl_deep` (projected >500 pages), `extract_structured` (schema has >3 required fields, no LLM configured), and credit-low situations. Fail-open if client does not support elicitation.
|
|
233
|
+
- **Browser sandboxing**: Standard pool retains OS sandbox. Stealth Chromium uses `--no-sandbox` + `--disable-web-security` (deliberate fingerprint-spoofing trade-off). Camoufox (Firefox, v4.0.0) is the alternative — see `docs/stealth-engines.md`.
|
|
234
|
+
|
|
235
|
+
See `docs/sandboxing-and-approvals.md` for the full reference.
|
|
236
|
+
|
|
205
237
|
## Security
|
|
206
238
|
|
|
207
239
|
Security testing and CI/CD pipeline details are in:
|
|
208
240
|
|
|
241
|
+
- `docs/sandboxing-and-approvals.md` — Canonical sandboxing & approvals reference
|
|
209
242
|
- `docs/security-audit-report.md` — Full security audit
|
|
210
243
|
- `.github/workflows/ci.yml` — CI pipeline with security checks
|
|
211
244
|
- `.github/workflows/security.yml` — Daily scheduled security scanning
|
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
|
|
|
9
9
|
|
|
10
10
|
## 🎯 Features
|
|
11
11
|
|
|
12
|
-
- **
|
|
12
|
+
- **23 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
|
|
13
13
|
- **Free Tier**: 1,000 credits to get started instantly
|
|
14
14
|
- **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
|
|
15
15
|
- **Enterprise Ready**: Scale up with paid plans for production use
|
|
@@ -140,7 +140,7 @@ Restart Cursor to activate.
|
|
|
140
140
|
| **Enterprise** | 250,000 | Large scale operations |
|
|
141
141
|
|
|
142
142
|
**All plans include:**
|
|
143
|
-
- Access to all
|
|
143
|
+
- Access to all 23 tools
|
|
144
144
|
- Credits never expire and roll over month-to-month
|
|
145
145
|
- API access and webhook notifications
|
|
146
146
|
|
|
@@ -216,6 +216,17 @@ Once configured, use these tools in your AI assistant:
|
|
|
216
216
|
- **Rate Limiting**: Built-in protection against abuse
|
|
217
217
|
- **Compliance**: Respects robots.txt and GDPR requirements
|
|
218
218
|
|
|
219
|
+
### Security & Approvals
|
|
220
|
+
|
|
221
|
+
- **SSRF enforcement**: Every scraped URL is validated before the request is sent — http/https only; blocks loopback, RFC1918, IPv6 private/link-local ranges, cloud metadata endpoints (GCP, Azure), and dangerous ports (SSH, SMTP, DNS, MySQL, Postgres, Redis, MongoDB, etc.). Redirects are re-validated each hop, capped at 5.
|
|
222
|
+
- **Backend endpoint guard** (v3.0.18): The server's own calls to CrawlForge.dev use a separate fail-closed allow-list (`{crawlforge.dev, www.crawlforge.dev, api.crawlforge.dev}`, HTTPS required). Setting `CRAWLFORGE_API_URL` to an arbitrary host is blocked at parse time.
|
|
223
|
+
- **Action allowlist**: `scrape_with_actions` accepts only 7 action types (`wait`, `click`, `type`, `press`, `scroll`, `screenshot`, `executeJavaScript`). No download, file-write, or arbitrary cross-page navigation primitives exist.
|
|
224
|
+
- **JavaScript gate**: The `executeJavaScript` action throws by default. Set `ALLOW_JAVASCRIPT_EXECUTION=true` at deploy time to enable (not recommended in production).
|
|
225
|
+
- **MCP Elicitation** (v3.6.0): Four tools request user confirmation before executing expensive operations — `deep_research` (>50 URLs), `batch_scrape` (sync mode, >25 URLs), `crawl_deep` (projected >500 pages), `extract_structured` (schema has >3 required fields with no LLM configured). Credit-low situations also elicit. Confirmation is best-effort: if the MCP client does not support elicitation the tool proceeds (fail-open).
|
|
226
|
+
- **Per-tool credit gating**: Every tool is wrapped with `withAuth()`, which checks and deducts credits before execution. Fail-closed since v3.0.18.
|
|
227
|
+
|
|
228
|
+
See [docs/sandboxing-and-approvals.md](docs/sandboxing-and-approvals.md) for the full reference.
|
|
229
|
+
|
|
219
230
|
### Security Updates
|
|
220
231
|
|
|
221
232
|
**v3.0.3 (2025-10-01)**: Removed authentication bypass vulnerability. All users must authenticate with valid API keys.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.2.2",
|
|
3
|
+
"version": "4.2.4",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"test:tools": "node test-tools.js",
|
|
20
20
|
"test:real-world": "node test-real-world.js",
|
|
21
21
|
"test:all": "bash run-all-tests.sh",
|
|
22
|
-
"postinstall": "echo '\n
|
|
22
|
+
"postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
|
|
23
23
|
"docker:build": "docker build -t crawlforge .",
|
|
24
24
|
"docker:dev": "docker-compose up crawlforge-dev",
|
|
25
25
|
"docker:prod": "docker-compose up crawlforge-prod"
|
package/server.js
CHANGED
|
@@ -57,7 +57,8 @@ if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) {
|
|
|
57
57
|
const apiKey = process.env.CRAWLFORGE_API_KEY;
|
|
58
58
|
if (apiKey) {
|
|
59
59
|
// Auto-setup if API key is provided via environment
|
|
60
|
-
|
|
60
|
+
// Status → stderr; stdout is reserved for the MCP JSON-RPC stream.
|
|
61
|
+
console.error('🔧 Auto-configuring CrawlForge with provided API key...');
|
|
61
62
|
const success = await AuthManager.runSetup(apiKey);
|
|
62
63
|
if (!success) {
|
|
63
64
|
console.error('❌ Failed to authenticate with provided API key');
|
|
@@ -110,7 +111,7 @@ server.prompt("getting-started", {
|
|
|
110
111
|
role: "user",
|
|
111
112
|
content: {
|
|
112
113
|
type: "text",
|
|
113
|
-
text: "You have access to CrawlForge MCP with
|
|
114
|
+
text: "You have access to CrawlForge MCP with 23 web scraping tools. Key tools:\n\n" +
|
|
114
115
|
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
115
116
|
"- extract_text: Extract clean text from a webpage\n" +
|
|
116
117
|
"- extract_content: Smart content extraction with readability\n" +
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* analyze command — analyze content of a URL.
|
|
3
|
+
* Fetches and cleans the page content first (extract_content), then runs
|
|
4
|
+
* NLP analysis (analyze_content) on the extracted text.
|
|
3
5
|
*/
|
|
6
|
+
import { ExtractContentTool } from '../../tools/extract/extractContent.js';
|
|
4
7
|
import { AnalyzeContentTool } from '../../tools/extract/analyzeContent.js';
|
|
5
8
|
import { getToolConfig } from '../../constants/config.js';
|
|
6
9
|
import { runTool } from '../lib/runTool.js';
|
|
@@ -13,7 +16,29 @@ export function register(program) {
|
|
|
13
16
|
.action(async (url, opts, cmd) => {
|
|
14
17
|
const globals = cmd.parent.opts();
|
|
15
18
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
19
|
+
|
|
20
|
+
// analyze_content operates on text, so fetch & clean the page first.
|
|
21
|
+
const extractor = new ExtractContentTool(getToolConfig('extract_content'));
|
|
22
|
+
let text;
|
|
23
|
+
try {
|
|
24
|
+
const extracted = await extractor.execute({ url });
|
|
25
|
+
text = extracted?.content?.text;
|
|
26
|
+
} catch (e) {
|
|
27
|
+
process.stderr.write(`Error fetching content from ${url}: ${e.message}\n`);
|
|
28
|
+
process.exit(1);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
if (!text || text.trim().length < 10) {
|
|
32
|
+
process.stderr.write(`Error: could not extract enough text from ${url} to analyze\n`);
|
|
33
|
+
process.exit(1);
|
|
34
|
+
}
|
|
35
|
+
|
|
16
36
|
const tool = new AnalyzeContentTool(getToolConfig('analyze_content'));
|
|
17
|
-
|
|
37
|
+
// All analyses (language, topics, entities, sentiment, readability) default to true;
|
|
38
|
+
// --depth full additionally enables advanced metrics.
|
|
39
|
+
await runTool(tool, {
|
|
40
|
+
text,
|
|
41
|
+
options: { includeAdvancedMetrics: opts.depth === 'full' }
|
|
42
|
+
}, cliFlags);
|
|
18
43
|
});
|
|
19
44
|
}
|
|
@@ -1,29 +1,64 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* localize command — fetch
|
|
2
|
+
* localize command — fetch a URL with locale/geo-aware request headers.
|
|
3
|
+
* Builds a localization config (Accept-Language, User-Agent) for the target
|
|
4
|
+
* country via LocalizationManager, then fetches the URL with those headers.
|
|
3
5
|
*/
|
|
4
6
|
import { LocalizationManager } from '../../core/LocalizationManager.js';
|
|
7
|
+
import { fetchUrlHandler } from '../../tools/basic/fetchUrl.js';
|
|
5
8
|
import { getToolConfig } from '../../constants/config.js';
|
|
6
9
|
import { runTool } from '../lib/runTool.js';
|
|
7
10
|
|
|
11
|
+
/**
 * Derive a country code from a --country flag or a locale tag.
 *
 * Precedence: an explicit country always wins; otherwise the region subtag of
 * the locale is used; otherwise the result defaults to 'US'.
 *
 * @param {string} [country] - Explicit country code (e.g. "fr"); returned upper-cased.
 * @param {string} [locale] - Locale tag such as "en-US", "zh-Hans-CN" or "pt_BR".
 * @returns {string} Upper-case country code, 'US' when none can be derived.
 */
function resolveCountry(country, locale) {
  if (country) return country.toUpperCase();
  if (!locale) return 'US';

  try {
    // Intl.Locale understands script subtags, so "zh-Hans-CN" yields "CN"
    // rather than "HANS". Underscore separators are normalised to hyphens first.
    const { region } = new Intl.Locale(locale.replace(/_/g, '-'));
    return region || 'US';
  } catch {
    // Not a well-formed language tag (e.g. "en-"): fall back to the second
    // hyphen-separated segment, treating an empty segment as "no country".
    const segment = locale.split('-')[1];
    return segment ? segment.toUpperCase() : 'US';
  }
}
|
|
17
|
+
|
|
8
18
|
export function register(program) {
|
|
9
19
|
program
|
|
10
20
|
.command('localize <url>')
|
|
11
|
-
.description('Fetch URL with locale/geo-aware
|
|
21
|
+
.description('Fetch URL with locale/geo-aware request headers')
|
|
12
22
|
.option('--locale <locale>', 'Locale code (e.g. en-US, fr-FR)', 'en-US')
|
|
13
23
|
.option('--country <code>', 'Country code for geo-targeting (e.g. US, FR)')
|
|
14
24
|
.option('--currency <code>', 'Currency code (e.g. USD, EUR)')
|
|
15
25
|
.action(async (url, opts, cmd) => {
|
|
16
26
|
const globals = cmd.parent.opts();
|
|
17
27
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
18
|
-
|
|
28
|
+
|
|
29
|
+
const countryCode = resolveCountry(opts.country, opts.locale);
|
|
30
|
+
const language = opts.locale ? opts.locale.split('-')[0] : undefined;
|
|
31
|
+
|
|
19
32
|
const wrapperTool = {
|
|
20
|
-
execute: (
|
|
33
|
+
execute: async () => {
|
|
34
|
+
const mgr = new LocalizationManager(getToolConfig('localization'));
|
|
35
|
+
await mgr.initialize();
|
|
36
|
+
const config = await mgr.configureCountry(countryCode, {
|
|
37
|
+
language,
|
|
38
|
+
currency: opts.currency
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
const headers = {
|
|
42
|
+
'Accept-Language': config.acceptLanguage,
|
|
43
|
+
'User-Agent': mgr.generateUserAgent(countryCode),
|
|
44
|
+
...(config.customHeaders || {})
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
const fetched = await fetchUrlHandler({ url, headers });
|
|
48
|
+
return {
|
|
49
|
+
localization: {
|
|
50
|
+
countryCode: config.countryCode,
|
|
51
|
+
language: config.language,
|
|
52
|
+
timezone: config.timezone,
|
|
53
|
+
currency: config.currency,
|
|
54
|
+
acceptLanguage: config.acceptLanguage
|
|
55
|
+
},
|
|
56
|
+
request_headers: headers,
|
|
57
|
+
response: fetched
|
|
58
|
+
};
|
|
59
|
+
}
|
|
21
60
|
};
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
locale: opts.locale,
|
|
25
|
-
country: opts.country,
|
|
26
|
-
currency: opts.currency
|
|
27
|
-
}, cliFlags);
|
|
61
|
+
|
|
62
|
+
await runTool(wrapperTool, {}, cliFlags);
|
|
28
63
|
});
|
|
29
64
|
}
|
|
@@ -17,6 +17,7 @@ export function register(program) {
|
|
|
17
17
|
const globals = cmd.parent.opts();
|
|
18
18
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
19
19
|
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
20
|
+
// monitor runs continuously — do not auto-exit after the first result.
|
|
20
21
|
await runTool(tool, {
|
|
21
22
|
url,
|
|
22
23
|
scheduled: true,
|
|
@@ -24,6 +25,6 @@ export function register(program) {
|
|
|
24
25
|
selector: opts.selector,
|
|
25
26
|
webhook_url: opts.webhook,
|
|
26
27
|
change_threshold: parseFloat(opts.threshold)
|
|
27
|
-
}, cliFlags);
|
|
28
|
+
}, cliFlags, { exitOnSuccess: false });
|
|
28
29
|
});
|
|
29
30
|
}
|
|
@@ -7,7 +7,7 @@ import { runTool } from '../lib/runTool.js';
|
|
|
7
7
|
|
|
8
8
|
export function register(program) {
|
|
9
9
|
program
|
|
10
|
-
.command('template
|
|
10
|
+
.command('template [id] [target]')
|
|
11
11
|
.description('Scrape using a pre-built site template (e.g. amazon-product, github-repo)')
|
|
12
12
|
.option('--list', 'List all available templates')
|
|
13
13
|
.action(async (id, target, opts, cmd) => {
|
|
@@ -16,11 +16,16 @@ export function register(program) {
|
|
|
16
16
|
const tool = new ScrapeTemplateTool(getToolConfig('scrape_template'));
|
|
17
17
|
|
|
18
18
|
if (opts.list) {
|
|
19
|
-
const wrapperTool = { execute: () => tool.
|
|
19
|
+
const wrapperTool = { execute: () => tool.execute({ template: 'list' }) };
|
|
20
20
|
await runTool(wrapperTool, {}, cliFlags);
|
|
21
21
|
return;
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
if (!id || !target) {
|
|
25
|
+
process.stderr.write('Error: template requires <id> and <target>, or use --list\n');
|
|
26
|
+
process.exit(1);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
await runTool(tool, { template: id, url: target }, cliFlags);
|
|
25
30
|
});
|
|
26
31
|
}
|
package/src/cli/index.js
CHANGED
|
@@ -58,11 +58,28 @@ program
|
|
|
58
58
|
.option('--api-key <key>', 'CrawlForge API key (overrides CRAWLFORGE_API_KEY env var)')
|
|
59
59
|
.option('--timeout <ms>', 'Global request timeout in milliseconds', '30000');
|
|
60
60
|
|
|
61
|
+
/**
 * Read the API key stored in ~/.crawlforge/config.json (the file written by
 * `crawlforge-setup`).
 *
 * This is the lowest-priority source when resolving the API key: the
 * --api-key flag comes first, then the CRAWLFORGE_API_KEY environment
 * variable, then this stored config.
 *
 * The lookup is best-effort: a missing home directory, a missing or unreadable
 * file, invalid JSON, or an unusable `apiKey` value all yield `undefined`
 * instead of throwing.
 *
 * @returns {string|undefined} The stored key, or undefined when none is usable.
 */
function loadStoredApiKey() {
  const home = process.env.HOME || process.env.USERPROFILE;
  if (!home) return undefined;

  let cfg;
  try {
    cfg = JSON.parse(readFileSync(join(home, '.crawlforge', 'config.json'), 'utf8'));
  } catch {
    // No stored config (or it is unreadable/corrupt): treat as "no key".
    return undefined;
  }

  const apiKey = cfg?.apiKey;
  // Only a non-blank string is a usable key. Anything else (object, number,
  // whitespace) would be coerced to meaningless text once copied into
  // process.env by the caller.
  if (typeof apiKey !== 'string' || apiKey.trim() === '') return undefined;
  return apiKey;
}
|
|
74
|
+
|
|
61
75
|
// Apply --api-key globally before commands run
|
|
62
76
|
program.hook('preAction', (thisCommand) => {
|
|
63
77
|
const opts = program.opts();
|
|
64
78
|
if (opts.apiKey) {
|
|
65
79
|
process.env.CRAWLFORGE_API_KEY = opts.apiKey;
|
|
80
|
+
} else if (!process.env.CRAWLFORGE_API_KEY) {
|
|
81
|
+
const stored = loadStoredApiKey();
|
|
82
|
+
if (stored) process.env.CRAWLFORGE_API_KEY = stored;
|
|
66
83
|
}
|
|
67
84
|
if (opts.timeout) {
|
|
68
85
|
process.env.CRAWLFORGE_CLI_TIMEOUT = opts.timeout;
|
package/src/cli/lib/runTool.js
CHANGED
|
@@ -16,9 +16,13 @@ import { formatResult, formatError } from '../formatter.js';
|
|
|
16
16
|
* @param {object} cliFlags — { json, pretty, quiet }
|
|
17
17
|
* @param {object} [options]
|
|
18
18
|
* @param {boolean} [options.exitOnError=true]
|
|
19
|
+
* @param {boolean} [options.exitOnSuccess=true] Exit the process after writing
|
|
20
|
+
* output. One-shot CLI commands need this because background timers
|
|
21
|
+
* (metrics, cache/connection cleanup, etc.) otherwise keep the event loop
|
|
22
|
+
* alive. Long-running commands (e.g. `monitor`) pass false.
|
|
19
23
|
*/
|
|
20
24
|
export async function runTool(tool, params, cliFlags, options = {}) {
|
|
21
|
-
const { exitOnError = true } = options;
|
|
25
|
+
const { exitOnError = true, exitOnSuccess = true } = options;
|
|
22
26
|
|
|
23
27
|
try {
|
|
24
28
|
const result = await tool.execute(params);
|
|
@@ -32,7 +36,12 @@ export async function runTool(tool, params, cliFlags, options = {}) {
|
|
|
32
36
|
}
|
|
33
37
|
|
|
34
38
|
const output = formatResult(result, cliFlags);
|
|
35
|
-
if (output)
|
|
39
|
+
if (output) {
|
|
40
|
+
// Wait for stdout to flush (pipes/files buffer) before exiting.
|
|
41
|
+
process.stdout.write(output + '\n', () => { if (exitOnSuccess) process.exit(0); });
|
|
42
|
+
} else if (exitOnSuccess) {
|
|
43
|
+
process.exit(0);
|
|
44
|
+
}
|
|
36
45
|
} catch (error) {
|
|
37
46
|
process.stderr.write(formatError(error, cliFlags) + '\n');
|
|
38
47
|
if (exitOnError) process.exit(1);
|
|
@@ -926,7 +926,8 @@ export class ActionExecutor extends EventEmitter {
|
|
|
926
926
|
*/
|
|
927
927
|
log(level, message) {
|
|
928
928
|
if (this.enableLogging) {
|
|
929
|
-
|
|
929
|
+
// → stderr so stdout stays clean for MCP JSON-RPC / CLI --json output.
|
|
930
|
+
console.error('[ActionExecutor:' + level.toUpperCase() + '] ' + message);
|
|
930
931
|
}
|
|
931
932
|
}
|
|
932
933
|
|
package/src/core/AuthManager.js
CHANGED
|
@@ -69,7 +69,8 @@ class AuthManager {
|
|
|
69
69
|
|
|
70
70
|
// Skip config loading in creator mode
|
|
71
71
|
if (this.isCreatorMode()) {
|
|
72
|
-
|
|
72
|
+
// Status → stderr; stdout is reserved for MCP JSON-RPC / CLI --json output.
|
|
73
|
+
console.error('🚀 Creator Mode Active - Unlimited Access Enabled');
|
|
73
74
|
this.initialized = true;
|
|
74
75
|
return;
|
|
75
76
|
}
|
|
@@ -78,7 +79,7 @@ class AuthManager {
|
|
|
78
79
|
await this.loadConfig();
|
|
79
80
|
this.initialized = true;
|
|
80
81
|
} catch (error) {
|
|
81
|
-
console.
|
|
82
|
+
console.error('No existing CrawlForge configuration found. Run setup to configure.');
|
|
82
83
|
this.initialized = true;
|
|
83
84
|
}
|
|
84
85
|
|
|
@@ -771,6 +771,9 @@ export class PerformanceManager extends EventEmitter {
|
|
|
771
771
|
this.metricsTimer = setInterval(() => {
|
|
772
772
|
this.collectMetrics();
|
|
773
773
|
}, this.metricsInterval);
|
|
774
|
+
// Don't let the metrics interval keep a short-lived process (e.g. a one-shot
|
|
775
|
+
// CLI command) alive. The long-running server stays up via its stdio transport.
|
|
776
|
+
if (typeof this.metricsTimer.unref === 'function') this.metricsTimer.unref();
|
|
774
777
|
}
|
|
775
778
|
|
|
776
779
|
/**
|
package/src/core/creatorMode.js
CHANGED
|
@@ -29,7 +29,8 @@ if (process.env.CRAWLFORGE_CREATOR_SECRET) {
|
|
|
29
29
|
|
|
30
30
|
if (crypto.timingSafeEqual(Buffer.from(providedHash, 'hex'), Buffer.from(CREATOR_SECRET_HASH, 'hex'))) {
|
|
31
31
|
_creatorModeVerified = true;
|
|
32
|
-
|
|
32
|
+
// Status message → stderr so stdout stays clean (MCP JSON-RPC / CLI --json output).
|
|
33
|
+
console.error('Creator Mode Enabled - Unlimited Access');
|
|
33
34
|
} else {
|
|
34
35
|
console.warn('Invalid creator secret provided');
|
|
35
36
|
}
|
|
@@ -301,7 +301,8 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
301
301
|
}
|
|
302
302
|
|
|
303
303
|
_log(level, message) {
|
|
304
|
-
|
|
304
|
+
// → stderr so stdout stays clean for MCP JSON-RPC / CLI --json output.
|
|
305
|
+
if (this.enableLogging) console.error(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
|
|
305
306
|
}
|
|
306
307
|
|
|
307
308
|
_initializeJobExecutors() {
|
|
@@ -36,7 +36,8 @@ export class SearchProviderFactory {
|
|
|
36
36
|
);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
// Status message → stderr so stdout stays clean (MCP JSON-RPC / CLI --json).
|
|
40
|
+
console.error('🔍 Creator Mode: Using Google Search API directly');
|
|
40
41
|
return new GoogleSearchAdapter(googleApiKey, googleSearchEngineId);
|
|
41
42
|
}
|
|
42
43
|
|
package/src/utils/Logger.js
CHANGED
|
@@ -116,6 +116,9 @@ export class Logger {
|
|
|
116
116
|
|
|
117
117
|
if (enableConsole) {
|
|
118
118
|
transports.push(new winston.transports.Console({
|
|
119
|
+
// Route ALL log levels to stderr so stdout stays reserved for structured
|
|
120
|
+
// output (MCP JSON-RPC protocol and CLI --json results).
|
|
121
|
+
stderrLevels: ['error', 'warn', 'info', 'http', 'verbose', 'debug', 'silly'],
|
|
119
122
|
format: winston.format.combine(
|
|
120
123
|
winston.format.colorize(),
|
|
121
124
|
winston.format.simple()
|