crawlforge-mcp-server 4.2.3 → 4.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server.js +2 -1
- package/src/cli/commands/analyze.js +26 -1
- package/src/cli/commands/localize.js +45 -10
- package/src/cli/commands/monitor.js +2 -1
- package/src/cli/commands/template.js +8 -3
- package/src/cli/index.js +17 -0
- package/src/cli/lib/runTool.js +11 -2
- package/src/core/ActionExecutor.js +2 -1
- package/src/core/AuthManager.js +3 -2
- package/src/core/PerformanceManager.js +3 -0
- package/src/core/creatorMode.js +2 -1
- package/src/tools/advanced/batchScrape/index.js +2 -1
- package/src/tools/search/adapters/searchProviderFactory.js +2 -1
- package/src/utils/Logger.js +3 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.2.3",
|
|
3
|
+
"version": "4.2.4",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
package/server.js
CHANGED
|
@@ -57,7 +57,8 @@ if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) {
|
|
|
57
57
|
const apiKey = process.env.CRAWLFORGE_API_KEY;
|
|
58
58
|
if (apiKey) {
|
|
59
59
|
// Auto-setup if API key is provided via environment
|
|
60
|
-
|
|
60
|
+
// Status → stderr; stdout is reserved for the MCP JSON-RPC stream.
|
|
61
|
+
console.error('🔧 Auto-configuring CrawlForge with provided API key...');
|
|
61
62
|
const success = await AuthManager.runSetup(apiKey);
|
|
62
63
|
if (!success) {
|
|
63
64
|
console.error('❌ Failed to authenticate with provided API key');
|
package/src/cli/commands/analyze.js
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* analyze command — analyze content of a URL.
|
|
3
|
+
* Fetches and cleans the page content first (extract_content), then runs
|
|
4
|
+
* NLP analysis (analyze_content) on the extracted text.
|
|
3
5
|
*/
|
|
6
|
+
import { ExtractContentTool } from '../../tools/extract/extractContent.js';
|
|
4
7
|
import { AnalyzeContentTool } from '../../tools/extract/analyzeContent.js';
|
|
5
8
|
import { getToolConfig } from '../../constants/config.js';
|
|
6
9
|
import { runTool } from '../lib/runTool.js';
|
|
@@ -13,7 +16,29 @@ export function register(program) {
|
|
|
13
16
|
.action(async (url, opts, cmd) => {
|
|
14
17
|
const globals = cmd.parent.opts();
|
|
15
18
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
19
|
+
|
|
20
|
+
// analyze_content operates on text, so fetch & clean the page first.
|
|
21
|
+
const extractor = new ExtractContentTool(getToolConfig('extract_content'));
|
|
22
|
+
let text;
|
|
23
|
+
try {
|
|
24
|
+
const extracted = await extractor.execute({ url });
|
|
25
|
+
text = extracted?.content?.text;
|
|
26
|
+
} catch (e) {
|
|
27
|
+
process.stderr.write(`Error fetching content from ${url}: ${e.message}\n`);
|
|
28
|
+
process.exit(1);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
if (!text || text.trim().length < 10) {
|
|
32
|
+
process.stderr.write(`Error: could not extract enough text from ${url} to analyze\n`);
|
|
33
|
+
process.exit(1);
|
|
34
|
+
}
|
|
35
|
+
|
|
16
36
|
const tool = new AnalyzeContentTool(getToolConfig('analyze_content'));
|
|
17
|
-
|
|
37
|
+
// All analyses (language, topics, entities, sentiment, readability) default to true;
|
|
38
|
+
// --depth full additionally enables advanced metrics.
|
|
39
|
+
await runTool(tool, {
|
|
40
|
+
text,
|
|
41
|
+
options: { includeAdvancedMetrics: opts.depth === 'full' }
|
|
42
|
+
}, cliFlags);
|
|
18
43
|
});
|
|
19
44
|
}
|
package/src/cli/commands/localize.js
CHANGED
|
@@ -1,29 +1,64 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* localize command — fetch
|
|
2
|
+
* localize command — fetch a URL with locale/geo-aware request headers.
|
|
3
|
+
* Builds a localization config (Accept-Language, User-Agent) for the target
|
|
4
|
+
* country via LocalizationManager, then fetches the URL with those headers.
|
|
3
5
|
*/
|
|
4
6
|
import { LocalizationManager } from '../../core/LocalizationManager.js';
|
|
7
|
+
import { fetchUrlHandler } from '../../tools/basic/fetchUrl.js';
|
|
5
8
|
import { getToolConfig } from '../../constants/config.js';
|
|
6
9
|
import { runTool } from '../lib/runTool.js';
|
|
7
10
|
|
|
11
|
+
// Derive a 2-letter country code from a --country flag or an en-US style locale.
|
|
12
|
+
function resolveCountry(country, locale) {
|
|
13
|
+
if (country) return country.toUpperCase();
|
|
14
|
+
if (locale && locale.includes('-')) return locale.split('-')[1].toUpperCase();
|
|
15
|
+
return 'US';
|
|
16
|
+
}
|
|
17
|
+
|
|
8
18
|
export function register(program) {
|
|
9
19
|
program
|
|
10
20
|
.command('localize <url>')
|
|
11
|
-
.description('Fetch URL with locale/geo-aware
|
|
21
|
+
.description('Fetch URL with locale/geo-aware request headers')
|
|
12
22
|
.option('--locale <locale>', 'Locale code (e.g. en-US, fr-FR)', 'en-US')
|
|
13
23
|
.option('--country <code>', 'Country code for geo-targeting (e.g. US, FR)')
|
|
14
24
|
.option('--currency <code>', 'Currency code (e.g. USD, EUR)')
|
|
15
25
|
.action(async (url, opts, cmd) => {
|
|
16
26
|
const globals = cmd.parent.opts();
|
|
17
27
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
18
|
-
|
|
28
|
+
|
|
29
|
+
const countryCode = resolveCountry(opts.country, opts.locale);
|
|
30
|
+
const language = opts.locale ? opts.locale.split('-')[0] : undefined;
|
|
31
|
+
|
|
19
32
|
const wrapperTool = {
|
|
20
|
-
execute: (
|
|
33
|
+
execute: async () => {
|
|
34
|
+
const mgr = new LocalizationManager(getToolConfig('localization'));
|
|
35
|
+
await mgr.initialize();
|
|
36
|
+
const config = await mgr.configureCountry(countryCode, {
|
|
37
|
+
language,
|
|
38
|
+
currency: opts.currency
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
const headers = {
|
|
42
|
+
'Accept-Language': config.acceptLanguage,
|
|
43
|
+
'User-Agent': mgr.generateUserAgent(countryCode),
|
|
44
|
+
...(config.customHeaders || {})
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
const fetched = await fetchUrlHandler({ url, headers });
|
|
48
|
+
return {
|
|
49
|
+
localization: {
|
|
50
|
+
countryCode: config.countryCode,
|
|
51
|
+
language: config.language,
|
|
52
|
+
timezone: config.timezone,
|
|
53
|
+
currency: config.currency,
|
|
54
|
+
acceptLanguage: config.acceptLanguage
|
|
55
|
+
},
|
|
56
|
+
request_headers: headers,
|
|
57
|
+
response: fetched
|
|
58
|
+
};
|
|
59
|
+
}
|
|
21
60
|
};
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
locale: opts.locale,
|
|
25
|
-
country: opts.country,
|
|
26
|
-
currency: opts.currency
|
|
27
|
-
}, cliFlags);
|
|
61
|
+
|
|
62
|
+
await runTool(wrapperTool, {}, cliFlags);
|
|
28
63
|
});
|
|
29
64
|
}
|
package/src/cli/commands/monitor.js
CHANGED
|
@@ -17,6 +17,7 @@ export function register(program) {
|
|
|
17
17
|
const globals = cmd.parent.opts();
|
|
18
18
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
19
19
|
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
20
|
+
// monitor runs continuously — do not auto-exit after the first result.
|
|
20
21
|
await runTool(tool, {
|
|
21
22
|
url,
|
|
22
23
|
scheduled: true,
|
|
@@ -24,6 +25,6 @@ export function register(program) {
|
|
|
24
25
|
selector: opts.selector,
|
|
25
26
|
webhook_url: opts.webhook,
|
|
26
27
|
change_threshold: parseFloat(opts.threshold)
|
|
27
|
-
}, cliFlags);
|
|
28
|
+
}, cliFlags, { exitOnSuccess: false });
|
|
28
29
|
});
|
|
29
30
|
}
|
package/src/cli/commands/template.js
CHANGED
|
@@ -7,7 +7,7 @@ import { runTool } from '../lib/runTool.js';
|
|
|
7
7
|
|
|
8
8
|
export function register(program) {
|
|
9
9
|
program
|
|
10
|
-
.command('template
|
|
10
|
+
.command('template [id] [target]')
|
|
11
11
|
.description('Scrape using a pre-built site template (e.g. amazon-product, github-repo)')
|
|
12
12
|
.option('--list', 'List all available templates')
|
|
13
13
|
.action(async (id, target, opts, cmd) => {
|
|
@@ -16,11 +16,16 @@ export function register(program) {
|
|
|
16
16
|
const tool = new ScrapeTemplateTool(getToolConfig('scrape_template'));
|
|
17
17
|
|
|
18
18
|
if (opts.list) {
|
|
19
|
-
const wrapperTool = { execute: () => tool.
|
|
19
|
+
const wrapperTool = { execute: () => tool.execute({ template: 'list' }) };
|
|
20
20
|
await runTool(wrapperTool, {}, cliFlags);
|
|
21
21
|
return;
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
if (!id || !target) {
|
|
25
|
+
process.stderr.write('Error: template requires <id> and <target>, or use --list\n');
|
|
26
|
+
process.exit(1);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
await runTool(tool, { template: id, url: target }, cliFlags);
|
|
25
30
|
});
|
|
26
31
|
}
|
package/src/cli/index.js
CHANGED
|
@@ -58,11 +58,28 @@ program
|
|
|
58
58
|
.option('--api-key <key>', 'CrawlForge API key (overrides CRAWLFORGE_API_KEY env var)')
|
|
59
59
|
.option('--timeout <ms>', 'Global request timeout in milliseconds', '30000');
|
|
60
60
|
|
|
61
|
+
// Resolve the API key from (in priority order): --api-key flag, CRAWLFORGE_API_KEY env,
|
|
62
|
+
// then the stored ~/.crawlforge/config.json written by `crawlforge-setup`.
|
|
63
|
+
function loadStoredApiKey() {
|
|
64
|
+
try {
|
|
65
|
+
const home = process.env.HOME || process.env.USERPROFILE;
|
|
66
|
+
if (!home) return undefined;
|
|
67
|
+
const cfgPath = join(home, '.crawlforge', 'config.json');
|
|
68
|
+
const cfg = JSON.parse(readFileSync(cfgPath, 'utf8'));
|
|
69
|
+
return cfg.apiKey || undefined;
|
|
70
|
+
} catch {
|
|
71
|
+
return undefined;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
61
75
|
// Apply --api-key globally before commands run
|
|
62
76
|
program.hook('preAction', (thisCommand) => {
|
|
63
77
|
const opts = program.opts();
|
|
64
78
|
if (opts.apiKey) {
|
|
65
79
|
process.env.CRAWLFORGE_API_KEY = opts.apiKey;
|
|
80
|
+
} else if (!process.env.CRAWLFORGE_API_KEY) {
|
|
81
|
+
const stored = loadStoredApiKey();
|
|
82
|
+
if (stored) process.env.CRAWLFORGE_API_KEY = stored;
|
|
66
83
|
}
|
|
67
84
|
if (opts.timeout) {
|
|
68
85
|
process.env.CRAWLFORGE_CLI_TIMEOUT = opts.timeout;
|
package/src/cli/lib/runTool.js
CHANGED
|
@@ -16,9 +16,13 @@ import { formatResult, formatError } from '../formatter.js';
|
|
|
16
16
|
* @param {object} cliFlags — { json, pretty, quiet }
|
|
17
17
|
* @param {object} [options]
|
|
18
18
|
* @param {boolean} [options.exitOnError=true]
|
|
19
|
+
* @param {boolean} [options.exitOnSuccess=true] Exit the process after writing
|
|
20
|
+
* output. One-shot CLI commands need this because background timers
|
|
21
|
+
* (metrics, cache/connection cleanup, etc.) otherwise keep the event loop
|
|
22
|
+
* alive. Long-running commands (e.g. `monitor`) pass false.
|
|
19
23
|
*/
|
|
20
24
|
export async function runTool(tool, params, cliFlags, options = {}) {
|
|
21
|
-
const { exitOnError = true } = options;
|
|
25
|
+
const { exitOnError = true, exitOnSuccess = true } = options;
|
|
22
26
|
|
|
23
27
|
try {
|
|
24
28
|
const result = await tool.execute(params);
|
|
@@ -32,7 +36,12 @@ export async function runTool(tool, params, cliFlags, options = {}) {
|
|
|
32
36
|
}
|
|
33
37
|
|
|
34
38
|
const output = formatResult(result, cliFlags);
|
|
35
|
-
if (output)
|
|
39
|
+
if (output) {
|
|
40
|
+
// Wait for stdout to flush (pipes/files buffer) before exiting.
|
|
41
|
+
process.stdout.write(output + '\n', () => { if (exitOnSuccess) process.exit(0); });
|
|
42
|
+
} else if (exitOnSuccess) {
|
|
43
|
+
process.exit(0);
|
|
44
|
+
}
|
|
36
45
|
} catch (error) {
|
|
37
46
|
process.stderr.write(formatError(error, cliFlags) + '\n');
|
|
38
47
|
if (exitOnError) process.exit(1);
|
package/src/core/ActionExecutor.js
CHANGED
|
@@ -926,7 +926,8 @@ export class ActionExecutor extends EventEmitter {
|
|
|
926
926
|
*/
|
|
927
927
|
log(level, message) {
|
|
928
928
|
if (this.enableLogging) {
|
|
929
|
-
|
|
929
|
+
// → stderr so stdout stays clean for MCP JSON-RPC / CLI --json output.
|
|
930
|
+
console.error('[ActionExecutor:' + level.toUpperCase() + '] ' + message);
|
|
930
931
|
}
|
|
931
932
|
}
|
|
932
933
|
|
package/src/core/AuthManager.js
CHANGED
|
@@ -69,7 +69,8 @@ class AuthManager {
|
|
|
69
69
|
|
|
70
70
|
// Skip config loading in creator mode
|
|
71
71
|
if (this.isCreatorMode()) {
|
|
72
|
-
|
|
72
|
+
// Status → stderr; stdout is reserved for MCP JSON-RPC / CLI --json output.
|
|
73
|
+
console.error('🚀 Creator Mode Active - Unlimited Access Enabled');
|
|
73
74
|
this.initialized = true;
|
|
74
75
|
return;
|
|
75
76
|
}
|
|
@@ -78,7 +79,7 @@ class AuthManager {
|
|
|
78
79
|
await this.loadConfig();
|
|
79
80
|
this.initialized = true;
|
|
80
81
|
} catch (error) {
|
|
81
|
-
console.
|
|
82
|
+
console.error('No existing CrawlForge configuration found. Run setup to configure.');
|
|
82
83
|
this.initialized = true;
|
|
83
84
|
}
|
|
84
85
|
|
package/src/core/PerformanceManager.js
CHANGED
|
@@ -771,6 +771,9 @@ export class PerformanceManager extends EventEmitter {
|
|
|
771
771
|
this.metricsTimer = setInterval(() => {
|
|
772
772
|
this.collectMetrics();
|
|
773
773
|
}, this.metricsInterval);
|
|
774
|
+
// Don't let the metrics interval keep a short-lived process (e.g. a one-shot
|
|
775
|
+
// CLI command) alive. The long-running server stays up via its stdio transport.
|
|
776
|
+
if (typeof this.metricsTimer.unref === 'function') this.metricsTimer.unref();
|
|
774
777
|
}
|
|
775
778
|
|
|
776
779
|
/**
|
package/src/core/creatorMode.js
CHANGED
|
@@ -29,7 +29,8 @@ if (process.env.CRAWLFORGE_CREATOR_SECRET) {
|
|
|
29
29
|
|
|
30
30
|
if (crypto.timingSafeEqual(Buffer.from(providedHash, 'hex'), Buffer.from(CREATOR_SECRET_HASH, 'hex'))) {
|
|
31
31
|
_creatorModeVerified = true;
|
|
32
|
-
|
|
32
|
+
// Status message → stderr so stdout stays clean (MCP JSON-RPC / CLI --json output).
|
|
33
|
+
console.error('Creator Mode Enabled - Unlimited Access');
|
|
33
34
|
} else {
|
|
34
35
|
console.warn('Invalid creator secret provided');
|
|
35
36
|
}
|
package/src/tools/advanced/batchScrape/index.js
CHANGED
|
@@ -301,7 +301,8 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
301
301
|
}
|
|
302
302
|
|
|
303
303
|
_log(level, message) {
|
|
304
|
-
|
|
304
|
+
// → stderr so stdout stays clean for MCP JSON-RPC / CLI --json output.
|
|
305
|
+
if (this.enableLogging) console.error(`[BatchScrapeTool:${level.toUpperCase()}] ${message}`);
|
|
305
306
|
}
|
|
306
307
|
|
|
307
308
|
_initializeJobExecutors() {
|
package/src/tools/search/adapters/searchProviderFactory.js
CHANGED
|
@@ -36,7 +36,8 @@ export class SearchProviderFactory {
|
|
|
36
36
|
);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
-
|
|
39
|
+
// Status message → stderr so stdout stays clean (MCP JSON-RPC / CLI --json).
|
|
40
|
+
console.error('🔍 Creator Mode: Using Google Search API directly');
|
|
40
41
|
return new GoogleSearchAdapter(googleApiKey, googleSearchEngineId);
|
|
41
42
|
}
|
|
42
43
|
|
package/src/utils/Logger.js
CHANGED
|
@@ -116,6 +116,9 @@ export class Logger {
|
|
|
116
116
|
|
|
117
117
|
if (enableConsole) {
|
|
118
118
|
transports.push(new winston.transports.Console({
|
|
119
|
+
// Route ALL log levels to stderr so stdout stays reserved for structured
|
|
120
|
+
// output (MCP JSON-RPC protocol and CLI --json results).
|
|
121
|
+
stderrLevels: ['error', 'warn', 'info', 'http', 'verbose', 'debug', 'silly'],
|
|
119
122
|
format: winston.format.combine(
|
|
120
123
|
winston.format.colorize(),
|
|
121
124
|
winston.format.simple()
|