crawlforge-mcp-server 3.5.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -4
- package/server.js +138 -26
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +25 -5
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* llmstxt command — generate llms.txt for a website.
|
|
3
|
+
*/
|
|
4
|
+
import { GenerateLLMsTxtTool } from '../../tools/llmstxt/generateLLMsTxt.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `llmstxt <url>` subcommand to the CLI program.
 *
 * The action instantiates GenerateLLMsTxtTool with the 'generate_llms_txt'
 * tool config and runs it through runTool, forwarding the parent command's
 * --json / --pretty / --quiet flags.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  // Action handler: (positional url, parsed subcommand options, command object).
  const generateLlmsTxt = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new GenerateLLMsTxtTool(getToolConfig('generate_llms_txt'));
    const params = {
      url,
      include_full_txt: Boolean(opts.includeFull),
      // Option values arrive as strings; the tool receives an integer.
      max_pages: Number.parseInt(opts.maxPages, 10)
    };
    await runTool(tool, params, { json, pretty, quiet });
  };

  program
    .command('llmstxt <url>')
    .description('Generate llms.txt for a website (AI compliance file)')
    .option('--include-full', 'Also generate llms-full.txt')
    .option('--max-pages <n>', 'Maximum pages to analyze', '50')
    .action(generateLlmsTxt);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* localize command — fetch content with locale/geo awareness.
|
|
3
|
+
*/
|
|
4
|
+
import { LocalizationManager } from '../../core/LocalizationManager.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `localize <url>` subcommand to the CLI program.
 *
 * LocalizationManager does not expose execute(), so the action adapts its
 * fetchWithLocalization() method to the { execute } shape runTool calls.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  async function localizeAction(url, opts, cmd) {
    const { json, pretty, quiet } = cmd.parent.opts();
    const manager = new LocalizationManager(getToolConfig('localization'));

    // Adapter giving the manager the execute(params) entry point runTool expects.
    const adapter = {
      execute(request) {
        return manager.fetchWithLocalization(request);
      }
    };

    const { locale, country, currency } = opts;
    await runTool(adapter, { url, locale, country, currency }, { json, pretty, quiet });
  }

  program
    .command('localize <url>')
    .description('Fetch URL with locale/geo-aware settings')
    .option('--locale <locale>', 'Locale code (e.g. en-US, fr-FR)', 'en-US')
    .option('--country <code>', 'Country code for geo-targeting (e.g. US, FR)')
    .option('--currency <code>', 'Currency code (e.g. USD, EUR)')
    .action(localizeAction);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* map command — generate a sitemap using map_site tool.
|
|
3
|
+
*/
|
|
4
|
+
import { MapSiteTool } from '../../tools/crawl/mapSite.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `map <url>` subcommand to the CLI program.
 *
 * The action runs MapSiteTool (configured from 'map_site') through runTool,
 * converting the numeric options from strings to integers first.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  const mapSite = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new MapSiteTool(getToolConfig('map_site'));

    const maxDepth = Number.parseInt(opts.depth, 10);
    const maxPages = Number.parseInt(opts.maxPages, 10);

    await runTool(
      tool,
      { url, max_depth: maxDepth, max_pages: maxPages, output_format: opts.format },
      { json, pretty, quiet }
    );
  };

  program
    .command('map <url>')
    .description('Generate a sitemap for a website')
    .option('--depth <n>', 'Maximum crawl depth', '3')
    .option('--max-pages <n>', 'Maximum pages to include', '500')
    .option('--format <fmt>', 'Output format: json or xml', 'json')
    .action(mapSite);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* monitor command — continuously monitor a URL for changes (scheduled mode).
|
|
3
|
+
*/
|
|
4
|
+
import { TrackChangesTool } from '../../tools/tracking/trackChanges/index.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `monitor <url>` subcommand to the CLI program.
 *
 * The action runs TrackChangesTool (configured from 'track_changes') with
 * `scheduled: true` plus the interval, selector, webhook and threshold options.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  const monitorUrl = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new TrackChangesTool(getToolConfig('track_changes'));

    const request = {
      url,
      scheduled: true,
      interval_seconds: Number.parseInt(opts.interval, 10),
      selector: opts.selector,
      webhook_url: opts.webhook,
      // Threshold may be fractional, hence parseFloat rather than parseInt.
      change_threshold: Number.parseFloat(opts.threshold)
    };

    await runTool(tool, request, { json, pretty, quiet });
  };

  program
    .command('monitor <url>')
    .description('Continuously monitor a URL for content changes')
    .option('--interval <seconds>', 'Check interval in seconds', '300')
    .option('--selector <css>', 'CSS selector to scope monitoring')
    .option('--webhook <url>', 'Webhook URL to notify on changes')
    .option('--threshold <pct>', 'Change threshold percentage (0-100)', '5')
    .action(monitorUrl);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* research command — deep research on a topic.
|
|
3
|
+
*/
|
|
4
|
+
import { DeepResearchTool } from '../../tools/research/deepResearch.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `research <topic>` subcommand to the CLI program.
 *
 * The positional topic is passed to DeepResearchTool as `query`, together
 * with the depth, URL cap and output format options.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  const researchTopic = async (topic, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new DeepResearchTool(getToolConfig('deep_research'));
    const { depth, outputFormat } = opts;

    const request = {
      query: topic,
      depth,
      max_urls: Number.parseInt(opts.maxUrls, 10),
      output_format: outputFormat
    };

    await runTool(tool, request, { json, pretty, quiet });
  };

  program
    .command('research <topic>')
    .description('Conduct deep research on a topic')
    .option('--depth <level>', 'Research depth: basic, standard, or deep', 'standard')
    .option('--max-urls <n>', 'Maximum URLs to analyze', '20')
    .option('--output-format <fmt>', 'Output format: summary or detailed', 'summary')
    .action(researchTopic);
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scrape command — fetches a URL and returns its content.
|
|
3
|
+
* Without --extract: uses fetch_url (raw HTML + headers).
|
|
4
|
+
* With --extract: uses extract_content (cleaned text/markdown).
|
|
5
|
+
*/
|
|
6
|
+
import { fetchUrlHandler } from '../../tools/basic/fetchUrl.js';
|
|
7
|
+
import { ExtractContentTool } from '../../tools/extract/extractContent.js';
|
|
8
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
9
|
+
import { runTool } from '../lib/runTool.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Attach the `scrape <url>` subcommand to the CLI program.
 *
 * Without --extract the URL is fetched through fetchUrlHandler; with
 * --extract it is processed by ExtractContentTool and the --format option is
 * forwarded as `output_format`. Both paths receive the parsed --timeout.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  program
    .command('scrape <url>')
    .description('Fetch a URL and return its content')
    .option('--extract', 'Use extract_content for cleaned text/markdown output')
    .option('--format <format>', 'Output format: text, markdown, html (default: text)', 'text')
    .option('--timeout <ms>', 'Request timeout in milliseconds', '10000')
    .action(async (url, opts, cmd) => {
      const { json, pretty, quiet } = cmd.parent.opts();
      const cliFlags = { json, pretty, quiet };
      // The API key is not read here: the value was previously computed into
      // an unused local and never passed to either tool.
      const timeout = Number.parseInt(opts.timeout, 10);

      if (opts.extract) {
        // The tool already exposes execute(), so it is handed to runTool directly.
        const tool = new ExtractContentTool(getToolConfig('extract_content'));
        await runTool(tool, { url, output_format: opts.format, timeout }, cliFlags);
        return;
      }

      // fetchUrlHandler is a bare function; adapt it to the { execute } shape.
      const fetchTool = { execute: (params) => fetchUrlHandler(params) };
      await runTool(fetchTool, { url, timeout }, cliFlags);
    });
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* search command — searches the web using search_web tool.
|
|
3
|
+
*/
|
|
4
|
+
import { SearchWebTool } from '../../tools/search/searchWeb.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `search <query>` subcommand to the CLI program.
 *
 * The action runs SearchWebTool (configured from 'search_web'). Safe search
 * is sent as true unless the parsed `safeSearch` option is exactly false,
 * which is what the --no-safe-search flag is declared to toggle.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  const searchWeb = async (query, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new SearchWebTool(getToolConfig('search_web'));
    const { lang, provider } = opts;

    // Anything other than an explicit `false` keeps safe search on.
    const safeSearchEnabled = !(opts.safeSearch === false);

    const request = {
      query,
      limit: Number.parseInt(opts.limit, 10),
      lang,
      provider,
      safe_search: safeSearchEnabled
    };

    await runTool(tool, request, { json, pretty, quiet });
  };

  program
    .command('search <query>')
    .description('Search the web')
    .option('--limit <n>', 'Number of results', '10')
    .option('--lang <lang>', 'Language code (e.g. en, fr)', 'en')
    .option('--provider <p>', 'Search provider: crawlforge or searxng', 'crawlforge')
    .option('--no-safe-search', 'Disable safe search')
    .action(searchWeb);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stealth command — scrape a URL using stealth mode.
|
|
3
|
+
*/
|
|
4
|
+
import { StealthBrowserManager } from '../../core/StealthBrowserManager.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `stealth <url>` subcommand to the CLI program.
 *
 * StealthBrowserManager is adapted to the { execute } shape runTool calls by
 * delegating to its scrapeWithStealth() method.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  async function stealthAction(url, opts, cmd) {
    const { json, pretty, quiet } = cmd.parent.opts();
    const manager = new StealthBrowserManager(getToolConfig('stealth_mode'));

    // Adapter giving the manager the execute(params) entry point runTool expects.
    const adapter = {
      execute(request) {
        return manager.scrapeWithStealth(request);
      }
    };

    const request = {
      url,
      engine: opts.engine,
      wait_for: Number.parseInt(opts.wait, 10),
      screenshot: Boolean(opts.screenshot)
    };

    await runTool(adapter, request, { json, pretty, quiet });
  }

  program
    .command('stealth <url>')
    .description('Scrape a URL using stealth/anti-bot browser mode')
    .option('--engine <engine>', 'Browser engine: playwright or camoufox', 'playwright')
    .option('--wait <ms>', 'Wait time after page load in milliseconds', '2000')
    .option('--screenshot', 'Capture a screenshot')
    .action(stealthAction);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* template command — scrape a target URL using a pre-built site template.
|
|
3
|
+
*/
|
|
4
|
+
import { ScrapeTemplateTool } from '../../tools/templates/ScrapeTemplateTool.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `template` subcommand to the CLI program.
 *
 * Two modes:
 *   - `template --list`          lists templates via tool.listTemplates()
 *   - `template <id> <target>`   runs ScrapeTemplateTool with template_id/url
 *
 * The positionals are declared optional so that `--list` can be used on its
 * own. With required positionals Commander rejects the invocation for missing
 * arguments before the action runs, which made --list unusable without dummy
 * arguments. Their presence is checked in the action for the scrape mode.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  program
    .command('template [id] [target]')
    .description('Scrape using a pre-built site template (e.g. amazon-product, github-repo)')
    .option('--list', 'List all available templates')
    .action(async (id, target, opts, cmd) => {
      const { json, pretty, quiet } = cmd.parent.opts();
      const cliFlags = { json, pretty, quiet };
      const tool = new ScrapeTemplateTool(getToolConfig('scrape_template'));

      if (opts.list) {
        const listTool = { execute: () => tool.listTemplates() };
        await runTool(listTool, {}, cliFlags);
        return;
      }

      if (id === undefined || target === undefined) {
        // Route the usage error through runTool so it is formatted and
        // reported the same way as any other tool failure.
        const usageError = {
          execute: async () => {
            throw new Error('Missing required arguments: <id> and <target> (or pass --list to see available templates)');
          }
        };
        await runTool(usageError, {}, cliFlags);
        return;
      }

      await runTool(tool, { template_id: id, url: target }, cliFlags);
    });
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* track command — track content changes on a URL.
|
|
3
|
+
*/
|
|
4
|
+
import { TrackChangesTool } from '../../tools/tracking/trackChanges/index.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `track <url>` subcommand to the CLI program.
 *
 * The action runs TrackChangesTool (configured from 'track_changes') once
 * with the optional CSS selector and the numeric change threshold.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  const trackUrl = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const tool = new TrackChangesTool(getToolConfig('track_changes'));

    // Threshold may be fractional, hence parseFloat rather than parseInt.
    const changeThreshold = Number.parseFloat(opts.threshold);

    await runTool(
      tool,
      { url, selector: opts.selector, change_threshold: changeThreshold },
      { json, pretty, quiet }
    );
  };

  program
    .command('track <url>')
    .description('Track content changes on a URL')
    .option('--selector <css>', 'CSS selector to scope tracking')
    .option('--threshold <pct>', 'Change threshold percentage (0-100)', '5')
    .action(trackUrl);
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* uninstall-skills command -- remove CrawlForge skill files.
|
|
3
|
+
*/
|
|
4
|
+
import { uninstall } from '../../skills/installer.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Attach the `uninstall-skills` subcommand to the CLI program.
 *
 * The action calls uninstall() with the chosen target and the current working
 * directory, prints the removed / not-found paths to stdout and exits with
 * code 0. Any thrown error is written to stderr and the process exits with 1.
 *
 * @param {object} program - Commander program the subcommand is added to
 */
export function register(program) {
  // Write a heading followed by one line per path.
  const printPaths = (heading, paths) => {
    process.stdout.write(heading);
    for (const entry of paths) {
      process.stdout.write(` ${entry}\n`);
    }
  };

  const removeSkills = async (opts) => {
    try {
      const results = await uninstall({ target: opts.target, cwd: process.cwd() });
      const nothingRemoved = results.removed.length === 0;

      if (!nothingRemoved) {
        printPaths('Removed:\n', results.removed);
      }
      if (results.notFound.length > 0) {
        printPaths('Not found (already removed):\n', results.notFound);
      }
      if (nothingRemoved) {
        process.stdout.write('No skill files found to remove.\n');
      }
      process.exit(0);
    } catch (err) {
      process.stderr.write(`Error: ${err.message}\n`);
      process.exit(1);
    }
  };

  program
    .command('uninstall-skills')
    .description('Remove CrawlForge skill files from Claude Code, Cursor, or VS Code')
    .option('--target <target>', 'Target: claude-code, cursor, vscode, or all', 'all')
    .action(removeSkills);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* formatter.js — CLI output formatter shared across all CLI commands.
|
|
3
|
+
* Respects global flags: --json, --pretty, --quiet.
|
|
4
|
+
* No logic duplication with MCP tools — formats the same tool execute() output.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Format a tool result for CLI output.
 *
 * Rules, in order:
 *   1. `quiet` yields an empty string.
 *   2. If `result.content` is an array (MCP-style response), only the entries
 *      with `type === 'text'` are used. In JSON mode each text is parsed as
 *      JSON when possible (otherwise kept as a string); a single entry is
 *      unwrapped instead of being emitted as a one-element array. In plain
 *      mode the texts are joined with newlines.
 *   3. Any other value is serialized as JSON in JSON mode; in plain mode
 *      strings pass through untouched and everything else is pretty-printed.
 *
 * `pretty` takes precedence over `json` for every result shape, so passing
 * both flags always yields indented JSON.
 *
 * @param {object} result — raw object from tool.execute() or MCP handler
 * @param {{ json?: boolean, pretty?: boolean, quiet?: boolean }} flags
 * @returns {string} formatted text; '' when there is nothing to print
 */
export function formatResult(result, flags = {}) {
  const { json = false, pretty = false, quiet = false } = flags;

  if (quiet) return '';

  const jsonMode = json || pretty;
  // JSON.stringify returns undefined for values such as `undefined` or
  // functions; normalize that to '' so the return type is always a string.
  const serialize = (value, indent) => JSON.stringify(value, null, indent) ?? '';
  const toJson = (value) => serialize(value, pretty ? 2 : undefined);

  // MCP content array: work on the text blocks only.
  if (result && Array.isArray(result.content)) {
    const texts = result.content
      .filter((block) => block?.type === 'text') // tolerate null/undefined entries
      .map((block) => block.text);

    if (!jsonMode) {
      return texts.join('\n');
    }

    const parsed = texts.map((text) => {
      try {
        return JSON.parse(text);
      } catch {
        // Not JSON — keep the raw text so nothing is lost.
        return text;
      }
    });
    return toJson(parsed.length === 1 ? parsed[0] : parsed);
  }

  // Plain value
  if (jsonMode) return toJson(result);
  return typeof result === 'string' ? result : serialize(result, 2);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Format an error for CLI output.
 *
 * Error instances contribute their `message`; any other value is converted
 * with String(). With `flags.json` the message is wrapped as
 * `{"error": "<message>"}`, otherwise it is prefixed with "Error: ".
 *
 * @param {Error|string} error
 * @param {{ json?: boolean }} flags
 * @returns {string}
 */
export function formatError(error, flags = {}) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }

  return flags.json
    ? JSON.stringify({ error: message })
    : `Error: ${message}`;
}
|
package/src/cli/index.js
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CrawlForge CLI — src/cli/index.js
|
|
4
|
+
* Entry point for the `crawlforge` command.
|
|
5
|
+
*
|
|
6
|
+
* Global flags:
|
|
7
|
+
* --json Output raw JSON (compact)
|
|
8
|
+
* --pretty Output pretty-printed JSON
|
|
9
|
+
* --quiet Suppress all output (exit code only)
|
|
10
|
+
* --api-key CrawlForge API key (overrides CRAWLFORGE_API_KEY env)
|
|
11
|
+
* --timeout Global request timeout in ms (default: 30000)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { Command } from 'commander';
|
|
15
|
+
import { createRequire } from 'node:module';
|
|
16
|
+
import { fileURLToPath } from 'node:url';
|
|
17
|
+
import { dirname, join } from 'node:path';
|
|
18
|
+
import { readFileSync } from 'node:fs';
|
|
19
|
+
|
|
20
|
+
// Load package.json for version.
// The path is resolved relative to this module (two directories up), not the
// current working directory.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const pkgPath = join(__dirname, '../../package.json');

// Last-resort fallback, used when package.json cannot be read or parsed or
// carries no usable version. NOTE(review): this literal is not updated
// automatically and may lag behind the published version.
let version = '4.1.0';
try {
  const pkg = JSON.parse(readFileSync(pkgPath, 'utf8'));
  // Only accept a non-empty string; a missing `version` field must not
  // replace the fallback with undefined.
  if (typeof pkg.version === 'string' && pkg.version.length > 0) {
    version = pkg.version;
  }
} catch { /* use fallback */ }
|
|
29
|
+
|
|
30
|
+
// Import all command registrars
|
|
31
|
+
import { register as registerScrape } from './commands/scrape.js';
|
|
32
|
+
import { register as registerSearch } from './commands/search.js';
|
|
33
|
+
import { register as registerCrawl } from './commands/crawl.js';
|
|
34
|
+
import { register as registerMap } from './commands/map.js';
|
|
35
|
+
import { register as registerExtract } from './commands/extract.js';
|
|
36
|
+
import { register as registerTrack } from './commands/track.js';
|
|
37
|
+
import { register as registerAnalyze } from './commands/analyze.js';
|
|
38
|
+
import { register as registerResearch } from './commands/research.js';
|
|
39
|
+
import { register as registerStealth } from './commands/stealth.js';
|
|
40
|
+
import { register as registerBatch } from './commands/batch.js';
|
|
41
|
+
import { register as registerActions } from './commands/actions.js';
|
|
42
|
+
import { register as registerLocalize } from './commands/localize.js';
|
|
43
|
+
import { register as registerLlmstxt } from './commands/llmstxt.js';
|
|
44
|
+
import { register as registerTemplate } from './commands/template.js';
|
|
45
|
+
import { register as registerMonitor } from './commands/monitor.js';
|
|
46
|
+
import { register as registerInstallSkills } from './commands/install-skills.js';
|
|
47
|
+
import { register as registerUninstallSkills } from './commands/uninstall-skills.js';
|
|
48
|
+
|
|
49
|
+
// Root command: every subcommand reads these global flags via cmd.parent.opts().
const program = new Command();

program
  .name('crawlforge')
  .description('CrawlForge CLI — web scraping, crawling, and content processing')
  .version(version)
  .option('--json', 'Output compact JSON')
  .option('--pretty', 'Output pretty-printed JSON')
  .option('--quiet', 'Suppress all stdout output (exit code only)')
  .option('--api-key <key>', 'CrawlForge API key (overrides CRAWLFORGE_API_KEY env var)')
  .option('--timeout <ms>', 'Global request timeout in milliseconds', '30000');

// Before any command action runs, export --api-key and --timeout through
// environment variables.
program.hook('preAction', (thisCommand) => {
  const { apiKey, timeout } = program.opts();
  if (apiKey) {
    process.env.CRAWLFORGE_API_KEY = apiKey;
  }
  if (timeout) {
    process.env.CRAWLFORGE_CLI_TIMEOUT = timeout;
  }
});

// 15 tool commands followed by the 2 skills commands, registered in this order.
const registrars = [
  registerScrape,
  registerSearch,
  registerCrawl,
  registerMap,
  registerExtract,
  registerTrack,
  registerAnalyze,
  registerResearch,
  registerStealth,
  registerBatch,
  registerActions,
  registerLocalize,
  registerLlmstxt,
  registerTemplate,
  registerMonitor,
  registerInstallSkills,
  registerUninstallSkills
];
for (const attach of registrars) {
  attach(program);
}

// Report any rejection from parsing or command execution and exit with code 1.
const reportFatal = (err) => {
  process.stderr.write(`Fatal error: ${err.message}\n`);
  process.exit(1);
};

program.parseAsync(process.argv).catch(reportFatal);
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* runTool.js — Thin wrapper that invokes a tool's execute() method directly
|
|
3
|
+
* and formats the output according to global CLI flags.
|
|
4
|
+
*
|
|
5
|
+
* This intentionally does NOT replicate withAuth credit logic — CLI invocations
|
|
6
|
+
* go through the same AuthManager path as MCP calls when a real API key is set.
|
|
7
|
+
* In creator mode (CRAWLFORGE_CREATOR_SECRET set) credits are skipped automatically.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { formatResult, formatError } from '../formatter.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Run a tool and print formatted output.
 *
 * On success the formatted result is written to stdout (nothing is written
 * when the formatter returns an empty value). A result carrying a truthy
 * `isError`, or an exception thrown while running, is written to stderr via
 * formatError; the process then exits with code 1 unless `exitOnError` is
 * false, in which case the function simply returns.
 *
 * @param {object} tool — tool instance with execute(params) method
 * @param {object} params — tool parameters
 * @param {object} cliFlags — { json, pretty, quiet }
 * @param {object} [options]
 * @param {boolean} [options.exitOnError=true]
 */
export async function runTool(tool, params, cliFlags, options = {}) {
  const { exitOnError = true } = options;

  // Shared failure path: report on stderr, then optionally terminate.
  const fail = (problem) => {
    process.stderr.write(formatError(problem, cliFlags) + '\n');
    if (exitOnError) process.exit(1);
  };

  try {
    const result = await tool.execute(params);

    // MCP-style error response: the message is the first content entry's text.
    if (result && result.isError) {
      fail(result.content?.[0]?.text ?? 'Tool returned an error');
      return;
    }

    const rendered = formatResult(result, cliFlags);
    if (rendered) {
      process.stdout.write(rendered + '\n');
    }
  } catch (error) {
    fail(error);
  }
}
|
|
@@ -202,13 +202,15 @@ export class ActionExecutor extends EventEmitter {
|
|
|
202
202
|
this.activeChains.set(chainId, executionContext);
|
|
203
203
|
this.emit('chainStarted', executionContext);
|
|
204
204
|
|
|
205
|
-
//
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
205
|
+
// D2.4: initialize page INSIDE try/finally so it is always closed even on
|
|
206
|
+
// errors thrown between acquisition and the inner try block.
|
|
207
|
+
let page = null;
|
|
209
208
|
let chainResult;
|
|
210
209
|
|
|
211
210
|
try {
|
|
211
|
+
page = await this.initializePage(url, browserOptions);
|
|
212
|
+
executionContext.page = page;
|
|
213
|
+
|
|
212
214
|
// Execute chain with potential retries
|
|
213
215
|
chainResult = await this.executeChainWithRetries(executionContext);
|
|
214
216
|
|
|
@@ -235,9 +237,9 @@ export class ActionExecutor extends EventEmitter {
|
|
|
235
237
|
|
|
236
238
|
throw error;
|
|
237
239
|
} finally {
|
|
238
|
-
//
|
|
240
|
+
// D2.4: always close page to prevent leaks
|
|
239
241
|
if (page) {
|
|
240
|
-
await page.close();
|
|
242
|
+
try { await page.close(); } catch (_) { /* ignore close errors */ }
|
|
241
243
|
}
|
|
242
244
|
|
|
243
245
|
// Update execution time
|