crawlforge-mcp-server 3.4.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -2
- package/package.json +6 -4
- package/server.js +166 -32
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/transports/streamableHttp.js +6 -6
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +35 -25
- package/src/tools/extract/listOllamaModels.js +66 -0
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* batch command — scrape multiple URLs from a file.
|
|
3
|
+
* Reads newline-delimited URLs from the specified file.
|
|
4
|
+
*/
|
|
5
|
+
import { BatchScrapeTool } from '../../tools/advanced/BatchScrapeTool.js';
|
|
6
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
7
|
+
import { runTool } from '../lib/runTool.js';
|
|
8
|
+
import { readFileSync } from 'node:fs';
|
|
9
|
+
|
|
10
|
+
/**
 * Register the `batch` sub-command on a commander-style program.
 *
 * The action reads the given file as UTF-8, splits it on newlines, trims each
 * line and drops blank lines and lines beginning with '#'. The remaining
 * entries are passed to BatchScrapeTool through runTool.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  // A usable entry is any non-empty line that is not a '#' comment.
  const isUrlLine = (line) => line !== '' && !line.startsWith('#');

  const runBatch = async (urlsFile, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };

    let urls;
    try {
      const contents = readFileSync(urlsFile, 'utf8');
      urls = contents
        .split('\n')
        .map((line) => line.trim())
        .filter(isUrlLine);
    } catch (e) {
      process.stderr.write(`Error reading URLs file: ${e.message}\n`);
      process.exit(1);
    }

    if (urls.length === 0) {
      process.stderr.write('Error: No URLs found in file\n');
      process.exit(1);
    }

    const scraper = new BatchScrapeTool(getToolConfig('batch_scrape'));
    const params = {
      urls,
      formats: [opts.format],
      maxConcurrency: parseInt(opts.concurrency, 10),
      jobOptions: { maxRetries: parseInt(opts.maxRetries, 10) }
    };
    await runTool(scraper, params, cliFlags);
  };

  program
    .command('batch <urls-file>')
    .description('Scrape multiple URLs from a newline-delimited file')
    .option('--format <fmt>', 'Output format: text, markdown, html', 'markdown')
    .option('--concurrency <n>', 'Concurrent requests', '5')
    .option('--max-retries <n>', 'Maximum retries per URL', '2')
    .action(runBatch);
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* crawl command — deep crawl a website using crawl_deep tool.
|
|
3
|
+
*/
|
|
4
|
+
import { CrawlDeepTool } from '../../tools/crawl/crawlDeep.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `crawl` sub-command on a commander-style program.
 *
 * The action converts the numeric options with parseInt (radix 10), derives
 * the two boolean flags, and runs CrawlDeepTool through runTool.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const toInt = (value) => parseInt(value, 10);

  const runCrawl = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const crawler = new CrawlDeepTool(getToolConfig('crawl_deep'));
    const params = {
      url,
      max_depth: toInt(opts.depth),
      max_pages: toInt(opts.maxPages),
      // --no-robots sets opts.robots to false; anything else keeps robots.txt honoured.
      respect_robots: opts.robots !== false,
      follow_external: Boolean(opts.followExternal),
      concurrency: toInt(opts.concurrency)
    };
    await runTool(crawler, params, cliFlags);
  };

  program
    .command('crawl <url>')
    .description('Deep crawl a website and extract its content')
    .option('--depth <n>', 'Maximum crawl depth (1-5)', '3')
    .option('--max-pages <n>', 'Maximum pages to crawl', '100')
    .option('--no-robots', 'Ignore robots.txt')
    .option('--follow-external', 'Follow external links')
    .option('--concurrency <n>', 'Concurrent requests (1-20)', '10')
    .action(runCrawl);
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract command — extract structured data or LLM-guided extraction.
|
|
3
|
+
* With --schema: uses extract_structured (JSON schema-based).
|
|
4
|
+
* With --prompt: uses extract_with_llm (natural language).
|
|
5
|
+
*/
|
|
6
|
+
import { ExtractStructuredTool } from '../../tools/extract/extractStructured.js';
|
|
7
|
+
import { ExtractWithLlm } from '../../tools/extract/extractWithLlm.js';
|
|
8
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
9
|
+
import { runTool } from '../lib/runTool.js';
|
|
10
|
+
import { readFileSync } from 'node:fs';
|
|
11
|
+
|
|
12
|
+
/**
 * Register the `extract` sub-command on a commander-style program.
 *
 * Dispatch rules used by the action:
 *   - --schema <file>: parse the file as JSON and run ExtractStructuredTool.
 *   - otherwise --prompt <text>: run ExtractWithLlm with the prompt and model.
 *   - neither: print an error to stderr and exit with status 1.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runExtract = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };

    if (!opts.schema && !opts.prompt) {
      process.stderr.write('Error: extract requires --schema <file> or --prompt <text>\n');
      process.exit(1);
      return;
    }

    if (opts.schema) {
      let schema;
      try {
        const schemaText = readFileSync(opts.schema, 'utf8');
        schema = JSON.parse(schemaText);
      } catch (e) {
        process.stderr.write(`Error reading schema file: ${e.message}\n`);
        process.exit(1);
      }
      const structuredTool = new ExtractStructuredTool(getToolConfig('extract_structured'));
      await runTool(structuredTool, { url, schema }, cliFlags);
      return;
    }

    const llmTool = new ExtractWithLlm(getToolConfig('extract_with_llm'));
    const llmParams = {
      url,
      prompt: opts.prompt,
      model: opts.model
    };
    await runTool(llmTool, llmParams, cliFlags);
  };

  program
    .command('extract <url>')
    .description('Extract structured data from a URL')
    .option('--schema <file>', 'JSON schema file for structured extraction')
    .option('--prompt <text>', 'Natural language prompt for LLM-guided extraction')
    .option('--model <model>', 'LLM model to use (ollama model name or openai/anthropic)')
    .action(runExtract);
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* install-skills command -- install CrawlForge skill files into AI coding tools.
|
|
3
|
+
*/
|
|
4
|
+
import { install } from '../../skills/installer.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Register the `install-skills` sub-command on a commander-style program.
 *
 * The action calls install() with the selected target, the --force and
 * --dry-run flags coerced to booleans, and the current working directory.
 * It then prints the reported paths to stdout and exits with status 0; any
 * thrown error is written to stderr and the process exits with status 1.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  // Print a heading followed by one indented line per path.
  const printSection = (heading, paths) => {
    process.stdout.write(heading);
    paths.forEach((entry) => process.stdout.write(' ' + entry + '\n'));
  };

  const runInstall = async (opts) => {
    try {
      const outcome = await install({
        target: opts.target,
        force: Boolean(opts.force),
        dryRun: Boolean(opts.dryRun),
        cwd: process.cwd()
      });

      if (opts.dryRun) {
        printSection('Dry run -- would install to:\n', outcome.paths);
        process.exit(0);
        return;
      }

      const installedCount = outcome.installed.length;
      if (installedCount > 0) {
        printSection('Installed:\n', outcome.installed);
      }

      const skippedCount = outcome.skipped.length;
      if (skippedCount > 0) {
        printSection('Skipped (already installed; use --force to overwrite):\n', outcome.skipped);
      }

      if (installedCount === 0 && skippedCount === 0) {
        process.stdout.write('Nothing to install.\n');
      }
      process.exit(0);
    } catch (err) {
      process.stderr.write('Error: ' + err.message + '\n');
      process.exit(1);
    }
  };

  program
    .command('install-skills')
    .description('Install CrawlForge skill files into Claude Code, Cursor, or VS Code')
    .option('--target <target>', 'Target: claude-code, cursor, vscode, or all', 'all')
    .option('--force', 'Overwrite existing skill files')
    .option('--dry-run', 'Show what would be installed without writing files')
    .action(runInstall);
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* llmstxt command — generate llms.txt for a website.
|
|
3
|
+
*/
|
|
4
|
+
import { GenerateLLMsTxtTool } from '../../tools/llmstxt/generateLLMsTxt.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `llmstxt` sub-command on a commander-style program.
 *
 * The action runs GenerateLLMsTxtTool through runTool with the URL, the
 * --include-full flag coerced to a boolean, and --max-pages parsed as a
 * base-10 integer.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runLlmsTxt = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const generator = new GenerateLLMsTxtTool(getToolConfig('generate_llms_txt'));
    const params = {
      url,
      include_full_txt: Boolean(opts.includeFull),
      max_pages: parseInt(opts.maxPages, 10)
    };
    await runTool(generator, params, cliFlags);
  };

  program
    .command('llmstxt <url>')
    .description('Generate llms.txt for a website (AI compliance file)')
    .option('--include-full', 'Also generate llms-full.txt')
    .option('--max-pages <n>', 'Maximum pages to analyze', '50')
    .action(runLlmsTxt);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* localize command — fetch content with locale/geo awareness.
|
|
3
|
+
*/
|
|
4
|
+
import { LocalizationManager } from '../../core/LocalizationManager.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `localize` sub-command on a commander-style program.
 *
 * LocalizationManager is not passed to runTool directly; the action wraps it
 * in an object whose execute() forwards to fetchWithLocalization().
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runLocalize = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const manager = new LocalizationManager(getToolConfig('localization'));

    // Adapter giving the manager the execute(params) shape runTool is called with.
    const adapter = {
      execute: (params) => manager.fetchWithLocalization(params)
    };
    const request = {
      url,
      locale: opts.locale,
      country: opts.country,
      currency: opts.currency
    };
    await runTool(adapter, request, cliFlags);
  };

  program
    .command('localize <url>')
    .description('Fetch URL with locale/geo-aware settings')
    .option('--locale <locale>', 'Locale code (e.g. en-US, fr-FR)', 'en-US')
    .option('--country <code>', 'Country code for geo-targeting (e.g. US, FR)')
    .option('--currency <code>', 'Currency code (e.g. USD, EUR)')
    .action(runLocalize);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* map command — generate a sitemap using map_site tool.
|
|
3
|
+
*/
|
|
4
|
+
import { MapSiteTool } from '../../tools/crawl/mapSite.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `map` sub-command on a commander-style program.
 *
 * The action runs MapSiteTool through runTool with --depth and --max-pages
 * parsed as base-10 integers and --format passed through as output_format.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const toInt = (value) => parseInt(value, 10);

  const runMap = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const mapper = new MapSiteTool(getToolConfig('map_site'));
    const params = {
      url,
      max_depth: toInt(opts.depth),
      max_pages: toInt(opts.maxPages),
      output_format: opts.format
    };
    await runTool(mapper, params, cliFlags);
  };

  program
    .command('map <url>')
    .description('Generate a sitemap for a website')
    .option('--depth <n>', 'Maximum crawl depth', '3')
    .option('--max-pages <n>', 'Maximum pages to include', '500')
    .option('--format <fmt>', 'Output format: json or xml', 'json')
    .action(runMap);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* monitor command — continuously monitor a URL for changes (scheduled mode).
|
|
3
|
+
*/
|
|
4
|
+
import { TrackChangesTool } from '../../tools/tracking/trackChanges/index.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `monitor` sub-command on a commander-style program.
 *
 * The action runs TrackChangesTool through runTool with `scheduled: true`,
 * the interval parsed as a base-10 integer and the threshold parsed with
 * parseFloat; selector and webhook are passed through unchanged.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runMonitor = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const tracker = new TrackChangesTool(getToolConfig('track_changes'));
    const params = {
      url,
      scheduled: true,
      interval_seconds: parseInt(opts.interval, 10),
      selector: opts.selector,
      webhook_url: opts.webhook,
      change_threshold: parseFloat(opts.threshold)
    };
    await runTool(tracker, params, cliFlags);
  };

  program
    .command('monitor <url>')
    .description('Continuously monitor a URL for content changes')
    .option('--interval <seconds>', 'Check interval in seconds', '300')
    .option('--selector <css>', 'CSS selector to scope monitoring')
    .option('--webhook <url>', 'Webhook URL to notify on changes')
    .option('--threshold <pct>', 'Change threshold percentage (0-100)', '5')
    .action(runMonitor);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* research command — deep research on a topic.
|
|
3
|
+
*/
|
|
4
|
+
import { DeepResearchTool } from '../../tools/research/deepResearch.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `research` sub-command on a commander-style program.
 *
 * The action runs DeepResearchTool through runTool; the positional topic is
 * sent as `query` and --max-urls is parsed as a base-10 integer.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runResearch = async (topic, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const researcher = new DeepResearchTool(getToolConfig('deep_research'));
    const params = {
      query: topic,
      depth: opts.depth,
      max_urls: parseInt(opts.maxUrls, 10),
      output_format: opts.outputFormat
    };
    await runTool(researcher, params, cliFlags);
  };

  program
    .command('research <topic>')
    .description('Conduct deep research on a topic')
    .option('--depth <level>', 'Research depth: basic, standard, or deep', 'standard')
    .option('--max-urls <n>', 'Maximum URLs to analyze', '20')
    .option('--output-format <fmt>', 'Output format: summary or detailed', 'summary')
    .action(runResearch);
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scrape command — fetches a URL and returns its content.
|
|
3
|
+
* Without --extract: uses fetch_url (raw HTML + headers).
|
|
4
|
+
* With --extract: uses extract_content (cleaned text/markdown).
|
|
5
|
+
*/
|
|
6
|
+
import { fetchUrlHandler } from '../../tools/basic/fetchUrl.js';
|
|
7
|
+
import { ExtractContentTool } from '../../tools/extract/extractContent.js';
|
|
8
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
9
|
+
import { runTool } from '../lib/runTool.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Register the `scrape` sub-command on a commander-style program.
 *
 * Without --extract the action calls fetchUrlHandler with `{ url, timeout }`;
 * with --extract it runs ExtractContentTool with
 * `{ url, output_format, timeout }`. In both cases the callee is wrapped in
 * an object exposing execute(params) before being handed to runTool.
 *
 * The API key is intentionally not read here: the previous local that
 * resolved it from the global flag / CRAWLFORGE_API_KEY was never used.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  program
    .command('scrape <url>')
    .description('Fetch a URL and return its content')
    .option('--extract', 'Use extract_content for cleaned text/markdown output')
    .option('--format <format>', 'Output format: text, markdown, html (default: text)', 'text')
    .option('--timeout <ms>', 'Request timeout in milliseconds', '10000')
    .action(async (url, opts, cmd) => {
      const globals = cmd.parent.opts();
      const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
      const timeout = parseInt(opts.timeout, 10);

      if (opts.extract) {
        const tool = new ExtractContentTool(getToolConfig('extract_content'));
        const wrapperTool = {
          execute: (p) => tool.execute(p)
        };
        await runTool(wrapperTool, { url, output_format: opts.format, timeout }, cliFlags);
      } else {
        // fetchUrlHandler is a plain function, so adapt it to the execute() shape.
        const wrapperTool = {
          execute: (p) => fetchUrlHandler(p)
        };
        await runTool(wrapperTool, { url, timeout }, cliFlags);
      }
    });
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* search command — searches the web using search_web tool.
|
|
3
|
+
*/
|
|
4
|
+
import { SearchWebTool } from '../../tools/search/searchWeb.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `search` sub-command on a commander-style program.
 *
 * The action runs SearchWebTool through runTool with --limit parsed as a
 * base-10 integer; safe search stays on unless --no-safe-search set
 * opts.safeSearch to false.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runSearch = async (query, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const searcher = new SearchWebTool(getToolConfig('search_web'));
    const params = {
      query,
      limit: parseInt(opts.limit, 10),
      lang: opts.lang,
      provider: opts.provider,
      safe_search: opts.safeSearch !== false
    };
    await runTool(searcher, params, cliFlags);
  };

  program
    .command('search <query>')
    .description('Search the web')
    .option('--limit <n>', 'Number of results', '10')
    .option('--lang <lang>', 'Language code (e.g. en, fr)', 'en')
    .option('--provider <p>', 'Search provider: crawlforge or searxng', 'crawlforge')
    .option('--no-safe-search', 'Disable safe search')
    .action(runSearch);
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stealth command — scrape a URL using stealth mode.
|
|
3
|
+
*/
|
|
4
|
+
import { StealthBrowserManager } from '../../core/StealthBrowserManager.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `stealth` sub-command on a commander-style program.
 *
 * StealthBrowserManager is wrapped in an object whose execute() forwards to
 * scrapeWithStealth(); --wait is parsed as a base-10 integer and --screenshot
 * is coerced to a boolean.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runStealth = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const browser = new StealthBrowserManager(getToolConfig('stealth_mode'));

    // Adapter giving the manager the execute(params) shape runTool is called with.
    const adapter = {
      execute: (params) => browser.scrapeWithStealth(params)
    };
    const request = {
      url,
      engine: opts.engine,
      wait_for: parseInt(opts.wait, 10),
      screenshot: Boolean(opts.screenshot)
    };
    await runTool(adapter, request, cliFlags);
  };

  program
    .command('stealth <url>')
    .description('Scrape a URL using stealth/anti-bot browser mode')
    .option('--engine <engine>', 'Browser engine: playwright or camoufox', 'playwright')
    .option('--wait <ms>', 'Wait time after page load in milliseconds', '2000')
    .option('--screenshot', 'Capture a screenshot')
    .action(runStealth);
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* template command — scrape a target URL using a pre-built site template.
|
|
3
|
+
*/
|
|
4
|
+
import { ScrapeTemplateTool } from '../../tools/templates/ScrapeTemplateTool.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `template` sub-command on a commander-style program.
 *
 * Usage:
 *   template --list            list the available templates
 *   template <id> <target>     scrape <target> with template <id>
 *
 * Both positional arguments are declared optional (`[id] [target]`) so that
 * `--list` can be used on its own; with required arguments commander rejects
 * the command line before the action runs. When not listing, the action
 * checks that both were supplied and otherwise prints an error to stderr and
 * exits with status 1.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  program
    .command('template [id] [target]')
    .description('Scrape using a pre-built site template (e.g. amazon-product, github-repo)')
    .option('--list', 'List all available templates')
    .action(async (id, target, opts, cmd) => {
      const globals = cmd.parent.opts();
      const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };

      if (!opts.list && (!id || !target)) {
        process.stderr.write('Error: template requires <id> and <target> (or use --list)\n');
        process.exit(1);
        return;
      }

      const tool = new ScrapeTemplateTool(getToolConfig('scrape_template'));

      if (opts.list) {
        // listTemplates() takes no parameters, so adapt it to the execute() shape.
        const wrapperTool = { execute: () => tool.listTemplates() };
        await runTool(wrapperTool, {}, cliFlags);
        return;
      }

      await runTool(tool, { template_id: id, url: target }, cliFlags);
    });
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* track command — track content changes on a URL.
|
|
3
|
+
*/
|
|
4
|
+
import { TrackChangesTool } from '../../tools/tracking/trackChanges/index.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `track` sub-command on a commander-style program.
 *
 * The action runs TrackChangesTool through runTool with the optional CSS
 * selector passed through and --threshold parsed with parseFloat.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  const runTrack = async (url, opts, cmd) => {
    const { json, pretty, quiet } = cmd.parent.opts();
    const cliFlags = { json, pretty, quiet };
    const tracker = new TrackChangesTool(getToolConfig('track_changes'));
    const params = {
      url,
      selector: opts.selector,
      change_threshold: parseFloat(opts.threshold)
    };
    await runTool(tracker, params, cliFlags);
  };

  program
    .command('track <url>')
    .description('Track content changes on a URL')
    .option('--selector <css>', 'CSS selector to scope tracking')
    .option('--threshold <pct>', 'Change threshold percentage (0-100)', '5')
    .action(runTrack);
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* uninstall-skills command -- remove CrawlForge skill files.
|
|
3
|
+
*/
|
|
4
|
+
import { uninstall } from '../../skills/installer.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Register the `uninstall-skills` sub-command on a commander-style program.
 *
 * The action calls uninstall() with the selected target and the current
 * working directory, prints the removed and not-found paths to stdout, and
 * exits with status 0. Any thrown error is written to stderr and the process
 * exits with status 1.
 *
 * @param {object} program - object exposing commander's chainable command() API.
 */
export function register(program) {
  // Print a heading followed by one indented line per path.
  const printSection = (heading, paths) => {
    process.stdout.write(heading);
    paths.forEach((entry) => process.stdout.write(' ' + entry + '\n'));
  };

  const runUninstall = async (opts) => {
    try {
      const outcome = await uninstall({
        target: opts.target,
        cwd: process.cwd()
      });

      const removedCount = outcome.removed.length;
      if (removedCount > 0) {
        printSection('Removed:\n', outcome.removed);
      }
      if (outcome.notFound.length > 0) {
        printSection('Not found (already removed):\n', outcome.notFound);
      }
      if (removedCount === 0) {
        process.stdout.write('No skill files found to remove.\n');
      }
      process.exit(0);
    } catch (err) {
      process.stderr.write('Error: ' + err.message + '\n');
      process.exit(1);
    }
  };

  program
    .command('uninstall-skills')
    .description('Remove CrawlForge skill files from Claude Code, Cursor, or VS Code')
    .option('--target <target>', 'Target: claude-code, cursor, vscode, or all', 'all')
    .action(runUninstall);
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* formatter.js — CLI output formatter shared across all CLI commands.
|
|
3
|
+
* Respects global flags: --json, --pretty, --quiet.
|
|
4
|
+
* No logic duplication with MCP tools — formats the same tool execute() output.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Format a tool result for CLI output.
 *
 * Behaviour:
 *   - `quiet` always yields the empty string.
 *   - If `result.content` is an array (MCP shape), only its `type: 'text'`
 *     entries are used. In JSON modes each text is parsed as JSON when
 *     possible (falling back to the raw text), and a single entry is
 *     unwrapped instead of being emitted as a one-element array. Otherwise
 *     the texts are joined with newlines.
 *   - Any other value is serialised as JSON, except that a plain string is
 *     returned unchanged when no JSON flag is set.
 *   - When both `json` and `pretty` are set, `pretty` wins for every input
 *     shape.
 *   - The return value is always a string: values JSON.stringify cannot
 *     represent (e.g. `undefined`) become 'null' in JSON modes and '' in
 *     plain mode.
 *
 * @param {object} result — raw object from tool.execute() or MCP handler
 * @param {{ json?: boolean, pretty?: boolean, quiet?: boolean }} flags
 * @returns {string}
 */
export function formatResult(result, flags = {}) {
  const { json = false, pretty = false, quiet = false } = flags;

  if (quiet) return '';

  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols, so every call site supplies an explicit string fallback.
  const serialize = (value, indent) => JSON.stringify(value, null, indent);

  // If result has MCP content array, extract the text
  if (result && Array.isArray(result.content)) {
    const texts = result.content
      .filter((c) => c?.type === 'text') // tolerate null/undefined entries
      .map((c) => c.text);

    if (json || pretty) {
      // Try to parse each text as JSON and re-serialize
      const parsed = texts.map((t) => {
        try { return JSON.parse(t); } catch { return t; }
      });
      const output = parsed.length === 1 ? parsed[0] : parsed;
      return serialize(output, pretty ? 2 : undefined) ?? 'null';
    }

    // Plain text: return text blocks joined
    return texts.join('\n');
  }

  // Plain object — same precedence as the MCP branch: pretty before json.
  if (pretty) return serialize(result, 2) ?? 'null';
  if (json) return serialize(result) ?? 'null';
  if (typeof result === 'string') return result;
  return serialize(result, 2) ?? '';
}
|
|
44
|
+
|
|
45
|
+
/**
 * Format an error for CLI output.
 *
 * Error instances contribute their `message`; anything else is converted
 * with String(). With `flags.json` the result is a JSON object of the form
 * {"error": "..."}; otherwise it is the message prefixed with "Error: ".
 *
 * @param {Error|string} error
 * @param {{ json?: boolean }} flags
 * @returns {string}
 */
export function formatError(error, flags = {}) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }

  return flags.json
    ? JSON.stringify({ error: message })
    : `Error: ${message}`;
}
|
package/src/cli/index.js
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CrawlForge CLI — src/cli/index.js
|
|
4
|
+
* Entry point for the `crawlforge` command.
|
|
5
|
+
*
|
|
6
|
+
* Global flags:
|
|
7
|
+
* --json Output raw JSON (compact)
|
|
8
|
+
* --pretty Output pretty-printed JSON
|
|
9
|
+
* --quiet Suppress all output (exit code only)
|
|
10
|
+
* --api-key CrawlForge API key (overrides CRAWLFORGE_API_KEY env)
|
|
11
|
+
* --timeout Global request timeout in ms (default: 30000)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { Command } from 'commander';
|
|
15
|
+
import { createRequire } from 'node:module';
|
|
16
|
+
import { fileURLToPath } from 'node:url';
|
|
17
|
+
import { dirname, join } from 'node:path';
|
|
18
|
+
import { readFileSync } from 'node:fs';
|
|
19
|
+
|
|
20
|
+
// Load package.json for version.
// pkgPath resolves two directories above this file's own directory.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const pkgPath = join(__dirname, '../../package.json');

// Fallback used when package.json cannot be read or parsed, or when it does
// not carry a usable version string.
// NOTE(review): this literal is older than the 4.2.1 release it ships in —
// confirm whether it should be kept in step with package.json.
let version = '4.1.0';
try {
  const pkg = JSON.parse(readFileSync(pkgPath, 'utf8'));
  // Only override the fallback with a real, non-empty string; otherwise a
  // package.json without "version" would leave `version` undefined.
  if (typeof pkg?.version === 'string' && pkg.version !== '') {
    version = pkg.version;
  }
} catch { /* use fallback */ }
|
|
29
|
+
|
|
30
|
+
// Import all command registrars
|
|
31
|
+
import { register as registerScrape } from './commands/scrape.js';
|
|
32
|
+
import { register as registerSearch } from './commands/search.js';
|
|
33
|
+
import { register as registerCrawl } from './commands/crawl.js';
|
|
34
|
+
import { register as registerMap } from './commands/map.js';
|
|
35
|
+
import { register as registerExtract } from './commands/extract.js';
|
|
36
|
+
import { register as registerTrack } from './commands/track.js';
|
|
37
|
+
import { register as registerAnalyze } from './commands/analyze.js';
|
|
38
|
+
import { register as registerResearch } from './commands/research.js';
|
|
39
|
+
import { register as registerStealth } from './commands/stealth.js';
|
|
40
|
+
import { register as registerBatch } from './commands/batch.js';
|
|
41
|
+
import { register as registerActions } from './commands/actions.js';
|
|
42
|
+
import { register as registerLocalize } from './commands/localize.js';
|
|
43
|
+
import { register as registerLlmstxt } from './commands/llmstxt.js';
|
|
44
|
+
import { register as registerTemplate } from './commands/template.js';
|
|
45
|
+
import { register as registerMonitor } from './commands/monitor.js';
|
|
46
|
+
import { register as registerInstallSkills } from './commands/install-skills.js';
|
|
47
|
+
import { register as registerUninstallSkills } from './commands/uninstall-skills.js';
|
|
48
|
+
|
|
49
|
+
const program = new Command();

// Program identity and global flags (shared by every sub-command).
program.name('crawlforge');
program.description('CrawlForge CLI — web scraping, crawling, and content processing');
program.version(version);
program.option('--json', 'Output compact JSON');
program.option('--pretty', 'Output pretty-printed JSON');
program.option('--quiet', 'Suppress all stdout output (exit code only)');
program.option('--api-key <key>', 'CrawlForge API key (overrides CRAWLFORGE_API_KEY env var)');
program.option('--timeout <ms>', 'Global request timeout in milliseconds', '30000');

// Before any command action runs, copy the global --api-key and --timeout
// values into environment variables.
program.hook('preAction', () => {
  const { apiKey, timeout } = program.opts();
  if (apiKey) {
    process.env.CRAWLFORGE_API_KEY = apiKey;
  }
  if (timeout) {
    process.env.CRAWLFORGE_CLI_TIMEOUT = timeout;
  }
});

// Register all 15 tool commands + 2 skills commands, in this order.
const registrars = [
  registerScrape,
  registerSearch,
  registerCrawl,
  registerMap,
  registerExtract,
  registerTrack,
  registerAnalyze,
  registerResearch,
  registerStealth,
  registerBatch,
  registerActions,
  registerLocalize,
  registerLlmstxt,
  registerTemplate,
  registerMonitor,
  registerInstallSkills,
  registerUninstallSkills
];
for (const registerCommand of registrars) {
  registerCommand(program);
}

// Parse argv; a rejected parse is reported on stderr with exit status 1.
program.parseAsync(process.argv).catch((err) => {
  process.stderr.write(`Fatal error: ${err.message}\n`);
  process.exit(1);
});
|