crawlforge-mcp-server 3.4.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/README.md +28 -2
  2. package/package.json +6 -4
  3. package/server.js +166 -32
  4. package/src/cli/commands/actions.js +36 -0
  5. package/src/cli/commands/analyze.js +19 -0
  6. package/src/cli/commands/batch.js +45 -0
  7. package/src/cli/commands/crawl.js +30 -0
  8. package/src/cli/commands/extract.js +45 -0
  9. package/src/cli/commands/install-skills.js +46 -0
  10. package/src/cli/commands/llmstxt.js +24 -0
  11. package/src/cli/commands/localize.js +29 -0
  12. package/src/cli/commands/map.js +26 -0
  13. package/src/cli/commands/monitor.js +29 -0
  14. package/src/cli/commands/research.js +26 -0
  15. package/src/cli/commands/scrape.js +37 -0
  16. package/src/cli/commands/search.js +28 -0
  17. package/src/cli/commands/stealth.js +29 -0
  18. package/src/cli/commands/template.js +26 -0
  19. package/src/cli/commands/track.js +24 -0
  20. package/src/cli/commands/uninstall-skills.js +35 -0
  21. package/src/cli/formatter.js +57 -0
  22. package/src/cli/index.js +94 -0
  23. package/src/cli/lib/runTool.js +40 -0
  24. package/src/core/ActionExecutor.js +8 -6
  25. package/src/core/AuthManager.js +103 -3
  26. package/src/core/ChangeTracker.js +34 -0
  27. package/src/core/ElicitationHelper.js +112 -0
  28. package/src/core/JobManager.js +36 -2
  29. package/src/core/LocalizationManager.js +19 -5
  30. package/src/core/PerformanceManager.js +53 -17
  31. package/src/core/ResearchOrchestrator.js +40 -5
  32. package/src/core/SamplingClient.js +191 -0
  33. package/src/core/StealthBrowserManager.js +248 -2
  34. package/src/core/WebhookDispatcher.js +18 -10
  35. package/src/prompts/PromptRegistry.js +199 -0
  36. package/src/resources/ResourceRegistry.js +273 -0
  37. package/src/server/transports/streamableHttp.js +6 -6
  38. package/src/server/withAuth.js +25 -0
  39. package/src/skills/crawlforge-cli.md +157 -0
  40. package/src/skills/crawlforge-mcp.md +80 -0
  41. package/src/skills/crawlforge-research.md +104 -0
  42. package/src/skills/crawlforge-stealth.md +98 -0
  43. package/src/skills/installer.js +141 -0
  44. package/src/tools/advanced/batchScrape/index.js +30 -0
  45. package/src/tools/advanced/batchScrape/schema.js +1 -1
  46. package/src/tools/basic/extractText.js +19 -8
  47. package/src/tools/crawl/crawlDeep.js +27 -0
  48. package/src/tools/extract/extractContent.js +5 -17
  49. package/src/tools/extract/extractStructured.js +8 -0
  50. package/src/tools/extract/extractWithLlm.js +35 -25
  51. package/src/tools/extract/listOllamaModels.js +66 -0
  52. package/src/tools/extract/processDocument.js +7 -1
  53. package/src/tools/extract/summarizeContent.js +17 -0
  54. package/src/tools/research/deepResearch.js +34 -0
  55. package/src/tools/templates/ScrapeTemplateTool.js +68 -0
  56. package/src/tools/templates/TemplateRegistry.js +311 -0
  57. package/src/utils/Logger.js +15 -0
  58. package/src/utils/htmlToMarkdown.js +54 -0
  59. package/src/utils/secretMask.js +86 -0
@@ -0,0 +1,199 @@
1
+ /**
2
+ * PromptRegistry — MCP Prompts for CrawlForge
3
+ * Pre-defined workflows as MCP prompts the client can list and invoke.
4
+ */
5
+
/**
 * Catalogue of the prompts CrawlForge advertises to MCP clients.
 *
 * Authored as compact `[name, description, argumentRows]` tuples and expanded
 * below into `{ name, description, arguments }` entries, where every argument
 * is `{ name, description, required }`.
 */
export const PROMPTS = [
  [
    'competitive-analysis',
    'Analyze competitor websites against your own to surface positioning, feature gaps, and SEO differences.',
    [
      ['competitor_urls', 'Comma-separated list of competitor URLs to analyze', true],
      ['our_url', 'Your website URL for comparison', true],
    ],
  ],
  [
    'monitor-changes',
    'Set up continuous monitoring for content changes on a URL with webhook notifications.',
    [
      ['url', 'URL to monitor for changes', true],
      ['interval', 'Check interval in seconds (default: 3600)', false],
      ['webhook', 'Webhook URL for change notifications', false],
    ],
  ],
  [
    'rag-ingest',
    'Scrape and convert one or more URLs into clean markdown suitable for RAG ingestion pipelines.',
    [
      ['urls', 'Comma-separated list of URLs to ingest', true],
      ['output_format', 'Output format: markdown (default) or text', false],
    ],
  ],
  [
    'site-audit',
    'Comprehensive site audit: discovers all pages, extracts metadata, and generates an llms.txt summary.',
    [
      ['url', 'Website URL to audit', true],
    ],
  ],
  [
    'research-deep-dive',
    'Conduct exhaustive multi-source research on a topic with synthesis, conflict detection, and citations.',
    [
      ['topic', 'Research topic or question', true],
      ['depth', 'Research depth: shallow | medium | deep (default: medium)', false],
    ],
  ],
].map(([promptName, promptDescription, argumentRows]) => ({
  name: promptName,
  description: promptDescription,
  arguments: argumentRows.map(([argName, argDescription, isRequired]) => ({
    name: argName,
    description: argDescription,
    required: isRequired,
  })),
}));
48
+
/**
 * Wrap prompt text in the single user-message envelope returned to MCP clients.
 * @param {string[]} lines - Prompt text, one entry per output line.
 * @returns {{ messages: Array<{ role: string, content: { type: string, text: string } }> }}
 */
function buildUserPrompt(lines) {
  return {
    messages: [{
      role: 'user',
      content: {
        type: 'text',
        text: lines.join('\n'),
      },
    }],
  };
}

/**
 * Generate the prompt messages for a given prompt name and arguments.
 *
 * Missing arguments fall back to an empty string, or to the documented default
 * (`interval` 3600, `output_format` markdown, `depth` medium). A `null` args
 * object is treated like an omitted one.
 *
 * NOTE(review): the prompt text is assembled line by line exactly as the lines
 * appear in the published diff view, which strips leading whitespace; if the
 * shipped file indents the sub-bullets, restore that indentation here.
 *
 * @param {string} name - One of the prompt names handled by the switch below.
 * @param {Record<string, string>} args
 * @returns {{ messages: Array<{ role: string, content: { type: string, text: string } }> }}
 * @throws {Error} When `name` is not a known prompt.
 */
export function getPromptMessages(name, args = {}) {
  // The default parameter only covers `undefined`; guard against an explicit null too.
  const input = args ?? {};

  switch (name) {
    case 'competitive-analysis': {
      const competitors = input.competitor_urls || '';
      const ourUrl = input.our_url || '';
      return buildUserPrompt([
        'Conduct a comprehensive competitive analysis.',
        '',
        `Our website: ${ourUrl}`,
        `Competitors: ${competitors}`,
        '',
        'Steps to follow:',
        '1. Use fetch_url or extract_content on each competitor URL and our URL.',
        '2. Use extract_metadata on all URLs to compare titles, descriptions, and keywords.',
        '3. Use analyze_content to surface content quality, topics, and tone differences.',
        '4. Use map_site on each domain to compare site structure and depth.',
        '5. Summarize: positioning gaps, feature differences, SEO opportunities, and recommended actions.',
        '',
        'Return a structured report with sections: Overview, Competitor Profiles, Gap Analysis, Recommendations.',
      ]);
    }

    case 'monitor-changes': {
      const url = input.url || '';
      const interval = input.interval || '3600';
      const webhook = input.webhook || '';
      return buildUserPrompt([
        `Set up change monitoring for: ${url}`,
        '',
        'Configuration:',
        `- Check interval: ${interval} seconds`,
        `- Webhook URL: ${webhook || '(none — report changes inline)'}`,
        '',
        'Steps:',
        '1. Use track_changes with the URL to establish a baseline snapshot.',
        '2. Configure the check interval and webhook if provided.',
        '3. Report back the monitoring session ID and confirm setup.',
        '4. If no webhook is provided, describe how to retrieve changes later using track_changes.',
      ]);
    }

    case 'rag-ingest': {
      const urls = input.urls || '';
      const outputFormat = input.output_format || 'markdown';
      return buildUserPrompt([
        'Ingest the following URLs for RAG (Retrieval-Augmented Generation):',
        '',
        `URLs: ${urls}`,
        `Output format: ${outputFormat}`,
        '',
        'Steps:',
        '1. Use batch_scrape with the URL list to fetch all pages in parallel.',
        '2. Use extract_content on each result to extract clean, readable content.',
        `3. Convert content to ${outputFormat} format — remove navigation, ads, and boilerplate.`,
        `4. Return each document with: URL, title, word count, and clean ${outputFormat} body.`,
        '5. Flag any URLs that failed to load.',
        '',
        'The output should be ready for chunking and embedding.',
      ]);
    }

    case 'site-audit': {
      const url = input.url || '';
      return buildUserPrompt([
        `Perform a comprehensive site audit for: ${url}`,
        '',
        'Steps:',
        '1. Use map_site to discover all pages and site structure.',
        '2. Use extract_metadata on the homepage and top-level pages.',
        "3. Use generate_llms_txt to produce the site's AI-readable summary.",
        '4. Use analyze_content on the homepage to assess content quality and topics.',
        '5. Report:',
        '- Total pages discovered',
        '- Site structure overview',
        '- Metadata completeness (missing titles, descriptions)',
        '- Content quality assessment',
        '- llms.txt summary',
        '- Recommendations for improvement',
      ]);
    }

    case 'research-deep-dive': {
      const topic = input.topic || '';
      const depthConfig = {
        shallow: { maxUrls: 20, maxDepth: 3 },
        medium: { maxUrls: 50, maxDepth: 5 },
        deep: { maxUrls: 150, maxDepth: 8 },
      };
      // Own-property check: a plain `depthConfig[depth]` would also resolve inherited
      // keys such as "constructor" and render "max undefined sources".
      const requestedDepth = String(input.depth || 'medium').trim().toLowerCase();
      const depth = Object.prototype.hasOwnProperty.call(depthConfig, requestedDepth)
        ? requestedDepth
        : 'medium';
      const cfg = depthConfig[depth];
      // JSON.stringify yields the same "…" quoting for ordinary topics and escapes
      // embedded quotes so the suggested tool call stays well-formed.
      const quotedTopic = JSON.stringify(String(topic));
      return buildUserPrompt([
        'Conduct a deep research investigation on the following topic:',
        '',
        `Topic: ${topic}`,
        `Depth: ${depth} (max ${cfg.maxUrls} sources, depth ${cfg.maxDepth})`,
        '',
        'Steps:',
        `1. Use deep_research with topic=${quotedTopic}, maxUrls=${cfg.maxUrls}, maxDepth=${cfg.maxDepth}.`,
        '2. If deep_research returns raw evidence (no synthesis), synthesize it yourself:',
        '- Group findings by sub-topic',
        '- Identify agreements and conflicts between sources',
        '- Rank sources by credibility',
        '3. Return a structured report with:',
        '- Executive Summary',
        '- Key Findings (with citations)',
        '- Conflicting Information (if any)',
        '- Source Quality Assessment',
        '- Confidence Level and Gaps',
      ]);
    }

    default:
      throw new Error(`Unknown prompt: ${name}`);
  }
}
@@ -0,0 +1,273 @@
1
+ /**
2
+ * ResourceRegistry — MCP Resources for CrawlForge
3
+ * URI scheme: crawlforge://<type>/<id>
4
+ * Exposes long-lived artifacts produced by tools as MCP Resources.
5
+ */
6
+
7
+ import { createHash } from 'crypto';
8
+
/**
 * Supported resource types and their MIME types.
 */
const RESOURCE_MIME = {
  research: 'application/json',
  snapshot: 'text/html',
  job: 'application/json',
  crawl: 'application/json',
  screenshot: 'image/png',
};

/**
 * Parse a crawlforge:// URI into its components.
 *
 * The first path segment is the resource type and must be one of the keys of
 * RESOURCE_MIME; the remaining segments are returned untouched in `parts`.
 *
 * @param {string} uri
 * @returns {{ type: string, parts: string[] } | null} `null` when `uri` is not
 *   a string, does not use the crawlforge:// scheme, or names an unknown type.
 */
export function parseResourceUri(uri) {
  const scheme = 'crawlforge://';
  // Reject non-strings up front instead of throwing on a missing startsWith().
  if (typeof uri !== 'string' || !uri.startsWith(scheme)) return null;

  const [type, ...parts] = uri.slice(scheme.length).split('/');
  // Own-property check: `RESOURCE_MIME[type]` alone is truthy for inherited
  // keys such as "constructor" or "toString", which are not resource types.
  if (!Object.prototype.hasOwnProperty.call(RESOURCE_MIME, type)) return null;

  return { type, parts };
}
32
+
/**
 * Derive the short identifier used for a URL inside snapshot URIs.
 * @param {string} url - Value to hash.
 * @returns {string} The first 16 hex characters of the SHA-256 digest of `url`.
 */
export function hashUrl(url) {
  const hasher = createHash('sha256');
  hasher.update(url);
  const hexDigest = hasher.digest('hex');
  return hexDigest.slice(0, 16);
}
41
+
/**
 * Exposes artifacts produced by CrawlForge tools as MCP resources.
 *
 * Research sessions, snapshots and jobs are read live from the collaborators
 * passed to the constructor; crawl sitemaps and screenshots are kept in
 * in-memory maps owned by this registry and expire after `defaultTtl`.
 */
export class ResourceRegistry {
  /**
   * @param {object} [deps] - Optional collaborators; each one that is omitted
   *   is stored as `null` and its resource type is simply not listed.
   *   `mapSiteTool` and `scrapeWithActionsTool` are stored but not otherwise
   *   used by this class.
   */
  constructor({ researchOrchestrator, snapshotManager, jobManager, mapSiteTool, scrapeWithActionsTool } = {}) {
    this.researchOrchestrator = researchOrchestrator || null;
    this.snapshotManager = snapshotManager || null;
    this.jobManager = jobManager || null;
    this.mapSiteTool = mapSiteTool || null;
    this.scrapeWithActionsTool = scrapeWithActionsTool || null;

    // In-memory stores for lightweight resource tracking
    /** @type {Map<string, { data: any, createdAt: number, ttl: number }>} */
    this._crawlSitemaps = new Map(); // sessionId -> sitemap
    /** @type {Map<string, { data: Buffer, createdAt: number, ttl: number }>} */
    this._screenshots = new Map(); // actionId -> PNG buffer

    // Default TTL: 1 hour
    this.defaultTtl = 60 * 60 * 1000;
  }

  /**
   * Whether a TTL-tracked entry has outlived its time-to-live.
   * @param {{ createdAt: number, ttl: number }} entry
   * @param {number} now - Current time in milliseconds.
   * @returns {boolean}
   */
  _isExpired(entry, now) {
    return now - entry.createdAt >= entry.ttl;
  }

  /**
   * Drop every expired entry from a TTL-tracked store so stale payloads
   * (screenshot buffers in particular) do not stay referenced indefinitely.
   * @param {Map<string, { createdAt: number, ttl: number }>} store
   * @param {number} now - Current time in milliseconds.
   */
  _evictExpired(store, now) {
    for (const [key, entry] of store) {
      if (this._isExpired(entry, now)) store.delete(key);
    }
  }

  /**
   * Fetch a non-expired entry, evicting it when it has expired.
   * @param {Map<string, { createdAt: number, ttl: number }>} store
   * @param {string} key
   * @returns {object | null} The live entry, or `null` when missing or expired.
   */
  _getLiveEntry(store, key) {
    const entry = store.get(key);
    if (!entry) return null;
    if (this._isExpired(entry, Date.now())) {
      store.delete(key);
      return null;
    }
    return entry;
  }

  /**
   * Compute the two URI segments that address a snapshot. Used by both the
   * listing and the read path so a listed snapshot URI always resolves.
   * @param {string} id - Key of the snapshot in the SnapshotManager map.
   * @param {object} snap - Snapshot record.
   * @returns {{ urlHash: string, timestamp: string }}
   */
  _snapshotAddress(id, snap) {
    return {
      urlHash: hashUrl(snap.url || id),
      // Stable fallback ("0") rather than the current time, which would change
      // on every listing and never match on read.
      timestamp: String(snap.metadata?.timestamp || snap.createdAt || 0),
    };
  }

  /**
   * Store a crawl sitemap result for later retrieval.
   * @param {string} sessionId
   * @param {object} sitemapData
   */
  storeCrawlSitemap(sessionId, sitemapData) {
    const now = Date.now();
    this._evictExpired(this._crawlSitemaps, now);
    this._crawlSitemaps.set(sessionId, {
      data: sitemapData,
      createdAt: now,
      ttl: this.defaultTtl,
    });
  }

  /**
   * Store a screenshot for later retrieval.
   * @param {string} actionId
   * @param {Buffer|string} screenshotData - PNG buffer or base64 string
   */
  storeScreenshot(actionId, screenshotData) {
    const now = Date.now();
    this._evictExpired(this._screenshots, now);
    const buf = Buffer.isBuffer(screenshotData)
      ? screenshotData
      : Buffer.from(screenshotData, 'base64');
    this._screenshots.set(actionId, {
      data: buf,
      createdAt: now,
      ttl: this.defaultTtl,
    });
  }

  /**
   * List all available resources.
   * Expired sitemap and screenshot entries are evicted as a side effect.
   * @returns {Array<{ uri: string, name: string, description: string, mimeType: string }>}
   */
  listResources() {
    const resources = [];
    const now = Date.now();

    // Research sessions
    if (this.researchOrchestrator?.activeSessions) {
      for (const [sessionId] of this.researchOrchestrator.activeSessions) {
        resources.push({
          uri: `crawlforge://research/${sessionId}`,
          name: `Research Session ${sessionId}`,
          description: 'Completed deep_research report',
          mimeType: RESOURCE_MIME.research,
        });
      }
    }

    // Snapshots — list the ones held by SnapshotManager if available
    if (this.snapshotManager?.snapshots) {
      for (const [id, snap] of this.snapshotManager.snapshots) {
        const { urlHash, timestamp } = this._snapshotAddress(id, snap);
        resources.push({
          uri: `crawlforge://snapshot/${urlHash}/${timestamp}`,
          name: `Snapshot ${urlHash}`,
          description: `Snapshot of ${snap.url || id}`,
          mimeType: RESOURCE_MIME.snapshot,
        });
      }
    }

    // Jobs — completed/failed only
    if (this.jobManager?.jobs) {
      for (const [jobId, job] of this.jobManager.jobs) {
        if (job.status === 'completed' || job.status === 'failed') {
          resources.push({
            uri: `crawlforge://job/${jobId}`,
            name: `Job ${jobId}`,
            description: `Batch scrape job (${job.status})`,
            mimeType: RESOURCE_MIME.job,
          });
        }
      }
    }

    // Crawl sitemaps — everything left after eviction is still live
    this._evictExpired(this._crawlSitemaps, now);
    for (const sessionId of this._crawlSitemaps.keys()) {
      resources.push({
        uri: `crawlforge://crawl/${sessionId}/sitemap`,
        name: `Crawl Sitemap ${sessionId}`,
        description: 'map_site output for a crawl session',
        mimeType: RESOURCE_MIME.crawl,
      });
    }

    // Screenshots — everything left after eviction is still live
    this._evictExpired(this._screenshots, now);
    for (const actionId of this._screenshots.keys()) {
      resources.push({
        uri: `crawlforge://screenshot/${actionId}`,
        name: `Screenshot ${actionId}`,
        description: 'Screenshot from scrape_with_actions',
        mimeType: RESOURCE_MIME.screenshot,
      });
    }

    return resources;
  }

  /**
   * Read a specific resource by URI.
   * @param {string} uri
   * @returns {Promise<{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }>}
   * @throws {Error} When the URI cannot be parsed or the resource is missing or expired.
   */
  async readResource(uri) {
    const parsed = parseResourceUri(uri);
    if (!parsed) {
      throw new Error(`Unknown resource URI: ${uri}`);
    }

    const { type, parts } = parsed;

    switch (type) {
      case 'research':
        return this._readResearch(uri, parts[0]);
      case 'snapshot':
        return this._readSnapshot(uri, parts[0], parts[1]);
      case 'job':
        return this._readJob(uri, parts[0]);
      case 'crawl':
        // parts: [sessionId, 'sitemap']
        return this._readCrawlSitemap(uri, parts[0]);
      case 'screenshot':
        return this._readScreenshot(uri, parts[0]);
      default:
        throw new Error(`Resource type not implemented: ${type}`);
    }
  }

  /** Serialise the research session stored under `sessionId` as JSON text. */
  async _readResearch(uri, sessionId) {
    const session = this.researchOrchestrator?.activeSessions?.get(sessionId);
    if (!session) {
      throw new Error(`Research session not found: ${sessionId}`);
    }
    return {
      contents: [{
        uri,
        mimeType: RESOURCE_MIME.research,
        text: JSON.stringify(session, null, 2),
      }],
    };
  }

  /**
   * Return the snapshot whose URL hash and timestamp match the URI segments.
   * Yields `snap.content` when present, otherwise the snapshot record as JSON.
   */
  async _readSnapshot(uri, urlHash, timestamp) {
    if (!this.snapshotManager?.snapshots) {
      throw new Error('SnapshotManager not available');
    }
    for (const [id, snap] of this.snapshotManager.snapshots) {
      const address = this._snapshotAddress(id, snap);
      if (address.urlHash === urlHash && address.timestamp === timestamp) {
        return {
          contents: [{
            uri,
            mimeType: RESOURCE_MIME.snapshot,
            text: snap.content || JSON.stringify(snap, null, 2),
          }],
        };
      }
    }
    throw new Error(`Snapshot not found: ${uri}`);
  }

  /** Serialise the job stored under `jobId` as JSON text. */
  async _readJob(uri, jobId) {
    const job = this.jobManager?.jobs?.get(jobId);
    if (!job) {
      throw new Error(`Job not found: ${jobId}`);
    }
    return {
      contents: [{
        uri,
        mimeType: RESOURCE_MIME.job,
        text: JSON.stringify(job, null, 2),
      }],
    };
  }

  /** Serialise the stored sitemap for `sessionId`, unless missing or expired. */
  async _readCrawlSitemap(uri, sessionId) {
    const entry = this._getLiveEntry(this._crawlSitemaps, sessionId);
    if (!entry) {
      throw new Error(`Crawl sitemap not found or expired: ${sessionId}`);
    }
    return {
      contents: [{
        uri,
        mimeType: RESOURCE_MIME.crawl,
        text: JSON.stringify(entry.data, null, 2),
      }],
    };
  }

  /** Return the stored screenshot for `actionId` as a base64 blob, unless missing or expired. */
  async _readScreenshot(uri, actionId) {
    const entry = this._getLiveEntry(this._screenshots, actionId);
    if (!entry) {
      throw new Error(`Screenshot not found or expired: ${actionId}`);
    }
    return {
      contents: [{
        uri,
        mimeType: RESOURCE_MIME.screenshot,
        blob: entry.data.toString('base64'),
      }],
    };
  }
}
@@ -28,7 +28,7 @@ import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/
28
28
  import { createServer } from 'node:http';
29
29
  import { randomUUID } from 'node:crypto';
30
30
 
31
- const SERVER_VERSION = '3.2.0';
31
+ const SERVER_VERSION = '3.5.1';
32
32
 
33
33
  /**
34
34
  * Stateful, session-aware Streamable HTTP transport.
@@ -160,11 +160,11 @@ export async function connectStreamableHttp(server, authManager, logger, options
160
160
  await new Promise((resolve) => {
161
161
  httpServer.listen(port, host, () => {
162
162
  const actual = httpServer.address()?.port ?? port;
163
- console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
164
- console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
165
- console.error(`Health check: http://localhost:${actual}/health`);
166
- if (metrics) console.error(`Metrics: http://localhost:${actual}/metrics`);
167
- if (oauthProvider) console.error(`OAuth: http://localhost:${actual}/.well-known/oauth-authorization-server`);
163
+ console.error(`CrawlForge MCP Server v${SERVER_VERSION} listening on ${host}:${actual} (Streamable HTTP, ${mode})`);
164
+ console.error(`MCP endpoint: http://${host}:${actual}/mcp`);
165
+ console.error(`Health check: http://${host}:${actual}/health`);
166
+ if (metrics) console.error(`Metrics: http://${host}:${actual}/metrics`);
167
+ if (oauthProvider) console.error(`OAuth discovery: http://${host}:${actual}/.well-known/oauth-authorization-server`);
168
168
  resolve();
169
169
  });
170
170
  });
@@ -60,6 +60,31 @@ export function makeWithAuth({ authManager, logger, metrics = null }) {
60
60
  const result = await handler(params);
61
61
  outcome = 'success';
62
62
 
63
+ // D3.5: Surface cost transparency in all tool responses
64
+ try {
65
+ const projection = authManager.projectCost(toolName, params);
66
+ const remainingCredits = creatorMode ? Infinity : (authManager.creditCache ? [...authManager.creditCache.values()][0] ?? null : null);
67
+ const costMeta = {
68
+ projected: creditCost,
69
+ actual: creditCost,
70
+ remaining_credits: remainingCredits,
71
+ projection_note: projection.note
72
+ };
73
+
74
+ // Inject _cost into the first text content item if it's JSON
75
+ if (result && Array.isArray(result.content) && result.content[0]?.type === 'text') {
76
+ try {
77
+ const parsed = JSON.parse(result.content[0].text);
78
+ parsed._cost = costMeta;
79
+ result.content[0].text = JSON.stringify(parsed, null, 2);
80
+ } catch {
81
+ // Not JSON — skip injection silently
82
+ }
83
+ }
84
+ } catch {
85
+ // Cost injection must never break the request path
86
+ }
87
+
63
88
  if (!creatorMode) {
64
89
  await authManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
65
90
  }
@@ -0,0 +1,157 @@
1
+ # CrawlForge CLI Usage Guide
2
+
3
+ The `crawlforge` CLI exposes all 23 MCP tools as command-line subcommands.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install -g crawlforge-mcp-server
9
+ # or run without installing:
10
+ npx crawlforge-mcp-server <command>
11
+ ```
12
+
13
+ ## Global Flags
14
+
15
+ All commands support these flags:
16
+ - `--json` — output compact JSON
17
+ - `--pretty` — output pretty-printed JSON
18
+ - `--quiet` — suppress output (exit code only)
19
+ - `--api-key <key>` — override CRAWLFORGE_API_KEY env var
20
+ - `--timeout <ms>` — global request timeout (default: 30000)
21
+
22
+ ## Commands
23
+
24
+ ### scrape — Fetch a URL
25
+ ```bash
26
+ crawlforge scrape https://example.com
27
+ crawlforge scrape https://example.com --extract --format markdown
28
+ crawlforge scrape https://example.com --pretty
29
+ ```
30
+
31
+ ### search — Search the web
32
+ ```bash
33
+ crawlforge search "MCP server tutorial" --limit 10
34
+ crawlforge search "nodejs scraping" --provider searxng --json
35
+ ```
36
+
37
+ ### crawl — Deep website crawl
38
+ ```bash
39
+ crawlforge crawl https://docs.example.com --depth 3 --max-pages 200
40
+ crawlforge crawl https://example.com --no-robots --concurrency 20
41
+ ```
42
+
43
+ ### map — Generate sitemap
44
+ ```bash
45
+ crawlforge map https://example.com --pretty
46
+ crawlforge map https://example.com --format xml > sitemap.xml
47
+ ```
48
+
49
+ ### extract — Structured data extraction
50
+ ```bash
51
+ # Schema-based extraction
52
+ crawlforge extract https://example.com/product --schema product-schema.json
53
+
54
+ # LLM-guided extraction
55
+ crawlforge extract https://example.com/article --prompt "extract title, author, date, summary"
56
+ ```
57
+
58
+ ### track — Track content changes
59
+ ```bash
60
+ crawlforge track https://example.com --threshold 10
61
+ crawlforge track https://example.com --selector ".main-content"
62
+ ```
63
+
64
+ ### analyze — Content analysis
65
+ ```bash
66
+ crawlforge analyze https://example.com --depth full --pretty
67
+ ```
68
+
69
+ ### research — Deep research
70
+ ```bash
71
+ crawlforge research "state of AI in 2025" --depth deep --max-urls 30
72
+ crawlforge research "competitor pricing" --output-format detailed --json
73
+ ```
74
+
75
+ ### stealth — Anti-bot scraping
76
+ ```bash
77
+ crawlforge stealth https://protected-site.com
78
+ crawlforge stealth https://protected-site.com --engine camoufox --screenshot
79
+ ```
80
+
81
+ ### batch — Batch scrape from file
82
+ ```bash
83
+ # Create a URLs file:
84
+ cat > urls.txt << EOF
85
+ https://example.com/page1
86
+ https://example.com/page2
87
+ https://example.com/page3
88
+ EOF
89
+
90
+ crawlforge batch urls.txt --format markdown --concurrency 10
91
+ ```
92
+
93
+ ### actions — Browser automation
94
+ ```bash
95
+ # Create an actions script:
96
+ cat > login.json << EOF
97
+ [
98
+ { "type": "click", "selector": "#login-btn" },
99
+ { "type": "type", "selector": "#email", "text": "user@example.com" },
100
+ { "type": "wait", "duration": 1000 }
101
+ ]
102
+ EOF
103
+
104
+ crawlforge actions https://example.com --script login.json --screenshot
105
+ ```
106
+
107
+ ### localize — Geo-targeted fetch
108
+ ```bash
109
+ crawlforge localize https://example.com --locale fr-FR --country FR
110
+ crawlforge localize https://shop.example.com --locale en-GB --currency GBP
111
+ ```
112
+
113
+ ### llmstxt — Generate llms.txt
114
+ ```bash
115
+ crawlforge llmstxt https://example.com
116
+ crawlforge llmstxt https://example.com --include-full > llms.txt
117
+ ```
118
+
119
+ ### template — Pre-built site scrapers
120
+ ```bash
121
+ crawlforge template github-repo https://github.com/owner/repo
122
+ crawlforge template amazon-product https://amazon.com/dp/B0XXXXX
123
+ crawlforge template npm-package https://npmjs.com/package/commander
124
+ crawlforge template --list # list all available templates
125
+ ```
126
+
127
+ ### monitor — Continuous change monitoring
128
+ ```bash
129
+ crawlforge monitor https://example.com --interval 60 --webhook https://my-site.com/hook
130
+ crawlforge monitor https://example.com --selector ".price" --threshold 1
131
+ ```
132
+
133
+ ### install-skills — Install AI assistant skills
134
+ ```bash
135
+ crawlforge install-skills --target claude-code
136
+ crawlforge install-skills --target cursor --force
137
+ crawlforge install-skills --target all --dry-run
138
+ ```
139
+
140
+ ### uninstall-skills — Remove AI assistant skills
141
+ ```bash
142
+ crawlforge uninstall-skills --target claude-code
143
+ crawlforge uninstall-skills --target all
144
+ ```
145
+
146
+ ## Output Piping Examples
147
+
148
+ ```bash
149
+ # Extract markdown and save to file
150
+ crawlforge scrape https://example.com --extract --format markdown > page.md
151
+
152
+ # Search and parse with jq
153
+ crawlforge search "nodejs MCP" --json | jq '.results[].url'
154
+
155
+ # Batch scrape and process results
156
+ crawlforge batch urls.txt --json | jq '.results | length'
157
+ ```