crawlforge-mcp-server 3.4.0 → 4.2.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/README.md +28 -2
- package/package.json +6 -4
- package/server.js +166 -32
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/transports/streamableHttp.js +6 -6
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +35 -25
- package/src/tools/extract/listOllamaModels.js +66 -0
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
|
|
|
9
9
|
|
|
10
10
|
## 🎯 Features
|
|
11
11
|
|
|
12
|
-
- **
|
|
12
|
+
- **22 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
|
|
13
13
|
- **Free Tier**: 1,000 credits to get started instantly
|
|
14
14
|
- **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
|
|
15
15
|
- **Enterprise Ready**: Scale up with paid plans for production use
|
|
@@ -112,6 +112,8 @@ Restart Cursor to activate.
|
|
|
112
112
|
- `summarize_content` - Generate intelligent summaries
|
|
113
113
|
- `analyze_content` - Comprehensive content analysis
|
|
114
114
|
- `extract_structured` - LLM-powered schema-driven extraction
|
|
115
|
+
- `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
|
|
116
|
+
- `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
|
|
115
117
|
- `track_changes` - Monitor content changes over time
|
|
116
118
|
|
|
117
119
|
### Premium Tools (5-10 credits)
|
|
@@ -138,7 +140,7 @@ Restart Cursor to activate.
|
|
|
138
140
|
| **Enterprise** | 250,000 | Large scale operations |
|
|
139
141
|
|
|
140
142
|
**All plans include:**
|
|
141
|
-
- Access to all
|
|
143
|
+
- Access to all 22 tools
|
|
142
144
|
- Credits never expire and roll over month-to-month
|
|
143
145
|
- API access and webhook notifications
|
|
144
146
|
|
|
@@ -155,6 +157,30 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
|
|
|
155
157
|
# Optional: Custom API endpoint (for enterprise)
|
|
156
158
|
export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
|
|
157
159
|
# As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
|
|
160
|
+
|
|
161
|
+
# Optional: Local LLM (Ollama) overrides — extract_with_llm defaults to Ollama
|
|
162
|
+
export OLLAMA_BASE_URL="http://localhost:11434" # default
|
|
163
|
+
export OLLAMA_DEFAULT_MODEL="llama3.2" # default; any locally-pulled model name works
|
|
164
|
+
|
|
165
|
+
# Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
|
|
166
|
+
export OPENAI_API_KEY="sk-..."
|
|
167
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Local-LLM quickstart (`extract_with_llm` with Ollama)
|
|
171
|
+
|
|
172
|
+
`extract_with_llm` defaults to a local Ollama model — no API key, no API costs, no data leaving your machine.
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# 1. Install Ollama: https://ollama.com
|
|
176
|
+
# 2. Pull any model from https://ollama.com/library
|
|
177
|
+
ollama pull llama3.2
|
|
178
|
+
|
|
179
|
+
# 3. Discover what's installed (from your MCP client)
|
|
180
|
+
# list_ollama_models()
|
|
181
|
+
|
|
182
|
+
# 4. Extract — defaults to Ollama with the model from step 2
|
|
183
|
+
# extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
|
|
158
184
|
```
|
|
159
185
|
|
|
160
186
|
### Manual Configuration
|
package/package.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "3.4.0",
|
|
4
|
-
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with
|
|
3
|
+
"version": "4.2.1",
|
|
4
|
+
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
7
|
-
"crawlforge": "
|
|
7
|
+
"crawlforge": "src/cli/index.js",
|
|
8
8
|
"crawlforge-setup": "setup.js"
|
|
9
9
|
},
|
|
10
10
|
"scripts": {
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"test:tools": "node test-tools.js",
|
|
20
20
|
"test:real-world": "node test-real-world.js",
|
|
21
21
|
"test:all": "bash run-all-tests.sh",
|
|
22
|
-
"postinstall": "echo '\n
|
|
22
|
+
"postinstall": "echo '\n\ud83c\udf89 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
|
|
23
23
|
"docker:build": "docker build -t crawlforge .",
|
|
24
24
|
"docker:dev": "docker-compose up crawlforge-dev",
|
|
25
25
|
"docker:prod": "docker-compose up crawlforge-prod"
|
|
@@ -96,6 +96,7 @@
|
|
|
96
96
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
97
97
|
"@mozilla/readability": "^0.6.0",
|
|
98
98
|
"cheerio": "^1.1.2",
|
|
99
|
+
"commander": "^12.1.0",
|
|
99
100
|
"compromise": "^14.14.4",
|
|
100
101
|
"diff": "^8.0.2",
|
|
101
102
|
"dotenv": "^17.2.1",
|
|
@@ -109,6 +110,7 @@
|
|
|
109
110
|
"pdf-parse": "^1.1.1",
|
|
110
111
|
"playwright": "^1.54.2",
|
|
111
112
|
"robots-parser": "^3.0.1",
|
|
113
|
+
"turndown": "^7.2.4",
|
|
112
114
|
"winston": "^3.11.0",
|
|
113
115
|
"zod": "^3.23.8"
|
|
114
116
|
},
|
package/server.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
export { isCreatorModeVerified } from './src/core/creatorMode.js';
|
|
6
6
|
|
|
7
7
|
// Import everything else
|
|
8
|
-
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
8
|
+
import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
9
9
|
import { z } from "zod";
|
|
10
10
|
import { logger } from "./src/utils/Logger.js";
|
|
11
11
|
import { SearchWebTool } from "./src/tools/search/searchWeb.js";
|
|
@@ -17,11 +17,13 @@ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
|
|
|
17
17
|
import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
|
|
18
18
|
import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
|
|
19
19
|
import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
|
|
20
|
+
import { ListOllamaModelsTool } from "./src/tools/extract/listOllamaModels.js";
|
|
20
21
|
import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
|
|
21
22
|
import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
|
|
22
23
|
import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
|
|
23
24
|
import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
|
|
24
25
|
import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
|
|
26
|
+
import { ScrapeTemplateTool } from "./src/tools/templates/ScrapeTemplateTool.js"; // D3.3
|
|
25
27
|
import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
|
|
26
28
|
import { LocalizationManager } from "./src/core/LocalizationManager.js";
|
|
27
29
|
import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
|
|
@@ -42,6 +44,10 @@ import { extractTextHandler } from "./src/tools/basic/extractText.js";
|
|
|
42
44
|
import { extractLinksHandler } from "./src/tools/basic/extractLinks.js";
|
|
43
45
|
import { extractMetadataHandler } from "./src/tools/basic/extractMetadata.js";
|
|
44
46
|
import { scrapeStructuredHandler } from "./src/tools/basic/scrapeStructured.js";
|
|
47
|
+
// D1.1 Resources + D1.2 Prompts + D1.4 Elicitation
|
|
48
|
+
import { ResourceRegistry } from "./src/resources/ResourceRegistry.js";
|
|
49
|
+
import { PROMPTS, getPromptMessages } from "./src/prompts/PromptRegistry.js";
|
|
50
|
+
import { ElicitationHelper } from "./src/core/ElicitationHelper.js";
|
|
45
51
|
|
|
46
52
|
// Initialize Authentication Manager
|
|
47
53
|
await AuthManager.initialize();
|
|
@@ -89,8 +95,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
89
95
|
// Create the server
|
|
90
96
|
const server = new McpServer({
|
|
91
97
|
name: "crawlforge",
|
|
92
|
-
version: "
|
|
93
|
-
description: "Production-ready MCP server with
|
|
98
|
+
version: "4.2.1",
|
|
99
|
+
description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
|
|
94
100
|
homepage: "https://www.crawlforge.dev",
|
|
95
101
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
96
102
|
});
|
|
@@ -104,7 +110,7 @@ server.prompt("getting-started", {
|
|
|
104
110
|
role: "user",
|
|
105
111
|
content: {
|
|
106
112
|
type: "text",
|
|
107
|
-
text: "You have access to CrawlForge MCP with
|
|
113
|
+
text: "You have access to CrawlForge MCP with 22 web scraping tools. Key tools:\n\n" +
|
|
108
114
|
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
109
115
|
"- extract_text: Extract clean text from a webpage\n" +
|
|
110
116
|
"- extract_content: Smart content extraction with readability\n" +
|
|
@@ -116,7 +122,8 @@ server.prompt("getting-started", {
|
|
|
116
122
|
"- deep_research: Multi-source research on any topic\n" +
|
|
117
123
|
"- stealth_mode: Anti-detection browsing for protected sites\n" +
|
|
118
124
|
"- extract_structured: LLM-powered structured data extraction\n" +
|
|
119
|
-
"- extract_with_llm: Natural-language extraction
|
|
125
|
+
"- extract_with_llm: Natural-language extraction — defaults to local Ollama (no API key); openai/anthropic available with key\n" +
|
|
126
|
+
"- list_ollama_models: List installed Ollama models so you can pick one for extract_with_llm\n" +
|
|
120
127
|
"- track_changes: Monitor website changes over time\n" +
|
|
121
128
|
"- generate_llms_txt: Generate llms.txt for any website\n\n" +
|
|
122
129
|
"Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
|
|
@@ -146,19 +153,105 @@ const summarizeContentTool = new SummarizeContentTool();
|
|
|
146
153
|
const analyzeContentTool = new AnalyzeContentTool();
|
|
147
154
|
const extractStructuredTool = new ExtractStructuredTool();
|
|
148
155
|
const extractWithLlmTool = new ExtractWithLlm();
|
|
156
|
+
const listOllamaModelsTool = new ListOllamaModelsTool();
|
|
149
157
|
const batchScrapeTool = new BatchScrapeTool();
|
|
150
158
|
const scrapeWithActionsTool = new ScrapeWithActionsTool();
|
|
151
159
|
const deepResearchTool = new DeepResearchTool();
|
|
152
160
|
const trackChangesTool = new TrackChangesTool();
|
|
153
161
|
const generateLLMsTxtTool = new GenerateLLMsTxtTool();
|
|
162
|
+
const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
|
|
154
163
|
const stealthBrowserManager = new StealthBrowserManager();
|
|
155
164
|
const localizationManager = new LocalizationManager();
|
|
156
165
|
|
|
166
|
+
// D1.1: Resource Registry (wired to existing singletons)
|
|
167
|
+
const resourceRegistry = new ResourceRegistry({
|
|
168
|
+
researchOrchestrator: deepResearchTool, // exposes activeSessions
|
|
169
|
+
snapshotManager: null, // SnapshotManager not directly instantiated in server.js
|
|
170
|
+
jobManager: batchScrapeTool.jobManager,
|
|
171
|
+
mapSiteTool,
|
|
172
|
+
scrapeWithActionsTool,
|
|
173
|
+
});
|
|
174
|
+
|
|
175
|
+
// D1.4: Elicitation helper (client may not support — fails open)
|
|
176
|
+
const elicitation = new ElicitationHelper({ mcpServer: server, logger });
|
|
177
|
+
|
|
178
|
+
// D1.4: Wire elicitation into tools and AuthManager
|
|
179
|
+
deepResearchTool.setMcpServer(server);
|
|
180
|
+
batchScrapeTool.setMcpServer(server);
|
|
181
|
+
crawlDeepTool.setMcpServer(server);
|
|
182
|
+
extractStructuredTool.setMcpServer(server);
|
|
183
|
+
AuthManager.setElicitation(elicitation);
|
|
184
|
+
|
|
185
|
+
// ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
|
|
186
|
+
// Resources use the MCP ResourceTemplate URI pattern for dynamic crawlforge:// URIs.
|
|
187
|
+
// The registry is populated at runtime as tools produce artifacts.
|
|
188
|
+
|
|
189
|
+
// Research sessions: crawlforge://research/{sessionId}
|
|
190
|
+
server.resource(
|
|
191
|
+
"crawlforge-research",
|
|
192
|
+
new ResourceTemplate("crawlforge://research/{sessionId}", {
|
|
193
|
+
list: async () => ({
|
|
194
|
+
resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://research/"))
|
|
195
|
+
})
|
|
196
|
+
}),
|
|
197
|
+
{ description: "Completed deep_research report stored in the server session" },
|
|
198
|
+
async (uri) => resourceRegistry.readResource(uri)
|
|
199
|
+
);
|
|
200
|
+
|
|
201
|
+
// Job results: crawlforge://job/{jobId}
|
|
202
|
+
server.resource(
|
|
203
|
+
"crawlforge-job",
|
|
204
|
+
new ResourceTemplate("crawlforge://job/{jobId}", {
|
|
205
|
+
list: async () => ({
|
|
206
|
+
resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://job/"))
|
|
207
|
+
})
|
|
208
|
+
}),
|
|
209
|
+
{ description: "Completed batch_scrape job result" },
|
|
210
|
+
async (uri) => resourceRegistry.readResource(uri)
|
|
211
|
+
);
|
|
212
|
+
|
|
213
|
+
// Crawl sitemaps: crawlforge://crawl/{sessionId}/sitemap
|
|
214
|
+
server.resource(
|
|
215
|
+
"crawlforge-crawl-sitemap",
|
|
216
|
+
new ResourceTemplate("crawlforge://crawl/{sessionId}/sitemap", {
|
|
217
|
+
list: async () => ({
|
|
218
|
+
resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://crawl/"))
|
|
219
|
+
})
|
|
220
|
+
}),
|
|
221
|
+
{ description: "map_site output stored for a crawl session" },
|
|
222
|
+
async (uri) => resourceRegistry.readResource(uri)
|
|
223
|
+
);
|
|
224
|
+
|
|
225
|
+
// Screenshots: crawlforge://screenshot/{actionId}
|
|
226
|
+
server.resource(
|
|
227
|
+
"crawlforge-screenshot",
|
|
228
|
+
new ResourceTemplate("crawlforge://screenshot/{actionId}", {
|
|
229
|
+
list: async () => ({
|
|
230
|
+
resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://screenshot/"))
|
|
231
|
+
})
|
|
232
|
+
}),
|
|
233
|
+
{ description: "Screenshot from scrape_with_actions" },
|
|
234
|
+
async (uri) => resourceRegistry.readResource(uri)
|
|
235
|
+
);
|
|
236
|
+
|
|
237
|
+
// ─── D1.2 Prompts (workflow templates) ────────────────────────────────────────
|
|
238
|
+
// Register the 5 CrawlForge workflow prompts from PromptRegistry.
|
|
239
|
+
|
|
240
|
+
for (const p of PROMPTS) {
|
|
241
|
+
const argsShape = {};
|
|
242
|
+
for (const arg of p.arguments) {
|
|
243
|
+
argsShape[arg.name] = z.string().optional().describe(arg.description);
|
|
244
|
+
}
|
|
245
|
+
server.registerPrompt(p.name, { description: p.description, argsSchema: argsShape }, async (args) => {
|
|
246
|
+
return getPromptMessages(p.name, args || {});
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
|
|
157
250
|
// ─── Tool registrations ────────────────────────────────────────────────────────
|
|
158
251
|
|
|
159
252
|
// Tool: fetch_url
|
|
160
253
|
server.registerTool("fetch_url", {
|
|
161
|
-
description: "
|
|
254
|
+
description: "Use this when you need raw HTTP content from a URL — HTML, JSON, XML, or plain text. Ideal as the first step before extract_text or extract_content. Supports custom headers (e.g. auth tokens) and configurable timeout. Example: fetch_url({url: \"https://example.com\", timeout: 15000})",
|
|
162
255
|
annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
163
256
|
inputSchema: {
|
|
164
257
|
url: z.string().url().describe("The URL to fetch content from"),
|
|
@@ -169,18 +262,19 @@ server.registerTool("fetch_url", {
|
|
|
169
262
|
|
|
170
263
|
// Tool: extract_text
|
|
171
264
|
server.registerTool("extract_text", {
|
|
172
|
-
description: "
|
|
265
|
+
description: "Use this when you need a page's human-readable text or markdown stripped of HTML tags, scripts, and styles — e.g. for keyword search, summarization, RAG ingestion, or NLP. Use output_format:\"markdown\" for RAG workflows. Faster than extract_content but returns unstructured content. Example: extract_text({url: \"https://example.com/article\", output_format:\"markdown\"})",
|
|
173
266
|
annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
174
267
|
inputSchema: {
|
|
175
268
|
url: z.string().url().describe("The URL to extract text from"),
|
|
176
269
|
remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
|
|
177
|
-
remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
|
|
270
|
+
remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction"),
|
|
271
|
+
output_format: z.enum(["text", "markdown"]).optional().default("text").describe("Output format: \"text\" (default) or \"markdown\" — use markdown for RAG workflows")
|
|
178
272
|
}
|
|
179
273
|
}, withAuth("extract_text", extractTextHandler));
|
|
180
274
|
|
|
181
275
|
// Tool: extract_links
|
|
182
276
|
server.registerTool("extract_links", {
|
|
183
|
-
description: "
|
|
277
|
+
description: "Use this when you need to discover all hyperlinks on a page — e.g. to build a crawl seed list, audit broken links, or find related resources. Use filter_external:true to get only outbound links. Example: extract_links({url: \"https://example.com\", filter_external: true})",
|
|
184
278
|
annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
185
279
|
inputSchema: {
|
|
186
280
|
url: z.string().url().describe("The URL to extract links from"),
|
|
@@ -191,7 +285,7 @@ server.registerTool("extract_links", {
|
|
|
191
285
|
|
|
192
286
|
// Tool: extract_metadata
|
|
193
287
|
server.registerTool("extract_metadata", {
|
|
194
|
-
description: "
|
|
288
|
+
description: "Use this when you need a page's SEO metadata: title, meta description, Open Graph tags, canonical URL, schema.org data. Ideal for site audits and competitive SEO analysis. Example: extract_metadata({url: \"https://example.com\"})",
|
|
195
289
|
annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
196
290
|
inputSchema: {
|
|
197
291
|
url: z.string().url().describe("The URL to extract metadata from")
|
|
@@ -200,7 +294,7 @@ server.registerTool("extract_metadata", {
|
|
|
200
294
|
|
|
201
295
|
// Tool: scrape_structured
|
|
202
296
|
server.registerTool("scrape_structured", {
|
|
203
|
-
description: "
|
|
297
|
+
description: "Use this when you know the exact CSS selectors for the data you want — e.g. scraping a pricing table or product list with consistent markup. More reliable than LLM extraction for well-structured pages. Example: scrape_structured({url: \"https://shop.com/products\", selectors: {price: \".price\", name: \".product-title\"}})",
|
|
204
298
|
annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
205
299
|
inputSchema: {
|
|
206
300
|
url: z.string().url().describe("The URL to scrape"),
|
|
@@ -210,7 +304,7 @@ server.registerTool("scrape_structured", {
|
|
|
210
304
|
|
|
211
305
|
// Tool: search_web
|
|
212
306
|
server.registerTool("search_web", {
|
|
213
|
-
description: "
|
|
307
|
+
description: "Use this when you need web search results for a query — returns titles, URLs, snippets, and optional metadata. Supports language, date range, and site filters. Start research workflows here before using fetch_url or deep_research. Example: search_web({query: \"best MCP servers 2025\", limit: 10, time_range: \"month\"})",
|
|
214
308
|
annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
215
309
|
inputSchema: {
|
|
216
310
|
query: z.string().describe("Search query string"),
|
|
@@ -236,7 +330,7 @@ server.registerTool("search_web", {
|
|
|
236
330
|
|
|
237
331
|
// Tool: crawl_deep
|
|
238
332
|
server.registerTool("crawl_deep", {
|
|
239
|
-
description: "
|
|
333
|
+
description: "Use this when you need to discover and optionally extract content from many pages within a site — e.g. building a knowledge base, indexing docs, or auditing all pages. Use map_site first to estimate scope, then crawl_deep for content. Example: crawl_deep({url: \"https://docs.example.com\", max_depth: 3, max_pages: 200, extract_content: true})",
|
|
240
334
|
annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
241
335
|
inputSchema: {
|
|
242
336
|
url: z.string().url().describe("Starting URL for the crawl"),
|
|
@@ -263,7 +357,7 @@ server.registerTool("crawl_deep", {
|
|
|
263
357
|
|
|
264
358
|
// Tool: map_site
|
|
265
359
|
server.registerTool("map_site", {
|
|
266
|
-
description: "
|
|
360
|
+
description: "Use this when you need to know all URLs on a domain without fetching full page content — e.g. before a crawl_deep, for a site audit, or to find specific section URLs. Reads sitemap.xml when available. Example: map_site({url: \"https://example.com\", include_sitemap: true, max_urls: 500})",
|
|
267
361
|
annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
268
362
|
inputSchema: {
|
|
269
363
|
url: z.string().url().describe("The website URL to map"),
|
|
@@ -286,7 +380,7 @@ server.registerTool("map_site", {
|
|
|
286
380
|
|
|
287
381
|
// Tool: extract_content
|
|
288
382
|
server.registerTool("extract_content", {
|
|
289
|
-
description: "
|
|
383
|
+
description: "Use this when you need a clean, readable version of a web article or page — removes ads, nav, footers, and boilerplate. Ideal for RAG ingestion, summarization, or LLM context. Prefer this over extract_text for article-style pages. Example: extract_content({url: \"https://blog.example.com/post-title\"})",
|
|
290
384
|
annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
291
385
|
inputSchema: {
|
|
292
386
|
url: z.string().url().describe("The URL to extract content from"),
|
|
@@ -306,7 +400,7 @@ server.registerTool("extract_content", {
|
|
|
306
400
|
|
|
307
401
|
// Tool: process_document
|
|
308
402
|
server.registerTool("process_document", {
|
|
309
|
-
description: "
|
|
403
|
+
description: "Use this when you need to extract text from a PDF URL or file — e.g. research papers, contracts, reports. Also handles HTML URLs. Returns structured sections, metadata, and word count. Example: process_document({source: \"https://example.com/report.pdf\", sourceType: \"pdf_url\"})",
|
|
310
404
|
annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
311
405
|
inputSchema: {
|
|
312
406
|
source: z.string().describe("Document source - URL or file path"),
|
|
@@ -327,7 +421,7 @@ server.registerTool("process_document", {
|
|
|
327
421
|
|
|
328
422
|
// Tool: summarize_content
|
|
329
423
|
server.registerTool("summarize_content", {
|
|
330
|
-
description: "
|
|
424
|
+
description: "Use this when you have text content (from extract_text or extract_content) and need a condensed version — e.g. for briefings, comparison tables, or LLM context reduction. Supports extractive (sentence selection) and abstractive (rewrite via Ollama/sampling) modes. Example: summarize_content({text: \"..long article..\", options: {summaryLength: \"short\", summaryType: \"abstractive\"}})",
|
|
331
425
|
annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
332
426
|
inputSchema: {
|
|
333
427
|
text: z.string().describe("The text content to summarize"),
|
|
@@ -347,7 +441,7 @@ server.registerTool("summarize_content", {
|
|
|
347
441
|
|
|
348
442
|
// Tool: analyze_content
|
|
349
443
|
server.registerTool("analyze_content", {
|
|
350
|
-
description: "
|
|
444
|
+
description: "Use this when you need NLP metrics for text — language detection, sentiment, topic extraction, entity recognition, readability score. Good for content auditing and classification. Example: analyze_content({text: \"..article text..\", options: {extractTopics: true, includeSentiment: true}})",
|
|
351
445
|
annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
352
446
|
inputSchema: {
|
|
353
447
|
text: z.string().describe("The text content to analyze"),
|
|
@@ -367,7 +461,7 @@ server.registerTool("analyze_content", {
|
|
|
367
461
|
|
|
368
462
|
// Tool: extract_structured
|
|
369
463
|
server.registerTool("extract_structured", {
|
|
370
|
-
description: "
|
|
464
|
+
description: "Use this when you need a specific data shape extracted from a page using a JSON schema — e.g. product details, job listings, event data. Uses LLM by default; falls back to CSS selectors when no LLM is configured. Example: extract_structured({url: \"https://jobs.example.com/post/123\", schema: {properties: {title: {type:\"string\"}, salary: {type:\"string\"}}, required:[\"title\"]}})",
|
|
371
465
|
annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
372
466
|
inputSchema: {
|
|
373
467
|
url: z.string().url().describe("The URL to extract structured data from"),
|
|
@@ -395,15 +489,15 @@ server.registerTool("extract_structured", {
|
|
|
395
489
|
|
|
396
490
|
// Tool: extract_with_llm
|
|
397
491
|
server.registerTool("extract_with_llm", {
|
|
398
|
-
description: "Extract structured data from a URL or text using a natural-language prompt.
|
|
492
|
+
description: "Extract structured data from a URL or text using a natural-language prompt. Defaults to a local Ollama model (http://localhost:11434, no API key required) — call list_ollama_models first to see what's installed and pass the name via the `model` parameter. Pass provider: \"openai\" or \"anthropic\" with the matching API key to use a cloud model instead.",
|
|
399
493
|
annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
400
494
|
inputSchema: {
|
|
401
495
|
url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
|
|
402
496
|
content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
|
|
403
497
|
prompt: z.string().describe("Natural-language extraction instruction"),
|
|
404
498
|
schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
|
|
405
|
-
provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider.
|
|
406
|
-
model: z.string().optional().describe("Override
|
|
499
|
+
provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Defaults to 'ollama' (local, no key, http://localhost:11434). Use 'openai' or 'anthropic' for cloud models (requires the matching API key)."),
|
|
500
|
+
model: z.string().optional().describe("Override the model. For ollama, pass a name returned by list_ollama_models (e.g. 'llama3.2', 'qwen2.5:7b'). Defaults: openai='gpt-4o-mini', anthropic='claude-haiku-4-5-20251001', ollama='llama3.2' or $OLLAMA_DEFAULT_MODEL."),
|
|
407
501
|
maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
|
|
408
502
|
}
|
|
409
503
|
}, withAuth("extract_with_llm", async (params) => {
|
|
@@ -415,9 +509,26 @@ server.registerTool("extract_with_llm", {
|
|
|
415
509
|
}
|
|
416
510
|
}));
|
|
417
511
|
|
|
512
|
+
// Tool: list_ollama_models
|
|
513
|
+
server.registerTool("list_ollama_models", {
|
|
514
|
+
description: "List the Ollama models installed locally on this machine. Use this to discover which `model` values you can pass to extract_with_llm. Requires Ollama running on http://localhost:11434 (or $OLLAMA_BASE_URL).",
|
|
515
|
+
annotations: { title: "List Ollama Models", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
516
|
+
inputSchema: {}
|
|
517
|
+
}, withAuth("list_ollama_models", async () => {
|
|
518
|
+
try {
|
|
519
|
+
const result = await listOllamaModelsTool.execute();
|
|
520
|
+
return {
|
|
521
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
522
|
+
isError: !result.success
|
|
523
|
+
};
|
|
524
|
+
} catch (error) {
|
|
525
|
+
return { content: [{ type: "text", text: `Listing Ollama models failed: ${error.message}` }], isError: true };
|
|
526
|
+
}
|
|
527
|
+
}));
|
|
528
|
+
|
|
418
529
|
// Tool: batch_scrape
|
|
419
530
|
server.registerTool("batch_scrape", {
|
|
420
|
-
description: "
|
|
531
|
+
description: "Use this when you need to scrape 2–50 URLs in parallel — e.g. batch-collecting product pages, news articles, or competitor pages. Use mode:\"async\" with a webhook for large batches; mode:\"sync\" for up to ~25 URLs when you need results immediately. Example: batch_scrape({urls: [\"https://a.com\",\"https://b.com\"], formats: [\"json\"], maxConcurrency: 5})",
|
|
421
532
|
annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
422
533
|
inputSchema: {
|
|
423
534
|
urls: z.array(z.union([
|
|
@@ -462,7 +573,7 @@ server.registerTool("batch_scrape", {
|
|
|
462
573
|
|
|
463
574
|
// Tool: scrape_with_actions
|
|
464
575
|
server.registerTool("scrape_with_actions", {
|
|
465
|
-
description: "
|
|
576
|
+
description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
|
|
466
577
|
annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
467
578
|
inputSchema: {
|
|
468
579
|
url: z.string().url().describe("The URL to scrape"),
|
|
@@ -518,7 +629,7 @@ server.registerTool("scrape_with_actions", {
|
|
|
518
629
|
|
|
519
630
|
// Tool: deep_research
|
|
520
631
|
server.registerTool("deep_research", {
|
|
521
|
-
description: "
|
|
632
|
+
description: "Use this when you need exhaustive multi-source research on a topic — it searches the web, fetches and analyses sources, detects conflicts, and (when LLM keys or Ollama are configured) synthesizes a report. Best for complex questions needing 10+ sources. Will request confirmation (elicitation) if maxUrls > 50. Results are stored as crawlforge://research/{sessionId} resources. Example: deep_research({topic: \"quantum computing NISQ devices 2025\", maxUrls: 30, researchApproach: \"academic\"})",
|
|
522
633
|
annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
523
634
|
inputSchema: {
|
|
524
635
|
topic: z.string().min(3).max(500).describe("Research topic or question"),
|
|
@@ -574,7 +685,7 @@ server.registerTool("deep_research", {
|
|
|
574
685
|
|
|
575
686
|
// Tool: track_changes
|
|
576
687
|
server.registerTool("track_changes", {
|
|
577
|
-
description: "
|
|
688
|
+
description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
|
|
578
689
|
annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
579
690
|
inputSchema: {
|
|
580
691
|
url: z.string().url().describe("The URL to track changes for"),
|
|
@@ -679,7 +790,7 @@ server.registerTool("track_changes", {
|
|
|
679
790
|
|
|
680
791
|
// Tool: generate_llms_txt
|
|
681
792
|
server.registerTool("generate_llms_txt", {
|
|
682
|
-
description: "
|
|
793
|
+
description: "Use this when you need to generate an llms.txt file for a website — the standard that tells AI models how to interact with a site's content. Useful for site owners preparing for AI discoverability, or for understanding a site's AI access policy. Example: generate_llms_txt({url: \"https://example.com\"})",
|
|
683
794
|
annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
684
795
|
inputSchema: {
|
|
685
796
|
url: z.string().url().describe("The website URL to generate llms.txt for"),
|
|
@@ -713,7 +824,7 @@ server.registerTool("generate_llms_txt", {
|
|
|
713
824
|
|
|
714
825
|
// Tool: stealth_mode
|
|
715
826
|
server.registerTool("stealth_mode", {
|
|
716
|
-
description: "
|
|
827
|
+
description: "Use this when a site blocks normal scraping — Cloudflare, Datadome, or other bot-detection systems. Manages a Playwright browser with randomized fingerprints, human behavior simulation, WebRTC/canvas spoofing. Start with operation:\"create_context\" then use the contextId. Example: stealth_mode({operation:\"create_context\", stealthConfig:{level:\"advanced\", simulateHumanBehavior:true}})",
|
|
717
828
|
annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
718
829
|
inputSchema: {
|
|
719
830
|
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
|
|
@@ -755,6 +866,7 @@ server.registerTool("stealth_mode", {
|
|
|
755
866
|
hardwareSpoofing: z.boolean().default(true)
|
|
756
867
|
}).optional()
|
|
757
868
|
}).optional().describe("Stealth browser configuration with anti-detection settings"),
|
|
869
|
+
engine: z.enum(["playwright", "camoufox"]).optional().default("playwright").describe("Browser engine: \"playwright\" (Chromium, default) or \"camoufox\" (Firefox-based, higher anti-detect score — install with npm install camoufox)"),
|
|
758
870
|
contextId: z.string().optional().describe("Browser context ID for page operations"),
|
|
759
871
|
urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
|
|
760
872
|
}
|
|
@@ -807,7 +919,7 @@ server.registerTool("stealth_mode", {
|
|
|
807
919
|
|
|
808
920
|
// Tool: localization
|
|
809
921
|
server.registerTool("localization", {
|
|
810
|
-
description: "
|
|
922
|
+
description: "Use this when you need to scrape geo-restricted content or emulate a specific locale/timezone — e.g. seeing region-specific pricing, bypassing geo-blocks, or searching in another language. Use operation:\"configure_country\" to set country context. Example: localization({operation:\"configure_country\", countryCode:\"DE\", language:\"de\"})",
|
|
811
923
|
annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
812
924
|
inputSchema: {
|
|
813
925
|
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
|
|
@@ -911,6 +1023,25 @@ server.registerTool("localization", {
|
|
|
911
1023
|
}
|
|
912
1024
|
}));
|
|
913
1025
|
|
|
1026
|
+
|
|
1027
|
+
// Tool: scrape_template (D3.3 — pre-built site templates)
// Forwards the validated params to scrapeTemplateTool.execute() and returns the
// result as pretty-printed JSON text. Thrown errors become an error result.
server.registerTool("scrape_template", {
  description: "Use this when you want structured data from a well-known site without writing custom selectors. Pass template:\"list\" to see all available templates. Supports: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package. Example: scrape_template({template:\"github-repo\", url:\"https://github.com/user/repo\"})",
  annotations: { title: "Scrape Template", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
  inputSchema: {
    template: z.string().describe("Template ID (e.g. github-repo) or list to enumerate available templates"),
    url: z.string().url().optional().describe("URL to scrape — required unless template is list"),
    timeout: z.number().min(5000).max(60000).optional().default(15000).describe("Request timeout in milliseconds")
  }
}, withAuth("scrape_template", async (params) => {
  try {
    const result = await scrapeTemplateTool.execute(params);
    return {
      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
      // Flag the call as failed only when the tool explicitly reports
      // `success: false`; results without that field stay non-errors, as before.
      // NOTE(review): assumes the tool signals soft failures via `success`,
      // like the list_ollama_models handler expects — confirm against
      // scrapeTemplateTool.
      isError: result?.success === false
    };
  } catch (error) {
    return { content: [{ type: "text", text: `Template scrape failed: ${error.message}` }], isError: true };
  }
}));
|
|
1044
|
+
|
|
914
1045
|
// ─── Transport + startup ───────────────────────────────────────────────────────
|
|
915
1046
|
|
|
916
1047
|
const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
|
|
@@ -918,7 +1049,9 @@ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAW
|
|
|
918
1049
|
|
|
919
1050
|
async function runServer() {
|
|
920
1051
|
if (useHttp) {
|
|
921
|
-
|
|
1052
|
+
// Default to 10000 to match Render's default port-scan target and the
|
|
1053
|
+
// Dockerfile `EXPOSE 10000`. Most PaaS providers inject $PORT — we honor it.
|
|
1054
|
+
const port = parseInt(process.env.PORT || '10000', 10);
|
|
922
1055
|
|
|
923
1056
|
if (useLegacyHttp) {
|
|
924
1057
|
// One-release deprecation window for stateless legacy transport.
|
|
@@ -958,9 +1091,10 @@ async function runServer() {
|
|
|
958
1091
|
"extract_content", "process_document", "summarize_content", "analyze_content",
|
|
959
1092
|
"batch_scrape", "scrape_with_actions",
|
|
960
1093
|
"deep_research", "track_changes", "generate_llms_txt",
|
|
961
|
-
"stealth_mode", "localization", "extract_structured", "extract_with_llm"
|
|
1094
|
+
"stealth_mode", "localization", "extract_structured", "extract_with_llm",
|
|
1095
|
+
"scrape_template" // D3.3
|
|
962
1096
|
];
|
|
963
|
-
console.error(`Tools available: ${allTools.join(
|
|
1097
|
+
console.error(`Tools available (23): ${allTools.join(", ")}`);
|
|
964
1098
|
|
|
965
1099
|
// Start memory monitoring in development
|
|
966
1100
|
if (config.server.nodeEnv === "development") {
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* actions command — run browser automation actions from a script file.
|
|
3
|
+
*/
|
|
4
|
+
import { ScrapeWithActionsTool } from '../../tools/advanced/ScrapeWithActionsTool.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
import { readFileSync } from 'node:fs';
|
|
8
|
+
|
|
9
|
+
/**
 * Register the `actions <url>` subcommand on the CLI program.
 *
 * The subcommand validates `--wait`, loads the JSON action script named by
 * `--script`, and runs it against the URL through ScrapeWithActionsTool via
 * runTool. An invalid `--wait` value or an unreadable/unparseable script file
 * writes a message to stderr and exits the process with status 1.
 *
 * @param {object} program - Commander-style program exposing a chainable
 *   `command()` / `option()` / `action()` API.
 * @returns {void}
 */
export function register(program) {
  program
    .command('actions <url>')
    .description('Run browser automation actions against a URL')
    .requiredOption('--script <file>', 'JSON file containing action script')
    .option('--screenshot', 'Capture screenshot after actions')
    .option('--wait <ms>', 'Wait time between actions in milliseconds', '500')
    .action(async (url, opts, cmd) => {
      const globals = cmd.parent.opts();
      const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };

      // Validate --wait before doing any work: parseInt yields NaN for
      // non-numeric input, which would otherwise be forwarded to the tool
      // as wait_between_actions.
      const waitMs = Number.parseInt(opts.wait, 10);
      if (Number.isNaN(waitMs) || waitMs < 0) {
        process.stderr.write(`Invalid --wait value "${opts.wait}": expected a non-negative number of milliseconds\n`);
        process.exit(1);
      }

      let actions;
      try {
        actions = JSON.parse(readFileSync(opts.script, 'utf8'));
      } catch (e) {
        // Covers both a missing/unreadable file and malformed JSON.
        process.stderr.write(`Error reading script file: ${e.message}\n`);
        process.exit(1);
      }

      const tool = new ScrapeWithActionsTool(getToolConfig('scrape_with_actions'));
      await runTool(tool, {
        url,
        actions,
        screenshot: !!opts.screenshot,
        wait_between_actions: waitMs
      }, cliFlags);
    });
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* analyze command — analyze content of a URL.
|
|
3
|
+
*/
|
|
4
|
+
import { AnalyzeContentTool } from '../../tools/extract/analyzeContent.js';
|
|
5
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
6
|
+
import { runTool } from '../lib/runTool.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Attach the `analyze <url>` subcommand to the given CLI program.
 *
 * When invoked, the subcommand builds an AnalyzeContentTool from the
 * 'analyze_content' tool config and hands it to runTool together with the
 * URL, the requested `--depth`, and the parent command's json/pretty/quiet
 * output flags.
 *
 * @param {object} program - Commander-style program exposing a chainable
 *   `command()` / `option()` / `action()` API.
 * @returns {void}
 */
export function register(program) {
  const handleAnalyze = async (url, opts, cmd) => {
    // Output flags are defined on the parent (root) command.
    const { json, pretty, quiet } = cmd.parent.opts();
    const analyzer = new AnalyzeContentTool(getToolConfig('analyze_content'));
    await runTool(analyzer, { url, analysis_depth: opts.depth }, { json, pretty, quiet });
  };

  program
    .command('analyze <url>')
    .description('Analyze content of a URL (sentiment, entities, readability)')
    .option('--depth <level>', 'Analysis depth: basic or full', 'basic')
    .action(handleAnalyze);
}
|