crawlforge-mcp-server 4.5.0 → 4.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +19 -7
- package/README.md +11 -3
- package/package.json +2 -2
- package/server.js +63 -8
- package/src/cli/commands/init.js +107 -0
- package/src/cli/index.js +2 -0
- package/src/core/AgentOrchestrator.js +302 -0
- package/src/core/AuthManager.js +21 -1
- package/src/tools/agent/agent.js +71 -0
- package/src/tools/basic/extractText.js +23 -11
- package/src/tools/crawl/mapSite.js +29 -1
- package/src/tools/scrape/unifiedScrape.js +314 -0
package/CLAUDE.md
CHANGED
|
@@ -60,9 +60,9 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
|
|
|
60
60
|
|
|
61
61
|
## Project Overview
|
|
62
62
|
|
|
63
|
-
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing
|
|
63
|
+
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 26 web scraping, crawling, and content processing tools (5 inline + 21 advanced).
|
|
64
64
|
|
|
65
|
-
**Current Version:** 4.
|
|
65
|
+
**Current Version:** 4.6.1
|
|
66
66
|
|
|
67
67
|
## Development Commands
|
|
68
68
|
|
|
@@ -92,8 +92,10 @@ npm run dev
|
|
|
92
92
|
# Test MCP protocol compliance
|
|
93
93
|
npm test
|
|
94
94
|
|
|
95
|
-
# Unit tests (
|
|
95
|
+
# Unit tests (400+ tests across tests/unit/, no live network)
|
|
96
96
|
npm run test:unit
|
|
97
|
+
# Phase D regressions live in tests/unit/phaseD-regressions.test.js (agent hard stops, unified scrape, map_site ranking)
|
|
98
|
+
# Run a single test file: node --test tests/unit/phaseD-regressions.test.js
|
|
97
99
|
# Note: add --test-force-exit if the run appears to hang at the end — importing
|
|
98
100
|
# StealthBrowserManager (d2-reliability.test.js) leaves a Playwright handle that
|
|
99
101
|
# otherwise delays process exit ~100s. Tests themselves pass either way.
|
|
@@ -109,7 +111,9 @@ node test-real-world.js # Test real-world usage scenarios
|
|
|
109
111
|
node tests/integration/mcp-protocol-compliance.test.js
|
|
110
112
|
|
|
111
113
|
# CLI (v4.1.0+, requires global install or npx)
|
|
112
|
-
crawlforge --help # Show all
|
|
114
|
+
crawlforge --help # Show all subcommands
|
|
115
|
+
crawlforge init # API-key detection + skill install + idempotent MCP-stanza merge (v4.6.0)
|
|
116
|
+
crawlforge init --all --yes # Merge MCP config into Claude Code / Desktop / Cursor non-interactively
|
|
113
117
|
crawlforge scrape https://example.com
|
|
114
118
|
crawlforge batch --urls urls.txt --format markdown
|
|
115
119
|
crawlforge install-skills --target claude-code
|
|
@@ -140,6 +144,7 @@ npm run docker:prod # Run production container
|
|
|
140
144
|
- **WebhookDispatcher**: Event notification system for job completion callbacks
|
|
141
145
|
- **ActionExecutor**: Browser automation engine (Playwright-based)
|
|
142
146
|
- **ResearchOrchestrator**: Multi-stage research with query expansion and synthesis
|
|
147
|
+
- **AgentOrchestrator**: Powers the `agent` tool — NL prompt → autonomous PLAN→GATHER→ACT→DECIDE→SHAPE loop with three orchestrator-enforced hard stops (maxSteps≤10, maxUrls≤20, wall-clock) never delegated to the LLM; degraded no-LLM-key path (D2, v4.6.0)
|
|
143
148
|
- **StealthBrowserManager**: Stealth mode scraping with anti-detection; Camoufox (Firefox) engine added in v4.0.0
|
|
144
149
|
- **LocalizationManager**: Multi-language content and localization
|
|
145
150
|
- **ChangeTracker**: Content change tracking over time
|
|
@@ -155,7 +160,9 @@ npm run docker:prod # Run production container
|
|
|
155
160
|
Tools are organized in subdirectories by category:
|
|
156
161
|
|
|
157
162
|
- `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
|
|
163
|
+
- `agent/` - agent (AgentOrchestrator-driven autonomous tool, v4.6.0)
|
|
158
164
|
- `basic/` - fetchUrl, extractText, extractLinks, extractMetadata, scrapeStructured
|
|
165
|
+
- `scrape/` - unifiedScrape (single-fetch multi-format `scrape` tool, v4.6.0)
|
|
159
166
|
- `crawl/` - crawlDeep, mapSite
|
|
160
167
|
- `extract/` - analyzeContent, extractContent, extractStructured, extractWithLlm, listOllamaModels, processDocument, summarizeContent
|
|
161
168
|
- `research/` - deepResearch
|
|
@@ -164,13 +171,18 @@ Tools are organized in subdirectories by category:
|
|
|
164
171
|
- `tracking/` - trackChanges
|
|
165
172
|
- `llmstxt/` - generateLLMsTxt
|
|
166
173
|
|
|
167
|
-
### Available MCP Tools (
|
|
174
|
+
### Available MCP Tools (26 total)
|
|
168
175
|
|
|
169
176
|
**Basic Tools (server.js inline, 5):**
|
|
170
177
|
fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
|
|
171
178
|
|
|
172
|
-
**Advanced Tools (
|
|
173
|
-
search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template
|
|
179
|
+
**Advanced Tools (21):**
|
|
180
|
+
search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, get_batch_results, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template, scrape, agent
|
|
181
|
+
|
|
182
|
+
**v4.6.0 additions (Phase D):**
|
|
183
|
+
- `scrape` — single fetch + one cheerio load dispatching a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) + `onlyMainContent`; partial-success via per-format `warnings[]`. Cost: 2.
|
|
184
|
+
- `agent` — NL prompt → autonomous research/extract, no URLs required (see AgentOrchestrator above). Cost: 8.
|
|
185
|
+
- `map_site` gained an optional `search=` param that ranks discovered URLs (`ranked_urls:[{url,score}]`); default output unchanged.
|
|
174
186
|
|
|
175
187
|
### MCP Server Entry Point
|
|
176
188
|
|
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
|
|
|
9
9
|
|
|
10
10
|
## 🎯 Features
|
|
11
11
|
|
|
12
|
-
- **
|
|
12
|
+
- **26 Professional Tools**: Web scraping, deep research, an autonomous `agent`, a unified multi-format `scrape`, stealth browsing, content analysis, local-LLM extraction (Ollama)
|
|
13
13
|
- **Free Tier**: 1,000 credits to get started instantly
|
|
14
14
|
- **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
|
|
15
15
|
- **Enterprise Ready**: Scale up with paid plans for production use
|
|
@@ -37,6 +37,8 @@ This will:
|
|
|
37
37
|
|
|
38
38
|
**Don't have an API key?** Get one free at [https://www.crawlforge.dev/signup](https://www.crawlforge.dev/signup)
|
|
39
39
|
|
|
40
|
+
> **One-step setup (v4.6.0+):** `crawlforge init` detects your API key, installs the agent skill, and idempotently merges the MCP config stanza into Claude Code, Claude Desktop, and Cursor. Use `crawlforge init --all --yes` to configure every detected client non-interactively.
|
|
41
|
+
|
|
40
42
|
### 3. Configure Your IDE (if not auto-configured)
|
|
41
43
|
|
|
42
44
|
<details>
|
|
@@ -107,8 +109,10 @@ Restart Cursor to activate.
|
|
|
107
109
|
- `extract_text` - Extract clean text from web pages
|
|
108
110
|
- `extract_links` - Get all links from a page
|
|
109
111
|
- `extract_metadata` - Extract page metadata
|
|
112
|
+
- `scrape_template` - Structured data from well-known sites (Amazon, GitHub, LinkedIn, YouTube, Reddit, Hacker News, npm, and more) without writing selectors
|
|
110
113
|
|
|
111
114
|
### Advanced Tools (2-3 credits)
|
|
115
|
+
- `scrape` - **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings
|
|
112
116
|
- `scrape_structured` - Extract structured data with CSS selectors
|
|
113
117
|
- `search_web` - Search the web using Google Search API
|
|
114
118
|
- `summarize_content` - Generate intelligent summaries
|
|
@@ -117,10 +121,12 @@ Restart Cursor to activate.
|
|
|
117
121
|
- `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
|
|
118
122
|
- `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
|
|
119
123
|
- `track_changes` - Monitor content changes over time
|
|
124
|
+
- `get_batch_results` - Retrieve paginated results for a `batch_scrape` job by `batchId`
|
|
120
125
|
|
|
121
126
|
### Premium Tools (5-10 credits)
|
|
127
|
+
- `agent` - **Autonomous research/extraction from a natural-language prompt — no URLs required.** Plans, gathers, and shapes an answer under hard safety stops (max steps/URLs/wall-clock enforced by the orchestrator, never the LLM)
|
|
122
128
|
- `crawl_deep` - Deep crawl entire websites
|
|
123
|
-
- `map_site` - Discover and map website structure
|
|
129
|
+
- `map_site` - Discover and map website structure (optional `search=` ranks the discovered URLs)
|
|
124
130
|
- `batch_scrape` - Process multiple URLs simultaneously
|
|
125
131
|
- `deep_research` - Multi-stage research with source verification
|
|
126
132
|
- `stealth_mode` - Anti-detection browser management
|
|
@@ -132,6 +138,8 @@ Restart Cursor to activate.
|
|
|
132
138
|
- `generate_llms_txt` - Generate AI interaction guidelines
|
|
133
139
|
- `localization` - Multi-language and geo-location management
|
|
134
140
|
|
|
141
|
+
For the full canonical capabilities reference (all tools, CLI commands, stealth engines, research workflow), see [SKILL.md](SKILL.md).
|
|
142
|
+
|
|
135
143
|
## 💳 Pricing
|
|
136
144
|
|
|
137
145
|
| Plan | Credits/Month | Best For |
|
|
@@ -142,7 +150,7 @@ Restart Cursor to activate.
|
|
|
142
150
|
| **Enterprise** | 250,000 | Large scale operations |
|
|
143
151
|
|
|
144
152
|
**All plans include:**
|
|
145
|
-
- Access to all
|
|
153
|
+
- Access to all 26 tools
|
|
146
154
|
- Credits never expire and roll over month-to-month
|
|
147
155
|
- API access and webhook notifications
|
|
148
156
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.6.1",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
"test:tools": "node test-tools.js",
|
|
22
22
|
"test:real-world": "node test-real-world.js",
|
|
23
23
|
"test:all": "bash run-all-tests.sh",
|
|
24
|
-
"postinstall": "echo '\
|
|
24
|
+
"postinstall": "echo '\nCrawlForge MCP Server installed!\n\nQuick start: run \"npx crawlforge init\" to configure your API key, install skills, and register the MCP server with your AI clients.\nOr run \"npx crawlforge-setup\" to configure your API key only.\n'",
|
|
25
25
|
"docker:build": "docker build -t crawlforge .",
|
|
26
26
|
"docker:dev": "docker-compose up crawlforge-dev",
|
|
27
27
|
"docker:prod": "docker-compose up crawlforge-prod"
|
package/server.js
CHANGED
|
@@ -24,6 +24,8 @@ import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
|
|
|
24
24
|
import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
|
|
25
25
|
import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
|
|
26
26
|
import { ScrapeTemplateTool } from "./src/tools/templates/ScrapeTemplateTool.js"; // D3.3
|
|
27
|
+
import { UnifiedScrapeTool } from "./src/tools/scrape/unifiedScrape.js"; // D4 D1
|
|
28
|
+
import { AgentTool } from "./src/tools/agent/agent.js"; // D4 D2
|
|
27
29
|
import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
|
|
28
30
|
import { LocalizationManager } from "./src/core/LocalizationManager.js";
|
|
29
31
|
import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
|
|
@@ -97,7 +99,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
97
99
|
const server = new McpServer({
|
|
98
100
|
name: "crawlforge",
|
|
99
101
|
version: "4.5.0",
|
|
100
|
-
description: "Production-ready MCP server with
|
|
102
|
+
description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
|
|
101
103
|
homepage: "https://www.crawlforge.dev",
|
|
102
104
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
103
105
|
});
|
|
@@ -111,7 +113,7 @@ server.prompt("getting-started", {
|
|
|
111
113
|
role: "user",
|
|
112
114
|
content: {
|
|
113
115
|
type: "text",
|
|
114
|
-
text: "You have access to CrawlForge MCP with
|
|
116
|
+
text: "You have access to CrawlForge MCP with 26 web scraping tools. Key tools:\n\n" +
|
|
115
117
|
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
116
118
|
"- extract_text: Extract clean text from a webpage\n" +
|
|
117
119
|
"- extract_content: Smart content extraction with readability\n" +
|
|
@@ -161,6 +163,8 @@ const deepResearchTool = new DeepResearchTool();
|
|
|
161
163
|
const trackChangesTool = new TrackChangesTool();
|
|
162
164
|
const generateLLMsTxtTool = new GenerateLLMsTxtTool();
|
|
163
165
|
const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
|
|
166
|
+
const unifiedScrapeTool = new UnifiedScrapeTool(); // D4 D1
|
|
167
|
+
const agentTool = new AgentTool(); // D4 D2
|
|
164
168
|
const stealthBrowserManager = new StealthBrowserManager();
|
|
165
169
|
const localizationManager = new LocalizationManager();
|
|
166
170
|
|
|
@@ -181,6 +185,7 @@ deepResearchTool.setMcpServer(server);
|
|
|
181
185
|
batchScrapeTool.setMcpServer(server);
|
|
182
186
|
crawlDeepTool.setMcpServer(server);
|
|
183
187
|
extractStructuredTool.setMcpServer(server);
|
|
188
|
+
agentTool.setMcpServer(server); // D4 D2: SamplingClient + Elicitation
|
|
184
189
|
AuthManager.setElicitation(elicitation);
|
|
185
190
|
|
|
186
191
|
// ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
|
|
@@ -433,14 +438,15 @@ server.registerTool("map_site", {
|
|
|
433
438
|
include_patterns: z.array(z.string()).optional(),
|
|
434
439
|
exclude_patterns: z.array(z.string()).optional()
|
|
435
440
|
}).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
|
|
436
|
-
import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config")
|
|
441
|
+
import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
|
|
442
|
+
search: z.string().optional().describe("When set, rank discovered URLs by relevance to this string and emit ranked_urls:[{url,score}]")
|
|
437
443
|
}
|
|
438
|
-
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config }) => {
|
|
444
|
+
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search }) => {
|
|
439
445
|
try {
|
|
440
446
|
if (!url) {
|
|
441
447
|
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
442
448
|
}
|
|
443
|
-
const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config });
|
|
449
|
+
const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search });
|
|
444
450
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
445
451
|
} catch (error) {
|
|
446
452
|
return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
|
|
@@ -801,6 +807,53 @@ server.registerTool("deep_research", {
|
|
|
801
807
|
}
|
|
802
808
|
}));
|
|
803
809
|
|
|
810
|
+
// Tool: scrape (D4 D1 — unified multi-format single-fetch)
|
|
811
|
+
server.registerTool("scrape", {
|
|
812
|
+
description: "Use this when you need multiple content formats from a single URL in one call — e.g. markdown + links + metadata together. One fetch, no N-request fan-out. Formats: \"markdown\", \"html\", \"rawHtml\", \"text\", \"links\", \"metadata\", or {type:\"json\",schema,prompt} for LLM-structured extraction. onlyMainContent:true (default) strips boilerplate via Readability. Partial success: per-format warnings never fail the whole call. Example: scrape({url:\"https://example.com\", formats:[\"markdown\",\"links\",\"metadata\"]})",
|
|
813
|
+
annotations: { title: "Scrape (Multi-Format)", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
814
|
+
inputSchema: {
|
|
815
|
+
url: z.string().url().describe("The URL to scrape"),
|
|
816
|
+
formats: z.array(z.union([
|
|
817
|
+
z.enum(["markdown", "html", "rawHtml", "text", "links", "metadata", "screenshot"]),
|
|
818
|
+
z.object({
|
|
819
|
+
type: z.literal("json"),
|
|
820
|
+
schema: z.record(z.any()).optional().describe("JSON schema for extraction"),
|
|
821
|
+
prompt: z.string().optional().describe("Extraction instruction for the LLM")
|
|
822
|
+
})
|
|
823
|
+
])).min(1).optional().default(["markdown"]).describe("Formats to return (default: [\"markdown\"])"),
|
|
824
|
+
onlyMainContent: z.boolean().optional().default(true).describe("Strip boilerplate via Readability (default: true)"),
|
|
825
|
+
timeoutMs: z.number().min(1000).max(60000).optional().default(15000).describe("Fetch timeout in ms")
|
|
826
|
+
}
|
|
827
|
+
}, withAuth("scrape", async (params) => {
|
|
828
|
+
try {
|
|
829
|
+
const result = await unifiedScrapeTool.execute(params);
|
|
830
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
831
|
+
} catch (error) {
|
|
832
|
+
return { content: [{ type: "text", text: `Scrape failed: ${error.message}` }], isError: true };
|
|
833
|
+
}
|
|
834
|
+
}));
|
|
835
|
+
|
|
836
|
+
// Tool: agent (D4 D2 — autonomous NL prompt → search/navigate/extract)
|
|
837
|
+
server.registerTool("agent", {
|
|
838
|
+
description: "Use this when you need an autonomous agent to research, navigate, and synthesise an answer from the web — no URLs required. The agent plans search queries, fetches and filters relevant pages, and returns a prose or structured answer. model:\"pro\" uses deep multi-source research. Hard limits: maxSteps≤10, maxUrls≤20, 120s wall-clock. Confirms before pro runs. Degraded-but-useful output if no LLM keys/Ollama. Example: agent({prompt:\"What are the top 5 MCP servers in 2025?\", maxUrls:10})",
|
|
839
|
+
annotations: { title: "Agent (Autonomous)", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
840
|
+
inputSchema: {
|
|
841
|
+
prompt: z.string().min(1).max(2000).describe("Natural-language task or question"),
|
|
842
|
+
urls: z.array(z.string().url()).max(20).optional().describe("Optional seed URLs to include (max 20)"),
|
|
843
|
+
schema: z.record(z.any()).optional().describe("Optional JSON schema for structured output"),
|
|
844
|
+
model: z.enum(["default", "pro"]).optional().default("default").describe("\"default\" = SamplingClient loop (no keys needed); \"pro\" = full ResearchOrchestrator"),
|
|
845
|
+
maxSteps: z.number().min(1).max(10).optional().default(5).describe("Max fetch iterations (hard cap: 10)"),
|
|
846
|
+
maxUrls: z.number().min(1).max(20).optional().default(10).describe("Max URLs to fetch (hard cap: 20)")
|
|
847
|
+
}
|
|
848
|
+
}, withAuth("agent", async (params) => {
|
|
849
|
+
try {
|
|
850
|
+
const result = await agentTool.execute(params);
|
|
851
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
852
|
+
} catch (error) {
|
|
853
|
+
return { content: [{ type: "text", text: `Agent failed: ${error.message}` }], isError: true };
|
|
854
|
+
}
|
|
855
|
+
}));
|
|
856
|
+
|
|
804
857
|
// Tool: track_changes
|
|
805
858
|
server.registerTool("track_changes", {
|
|
806
859
|
description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
|
|
@@ -1224,9 +1277,10 @@ async function runServer() {
|
|
|
1224
1277
|
"batch_scrape", "get_batch_results", "scrape_with_actions",
|
|
1225
1278
|
"deep_research", "track_changes", "generate_llms_txt",
|
|
1226
1279
|
"stealth_mode", "localization", "extract_structured", "extract_with_llm",
|
|
1227
|
-
"list_ollama_models", "scrape_template"
|
|
1280
|
+
"list_ollama_models", "scrape_template", // D3.3
|
|
1281
|
+
"scrape", "agent" // D4
|
|
1228
1282
|
];
|
|
1229
|
-
console.error(`Tools available (
|
|
1283
|
+
console.error(`Tools available (26): ${allTools.join(", ")}`);
|
|
1230
1284
|
|
|
1231
1285
|
// Start memory monitoring in development
|
|
1232
1286
|
if (config.server.nodeEnv === "development") {
|
|
@@ -1252,7 +1306,8 @@ async function gracefulShutdown(signal) {
|
|
|
1252
1306
|
const toolsToCleanup = [
|
|
1253
1307
|
batchScrapeTool, scrapeWithActionsTool, deepResearchTool,
|
|
1254
1308
|
trackChangesTool, generateLLMsTxtTool, stealthBrowserManager,
|
|
1255
|
-
localizationManager, extractStructuredTool
|
|
1309
|
+
localizationManager, extractStructuredTool,
|
|
1310
|
+
agentTool // D4 D2: may hold ResearchOrchestrator
|
|
1256
1311
|
].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
|
|
1257
1312
|
|
|
1258
1313
|
console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* init command — one-shot setup: API key check + skill install + MCP stanza merge.
|
|
3
|
+
*/
|
|
4
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
|
|
5
|
+
import { join } from 'node:path';
|
|
6
|
+
import { install } from '../../skills/installer.js';
|
|
7
|
+
|
|
8
|
+
const HOME = process.env.HOME || process.env.USERPROFILE || '';
|
|
9
|
+
|
|
10
|
+
/**
 * Read the API key persisted in `<HOME>/.crawlforge/config.json`.
 *
 * Best-effort lookup: a missing file, unreadable file, malformed JSON, or a
 * config without a truthy `apiKey` all yield `undefined` rather than throwing.
 *
 * @returns {string|undefined} The stored API key, or `undefined` if none is available.
 */
function loadStoredApiKey() {
  const configFile = join(HOME, '.crawlforge', 'config.json');
  try {
    const stored = JSON.parse(readFileSync(configFile, 'utf8'));
    // Property access stays inside the try: a JSON `null` document must also
    // resolve to `undefined` instead of raising a TypeError.
    return stored.apiKey ? stored.apiKey : undefined;
  } catch {
    return undefined;
  }
}
|
|
18
|
+
|
|
19
|
+
/**
 * Build the `mcpServers.crawlforge` entry written into a client config.
 *
 * @param {string} [apiKey] - When truthy, exposed to the server via the
 *   `CRAWLFORGE_API_KEY` environment variable of the stanza.
 * @returns {{command: string, args: string[], env?: {CRAWLFORGE_API_KEY: string}}}
 */
function mcpStanza(apiKey) {
  const stanza = { command: 'npx', args: ['-y', 'crawlforge@latest', 'mcp'] };
  if (apiKey) stanza.env = { CRAWLFORGE_API_KEY: apiKey };
  return stanza;
}

/**
 * Insert (or replace) the `crawlforge` entry under `mcpServers` in a client's
 * JSON config file, preserving every other key already present.
 *
 * - If the file does not exist, its parent directory is created first.
 * - If the file exists but is empty/whitespace, it is treated as `{}`.
 * - If the file exists but is not a JSON object, an Error is thrown and the
 *   file is left untouched, so a config the user has hand-edited into an
 *   invalid state is never silently replaced.
 *
 * @param {string} configPath - Absolute path of the client config file.
 * @param {string} [apiKey] - API key forwarded to {@link mcpStanza}.
 * @returns {string} `configPath`, for convenience.
 * @throws {Error} When the existing file cannot be read, is not a JSON object,
 *   or the file/directory cannot be written.
 */
function mergeClientConfig(configPath, apiKey) {
  let existing = {};
  if (existsSync(configPath)) {
    const raw = readFileSync(configPath, 'utf8');
    if (raw.trim() !== '') {
      let parsed;
      try {
        parsed = JSON.parse(raw);
      } catch (err) {
        // Refuse to overwrite: writing `{}` + our stanza here would wipe the
        // user's other MCP server entries.
        throw new Error(
          `existing config is not valid JSON; left untouched (${err.message})`,
          { cause: err }
        );
      }
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new Error('existing config is not a JSON object; left untouched');
      }
      existing = parsed;
    }
  } else {
    // join(path, '..') resolves the parent directory with the platform's own
    // separator rules, so backslash-separated Windows paths work too.
    mkdirSync(join(configPath, '..'), { recursive: true });
  }
  existing.mcpServers = existing.mcpServers || {};
  existing.mcpServers.crawlforge = mcpStanza(apiKey);
  writeFileSync(configPath, JSON.stringify(existing, null, 2) + '\n', 'utf8');
  return configPath;
}
|
|
38
|
+
|
|
39
|
+
/**
 * List the MCP config files to update for the requested client.
 *
 * @param {string} [client] - One of `claude-code`, `claude-desktop`, `cursor`.
 *   When falsy, every supported client is returned. Any other value yields
 *   an empty list.
 * @returns {Array<{label: string, path: string}>} Display label and config
 *   file path per selected client, in the order Claude Code, Claude Desktop,
 *   Cursor.
 */
function resolveClientPaths(client) {
  const selected = (name) => !client || client === name;
  const targets = [];

  if (selected('claude-code')) {
    targets.push({ label: 'Claude Code', path: join(HOME, '.claude.json') });
  }

  if (selected('claude-desktop')) {
    // Claude Desktop keeps its config in a platform-specific location.
    let desktopPath;
    switch (process.platform) {
      case 'darwin':
        desktopPath = join(HOME, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json');
        break;
      case 'win32':
        desktopPath = join(
          process.env.APPDATA || join(HOME, 'AppData', 'Roaming'),
          'Claude',
          'claude_desktop_config.json'
        );
        break;
      default:
        desktopPath = join(HOME, '.config', 'Claude', 'claude_desktop_config.json');
    }
    targets.push({ label: 'Claude Desktop', path: desktopPath });
  }

  if (selected('cursor')) {
    targets.push({ label: 'Cursor', path: join(HOME, '.cursor', 'mcp.json') });
  }

  return targets;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Register the `init` subcommand on the CLI program.
 *
 * The command runs three steps and reports progress on stderr:
 *   1. API key check — stored config first, then `CRAWLFORGE_API_KEY`;
 *      exits with code 1 if neither is set.
 *   2. Skill install — `all` targets with `--all`, otherwise `claude-code`;
 *      a failure is reported as a warning and does not abort the run.
 *   3. MCP stanza merge — into the config of `--client`, or every supported
 *      client with `--all`, or Claude Code by default; per-client failures
 *      are reported as warnings.
 *
 * An unrecognised `--client` value is rejected with exit code 1 before any
 * files are touched (previously it resolved to zero targets and the command
 * still reported success).
 *
 * @param {object} program - CLI program object exposing the chained
 *   `command` / `description` / `option` / `action` API used below.
 */
export function register(program) {
  program
    .command('init')
    .description('Set up CrawlForge: verify API key, install skills, and register the MCP server with your AI clients')
    .option('--all', 'Install skills to all targets and register all detected client configs')
    .option('--client <name>', 'Target client to register: claude-code, claude-desktop, or cursor')
    .option('--yes', 'Non-interactive — assume yes to all prompts')
    .action(async (opts) => {
      // All progress goes to stderr, keeping stdout free of status text.
      const out = (msg) => process.stderr.write(msg + '\n');

      // 1. API key check
      const apiKey = loadStoredApiKey() || process.env.CRAWLFORGE_API_KEY;
      if (!apiKey) {
        out('No CrawlForge API key found.');
        out('Run: npx crawlforge-setup');
        out('Then re-run: crawlforge init');
        process.exit(1);
      }
      out('API key: found (' + apiKey.slice(0, 8) + '...)');

      // Resolve targets up front so a mistyped --client fails fast instead of
      // silently registering nothing.
      const clientFilter = opts.client || (opts.all ? undefined : 'claude-code');
      const targets = resolveClientPaths(clientFilter);
      if (targets.length === 0) {
        out('Unknown client "' + opts.client + '". Expected one of: claude-code, claude-desktop, cursor');
        process.exit(1);
      }

      // 2. Install skills
      const skillTarget = opts.all ? 'all' : 'claude-code';
      try {
        const results = await install({ target: skillTarget, force: false, cwd: process.cwd() });
        if (results.installed.length > 0) {
          out('Skills installed: ' + results.installed.length + ' file(s)');
        } else {
          out('Skills: already up to date (use crawlforge install-skills --force to overwrite)');
        }
      } catch (err) {
        out('Warning: skill install failed — ' + err.message);
      }

      // 3. MCP stanza merge
      for (const { label, path: cfgPath } of targets) {
        try {
          mergeClientConfig(cfgPath, apiKey);
          out('MCP registered: ' + label + ' (' + cfgPath + ')');
        } catch (err) {
          out('Warning: could not update ' + label + ' config — ' + err.message);
        }
      }

      out('Done. Restart your AI client to pick up the crawlforge MCP server.');
      process.exit(0);
    });
}
|
package/src/cli/index.js
CHANGED
|
@@ -58,6 +58,7 @@ import { register as registerTemplate } from './commands/template.js';
|
|
|
58
58
|
import { register as registerMonitor } from './commands/monitor.js';
|
|
59
59
|
import { register as registerInstallSkills } from './commands/install-skills.js';
|
|
60
60
|
import { register as registerUninstallSkills } from './commands/uninstall-skills.js';
|
|
61
|
+
import { register as registerInit } from './commands/init.js';
|
|
61
62
|
|
|
62
63
|
// ─── MCP stdio server mode (backward compatibility) ──────────────────────────
|
|
63
64
|
// Before v4.1.0 the `crawlforge` bin WAS the MCP server. v4.1.0 turned it into
|
|
@@ -136,6 +137,7 @@ registerTemplate(program);
|
|
|
136
137
|
registerMonitor(program);
|
|
137
138
|
registerInstallSkills(program);
|
|
138
139
|
registerUninstallSkills(program);
|
|
140
|
+
registerInit(program);
|
|
139
141
|
|
|
140
142
|
// `crawlforge mcp` / `crawlforge serve` — explicitly start the MCP server over
|
|
141
143
|
// stdio. Extra args (e.g. --http) are read directly by server.js from argv.
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AgentOrchestrator — autonomous NL-prompt → search/navigate/extract → answer.
|
|
3
|
+
*
|
|
4
|
+
* Design: hardcoded 3-action state machine.
|
|
5
|
+
* PLAN — one SamplingClient call to decompose prompt into search queries
|
|
6
|
+
* GATHER — search_web (≤maxUrls results total)
|
|
7
|
+
* ACT — fetchAndParse + relevance gate per URL
|
|
8
|
+
* DECIDE — loop or answer (step/URL/time hard stops; never LLM-trusted)
|
|
9
|
+
* SHAPE — schema→ExtractWithLlm prose→synthesis via SamplingClient
|
|
10
|
+
*
|
|
11
|
+
* Hard stops (enforced here, not by the LLM):
|
|
12
|
+
* 1. maxSteps iterations of the ACT loop
|
|
13
|
+
* 2. maxUrls total URLs fetched
|
|
14
|
+
* 3. wallClockMs wall-clock milliseconds (default 120 000)
|
|
15
|
+
*
|
|
16
|
+
* No-LLM-key path: if all LLM calls fail, return collected evidence + {degraded:true}.
|
|
17
|
+
* pro model: delegates to ResearchOrchestrator.conductResearch() for richer synthesis.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { fetchAndParse } from '../tools/extract/_fetchAndParse.js';
|
|
21
|
+
import { SamplingClient } from './SamplingClient.js';
|
|
22
|
+
|
|
23
|
+
const DEFAULT_WALL_CLOCK_MS = 120_000;
|
|
24
|
+
const DEFAULT_MAX_STEPS = 5;
|
|
25
|
+
const DEFAULT_MAX_URLS = 10;
|
|
26
|
+
|
|
27
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Naive relevance gate: does the fetched text contain any query term?
 * Avoids an LLM call for an obviously irrelevant page.
 *
 * Matching is case-insensitive substring search; only whitespace-separated
 * query terms longer than 3 characters are considered. The gate fails open:
 * it returns `true` when the text or query is empty, and also when the query
 * has no term long enough to test (e.g. "AI vs ML"), since otherwise such a
 * query would reject every page.
 *
 * @param {string} text - Page text to check.
 * @param {string} query - Search query / prompt the page should relate to.
 * @returns {boolean} `true` if the page should be kept.
 */
function isRelevant(text, query) {
  if (!text || !query) return true; // fail-open
  const haystack = text.toLowerCase();
  const terms = query
    .toLowerCase()
    .split(/\s+/)
    .filter((term) => term.length > 3);
  if (terms.length === 0) return true; // nothing testable — fail-open
  return terms.some((term) => haystack.includes(term));
}
|
|
38
|
+
|
|
39
|
+
/**
 * Truncate text to a safe token budget (~8 000 chars ≈ ~2 000 tokens).
 *
 * Text at or under the limit (and any falsy value) is returned unchanged.
 * Longer text is cut at `maxChars` UTF-16 code units and suffixed with
 * `\n[...truncated]`. If the cut would land between the two halves of a
 * surrogate pair, it is moved back one unit so the result never ends in a
 * lone high surrogate.
 *
 * @param {string} text - Text to shorten.
 * @param {number} [maxChars=8000] - Maximum length kept from the original.
 * @returns {string} The original text, or the shortened text plus marker.
 */
function truncate(text, maxChars = 8000) {
  if (!text || text.length <= maxChars) return text;
  let cut = maxChars;
  const lastUnit = text.charCodeAt(cut - 1);
  // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  return text.slice(0, cut) + '\n[...truncated]';
}
|
|
46
|
+
|
|
47
|
+
// ── Orchestrator ──────────────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
export class AgentOrchestrator {
  /**
   * Collaborators (sampling client, search tool, LLM extractor, research
   * orchestrator) are not created here; each is built lazily on first use.
   *
   * @param {object} options
   * @param {object|null} options.mcpServer - McpServer instance (for SamplingClient)
   * @param {object} options.searchConfig - passed to SearchWebTool constructor
   * @param {object} options.llmConfig - passed to ExtractWithLlm constructor
   */
  constructor(options = {}) {
    this._mcpServer = options.mcpServer || null;
    this._searchConfig = options.searchConfig || {};
    this._llmConfig = options.llmConfig || {};
    this._samplingClient = null;
    this._searchTool = null;
    this._extractWithLlm = null;
    this._researchOrchestrator = null;
  }

  /** Set MCP server (called by agent.js after construction). */
  setMcpServer(mcpServer) {
    this._mcpServer = mcpServer;
    this._samplingClient = null; // reset so it is rebuilt with the new server
  }

  // ── Lazy accessors ──────────────────────────────────────────────────────────

  /** Build the SamplingClient on first use, bound to the current MCP server. */
  _getSamplingClient() {
    if (!this._samplingClient) {
      this._samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
    }
    return this._samplingClient;
  }

  /** Dynamically import and construct SearchWebTool on first use. */
  async _getSearchTool() {
    if (!this._searchTool) {
      const { SearchWebTool } = await import('../tools/search/searchWeb.js');
      this._searchTool = new SearchWebTool(this._searchConfig);
    }
    return this._searchTool;
  }

  /** Dynamically import and construct ExtractWithLlm on first use. */
  async _getExtractWithLlm() {
    if (!this._extractWithLlm) {
      const { ExtractWithLlm } = await import('../tools/extract/extractWithLlm.js');
      this._extractWithLlm = new ExtractWithLlm(this._llmConfig);
    }
    return this._extractWithLlm;
  }

  /** Dynamically import and construct ResearchOrchestrator (pro model) on first use. */
  async _getResearchOrchestrator() {
    if (!this._researchOrchestrator) {
      const { ResearchOrchestrator } = await import('./ResearchOrchestrator.js');
      this._researchOrchestrator = new ResearchOrchestrator({
        maxUrls: 50,
        timeLimit: DEFAULT_WALL_CLOCK_MS
      });
    }
    return this._researchOrchestrator;
  }

  // ── Main entry ──────────────────────────────────────────────────────────────

  /**
   * Run the agent loop: PLAN (derive search queries) → GATHER (search) →
   * ACT (fetch pages) → SHAPE (structured extraction or prose synthesis).
   *
   * Every phase is best-effort: a failing sampling call, search or fetch is
   * skipped, and the result carries `degraded: true` with a `reason` when no
   * answer could be produced. With `model: 'pro'` the whole task is delegated
   * to ResearchOrchestrator and a failure there is returned, not retried.
   *
   * @param {object}   params
   * @param {string}   params.prompt        - Natural-language task
   * @param {string[]} [params.urls]        - Seed URLs (queued ahead of search results)
   * @param {object}   [params.schema]      - JSON schema for structured output
   * @param {string}   [params.model]       - 'default' | 'pro'
   * @param {number}   [params.maxSteps]    - Max ACT iterations (≤10)
   * @param {number}   [params.maxUrls]     - Max URLs to fetch (≤20)
   * @param {number}   [params.wallClockMs] - Wall-clock budget in ms
   * @returns {Promise<object>}
   */
  async run(params) {
    const {
      prompt,
      urls: seedUrls = [],
      schema,
      model = 'default',
      maxSteps = DEFAULT_MAX_STEPS,
      maxUrls = DEFAULT_MAX_URLS,
      wallClockMs = DEFAULT_WALL_CLOCK_MS
    } = params;

    const startTime = Date.now();
    const elapsedMs = () => Date.now() - startTime;
    const deadline = () => elapsedMs() >= wallClockMs;

    // Hard-cap params regardless of what caller sends. A NaN would make
    // Math.min return NaN and every `>=` comparison below false, silently
    // removing the cap, so non-numeric input falls back to the defaults.
    const requestedSteps = Number(maxSteps);
    const requestedUrls = Number(maxUrls);
    const capSteps = Math.min(Number.isNaN(requestedSteps) ? DEFAULT_MAX_STEPS : requestedSteps, 10);
    const capUrls = Math.min(Number.isNaN(requestedUrls) ? DEFAULT_MAX_URLS : requestedUrls, 20);

    // pro model: delegate to ResearchOrchestrator
    if (model === 'pro') {
      try {
        const orchestrator = await this._getResearchOrchestrator();
        const result = await orchestrator.conductResearch(prompt, {
          maxUrls: capUrls,
          timeLimit: wallClockMs,
          researchApproach: 'focused'
        });
        return { success: true, answer: result, model: 'pro', degraded: false };
      } catch (err) {
        // A failed pro run is reported to the caller; it does not fall back
        // to the default loop.
        return {
          success: false,
          degraded: true,
          reason: `pro research failed: ${err.message}`,
          answer: null
        };
      }
    }

    // ── PLAN ──────────────────────────────────────────────────────────────────
    let searchQueries = [prompt]; // fallback: use raw prompt as query
    try {
      const planPrompt =
        `Decompose this research task into 1-3 concise web search queries (one per line, no bullets):\n\n${prompt}`;
      const { text } = await this._getSamplingClient().complete(planPrompt, { maxTokens: 200 });
      // Strip any list markers the model added despite the instruction.
      const lines = text.split('\n').map(l => l.replace(/^[-*\d.)\s]+/, '').trim()).filter(Boolean);
      if (lines.length > 0) searchQueries = lines.slice(0, 3);
    } catch {
      // Sampling unavailable — use raw prompt
    }

    // ── GATHER (search) ───────────────────────────────────────────────────────
    // Insertion-ordered queue with Set-backed dedupe, so a URL that appears
    // twice (in the seeds, or in seeds and search results) is fetched once.
    const urlQueue = [];
    const queued = new Set();
    const enqueue = (candidate) => {
      if (!candidate || queued.has(candidate)) return;
      queued.add(candidate);
      urlQueue.push(candidate);
    };
    for (const seed of seedUrls) enqueue(seed); // user-provided seeds go first

    const searchResults = [];

    if (urlQueue.length < capUrls) {
      try {
        const searchTool = await this._getSearchTool();
        for (const q of searchQueries) {
          if (deadline()) break;
          try {
            const sr = await searchTool.execute({ query: q, limit: Math.ceil(capUrls / searchQueries.length) });
            // SearchWebTool.execute() returns the raw results object; the MCP content-wrapped
            // shape only appears if a caller (e.g. server.js) wraps it. Handle both.
            const parsed = sr?.content?.[0]?.text ? JSON.parse(sr.content[0].text) : sr;
            if (parsed?.results) {
              for (const r of parsed.results) {
                enqueue(r.link);
                searchResults.push({ query: q, title: r.title || '', url: r.link || '', snippet: r.snippet || '' });
              }
            }
          } catch { /* skip failed search */ }
        }
      } catch { /* search tool init failed */ }
    }

    // ── ACT loop ──────────────────────────────────────────────────────────────
    const evidence = [];
    let urlsFetched = 0;
    let step = 0;

    for (const url of urlQueue) {
      if (step >= capSteps || urlsFetched >= capUrls || deadline()) break;
      step++;
      urlsFetched++;

      try {
        // Never ask for more time than the wall-clock budget has left, so one
        // slow host cannot push the run 10 s past its deadline.
        const remainingMs = wallClockMs - elapsedMs();
        const timeoutMs = remainingMs < 10000 ? Math.max(1, Math.ceil(remainingMs)) : 10000;
        const { textContent, finalUrl } = await fetchAndParse(url, { timeoutMs });
        if (!isRelevant(textContent, prompt)) continue; // still counts as a step
        evidence.push({
          url: finalUrl,
          text: truncate(textContent),
          step
        });
      } catch { /* skip unreachable URL */ }
    }

    // ── SHAPE ─────────────────────────────────────────────────────────────────
    const combinedText = evidence.map(e => `--- Source: ${e.url} ---\n${e.text}`).join('\n\n');

    if (!combinedText.trim()) {
      return {
        success: true,
        degraded: true,
        reason: 'No content could be fetched for the given prompt.',
        search_results: searchResults,
        evidence: [],
        answer: null,
        steps: step,
        urls_fetched: urlsFetched
      };
    }

    // Schema path: use ExtractWithLlm for structured output
    if (schema && Object.keys(schema).length > 0) {
      try {
        const extractWithLlm = await this._getExtractWithLlm();
        const result = await extractWithLlm.execute({
          content: combinedText,
          prompt: `From the following research sources, answer this task and extract structured data:\n${prompt}`,
          schema,
          provider: 'auto'
        });
        return {
          success: result.success,
          answer: result.success ? result.data : null,
          structured: true,
          search_results: searchResults,
          evidence: evidence.map(e => ({ url: e.url })),
          degraded: !result.success,
          reason: result.success ? undefined : result.error,
          steps: step,
          urls_fetched: urlsFetched
        };
      } catch {
        // Extractor threw (as opposed to reporting success: false):
        // fall through to prose synthesis.
      }
    }

    // Prose synthesis via SamplingClient
    let answer = null;
    let degraded = false;
    let degradedReason;

    try {
      const synthesisPrompt =
        `You are a research assistant. Based on the sources below, answer this task:\n\n` +
        `Task: ${prompt}\n\n` +
        `${truncate(combinedText, 12000)}\n\n` +
        `Provide a clear, concise answer.`;

      const { text } = await this._getSamplingClient().complete(synthesisPrompt, { maxTokens: 1024 });
      answer = text;
    } catch (err) {
      degraded = true;
      degradedReason = `LLM synthesis unavailable: ${err.message}`;
      // Return raw evidence so the host LLM can synthesize
      answer = null;
    }

    return {
      success: true,
      answer,
      search_results: searchResults,
      // Full evidence text is only shipped when the caller has to synthesize itself.
      evidence: degraded ? evidence : evidence.map(e => ({ url: e.url })),
      degraded,
      reason: degradedReason,
      steps: step,
      urls_fetched: urlsFetched
    };
  }

  /** Release the research orchestrator, if one was created and exposes destroy(). */
  async destroy() {
    if (this._researchOrchestrator && typeof this._researchOrchestrator.destroy === 'function') {
      await this._researchOrchestrator.destroy();
    }
  }
}
|
|
301
|
+
|
|
302
|
+
export default AgentOrchestrator;
|
package/src/core/AuthManager.js
CHANGED
|
@@ -538,7 +538,13 @@ class AuthManager {
|
|
|
538
538
|
extract_with_llm: 5,
|
|
539
539
|
|
|
540
540
|
// D3.3: Pre-built site templates (1 credit per template scrape)
|
|
541
|
-
scrape_template: 1
|
|
541
|
+
scrape_template: 1,
|
|
542
|
+
|
|
543
|
+
// Phase D (v4.6.0)
|
|
544
|
+
// scrape: base 2; projectCost() scales with format count
|
|
545
|
+
scrape: 2,
|
|
546
|
+
// agent: base 8; projectCost() scales with maxUrls
|
|
547
|
+
agent: 8
|
|
542
548
|
};
|
|
543
549
|
|
|
544
550
|
return costs[tool] || 1;
|
|
@@ -585,6 +591,20 @@ class AuthManager {
|
|
|
585
591
|
case 'extract_with_llm':
|
|
586
592
|
note = 'Includes external LLM API call cost (not billed in credits, billed by your LLM provider).';
|
|
587
593
|
break;
|
|
594
|
+
case 'scrape': {
|
|
595
|
+
// Base 2 + 1 per format beyond the first
|
|
596
|
+
const fmtCount = Array.isArray(params?.formats) ? params.formats.length : 1;
|
|
597
|
+
projected = Math.max(base, base + Math.max(0, fmtCount - 1));
|
|
598
|
+
note = `Estimated from ${fmtCount} format(s). json format may incur external LLM cost.`;
|
|
599
|
+
break;
|
|
600
|
+
}
|
|
601
|
+
case 'agent': {
|
|
602
|
+
const agentUrls = params?.maxUrls || 10;
|
|
603
|
+
const isPro = params?.model === 'pro';
|
|
604
|
+
projected = Math.max(base, base + Math.ceil(agentUrls / 5) + (isPro ? 5 : 0));
|
|
605
|
+
note = `Lower-bound estimate. Scales with maxUrls (${agentUrls}).${isPro ? ' pro model adds deep-research cost.' : ''} External LLM billed separately.`;
|
|
606
|
+
break;
|
|
607
|
+
}
|
|
588
608
|
default:
|
|
589
609
|
note = 'Fixed cost per invocation.';
|
|
590
610
|
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent tool — NL prompt → autonomous search/navigate/extract → answer.
|
|
3
|
+
*
|
|
4
|
+
* Wraps AgentOrchestrator for MCP registration.
|
|
5
|
+
* Mirrors the setMcpServer pattern from extractStructured.js.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
import { AgentOrchestrator } from '../../core/AgentOrchestrator.js';
|
|
10
|
+
import { ElicitationHelper } from '../../core/ElicitationHelper.js';
|
|
11
|
+
import { getToolConfig } from '../../constants/config.js';
|
|
12
|
+
|
|
13
|
+
// Input contract for the agent tool. The numeric maxima mirror the hard caps
// named in the field descriptions (10 steps, 20 URLs).
export const AgentInputSchema = z.object({
  prompt: z
    .string()
    .min(1)
    .max(2000)
    .describe('Natural-language task or question'),

  urls: z
    .array(z.string().url())
    .max(20)
    .optional()
    .describe('Optional seed URLs to include (max 20)'),

  schema: z
    .record(z.any())
    .optional()
    .describe('Optional JSON schema for structured output'),

  model: z
    .enum(['default', 'pro'])
    .optional()
    .default('default')
    .describe('"default" = SamplingClient loop; "pro" = full ResearchOrchestrator'),

  maxSteps: z
    .number()
    .min(1)
    .max(10)
    .optional()
    .default(5)
    .describe('Max fetch iterations (hard cap: 10)'),

  maxUrls: z
    .number()
    .min(1)
    .max(20)
    .optional()
    .default(10)
    .describe('Max URLs to fetch (hard cap: 20)')
});
|
|
21
|
+
|
|
22
|
+
export class AgentTool {
  /**
   * Creates the orchestrator without an MCP server; one is attached later
   * through setMcpServer().
   *
   * @param {object} [options]
   * @param {object} [options.llmConfig] - forwarded to AgentOrchestrator as llmConfig
   */
  constructor(options = {}) {
    const searchConfig = getToolConfig('search_web') || {};
    const llmConfig = options.llmConfig || {};

    this._orchestrator = new AgentOrchestrator({ mcpServer: null, searchConfig, llmConfig });
    this._elicitation = new ElicitationHelper({});
  }

  /** Wire MCP server for SamplingClient + Elicitation (called from server.js). */
  setMcpServer(mcpServer) {
    this._orchestrator.setMcpServer(mcpServer);
    this._elicitation = new ElicitationHelper({ mcpServer });
  }

  /**
   * Validate the input against AgentInputSchema and run the orchestrator.
   * A 'pro' run first asks for confirmation through the elicitation helper;
   * when that is declined a cancelled result is returned and nothing runs.
   *
   * @param {object} params - AgentInputSchema-compatible input
   * @returns {Promise<object>}
   */
  async execute(params) {
    const { prompt, urls, schema, model, maxSteps, maxUrls } = AgentInputSchema.parse(params);

    // Request confirmation before a pro run (expensive)
    if (model === 'pro') {
      const confirmed = await this._elicitation.confirm(
        'agent tool: pro model uses ResearchOrchestrator and may incur significant costs.',
        { model: 'pro', maxUrls, note: 'External LLM API costs billed separately if keys are set.' }
      );
      if (!confirmed) {
        return { success: false, cancelled: true, reason: 'User cancelled pro agent run.' };
      }
    }

    return this._orchestrator.run({ prompt, urls, schema, model, maxSteps, maxUrls });
  }

  /** Tear down the wrapped orchestrator. */
  async destroy() {
    await this._orchestrator.destroy();
  }
}
|
|
70
|
+
|
|
71
|
+
export default AgentTool;
|
|
@@ -25,7 +25,7 @@ const BLOCK_ELEMENTS = new Set([
|
|
|
25
25
|
* @param {import('cheerio').CheerioAPI} $ - loaded cheerio instance
|
|
26
26
|
* @returns {string}
|
|
27
27
|
*/
|
|
28
|
-
function extractBlockText($) {
|
|
28
|
+
export function extractBlockText($) {
|
|
29
29
|
const parts = [];
|
|
30
30
|
|
|
31
31
|
function walk(node) {
|
|
@@ -52,6 +52,27 @@ function extractBlockText($) {
|
|
|
52
52
|
return parts.join('').replace(/\n{3,}/g, '\n\n').trim();
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
/**
 * Turn raw HTML into GFM markdown: Readability picks the main article, then
 * htmlToMarkdown converts it.
 *
 * If Readability yields no article, or building the DOM / parsing throws, the
 * untouched input HTML is converted instead.
 *
 * @param {string} html - raw HTML
 * @param {string} pageUrl - URL of the page (handed to JSDOM as the document URL)
 * @returns {string} markdown
 */
export function readabilityToMarkdown(html, pageUrl) {
  const pickArticleHtml = () => {
    try {
      const { window } = new JSDOM(html, { url: pageUrl });
      const article = new Readability(window.document).parse();
      return article ? article.content : html;
    } catch {
      return html;
    }
  };

  return htmlToMarkdown(pickArticleHtml());
}
|
|
75
|
+
|
|
55
76
|
/**
|
|
56
77
|
* @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
|
|
57
78
|
*/
|
|
@@ -76,16 +97,7 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles, o
|
|
|
76
97
|
|
|
77
98
|
if (output_format === 'markdown') {
|
|
78
99
|
// Run Readability first to get main content, then convert to GFM markdown
|
|
79
|
-
|
|
80
|
-
try {
|
|
81
|
-
const dom = new JSDOM(html, { url: response.url });
|
|
82
|
-
const reader = new Readability(dom.window.document);
|
|
83
|
-
const article = reader.parse();
|
|
84
|
-
articleHtml = article ? article.content : $.html('body');
|
|
85
|
-
} catch {
|
|
86
|
-
articleHtml = $.html('body');
|
|
87
|
-
}
|
|
88
|
-
result.markdown = htmlToMarkdown(articleHtml);
|
|
100
|
+
result.markdown = readabilityToMarkdown(html, response.url);
|
|
89
101
|
result.output_format = 'markdown';
|
|
90
102
|
const plainText = result.markdown.replace(/[#*`_\[\]]/g, '').replace(/\s+/g, ' ').trim();
|
|
91
103
|
result.word_count = plainText.split(/\s+/).filter(w => w.length > 0).length;
|
|
@@ -4,6 +4,14 @@ import { DomainFilter } from '../../utils/domainFilter.js';
|
|
|
4
4
|
import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
5
|
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
6
6
|
import { SitemapParser } from '../../utils/sitemapParser.js';
|
|
7
|
+
import { ResultRanker } from '../search/ranking/ResultRanker.js';
|
|
8
|
+
|
|
9
|
+
// Module-wide ResultRanker, created on first request and then reused
// (avoids creating a CacheManager timer per request).
let _ranker = null;

/** Return the shared ResultRanker, constructing it with caching disabled on first call. */
function getRanker() {
  if (_ranker) return _ranker;
  _ranker = new ResultRanker({ cacheEnabled: false });
  return _ranker;
}
|
|
7
15
|
|
|
8
16
|
const MapSiteSchema = z.object({
|
|
9
17
|
url: z.string().url(),
|
|
@@ -18,7 +26,8 @@ const MapSiteSchema = z.object({
|
|
|
18
26
|
include_patterns: z.array(z.string()).optional().default([]),
|
|
19
27
|
exclude_patterns: z.array(z.string()).optional().default([])
|
|
20
28
|
}).optional(),
|
|
21
|
-
import_filter_config: z.string().optional() // JSON string of exported config
|
|
29
|
+
import_filter_config: z.string().optional(), // JSON string of exported config
|
|
30
|
+
search: z.string().optional() // when set, rank URLs by relevance and emit ranked_urls
|
|
22
31
|
});
|
|
23
32
|
|
|
24
33
|
export class MapSiteTool {
|
|
@@ -120,6 +129,25 @@ export class MapSiteTool {
|
|
|
120
129
|
filter_stats: domainFilter ? domainFilter.getStats() : null
|
|
121
130
|
};
|
|
122
131
|
|
|
132
|
+
// Optional: rank URLs by relevance to a search string
|
|
133
|
+
if (validated.search) {
|
|
134
|
+
try {
|
|
135
|
+
const rankerInput = urlArray.map(url => {
|
|
136
|
+
let title = url;
|
|
137
|
+
try {
|
|
138
|
+
const { pathname } = new URL(url);
|
|
139
|
+
title = decodeURIComponent(pathname).replace(/[-_/]/g, ' ').trim();
|
|
140
|
+
} catch { /* keep raw url */ }
|
|
141
|
+
return { link: url, title, snippet: '' };
|
|
142
|
+
});
|
|
143
|
+
const ranked = await getRanker().rankResults(rankerInput, validated.search);
|
|
144
|
+
result.ranked_urls = ranked.map(r => ({ url: r.link, score: r.finalScore ?? 0 }));
|
|
145
|
+
} catch {
|
|
146
|
+
// ranking is best-effort; don't fail the whole call
|
|
147
|
+
result.ranked_urls = urlArray.map(u => ({ url: u, score: 0 }));
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
123
151
|
// Store in cache before returning
|
|
124
152
|
if (this.cache) {
|
|
125
153
|
const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* unifiedScrape — single-fetch, multi-format scraping tool.
|
|
3
|
+
*
|
|
4
|
+
* One call, one fetch. formats[] drives what is returned.
|
|
5
|
+
* Mirrors the output shape of ScrapeWithActionsTool.generateFormats():
|
|
6
|
+
* content.html, content.rawHtml, content.text, content.markdown,
|
|
7
|
+
* content.links, content.metadata, content.screenshots, content.json
|
|
8
|
+
*
|
|
9
|
+
* onlyMainContent maps to Readability boilerplate removal (same as extractContent).
|
|
10
|
+
* Partial success: per-format warnings[] never fail the whole call.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { z } from 'zod';
|
|
14
|
+
import { JSDOM } from 'jsdom';
|
|
15
|
+
import { Readability } from '@mozilla/readability';
|
|
16
|
+
import { fetchAndParse } from '../extract/_fetchAndParse.js';
|
|
17
|
+
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
|
|
18
|
+
import { extractBlockText, readabilityToMarkdown } from '../basic/extractText.js';
|
|
19
|
+
|
|
20
|
+
// ── Schema ────────────────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
// Object form of a format entry: LLM-backed structured extraction, optionally
// guided by a JSON schema and/or a prompt.
const JsonFormatSchema = z.object({
  type: z.literal('json'),
  schema: z.record(z.any()).optional(),
  prompt: z.string().optional()
});

// String form of a format entry.
const STRING_FORMATS = ['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot'];

// A format is either one of the string names or the json object form.
const FormatSchema = z.union([z.enum(STRING_FORMATS), JsonFormatSchema]);

export const UnifiedScrapeSchema = z.object({
  url: z.string().url(),

  // At least one format; markdown when the caller does not say.
  formats: z.array(FormatSchema).min(1).default(['markdown']),

  onlyMainContent: z.boolean().optional().default(true),

  // Pass-through to fetchAndParse
  timeoutMs: z.number().min(1000).max(60000).optional().default(15000)
});
|
|
40
|
+
|
|
41
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
/**
 * Collect the unique hyperlinks of a page from a loaded cheerio `$`.
 *
 * Every href is resolved against `pageUrl` and classified by comparing the
 * resolved origin with the page origin. This also covers protocol-relative
 * hrefs ("//cdn.other.com/x") and upper-case schemes ("HTTPS://..."), which
 * must not be reported as internal just because they lack a lower-case
 * "http(s)://" prefix. As a consequence non-web schemes such as "mailto:"
 * are reported as external.
 *
 * Skipped: empty hrefs, pure fragments ("#..."), `javascript:` URLs in any
 * letter case, hrefs that cannot be parsed, and repeats of an already
 * collected URL. When `pageUrl` itself cannot be parsed, relative hrefs are
 * skipped and every absolute href counts as external.
 *
 * @param {Function} $ - loaded cheerio instance
 * @param {string} pageUrl - URL the document was fetched from
 * @returns {{links: Array<{href: string, text: string, is_external: boolean, original_href: string}>,
 *            total_count: number, internal_count: number, external_count: number}}
 */
function extractLinksFromDom($, pageUrl) {
  let pageOrigin = '';
  let base; // stays undefined when pageUrl is unusable, so absolute hrefs still parse
  try {
    pageOrigin = new URL(pageUrl).origin;
    base = pageUrl;
  } catch { /* ignore */ }

  const links = [];
  const seen = new Set();
  let externalCount = 0;

  $('a[href]').each((_, el) => {
    const anchor = $(el);
    const href = anchor.attr('href');
    if (!href || href.startsWith('#')) return;

    let resolved;
    try {
      resolved = new URL(href, base);
    } catch {
      return; // skip invalid
    }
    if (resolved.protocol === 'javascript:') return;

    // Lower-case http(s) hrefs are reported exactly as written; everything
    // else is reported in its resolved, absolute form.
    const writtenAsAbsolute = href.startsWith('http://') || href.startsWith('https://');
    const absoluteUrl = writtenAsAbsolute ? href : resolved.toString();
    if (seen.has(absoluteUrl)) return;
    seen.add(absoluteUrl);

    const isExternal = resolved.origin !== pageOrigin;
    if (isExternal) externalCount++;

    links.push({
      href: absoluteUrl,
      text: anchor.text().trim(),
      is_external: isExternal,
      original_href: href
    });
  });

  return {
    links,
    total_count: links.length,
    internal_count: links.length - externalCount,
    external_count: externalCount
  };
}
|
|
82
|
+
|
|
83
|
+
/**
 * Extract page metadata from a loaded cheerio `$`: title, description,
 * keywords, canonical URL, author/robots/viewport meta, Open Graph and
 * Twitter tags, JSON-LD blocks and microdata items.
 *
 * Microdata properties are gathered in a Map and converted with
 * Object.fromEntries, so an `itemprop` whose name collides with an
 * Object.prototype member ("constructor", "toString", "__proto__") is stored
 * like any other property instead of throwing and costing the caller the
 * whole metadata result.
 *
 * @param {Function} $ - loaded cheerio instance
 * @param {string} pageUrl - echoed back as `url`
 * @returns {object}
 */
function extractMetadataFromDom($, pageUrl) {
  const metaContent = (selector) => $(selector).attr('content');

  // JSON-LD: blocks that are empty or not valid JSON are skipped
  const jsonLd = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    try {
      const raw = $(el).html();
      if (raw) jsonLd.push(JSON.parse(raw));
    } catch { /* skip */ }
  });

  // Microdata: one item per [itemscope], with every descendant [itemprop]
  // (properties of nested scopes are therefore also listed on their ancestors)
  const microdata = [];
  $('[itemscope]').each((_, el) => {
    const $item = $(el);
    const collected = new Map();

    $item.find('[itemprop]').each((_i, prop) => {
      const $prop = $(prop);
      const name = $prop.attr('itemprop');
      if (!name) return;

      // Where the value lives depends on the element type.
      let value;
      switch (($prop.get(0).tagName || '').toLowerCase()) {
        case 'meta':
          value = $prop.attr('content');
          break;
        case 'a':
        case 'link':
          value = $prop.attr('href');
          break;
        case 'img':
          value = $prop.attr('src');
          break;
        case 'time':
          value = $prop.attr('datetime') || $prop.text().trim();
          break;
        default:
          value = $prop.text().trim();
      }
      if (!value) return;

      if (!collected.has(name)) collected.set(name, []);
      collected.get(name).push(value);
    });

    microdata.push({
      type: $item.attr('itemtype') || null,
      properties: Object.fromEntries(collected)
    });
  });

  const title =
    metaContent('meta[property="og:title"]') ||
    $('title').text().trim() ||
    $('h1').first().text().trim() || '';

  const ogTags = {};
  $('meta[property^="og:"]').each((_, el) => {
    const property = $(el).attr('property');
    const content = $(el).attr('content');
    if (property && content) ogTags[property.replace('og:', '')] = content;
  });

  const twitterTags = {};
  $('meta[name^="twitter:"]').each((_, el) => {
    const name = $(el).attr('name');
    const content = $(el).attr('content');
    if (name && content) twitterTags[name.replace('twitter:', '')] = content;
  });

  return {
    title,
    description: metaContent('meta[name="description"]') || metaContent('meta[property="og:description"]') || '',
    keywords: (metaContent('meta[name="keywords"]') || '').split(',').map(k => k.trim()).filter(Boolean),
    canonical_url: $('link[rel="canonical"]').attr('href') || '',
    author: metaContent('meta[name="author"]') || '',
    robots: metaContent('meta[name="robots"]') || '',
    viewport: metaContent('meta[name="viewport"]') || '',
    og_tags: ogTags,
    twitter_tags: twitterTags,
    json_ld: jsonLd,
    microdata,
    url: pageUrl
  };
}
|
|
151
|
+
|
|
152
|
+
// ── Tool class ────────────────────────────────────────────────────────────────
|
|
153
|
+
|
|
154
|
+
export class UnifiedScrapeTool {
  /**
   * @param {object} [options]
   * @param {object} [options.llmConfig] - passed to ExtractWithLlm when a json format is requested
   */
  constructor(options = {}) {
    this._extractWithLlm = null;
    this._extractWithLlmConfig = options.llmConfig || {};
  }

  /** Lazy-load ExtractWithLlm to avoid pulling in heavy deps unless needed. */
  async _getExtractWithLlm() {
    if (!this._extractWithLlm) {
      const { ExtractWithLlm } = await import('../extract/extractWithLlm.js');
      this._extractWithLlm = new ExtractWithLlm(this._extractWithLlmConfig);
    }
    return this._extractWithLlm;
  }

  /**
   * Execute a unified scrape: one fetch, then one output per requested format.
   *
   * A failing format never fails the call; it yields an empty value for that
   * format plus an entry in `warnings`. Formats are independent of each
   * other: none of them mutates the shared parsed document, so the result
   * does not depend on the order of `formats`.
   *
   * @param {object} params - UnifiedScrapeSchema-compatible input
   * @returns {Promise<{success: boolean, url: string, content: object, warnings: (string[]|undefined)}>}
   * @throws {Error} when the input is invalid or the page cannot be fetched
   */
  async execute(params) {
    const validated = UnifiedScrapeSchema.parse(params);
    const { url, formats, onlyMainContent, timeoutMs } = validated;

    // Single fetch
    let html, $, finalUrl;
    try {
      ({ html, $, finalUrl } = await fetchAndParse(url, {
        timeoutMs,
        stripTags: [] // we handle boilerplate ourselves
      }));
    } catch (err) {
      throw new Error(`scrape: fetch failed for ${url}: ${err.message}`, { cause: err });
    }

    // For onlyMainContent: extract main-content html via Readability once and
    // share it between the html, markdown, text and json formats.
    let mainHtml = null;
    const getMainHtml = () => {
      if (mainHtml !== null) return mainHtml;
      try {
        const dom = new JSDOM(html, { url: finalUrl });
        const article = new Readability(dom.window.document).parse();
        mainHtml = article ? article.content : html;
      } catch {
        mainHtml = html;
      }
      return mainHtml;
    };

    const content = {};
    const warnings = [];

    for (const fmt of formats) {
      // JSON format object
      if (fmt && typeof fmt === 'object' && fmt.type === 'json') {
        try {
          const extractWithLlm = await this._getExtractWithLlm();
          const text = onlyMainContent
            ? htmlToMarkdown(getMainHtml())
            : $('body').text().replace(/\s+/g, ' ').trim();
          const result = await extractWithLlm.execute({
            content: text,
            prompt: fmt.prompt || 'Extract structured data from this page content.',
            schema: fmt.schema,
            provider: 'auto'
          });
          content.json = result.success ? result.data : { error: result.error };
          if (!result.success) {
            warnings.push(`json: extraction failed — ${result.error}`);
          }
        } catch (err) {
          content.json = { error: err.message };
          warnings.push(`json: ${err.message}`);
        }
        continue;
      }

      // String formats
      switch (fmt) {
        case 'markdown':
          try {
            // Reuses the cached Readability result instead of parsing the page again.
            content.markdown = onlyMainContent
              ? htmlToMarkdown(getMainHtml())
              : htmlToMarkdown($.html('body') || html);
          } catch (err) {
            content.markdown = '';
            warnings.push(`markdown: ${err.message}`);
          }
          break;

        case 'html':
          try {
            content.html = onlyMainContent ? getMainHtml() : ($.html('body') || html);
          } catch (err) {
            content.html = '';
            warnings.push(`html: ${err.message}`);
          }
          break;

        case 'rawHtml':
          content.rawHtml = html;
          break;

        case 'text':
          try {
            // Script/style removal happens on a private copy. Stripping the
            // shared `$` would drop JSON-LD from a later `metadata` format
            // and change later `html` / `markdown` output.
            const { load } = await import('cheerio');
            const $text = load(onlyMainContent ? getMainHtml() : $.html());
            $text('script, style').remove();
            content.text = extractBlockText($text);
          } catch (err) {
            content.text = '';
            warnings.push(`text: ${err.message}`);
          }
          break;

        case 'links':
          try {
            content.links = extractLinksFromDom($, finalUrl);
          } catch (err) {
            content.links = { links: [], total_count: 0, internal_count: 0, external_count: 0 };
            warnings.push(`links: ${err.message}`);
          }
          break;

        case 'metadata':
          try {
            content.metadata = extractMetadataFromDom($, finalUrl);
          } catch (err) {
            content.metadata = {};
            warnings.push(`metadata: ${err.message}`);
          }
          break;

        case 'screenshot':
          // Screenshot requires a browser; not available in the basic scrape path.
          content.screenshots = [];
          warnings.push('screenshot: browser screenshots are not available in the scrape tool; use scrape_with_actions for screenshots');
          break;

        default:
          warnings.push(`unknown format: ${String(fmt)}`);
      }
    }

    return {
      success: true,
      url: finalUrl,
      content,
      warnings: warnings.length > 0 ? warnings : undefined
    };
  }
}
|
|
313
|
+
|
|
314
|
+
export default UnifiedScrapeTool;
|