crawlforge-mcp-server 4.7.2 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/package.json +2 -1
- package/server.js +42 -9
- package/src/cli/commands/init.js +13 -2
- package/src/cli/commands/install-skills.js +10 -1
- package/src/cli/commands/monitor.js +81 -0
- package/src/cli/commands/uninstall-skills.js +10 -1
- package/src/core/ActionExecutor.js +51 -9
- package/src/core/ElicitationHelper.js +18 -5
- package/src/core/LLMsTxtAnalyzer.js +2 -1
- package/src/core/MonitorScheduler.js +281 -0
- package/src/core/MonitorStore.js +79 -0
- package/src/core/ResearchOrchestrator.js +2 -1
- package/src/core/crawlers/BFSCrawler.js +2 -1
- package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
- package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
- package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
- package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
- package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
- package/src/skills/installer.js +186 -34
- package/src/tools/advanced/batchScrape/worker.js +8 -2
- package/src/tools/basic/_fetch.js +14 -1
- package/src/tools/crawl/_sessionContext.js +3 -1
- package/src/tools/extract/_fetchAndParse.js +2 -1
- package/src/tools/extract/extractContent.js +2 -1
- package/src/tools/extract/processDocument.js +2 -1
- package/src/tools/scrape/_brandingExtractor.js +378 -0
- package/src/tools/scrape/unifiedScrape.js +66 -6
- package/src/tools/templates/ScrapeTemplateTool.js +2 -1
- package/src/tools/tracking/trackChanges/differ.js +3 -1
- package/src/tools/tracking/trackChanges/index.js +74 -21
- package/src/tools/tracking/trackChanges/schema.js +7 -2
- package/src/utils/hostRateLimiter.js +46 -0
- package/src/utils/robotsChecker.js +2 -1
- package/src/utils/sitemapParser.js +2 -1
- package/src/utils/ssrfGuard.js +161 -0
- package/src/utils/ssrfProtection.js +6 -9
- package/src/skills/crawlforge-cli.md +0 -157
- package/src/skills/crawlforge-mcp.md +0 -80
- package/src/skills/crawlforge-research.md +0 -104
- package/src/skills/crawlforge-stealth.md +0 -98
package/CLAUDE.md
CHANGED
|
@@ -62,7 +62,7 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
|
|
|
62
62
|
|
|
63
63
|
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 26 web scraping, crawling, and content processing tools (5 inline + 21 advanced).
|
|
64
64
|
|
|
65
|
-
**Current Version:** 4.7.2
|
|
65
|
+
**Current Version:** 4.8.0
|
|
66
66
|
|
|
67
67
|
## Development Commands
|
|
68
68
|
|
|
@@ -241,7 +241,7 @@ When adding a new tool to server.js:
|
|
|
241
241
|
|
|
242
242
|
Key mechanisms for security-conscious future sessions:
|
|
243
243
|
|
|
244
|
-
- **SSRF** (`src/utils/ssrfProtection.js`):
|
|
244
|
+
- **SSRF** (`src/utils/ssrfGuard.js` enforcing `src/utils/ssrfProtection.js`): As of v4.8.0 SSRF is actually enforced on the live scraping fetch path (previously `ssrfProtection.js` was unwired). `ssrfGuard` injects an undici dispatcher whose connect-time `lookup` validates every connection — initial request and each redirect hop — and pins the validated IP (closing the DNS-rebinding TOCTOU window). Stage 1 (default) blocks loopback, link-local/cloud-metadata (169.254.169.254), and 0.0.0.0; `SSRF_STRICT=true` adds full RFC1918/ULA/etc. enforcement. Kill switch `SSRF_PROTECTION_ENABLED=false`; `ALLOWED_DOMAINS` allowlist bypass. Wired into every read-scrape site (`_fetch.js`, `_fetchAndParse.js`, `batchScrape/worker.js`, `mapSite.js`, `BFSCrawler.js`, extract/template/session/research/llms-txt/robots/sitemap/track-changes fetches) via `ssrfGuard()`/`safeFetch()`. http/https only; dangerous ports + path-traversal still rejected by `ssrfProtection.js`.
|
|
245
245
|
- **endpointGuard** (`src/core/endpointGuard.js`): Hard allow-list of `{crawlforge.dev, www.crawlforge.dev, api.crawlforge.dev}` for the server's own backend calls; HTTPS required; fail-closed. Localhost only in creator mode (v3.0.18).
|
|
246
246
|
- **Action allowlist** (`src/core/ActionExecutor.js`): `scrape_with_actions` accepts only 7 action types: `wait`, `click`, `type`, `press`, `scroll`, `screenshot`, `executeJavaScript`. `executeJavaScript` throws unless `ALLOW_JAVASCRIPT_EXECUTION=true` is set at deploy time (off by default).
|
|
247
247
|
- **Elicitation** (`src/core/ElicitationHelper.js`): User confirmation requested for `deep_research` (>50 URLs), `batch_scrape` (sync, >25 URLs), `crawl_deep` (projected >500 pages), `extract_structured` (schema has >3 required fields, no LLM configured), and credit-low situations. Fail-open if client does not support elicitation.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.7.2",
|
|
3
|
+
"version": "4.8.0",
|
|
4
4
|
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
5
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
6
6
|
"main": "server.js",
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
"test:tools": "node test-tools.js",
|
|
23
23
|
"test:real-world": "node test-real-world.js",
|
|
24
24
|
"test:all": "bash run-all-tests.sh",
|
|
25
|
+
"skills:gen": "node scripts/generate-skill-md.mjs",
|
|
25
26
|
"postinstall": "echo '\nCrawlForge MCP Server installed!\n\nQuick start: run \"npx crawlforge init\" to configure your API key, install skills, and register the MCP server with your AI clients.\nOr run \"npx crawlforge-setup\" to configure your API key only.\n'",
|
|
26
27
|
"docker:build": "docker build -t crawlforge .",
|
|
27
28
|
"docker:dev": "docker-compose up crawlforge-dev",
|
package/server.js
CHANGED
|
@@ -154,7 +154,7 @@ const deepResearchTool = new DeepResearchTool();
|
|
|
154
154
|
const trackChangesTool = new TrackChangesTool();
|
|
155
155
|
const generateLLMsTxtTool = new GenerateLLMsTxtTool();
|
|
156
156
|
const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
|
|
157
|
-
const unifiedScrapeTool = new UnifiedScrapeTool(); // D4 D1
|
|
157
|
+
const unifiedScrapeTool = new UnifiedScrapeTool({ actionExecutor: scrapeWithActionsTool.actionExecutor }); // D4 D1 (+v4.8 screenshot reuses the shared browser pool)
|
|
158
158
|
const agentTool = new AgentTool(); // D4 D2
|
|
159
159
|
const stealthBrowserManager = new StealthBrowserManager();
|
|
160
160
|
const localizationManager = new LocalizationManager();
|
|
@@ -177,6 +177,7 @@ batchScrapeTool.setMcpServer(server);
|
|
|
177
177
|
crawlDeepTool.setMcpServer(server);
|
|
178
178
|
extractStructuredTool.setMcpServer(server);
|
|
179
179
|
agentTool.setMcpServer(server); // D4 D2: SamplingClient + Elicitation
|
|
180
|
+
trackChangesTool.setMcpServer(server); // v4.8: SamplingClient for scheduled-monitor goal judging
|
|
180
181
|
AuthManager.setElicitation(elicitation);
|
|
181
182
|
|
|
182
183
|
// ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
|
|
@@ -813,12 +814,12 @@ server.registerTool("deep_research", {
|
|
|
813
814
|
|
|
814
815
|
// Tool: scrape (D4 D1 — unified multi-format single-fetch)
|
|
815
816
|
server.registerTool("scrape", {
|
|
816
|
-
description: "Use this when you need multiple content formats from a single URL in one call — e.g. markdown + links + metadata together. One fetch, no N-request fan-out. Formats: \"markdown\", \"html\", \"rawHtml\", \"text\", \"links\", \"metadata\", or {type:\"json\",schema,prompt} for LLM-structured extraction. onlyMainContent:true (default) strips boilerplate via Readability. Partial success: per-format warnings never fail the whole call. Example: scrape({url:\"https://example.com\", formats:[\"markdown\",\"links\",\"
|
|
817
|
+
description: "Use this when you need multiple content formats from a single URL in one call — e.g. markdown + links + metadata together. One fetch, no N-request fan-out. Formats: \"markdown\", \"html\", \"rawHtml\", \"text\", \"links\", \"metadata\", \"branding\" (static design tokens: colors, fonts, logo), \"screenshot\" (renders in a browser, returns crawlforge://screenshot/{id} resources), or {type:\"json\",schema,prompt} for LLM-structured extraction. onlyMainContent:true (default) strips boilerplate via Readability. Partial success: per-format warnings never fail the whole call. Example: scrape({url:\"https://example.com\", formats:[\"markdown\",\"links\",\"branding\"]})",
|
|
817
818
|
annotations: { title: "Scrape (Multi-Format)", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
818
819
|
inputSchema: {
|
|
819
820
|
url: z.string().url().describe("The URL to scrape"),
|
|
820
821
|
formats: z.array(z.union([
|
|
821
|
-
z.enum(["markdown", "html", "rawHtml", "text", "links", "metadata", "screenshot"]),
|
|
822
|
+
z.enum(["markdown", "html", "rawHtml", "text", "links", "metadata", "screenshot", "branding"]),
|
|
822
823
|
z.object({
|
|
823
824
|
type: z.literal("json"),
|
|
824
825
|
schema: z.record(z.any()).optional().describe("JSON schema for extraction"),
|
|
@@ -826,11 +827,31 @@ server.registerTool("scrape", {
|
|
|
826
827
|
})
|
|
827
828
|
])).min(1).optional().default(["markdown"]).describe("Formats to return (default: [\"markdown\"])"),
|
|
828
829
|
onlyMainContent: z.boolean().optional().default(true).describe("Strip boilerplate via Readability (default: true)"),
|
|
829
|
-
timeoutMs: z.number().min(1000).max(60000).optional().default(15000).describe("Fetch timeout in ms")
|
|
830
|
+
timeoutMs: z.number().min(1000).max(60000).optional().default(15000).describe("Fetch timeout in ms"),
|
|
831
|
+
brandingOptions: z.object({
|
|
832
|
+
fetchLinkedCss: z.boolean().optional().default(true).describe("Fetch linked stylesheets for richer color/font extraction"),
|
|
833
|
+
maxStylesheets: z.number().min(0).max(20).optional().default(10).describe("Max linked stylesheets to fetch")
|
|
834
|
+
}).optional().describe("Options for the \"branding\" format"),
|
|
835
|
+
screenshotOptions: z.object({
|
|
836
|
+
fullPage: z.boolean().optional().default(false).describe("Capture the full scrollable page"),
|
|
837
|
+
format: z.enum(["png", "jpeg"]).optional().default("png"),
|
|
838
|
+
quality: z.number().min(0).max(100).optional().describe("JPEG quality (jpeg only)")
|
|
839
|
+
}).optional().describe("Options for the \"screenshot\" format")
|
|
830
840
|
}
|
|
831
841
|
}, withAuth("scrape", async (params) => {
|
|
832
842
|
try {
|
|
833
843
|
const result = await unifiedScrapeTool.execute(params);
|
|
844
|
+
// Publish any captured screenshots as crawlforge://screenshot/{actionId}
|
|
845
|
+
// resources and annotate each with its URI (mirrors scrape_with_actions).
|
|
846
|
+
if (Array.isArray(result?.content?.screenshots)) {
|
|
847
|
+
result.content.screenshots = result.content.screenshots.map((shot) => {
|
|
848
|
+
if (shot?.actionId && shot?.data) {
|
|
849
|
+
resourceRegistry.storeScreenshot(shot.actionId, shot.data);
|
|
850
|
+
return { ...shot, resourceUri: `crawlforge://screenshot/${shot.actionId}` };
|
|
851
|
+
}
|
|
852
|
+
return shot;
|
|
853
|
+
});
|
|
854
|
+
}
|
|
834
855
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
835
856
|
} catch (error) {
|
|
836
857
|
return { content: [{ type: "text", text: `Scrape failed: ${error.message}` }], isError: true };
|
|
@@ -863,10 +884,10 @@ server.registerTool("track_changes", {
|
|
|
863
884
|
description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
|
|
864
885
|
annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
865
886
|
inputSchema: {
|
|
866
|
-
url: z.string().url().describe("The URL to track changes for"),
|
|
887
|
+
url: z.string().url().optional().describe("The URL to track changes for (optional for list_scheduled_monitors)"),
|
|
867
888
|
operation: z.enum([
|
|
868
889
|
'create_baseline', 'compare', 'monitor', 'get_history', 'get_stats',
|
|
869
|
-
'create_scheduled_monitor', 'stop_scheduled_monitor', 'get_dashboard',
|
|
890
|
+
'create_scheduled_monitor', 'stop_scheduled_monitor', 'list_scheduled_monitors', 'get_dashboard',
|
|
870
891
|
'export_history', 'create_alert_rule', 'generate_trend_report', 'get_monitoring_templates'
|
|
871
892
|
]).default('compare').describe("Tracking operation to perform"),
|
|
872
893
|
content: z.string().optional().describe("Content to compare against baseline"),
|
|
@@ -930,10 +951,14 @@ server.registerTool("track_changes", {
|
|
|
930
951
|
}).optional()
|
|
931
952
|
}).optional().describe("Notification configuration for webhooks and Slack"),
|
|
932
953
|
scheduledMonitorOptions: z.object({
|
|
933
|
-
schedule: z.string().optional(),
|
|
954
|
+
schedule: z.string().optional().describe("Optional cron expression (power users)"),
|
|
934
955
|
templateId: z.string().optional(),
|
|
935
|
-
enabled: z.boolean().default(true)
|
|
936
|
-
|
|
956
|
+
enabled: z.boolean().default(true),
|
|
957
|
+
interval: z.number().min(60000).optional().describe("Polling interval in ms (default 1h)"),
|
|
958
|
+
goal: z.string().optional().describe("Plain-English alert goal; an LLM judges whether a change matches (degrades to threshold if no LLM)"),
|
|
959
|
+
monitorId: z.string().optional().describe("Monitor id for stop_scheduled_monitor"),
|
|
960
|
+
notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).optional()
|
|
961
|
+
}).optional().describe("Scheduled monitoring: recurring compare + notify, optional plain-English goal"),
|
|
937
962
|
alertRuleOptions: z.object({
|
|
938
963
|
ruleId: z.string().optional(),
|
|
939
964
|
condition: z.string().optional(),
|
|
@@ -1271,6 +1296,14 @@ async function runServer() {
|
|
|
1271
1296
|
await connectStdio(server);
|
|
1272
1297
|
}
|
|
1273
1298
|
|
|
1299
|
+
// v4.8: start the scheduled-monitor engine (loads persisted monitors, catches
|
|
1300
|
+
// up any due runs). Best-effort — a scheduler failure must not block startup.
|
|
1301
|
+
try {
|
|
1302
|
+
await trackChangesTool.startScheduler();
|
|
1303
|
+
} catch (err) {
|
|
1304
|
+
console.error('Scheduled-monitor engine failed to start:', err.message);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1274
1307
|
console.error(`Environment: ${config.server.nodeEnv}`);
|
|
1275
1308
|
console.error("Search enabled: true (via CrawlForge proxy)");
|
|
1276
1309
|
|
package/src/cli/commands/init.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
|
|
5
5
|
import { join } from 'node:path';
|
|
6
|
-
import { install } from '../../skills/installer.js';
|
|
6
|
+
import { install, installHook } from '../../skills/installer.js';
|
|
7
7
|
|
|
8
8
|
const HOME = process.env.HOME || process.env.USERPROFILE || '';
|
|
9
9
|
|
|
@@ -61,6 +61,7 @@ export function register(program) {
|
|
|
61
61
|
.description('Set up CrawlForge: verify API key, install skills, and register the MCP server with your AI clients')
|
|
62
62
|
.option('--all', 'Install skills to all targets and register all detected client configs')
|
|
63
63
|
.option('--client <name>', 'Target client to register: claude-code, claude-desktop, or cursor')
|
|
64
|
+
.option('--with-hook', 'Add an opt-in UserPromptSubmit reminder to boost skill auto-activation')
|
|
64
65
|
.option('--yes', 'Non-interactive — assume yes to all prompts')
|
|
65
66
|
.action(async (opts) => {
|
|
66
67
|
const out = (msg) => process.stderr.write(msg + '\n');
|
|
@@ -80,7 +81,7 @@ export function register(program) {
|
|
|
80
81
|
try {
|
|
81
82
|
const results = await install({ target: skillTarget, force: false, cwd: process.cwd() });
|
|
82
83
|
if (results.installed.length > 0) {
|
|
83
|
-
out('Skills installed: ' + results.installed.length + '
|
|
84
|
+
out('Skills installed: ' + results.installed.length + ' skill(s)');
|
|
84
85
|
} else {
|
|
85
86
|
out('Skills: already up to date (use crawlforge install-skills --force to overwrite)');
|
|
86
87
|
}
|
|
@@ -88,6 +89,16 @@ export function register(program) {
|
|
|
88
89
|
out('Warning: skill install failed — ' + err.message);
|
|
89
90
|
}
|
|
90
91
|
|
|
92
|
+
// 2b. Optional forced-eval hook (opt-in)
|
|
93
|
+
if (opts.withHook) {
|
|
94
|
+
try {
|
|
95
|
+
const hook = installHook();
|
|
96
|
+
out(hook.added ? 'Forced-eval hook added: ' + hook.path : 'Forced-eval hook already present');
|
|
97
|
+
} catch (err) {
|
|
98
|
+
out('Warning: could not add forced-eval hook — ' + err.message);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
91
102
|
// 3. MCP stanza merge
|
|
92
103
|
const clientFilter = opts.client || (opts.all ? undefined : 'claude-code');
|
|
93
104
|
const targets = resolveClientPaths(clientFilter);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* install-skills command -- install CrawlForge skill files into AI coding tools.
|
|
3
3
|
*/
|
|
4
|
-
import { install } from '../../skills/installer.js';
|
|
4
|
+
import { install, installHook } from '../../skills/installer.js';
|
|
5
5
|
|
|
6
6
|
export function register(program) {
|
|
7
7
|
program
|
|
@@ -10,6 +10,7 @@ export function register(program) {
|
|
|
10
10
|
.option('--target <target>', 'Target: claude-code, cursor, vscode, or all', 'all')
|
|
11
11
|
.option('--force', 'Overwrite existing skill files')
|
|
12
12
|
.option('--dry-run', 'Show what would be installed without writing files')
|
|
13
|
+
.option('--with-hook', 'Also add an opt-in UserPromptSubmit reminder to boost skill auto-activation')
|
|
13
14
|
.action(async (opts) => {
|
|
14
15
|
try {
|
|
15
16
|
const results = await install({
|
|
@@ -19,6 +20,14 @@ export function register(program) {
|
|
|
19
20
|
cwd: process.cwd()
|
|
20
21
|
});
|
|
21
22
|
|
|
23
|
+
if (opts.withHook && !opts.dryRun) {
|
|
24
|
+
const hook = installHook();
|
|
25
|
+
process.stdout.write(
|
|
26
|
+
(hook.added ? 'Added forced-eval hook: ' : 'Forced-eval hook already present: ') +
|
|
27
|
+
hook.path + '\n'
|
|
28
|
+
);
|
|
29
|
+
}
|
|
30
|
+
|
|
22
31
|
if (opts.dryRun) {
|
|
23
32
|
process.stdout.write('Dry run -- would install to:\n');
|
|
24
33
|
results.paths.forEach(p => process.stdout.write(' ' + p + '\n'));
|
|
@@ -46,4 +46,85 @@ export function register(program) {
|
|
|
46
46
|
// monitor runs continuously — do not auto-exit after the first result.
|
|
47
47
|
await runTool(wrapperTool, params, cliFlags, { exitOnSuccess: false });
|
|
48
48
|
});
|
|
49
|
+
|
|
50
|
+
// ── Scheduled monitors (persisted; fire in-process while the server runs, or
|
|
51
|
+
// via `monitor:run-due` from system cron for guaranteed firing) ──────────
|
|
52
|
+
|
|
53
|
+
const emit = (obj) => process.stdout.write(JSON.stringify(obj, null, 2) + '\n');
|
|
54
|
+
|
|
55
|
+
program
|
|
56
|
+
.command('monitor:create <url>')
|
|
57
|
+
.description('Create a persisted scheduled monitor (optionally with a plain-English alert goal)')
|
|
58
|
+
.option('--every <seconds>', 'Polling interval in seconds', '3600')
|
|
59
|
+
.option('--goal <text>', 'Plain-English alert goal (LLM-judged; degrades to threshold if no LLM)')
|
|
60
|
+
.option('--webhook <url>', 'Webhook URL to notify on meaningful changes')
|
|
61
|
+
.option('--threshold <level>', 'Notification threshold: minor|moderate|major|critical', 'moderate')
|
|
62
|
+
.option('--cron <expr>', 'Optional cron expression (advanced)')
|
|
63
|
+
.option('--selector <css>', 'CSS selector to scope monitoring')
|
|
64
|
+
.action(async (url, opts) => {
|
|
65
|
+
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
66
|
+
try {
|
|
67
|
+
const res = await tool.execute({
|
|
68
|
+
url,
|
|
69
|
+
operation: 'create_scheduled_monitor',
|
|
70
|
+
...(opts.selector ? { trackingOptions: { customSelectors: [opts.selector] } } : {}),
|
|
71
|
+
...(opts.webhook ? { notificationOptions: { webhook: { enabled: true, url: opts.webhook } } } : {}),
|
|
72
|
+
scheduledMonitorOptions: {
|
|
73
|
+
interval: Math.max(parseInt(opts.every, 10), 60) * 1000,
|
|
74
|
+
...(opts.goal ? { goal: opts.goal } : {}),
|
|
75
|
+
...(opts.cron ? { schedule: opts.cron } : {}),
|
|
76
|
+
notificationThreshold: opts.threshold
|
|
77
|
+
}
|
|
78
|
+
});
|
|
79
|
+
emit(res);
|
|
80
|
+
process.exit(res.success ? 0 : 1);
|
|
81
|
+
} catch (err) {
|
|
82
|
+
process.stderr.write('Error: ' + err.message + '\n');
|
|
83
|
+
process.exit(1);
|
|
84
|
+
}
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
program
|
|
88
|
+
.command('monitor:list')
|
|
89
|
+
.description('List persisted scheduled monitors')
|
|
90
|
+
.action(async () => {
|
|
91
|
+
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
92
|
+
try {
|
|
93
|
+
emit(await tool.execute({ operation: 'list_scheduled_monitors' }));
|
|
94
|
+
process.exit(0);
|
|
95
|
+
} catch (err) {
|
|
96
|
+
process.stderr.write('Error: ' + err.message + '\n');
|
|
97
|
+
process.exit(1);
|
|
98
|
+
}
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
program
|
|
102
|
+
.command('monitor:stop <id>')
|
|
103
|
+
.description('Stop and remove a scheduled monitor by id')
|
|
104
|
+
.action(async (id) => {
|
|
105
|
+
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
106
|
+
try {
|
|
107
|
+
const res = await tool.execute({ operation: 'stop_scheduled_monitor', scheduledMonitorOptions: { monitorId: id } });
|
|
108
|
+
emit(res);
|
|
109
|
+
process.exit(res.success ? 0 : 1);
|
|
110
|
+
} catch (err) {
|
|
111
|
+
process.stderr.write('Error: ' + err.message + '\n');
|
|
112
|
+
process.exit(1);
|
|
113
|
+
}
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
program
|
|
117
|
+
.command('monitor:run-due')
|
|
118
|
+
.description('Fire every due scheduled monitor once and exit (wire into system cron for guaranteed firing)')
|
|
119
|
+
.action(async () => {
|
|
120
|
+
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
121
|
+
try {
|
|
122
|
+
const res = await tool.runDueOnce();
|
|
123
|
+
emit({ success: true, ...res });
|
|
124
|
+
process.exit(0);
|
|
125
|
+
} catch (err) {
|
|
126
|
+
process.stderr.write('Error: ' + err.message + '\n');
|
|
127
|
+
process.exit(1);
|
|
128
|
+
}
|
|
129
|
+
});
|
|
49
130
|
}
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* uninstall-skills command -- remove CrawlForge skill files.
|
|
3
3
|
*/
|
|
4
|
-
import { uninstall } from '../../skills/installer.js';
|
|
4
|
+
import { uninstall, uninstallHook } from '../../skills/installer.js';
|
|
5
5
|
|
|
6
6
|
export function register(program) {
|
|
7
7
|
program
|
|
8
8
|
.command('uninstall-skills')
|
|
9
9
|
.description('Remove CrawlForge skill files from Claude Code, Cursor, or VS Code')
|
|
10
10
|
.option('--target <target>', 'Target: claude-code, cursor, vscode, or all', 'all')
|
|
11
|
+
.option('--remove-hook', 'Also remove the opt-in UserPromptSubmit reminder hook')
|
|
11
12
|
.action(async (opts) => {
|
|
12
13
|
try {
|
|
13
14
|
const results = await uninstall({
|
|
@@ -15,6 +16,14 @@ export function register(program) {
|
|
|
15
16
|
cwd: process.cwd()
|
|
16
17
|
});
|
|
17
18
|
|
|
19
|
+
if (opts.removeHook) {
|
|
20
|
+
const hook = uninstallHook();
|
|
21
|
+
process.stdout.write(
|
|
22
|
+
(hook.removed ? 'Removed forced-eval hook: ' : 'No forced-eval hook found: ') +
|
|
23
|
+
hook.path + '\n'
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
|
|
18
27
|
if (results.removed.length > 0) {
|
|
19
28
|
process.stdout.write('Removed:\n');
|
|
20
29
|
results.removed.forEach(p => process.stdout.write(' ' + p + '\n'));
|
|
@@ -6,6 +6,12 @@
|
|
|
6
6
|
import { z } from 'zod';
|
|
7
7
|
import BrowserProcessor from './processing/BrowserProcessor.js';
|
|
8
8
|
import { EventEmitter } from 'events';
|
|
9
|
+
import { createHash } from 'node:crypto';
|
|
10
|
+
|
|
11
|
+
// executeJavaScript hardening limits (only relevant when the deploy-time flag
|
|
12
|
+
// ALLOW_JAVASCRIPT_EXECUTION=true is set; JS execution stays off by default).
|
|
13
|
+
const JS_MAX_SCRIPT_LENGTH = parseInt(process.env.JS_MAX_SCRIPT_LENGTH || '10000', 10);
|
|
14
|
+
const JS_EXECUTION_TIMEOUT_MS = parseInt(process.env.JS_EXECUTION_TIMEOUT_MS || '5000', 10);
|
|
9
15
|
|
|
10
16
|
// Action schemas
|
|
11
17
|
const BaseActionSchema = z.object({
|
|
@@ -715,17 +721,53 @@ export class ActionExecutor extends EventEmitter {
|
|
|
715
721
|
);
|
|
716
722
|
}
|
|
717
723
|
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
+
const script = typeof action.script === 'string' ? action.script : '';
|
|
725
|
+
const args = Array.isArray(action.args) ? action.args : [];
|
|
726
|
+
|
|
727
|
+
// Defense-in-depth: bound script size before evaluating.
|
|
728
|
+
if (script.length > JS_MAX_SCRIPT_LENGTH) {
|
|
729
|
+
throw new Error(
|
|
730
|
+
`JavaScript execution rejected: script length ${script.length} exceeds limit of ${JS_MAX_SCRIPT_LENGTH} ` +
|
|
731
|
+
`(set JS_MAX_SCRIPT_LENGTH to raise it).`
|
|
732
|
+
);
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
// Structured audit log to stderr (stdout is reserved for the MCP JSON-RPC stream).
|
|
736
|
+
const scriptHash = createHash('sha256').update(script).digest('hex').slice(0, 16);
|
|
737
|
+
let targetUrl = 'unknown';
|
|
738
|
+
try { targetUrl = page.url(); } catch { /* page may be closed */ }
|
|
739
|
+
console.warn(
|
|
740
|
+
'[security] executeJavaScript ' + JSON.stringify({
|
|
741
|
+
ts: new Date().toISOString(),
|
|
742
|
+
url: targetUrl,
|
|
743
|
+
scriptSha256: scriptHash,
|
|
744
|
+
scriptLength: script.length,
|
|
745
|
+
argCount: args.length
|
|
746
|
+
})
|
|
724
747
|
);
|
|
725
|
-
|
|
748
|
+
|
|
749
|
+
// Bound execution time independent of the generic per-action timeout.
|
|
750
|
+
let timer;
|
|
751
|
+
const timeout = new Promise((_, reject) => {
|
|
752
|
+
timer = setTimeout(
|
|
753
|
+
() => reject(new Error(`JavaScript execution timed out after ${JS_EXECUTION_TIMEOUT_MS}ms`)),
|
|
754
|
+
JS_EXECUTION_TIMEOUT_MS
|
|
755
|
+
);
|
|
756
|
+
});
|
|
757
|
+
|
|
758
|
+
let result;
|
|
759
|
+
try {
|
|
760
|
+
result = await Promise.race([
|
|
761
|
+
page.evaluate(new Function('...args', script), ...args),
|
|
762
|
+
timeout
|
|
763
|
+
]);
|
|
764
|
+
} finally {
|
|
765
|
+
clearTimeout(timer);
|
|
766
|
+
}
|
|
767
|
+
|
|
726
768
|
return {
|
|
727
|
-
script
|
|
728
|
-
args
|
|
769
|
+
script,
|
|
770
|
+
args,
|
|
729
771
|
result: action.returnResult ? result : undefined
|
|
730
772
|
};
|
|
731
773
|
}
|
|
@@ -24,7 +24,16 @@ export class ElicitationHelper {
|
|
|
24
24
|
* @returns {boolean}
|
|
25
25
|
*/
|
|
26
26
|
get supported() {
|
|
27
|
-
|
|
27
|
+
const server = this._mcpServer?.server;
|
|
28
|
+
// The MCP SDK exposes elicitation via Server.elicitInput(); it is only
|
|
29
|
+
// usable when the connected CLIENT advertised the `elicitation` capability.
|
|
30
|
+
if (typeof server?.elicitInput !== 'function') return false;
|
|
31
|
+
try {
|
|
32
|
+
const caps = server.getClientCapabilities?.();
|
|
33
|
+
return !!caps?.elicitation;
|
|
34
|
+
} catch {
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
28
37
|
}
|
|
29
38
|
|
|
30
39
|
/**
|
|
@@ -48,7 +57,7 @@ export class ElicitationHelper {
|
|
|
48
57
|
.join('\n');
|
|
49
58
|
const fullMessage = detailLines ? `${message}\n\n${detailLines}` : message;
|
|
50
59
|
|
|
51
|
-
const result = await this._mcpServer.server.
|
|
60
|
+
const result = await this._mcpServer.server.elicitInput({
|
|
52
61
|
message: fullMessage,
|
|
53
62
|
requestedSchema: {
|
|
54
63
|
type: 'object',
|
|
@@ -63,7 +72,8 @@ export class ElicitationHelper {
|
|
|
63
72
|
},
|
|
64
73
|
});
|
|
65
74
|
|
|
66
|
-
|
|
75
|
+
// Only an explicit accept + confirmed=true proceeds; decline/cancel = stop.
|
|
76
|
+
return result?.action === 'accept' && result?.content?.confirmed === true;
|
|
67
77
|
} catch (err) {
|
|
68
78
|
this._logger.warn('Elicitation request failed — proceeding without confirmation', { error: err.message });
|
|
69
79
|
return true; // fail-open
|
|
@@ -87,7 +97,7 @@ export class ElicitationHelper {
|
|
|
87
97
|
}
|
|
88
98
|
|
|
89
99
|
try {
|
|
90
|
-
const result = await this._mcpServer.server.
|
|
100
|
+
const result = await this._mcpServer.server.elicitInput({
|
|
91
101
|
message,
|
|
92
102
|
requestedSchema: {
|
|
93
103
|
type: 'object',
|
|
@@ -103,7 +113,10 @@ export class ElicitationHelper {
|
|
|
103
113
|
},
|
|
104
114
|
});
|
|
105
115
|
|
|
106
|
-
|
|
116
|
+
if (result?.action === 'accept' && result?.content?.[fieldName] != null) {
|
|
117
|
+
return result.content[fieldName];
|
|
118
|
+
}
|
|
119
|
+
return defaultValue || null;
|
|
107
120
|
} catch (err) {
|
|
108
121
|
this._logger.warn('Elicitation request failed', { error: err.message });
|
|
109
122
|
return defaultValue || null;
|
|
@@ -4,6 +4,7 @@ import { MapSiteTool } from '../tools/crawl/mapSite.js';
|
|
|
4
4
|
import { CrawlDeepTool } from '../tools/crawl/crawlDeep.js';
|
|
5
5
|
import { normalizeUrl, getBaseUrl } from '../utils/urlNormalizer.js';
|
|
6
6
|
import { Logger } from '../utils/Logger.js';
|
|
7
|
+
import { safeFetch } from '../utils/ssrfGuard.js';
|
|
7
8
|
|
|
8
9
|
const logger = new Logger('LLMsTxtAnalyzer');
|
|
9
10
|
|
|
@@ -442,7 +443,7 @@ export class LLMsTxtAnalyzer {
|
|
|
442
443
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
443
444
|
|
|
444
445
|
try {
|
|
445
|
-
const response = await
|
|
446
|
+
const response = await safeFetch(url, {
|
|
446
447
|
signal: controller.signal,
|
|
447
448
|
headers: {
|
|
448
449
|
'User-Agent': this.options.userAgent
|