crawlforge-mcp-server 4.7.2 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/CLAUDE.md +2 -2
  2. package/package.json +2 -1
  3. package/server.js +42 -9
  4. package/src/cli/commands/init.js +13 -2
  5. package/src/cli/commands/install-skills.js +10 -1
  6. package/src/cli/commands/monitor.js +81 -0
  7. package/src/cli/commands/uninstall-skills.js +10 -1
  8. package/src/core/ActionExecutor.js +51 -9
  9. package/src/core/ElicitationHelper.js +18 -5
  10. package/src/core/LLMsTxtAnalyzer.js +2 -1
  11. package/src/core/MonitorScheduler.js +281 -0
  12. package/src/core/MonitorStore.js +79 -0
  13. package/src/core/ResearchOrchestrator.js +2 -1
  14. package/src/core/crawlers/BFSCrawler.js +2 -1
  15. package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
  16. package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
  17. package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
  18. package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
  19. package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
  20. package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
  21. package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
  22. package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
  23. package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
  24. package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
  25. package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
  26. package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
  27. package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
  28. package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
  29. package/src/skills/installer.js +186 -34
  30. package/src/tools/advanced/batchScrape/worker.js +8 -2
  31. package/src/tools/basic/_fetch.js +14 -1
  32. package/src/tools/crawl/_sessionContext.js +3 -1
  33. package/src/tools/extract/_fetchAndParse.js +2 -1
  34. package/src/tools/extract/extractContent.js +2 -1
  35. package/src/tools/extract/processDocument.js +2 -1
  36. package/src/tools/scrape/_brandingExtractor.js +378 -0
  37. package/src/tools/scrape/unifiedScrape.js +66 -6
  38. package/src/tools/templates/ScrapeTemplateTool.js +2 -1
  39. package/src/tools/tracking/trackChanges/differ.js +3 -1
  40. package/src/tools/tracking/trackChanges/index.js +74 -21
  41. package/src/tools/tracking/trackChanges/schema.js +7 -2
  42. package/src/utils/hostRateLimiter.js +46 -0
  43. package/src/utils/robotsChecker.js +2 -1
  44. package/src/utils/sitemapParser.js +2 -1
  45. package/src/utils/ssrfGuard.js +161 -0
  46. package/src/utils/ssrfProtection.js +6 -9
  47. package/src/skills/crawlforge-cli.md +0 -157
  48. package/src/skills/crawlforge-mcp.md +0 -80
  49. package/src/skills/crawlforge-research.md +0 -104
  50. package/src/skills/crawlforge-stealth.md +0 -98
@@ -1,16 +1,32 @@
1
1
  /**
2
2
  * installer.js — Skills installer for CrawlForge.
3
- * Installs skill markdown files into Claude Code, Cursor, or VS Code.
3
+ *
4
+ * Installs CrawlForge's Claude Agent Skills into Claude Code, Cursor, or VS Code.
4
5
  *
5
6
  * Targets:
6
- * claude-code — ~/.claude/skills/crawlforge-*.md (one file per skill)
7
- * cursor — .cursor/rules/crawlforge.mdc (concatenated)
7
+ * claude-code — ~/.claude/skills/<skill-name>/SKILL.md (real Agent Skill
8
+ * folders with YAML frontmatter, so they auto-activate)
9
+ * cursor — .cursor/rules/crawlforge.mdc (concatenated bodies)
8
10
  * vscode — .github/instructions/crawlforge.instructions.md (concatenated)
9
11
  *
10
- * Idempotent: skips if already installed (use --force to overwrite).
12
+ * Source of truth: src/skills/agent-skills/<skill-name>/SKILL.md (+ references/).
13
+ *
14
+ * Idempotent: skips if already installed (use --force to overwrite). Installing /
15
+ * uninstalling also removes any leftover bare crawlforge-*.md files written by
16
+ * pre-4.8.0 versions (migration), without touching unrelated user skills.
11
17
  */
12
18
 
13
- import { readFileSync, writeFileSync, mkdirSync, existsSync, unlinkSync } from 'node:fs';
19
+ import {
20
+ readFileSync,
21
+ writeFileSync,
22
+ mkdirSync,
23
+ existsSync,
24
+ unlinkSync,
25
+ readdirSync,
26
+ statSync,
27
+ cpSync,
28
+ rmSync,
29
+ } from 'node:fs';
14
30
  import { join, dirname } from 'node:path';
15
31
  import { homedir } from 'node:os';
16
32
  import { fileURLToPath } from 'node:url';
@@ -18,51 +34,123 @@ import { fileURLToPath } from 'node:url';
18
34
  const __filename = fileURLToPath(import.meta.url);
19
35
  const __dirname = dirname(__filename);
20
36
 
21
- // Skill files shipped with the package
22
- const SKILL_FILES = [
37
+ // New source of truth: one folder per skill, each containing a SKILL.md.
38
+ const AGENT_SKILLS_DIR = join(__dirname, 'agent-skills');
39
+
40
+ // Pre-4.8.0 bare files that may linger in ~/.claude/skills (migration cleanup).
41
+ const LEGACY_SKILL_FILES = [
23
42
  'crawlforge-mcp.md',
24
43
  'crawlforge-cli.md',
25
44
  'crawlforge-stealth.md',
26
- 'crawlforge-research.md'
45
+ 'crawlforge-research.md',
27
46
  ];
28
47
 
29
- const SKILL_DIR = __dirname; // src/skills/
48
/**
 * Discover the Agent Skills shipped with the package.
 *
 * A skill is a direct sub-directory of AGENT_SKILLS_DIR that contains a
 * SKILL.md entry. Entries whose name starts with "_" or "." are ignored.
 *
 * The result is sorted by name: readdirSync() does not document any ordering,
 * and this list drives both the install order and the order in which skill
 * bodies are concatenated, so it must not depend on the filesystem.
 *
 * @returns {{ name: string, srcDir: string, skillMd: string }[]}
 * @throws {Error} If AGENT_SKILLS_DIR does not exist.
 */
export function listAgentSkills() {
  if (!existsSync(AGENT_SKILLS_DIR)) {
    throw new Error(`Agent skills directory not found: ${AGENT_SKILLS_DIR}`);
  }

  const skills = [];
  for (const name of readdirSync(AGENT_SKILLS_DIR).sort()) {
    if (name.startsWith('_') || name.startsWith('.')) continue;

    const srcDir = join(AGENT_SKILLS_DIR, name);
    const skillMd = join(srcDir, 'SKILL.md');
    try {
      if (statSync(srcDir).isDirectory() && existsSync(skillMd)) {
        skills.push({ name, srcDir, skillMd });
      }
    } catch {
      // statSync failed (entry vanished or is unreadable): not an installable skill.
    }
  }
  return skills;
}
68
+
69
/**
 * Read a skill's SKILL.md and strip the leading YAML frontmatter block, leaving
 * just the trimmed markdown body (used for the concatenated cursor/vscode
 * outputs).
 *
 * The frontmatter is the block between a first line of `---` and the next line
 * consisting solely of `---`. Both LF and CRLF line endings are accepted and a
 * leading UTF-8 byte-order mark is ignored, so files saved by Windows editors
 * do not leak their frontmatter into the output. A file without frontmatter is
 * returned whole (trimmed).
 *
 * @param {string} skillMdPath Path of the SKILL.md file to read.
 * @returns {string} The markdown body with surrounding whitespace removed.
 * @throws {Error} Whatever readFileSync throws if the file cannot be read.
 */
export function readSkillBody(skillMdPath) {
  const raw = readFileSync(skillMdPath, 'utf8');
  // readFileSync('utf8') keeps a BOM, which would defeat the `^---` anchor.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  // The closing fence must be a whole line (`---` + optional blanks + EOL/EOF),
  // so a content line that merely starts with `---` does not end the block early.
  const frontmatter = text.match(/^---[ \t]*\r?\n[\s\S]*?\r?\n---[ \t]*(?:\r?\n|$)/);
  return (frontmatter ? text.slice(frontmatter[0].length) : text).trim();
}
30
80
 
31
- function readSkillFile(name) {
32
- const p = join(SKILL_DIR, name);
33
- if (!existsSync(p)) throw new Error(`Skill file not found: ${p}`);
34
- return readFileSync(p, 'utf8');
81
/**
 * Build one document out of every shipped skill body, in the order returned by
 * listAgentSkills(), with a markdown horizontal rule between consecutive
 * bodies. Used for the single-file cursor/vscode targets. The function name is
 * retained for backwards compatibility.
 *
 * @returns {string} The joined skill bodies ('' when no skills are found).
 */
export function concatenateSkills() {
  const bodies = [];
  for (const { skillMd } of listAgentSkills()) {
    bodies.push(readSkillBody(skillMd));
  }
  return bodies.join('\n\n---\n\n');
}
36
91
 
37
- function concatenateSkills() {
38
- return SKILL_FILES.map(f => readSkillFile(f)).join('\n\n---\n\n');
92
/**
 * Recursively copy one skill folder to its destination, overwriting any files
 * that already exist there.
 *
 * @param {string} srcDir Folder to copy from.
 * @param {string} destDir Folder to copy into.
 * @returns {void}
 */
function copySkillFolder(srcDir, destDir) {
  const copyOptions = { recursive: true, force: true };
  cpSync(srcDir, destDir, copyOptions);
}
95
+
96
/**
 * Delete the bare crawlforge-*.md skill files that pre-4.8.0 versions wrote
 * directly into a skills directory.
 *
 * Only the exact names listed in LEGACY_SKILL_FILES are considered, and only
 * when the path is a regular file; nothing is globbed, and directories or
 * unrelated skills are left alone. Cleanup is best-effort: a path that cannot
 * be inspected or deleted is skipped without raising.
 *
 * @param {string} skillsDir Directory to clean.
 * @returns {string[]} Paths that were actually deleted.
 */
export function cleanupLegacyClaudeSkills(skillsDir) {
  // True only when `candidate` was a regular file and has been unlinked.
  const deleteIfRegularFile = (candidate) => {
    try {
      if (!existsSync(candidate)) return false;
      if (!statSync(candidate).isFile()) return false;
      unlinkSync(candidate);
      return true;
    } catch {
      /* ignore */
      return false;
    }
  };

  const removedPaths = [];
  for (const legacyName of LEGACY_SKILL_FILES) {
    const candidate = join(skillsDir, legacyName);
    if (deleteIfRegularFile(candidate)) {
      removedPaths.push(candidate);
    }
  }
  return removedPaths;
}
40
118
 
41
119
  /**
42
120
  * Install skills into the given target.
43
- * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', force?: boolean, dryRun?: boolean, cwd?: string }} opts
121
+ * @param {{ target?: 'claude-code'|'cursor'|'vscode'|'all', force?: boolean, dryRun?: boolean, cwd?: string, homeDir?: string }} opts
44
122
  * @returns {{ installed: string[], skipped: string[], paths: string[] }}
45
123
  */
46
- export async function install({ target = 'all', force = false, dryRun = false, cwd = process.cwd() } = {}) {
124
+ export async function install({
125
+ target = 'all',
126
+ force = false,
127
+ dryRun = false,
128
+ cwd = process.cwd(),
129
+ homeDir = homedir(),
130
+ } = {}) {
47
131
  const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
48
132
  const results = { installed: [], skipped: [], paths: [] };
133
+ const skills = listAgentSkills();
49
134
 
50
135
  for (const t of targets) {
51
136
  if (t === 'claude-code') {
52
- const skillsDir = join(homedir(), '.claude', 'skills');
53
- for (const fname of SKILL_FILES) {
54
- const dest = join(skillsDir, fname);
55
- results.paths.push(dest);
137
+ const skillsDir = join(homeDir, '.claude', 'skills');
138
+ for (const skill of skills) {
139
+ const destDir = join(skillsDir, skill.name);
140
+ const destSkillMd = join(destDir, 'SKILL.md');
141
+ results.paths.push(destSkillMd);
56
142
  if (!dryRun) {
57
- if (existsSync(dest) && !force) {
58
- results.skipped.push(dest);
143
+ if (existsSync(destSkillMd) && !force) {
144
+ results.skipped.push(destSkillMd);
59
145
  continue;
60
146
  }
61
147
  mkdirSync(skillsDir, { recursive: true });
62
- writeFileSync(dest, readSkillFile(fname), 'utf8');
148
+ copySkillFolder(skill.srcDir, destDir);
63
149
  }
64
- results.installed.push(dest);
150
+ results.installed.push(destSkillMd);
65
151
  }
152
+ // Migration: remove stale bare files from older versions.
153
+ if (!dryRun) cleanupLegacyClaudeSkills(skillsDir);
66
154
  } else if (t === 'cursor') {
67
155
  const dir = join(cwd, '.cursor', 'rules');
68
156
  const dest = join(dir, 'crawlforge.mdc');
@@ -99,25 +187,27 @@ export async function install({ target = 'all', force = false, dryRun = false, c
99
187
 
100
188
  /**
101
189
  * Uninstall skills from the given target.
102
- * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', cwd?: string }} opts
190
+ * @param {{ target?: 'claude-code'|'cursor'|'vscode'|'all', cwd?: string, homeDir?: string }} opts
103
191
  * @returns {{ removed: string[], notFound: string[] }}
104
192
  */
105
- export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
193
+ export async function uninstall({ target = 'all', cwd = process.cwd(), homeDir = homedir() } = {}) {
106
194
  const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
107
195
  const results = { removed: [], notFound: [] };
196
+ const skills = listAgentSkills();
108
197
 
109
198
  for (const t of targets) {
110
199
  if (t === 'claude-code') {
111
- const skillsDir = join(homedir(), '.claude', 'skills');
112
- for (const fname of SKILL_FILES) {
113
- const dest = join(skillsDir, fname);
114
- if (existsSync(dest)) {
115
- unlinkSync(dest);
116
- results.removed.push(dest);
200
+ const skillsDir = join(homeDir, '.claude', 'skills');
201
+ for (const skill of skills) {
202
+ const destDir = join(skillsDir, skill.name);
203
+ if (existsSync(destDir)) {
204
+ rmSync(destDir, { recursive: true, force: true });
205
+ results.removed.push(destDir);
117
206
  } else {
118
- results.notFound.push(dest);
207
+ results.notFound.push(destDir);
119
208
  }
120
209
  }
210
+ results.removed.push(...cleanupLegacyClaudeSkills(skillsDir));
121
211
  } else if (t === 'cursor') {
122
212
  const dest = join(cwd, '.cursor', 'rules', 'crawlforge.mdc');
123
213
  if (existsSync(dest)) {
@@ -139,3 +229,65 @@ export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
139
229
 
140
230
  return results;
141
231
  }
232
+
233
+ // --- Optional, opt-in forced-eval hook (boosts skill auto-activation) ---
234
+
235
+ const HOOK_MARKER = 'CrawlForge skill';
236
+ const HOOK_COMMAND =
237
+ "echo 'Consider whether a CrawlForge skill applies: web scraping, deep research, " +
238
+ "stealth browsing, structured extraction, change tracking, or batch automation.'";
239
+
240
/**
 * Add a UserPromptSubmit forced-eval reminder to <homeDir>/.claude/settings.json.
 *
 * Additive, idempotent merge: every existing setting is kept and the hook is
 * appended only when no UserPromptSubmit entry already contains HOOK_MARKER.
 * Opt-in (CLI --with-hook).
 *
 * The file is never overwritten when its current content cannot be merged
 * safely: if it exists but cannot be read or parsed as JSON (e.g. it
 * contains comments), if its root is not a JSON object, or if `hooks` is
 * present but not an object, nothing is written and the result carries
 * `added: false` plus `reason: 'invalid-settings'`. A missing or empty file is
 * treated as empty settings.
 *
 * @param {{ homeDir?: string }} opts
 * @returns {{ added: boolean, path: string, reason?: string }}
 * @throws {Error} Whatever mkdirSync / writeFileSync throw if the file cannot be written.
 */
export function installHook({ homeDir = homedir() } = {}) {
  const dir = join(homeDir, '.claude');
  const path = join(dir, 'settings.json');
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  let settings = {};
  if (existsSync(path)) {
    try {
      const raw = readFileSync(path, 'utf8');
      // An empty file holds nothing to preserve; start from empty settings.
      if (raw.trim() !== '') settings = JSON.parse(raw);
    } catch {
      // Falling back to {} here would make the write below wipe the user's file.
      return { added: false, path, reason: 'invalid-settings' };
    }
    if (!isPlainObject(settings)) {
      return { added: false, path, reason: 'invalid-settings' };
    }
  }

  if (settings.hooks == null) {
    settings.hooks = {};
  } else if (!isPlainObject(settings.hooks)) {
    return { added: false, path, reason: 'invalid-settings' };
  }

  const list = Array.isArray(settings.hooks.UserPromptSubmit)
    ? settings.hooks.UserPromptSubmit
    : [];
  // Marker search over the serialized list finds the hook however it is nested.
  const already = JSON.stringify(list).includes(HOOK_MARKER);
  if (already) return { added: false, path };

  list.push({ hooks: [{ type: 'command', command: HOOK_COMMAND }] });
  settings.hooks.UserPromptSubmit = list;
  mkdirSync(dir, { recursive: true });
  writeFileSync(path, JSON.stringify(settings, null, 2) + '\n', 'utf8');
  return { added: true, path };
}
271
+
272
/**
 * Remove the CrawlForge forced-eval hook from <homeDir>/.claude/settings.json.
 *
 * Every UserPromptSubmit entry whose serialized form contains HOOK_MARKER is
 * dropped; all other settings are written back unchanged. Nothing is written
 * when the file is missing, cannot be parsed, has no UserPromptSubmit array,
 * or contains no matching entry.
 *
 * @param {{ homeDir?: string }} opts
 * @returns {{ removed: boolean, path: string }}
 */
export function uninstallHook({ homeDir = homedir() } = {}) {
  const path = join(homeDir, '.claude', 'settings.json');
  const untouched = () => ({ removed: false, path });

  if (!existsSync(path)) {
    return untouched();
  }

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(path, 'utf8'));
  } catch {
    return untouched();
  }

  const entries = parsed?.hooks?.UserPromptSubmit;
  if (!Array.isArray(entries)) {
    return untouched();
  }

  const kept = [];
  for (const entry of entries) {
    const isOurs = JSON.stringify(entry).includes(HOOK_MARKER);
    if (!isOurs) kept.push(entry);
  }
  if (kept.length === entries.length) {
    return untouched();
  }

  parsed.hooks.UserPromptSubmit = kept;
  const serialized = JSON.stringify(parsed, null, 2) + '\n';
  writeFileSync(path, serialized, 'utf8');
  return { removed: true, path };
}
@@ -5,25 +5,31 @@
5
5
  */
6
6
 
7
7
  import { load } from 'cheerio';
8
+ import { ssrfGuard, isSsrfError } from '../../../utils/ssrfGuard.js';
9
+ import { throttleHost } from '../../../utils/hostRateLimiter.js';
8
10
 
9
11
  const USER_AGENT = 'MCP-WebScraper-BatchTool/1.0.0';
10
12
 
11
13
  /**
12
- * Fetch a URL with AbortController timeout.
14
+ * Fetch a URL with AbortController timeout (SSRF-guarded + per-host throttled).
13
15
  */
14
16
  export async function fetchUrl(url, options = {}) {
15
17
  const { timeout = 15000, headers = {} } = options;
18
+ const guard = ssrfGuard(url); // SSRF pre-flight (throws before connecting)
19
+ await throttleHost(url);
16
20
  const controller = new AbortController();
17
21
  const timeoutId = setTimeout(() => controller.abort(), timeout);
18
22
  try {
19
23
  const response = await fetch(url, {
20
24
  signal: controller.signal,
21
- headers: { 'User-Agent': USER_AGENT, ...headers }
25
+ headers: { 'User-Agent': USER_AGENT, ...headers },
26
+ ...guard
22
27
  });
23
28
  clearTimeout(timeoutId);
24
29
  return response;
25
30
  } catch (error) {
26
31
  clearTimeout(timeoutId);
32
+ if (isSsrfError(error)) throw new Error(error.cause?.message || error.message);
27
33
  if (error.name === 'AbortError') throw new Error(`Request timeout after ${timeout}ms`);
28
34
  throw error;
29
35
  }
@@ -5,6 +5,8 @@
5
5
 
6
6
  import { config } from '../../constants/config.js';
7
7
  import { createRequire } from 'module';
8
+ import { ssrfGuard, isSsrfError } from '../../utils/ssrfGuard.js';
9
+ import { throttleHost } from '../../utils/hostRateLimiter.js';
8
10
 
9
11
  // Derive User-Agent from package version so it reflects the actual release.
10
12
  const _require = createRequire(import.meta.url);
@@ -27,6 +29,13 @@ export async function fetchWithTimeout(url, options = {}) {
27
29
  const { timeout = 10000, headers = {} } = options;
28
30
  const maxBodySize = config.fetch.maxBodySize;
29
31
 
32
+ // SSRF pre-flight (protocol / metadata host). Throws a clear error before any
33
+ // connection is attempted; `guard.dispatcher` enforces IP rules at connect time.
34
+ const guard = ssrfGuard(url);
35
+
36
+ // Per-host politeness throttle (before the timeout window starts).
37
+ await throttleHost(url);
38
+
30
39
  const controller = new AbortController();
31
40
  const timeoutId = setTimeout(() => controller.abort(), timeout);
32
41
 
@@ -37,11 +46,15 @@ export async function fetchWithTimeout(url, options = {}) {
37
46
  headers: {
38
47
  'User-Agent': CRAWLFORGE_UA,
39
48
  ...headers
40
- }
49
+ },
50
+ ...guard
41
51
  });
42
52
  clearTimeout(timeoutId);
43
53
  } catch (error) {
44
54
  clearTimeout(timeoutId);
55
+ if (isSsrfError(error)) {
56
+ throw new Error(error.cause?.message || error.message);
57
+ }
45
58
  if (error.name === 'AbortError') {
46
59
  throw new Error(`Request timeout after ${timeout}ms`);
47
60
  }
@@ -15,6 +15,8 @@
15
15
  * - Keeping zero new runtime deps satisfies the project constraint.
16
16
  */
17
17
 
18
+ import { safeFetch } from '../../utils/ssrfGuard.js';
19
+
18
20
  /**
19
21
  * Parse a single Set-Cookie header value into a cookie object.
20
22
  * Returns null if the header is empty or unparseable.
@@ -220,7 +222,7 @@ export class SessionContext {
220
222
  fetchOpts.body = body;
221
223
  }
222
224
 
223
- const response = await fetch(url, fetchOpts);
225
+ const response = await safeFetch(url, fetchOpts);
224
226
  this.recordCookies(response, url);
225
227
 
226
228
  const text = await response.text().catch(() => '');
@@ -11,6 +11,7 @@
11
11
  */
12
12
 
13
13
  import { load } from 'cheerio';
14
+ import { safeFetch } from '../../utils/ssrfGuard.js';
14
15
 
15
16
  const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0)';
16
17
  const DEFAULT_TIMEOUT_MS = 15000;
@@ -32,7 +33,7 @@ export async function fetchAndParse(url, options = {}) {
32
33
  stripTags = ['script', 'style', 'noscript', 'iframe', 'svg']
33
34
  } = options;
34
35
 
35
- const response = await fetch(url, {
36
+ const response = await safeFetch(url, {
36
37
  headers: {
37
38
  'User-Agent': userAgent,
38
39
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
@@ -8,6 +8,7 @@ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
8
8
  import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
9
9
  import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
10
10
  import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
11
+ import { safeFetch } from '../../utils/ssrfGuard.js';
11
12
 
12
13
  const ExtractContentSchema = z.object({
13
14
  url: z.string().url(),
@@ -169,7 +170,7 @@ export class ExtractContentTool {
169
170
  pageTitle = browserResult.title;
170
171
  } else {
171
172
  // Simple HTTP fetch
172
- const response = await fetch(url, {
173
+ const response = await safeFetch(url, {
173
174
  headers: {
174
175
  'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
175
176
  },
@@ -9,6 +9,7 @@ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
9
9
  import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
10
10
  import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
11
11
  import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
12
+ import { safeFetch } from '../../utils/ssrfGuard.js';
12
13
 
13
14
  const ProcessDocumentSchema = z.object({
14
15
  source: z.string().min(1),
@@ -275,7 +276,7 @@ export class ProcessDocumentTool {
275
276
  pageTitle = browserResult.title;
276
277
  } else {
277
278
  // Simple HTTP fetch
278
- const response = await fetch(source, {
279
+ const response = await safeFetch(source, {
279
280
  headers: {
280
281
  'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Document-Processor)'
281
282
  },