crawlforge-mcp-server 4.7.2 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/package.json +2 -1
- package/server.js +42 -9
- package/src/cli/commands/init.js +13 -2
- package/src/cli/commands/install-skills.js +10 -1
- package/src/cli/commands/monitor.js +81 -0
- package/src/cli/commands/uninstall-skills.js +10 -1
- package/src/core/ActionExecutor.js +51 -9
- package/src/core/ElicitationHelper.js +18 -5
- package/src/core/LLMsTxtAnalyzer.js +2 -1
- package/src/core/MonitorScheduler.js +281 -0
- package/src/core/MonitorStore.js +79 -0
- package/src/core/ResearchOrchestrator.js +2 -1
- package/src/core/crawlers/BFSCrawler.js +2 -1
- package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
- package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
- package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
- package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
- package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
- package/src/skills/installer.js +186 -34
- package/src/tools/advanced/batchScrape/worker.js +8 -2
- package/src/tools/basic/_fetch.js +14 -1
- package/src/tools/crawl/_sessionContext.js +3 -1
- package/src/tools/extract/_fetchAndParse.js +2 -1
- package/src/tools/extract/extractContent.js +2 -1
- package/src/tools/extract/processDocument.js +2 -1
- package/src/tools/scrape/_brandingExtractor.js +378 -0
- package/src/tools/scrape/unifiedScrape.js +66 -6
- package/src/tools/templates/ScrapeTemplateTool.js +2 -1
- package/src/tools/tracking/trackChanges/differ.js +3 -1
- package/src/tools/tracking/trackChanges/index.js +74 -21
- package/src/tools/tracking/trackChanges/schema.js +7 -2
- package/src/utils/hostRateLimiter.js +46 -0
- package/src/utils/robotsChecker.js +2 -1
- package/src/utils/sitemapParser.js +2 -1
- package/src/utils/ssrfGuard.js +161 -0
- package/src/utils/ssrfProtection.js +6 -9
- package/src/skills/crawlforge-cli.md +0 -157
- package/src/skills/crawlforge-mcp.md +0 -80
- package/src/skills/crawlforge-research.md +0 -104
- package/src/skills/crawlforge-stealth.md +0 -98
package/src/skills/installer.js
CHANGED
|
@@ -1,16 +1,32 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* installer.js — Skills installer for CrawlForge.
|
|
3
|
-
*
|
|
3
|
+
*
|
|
4
|
+
* Installs CrawlForge's Claude Agent Skills into Claude Code, Cursor, or VS Code.
|
|
4
5
|
*
|
|
5
6
|
* Targets:
|
|
6
|
-
* claude-code — ~/.claude/skills
|
|
7
|
-
*
|
|
7
|
+
* claude-code — ~/.claude/skills/<skill-name>/SKILL.md (real Agent Skill
|
|
8
|
+
* folders with YAML frontmatter, so they auto-activate)
|
|
9
|
+
* cursor — .cursor/rules/crawlforge.mdc (concatenated bodies)
|
|
8
10
|
* vscode — .github/instructions/crawlforge.instructions.md (concatenated)
|
|
9
11
|
*
|
|
10
|
-
*
|
|
12
|
+
* Source of truth: src/skills/agent-skills/<skill-name>/SKILL.md (+ references/).
|
|
13
|
+
*
|
|
14
|
+
* Idempotent: skips if already installed (use --force to overwrite). Installing /
|
|
15
|
+
* uninstalling also removes any leftover bare crawlforge-*.md files written by
|
|
16
|
+
* pre-4.8.0 versions (migration), without touching unrelated user skills.
|
|
11
17
|
*/
|
|
12
18
|
|
|
13
|
-
import {
|
|
19
|
+
import {
|
|
20
|
+
readFileSync,
|
|
21
|
+
writeFileSync,
|
|
22
|
+
mkdirSync,
|
|
23
|
+
existsSync,
|
|
24
|
+
unlinkSync,
|
|
25
|
+
readdirSync,
|
|
26
|
+
statSync,
|
|
27
|
+
cpSync,
|
|
28
|
+
rmSync,
|
|
29
|
+
} from 'node:fs';
|
|
14
30
|
import { join, dirname } from 'node:path';
|
|
15
31
|
import { homedir } from 'node:os';
|
|
16
32
|
import { fileURLToPath } from 'node:url';
|
|
@@ -18,51 +34,123 @@ import { fileURLToPath } from 'node:url';
|
|
|
18
34
|
const __filename = fileURLToPath(import.meta.url);
|
|
19
35
|
const __dirname = dirname(__filename);
|
|
20
36
|
|
|
21
|
-
//
|
|
22
|
-
const
|
|
37
|
+
// New source of truth: one folder per skill, each containing a SKILL.md.
|
|
38
|
+
const AGENT_SKILLS_DIR = join(__dirname, 'agent-skills');
|
|
39
|
+
|
|
40
|
+
// Pre-4.8.0 bare files that may linger in ~/.claude/skills (migration cleanup).
|
|
41
|
+
const LEGACY_SKILL_FILES = [
|
|
23
42
|
'crawlforge-mcp.md',
|
|
24
43
|
'crawlforge-cli.md',
|
|
25
44
|
'crawlforge-stealth.md',
|
|
26
|
-
'crawlforge-research.md'
|
|
45
|
+
'crawlforge-research.md',
|
|
27
46
|
];
|
|
28
47
|
|
|
29
|
-
|
|
48
|
+
/**
 * Discover the Agent Skills shipped under AGENT_SKILLS_DIR.
 *
 * A directory entry counts as a skill when its name does not start with `_`
 * or `.`, it is a directory, and it contains a `SKILL.md` file. Entries are
 * visited in sorted name order so the result does not depend on the
 * platform-specific order in which `readdirSync` happens to return them.
 *
 * @returns {{ name: string, srcDir: string, skillMd: string }[]}
 * @throws {Error} If AGENT_SKILLS_DIR does not exist.
 */
export function listAgentSkills() {
  if (!existsSync(AGENT_SKILLS_DIR)) {
    throw new Error(`Agent skills directory not found: ${AGENT_SKILLS_DIR}`);
  }

  const skills = [];
  for (const name of readdirSync(AGENT_SKILLS_DIR).sort()) {
    if (name.startsWith('_') || name.startsWith('.')) continue;

    const srcDir = join(AGENT_SKILLS_DIR, name);
    const skillMd = join(srcDir, 'SKILL.md');
    try {
      if (statSync(srcDir).isDirectory() && existsSync(skillMd)) {
        skills.push({ name, srcDir, skillMd });
      }
    } catch {
      // statSync failed (entry vanished or is unreadable): not a usable skill.
    }
  }
  return skills;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Read a skill's SKILL.md and strip the leading YAML frontmatter block, leaving
 * just the markdown body (used for the concatenated cursor/vscode outputs).
 *
 * The frontmatter is recognised when the file starts (after an optional UTF-8
 * BOM) with a `---` line and a later line consists solely of `---`. Both LF and
 * CRLF line endings are accepted, so files checked out on Windows are handled
 * the same way as LF files. When no frontmatter is found the whole file is
 * returned. The result is always trimmed.
 *
 * @param {string} skillMdPath
 * @returns {string}
 */
export function readSkillBody(skillMdPath) {
  // A BOM in front of the opening `---` would otherwise defeat the `^` anchor.
  const raw = readFileSync(skillMdPath, 'utf8').replace(/^\uFEFF/, '');
  // Opening `---` line, lazily-matched frontmatter lines, then a closing line
  // that is exactly `---` (optionally followed by spaces/tabs) up to EOL/EOF.
  const frontmatter = raw.match(/^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/);
  const body = frontmatter ? raw.slice(frontmatter[0].length) : raw;
  return body.trim();
}
|
|
30
80
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
81
|
+
/**
 * Build one markdown document out of every shipped skill: each skill's body
 * (frontmatter already stripped by readSkillBody) in the order reported by
 * listAgentSkills, separated by a horizontal-rule line. The function name is
 * retained for backwards compatibility.
 *
 * @returns {string}
 */
export function concatenateSkills() {
  const bodies = [];
  for (const skill of listAgentSkills()) {
    bodies.push(readSkillBody(skill.skillMd));
  }
  return bodies.join('\n\n---\n\n');
}
|
|
36
91
|
|
|
37
|
-
function
|
|
38
|
-
|
|
92
|
+
/**
 * Recursively copy one skill folder to its destination, overwriting files
 * that already exist there.
 *
 * @param {string} srcDir
 * @param {string} destDir
 */
function copySkillFolder(srcDir, destDir) {
  const copyOptions = { recursive: true, force: true };
  cpSync(srcDir, destDir, copyOptions);
}
|
|
95
|
+
|
|
96
|
+
/**
 * Delete the bare crawlforge-*.md files that pre-4.8.0 releases wrote into a
 * skills directory. Only the names listed in LEGACY_SKILL_FILES are
 * considered, and only when they are regular files; nothing else in the
 * directory is inspected or removed. Failures on individual files are ignored.
 *
 * @param {string} skillsDir
 * @returns {string[]} removed paths
 */
export function cleanupLegacyClaudeSkills(skillsDir) {
  // True only when `candidate` existed as a file and was actually unlinked.
  const deleteIfFile = (candidate) => {
    try {
      if (!existsSync(candidate)) return false;
      if (!statSync(candidate).isFile()) return false;
      unlinkSync(candidate);
      return true;
    } catch {
      /* ignore */
      return false;
    }
  };

  const removed = [];
  for (const fname of LEGACY_SKILL_FILES) {
    const candidate = join(skillsDir, fname);
    if (deleteIfFile(candidate)) {
      removed.push(candidate);
    }
  }
  return removed;
}
|
|
40
118
|
|
|
41
119
|
/**
|
|
42
120
|
* Install skills into the given target.
|
|
43
|
-
* @param {{ target
|
|
121
|
+
* @param {{ target?: 'claude-code'|'cursor'|'vscode'|'all', force?: boolean, dryRun?: boolean, cwd?: string, homeDir?: string }} opts
|
|
44
122
|
* @returns {{ installed: string[], skipped: string[], paths: string[] }}
|
|
45
123
|
*/
|
|
46
|
-
export async function install({
|
|
124
|
+
export async function install({
|
|
125
|
+
target = 'all',
|
|
126
|
+
force = false,
|
|
127
|
+
dryRun = false,
|
|
128
|
+
cwd = process.cwd(),
|
|
129
|
+
homeDir = homedir(),
|
|
130
|
+
} = {}) {
|
|
47
131
|
const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
|
|
48
132
|
const results = { installed: [], skipped: [], paths: [] };
|
|
133
|
+
const skills = listAgentSkills();
|
|
49
134
|
|
|
50
135
|
for (const t of targets) {
|
|
51
136
|
if (t === 'claude-code') {
|
|
52
|
-
const skillsDir = join(
|
|
53
|
-
for (const
|
|
54
|
-
const
|
|
55
|
-
|
|
137
|
+
const skillsDir = join(homeDir, '.claude', 'skills');
|
|
138
|
+
for (const skill of skills) {
|
|
139
|
+
const destDir = join(skillsDir, skill.name);
|
|
140
|
+
const destSkillMd = join(destDir, 'SKILL.md');
|
|
141
|
+
results.paths.push(destSkillMd);
|
|
56
142
|
if (!dryRun) {
|
|
57
|
-
if (existsSync(
|
|
58
|
-
results.skipped.push(
|
|
143
|
+
if (existsSync(destSkillMd) && !force) {
|
|
144
|
+
results.skipped.push(destSkillMd);
|
|
59
145
|
continue;
|
|
60
146
|
}
|
|
61
147
|
mkdirSync(skillsDir, { recursive: true });
|
|
62
|
-
|
|
148
|
+
copySkillFolder(skill.srcDir, destDir);
|
|
63
149
|
}
|
|
64
|
-
results.installed.push(
|
|
150
|
+
results.installed.push(destSkillMd);
|
|
65
151
|
}
|
|
152
|
+
// Migration: remove stale bare files from older versions.
|
|
153
|
+
if (!dryRun) cleanupLegacyClaudeSkills(skillsDir);
|
|
66
154
|
} else if (t === 'cursor') {
|
|
67
155
|
const dir = join(cwd, '.cursor', 'rules');
|
|
68
156
|
const dest = join(dir, 'crawlforge.mdc');
|
|
@@ -99,25 +187,27 @@ export async function install({ target = 'all', force = false, dryRun = false, c
|
|
|
99
187
|
|
|
100
188
|
/**
|
|
101
189
|
* Uninstall skills from the given target.
|
|
102
|
-
* @param {{ target
|
|
190
|
+
* @param {{ target?: 'claude-code'|'cursor'|'vscode'|'all', cwd?: string, homeDir?: string }} opts
|
|
103
191
|
* @returns {{ removed: string[], notFound: string[] }}
|
|
104
192
|
*/
|
|
105
|
-
export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
|
|
193
|
+
export async function uninstall({ target = 'all', cwd = process.cwd(), homeDir = homedir() } = {}) {
|
|
106
194
|
const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
|
|
107
195
|
const results = { removed: [], notFound: [] };
|
|
196
|
+
const skills = listAgentSkills();
|
|
108
197
|
|
|
109
198
|
for (const t of targets) {
|
|
110
199
|
if (t === 'claude-code') {
|
|
111
|
-
const skillsDir = join(
|
|
112
|
-
for (const
|
|
113
|
-
const
|
|
114
|
-
if (existsSync(
|
|
115
|
-
|
|
116
|
-
results.removed.push(
|
|
200
|
+
const skillsDir = join(homeDir, '.claude', 'skills');
|
|
201
|
+
for (const skill of skills) {
|
|
202
|
+
const destDir = join(skillsDir, skill.name);
|
|
203
|
+
if (existsSync(destDir)) {
|
|
204
|
+
rmSync(destDir, { recursive: true, force: true });
|
|
205
|
+
results.removed.push(destDir);
|
|
117
206
|
} else {
|
|
118
|
-
results.notFound.push(
|
|
207
|
+
results.notFound.push(destDir);
|
|
119
208
|
}
|
|
120
209
|
}
|
|
210
|
+
results.removed.push(...cleanupLegacyClaudeSkills(skillsDir));
|
|
121
211
|
} else if (t === 'cursor') {
|
|
122
212
|
const dest = join(cwd, '.cursor', 'rules', 'crawlforge.mdc');
|
|
123
213
|
if (existsSync(dest)) {
|
|
@@ -139,3 +229,65 @@ export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
|
|
|
139
229
|
|
|
140
230
|
return results;
|
|
141
231
|
}
|
|
232
|
+
|
|
233
|
+
// --- Optional, opt-in forced-eval hook (boosts skill auto-activation) ---
|
|
234
|
+
|
|
235
|
+
const HOOK_MARKER = 'CrawlForge skill';
|
|
236
|
+
const HOOK_COMMAND =
|
|
237
|
+
"echo 'Consider whether a CrawlForge skill applies: web scraping, deep research, " +
|
|
238
|
+
"stealth browsing, structured extraction, change tracking, or batch automation.'";
|
|
239
|
+
|
|
240
|
+
/**
 * Add a UserPromptSubmit forced-eval reminder to ~/.claude/settings.json.
 * Idempotent additive merge — preserves all existing settings and only adds the
 * hook if an equivalent one is not already present. Opt-in (CLI --with-hook).
 *
 * An existing settings.json is never overwritten unless it could be read and
 * parsed into a JSON object: if the file is unreadable, is not valid JSON, has
 * a non-object root, or has a truthy non-object `hooks` value, nothing is
 * written and `{ added: false, reason: 'unreadable-settings' }` is returned.
 * A missing or empty/whitespace-only file is treated as `{}`.
 *
 * @param {{ homeDir?: string }} opts
 * @returns {{ added: boolean, path: string, reason?: string }}
 */
export function installHook({ homeDir = homedir() } = {}) {
  const dir = join(homeDir, '.claude');
  const path = join(dir, 'settings.json');
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);
  const refuse = () => ({ added: false, path, reason: 'unreadable-settings' });

  let settings = {};
  if (existsSync(path)) {
    let parsed;
    try {
      const text = readFileSync(path, 'utf8');
      parsed = text.trim() === '' ? {} : JSON.parse(text);
    } catch {
      // Falling back to `{}` here would wipe the user's settings on write.
      return refuse();
    }
    if (!isPlainObject(parsed)) return refuse();
    settings = parsed;
  }

  if (!settings.hooks) {
    settings.hooks = {};
  } else if (!isPlainObject(settings.hooks)) {
    // A hook list cannot be attached to a string/array/number without losing it.
    return refuse();
  }

  const list = Array.isArray(settings.hooks.UserPromptSubmit)
    ? settings.hooks.UserPromptSubmit
    : [];
  // Marker text anywhere in the serialized list counts as "already installed".
  if (JSON.stringify(list).includes(HOOK_MARKER)) return { added: false, path };

  list.push({ hooks: [{ type: 'command', command: HOOK_COMMAND }] });
  settings.hooks.UserPromptSubmit = list;
  mkdirSync(dir, { recursive: true });
  writeFileSync(path, JSON.stringify(settings, null, 2) + '\n', 'utf8');
  return { added: true, path };
}
|
|
271
|
+
|
|
272
|
+
/**
 * Remove the CrawlForge forced-eval hook from ~/.claude/settings.json.
 *
 * Every UserPromptSubmit entry whose JSON serialization contains HOOK_MARKER
 * is dropped; all other settings are written back unchanged. The file is left
 * untouched (and `removed` is false) when it is missing, cannot be parsed,
 * has no UserPromptSubmit array, or contains no matching entry.
 *
 * @param {{ homeDir?: string }} opts
 * @returns {{ removed: boolean, path: string }}
 */
export function uninstallHook({ homeDir = homedir() } = {}) {
  const path = join(homeDir, '.claude', 'settings.json');
  const untouched = () => ({ removed: false, path });

  if (!existsSync(path)) {
    return untouched();
  }

  let settings;
  try {
    const text = readFileSync(path, 'utf8');
    settings = JSON.parse(text);
  } catch {
    return untouched();
  }

  const existing = settings?.hooks?.UserPromptSubmit;
  if (!Array.isArray(existing)) {
    return untouched();
  }

  const kept = [];
  for (const entry of existing) {
    const isOurs = JSON.stringify(entry).includes(HOOK_MARKER);
    if (!isOurs) kept.push(entry);
  }
  if (kept.length === existing.length) {
    return untouched();
  }

  settings.hooks.UserPromptSubmit = kept;
  const serialized = JSON.stringify(settings, null, 2) + '\n';
  writeFileSync(path, serialized, 'utf8');
  return { removed: true, path };
}
|
|
@@ -5,25 +5,31 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
import { load } from 'cheerio';
|
|
8
|
+
import { ssrfGuard, isSsrfError } from '../../../utils/ssrfGuard.js';
|
|
9
|
+
import { throttleHost } from '../../../utils/hostRateLimiter.js';
|
|
8
10
|
|
|
9
11
|
const USER_AGENT = 'MCP-WebScraper-BatchTool/1.0.0';
|
|
10
12
|
|
|
11
13
|
/**
|
|
12
|
-
* Fetch a URL with AbortController timeout.
|
|
14
|
+
* Fetch a URL with AbortController timeout (SSRF-guarded + per-host throttled).
|
|
13
15
|
*/
|
|
14
16
|
export async function fetchUrl(url, options = {}) {
|
|
15
17
|
const { timeout = 15000, headers = {} } = options;
|
|
18
|
+
const guard = ssrfGuard(url); // SSRF pre-flight (throws before connecting)
|
|
19
|
+
await throttleHost(url);
|
|
16
20
|
const controller = new AbortController();
|
|
17
21
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
18
22
|
try {
|
|
19
23
|
const response = await fetch(url, {
|
|
20
24
|
signal: controller.signal,
|
|
21
|
-
headers: { 'User-Agent': USER_AGENT, ...headers }
|
|
25
|
+
headers: { 'User-Agent': USER_AGENT, ...headers },
|
|
26
|
+
...guard
|
|
22
27
|
});
|
|
23
28
|
clearTimeout(timeoutId);
|
|
24
29
|
return response;
|
|
25
30
|
} catch (error) {
|
|
26
31
|
clearTimeout(timeoutId);
|
|
32
|
+
if (isSsrfError(error)) throw new Error(error.cause?.message || error.message);
|
|
27
33
|
if (error.name === 'AbortError') throw new Error(`Request timeout after ${timeout}ms`);
|
|
28
34
|
throw error;
|
|
29
35
|
}
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
import { config } from '../../constants/config.js';
|
|
7
7
|
import { createRequire } from 'module';
|
|
8
|
+
import { ssrfGuard, isSsrfError } from '../../utils/ssrfGuard.js';
|
|
9
|
+
import { throttleHost } from '../../utils/hostRateLimiter.js';
|
|
8
10
|
|
|
9
11
|
// Derive User-Agent from package version so it reflects the actual release.
|
|
10
12
|
const _require = createRequire(import.meta.url);
|
|
@@ -27,6 +29,13 @@ export async function fetchWithTimeout(url, options = {}) {
|
|
|
27
29
|
const { timeout = 10000, headers = {} } = options;
|
|
28
30
|
const maxBodySize = config.fetch.maxBodySize;
|
|
29
31
|
|
|
32
|
+
// SSRF pre-flight (protocol / metadata host). Throws a clear error before any
|
|
33
|
+
// connection is attempted; `guard.dispatcher` enforces IP rules at connect time.
|
|
34
|
+
const guard = ssrfGuard(url);
|
|
35
|
+
|
|
36
|
+
// Per-host politeness throttle (before the timeout window starts).
|
|
37
|
+
await throttleHost(url);
|
|
38
|
+
|
|
30
39
|
const controller = new AbortController();
|
|
31
40
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
32
41
|
|
|
@@ -37,11 +46,15 @@ export async function fetchWithTimeout(url, options = {}) {
|
|
|
37
46
|
headers: {
|
|
38
47
|
'User-Agent': CRAWLFORGE_UA,
|
|
39
48
|
...headers
|
|
40
|
-
}
|
|
49
|
+
},
|
|
50
|
+
...guard
|
|
41
51
|
});
|
|
42
52
|
clearTimeout(timeoutId);
|
|
43
53
|
} catch (error) {
|
|
44
54
|
clearTimeout(timeoutId);
|
|
55
|
+
if (isSsrfError(error)) {
|
|
56
|
+
throw new Error(error.cause?.message || error.message);
|
|
57
|
+
}
|
|
45
58
|
if (error.name === 'AbortError') {
|
|
46
59
|
throw new Error(`Request timeout after ${timeout}ms`);
|
|
47
60
|
}
|
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
* - Keeping zero new runtime deps satisfies the project constraint.
|
|
16
16
|
*/
|
|
17
17
|
|
|
18
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
19
|
+
|
|
18
20
|
/**
|
|
19
21
|
* Parse a single Set-Cookie header value into a cookie object.
|
|
20
22
|
* Returns null if the header is empty or unparseable.
|
|
@@ -220,7 +222,7 @@ export class SessionContext {
|
|
|
220
222
|
fetchOpts.body = body;
|
|
221
223
|
}
|
|
222
224
|
|
|
223
|
-
const response = await
|
|
225
|
+
const response = await safeFetch(url, fetchOpts);
|
|
224
226
|
this.recordCookies(response, url);
|
|
225
227
|
|
|
226
228
|
const text = await response.text().catch(() => '');
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
import { load } from 'cheerio';
|
|
14
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
14
15
|
|
|
15
16
|
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0)';
|
|
16
17
|
const DEFAULT_TIMEOUT_MS = 15000;
|
|
@@ -32,7 +33,7 @@ export async function fetchAndParse(url, options = {}) {
|
|
|
32
33
|
stripTags = ['script', 'style', 'noscript', 'iframe', 'svg']
|
|
33
34
|
} = options;
|
|
34
35
|
|
|
35
|
-
const response = await
|
|
36
|
+
const response = await safeFetch(url, {
|
|
36
37
|
headers: {
|
|
37
38
|
'User-Agent': userAgent,
|
|
38
39
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
|
@@ -8,6 +8,7 @@ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
|
|
|
8
8
|
import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
|
|
9
9
|
import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
|
|
10
10
|
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
|
|
11
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
11
12
|
|
|
12
13
|
const ExtractContentSchema = z.object({
|
|
13
14
|
url: z.string().url(),
|
|
@@ -169,7 +170,7 @@ export class ExtractContentTool {
|
|
|
169
170
|
pageTitle = browserResult.title;
|
|
170
171
|
} else {
|
|
171
172
|
// Simple HTTP fetch
|
|
172
|
-
const response = await
|
|
173
|
+
const response = await safeFetch(url, {
|
|
173
174
|
headers: {
|
|
174
175
|
'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
|
|
175
176
|
},
|
|
@@ -9,6 +9,7 @@ import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
|
|
|
9
9
|
import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
|
|
10
10
|
import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
|
|
11
11
|
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
|
|
12
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
12
13
|
|
|
13
14
|
const ProcessDocumentSchema = z.object({
|
|
14
15
|
source: z.string().min(1),
|
|
@@ -275,7 +276,7 @@ export class ProcessDocumentTool {
|
|
|
275
276
|
pageTitle = browserResult.title;
|
|
276
277
|
} else {
|
|
277
278
|
// Simple HTTP fetch
|
|
278
|
-
const response = await
|
|
279
|
+
const response = await safeFetch(source, {
|
|
279
280
|
headers: {
|
|
280
281
|
'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Document-Processor)'
|
|
281
282
|
},
|