crawlforge-mcp-server 4.7.2 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/package.json +2 -1
- package/server.js +42 -9
- package/src/cli/commands/init.js +13 -2
- package/src/cli/commands/install-skills.js +10 -1
- package/src/cli/commands/monitor.js +81 -0
- package/src/cli/commands/uninstall-skills.js +10 -1
- package/src/core/ActionExecutor.js +51 -9
- package/src/core/ElicitationHelper.js +18 -5
- package/src/core/LLMsTxtAnalyzer.js +2 -1
- package/src/core/MonitorScheduler.js +281 -0
- package/src/core/MonitorStore.js +79 -0
- package/src/core/ResearchOrchestrator.js +2 -1
- package/src/core/crawlers/BFSCrawler.js +2 -1
- package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
- package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
- package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
- package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
- package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
- package/src/skills/installer.js +186 -34
- package/src/tools/advanced/batchScrape/worker.js +8 -2
- package/src/tools/basic/_fetch.js +14 -1
- package/src/tools/crawl/_sessionContext.js +3 -1
- package/src/tools/extract/_fetchAndParse.js +2 -1
- package/src/tools/extract/extractContent.js +2 -1
- package/src/tools/extract/processDocument.js +2 -1
- package/src/tools/scrape/_brandingExtractor.js +378 -0
- package/src/tools/scrape/unifiedScrape.js +66 -6
- package/src/tools/templates/ScrapeTemplateTool.js +2 -1
- package/src/tools/tracking/trackChanges/differ.js +3 -1
- package/src/tools/tracking/trackChanges/index.js +74 -21
- package/src/tools/tracking/trackChanges/schema.js +7 -2
- package/src/utils/hostRateLimiter.js +46 -0
- package/src/utils/robotsChecker.js +2 -1
- package/src/utils/sitemapParser.js +2 -1
- package/src/utils/ssrfGuard.js +161 -0
- package/src/utils/ssrfProtection.js +6 -9
- package/src/skills/crawlforge-cli.md +0 -157
- package/src/skills/crawlforge-mcp.md +0 -80
- package/src/skills/crawlforge-research.md +0 -104
- package/src/skills/crawlforge-stealth.md +0 -98
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MonitorScheduler — recurring change-monitoring engine.
|
|
3
|
+
*
|
|
4
|
+
* Lifecycle:
|
|
5
|
+
* start() load persisted monitors -> rehydrate baseline -> schedule
|
|
6
|
+
* setInterval timers -> catch-up fire anything already due.
|
|
7
|
+
* _fire(def) fetch -> (first run) create baseline, else compareWithBaseline
|
|
8
|
+
* -> significance threshold gate -> optional plain-English goal
|
|
9
|
+
* judge (SamplingClient, degrades gracefully) -> notify -> persist.
|
|
10
|
+
* runDueOnce() fire everything currently due exactly once (the external-cron
|
|
11
|
+
* one-shot path; guaranteed firing regardless of process uptime).
|
|
12
|
+
* stopAll() clear every timer (graceful shutdown — no leaked handles).
|
|
13
|
+
*
|
|
14
|
+
* Honest firing model: a stdio MCP server is not a daemon. In-process timers only
|
|
15
|
+
* fire while the process is alive; missed runs are caught up on next start(). For
|
|
16
|
+
* guaranteed firing, run `crawlforge monitor:run-due` from system cron.
|
|
17
|
+
*
|
|
18
|
+
* All dependencies are injected so the engine is unit-testable without network,
|
|
19
|
+
* timers, or an LLM.
|
|
20
|
+
*/
|
|
21
|
+
import { fetchContent, meetsNotificationThreshold } from '../tools/tracking/trackChanges/differ.js';
|
|
22
|
+
import { sendNotifications } from '../tools/tracking/trackChanges/notifier.js';
|
|
23
|
+
|
|
24
|
+
export const FIRING_GUARANTEE_NOTE =
  'In-process scheduling fires only while the MCP server process is alive; missed runs are ' +
  'caught up on restart. For guaranteed firing, run `crawlforge monitor:run-due` from system cron.';

/** Shortest allowed gap between two checks of the same monitor (ms). */
const MIN_INTERVAL = 60_000;

/** Interval applied when a monitor does not carry a usable one (ms). */
const DEFAULT_INTERVAL = 3_600_000;

/**
 * Largest delay Node.js timers accept (2^31 - 1 ms). Node coerces anything
 * larger — as well as NaN — to 1 ms, which would turn a monitor into a busy loop.
 */
const MAX_TIMER_DELAY = 2_147_483_647;

/**
 * Turn a caller-supplied interval into a safe number of milliseconds.
 *
 * Missing/zero values and values that are not finite numbers fall back to
 * DEFAULT_INTERVAL; everything else is raised to at least MIN_INTERVAL.
 *
 * @param {*} raw - interval as received from tool input or a persisted definition
 * @returns {number} interval in milliseconds, always finite and >= MIN_INTERVAL
 */
function resolveInterval(raw) {
  const ms = Number(raw);
  if (!raw || !Number.isFinite(ms)) return DEFAULT_INTERVAL;
  return Math.max(MIN_INTERVAL, ms);
}

/** @returns {{checks: number, changesDetected: number, notificationsSent: number, errors: number}} zeroed counters */
function emptyStats() {
  return { checks: 0, changesDetected: 0, notificationsSent: 0, errors: 0 };
}

export class MonitorScheduler {
  /**
   * @param {object} [deps]
   * @param {object} deps.tool - provides `changeTracker` and `snapshotManager`
   * @param {object} deps.store - monitor persistence (load/list/get/save/remove/newId)
   * @param {object|null} [deps.samplingClient] - optional LLM client exposing `complete(prompt, opts)`
   * @param {object|null} [deps.logger] - optional logger with warn/info/error; defaults to a no-op
   * @param {() => number} [deps.now] - clock returning epoch milliseconds
   */
  constructor({ tool, store, samplingClient = null, logger = null, now = () => Date.now() } = {}) {
    this.tool = tool;
    this.store = store;
    this.samplingClient = samplingClient;
    this.logger = logger || { warn() {}, info() {}, error() {} };
    this.now = now;
    this.timers = new Map();
    this._started = false;
  }

  /**
   * Load persisted monitors, rehydrate their baselines, schedule a timer for
   * each enabled one and fire (in the background) any that are already due.
   * Subsequent calls are no-ops.
   */
  async start() {
    if (this._started) return;
    this._started = true;
    await this.store.load();
    for (const def of this.store.list()) {
      if (def.enabled === false) continue;
      try {
        await this._ensureBaseline(def);
      } catch (err) {
        this.logger.warn('monitor baseline rehydrate failed', { id: def.id, error: err.message });
      }
      this._schedule(def);
      if (!def.nextDueAt || def.nextDueAt <= this.now()) {
        this._fireInBackground(def);
      }
    }
  }

  /**
   * Run one check without awaiting it. A rejection is logged instead of being
   * dropped, so a broken monitor is visible in the logs.
   */
  _fireInBackground(def) {
    this._fire(def).catch((err) => {
      this.logger.warn('monitor fire failed', { id: def.id, error: err?.message });
    });
  }

  /**
   * (Re)create the repeating timer for a monitor. Disabled monitors only get
   * their existing timer cleared.
   */
  _schedule(def) {
    this._clearTimer(def.id);
    if (def.enabled === false) return;
    // Clamp to what Node timers support; intervals above the cap therefore fire
    // at the cap rather than collapsing to a 1 ms loop.
    const delay = Math.min(resolveInterval(def.interval), MAX_TIMER_DELAY);
    const timer = setInterval(() => this._fireInBackground(def), delay);
    if (typeof timer.unref === 'function') timer.unref(); // never keep the process alive
    this.timers.set(def.id, timer);
  }

  /** Cancel and forget the timer registered for `id`, if any. */
  _clearTimer(id) {
    const t = this.timers.get(id);
    if (t) {
      clearInterval(t);
      this.timers.delete(id);
    }
  }

  /** Clear every timer (graceful shutdown — no leaked handles). */
  stopAll() {
    for (const t of this.timers.values()) clearInterval(t);
    this.timers.clear();
  }

  /**
   * Create, persist and schedule a new monitor.
   *
   * @param {object} input - requires `url`; optional `interval`, `schedule`, `goal`,
   *   `notificationThreshold`, `trackingOptions`, `notificationOptions`
   * @returns {Promise<object>} the stored definition plus `firingGuarantee`
   * @throws {Error} when `input.url` is missing
   */
  async createMonitor(input) {
    if (!input || !input.url) throw new Error('createMonitor requires a url');
    const interval = resolveInterval(input.interval);
    const t = this.now();
    const def = {
      id: this.store.newId(),
      url: input.url,
      interval,
      schedule: input.schedule || null,
      goal: input.goal || null,
      notificationThreshold: input.notificationThreshold || 'moderate',
      trackingOptions: input.trackingOptions || {},
      notificationOptions: input.notificationOptions || null,
      enabled: true,
      createdAt: t,
      nextDueAt: t + interval,
      lastCheckAt: null,
      lastChangeAt: null,
      stats: emptyStats(),
    };
    const saved = await this.store.save(def);
    if (saved && saved.persisted === false) {
      // The monitor still runs from memory, but it will not survive a restart.
      this.logger.warn('monitor not persisted', { id: def.id, error: saved.error });
    }
    try {
      await this._ensureBaseline(def);
    } catch {
      /* baseline will be created on first fire */
    }
    this._schedule(def);
    return { ...def, firingGuarantee: FIRING_GUARANTEE_NOTE };
  }

  /**
   * Cancel a monitor's timer and remove it from the store.
   * @returns {Promise<{stopped: boolean, id: string}>} `stopped` is false when the id was unknown
   */
  async stopMonitor(id) {
    this._clearTimer(id);
    const existed = !!this.store.get(id);
    await this.store.remove(id);
    return { stopped: existed, id };
  }

  /**
   * Stop every monitor whose url equals `url` exactly.
   * @returns {Promise<{stopped: number, url: string}>} number of monitors removed
   */
  async stopByUrl(url) {
    let count = 0;
    for (const def of this.store.list()) {
      if (def.url === url) {
        await this.stopMonitor(def.id);
        count++;
      }
    }
    return { stopped: count, url };
  }

  /** Summarise every stored monitor, including whether it has a live timer. */
  list() {
    return this.store.list().map((d) => ({
      id: d.id,
      url: d.url,
      interval: d.interval,
      goal: d.goal,
      enabled: d.enabled,
      nextDueAt: d.nextDueAt,
      lastCheckAt: d.lastCheckAt,
      lastChangeAt: d.lastChangeAt,
      stats: d.stats,
      scheduled: this.timers.has(d.id),
    }));
  }

  /**
   * Fire every enabled monitor that is currently due, one after another
   * (the external-cron one-shot path).
   * @returns {Promise<{fired: number, results: object[]}>}
   */
  async runDueOnce() {
    if (!this.store._loaded) await this.store.load();
    const t = this.now();
    const due = this.store.list().filter((d) => d.enabled !== false && (!d.nextDueAt || d.nextDueAt <= t));
    const results = [];
    for (const def of due) {
      try {
        results.push(await this._fire(def));
      } catch (err) {
        results.push({ id: def.id, url: def.url, error: err.message });
      }
    }
    return { fired: results.length, results };
  }

  /**
   * Make sure the change tracker holds a baseline for the monitor's url by
   * rehydrating it from the most recent stored snapshot. Best-effort: when no
   * usable snapshot exists the first fire creates the baseline instead.
   */
  async _ensureBaseline(def) {
    const ct = this.tool.changeTracker;
    if (ct?.snapshots?.has(def.url)) return;
    try {
      const q = await this.tool.snapshotManager.querySnapshots({ url: def.url, limit: 1, includeContent: true });
      const content = q?.snapshots?.[0]?.content;
      if (content && typeof content === 'string') {
        await ct.createBaseline(def.url, content, def.trackingOptions);
      }
    } catch {
      /* no usable snapshot — first fire will create the baseline */
    }
  }

  /**
   * Run one check: fetch, create the baseline on first run or compare against
   * it, apply the threshold gate, optionally judge against the goal, notify,
   * then record timestamps/stats.
   *
   * Fetch, baseline and compare failures are counted in `stats.errors` and
   * reported as `{ error }`; they still advance `nextDueAt`.
   *
   * @returns {Promise<object>} `{ id, url, ... }` describing the outcome
   */
  async _fire(def) {
    const ct = this.tool.changeTracker;
    // Definitions loaded from disk may lack counters; fill the gaps so `++` is safe.
    def.stats = { ...emptyStats(), ...def.stats };

    const finish = async (extra) => {
      def.lastCheckAt = this.now();
      def.nextDueAt = this.now() + resolveInterval(def.interval);
      // Skip the save when the monitor was removed while this run was in
      // flight — saving would silently re-create a stopped monitor.
      if (this.store.get(def.id)) await this.store.save(def);
      return { id: def.id, url: def.url, ...extra };
    };
    const fail = (err) => {
      def.stats.errors++;
      return finish({ error: err.message });
    };

    let content;
    try {
      ({ content } = await fetchContent(def.url));
    } catch (err) {
      return fail(err);
    }

    // First fire (or unrehydrated) — establish a baseline.
    if (!ct?.snapshots?.has(def.url)) {
      try {
        await ct.createBaseline(def.url, content, def.trackingOptions);
      } catch (err) {
        return fail(err);
      }
      try {
        await this.tool.snapshotManager.storeSnapshot(def.url, content, { baseline: true, scheduledMonitor: def.id });
      } catch {
        /* snapshot persistence best-effort */
      }
      def.stats.checks++;
      return finish({ baselineCreated: true });
    }

    let cmp;
    try {
      cmp = await ct.compareWithBaseline(def.url, content, def.trackingOptions);
    } catch (err) {
      return fail(err);
    }

    def.stats.checks++;
    const meets = cmp.hasChanges && meetsNotificationThreshold(cmp.significance, def.notificationThreshold);
    let judge = null;
    let notified = false;

    if (meets) {
      def.stats.changesDetected++;
      def.lastChangeAt = this.now();
      judge = await this._judgeGoal(def, cmp);
      try {
        await this.tool.snapshotManager.storeSnapshot(def.url, content, {
          changes: cmp.summary,
          significance: cmp.significance,
          scheduledMonitor: def.id,
        });
      } catch {
        /* best-effort */
      }
      if (judge.meaningful && def.notificationOptions) {
        try {
          await sendNotifications(def.url, { ...cmp, goalJudgment: judge }, def.notificationOptions, this.tool);
          def.stats.notificationsSent++;
          notified = true;
        } catch (err) {
          def.stats.errors++;
          this.logger.warn('monitor notification failed', { id: def.id, error: err.message });
        }
      }
    }

    return finish({
      hasChanges: cmp.hasChanges,
      significance: cmp.significance,
      notified,
      mode: judge?.mode || 'threshold',
      reason: judge?.reason,
    });
  }

  /**
   * Decide whether a detected change is "meaningful" given the monitor's
   * plain-English goal. Degrades gracefully: no goal -> threshold; no LLM ->
   * notify and tag the mode; never hard-requires an LLM key. Any judge failure
   * or unparseable reply also resolves to `meaningful: true`.
   *
   * @returns {Promise<{meaningful: boolean, mode: string, reason?: string}>}
   */
  async _judgeGoal(def, cmp) {
    if (!def.goal) return { meaningful: true, mode: 'threshold' };
    if (!this.samplingClient) return { meaningful: true, mode: 'degraded-no-llm' };
    try {
      // Cap the summary so the prompt stays small.
      const summary = JSON.stringify(cmp.summary || {}).slice(0, 1500);
      const prompt =
        `A monitored web page changed. The user's alert goal is:\n"${def.goal}"\n\n` +
        `Change summary (JSON):\n${summary}\n\n` +
        `Does this change match the goal and warrant an alert? ` +
        `Reply ONLY with JSON: {"meaningful": true|false, "reason": "short reason"}.`;
      const res = await this.samplingClient.complete(prompt, { maxTokens: 200 });
      const text = typeof res === 'string' ? res : res?.text || '';
      const m = text.match(/\{[\s\S]*\}/);
      if (m) {
        const parsed = JSON.parse(m[0]);
        // Only an explicit `false` suppresses the alert.
        return { meaningful: parsed.meaningful !== false, reason: parsed.reason || '', mode: 'llm' };
      }
      return { meaningful: true, mode: 'degraded-llm-error', reason: 'unparseable judge output' };
    } catch (err) {
      return { meaningful: true, mode: 'degraded-llm-error', reason: err.message };
    }
  }
}
|
|
280
|
+
|
|
281
|
+
export default MonitorScheduler;
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MonitorStore — disk persistence for scheduled change-monitors.
|
|
3
|
+
*
|
|
4
|
+
* One JSON file per monitor under ./monitors/<id>.json. Mirrors JobManager's
|
|
5
|
+
* persistence *pattern* (mkdir-recursive, per-file JSON, randomUUID, load-on-
|
|
6
|
+
* start) but deliberately omits TTL/eviction — scheduled monitors are long-lived
|
|
7
|
+
* and must never be auto-expired.
|
|
8
|
+
*/
|
|
9
|
+
import { promises as fs } from 'node:fs';
|
|
10
|
+
import path from 'node:path';
|
|
11
|
+
import { randomUUID } from 'node:crypto';
|
|
12
|
+
|
|
13
|
+
export class MonitorStore {
  /**
   * @param {object} [options]
   * @param {string} [options.storageDir='./monitors'] - directory holding one `<id>.json` per monitor
   */
  constructor({ storageDir = './monitors' } = {}) {
    this.storageDir = storageDir;
    this.monitors = new Map();
    this._loaded = false;
  }

  /**
   * Resolve the on-disk path for a monitor id, or `null` when the id could
   * escape the storage directory. Ids reach this class from tool input
   * (stop-by-id), so path separators must never be trusted.
   *
   * @param {*} id - monitor id
   * @returns {string|null} absolute-or-relative file path inside `storageDir`, or null if unsafe
   */
  _fileFor(id) {
    const name = String(id);
    if (name === '' || name.includes('/') || name.includes('\\') || name.includes('\0')) return null;
    return path.join(this.storageDir, `${name}.json`);
  }

  /**
   * Read every `*.json` file in the storage directory into memory. Corrupt
   * files and an unavailable directory are tolerated — the store then simply
   * holds fewer (or no) monitors.
   *
   * @returns {Promise<Map<string, object>>} the in-memory monitor map
   */
  async load() {
    try {
      await fs.mkdir(this.storageDir, { recursive: true });
      const files = await fs.readdir(this.storageDir);
      for (const f of files) {
        if (!f.endsWith('.json')) continue;
        try {
          const def = JSON.parse(await fs.readFile(path.join(this.storageDir, f), 'utf8'));
          if (def && def.id) this.monitors.set(def.id, def);
        } catch {
          /* skip corrupt file */
        }
      }
    } catch {
      /* dir unavailable — start empty */
    }
    this._loaded = true;
    return this.monitors;
  }

  /** @returns {string} a fresh random UUID to use as a monitor id */
  newId() {
    return randomUUID();
  }

  /**
   * Store a monitor in memory and write it to disk.
   *
   * The file is written to a temporary sibling and renamed into place, so an
   * interrupted write cannot leave a truncated `<id>.json` that `load()` would
   * discard as corrupt. The in-memory copy is kept even when persisting fails.
   *
   * @param {object} def - monitor definition; `def.id` names the file
   * @returns {Promise<{def: object, persisted: boolean, error?: string}>}
   */
  async save(def) {
    this.monitors.set(def.id, def);
    const target = this._fileFor(def.id);
    if (!target) return { def, persisted: false, error: 'unsafe monitor id' };
    // Not matched by load(): the name does not end in `.json`.
    const scratch = `${target}.${randomUUID()}.tmp`;
    try {
      await fs.mkdir(this.storageDir, { recursive: true });
      await fs.writeFile(scratch, JSON.stringify(def, null, 2), 'utf8');
      await fs.rename(scratch, target);
    } catch (err) {
      /* keep the in-memory copy even if the write fails */
      await fs.unlink(scratch).catch(() => {});
      return { def, persisted: false, error: err.message };
    }
    return { def, persisted: true };
  }

  /**
   * Forget a monitor and delete its file.
   *
   * @param {string} id - monitor id
   * @returns {Promise<boolean>} true when a file was deleted; false when there
   *   was none, deletion failed, or the id was rejected as unsafe
   */
  async remove(id) {
    this.monitors.delete(id);
    const target = this._fileFor(id);
    if (!target) return false;
    try {
      await fs.unlink(target);
      return true;
    } catch {
      return false;
    }
  }

  /** @returns {object|undefined} the monitor stored under `id` */
  get(id) {
    return this.monitors.get(id);
  }

  /** @returns {object[]} a new array of all stored monitors */
  list() {
    return [...this.monitors.values()];
  }
}
|
|
78
|
+
|
|
79
|
+
export default MonitorStore;
|
|
@@ -8,6 +8,7 @@ import { ResultRanker } from '../tools/search/ranking/ResultRanker.js';
|
|
|
8
8
|
import { CacheManager } from './cache/CacheManager.js';
|
|
9
9
|
import { Logger } from '../utils/Logger.js';
|
|
10
10
|
import { LLMManager } from './llm/LLMManager.js';
|
|
11
|
+
import { safeFetch } from '../utils/ssrfGuard.js';
|
|
11
12
|
|
|
12
13
|
/**
|
|
13
14
|
* ResearchOrchestrator - Multi-stage research orchestration engine with LLM integration
|
|
@@ -552,7 +553,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
552
553
|
});
|
|
553
554
|
// Fallback: use fetch + basic text extraction
|
|
554
555
|
try {
|
|
555
|
-
const fetchResponse = await
|
|
556
|
+
const fetchResponse = await safeFetch(source.link, {
|
|
556
557
|
headers: { 'User-Agent': 'CrawlForge-Research/1.0' },
|
|
557
558
|
signal: AbortSignal.timeout(10000)
|
|
558
559
|
});
|
|
@@ -7,6 +7,7 @@ import { DomainFilter } from '../../utils/domainFilter.js';
|
|
|
7
7
|
import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
|
|
8
8
|
import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
|
|
9
9
|
import { Logger } from '../../utils/Logger.js';
|
|
10
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
10
11
|
|
|
11
12
|
const logger = new Logger('BFSCrawler');
|
|
12
13
|
|
|
@@ -284,7 +285,7 @@ export class BFSCrawler {
|
|
|
284
285
|
setTimeout(() => controller.abort(), effectiveTimeout);
|
|
285
286
|
}
|
|
286
287
|
|
|
287
|
-
const response = await
|
|
288
|
+
const response = await safeFetch(url, {
|
|
288
289
|
signal: controller.signal,
|
|
289
290
|
headers
|
|
290
291
|
});
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: crawlforge-batch-automation
|
|
3
|
+
description: "Automates large scraping jobs and browser interactions with CrawlForge's batch_scrape, get_batch_results, scrape_with_actions, and generate_llms_txt tools. Use when the user wants to scrape many URLs at once, batch-scrape a list of pages, collect dozens of product, news, or competitor pages, run browser actions (click, type, scroll, wait) before scraping, log in or fill a form before extracting, or generate an llms.txt for a site. Use sync mode for up to about 25 URLs and async with a webhook for large batches; retrieve paginated output with get_batch_results."
|
|
4
|
+
metadata:
|
|
5
|
+
version: 4.8.0
|
|
6
|
+
source: crawlforge-mcp-server
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# CrawlForge Batch & Automation
|
|
10
|
+
|
|
11
|
+
Scale up scraping and drive interactive pages. Use `batch_scrape` for many URLs,
|
|
12
|
+
`get_batch_results` to page through async output, `scrape_with_actions` to
|
|
13
|
+
interact before scraping, and `generate_llms_txt` to produce a site's AI policy
|
|
14
|
+
file.
|
|
15
|
+
|
|
16
|
+
## When to use
|
|
17
|
+
|
|
18
|
+
- "Scrape these 30 URLs" / "batch-scrape this list" → `batch_scrape`
|
|
19
|
+
- "Collect dozens of product / news / competitor pages" → `batch_scrape` (async)
|
|
20
|
+
- "Get the rest of the results from that batch" → `get_batch_results`
|
|
21
|
+
- "Click / type / scroll / wait before scraping" / "log in then extract" →
|
|
22
|
+
`scrape_with_actions`
|
|
23
|
+
- "Generate an llms.txt for this site" → `generate_llms_txt`
|
|
24
|
+
|
|
25
|
+
## batch_scrape — many URLs in parallel (cost: 5)
|
|
26
|
+
|
|
27
|
+
Sync mode (results returned immediately), good for up to ~25 URLs:
|
|
28
|
+
|
|
29
|
+
```json
|
|
30
|
+
{
|
|
31
|
+
"tool": "batch_scrape",
|
|
32
|
+
"params": {
|
|
33
|
+
"urls": ["https://a.com", "https://b.com", "https://c.com"],
|
|
34
|
+
"formats": ["markdown"],
|
|
35
|
+
"mode": "sync",
|
|
36
|
+
"maxConcurrency": 5
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Async mode with a webhook for large batches (returns a `batchId` immediately):
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"tool": "batch_scrape",
|
|
46
|
+
"params": {
|
|
47
|
+
"urls": ["https://a.com", "https://b.com"],
|
|
48
|
+
"formats": ["json"],
|
|
49
|
+
"mode": "async",
|
|
50
|
+
"webhook": { "url": "https://my-site.com/hook", "events": ["batch_completed", "batch_failed"] }
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
- `urls` accepts plain strings OR objects `{url, selectors, headers, timeout,
|
|
56
|
+
metadata}` for per-URL config. 1–50 URLs.
|
|
57
|
+
- `formats`: `markdown`, `html`, `json`, `text`.
|
|
58
|
+
- `extractionSchema` applies structured extraction to every URL.
|
|
59
|
+
- `maxConcurrency` 1–20 (default 10); `delayBetweenRequests` throttles.
|
|
60
|
+
- Sync batches over ~25 URLs trigger a confirmation prompt (elicitation) — use
|
|
61
|
+
async for large jobs.
|
|
62
|
+
|
|
63
|
+
CLI: `crawlforge batch urls.txt --format markdown --concurrency 10`.
|
|
64
|
+
|
|
65
|
+
## get_batch_results — page through output (cost: 1)
|
|
66
|
+
|
|
67
|
+
```json
|
|
68
|
+
{ "tool": "get_batch_results", "params": { "batchId": "batch_1234567890_abc", "page": 2, "pageSize": 25 } }
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Use the `batchId` from `batch_scrape` to retrieve paginated results for a
|
|
72
|
+
completed or in-progress job. Cheap (1 credit) because the batch was already
|
|
73
|
+
paid for. Completed jobs are also exposed as `crawlforge://job/{jobId}`
|
|
74
|
+
resources.
|
|
75
|
+
|
|
76
|
+
## scrape_with_actions — interact, then scrape (cost: 5)
|
|
77
|
+
|
|
78
|
+
For SPAs, login-gated content, or multi-step flows. Run an ordered list of
|
|
79
|
+
browser actions before extraction.
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"tool": "scrape_with_actions",
|
|
84
|
+
"params": {
|
|
85
|
+
"url": "https://app.com/login",
|
|
86
|
+
"actions": [
|
|
87
|
+
{ "type": "type", "selector": "#email", "text": "user@a.com" },
|
|
88
|
+
{ "type": "type", "selector": "#password", "text": "secret" },
|
|
89
|
+
{ "type": "click", "selector": "#login" },
|
|
90
|
+
{ "type": "wait", "duration": 1500 }
|
|
91
|
+
],
|
|
92
|
+
"formats": ["markdown"]
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Allowed action types: `wait`, `click`, `type`, `press`, `scroll`, `screenshot`,
|
|
98
|
+
`executeJavaScript`. `executeJavaScript` is disabled unless the deploy sets
|
|
99
|
+
`ALLOW_JAVASCRIPT_EXECUTION=true`. 1–20 actions per call. Screenshots are stored
|
|
100
|
+
as `crawlforge://screenshot/{actionId}` resources. Full action schemas:
|
|
101
|
+
[actions](references/actions.md). CLI:
|
|
102
|
+
`crawlforge actions https://example.com --script login.json --screenshot`.
|
|
103
|
+
|
|
104
|
+
## generate_llms_txt — AI policy file (cost: 5)
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{ "tool": "generate_llms_txt", "params": { "url": "https://example.com", "format": "both" } }
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Generates `llms.txt` (and optionally `llms-full.txt`) describing how AI models
|
|
111
|
+
should interact with a site. `format`: `both`, `llms-txt`, `llms-full-txt`.
|
|
112
|
+
Tune `analysisOptions` (`maxDepth`, `maxPages`, `respectRobots`) and
|
|
113
|
+
`outputOptions` (`contactEmail`, `organizationName`, custom guidelines). CLI:
|
|
114
|
+
`crawlforge llmstxt https://example.com --include-full > llms.txt`.
|
|
115
|
+
|
|
116
|
+
## Sync vs async decision
|
|
117
|
+
|
|
118
|
+
- **≤ ~25 URLs, need results now** → `batch_scrape` `mode:"sync"`.
|
|
119
|
+
- **Large list or long-running** → `mode:"async"` + `webhook`, then
|
|
120
|
+
`get_batch_results` (or read the `crawlforge://job/{jobId}` resource).
|
|
121
|
+
|
|
122
|
+
## Cost note
|
|
123
|
+
|
|
124
|
+
`batch_scrape`, `scrape_with_actions`, `generate_llms_txt` = 5 credits each;
|
|
125
|
+
`get_batch_results` = 1 (retrieval of an already-paid batch). Async batches are
|
|
126
|
+
billed once at submission — paging results afterward stays cheap.
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# scrape_with_actions — Action Types
|
|
2
|
+
|
|
3
|
+
`scrape_with_actions` runs an ordered `actions[]` array (1–20 items) before
|
|
4
|
+
scraping. Only these 7 action types are allowed (allow-listed in ActionExecutor).
|
|
5
|
+
Each action object has a `type` plus type-specific fields. Common optional
|
|
6
|
+
fields on every action: `timeout`, `description`, `continueOnError`, `retries`
|
|
7
|
+
(0–5), `captureAfter`.
|
|
8
|
+
|
|
9
|
+
## 1. wait
|
|
10
|
+
|
|
11
|
+
Pause or wait for a condition.
|
|
12
|
+
|
|
13
|
+
| Field | Type | Notes |
|
|
14
|
+
|-------|------|-------|
|
|
15
|
+
| `duration` | number | Milliseconds to wait (0–30000). |
|
|
16
|
+
| `selector` | string | Element to wait on. |
|
|
17
|
+
| `condition` | enum | `visible`, `hidden`, `enabled`, `disabled`, `stable`. |
|
|
18
|
+
|
|
19
|
+
```json
|
|
20
|
+
{ "type": "wait", "duration": 1500 }
|
|
21
|
+
{ "type": "wait", "selector": "#results", "condition": "visible" }
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## 2. click
|
|
25
|
+
|
|
26
|
+
| Field | Type | Notes |
|
|
27
|
+
|-------|------|-------|
|
|
28
|
+
| `selector` | string | Element to click. |
|
|
29
|
+
| `button` | enum | `left`, `right`, `middle`. |
|
|
30
|
+
| `clickCount` | number | 1–3. |
|
|
31
|
+
| `delay` | number | ms (0–1000). |
|
|
32
|
+
| `force` | boolean | Bypass actionability checks. |
|
|
33
|
+
| `position` | object | `{x, y}` relative offset. |
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{ "type": "click", "selector": "#login" }
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## 3. type
|
|
40
|
+
|
|
41
|
+
| Field | Type | Notes |
|
|
42
|
+
|-------|------|-------|
|
|
43
|
+
| `selector` | string | Input to type into. |
|
|
44
|
+
| `text` | string | Text to enter. |
|
|
45
|
+
| `clear` | boolean | Clear the field first. |
|
|
46
|
+
| `delay` | number | Per-keystroke delay (ms). |
|
|
47
|
+
|
|
48
|
+
```json
|
|
49
|
+
{ "type": "type", "selector": "#email", "text": "user@a.com", "clear": true }
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## 4. press
|
|
53
|
+
|
|
54
|
+
| Field | Type | Notes |
|
|
55
|
+
|-------|------|-------|
|
|
56
|
+
| `key` | string | Key to press (e.g. `Enter`). |
|
|
57
|
+
| `modifiers` | enum[] | `Alt`, `Control`, `Meta`, `Shift`. |
|
|
58
|
+
|
|
59
|
+
```json
|
|
60
|
+
{ "type": "press", "key": "Enter" }
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## 5. scroll
|
|
64
|
+
|
|
65
|
+
| Field | Type | Notes |
|
|
66
|
+
|-------|------|-------|
|
|
67
|
+
| `direction` | enum | `up`, `down`, `left`, `right`. |
|
|
68
|
+
| `distance` | number | Pixels. |
|
|
69
|
+
| `smooth` | boolean | Smooth scrolling. |
|
|
70
|
+
| `toElement` | string | Selector to scroll to. |
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{ "type": "scroll", "direction": "down", "distance": 800 }
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 6. screenshot
|
|
77
|
+
|
|
78
|
+
| Field | Type | Notes |
|
|
79
|
+
|-------|------|-------|
|
|
80
|
+
| `fullPage` | boolean | Capture full page. |
|
|
81
|
+
| `format` | enum | `png`, `jpeg`. |
|
|
82
|
+
| `quality` | number | 0–100 (jpeg). |
|
|
83
|
+
|
|
84
|
+
Saved as a `crawlforge://screenshot/{actionId}` resource.
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
{ "type": "screenshot", "fullPage": true, "format": "png" }
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## 7. executeJavaScript (gated)
|
|
91
|
+
|
|
92
|
+
Disabled unless the deployment sets `ALLOW_JAVASCRIPT_EXECUTION=true`.
|
|
93
|
+
|
|
94
|
+
| Field | Type | Notes |
|
|
95
|
+
|-------|------|-------|
|
|
96
|
+
| `script` | string | JS to run in page context. |
|
|
97
|
+
| `args` | any[] | Arguments passed to the script. |
|
|
98
|
+
| `returnResult` | boolean | Return the script's result. |
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{ "type": "executeJavaScript", "script": "return document.title", "returnResult": true }
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Top-level options
|
|
105
|
+
|
|
106
|
+
| Option | Default | Notes |
|
|
107
|
+
|--------|---------|-------|
|
|
108
|
+
| `formats` | `["json"]` | `markdown`, `html`, `json`, `text`, `screenshots`. |
|
|
109
|
+
| `captureIntermediateStates` | `false` | Snapshot after each action. |
|
|
110
|
+
| `captureScreenshots` | `true` | Screenshot during execution. |
|
|
111
|
+
| `formAutoFill` | — | Declarative form fill: `fields[]` + `submitSelector`. |
|
|
112
|
+
| `browserOptions` | — | `headless`, `userAgent`, `viewportWidth/Height`, `timeout`. |
|
|
113
|
+
| `continueOnActionError` | `false` | Keep going if one action fails. |
|
|
114
|
+
| `maxRetries` | `1` | 0–3 retries on failure. |
|
|
115
|
+
| `screenshotOnError` | `true` | Capture a screenshot when an error occurs. |
|
|
116
|
+
|
|
117
|
+
## CLI action-script format
|
|
118
|
+
|
|
119
|
+
```json
|
|
120
|
+
[
|
|
121
|
+
{ "type": "click", "selector": "#button" },
|
|
122
|
+
{ "type": "type", "selector": "#input", "text": "hello" },
|
|
123
|
+
{ "type": "wait", "duration": 1000 }
|
|
124
|
+
]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Run with `crawlforge actions <url> --script ./flow.json --screenshot`.
|