crawlforge-mcp-server 4.7.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CLAUDE.md +2 -2
  2. package/package.json +2 -1
  3. package/server.js +56 -10
  4. package/src/cli/commands/init.js +13 -2
  5. package/src/cli/commands/install-skills.js +10 -1
  6. package/src/cli/commands/monitor.js +81 -0
  7. package/src/cli/commands/uninstall-skills.js +10 -1
  8. package/src/core/ActionExecutor.js +81 -15
  9. package/src/core/ElicitationHelper.js +18 -5
  10. package/src/core/LLMsTxtAnalyzer.js +2 -1
  11. package/src/core/MonitorScheduler.js +281 -0
  12. package/src/core/MonitorStore.js +79 -0
  13. package/src/core/ResearchOrchestrator.js +2 -1
  14. package/src/core/crawlers/BFSCrawler.js +2 -1
  15. package/src/resources/ResourceRegistry.js +3 -0
  16. package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
  17. package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
  18. package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
  19. package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
  20. package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
  21. package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
  22. package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
  23. package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
  24. package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
  25. package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
  26. package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
  27. package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
  28. package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
  29. package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
  30. package/src/skills/installer.js +186 -34
  31. package/src/tools/advanced/ScrapeWithActionsTool.js +7 -0
  32. package/src/tools/advanced/batchScrape/worker.js +8 -2
  33. package/src/tools/basic/_fetch.js +14 -1
  34. package/src/tools/crawl/_sessionContext.js +3 -1
  35. package/src/tools/extract/_fetchAndParse.js +2 -1
  36. package/src/tools/extract/extractContent.js +2 -1
  37. package/src/tools/extract/extractStructured.js +43 -0
  38. package/src/tools/extract/processDocument.js +2 -1
  39. package/src/tools/scrape/_brandingExtractor.js +378 -0
  40. package/src/tools/scrape/unifiedScrape.js +66 -6
  41. package/src/tools/templates/ScrapeTemplateTool.js +2 -1
  42. package/src/tools/tracking/trackChanges/differ.js +3 -1
  43. package/src/tools/tracking/trackChanges/index.js +74 -21
  44. package/src/tools/tracking/trackChanges/schema.js +7 -2
  45. package/src/utils/hostRateLimiter.js +46 -0
  46. package/src/utils/robotsChecker.js +2 -1
  47. package/src/utils/sitemapParser.js +2 -1
  48. package/src/utils/ssrfGuard.js +161 -0
  49. package/src/utils/ssrfProtection.js +6 -9
  50. package/src/skills/crawlforge-cli.md +0 -157
  51. package/src/skills/crawlforge-mcp.md +0 -80
  52. package/src/skills/crawlforge-research.md +0 -104
  53. package/src/skills/crawlforge-stealth.md +0 -98
@@ -0,0 +1,281 @@
1
+ /**
2
+ * MonitorScheduler — recurring change-monitoring engine.
3
+ *
4
+ * Lifecycle:
5
+ * start() load persisted monitors -> rehydrate baseline -> schedule
6
+ * setInterval timers -> catch-up fire anything already due.
7
+ * _fire(def) fetch -> (first run) create baseline, else compareWithBaseline
8
+ * -> significance threshold gate -> optional plain-English goal
9
+ * judge (SamplingClient, degrades gracefully) -> notify -> persist.
10
+ * runDueOnce() fire everything currently due exactly once (the external-cron
11
+ * one-shot path; guaranteed firing regardless of process uptime).
12
+ * stopAll() clear every timer (graceful shutdown — no leaked handles).
13
+ *
14
+ * Honest firing model: a stdio MCP server is not a daemon. In-process timers only
15
+ * fire while the process is alive; missed runs are caught up on next start(). For
16
+ * guaranteed firing, run `crawlforge monitor:run-due` from system cron.
17
+ *
18
+ * All dependencies are injected so the engine is unit-testable without network,
19
+ * timers, or an LLM.
20
+ */
21
+ import { fetchContent, meetsNotificationThreshold } from '../tools/tracking/trackChanges/differ.js';
22
+ import { sendNotifications } from '../tools/tracking/trackChanges/notifier.js';
23
+
24
export const FIRING_GUARANTEE_NOTE =
  'In-process scheduling fires only while the MCP server process is alive; missed runs are ' +
  'caught up on restart. For guaranteed firing, run `crawlforge monitor:run-due` from system cron.';

/** Lower bound for a monitor's check interval, in milliseconds. */
const MIN_INTERVAL = 60_000;

/** Interval applied when a monitor carries no usable `interval` value, in milliseconds. */
const DEFAULT_INTERVAL = 3_600_000;

/**
 * Largest delay Node's timers accept, in milliseconds. A larger (or NaN) delay
 * is coerced to 1 ms by Node, which would turn a monitor into a tight loop.
 */
const MAX_TIMER_DELAY = 2_147_483_647;

/**
 * Recurring change-monitoring engine: keeps one interval timer per enabled
 * monitor and runs the fetch -> compare -> judge -> notify -> persist pipeline.
 */
export class MonitorScheduler {
  /**
   * @param {object} [deps]
   * @param {object} deps.tool - Provides `changeTracker` and `snapshotManager`.
   * @param {object} deps.store - Monitor persistence (load/list/get/save/remove/newId).
   * @param {object|null} [deps.samplingClient] - Optional LLM client exposing `complete()`.
   * @param {object|null} [deps.logger] - Logger with warn/info/error; defaults to a no-op.
   * @param {() => number} [deps.now] - Clock returning epoch milliseconds.
   */
  constructor({ tool, store, samplingClient = null, logger = null, now = () => Date.now() } = {}) {
    this.tool = tool;
    this.store = store;
    this.samplingClient = samplingClient;
    this.logger = logger || { warn() {}, info() {}, error() {} };
    this.now = now;
    this.timers = new Map();
    this._started = false;
  }

  /**
   * Normalise a raw interval into a delay that is safe to hand to setInterval.
   * Missing, zero, non-numeric and non-finite values fall back to the default;
   * the result is clamped to [MIN_INTERVAL, MAX_TIMER_DELAY].
   *
   * @param {*} raw - Interval as supplied by the caller or read from disk.
   * @returns {number} Interval in milliseconds.
   */
  _intervalFor(raw) {
    const parsed = Number(raw);
    const ms = Number.isFinite(parsed) && parsed !== 0 ? parsed : DEFAULT_INTERVAL;
    return Math.min(MAX_TIMER_DELAY, Math.max(MIN_INTERVAL, ms));
  }

  /**
   * Run a check without awaiting it. Failures are logged instead of being
   * silently discarded, and never surface as unhandled rejections.
   *
   * @param {object} def - Monitor definition.
   */
  _fireInBackground(def) {
    this._fire(def).catch((err) => {
      this.logger.warn('monitor fire failed', { id: def.id, error: err?.message });
    });
  }

  /**
   * Load persisted monitors, rehydrate their baselines, schedule timers and
   * immediately fire anything that is already due. Idempotent once started.
   *
   * @returns {Promise<void>}
   */
  async start() {
    if (this._started) return;
    this._started = true;
    try {
      await this.store.load();
    } catch (err) {
      // Allow a later start() to retry instead of leaving the engine half-started.
      this._started = false;
      throw err;
    }
    for (const def of this.store.list()) {
      if (def.enabled === false) continue;
      try {
        await this._ensureBaseline(def);
      } catch (err) {
        this.logger.warn('monitor baseline rehydrate failed', { id: def.id, error: err.message });
      }
      this._schedule(def);
      if (!def.nextDueAt || def.nextDueAt <= this.now()) {
        this._fireInBackground(def);
      }
    }
  }

  /**
   * (Re)create the interval timer for a monitor. Disabled monitors only have
   * their existing timer cleared.
   *
   * @param {object} def - Monitor definition.
   */
  _schedule(def) {
    this._clearTimer(def.id);
    if (def.enabled === false) return;
    const timer = setInterval(() => this._fireInBackground(def), this._intervalFor(def.interval));
    if (typeof timer.unref === 'function') timer.unref(); // never keep the process alive
    this.timers.set(def.id, timer);
  }

  /**
   * Clear and forget the timer registered for a monitor id, if any.
   *
   * @param {string} id - Monitor id.
   */
  _clearTimer(id) {
    const timer = this.timers.get(id);
    if (timer) {
      clearInterval(timer);
      this.timers.delete(id);
    }
  }

  /** Clear every timer (graceful shutdown — no leaked handles). */
  stopAll() {
    for (const timer of this.timers.values()) clearInterval(timer);
    this.timers.clear();
  }

  /**
   * Create, persist and schedule a new monitor.
   *
   * @param {object} input - Requires `url`; optional `interval`, `schedule`, `goal`,
   *   `notificationThreshold`, `trackingOptions`, `notificationOptions`.
   * @returns {Promise<object>} A copy of the stored definition plus `firingGuarantee`.
   * @throws {Error} When `input.url` is missing.
   */
  async createMonitor(input) {
    if (!input || !input.url) throw new Error('createMonitor requires a url');
    const interval = this._intervalFor(input.interval);
    const t = this.now();
    const def = {
      id: this.store.newId(),
      url: input.url,
      interval,
      schedule: input.schedule || null,
      goal: input.goal || null,
      notificationThreshold: input.notificationThreshold || 'moderate',
      trackingOptions: input.trackingOptions || {},
      notificationOptions: input.notificationOptions || null,
      enabled: true,
      createdAt: t,
      nextDueAt: t + interval,
      lastCheckAt: null,
      lastChangeAt: null,
      stats: { checks: 0, changesDetected: 0, notificationsSent: 0, errors: 0 },
    };
    await this.store.save(def);
    try {
      await this._ensureBaseline(def);
    } catch {
      /* baseline will be created on first fire */
    }
    this._schedule(def);
    return { ...def, firingGuarantee: FIRING_GUARANTEE_NOTE };
  }

  /**
   * Stop a monitor's timer and delete it from the store.
   *
   * @param {string} id - Monitor id.
   * @returns {Promise<{stopped: boolean, id: string}>} `stopped` is false when the id was unknown.
   */
  async stopMonitor(id) {
    this._clearTimer(id);
    const existed = !!this.store.get(id);
    await this.store.remove(id);
    return { stopped: existed, id };
  }

  /**
   * Stop every monitor whose url equals the given url exactly.
   *
   * @param {string} url - URL to match.
   * @returns {Promise<{stopped: number, url: string}>} Number of monitors removed.
   */
  async stopByUrl(url) {
    let count = 0;
    for (const def of this.store.list()) {
      if (def.url === url) {
        await this.stopMonitor(def.id);
        count++;
      }
    }
    return { stopped: count, url };
  }

  /**
   * Summarise all stored monitors.
   *
   * @returns {object[]} One entry per monitor; `scheduled` tells whether a live timer exists.
   */
  list() {
    return this.store.list().map((d) => ({
      id: d.id,
      url: d.url,
      interval: d.interval,
      goal: d.goal,
      enabled: d.enabled,
      nextDueAt: d.nextDueAt,
      lastCheckAt: d.lastCheckAt,
      lastChangeAt: d.lastChangeAt,
      stats: d.stats,
      scheduled: this.timers.has(d.id),
    }));
  }

  /**
   * Fire every enabled monitor that is currently due, one after another, exactly
   * once (the external-cron one-shot path).
   *
   * @returns {Promise<{fired: number, results: object[]}>} Per-monitor outcomes;
   *   unexpected failures are reported as `{ id, url, error }`.
   */
  async runDueOnce() {
    if (!this.store._loaded) await this.store.load();
    const t = this.now();
    const due = this.store.list().filter((d) => d.enabled !== false && (!d.nextDueAt || d.nextDueAt <= t));
    const results = [];
    for (const def of due) {
      try {
        results.push(await this._fire(def));
      } catch (err) {
        results.push({ id: def.id, url: def.url, error: err.message });
      }
    }
    return { fired: results.length, results };
  }

  /**
   * Rehydrate the in-memory baseline for a monitor from the newest stored
   * snapshot, when the change tracker does not already hold one. Best-effort:
   * failures are ignored because the first fire creates a baseline anyway.
   *
   * @param {object} def - Monitor definition.
   * @returns {Promise<void>}
   */
  async _ensureBaseline(def) {
    const ct = this.tool.changeTracker;
    if (ct?.snapshots?.has(def.url)) return;
    try {
      const q = await this.tool.snapshotManager.querySnapshots({ url: def.url, limit: 1, includeContent: true });
      const content = q?.snapshots?.[0]?.content;
      if (content && typeof content === 'string') {
        await ct.createBaseline(def.url, content, def.trackingOptions);
      }
    } catch {
      /* no usable snapshot — first fire will create the baseline */
    }
  }

  /**
   * Run one check for a monitor: fetch, create a baseline on the first run or
   * compare against it, gate on the notification threshold, consult the goal
   * judge, notify, and persist the updated bookkeeping.
   *
   * Fetch, baseline and compare failures are counted in `stats.errors` and
   * reported as `{ id, url, error }`; in every case the next due time advances.
   *
   * @param {object} def - Monitor definition (mutated in place).
   * @returns {Promise<object>} Outcome summary for this run.
   */
  async _fire(def) {
    const ct = this.tool.changeTracker;
    // Definitions read from disk may predate or lack the stats block.
    if (!def.stats) {
      def.stats = { checks: 0, changesDetected: 0, notificationsSent: 0, errors: 0 };
    }

    const finish = async (extra) => {
      def.lastCheckAt = this.now();
      def.nextDueAt = this.now() + this._intervalFor(def.interval);
      // A monitor stopped while this run was in flight must not be re-saved,
      // otherwise the removal would be undone.
      const removed = typeof this.store.get === 'function' && !this.store.get(def.id);
      if (!removed) await this.store.save(def);
      return { id: def.id, url: def.url, ...extra };
    };
    const fail = (err) => {
      def.stats.errors++;
      return finish({ error: err.message });
    };

    let content;
    try {
      const fetched = await fetchContent(def.url);
      content = fetched.content;
    } catch (err) {
      return fail(err);
    }

    // First fire (or unrehydrated) — establish a baseline.
    if (!ct?.snapshots?.has(def.url)) {
      try {
        await ct.createBaseline(def.url, content, def.trackingOptions);
      } catch (err) {
        return fail(err);
      }
      try {
        await this.tool.snapshotManager.storeSnapshot(def.url, content, { baseline: true, scheduledMonitor: def.id });
      } catch {
        /* snapshot persistence best-effort */
      }
      def.stats.checks++;
      return finish({ baselineCreated: true });
    }

    let cmp;
    try {
      cmp = await ct.compareWithBaseline(def.url, content, def.trackingOptions);
    } catch (err) {
      return fail(err);
    }

    def.stats.checks++;
    const meets = cmp.hasChanges && meetsNotificationThreshold(cmp.significance, def.notificationThreshold);
    let judge = null;
    let notified = false;

    if (meets) {
      def.stats.changesDetected++;
      def.lastChangeAt = this.now();
      judge = await this._judgeGoal(def, cmp);
      try {
        await this.tool.snapshotManager.storeSnapshot(def.url, content, {
          changes: cmp.summary,
          significance: cmp.significance,
          scheduledMonitor: def.id,
        });
      } catch {
        /* best-effort */
      }
      if (judge.meaningful && def.notificationOptions) {
        try {
          await sendNotifications(def.url, { ...cmp, goalJudgment: judge }, def.notificationOptions, this.tool);
          def.stats.notificationsSent++;
          notified = true;
        } catch (err) {
          def.stats.errors++;
          this.logger.warn('monitor notification failed', { id: def.id, error: err.message });
        }
      }
    }

    return finish({
      hasChanges: cmp.hasChanges,
      significance: cmp.significance,
      notified,
      mode: judge?.mode || 'threshold',
      reason: judge?.reason,
    });
  }

  /**
   * Decide whether a detected change is "meaningful" given the monitor's
   * plain-English goal. Degrades gracefully: no goal -> threshold; no LLM ->
   * notify and tag the mode; never hard-requires an LLM key.
   *
   * @param {object} def - Monitor definition; only `goal` is read.
   * @param {object} cmp - Comparison result; only `summary` is sent to the judge.
   * @returns {Promise<{meaningful: boolean, mode: string, reason?: string}>}
   */
  async _judgeGoal(def, cmp) {
    if (!def.goal) return { meaningful: true, mode: 'threshold' };
    if (!this.samplingClient) return { meaningful: true, mode: 'degraded-no-llm' };
    try {
      // Cap the summary so the prompt stays small.
      const summary = JSON.stringify(cmp.summary || {}).slice(0, 1500);
      const prompt =
        `A monitored web page changed. The user's alert goal is:\n"${def.goal}"\n\n` +
        `Change summary (JSON):\n${summary}\n\n` +
        `Does this change match the goal and warrant an alert? ` +
        `Reply ONLY with JSON: {"meaningful": true|false, "reason": "short reason"}.`;
      const res = await this.samplingClient.complete(prompt, { maxTokens: 200 });
      const text = typeof res === 'string' ? res : res?.text || '';
      const m = text.match(/\{[\s\S]*\}/);
      if (m) {
        const parsed = JSON.parse(m[0]);
        // Only an explicit `false` suppresses the alert.
        return { meaningful: parsed.meaningful !== false, reason: parsed.reason || '', mode: 'llm' };
      }
      return { meaningful: true, mode: 'degraded-llm-error', reason: 'unparseable judge output' };
    } catch (err) {
      return { meaningful: true, mode: 'degraded-llm-error', reason: err.message };
    }
  }
}
280
+
281
+ export default MonitorScheduler;
@@ -0,0 +1,79 @@
1
+ /**
2
+ * MonitorStore — disk persistence for scheduled change-monitors.
3
+ *
4
+ * One JSON file per monitor under ./monitors/<id>.json. Mirrors JobManager's
5
+ * persistence *pattern* (mkdir-recursive, per-file JSON, randomUUID, load-on-
6
+ * start) but deliberately omits TTL/eviction — scheduled monitors are long-lived
7
+ * and must never be auto-expired.
8
+ */
9
+ import { promises as fs } from 'node:fs';
10
+ import path from 'node:path';
11
+ import { randomUUID } from 'node:crypto';
12
+
13
/**
 * Disk-backed registry of scheduled monitors: one `<id>.json` file per monitor
 * inside `storageDir`, mirrored in an in-memory Map.
 */
export class MonitorStore {
  /**
   * @param {object} [options]
   * @param {string} [options.storageDir='./monitors'] - Directory holding the JSON files.
   */
  constructor({ storageDir = './monitors' } = {}) {
    this.storageDir = storageDir;
    this.monitors = new Map();
    this._loaded = false;
  }

  /**
   * Resolve the JSON file path for a monitor id.
   *
   * Ids reach this class from tool callers and from files on disk, so anything
   * that could escape `storageDir` (path separators, NUL) is rejected.
   *
   * @param {*} id - Monitor id.
   * @returns {string|null} Absolute-or-relative file path, or null for an unusable id.
   */
  _fileFor(id) {
    const name = typeof id === 'number' ? String(id) : id;
    if (typeof name !== 'string' || name.length === 0 || /[\\/\0]/.test(name)) return null;
    return path.join(this.storageDir, `${name}.json`);
  }

  /**
   * Read every `*.json` file in the storage directory into memory. Unreadable,
   * corrupt, id-less or unsafe-id files are skipped; an unavailable directory
   * leaves the store empty. Never throws.
   *
   * @returns {Promise<Map<string, object>>} The in-memory monitor map.
   */
  async load() {
    try {
      await fs.mkdir(this.storageDir, { recursive: true });
      const files = await fs.readdir(this.storageDir);
      for (const f of files) {
        if (!f.endsWith('.json')) continue;
        try {
          const def = JSON.parse(await fs.readFile(path.join(this.storageDir, f), 'utf8'));
          if (def && def.id && this._fileFor(def.id)) this.monitors.set(def.id, def);
        } catch {
          /* skip corrupt file */
        }
      }
    } catch {
      /* dir unavailable — start empty */
    }
    this._loaded = true;
    return this.monitors;
  }

  /**
   * @returns {string} A fresh random UUID to use as a monitor id.
   */
  newId() {
    return randomUUID();
  }

  /**
   * Store a monitor in memory and write it to disk. The in-memory copy is kept
   * even when persistence fails.
   *
   * The file is written to a uniquely named temporary sibling and then renamed
   * into place, so an interrupted write cannot leave a truncated monitor file.
   *
   * @param {object} def - Monitor definition; `def.id` names the file.
   * @returns {Promise<{def: object, persisted: boolean, error?: string}>}
   */
  async save(def) {
    this.monitors.set(def.id, def);
    const file = this._fileFor(def.id);
    if (!file) {
      return { def, persisted: false, error: `unsafe monitor id: ${String(def.id)}` };
    }
    const tmp = `${file}.${randomUUID()}.tmp`;
    try {
      await fs.mkdir(this.storageDir, { recursive: true });
      await fs.writeFile(tmp, JSON.stringify(def, null, 2), 'utf8');
      await fs.rename(tmp, file);
    } catch (err) {
      /* keep the in-memory copy even if the write fails */
      await fs.rm(tmp, { force: true }).catch(() => {});
      return { def, persisted: false, error: err.message };
    }
    return { def, persisted: true };
  }

  /**
   * Forget a monitor and delete its file.
   *
   * @param {string} id - Monitor id.
   * @returns {Promise<boolean>} True when a file was deleted; false when the id
   *   is unsafe or the file could not be removed.
   */
  async remove(id) {
    this.monitors.delete(id);
    const file = this._fileFor(id);
    if (!file) return false;
    try {
      await fs.unlink(file);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * @param {string} id - Monitor id.
   * @returns {object|undefined} The in-memory definition, if known.
   */
  get(id) {
    return this.monitors.get(id);
  }

  /**
   * @returns {object[]} A new array holding every in-memory definition.
   */
  list() {
    return [...this.monitors.values()];
  }
}
78
+
79
+ export default MonitorStore;
@@ -8,6 +8,7 @@ import { ResultRanker } from '../tools/search/ranking/ResultRanker.js';
8
8
  import { CacheManager } from './cache/CacheManager.js';
9
9
  import { Logger } from '../utils/Logger.js';
10
10
  import { LLMManager } from './llm/LLMManager.js';
11
+ import { safeFetch } from '../utils/ssrfGuard.js';
11
12
 
12
13
  /**
13
14
  * ResearchOrchestrator - Multi-stage research orchestration engine with LLM integration
@@ -552,7 +553,7 @@ export class ResearchOrchestrator extends EventEmitter {
552
553
  });
553
554
  // Fallback: use fetch + basic text extraction
554
555
  try {
555
- const fetchResponse = await fetch(source.link, {
556
+ const fetchResponse = await safeFetch(source.link, {
556
557
  headers: { 'User-Agent': 'CrawlForge-Research/1.0' },
557
558
  signal: AbortSignal.timeout(10000)
558
559
  });
@@ -7,6 +7,7 @@ import { DomainFilter } from '../../utils/domainFilter.js';
7
7
  import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
8
8
  import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
9
9
  import { Logger } from '../../utils/Logger.js';
10
+ import { safeFetch } from '../../utils/ssrfGuard.js';
10
11
 
11
12
  const logger = new Logger('BFSCrawler');
12
13
 
@@ -284,7 +285,7 @@ export class BFSCrawler {
284
285
  setTimeout(() => controller.abort(), effectiveTimeout);
285
286
  }
286
287
 
287
- const response = await fetch(url, {
288
+ const response = await safeFetch(url, {
288
289
  signal: controller.signal,
289
290
  headers
290
291
  });
@@ -167,6 +167,9 @@ export class ResourceRegistry {
167
167
  * @returns {{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }}
168
168
  */
169
169
  async readResource(uri) {
170
+ // The MCP SDK hands the read callback a URL object, not a string; coerce so
171
+ // the sub-readers and parseResourceUri (which calls String#startsWith) work.
172
+ uri = typeof uri === 'string' ? uri : (uri?.href ?? String(uri));
170
173
  const parsed = parseResourceUri(uri);
171
174
  if (!parsed) {
172
175
  throw new Error(`Unknown resource URI: ${uri}`);
@@ -0,0 +1,126 @@
1
+ ---
2
+ name: crawlforge-batch-automation
3
+ description: "Automates large scraping jobs and browser interactions with CrawlForge's batch_scrape, get_batch_results, scrape_with_actions, and generate_llms_txt tools. Use when the user wants to scrape many URLs at once, batch-scrape a list of pages, collect dozens of product, news, or competitor pages, run browser actions (click, type, scroll, wait) before scraping, log in or fill a form before extracting, or generate an llms.txt for a site. Use sync mode for up to about 25 URLs and async with a webhook for large batches; retrieve paginated output with get_batch_results."
4
+ metadata:
5
+ version: 4.8.0
6
+ source: crawlforge-mcp-server
7
+ ---
8
+
9
+ # CrawlForge Batch & Automation
10
+
11
+ Scale up scraping and drive interactive pages. Use `batch_scrape` for many URLs,
12
+ `get_batch_results` to page through async output, `scrape_with_actions` to
13
+ interact before scraping, and `generate_llms_txt` to produce a site's AI policy
14
+ file.
15
+
16
+ ## When to use
17
+
18
+ - "Scrape these 30 URLs" / "batch-scrape this list" → `batch_scrape`
19
+ - "Collect dozens of product / news / competitor pages" → `batch_scrape` (async)
20
+ - "Get the rest of the results from that batch" → `get_batch_results`
21
+ - "Click / type / scroll / wait before scraping" / "log in then extract" →
22
+ `scrape_with_actions`
23
+ - "Generate an llms.txt for this site" → `generate_llms_txt`
24
+
25
+ ## batch_scrape — many URLs in parallel (cost: 5)
26
+
27
+ Sync mode (results returned immediately), good for up to ~25 URLs:
28
+
29
+ ```json
30
+ {
31
+ "tool": "batch_scrape",
32
+ "params": {
33
+ "urls": ["https://a.com", "https://b.com", "https://c.com"],
34
+ "formats": ["markdown"],
35
+ "mode": "sync",
36
+ "maxConcurrency": 5
37
+ }
38
+ }
39
+ ```
40
+
41
+ Async mode with a webhook for large batches (returns a `batchId` immediately):
42
+
43
+ ```json
44
+ {
45
+ "tool": "batch_scrape",
46
+ "params": {
47
+ "urls": ["https://a.com", "https://b.com"],
48
+ "formats": ["json"],
49
+ "mode": "async",
50
+ "webhook": { "url": "https://my-site.com/hook", "events": ["batch_completed", "batch_failed"] }
51
+ }
52
+ }
53
+ ```
54
+
55
+ - `urls` accepts plain strings OR objects `{url, selectors, headers, timeout,
56
+ metadata}` for per-URL config. The array must contain 1–50 URLs.
57
+ - `formats`: `markdown`, `html`, `json`, `text`.
58
+ - `extractionSchema` applies structured extraction to every URL.
59
+ - `maxConcurrency` 1–20 (default 10); `delayBetweenRequests` throttles.
60
+ - Sync batches over ~25 URLs trigger a confirmation prompt (elicitation) — use
61
+ async for large jobs.
62
+
63
+ CLI: `crawlforge batch urls.txt --format markdown --concurrency 10`.
64
+
65
+ ## get_batch_results — page through output (cost: 1)
66
+
67
+ ```json
68
+ { "tool": "get_batch_results", "params": { "batchId": "batch_1234567890_abc", "page": 2, "pageSize": 25 } }
69
+ ```
70
+
71
+ Use the `batchId` from `batch_scrape` to retrieve paginated results for a
72
+ completed or in-progress job. Cheap (1 credit) because the batch was already
73
+ paid for. Completed jobs are also exposed as `crawlforge://job/{jobId}`
74
+ resources.
75
+
76
+ ## scrape_with_actions — interact, then scrape (cost: 5)
77
+
78
+ For SPAs, login-gated content, or multi-step flows. Run an ordered list of
79
+ browser actions before extraction.
80
+
81
+ ```json
82
+ {
83
+ "tool": "scrape_with_actions",
84
+ "params": {
85
+ "url": "https://app.com/login",
86
+ "actions": [
87
+ { "type": "type", "selector": "#email", "text": "user@a.com" },
88
+ { "type": "type", "selector": "#password", "text": "secret" },
89
+ { "type": "click", "selector": "#login" },
90
+ { "type": "wait", "duration": 1500 }
91
+ ],
92
+ "formats": ["markdown"]
93
+ }
94
+ }
95
+ ```
96
+
97
+ Allowed action types: `wait`, `click`, `type`, `press`, `scroll`, `screenshot`,
98
+ `executeJavaScript`. `executeJavaScript` is disabled unless the deploy sets
99
+ `ALLOW_JAVASCRIPT_EXECUTION=true`. 1–20 actions per call. Screenshots are stored
100
+ as `crawlforge://screenshot/{actionId}` resources. Full action schemas:
101
+ [actions](references/actions.md). CLI:
102
+ `crawlforge actions https://example.com --script login.json --screenshot`.
103
+
104
+ ## generate_llms_txt — AI policy file (cost: 5)
105
+
106
+ ```json
107
+ { "tool": "generate_llms_txt", "params": { "url": "https://example.com", "format": "both" } }
108
+ ```
109
+
110
+ Generates `llms.txt` (and optionally `llms-full.txt`) describing how AI models
111
+ should interact with a site. `format`: `both`, `llms-txt`, `llms-full-txt`.
112
+ Tune `analysisOptions` (`maxDepth`, `maxPages`, `respectRobots`) and
113
+ `outputOptions` (`contactEmail`, `organizationName`, custom guidelines). CLI:
114
+ `crawlforge llmstxt https://example.com --include-full > llms.txt`.
115
+
116
+ ## Sync vs async decision
117
+
118
+ - **≤ ~25 URLs, need results now** → `batch_scrape` `mode:"sync"`.
119
+ - **Large list or long-running** → `mode:"async"` + `webhook`, then
120
+ `get_batch_results` (or read the `crawlforge://job/{jobId}` resource).
121
+
122
+ ## Cost note
123
+
124
+ `batch_scrape`, `scrape_with_actions`, `generate_llms_txt` = 5 credits each;
125
+ `get_batch_results` = 1 (retrieval of an already-paid batch). Async batches are
126
+ billed once at submission — paging results afterward stays cheap.
@@ -0,0 +1,127 @@
1
+ # scrape_with_actions — Action Types
2
+
3
+ `scrape_with_actions` runs an ordered `actions[]` array (1–20 items) before
4
+ scraping. Only these 7 action types are allowed (allow-listed in ActionExecutor).
5
+ Each action object has a `type` plus type-specific fields. Common optional
6
+ fields on every action: `timeout`, `description`, `continueOnError`, `retries`
7
+ (0–5), `captureAfter`.
8
+
9
+ ## 1. wait
10
+
11
+ Pause or wait for a condition.
12
+
13
+ | Field | Type | Notes |
14
+ |-------|------|-------|
15
+ | `duration` | number | Milliseconds to wait (0–30000). |
16
+ | `selector` | string | Element to wait on. |
17
+ | `condition` | enum | `visible`, `hidden`, `enabled`, `disabled`, `stable`. |
18
+
19
+ ```json
20
+ { "type": "wait", "duration": 1500 }
21
+ { "type": "wait", "selector": "#results", "condition": "visible" }
22
+ ```
23
+
24
+ ## 2. click
25
+
26
+ | Field | Type | Notes |
27
+ |-------|------|-------|
28
+ | `selector` | string | Element to click. |
29
+ | `button` | enum | `left`, `right`, `middle`. |
30
+ | `clickCount` | number | 1–3. |
31
+ | `delay` | number | ms (0–1000). |
32
+ | `force` | boolean | Bypass actionability checks. |
33
+ | `position` | object | `{x, y}` relative offset. |
34
+
35
+ ```json
36
+ { "type": "click", "selector": "#login" }
37
+ ```
38
+
39
+ ## 3. type
40
+
41
+ | Field | Type | Notes |
42
+ |-------|------|-------|
43
+ | `selector` | string | Input to type into. |
44
+ | `text` | string | Text to enter. |
45
+ | `clear` | boolean | Clear the field first. |
46
+ | `delay` | number | Per-keystroke delay (ms). |
47
+
48
+ ```json
49
+ { "type": "type", "selector": "#email", "text": "user@a.com", "clear": true }
50
+ ```
51
+
52
+ ## 4. press
53
+
54
+ | Field | Type | Notes |
55
+ |-------|------|-------|
56
+ | `key` | string | Key to press (e.g. `Enter`). |
57
+ | `modifiers` | enum[] | `Alt`, `Control`, `Meta`, `Shift`. |
58
+
59
+ ```json
60
+ { "type": "press", "key": "Enter" }
61
+ ```
62
+
63
+ ## 5. scroll
64
+
65
+ | Field | Type | Notes |
66
+ |-------|------|-------|
67
+ | `direction` | enum | `up`, `down`, `left`, `right`. |
68
+ | `distance` | number | Pixels. |
69
+ | `smooth` | boolean | Smooth scrolling. |
70
+ | `toElement` | string | Selector to scroll to. |
71
+
72
+ ```json
73
+ { "type": "scroll", "direction": "down", "distance": 800 }
74
+ ```
75
+
76
+ ## 6. screenshot
77
+
78
+ | Field | Type | Notes |
79
+ |-------|------|-------|
80
+ | `fullPage` | boolean | Capture full page. |
81
+ | `format` | enum | `png`, `jpeg`. |
82
+ | `quality` | number | 0–100 (jpeg). |
83
+
84
+ Saved as a `crawlforge://screenshot/{actionId}` resource.
85
+
86
+ ```json
87
+ { "type": "screenshot", "fullPage": true, "format": "png" }
88
+ ```
89
+
90
+ ## 7. executeJavaScript (gated)
91
+
92
+ Disabled unless the deployment sets `ALLOW_JAVASCRIPT_EXECUTION=true`.
93
+
94
+ | Field | Type | Notes |
95
+ |-------|------|-------|
96
+ | `script` | string | JS to run in page context. |
97
+ | `args` | any[] | Arguments passed to the script. |
98
+ | `returnResult` | boolean | Return the script's result. |
99
+
100
+ ```json
101
+ { "type": "executeJavaScript", "script": "return document.title", "returnResult": true }
102
+ ```
103
+
104
+ ## Top-level options
105
+
106
+ | Option | Default | Notes |
107
+ |--------|---------|-------|
108
+ | `formats` | `["json"]` | `markdown`, `html`, `json`, `text`, `screenshots`. |
109
+ | `captureIntermediateStates` | `false` | Snapshot after each action. |
110
+ | `captureScreenshots` | `true` | Screenshot during execution. |
111
+ | `formAutoFill` | — | Declarative form fill: `fields[]` + `submitSelector`. |
112
+ | `browserOptions` | — | `headless`, `userAgent`, `viewportWidth/Height`, `timeout`. |
113
+ | `continueOnActionError` | `false` | Keep going if one action fails. |
114
+ | `maxRetries` | `1` | 0–3 retries on failure. |
115
+ | `screenshotOnError` | `true` | Capture a screenshot when an error occurs. |
116
+
117
+ ## CLI action-script format
118
+
119
+ ```json
120
+ [
121
+ { "type": "click", "selector": "#button" },
122
+ { "type": "type", "selector": "#input", "text": "hello" },
123
+ { "type": "wait", "duration": 1000 }
124
+ ]
125
+ ```
126
+
127
+ Run with `crawlforge actions <url> --script ./flow.json --screenshot`.