crawlforge-mcp-server 4.7.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CLAUDE.md +2 -2
  2. package/package.json +2 -1
  3. package/server.js +56 -10
  4. package/src/cli/commands/init.js +13 -2
  5. package/src/cli/commands/install-skills.js +10 -1
  6. package/src/cli/commands/monitor.js +81 -0
  7. package/src/cli/commands/uninstall-skills.js +10 -1
  8. package/src/core/ActionExecutor.js +81 -15
  9. package/src/core/ElicitationHelper.js +18 -5
  10. package/src/core/LLMsTxtAnalyzer.js +2 -1
  11. package/src/core/MonitorScheduler.js +281 -0
  12. package/src/core/MonitorStore.js +79 -0
  13. package/src/core/ResearchOrchestrator.js +2 -1
  14. package/src/core/crawlers/BFSCrawler.js +2 -1
  15. package/src/resources/ResourceRegistry.js +3 -0
  16. package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
  17. package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
  18. package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
  19. package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
  20. package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
  21. package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
  22. package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
  23. package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
  24. package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
  25. package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
  26. package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
  27. package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
  28. package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
  29. package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
  30. package/src/skills/installer.js +186 -34
  31. package/src/tools/advanced/ScrapeWithActionsTool.js +7 -0
  32. package/src/tools/advanced/batchScrape/worker.js +8 -2
  33. package/src/tools/basic/_fetch.js +14 -1
  34. package/src/tools/crawl/_sessionContext.js +3 -1
  35. package/src/tools/extract/_fetchAndParse.js +2 -1
  36. package/src/tools/extract/extractContent.js +2 -1
  37. package/src/tools/extract/extractStructured.js +43 -0
  38. package/src/tools/extract/processDocument.js +2 -1
  39. package/src/tools/scrape/_brandingExtractor.js +378 -0
  40. package/src/tools/scrape/unifiedScrape.js +66 -6
  41. package/src/tools/templates/ScrapeTemplateTool.js +2 -1
  42. package/src/tools/tracking/trackChanges/differ.js +3 -1
  43. package/src/tools/tracking/trackChanges/index.js +74 -21
  44. package/src/tools/tracking/trackChanges/schema.js +7 -2
  45. package/src/utils/hostRateLimiter.js +46 -0
  46. package/src/utils/robotsChecker.js +2 -1
  47. package/src/utils/sitemapParser.js +2 -1
  48. package/src/utils/ssrfGuard.js +161 -0
  49. package/src/utils/ssrfProtection.js +6 -9
  50. package/src/skills/crawlforge-cli.md +0 -157
  51. package/src/skills/crawlforge-mcp.md +0 -80
  52. package/src/skills/crawlforge-research.md +0 -104
  53. package/src/skills/crawlforge-stealth.md +0 -98
@@ -0,0 +1,378 @@
1
+ /**
2
+ * _brandingExtractor.js — static design-token / branding extraction.
3
+ *
4
+ * Extracts a site's visual identity from HTML + CSS WITHOUT a browser:
5
+ * - colors (CSS custom properties, theme-color, declarations) -> hex + frequency
6
+ * - fonts (font-family, @font-face, Google/Adobe webfont links)
7
+ * - logo (favicons, og:image, header img/svg candidates)
8
+ * - tokens (border-radius, box-shadow, spacing custom props)
9
+ *
10
+ * Static-only limits (documented in the returned `notes`): computed/cascaded
11
+ * styles and JS-injected CSS are not visible without rendering. Linked CSS is
12
+ * fetched through the SSRF guard, capped in count/size/time.
13
+ *
14
+ * Every helper is defensive: failures push to `warnings` and never throw.
15
+ */
16
+
17
+ import { safeFetch } from '../../utils/ssrfGuard.js';
18
+
19
// Generic font-family keywords and CSS-wide values. Font extraction reports
// these as fallbacks instead of counting them as named typefaces.
const GENERIC_FAMILIES = new Set(
  (
    'serif sans-serif monospace cursive fantasy system-ui ' +
    'ui-serif ui-sans-serif ui-monospace ui-rounded inherit ' +
    'initial unset revert emoji math fangsong'
  ).split(' '),
);
24
+
25
// Hex equivalents for the basic CSS colour keywords that normalizeColor() accepts.
const NAMED_COLORS = {
  black: '#000000', white: '#ffffff', red: '#ff0000', green: '#008000',
  blue: '#0000ff', yellow: '#ffff00', orange: '#ffa500', purple: '#800080',
  gray: '#808080', grey: '#808080', silver: '#c0c0c0', navy: '#000080',
  teal: '#008080', maroon: '#800000', olive: '#808000', lime: '#00ff00',
  aqua: '#00ffff', fuchsia: '#ff00ff',
};

// CSS variable names that strongly indicate brand colors.
const BRAND_VAR_RE = /(primary|secondary|brand|accent|theme|background|surface|foreground|text|link)/i;

/** Constrain `n` to the inclusive range [lo, hi]. NaN propagates. */
function clamp(n, lo, hi) {
  return Math.max(lo, Math.min(hi, n));
}

/** Render a channel value (rounded, clamped to 0-255) as two lowercase hex digits. */
function toHex2(n) {
  return clamp(Math.round(n), 0, 255).toString(16).padStart(2, '0');
}

/** Split the inside of rgb()/hsl() on commas, slashes and whitespace. */
function splitColorArgs(inner) {
  return inner.split(/[,/\s]+/).filter(Boolean);
}

/** Convert a CSS hue token (unitless, deg, grad, rad or turn) to degrees. */
function hueToDegrees(token) {
  const n = Number.parseFloat(token);
  if (token.endsWith('turn')) return n * 360;
  if (token.endsWith('grad')) return n * 0.9; // must precede the 'rad' check
  if (token.endsWith('rad')) return (n * 180) / Math.PI;
  return n;
}

/**
 * Normalize a CSS color token to lowercase #rrggbb (drops alpha). Returns null
 * for values we can't confidently parse (gradients, currentColor, var(), etc.).
 *
 * @param {unknown} raw - a single CSS colour token (keyword, hex, rgb(), hsl()).
 * @returns {string|null} `#rrggbb`, or null when the token is not recognised.
 */
export function normalizeColor(raw) {
  if (!raw) return null;
  const v = String(raw).trim().toLowerCase();

  // Own-property check: a plain `NAMED_COLORS[v]` lookup would also return
  // inherited members such as `constructor` or `__proto__`.
  if (Object.hasOwn(NAMED_COLORS, v)) return NAMED_COLORS[v];

  // hex: #rgb, #rgba, #rrggbb, #rrggbbaa (alpha digits are discarded)
  const hexMatch = v.match(/^#([0-9a-f]{3,8})$/);
  if (hexMatch) {
    const digits = hexMatch[1];
    switch (digits.length) {
      case 3:
      case 4:
        return '#' + [...digits.slice(0, 3)].map((c) => c + c).join('');
      case 6:
      case 8:
        return '#' + digits.slice(0, 6);
      default:
        return null; // 5 or 7 digits is not a valid hex colour
    }
  }

  // rgb()/rgba(): channels may be numbers or percentages
  const rgbMatch = v.match(/^rgba?\(([^)]+)\)$/);
  if (rgbMatch) {
    const parts = splitColorArgs(rgbMatch[1]);
    if (parts.length < 3) return null;
    const channels = parts
      .slice(0, 3)
      .map((x) => (x.endsWith('%') ? (Number.parseFloat(x) / 100) * 255 : Number.parseFloat(x)));
    if (channels.some((x) => Number.isNaN(x))) return null;
    return '#' + channels.map(toHex2).join('');
  }

  // hsl()/hsla()
  const hslMatch = v.match(/^hsla?\(([^)]+)\)$/);
  if (hslMatch) {
    const parts = splitColorArgs(hslMatch[1]);
    if (parts.length < 3) return null;
    const hue = hueToDegrees(parts[0]);
    const s = clamp(Number.parseFloat(parts[1]) / 100, 0, 1);
    const l = clamp(Number.parseFloat(parts[2]) / 100, 0, 1);
    if (!Number.isFinite(hue) || Number.isNaN(s) || Number.isNaN(l)) return null;

    // Wrap the hue into [0, 360) BEFORE the sector math; using the raw hue
    // gives a negative secondary component for negative angles.
    const hh = ((hue % 360) + 360) % 360;
    const c = (1 - Math.abs(2 * l - 1)) * s;
    const x = c * (1 - Math.abs(((hh / 60) % 2) - 1));
    const offset = l - c / 2;

    let rgb;
    if (hh < 60) rgb = [c, x, 0];
    else if (hh < 120) rgb = [x, c, 0];
    else if (hh < 180) rgb = [0, c, x];
    else if (hh < 240) rgb = [0, x, c];
    else if (hh < 300) rgb = [x, 0, c];
    else rgb = [c, 0, x];
    return '#' + rgb.map((ch) => toHex2((ch + offset) * 255)).join('');
  }

  return null;
}
106
+
107
/**
 * Resolve `href` against `base` and return the absolute URL string.
 * When the pair cannot be parsed as a URL, `href` is returned unchanged.
 */
function resolveUrl(href, base) {
  let absolute;
  try {
    absolute = new URL(href, base);
  } catch {
    return href;
  }
  return absolute.href;
}
114
+
115
/**
 * Gather raw CSS text from <style> blocks, inline style="" attributes, and
 * (optionally) linked stylesheets.
 *
 * Inline style attributes are wrapped as `*{...}` so the declaration regexes
 * used downstream see them as ordinary rules. Linked stylesheets are fetched
 * one at a time through `safeFetch`; each distinct URL is fetched at most once.
 * Fetch failures are reported in `warnings` and never thrown.
 *
 * @param {import('cheerio').CheerioAPI} $ - loaded document.
 * @param {string} pageUrl - base URL used to resolve relative stylesheet hrefs.
 * @param {{ fetchLinkedCss?: boolean, maxStylesheets?: number, perFileTimeoutMs?: number }} opts
 * @returns {Promise<{ cssText: string, fetchedUrls: string[], styleBlocks: number, inlineStyleEls: number, warnings: string[] }>}
 */
async function collectCssSources($, pageUrl, opts) {
  // Per-stylesheet cap on the text kept. The body is still read in full
  // before it is truncated.
  const MAX_CSS_CHARS = 512 * 1024;

  const warnings = [];
  const fetchedUrls = [];
  let styleBlocks = 0;
  let inlineStyleEls = 0;
  let cssText = '';

  $('style').each((_, el) => {
    const t = $(el).html();
    if (t) {
      cssText += `\n${t}`;
      styleBlocks++;
    }
  });

  $('[style]').each((_, el) => {
    const t = $(el).attr('style');
    if (t) {
      cssText += `\n*{${t}}`;
      inlineStyleEls++;
    }
  });

  if (opts.fetchLinkedCss) {
    // A Set keeps document order and drops repeated <link> tags, which would
    // otherwise spend the budget twice and double every colour/font count.
    const hrefs = new Set();
    $('link[rel~="stylesheet"][href]').each((_, el) => {
      const href = $(el).attr('href');
      if (href) hrefs.add(resolveUrl(href, pageUrl));
    });
    const unique = [...hrefs];

    // A non-numeric maxStylesheets falls back to the default instead of
    // turning into NaN, which would silently fetch nothing.
    const requested = Number(opts.maxStylesheets ?? 10);
    const max = Number.isFinite(requested) ? clamp(Math.trunc(requested), 0, 20) : 10;

    for (const href of unique.slice(0, max)) {
      try {
        const res = await safeFetch(href, { signal: AbortSignal.timeout(opts.perFileTimeoutMs ?? 8000) });
        if (!res.ok) {
          warnings.push(`branding: stylesheet ${res.status} ${href}`);
          continue;
        }
        let text = await res.text();
        if (text.length > MAX_CSS_CHARS) text = text.slice(0, MAX_CSS_CHARS);
        cssText += `\n${text}`;
        fetchedUrls.push(href);
      } catch (err) {
        // Non-Error throwables have no `.message`.
        warnings.push(`branding: could not fetch stylesheet ${href} — ${err?.message ?? String(err)}`);
      }
    }
    if (unique.length > max) {
      warnings.push(`branding: ${unique.length - max} stylesheet(s) skipped (maxStylesheets=${max})`);
    }
  }

  return { cssText, fetchedUrls, styleBlocks, inlineStyleEls, warnings };
}
160
+
161
/**
 * Collect CSS custom-property declarations (`--name: value`) from raw CSS text.
 *
 * The first declaration of a given name wins; later redefinitions (for
 * example inside a theme override rule) are ignored.
 *
 * @param {string} cssText - concatenated CSS source.
 * @returns {Record<string, string>} map of `--name` to its trimmed value.
 */
function extractCssVariables(cssText) {
  const vars = {};
  // (?<![\w-])  the `--` must start an identifier, so BEM selectors such as
  //             `.btn--primary:hover { ... }` are not mistaken for variables.
  // [^;{}]+     a value never spans a rule boundary.
  // (?:[;}]|$)  the final declaration may have no terminator.
  const declaration = /(?<![\w-])--([\w-]+)\s*:\s*([^;{}]+)(?:[;}]|$)/g;

  for (const match of String(cssText ?? '').matchAll(declaration)) {
    const key = `--${match[1]}`;
    const value = match[2].trim();
    // Test the stored (prefixed) key with an own-property check, so first-wins
    // really applies and names like `--constructor` are not dropped.
    if (value && !Object.hasOwn(vars, key)) vars[key] = value;
  }
  return vars;
}
172
+
173
/**
 * Build a ranked colour palette from meta tags, brand-like CSS variables and
 * colour literals found in the CSS text.
 *
 * Each entry is `{ value, count, source }`, where `value` is `#rrggbb` and
 * `source` is the highest-priority origin seen for that colour
 * (`css-var`, then `theme-color`, then `declaration`). Results are ordered by
 * source priority, then by descending count, and capped at 24 entries.
 *
 * @param {string} cssText - concatenated CSS source.
 * @param {import('cheerio').CheerioAPI} $ - loaded document (read for meta tags).
 * @param {Record<string, string>} cssVariables - map of `--name` to value.
 * @returns {{ value: string, count: number, source: string }[]}
 */
function extractColors(cssText, $, cssVariables) {
  const css = String(cssText ?? '');
  const counts = new Map(); // hex -> { count, sources:Set }
  const add = (hex, source) => {
    if (!hex) return;
    const cur = counts.get(hex) || { count: 0, sources: new Set() };
    cur.count++;
    cur.sources.add(source);
    counts.set(hex, cur);
  };

  // meta theme colors (high confidence)
  for (const metaName of ['theme-color', 'msapplication-TileColor']) {
    const content = $(`meta[name="${metaName}"]`).attr('content');
    if (content) add(normalizeColor(content), 'theme-color');
  }

  // brand-ish CSS variables (high confidence)
  for (const [name, value] of Object.entries(cssVariables)) {
    if (BRAND_VAR_RE.test(name)) add(normalizeColor(value), 'css-var');
  }

  // Colour-property declarations. Only keyword values (e.g. `color: red`) are
  // counted here: hex/rgb()/hsl() literals are picked up by the token scan
  // below. Counting them in both passes gave `color:#fff` twice the weight of
  // the same literal inside a shorthand such as `border: 1px solid #fff`.
  const literalStart = /^(?:#|rgba?\(|hsla?\()/i;
  const colorProp = /(?:color|background(?:-color)?|border(?:-[a-z]+)?-color|fill|stroke|outline-color)\s*:\s*([^;}!]+)/gi;
  for (const match of css.matchAll(colorProp)) {
    const raw = match[1].trim();
    if (literalStart.test(raw)) continue;
    add(normalizeColor(raw), 'declaration');
  }

  // Every hex/rgb/hsl literal anywhere in the CSS, exactly once per occurrence.
  // NOTE(review): this also sees literals inside custom-property values, so a
  // brand variable is counted by the css-var pass and again here.
  const token = /#[0-9a-fA-F]{3,8}\b|rgba?\([^)]+\)|hsla?\([^)]+\)/g;
  for (const match of css.matchAll(token)) {
    add(normalizeColor(match[0]), 'declaration');
  }

  const rank = { 'css-var': 0, 'theme-color': 1, 'declaration': 2 };
  return [...counts.entries()]
    .map(([value, info]) => ({
      value,
      count: info.count,
      source: [...info.sources].sort((a, b) => rank[a] - rank[b])[0],
    }))
    .sort((a, b) => rank[a.source] - rank[b.source] || b.count - a.count)
    .slice(0, 24);
}
222
+
223
/**
 * Extract typography information from CSS text and webfont <link> tags.
 *
 * @param {string} cssText - concatenated CSS source.
 * @param {import('cheerio').CheerioAPI} $ - loaded document (read for webfont links).
 * @returns {{
 *   fonts: { family: string, count: number }[],
 *   genericFallbacks: string[],
 *   webfontProviders: string[],
 *   fontFaces: { family: string, src: string|null }[]
 * }} `fonts` holds the 12 most frequent named families; `fontFaces` holds at
 *   most 12 `@font-face` rules, with `src` truncated to 300 characters.
 */
function extractFonts(cssText, $) {
  const css = String(cssText ?? '');
  const families = new Map();
  const generics = new Set();
  const fontFaces = [];

  const unquote = (s) => s.trim().replace(/^['"]|['"]$/g, '');
  const bump = (name) => families.set(name, (families.get(name) || 0) + 1);

  for (const match of css.matchAll(/font-family\s*:\s*([^;}{]+)/gi)) {
    // `!important` is a priority flag, not part of the last family name.
    const value = match[1].replace(/\s*!\s*important\s*$/i, '');
    for (const part of value.split(',')) {
      const name = unquote(part);
      // Skip empties and function fragments such as `var(--font-sans)`.
      if (!name || /[()]/.test(name)) continue;
      const lower = name.toLowerCase();
      if (GENERIC_FAMILIES.has(lower)) generics.add(lower);
      else bump(name);
    }
  }

  for (const match of css.matchAll(/@font-face\s*\{([^}]*)\}/gi)) {
    const block = match[1];
    const fam = block.match(/font-family\s*:\s*([^;]+)/i)?.[1];
    const src = block.match(/src\s*:\s*([^;]+)/i)?.[1];
    if (fam) fontFaces.push({ family: unquote(fam), src: src ? src.trim().slice(0, 300) : null });
  }

  // webfont providers
  const providers = new Set();
  $('link[href*="fonts.googleapis.com"], link[href*="fonts.gstatic.com"]').each((_, el) => {
    providers.add('google-fonts');
    const href = $(el).attr('href') || '';
    // The css2 API repeats `family=` once per family; the v1 API packs
    // several into one parameter separated by `|`. Handle both.
    for (const param of href.matchAll(/[?&]family=([^&#]+)/g)) {
      let decoded;
      try {
        decoded = decodeURIComponent(param[1]);
      } catch {
        continue; // malformed percent-escape: skip this parameter only
      }
      for (const entry of decoded.split('|')) {
        const name = entry.split(':')[0].replace(/\+/g, ' ').trim();
        if (name && !GENERIC_FAMILIES.has(name.toLowerCase())) bump(name);
      }
    }
  });
  $('link[href*="use.typekit"], link[href*="typekit.net"]').each(() => {
    providers.add('adobe-fonts');
  });

  return {
    fonts: [...families.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 12)
      .map(([family, count]) => ({ family, count })),
    genericFallbacks: [...generics],
    webfontProviders: [...providers],
    fontFaces: fontFaces.slice(0, 12),
  };
}
269
+
270
/**
 * Collect logo-related assets from the document: icon <link> tags, the
 * og:image / twitter:image meta value, likely logo <img> elements, and the
 * first <svg> inside <header> (serialized, truncated to 2000 characters).
 * All URLs are resolved against `pageUrl`; lists are capped at 8 entries.
 */
function extractLogo($, pageUrl) {
  const attrOrNull = (node, name) => $(node).attr(name) || null;

  const iconLinks = [];
  $('link[rel~="icon"], link[rel="shortcut icon"], link[rel="apple-touch-icon"], link[rel="mask-icon"]').each((_, link) => {
    const href = $(link).attr('href');
    if (!href) return;
    iconLinks.push({
      href: resolveUrl(href, pageUrl),
      rel: attrOrNull(link, 'rel'),
      sizes: attrOrNull(link, 'sizes'),
      type: attrOrNull(link, 'type'),
    });
  });

  const socialImage =
    $('meta[property="og:image"]').attr('content') ||
    $('meta[name="twitter:image"]').attr('content') ||
    null;

  // Keep the first candidate seen for each resolved src.
  const seenSrc = new Set();
  const logoImages = [];
  $('header img, a[href="/"] img, img[alt*="logo" i], img[class*="logo" i], [class*="logo" i] img').each((_, img) => {
    const rawSrc = $(img).attr('src') || $(img).attr('data-src');
    if (!rawSrc) return;
    const src = resolveUrl(rawSrc, pageUrl);
    if (seenSrc.has(src)) return;
    seenSrc.add(src);
    logoImages.push({ src, alt: attrOrNull(img, 'alt'), where: 'logo-candidate' });
  });

  let inlineHeaderSvg = null;
  const firstHeaderSvg = $('header svg').first();
  if (firstHeaderSvg.length) {
    try {
      inlineHeaderSvg = $.html(firstHeaderSvg).slice(0, 2000);
    } catch {
      /* ignore */
    }
  }

  return {
    favicons: iconLinks.slice(0, 8),
    ogImage: socialImage ? resolveUrl(socialImage, pageUrl) : null,
    candidates: logoImages.slice(0, 8),
    inlineHeaderSvg,
  };
}
302
+
303
/**
 * Summarize shape/elevation design tokens found in the CSS.
 *
 * `radii` and `shadows` list the eight most frequent `border-radius` and
 * `box-shadow` values (values containing `var(` are skipped).
 * `spacingVariables` holds every custom property whose name mentions
 * space, spacing, gap, radius or size.
 */
function extractTokens(cssText, cssVariables) {
  // Tally the first capture group of `pattern` across the CSS, most frequent first.
  const tally = (pattern) => {
    const seen = new Map();
    for (const match of cssText.matchAll(pattern)) {
      const value = match[1].trim();
      if (!value || value.includes('var(')) continue;
      seen.set(value, (seen.get(value) || 0) + 1);
    }
    const ranked = [...seen.entries()];
    ranked.sort((left, right) => right[1] - left[1]);
    return ranked.slice(0, 8).map(([value, count]) => ({ value, count }));
  };

  const spacingVariables = Object.fromEntries(
    Object.entries(cssVariables).filter(([name]) => /(space|spacing|gap|radius|size)/i.test(name)),
  );

  const radii = tally(/border-radius\s*:\s*([^;}{]+)/gi);
  const shadows = tally(/box-shadow\s*:\s*([^;}{]+)/gi);
  return { radii, shadows, spacingVariables };
}
325
+
326
/**
 * Extract the full branding object from a loaded cheerio $.
 *
 * CSS is collected first; each extractor then runs in isolation so that one
 * failure becomes a warning and a fallback value rather than an exception.
 *
 * @param {import('cheerio').CheerioAPI} $
 * @param {string} pageUrl
 * @param {{ fetchLinkedCss?: boolean, maxStylesheets?: number, perFileTimeoutMs?: number }} [opts]
 * @returns {Promise<object>} branding summary; `warnings` is `undefined` when empty.
 */
export async function extractBranding($, pageUrl, opts = {}) {
  const options = Object.assign(
    { fetchLinkedCss: true, maxStylesheets: 10, perFileTimeoutMs: 8000 },
    opts,
  );
  const warnings = [];

  // Run one extractor; on failure record a warning and use the fallback.
  const attempt = (label, fallback, run) => {
    try {
      return run();
    } catch (err) {
      warnings.push(`branding: ${label} failed — ${err.message}`);
      return fallback;
    }
  };

  let sources = { cssText: '', fetchedUrls: [], styleBlocks: 0, inlineStyleEls: 0, warnings: [] };
  try {
    sources = await collectCssSources($, pageUrl, options);
    warnings.push(...sources.warnings);
  } catch (err) {
    warnings.push(`branding: CSS collection failed — ${err.message}`);
  }
  const css = sources.cssText;

  const cssVariables = attempt('css-variables', {}, () => extractCssVariables(css));
  const colors = attempt('colors', [], () => extractColors(css, $, cssVariables));
  const typography = attempt(
    'fonts',
    { fonts: [], genericFallbacks: [], webfontProviders: [], fontFaces: [] },
    () => extractFonts(css, $),
  );
  const logo = attempt(
    'logo',
    { favicons: [], ogImage: null, candidates: [], inlineHeaderSvg: null },
    () => extractLogo($, pageUrl),
  );
  const designTokens = attempt(
    'tokens',
    { radii: [], shadows: [], spacingVariables: {} },
    () => extractTokens(css, cssVariables),
  );

  return {
    colors,
    fonts: typography.fonts,
    genericFallbacks: typography.genericFallbacks,
    webfontProviders: typography.webfontProviders,
    fontFaces: typography.fontFaces,
    logo,
    cssVariables,
    radii: designTokens.radii,
    shadows: designTokens.shadows,
    spacing: designTokens.spacingVariables,
    sources: {
      styleBlocks: sources.styleBlocks,
      inlineStyleEls: sources.inlineStyleEls,
      linkedStylesheetsFetched: sources.fetchedUrls,
    },
    notes: [
      'Static extraction from HTML + linked/inline CSS. Computed/cascaded colors and JS-injected styles require browser rendering and are not reflected here.',
    ],
    warnings: warnings.length ? warnings : undefined,
  };
}
377
+
378
+ export default extractBranding;
@@ -26,7 +26,7 @@ const JsonFormatSchema = z.object({
26
26
  });
27
27
 
28
28
  const FormatSchema = z.union([
29
- z.enum(['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot']),
29
+ z.enum(['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot', 'branding']),
30
30
  JsonFormatSchema
31
31
  ]);
32
32
 
@@ -35,7 +35,17 @@ export const UnifiedScrapeSchema = z.object({
35
35
  formats: z.array(FormatSchema).min(1).default(['markdown']),
36
36
  onlyMainContent: z.boolean().optional().default(true),
37
37
  // Pass-through to fetchAndParse
38
- timeoutMs: z.number().min(1000).max(60000).optional().default(15000)
38
+ timeoutMs: z.number().min(1000).max(60000).optional().default(15000),
39
+ // Optional, additive: only consulted when 'branding' / 'screenshot' is requested.
40
+ brandingOptions: z.object({
41
+ fetchLinkedCss: z.boolean().optional().default(true),
42
+ maxStylesheets: z.number().min(0).max(20).optional().default(10)
43
+ }).optional(),
44
+ screenshotOptions: z.object({
45
+ fullPage: z.boolean().optional().default(false),
46
+ format: z.enum(['png', 'jpeg']).optional().default('png'),
47
+ quality: z.number().min(0).max(100).optional()
48
+ }).optional()
39
49
  });
40
50
 
41
51
  // ── Helpers ───────────────────────────────────────────────────────────────────
@@ -155,6 +165,9 @@ export class UnifiedScrapeTool {
155
165
  constructor(options = {}) {
156
166
  this._extractWithLlm = null;
157
167
  this._extractWithLlmConfig = options.llmConfig || {};
168
+ // Optional shared ActionExecutor (injected from server.js so we reuse the
169
+ // existing browser pool rather than spinning up a second one).
170
+ this._actionExecutor = options.actionExecutor || null;
158
171
  }
159
172
 
160
173
  /** Lazy-load ExtractWithLlm to avoid pulling in heavy deps unless needed. */
@@ -166,6 +179,15 @@ export class UnifiedScrapeTool {
166
179
  return this._extractWithLlm;
167
180
  }
168
181
 
182
+ /** Lazy-load an ActionExecutor only when a screenshot is actually requested. */
183
+ async _getActionExecutor() {
184
+ if (!this._actionExecutor) {
185
+ const { default: ActionExecutor } = await import('../../core/ActionExecutor.js');
186
+ this._actionExecutor = new ActionExecutor({ enableLogging: false });
187
+ }
188
+ return this._actionExecutor;
189
+ }
190
+
169
191
  /**
170
192
  * Execute a unified scrape.
171
193
  * @param {object} params - UnifiedScrapeSchema-compatible input
@@ -173,7 +195,7 @@ export class UnifiedScrapeTool {
173
195
  */
174
196
  async execute(params) {
175
197
  const validated = UnifiedScrapeSchema.parse(params);
176
- const { url, formats, onlyMainContent, timeoutMs } = validated;
198
+ const { url, formats, onlyMainContent, timeoutMs, brandingOptions, screenshotOptions } = validated;
177
199
 
178
200
  // Single fetch
179
201
  let html, $, finalUrl;
@@ -291,10 +313,48 @@ export class UnifiedScrapeTool {
291
313
  }
292
314
  break;
293
315
 
316
+ case 'branding':
317
+ try {
318
+ const { extractBranding } = await import('./_brandingExtractor.js');
319
+ const branding = await extractBranding($, finalUrl, {
320
+ fetchLinkedCss: brandingOptions?.fetchLinkedCss ?? true,
321
+ maxStylesheets: brandingOptions?.maxStylesheets ?? 10
322
+ });
323
+ if (Array.isArray(branding.warnings)) {
324
+ warnings.push(...branding.warnings);
325
+ delete branding.warnings;
326
+ }
327
+ content.branding = branding;
328
+ } catch (err) {
329
+ content.branding = {};
330
+ warnings.push(`branding: ${err.message}`);
331
+ }
332
+ break;
333
+
294
334
  case 'screenshot':
295
- // Screenshot requires a browser; not available in the basic scrape path.
296
- content.screenshots = [];
297
- warnings.push('screenshot: browser screenshots are not available in the scrape tool; use scrape_with_actions for screenshots');
335
+ // Opt-in browser path: only launched when 'screenshot' is requested.
336
+ try {
337
+ const exec = await this._getActionExecutor();
338
+ const r = await exec.executeActionChain(
339
+ finalUrl,
340
+ {
341
+ actions: [{
342
+ type: 'screenshot',
343
+ fullPage: screenshotOptions?.fullPage ?? false,
344
+ format: screenshotOptions?.format ?? 'png',
345
+ ...(screenshotOptions?.quality != null ? { quality: screenshotOptions.quality } : {})
346
+ }]
347
+ },
348
+ { headless: true, timeout: 30000 }
349
+ );
350
+ content.screenshots = Array.isArray(r?.screenshots) ? r.screenshots : [];
351
+ if (content.screenshots.length === 0) {
352
+ warnings.push('screenshot: capture produced no image');
353
+ }
354
+ } catch (err) {
355
+ content.screenshots = [];
356
+ warnings.push(`screenshot: ${err.message}`);
357
+ }
298
358
  break;
299
359
 
300
360
  default:
@@ -7,6 +7,7 @@
7
7
  */
8
8
 
9
9
  import { TemplateRegistry } from './TemplateRegistry.js';
10
+ import { safeFetch } from '../../utils/ssrfGuard.js';
10
11
 
11
12
  export class ScrapeTemplateTool {
12
13
  constructor() {
@@ -39,7 +40,7 @@ export class ScrapeTemplateTool {
39
40
  const timeoutId = setTimeout(() => controller.abort(), timeout);
40
41
  let html;
41
42
  try {
42
- const response = await fetch(url, {
43
+ const response = await safeFetch(url, {
43
44
  signal: controller.signal,
44
45
  headers: {
45
46
  'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
@@ -3,6 +3,8 @@
3
3
  * URL content fetching and history/stat helper functions.
4
4
  */
5
5
 
6
+ import { safeFetch } from '../../../utils/ssrfGuard.js';
7
+
6
8
  /**
7
9
  * Default Jaccard similarity threshold below which a change is considered
8
10
  * meaningful (i.e. worth flagging). 0.85 means content must be at least 85 %
@@ -42,7 +44,7 @@ export function calculateSimilarity(text1, text2) {
42
44
  */
43
45
  export async function fetchContent(url) {
44
46
  try {
45
- const response = await fetch(url, {
47
+ const response = await safeFetch(url, {
46
48
  headers: {
47
49
  'User-Agent': 'MCP-WebScraper-ChangeTracker/3.0',
48
50
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -17,6 +17,8 @@ import { EventEmitter } from 'events';
17
17
  import ChangeTracker from '../../../core/ChangeTracker.js';
18
18
  import SnapshotManager from '../../../core/SnapshotManager.js';
19
19
  import CacheManager from '../../../core/cache/CacheManager.js';
20
+ import { MonitorStore } from '../../../core/MonitorStore.js';
21
+ import { MonitorScheduler } from '../../../core/MonitorScheduler.js';
20
22
  import { TrackChangesSchema } from './schema.js';
21
23
  import { fetchContent, mergeHistoryData, matchesSignificanceFilter, calculateAverageInterval, calculateSignificanceDistribution } from './differ.js';
22
24
  import { performMonitoringCheck, stopMonitor } from './monitor.js';
@@ -56,9 +58,44 @@ export class TrackChangesTool extends EventEmitter {
56
58
  this.activeMonitors = new Map();
57
59
  this.monitorStats = new Map();
58
60
 
61
+ // Scheduled-monitor subsystem (timers are NOT started here — only the
62
+ // single server-owned instance calls startScheduler()).
63
+ this._mcpServer = null;
64
+ this.monitorStore = new MonitorStore({ storageDir: this.options.monitorStorageDir || './monitors' });
65
+ this.scheduler = new MonitorScheduler({ tool: this, store: this.monitorStore });
66
+
59
67
  this.initialize();
60
68
  }
61
69
 
70
+ /** Wire the MCP server so the goal-judge can use SamplingClient (Ollama-first). */
71
+ setMcpServer(server) {
72
+ this._mcpServer = server;
73
+ }
74
+
75
+ /** Start the in-process scheduler (called once, by the server). */
76
+ async startScheduler() {
77
+ if (this._mcpServer && !this.scheduler.samplingClient) {
78
+ try {
79
+ const { SamplingClient } = await import('../../../core/SamplingClient.js');
80
+ this.scheduler.samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
81
+ } catch {
82
+ /* goal-judge will degrade to threshold mode */
83
+ }
84
+ }
85
+ await this.scheduler.start();
86
+ }
87
+
88
+ /** Fire every due monitor once and exit (the external-cron one-shot path). */
89
+ async runDueOnce() {
90
+ if (this._mcpServer && !this.scheduler.samplingClient) {
91
+ try {
92
+ const { SamplingClient } = await import('../../../core/SamplingClient.js');
93
+ this.scheduler.samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
94
+ } catch { /* degrade */ }
95
+ }
96
+ return this.scheduler.runDueOnce();
97
+ }
98
+
62
99
  async initialize() {
63
100
  try {
64
101
  await this.snapshotManager.initialize();
@@ -103,6 +140,7 @@ export class TrackChangesTool extends EventEmitter {
103
140
  case 'get_stats': return await this.getStatistics(validated);
104
141
  case 'create_scheduled_monitor':return await this.createScheduledMonitor(validated);
105
142
  case 'stop_scheduled_monitor': return await this.stopScheduledMonitor(validated);
143
+ case 'list_scheduled_monitors': return await this.listScheduledMonitors(validated);
106
144
  case 'get_dashboard': return await this.getMonitoringDashboard(validated);
107
145
  case 'export_history': return await this.exportHistoricalData(validated);
108
146
  case 'create_alert_rule': return await this.createAlertRule(validated);
@@ -281,32 +319,37 @@ export class TrackChangesTool extends EventEmitter {
281
319
  }
282
320
 
283
321
  async createScheduledMonitor(params) {
284
- const { url, scheduledMonitorOptions, trackingOptions, notificationOptions } = params;
285
- const schedule = scheduledMonitorOptions?.schedule || '0 */1 * * *';
286
- const templateId = scheduledMonitorOptions?.templateId;
287
- let monitorOptions = { ...trackingOptions };
288
- if (templateId && this.changeTracker.monitoringTemplates.has(templateId)) {
289
- monitorOptions = { ...this.changeTracker.monitoringTemplates.get(templateId).options, ...monitorOptions };
290
- }
291
- const result = await this.changeTracker.createScheduledMonitor(url, schedule, {
292
- ...monitorOptions,
293
- alertRules: { threshold: 'moderate', methods: ['webhook'], throttle: 600000, ...notificationOptions }
322
+ const { url, scheduledMonitorOptions, monitoringOptions, trackingOptions, notificationOptions } = params;
323
+ if (!url) throw new Error('create_scheduled_monitor requires a url');
324
+ const opts = scheduledMonitorOptions || {};
325
+ const monitor = await this.scheduler.createMonitor({
326
+ url,
327
+ interval: opts.interval ?? monitoringOptions?.interval,
328
+ schedule: opts.schedule,
329
+ goal: opts.goal,
330
+ notificationThreshold: opts.notificationThreshold || monitoringOptions?.notificationThreshold || 'moderate',
331
+ trackingOptions,
332
+ notificationOptions
294
333
  });
295
- return { success: true, operation: 'create_scheduled_monitor', url, monitor: result, template: templateId ? this.changeTracker.monitoringTemplates.get(templateId)?.name : null, timestamp: Date.now() };
334
+ return { success: true, operation: 'create_scheduled_monitor', url, monitor, firingGuarantee: monitor.firingGuarantee, timestamp: Date.now() };
296
335
  }
297
336
 
298
337
  async stopScheduledMonitor(params) {
299
- const { url } = params;
300
- let stoppedMonitors = 0;
301
- for (const [id, monitor] of this.changeTracker.scheduledMonitors.entries()) {
302
- if (monitor.url === url) {
303
- monitor.cronJob?.destroy();
304
- monitor.status = 'stopped';
305
- this.changeTracker.scheduledMonitors.delete(id);
306
- stoppedMonitors++;
307
- }
338
+ const { url, scheduledMonitorOptions } = params;
339
+ const monitorId = scheduledMonitorOptions?.monitorId;
340
+ if (monitorId) {
341
+ const result = await this.scheduler.stopMonitor(monitorId);
342
+ return { success: true, operation: 'stop_scheduled_monitor', monitorId, stopped: result.stopped, timestamp: Date.now() };
308
343
  }
309
- return { success: true, operation: 'stop_scheduled_monitor', url, stoppedMonitors, timestamp: Date.now() };
344
+ if (!url) throw new Error('stop_scheduled_monitor requires a url or scheduledMonitorOptions.monitorId');
345
+ const result = await this.scheduler.stopByUrl(url);
346
+ return { success: true, operation: 'stop_scheduled_monitor', url, stoppedMonitors: result.stopped, timestamp: Date.now() };
347
+ }
348
+
349
+ async listScheduledMonitors() {
350
+ if (!this.monitorStore._loaded) await this.monitorStore.load();
351
+ const monitors = this.scheduler.list();
352
+ return { success: true, operation: 'list_scheduled_monitors', monitors, count: monitors.length, timestamp: Date.now() };
310
353
  }
311
354
 
312
355
  async getMonitoringDashboard(params) {
@@ -379,11 +422,21 @@ export class TrackChangesTool extends EventEmitter {
379
422
 
380
423
  async shutdown() {
381
424
  this.stopAllMonitoring();
425
+ this.scheduler?.stopAll();
382
426
  await this.snapshotManager.shutdown();
383
427
  await this.changeTracker.cleanup();
384
428
  this.emit('shutdown');
385
429
  }
386
430
 
431
+ /**
432
+ * Alias so the server's graceful-shutdown sweep (which filters tools by a
433
+ * `destroy`/`cleanup` method) actually tears this tool down — including the
434
+ * scheduler timers. Without this, scheduled-monitor intervals would leak.
435
+ */
436
+ async cleanup() {
437
+ return this.shutdown();
438
+ }
439
+
387
440
  // ── Private helpers ────────────────────────────────────────────────────────────
388
441
 
389
442
  _getAggregatedMonitoringStats() {