crawlforge-mcp-server 4.7.2 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/package.json +2 -1
- package/server.js +42 -9
- package/src/cli/commands/init.js +13 -2
- package/src/cli/commands/install-skills.js +10 -1
- package/src/cli/commands/monitor.js +81 -0
- package/src/cli/commands/uninstall-skills.js +10 -1
- package/src/core/ActionExecutor.js +51 -9
- package/src/core/ElicitationHelper.js +18 -5
- package/src/core/LLMsTxtAnalyzer.js +2 -1
- package/src/core/MonitorScheduler.js +281 -0
- package/src/core/MonitorStore.js +79 -0
- package/src/core/ResearchOrchestrator.js +2 -1
- package/src/core/crawlers/BFSCrawler.js +2 -1
- package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
- package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
- package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
- package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
- package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
- package/src/skills/installer.js +186 -34
- package/src/tools/advanced/batchScrape/worker.js +8 -2
- package/src/tools/basic/_fetch.js +14 -1
- package/src/tools/crawl/_sessionContext.js +3 -1
- package/src/tools/extract/_fetchAndParse.js +2 -1
- package/src/tools/extract/extractContent.js +2 -1
- package/src/tools/extract/processDocument.js +2 -1
- package/src/tools/scrape/_brandingExtractor.js +378 -0
- package/src/tools/scrape/unifiedScrape.js +66 -6
- package/src/tools/templates/ScrapeTemplateTool.js +2 -1
- package/src/tools/tracking/trackChanges/differ.js +3 -1
- package/src/tools/tracking/trackChanges/index.js +74 -21
- package/src/tools/tracking/trackChanges/schema.js +7 -2
- package/src/utils/hostRateLimiter.js +46 -0
- package/src/utils/robotsChecker.js +2 -1
- package/src/utils/sitemapParser.js +2 -1
- package/src/utils/ssrfGuard.js +161 -0
- package/src/utils/ssrfProtection.js +6 -9
- package/src/skills/crawlforge-cli.md +0 -157
- package/src/skills/crawlforge-mcp.md +0 -80
- package/src/skills/crawlforge-research.md +0 -104
- package/src/skills/crawlforge-stealth.md +0 -98
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* _brandingExtractor.js — static design-token / branding extraction.
|
|
3
|
+
*
|
|
4
|
+
* Extracts a site's visual identity from HTML + CSS WITHOUT a browser:
|
|
5
|
+
* - colors (CSS custom properties, theme-color, declarations) -> hex + frequency
|
|
6
|
+
* - fonts (font-family, @font-face, Google/Adobe webfont links)
|
|
7
|
+
* - logo (favicons, og:image, header img/svg candidates)
|
|
8
|
+
* - tokens (border-radius, box-shadow, spacing custom props)
|
|
9
|
+
*
|
|
10
|
+
* Static-only limits (documented in the returned `notes`): computed/cascaded
|
|
11
|
+
* styles and JS-injected CSS are not visible without rendering. Linked CSS is
|
|
12
|
+
* fetched through the SSRF guard, capped in count/size/time.
|
|
13
|
+
*
|
|
14
|
+
* Every helper is defensive: failures push to `warnings` and never throw.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
18
|
+
|
|
19
|
+
// CSS-wide keywords and generic font families; these are reported as
// fallbacks rather than counted as real typefaces.
const GENERIC_FAMILIES = new Set([
  'serif',
  'sans-serif',
  'monospace',
  'cursive',
  'fantasy',
  'system-ui',
  'ui-serif',
  'ui-sans-serif',
  'ui-monospace',
  'ui-rounded',
  'inherit',
  'initial',
  'unset',
  'revert',
  'emoji',
  'math',
  'fangsong',
]);

// The small set of CSS named colors this module resolves to hex.
const NAMED_COLORS = {
  black: '#000000',
  white: '#ffffff',
  red: '#ff0000',
  green: '#008000',
  blue: '#0000ff',
  yellow: '#ffff00',
  orange: '#ffa500',
  purple: '#800080',
  gray: '#808080',
  grey: '#808080',
  silver: '#c0c0c0',
  navy: '#000080',
  teal: '#008080',
  maroon: '#800000',
  olive: '#808000',
  lime: '#00ff00',
  aqua: '#00ffff',
  fuchsia: '#ff00ff',
};

// Custom-property names containing one of these words are treated as strong
// brand-color signals (matched case-insensitively, anywhere in the name).
const BRAND_VAR_RE = /(primary|secondary|brand|accent|theme|background|surface|foreground|text|link)/i;
|
|
35
|
+
|
|
36
|
+
/**
 * Constrain a number to the range [lo, hi].
 *
 * @param {number} n - Value to constrain.
 * @param {number} lo - Lower bound (applied last, so it wins if lo > hi).
 * @param {number} hi - Upper bound.
 * @returns {number} The constrained value (NaN stays NaN).
 */
function clamp(n, lo, hi) {
  const capped = Math.min(hi, n);
  return Math.max(lo, capped);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Render a color channel as a two-digit lowercase hex byte.
 *
 * @param {number} n - Channel value; rounded, then limited to 0..255.
 * @returns {string} Two hex digits, e.g. `0f`.
 */
function toHex2(n) {
  const byte = Math.max(0, Math.min(255, Math.round(n)));
  return byte.toString(16).padStart(2, '0');
}
|
|
43
|
+
|
|
44
|
+
/**
 * Normalize a CSS color token to lowercase #rrggbb (alpha is dropped).
 *
 * Accepted inputs:
 *  - the named colors listed in NAMED_COLORS
 *  - #rgb, #rgba, #rrggbb and #rrggbbaa hex notation
 *  - rgb()/rgba() with numeric or percentage channels
 *  - hsl()/hsla() with the hue in degrees (default), grad, rad or turn
 *
 * @param {*} raw - Candidate color text; coerced with String().
 * @returns {string|null} `#rrggbb`, or null for values that cannot be parsed
 *   confidently (gradients, currentColor, var(), unknown names, ...).
 */
export function normalizeColor(raw) {
  if (!raw) return null;
  const v = String(raw).trim().toLowerCase();

  // Own-property check: a plain `NAMED_COLORS[v]` lookup would hand back
  // inherited members (e.g. the Object constructor for the text "constructor").
  if (Object.hasOwn(NAMED_COLORS, v)) return NAMED_COLORS[v];

  // hex
  const hexMatch = v.match(/^#([0-9a-f]{3,8})$/);
  if (hexMatch) {
    const digits = hexMatch[1];
    const doubled = (s) => '#' + [...s].map((c) => c + c).join('');
    switch (digits.length) {
      case 3: return doubled(digits);
      case 4: return doubled(digits.slice(0, 3)); // drop the alpha nibble
      case 6: return '#' + digits;
      case 8: return '#' + digits.slice(0, 6); // drop the alpha byte
      default: return null; // 5 or 7 digits is not valid hex notation
    }
  }

  // rgb()/rgba()
  const rgbMatch = v.match(/^rgba?\(([^)]+)\)$/);
  if (rgbMatch) {
    const parts = rgbMatch[1].split(/[,/\s]+/).filter(Boolean);
    if (parts.length < 3) return null;
    const channels = parts
      .slice(0, 3)
      .map((p) => (p.endsWith('%') ? (parseFloat(p) / 100) * 255 : parseFloat(p)));
    if (channels.some((c) => Number.isNaN(c))) return null;
    return '#' + channels.map((c) => toHex2(c)).join('');
  }

  // hsl()/hsla()
  const hslMatch = v.match(/^hsla?\(([^)]+)\)$/);
  if (hslMatch) {
    const parts = hslMatch[1].split(/[,/\s]+/).filter(Boolean);
    if (parts.length < 3) return null;
    const [hueText, satText, lightText] = parts;

    // Convert the hue to degrees. 'grad' must be tested before 'rad' because
    // every 'grad' value also ends with 'rad'.
    let hue = parseFloat(hueText);
    if (hueText.endsWith('turn')) hue *= 360;
    else if (hueText.endsWith('grad')) hue *= 0.9;
    else if (hueText.endsWith('rad')) hue *= 180 / Math.PI;

    // Saturation and lightness are percentages, limited to 0..1.
    const fraction = (text) => Math.min(1, Math.max(0, parseFloat(text) / 100));
    const s = fraction(satText);
    const l = fraction(lightText);
    if (![hue, s, l].every((n) => Number.isFinite(n))) return null;

    // Wrap the hue into [0, 360) BEFORE deriving the secondary component;
    // using the raw (possibly negative) hue there yields a negative `x`.
    const hh = ((hue % 360) + 360) % 360;
    const c = (1 - Math.abs(2 * l - 1)) * s;
    const x = c * (1 - Math.abs(((hh / 60) % 2) - 1));
    const offset = l - c / 2;
    const sectors = [
      [c, x, 0],
      [x, c, 0],
      [0, c, x],
      [0, x, c],
      [x, 0, c],
      [c, 0, x],
    ];
    const [r, g, b] = sectors[Math.min(5, Math.floor(hh / 60))];
    return '#' + toHex2((r + offset) * 255) + toHex2((g + offset) * 255) + toHex2((b + offset) * 255);
  }

  return null;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Resolve `href` against `base` into an absolute URL string.
 *
 * @param {string} href - Possibly relative URL.
 * @param {string} base - URL used as the resolution base.
 * @returns {string} The absolute URL, or `href` unchanged when the pair
 *   cannot be parsed by the URL constructor.
 */
function resolveUrl(href, base) {
  let absolute;
  try {
    absolute = new URL(href, base).toString();
  } catch {
    absolute = href;
  }
  return absolute;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Gather raw CSS text from <style> blocks, inline style="" attributes, and
 * (when `opts.fetchLinkedCss` is truthy) linked stylesheets.
 *
 * Inline style attributes are wrapped as `*{...}` so downstream regexes see
 * them as ordinary rule bodies. Linked stylesheets are fetched one at a time
 * through `safeFetch`, each with its own timeout; at most `maxStylesheets`
 * (limited to 0..20) are requested and each response is truncated to 512 KiB
 * of text. Fetch problems are recorded as warnings instead of being thrown.
 *
 * @param {Function} $ - Loaded cheerio-style selector function.
 * @param {string} pageUrl - Base URL used to resolve stylesheet hrefs.
 * @param {{ fetchLinkedCss?: boolean, maxStylesheets?: number, perFileTimeoutMs?: number }} opts
 * @returns {Promise<{ cssText: string, fetchedUrls: string[], styleBlocks: number, inlineStyleEls: number, warnings: string[] }>}
 */
async function collectCssSources($, pageUrl, opts) {
  const SIZE_CAP = 512 * 1024;
  const warnings = [];
  const fetchedUrls = [];
  const chunks = [];
  let styleBlocks = 0;
  let inlineStyleEls = 0;

  $('style').each((_, el) => {
    const css = $(el).html();
    if (!css) return;
    chunks.push('\n' + css);
    styleBlocks += 1;
  });

  $('[style]').each((_, el) => {
    const declarations = $(el).attr('style');
    if (!declarations) return;
    chunks.push('\n*{' + declarations + '}');
    inlineStyleEls += 1;
  });

  if (opts.fetchLinkedCss) {
    const hrefs = [];
    $('link[rel~="stylesheet"][href]').each((_, el) => {
      const href = $(el).attr('href');
      if (href) hrefs.push(resolveUrl(href, pageUrl));
    });

    const max = clamp(opts.maxStylesheets ?? 10, 0, 20);
    const skipped = hrefs.length - max;

    // Sequential on purpose: one request in flight at a time.
    for (const href of hrefs.slice(0, max)) {
      try {
        const res = await safeFetch(href, { signal: AbortSignal.timeout(opts.perFileTimeoutMs ?? 8000) });
        if (res.ok) {
          const text = await res.text();
          chunks.push('\n' + text.slice(0, SIZE_CAP)); // size cap
          fetchedUrls.push(href);
        } else {
          warnings.push(`branding: stylesheet ${res.status} ${href}`);
        }
      } catch (err) {
        warnings.push(`branding: could not fetch stylesheet ${href} — ${err.message}`);
      }
    }

    if (skipped > 0) warnings.push(`branding: ${skipped} stylesheet(s) skipped (maxStylesheets=${max})`);
  }

  return { cssText: chunks.join(''), fetchedUrls, styleBlocks, inlineStyleEls, warnings };
}
|
|
160
|
+
|
|
161
|
+
/**
 * Collect CSS custom-property declarations (`--name: value`) from raw CSS.
 *
 * The first declaration of a name wins; later redefinitions (theme overrides,
 * media queries, ...) are ignored. Keys in the result keep their `--` prefix.
 *
 * @param {string} cssText - Concatenated CSS source.
 * @returns {Object<string, string>} Map of `--name` to its trimmed raw value.
 */
function extractCssVariables(cssText) {
  const vars = {};
  // The lookbehind rejects `--` that continues an identifier, so BEM-style
  // selectors such as `.btn--primary:hover{` are not read as declarations.
  const declaration = /(?<![\w-])--([\w-]+)\s*:\s*([^;}]+)[;}]/g;

  for (const [, name, rawValue] of String(cssText ?? '').matchAll(declaration)) {
    const value = rawValue.trim();
    if (!value) continue;
    const key = `--${name}`;
    // Test the prefixed key actually stored, and only own properties, so that
    // names like `constructor` are neither dropped nor treated as present.
    if (!Object.hasOwn(vars, key)) vars[key] = value;
  }
  return vars;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Build a ranked color palette from meta tags, brand-like CSS variables and
 * the raw CSS text.
 *
 * Counting rules:
 *  - every hex / rgb() / hsl() literal in the CSS text is counted once by the
 *    token scan (this also covers shorthands such as `border: 1px solid #ccc`);
 *  - the declaration scan only contributes values the token scan cannot see,
 *    i.e. named colors in color-bearing properties, so a literal such as
 *    `color: #fff` is not counted twice;
 *  - theme-color metas and brand-like custom properties each add one hit and
 *    tag the color with a higher-confidence source.
 *
 * @param {string} cssText - Concatenated CSS source.
 * @param {Function} $ - Loaded cheerio-style selector function.
 * @param {Object<string, string>} cssVariables - Map of `--name` to raw value.
 * @returns {Array<{ value: string, count: number, source: string }>} At most
 *   24 entries, ordered by source confidence (css-var, theme-color,
 *   declaration) and then by descending count.
 */
function extractColors(cssText, $, cssVariables) {
  const css = String(cssText ?? '');
  const tally = new Map(); // hex -> { count, sources:Set }
  const add = (hex, source) => {
    if (!hex) return;
    let entry = tally.get(hex);
    if (!entry) {
      entry = { count: 0, sources: new Set() };
      tally.set(hex, entry);
    }
    entry.count += 1;
    entry.sources.add(source);
  };

  // meta theme colors (high confidence)
  for (const metaName of ['theme-color', 'msapplication-TileColor']) {
    const content = $(`meta[name="${metaName}"]`).attr('content');
    if (content) add(normalizeColor(content), 'theme-color');
  }

  // brand-ish CSS variables (high confidence)
  for (const [name, value] of Object.entries(cssVariables)) {
    if (BRAND_VAR_RE.test(name)) add(normalizeColor(value), 'css-var');
  }

  // Named colors in color-bearing declarations. Literal tokens are skipped
  // here because the token scan below counts them.
  const literalStart = /^(?:#|rgba?\(|hsla?\()/i;
  const colorProp = /(?:color|background(?:-color)?|border(?:-[a-z]+)?-color|fill|stroke|outline-color)\s*:\s*([^;}!]+)/gi;
  for (const [, rawValue] of css.matchAll(colorProp)) {
    const raw = rawValue.trim();
    if (literalStart.test(raw)) continue;
    add(normalizeColor(raw), 'declaration');
  }

  // Every hex/rgb/hsl literal, wherever it appears (case-insensitive so that
  // `RGB(...)` is not missed).
  const token = /#[0-9a-f]{3,8}\b|rgba?\([^)]+\)|hsla?\([^)]+\)/gi;
  for (const [literal] of css.matchAll(token)) {
    add(normalizeColor(literal), 'declaration');
  }

  const rank = { 'css-var': 0, 'theme-color': 1, 'declaration': 2 };
  return [...tally.entries()]
    .map(([value, info]) => ({
      value,
      count: info.count,
      // Report the most trustworthy source the color was seen in.
      source: [...info.sources].sort((a, b) => rank[a] - rank[b])[0],
    }))
    .sort((a, b) => rank[a.source] - rank[b.source] || b.count - a.count)
    .slice(0, 24);
}
|
|
222
|
+
|
|
223
|
+
/**
 * Extract typography information from CSS text and webfont <link> tags.
 *
 * - `font-family` declarations are split on commas; generic/keyword families
 *   are reported separately from named typefaces, which are counted.
 * - `@font-face` blocks yield `{ family, src }` records (src capped at 300 chars).
 * - Google Fonts links contribute every family they request: both the legacy
 *   `family=A|B` form and the css2 form with repeated `family=` parameters.
 *   A family parameter with a malformed percent-escape is skipped rather than
 *   aborting the whole extraction.
 *
 * @param {string} cssText - Concatenated CSS source.
 * @param {Function} $ - Loaded cheerio-style selector function.
 * @returns {{ fonts: Array<{family: string, count: number}>, genericFallbacks: string[], webfontProviders: string[], fontFaces: Array<{family: string, src: string|null}> }}
 */
function extractFonts(cssText, $) {
  const css = String(cssText ?? '');
  const families = new Map();
  const generics = new Set();
  const fontFaces = [];

  const unquote = (s) => s.trim().replace(/^['"]|['"]$/g, '');
  const bump = (family) => families.set(family, (families.get(family) || 0) + 1);

  for (const [, list] of css.matchAll(/font-family\s*:\s*([^;}{]+)/gi)) {
    for (const family of list.split(',').map(unquote).filter(Boolean)) {
      const lower = family.toLowerCase();
      if (GENERIC_FAMILIES.has(lower)) generics.add(lower);
      else bump(family);
    }
  }

  for (const [, block] of css.matchAll(/@font-face\s*\{([^}]*)\}/gi)) {
    const fam = (block.match(/font-family\s*:\s*([^;]+)/i) || [])[1];
    const src = (block.match(/src\s*:\s*([^;]+)/i) || [])[1];
    if (fam) fontFaces.push({ family: unquote(fam), src: src ? src.trim().slice(0, 300) : null });
  }

  // webfont providers
  const providers = new Set();
  $('link[href*="fonts.googleapis.com"], link[href*="fonts.gstatic.com"]').each((_, el) => {
    providers.add('google-fonts');
    const href = $(el).attr('href') || '';
    // css2 URLs repeat the parameter (`family=A&family=B`), so read them all.
    for (const [, encoded] of href.matchAll(/[?&]family=([^&#]+)/g)) {
      let decoded;
      try {
        decoded = decodeURIComponent(encoded);
      } catch {
        continue; // malformed escape: ignore this parameter only
      }
      for (const entry of decoded.split('|')) {
        // Strip the `:wght@...` / `:400,700` axis suffix and '+' word separators.
        const name = entry.split(':')[0].replace(/\+/g, ' ').trim();
        if (name && !GENERIC_FAMILIES.has(name.toLowerCase())) bump(name);
      }
    }
  });
  $('link[href*="use.typekit"], link[href*="typekit.net"]').each(() => {
    providers.add('adobe-fonts');
  });

  return {
    fonts: [...families.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 12)
      .map(([family, count]) => ({ family, count })),
    genericFallbacks: [...generics],
    webfontProviders: [...providers],
    fontFaces: fontFaces.slice(0, 12),
  };
}
|
|
269
|
+
|
|
270
|
+
/**
 * Collect logo-related assets from the document: favicon links, the social
 * preview image, likely logo <img> elements and the first inline SVG found
 * inside a <header>.
 *
 * @param {Function} $ - Loaded cheerio-style selector function.
 * @param {string} pageUrl - Base URL used to absolutize asset URLs.
 * @returns {{ favicons: object[], ogImage: string|null, candidates: object[], inlineHeaderSvg: string|null }}
 *   Favicons and candidates are capped at 8 entries each; candidates are
 *   unique by resolved `src`; the inline SVG markup is capped at 2000 chars.
 */
function extractLogo($, pageUrl) {
  const attrOrNull = (el, name) => $(el).attr(name) || null;

  const favicons = [];
  $('link[rel~="icon"], link[rel="shortcut icon"], link[rel="apple-touch-icon"], link[rel="mask-icon"]').each((_, el) => {
    const href = $(el).attr('href');
    if (!href) return;
    favicons.push({
      href: resolveUrl(href, pageUrl),
      rel: attrOrNull(el, 'rel'),
      sizes: attrOrNull(el, 'sizes'),
      type: attrOrNull(el, 'type'),
    });
  });

  // Open Graph image first, Twitter card image as the fallback.
  const socialImage =
    $('meta[property="og:image"]').attr('content') ||
    $('meta[name="twitter:image"]').attr('content') ||
    null;

  // Keyed by resolved src so the first occurrence of each image is kept.
  const bySrc = new Map();
  $('header img, a[href="/"] img, img[alt*="logo" i], img[class*="logo" i], [class*="logo" i] img').each((_, el) => {
    const rawSrc = $(el).attr('src') || $(el).attr('data-src');
    if (!rawSrc) return;
    const src = resolveUrl(rawSrc, pageUrl);
    if (!bySrc.has(src)) bySrc.set(src, { src, alt: attrOrNull(el, 'alt'), where: 'logo-candidate' });
  });

  let inlineHeaderSvg = null;
  const headerSvg = $('header svg').first();
  if (headerSvg.length) {
    try {
      inlineHeaderSvg = $.html(headerSvg).slice(0, 2000);
    } catch {
      /* ignore */
    }
  }

  return {
    favicons: favicons.slice(0, 8),
    ogImage: socialImage ? resolveUrl(socialImage, pageUrl) : null,
    candidates: [...bySrc.values()].slice(0, 8),
    inlineHeaderSvg,
  };
}
|
|
302
|
+
|
|
303
|
+
/**
 * Extract non-color design tokens: the most frequent border-radius and
 * box-shadow values, plus custom properties whose name suggests spacing,
 * radius or sizing.
 *
 * @param {string} cssText - Concatenated CSS source.
 * @param {Object<string, string>} cssVariables - Map of `--name` to raw value.
 * @returns {{ radii: Array<{value: string, count: number}>, shadows: Array<{value: string, count: number}>, spacingVariables: Object<string, string> }}
 *   `radii` and `shadows` hold at most 8 entries each, most frequent first.
 */
function extractTokens(cssText, cssVariables) {
  const css = String(cssText);

  // Tally the captured value of every match, ignoring var() indirections
  // (their concrete value is not known statically).
  const topValues = (pattern) => {
    const tally = new Map();
    for (const match of css.matchAll(pattern)) {
      const value = match[1].trim();
      if (!value || value.includes('var(')) continue;
      tally.set(value, (tally.get(value) || 0) + 1);
    }
    return [...tally.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, 8)
      .map(([value, count]) => ({ value, count }));
  };

  const spacingName = /(space|spacing|gap|radius|size)/i;
  const spacingVariables = Object.fromEntries(
    Object.entries(cssVariables).filter(([name]) => spacingName.test(name)),
  );

  const radii = topValues(/border-radius\s*:\s*([^;}{]+)/gi);
  const shadows = topValues(/box-shadow\s*:\s*([^;}{]+)/gi);
  return { radii, shadows, spacingVariables };
}
|
|
325
|
+
|
|
326
|
+
/**
 * Produce the complete branding report for a page that has already been
 * loaded into cheerio.
 *
 * CSS is collected first; each extractor then runs in isolation, so a failure
 * in one of them is recorded as a warning and replaced by an empty fallback
 * instead of rejecting the whole call.
 *
 * @param {import('cheerio').CheerioAPI} $
 * @param {string} pageUrl
 * @param {{ fetchLinkedCss?: boolean, maxStylesheets?: number, perFileTimeoutMs?: number }} [opts]
 * @returns {Promise<object>} Branding object; `warnings` is undefined when
 *   nothing went wrong.
 */
export async function extractBranding($, pageUrl, opts = {}) {
  const warnings = [];
  const options = { fetchLinkedCss: true, maxStylesheets: 10, perFileTimeoutMs: 8000, ...opts };

  let sources = { cssText: '', fetchedUrls: [], styleBlocks: 0, inlineStyleEls: 0, warnings: [] };
  try {
    sources = await collectCssSources($, pageUrl, options);
    warnings.push(...sources.warnings);
  } catch (err) {
    warnings.push(`branding: CSS collection failed — ${err.message}`);
  }

  // Run one extractor; on failure log a warning and fall back to `fallback`.
  function attempt(label, fallback, fn) {
    try {
      return fn();
    } catch (err) {
      warnings.push(`branding: ${label} failed — ${err.message}`);
      return fallback;
    }
  }

  const cssVariables = attempt('css-variables', {}, () => extractCssVariables(sources.cssText));
  const colors = attempt('colors', [], () => extractColors(sources.cssText, $, cssVariables));
  const fontInfo = attempt(
    'fonts',
    { fonts: [], genericFallbacks: [], webfontProviders: [], fontFaces: [] },
    () => extractFonts(sources.cssText, $),
  );
  const logo = attempt(
    'logo',
    { favicons: [], ogImage: null, candidates: [], inlineHeaderSvg: null },
    () => extractLogo($, pageUrl),
  );
  const tokens = attempt(
    'tokens',
    { radii: [], shadows: [], spacingVariables: {} },
    () => extractTokens(sources.cssText, cssVariables),
  );

  const { fonts, genericFallbacks, webfontProviders, fontFaces } = fontInfo;
  return {
    colors,
    fonts,
    genericFallbacks,
    webfontProviders,
    fontFaces,
    logo,
    cssVariables,
    radii: tokens.radii,
    shadows: tokens.shadows,
    spacing: tokens.spacingVariables,
    sources: {
      styleBlocks: sources.styleBlocks,
      inlineStyleEls: sources.inlineStyleEls,
      linkedStylesheetsFetched: sources.fetchedUrls,
    },
    notes: [
      'Static extraction from HTML + linked/inline CSS. Computed/cascaded colors and JS-injected styles require browser rendering and are not reflected here.',
    ],
    warnings: warnings.length ? warnings : undefined,
  };
}
|
|
377
|
+
|
|
378
|
+
export default extractBranding;
|
|
@@ -26,7 +26,7 @@ const JsonFormatSchema = z.object({
|
|
|
26
26
|
});
|
|
27
27
|
|
|
28
28
|
const FormatSchema = z.union([
|
|
29
|
-
z.enum(['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot']),
|
|
29
|
+
z.enum(['markdown', 'html', 'rawHtml', 'text', 'links', 'metadata', 'screenshot', 'branding']),
|
|
30
30
|
JsonFormatSchema
|
|
31
31
|
]);
|
|
32
32
|
|
|
@@ -35,7 +35,17 @@ export const UnifiedScrapeSchema = z.object({
|
|
|
35
35
|
formats: z.array(FormatSchema).min(1).default(['markdown']),
|
|
36
36
|
onlyMainContent: z.boolean().optional().default(true),
|
|
37
37
|
// Pass-through to fetchAndParse
|
|
38
|
-
timeoutMs: z.number().min(1000).max(60000).optional().default(15000)
|
|
38
|
+
timeoutMs: z.number().min(1000).max(60000).optional().default(15000),
|
|
39
|
+
// Optional, additive: only consulted when 'branding' / 'screenshot' is requested.
|
|
40
|
+
brandingOptions: z.object({
|
|
41
|
+
fetchLinkedCss: z.boolean().optional().default(true),
|
|
42
|
+
maxStylesheets: z.number().min(0).max(20).optional().default(10)
|
|
43
|
+
}).optional(),
|
|
44
|
+
screenshotOptions: z.object({
|
|
45
|
+
fullPage: z.boolean().optional().default(false),
|
|
46
|
+
format: z.enum(['png', 'jpeg']).optional().default('png'),
|
|
47
|
+
quality: z.number().min(0).max(100).optional()
|
|
48
|
+
}).optional()
|
|
39
49
|
});
|
|
40
50
|
|
|
41
51
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
@@ -155,6 +165,9 @@ export class UnifiedScrapeTool {
|
|
|
155
165
|
constructor(options = {}) {
|
|
156
166
|
this._extractWithLlm = null;
|
|
157
167
|
this._extractWithLlmConfig = options.llmConfig || {};
|
|
168
|
+
// Optional shared ActionExecutor (injected from server.js so we reuse the
|
|
169
|
+
// existing browser pool rather than spinning up a second one).
|
|
170
|
+
this._actionExecutor = options.actionExecutor || null;
|
|
158
171
|
}
|
|
159
172
|
|
|
160
173
|
/** Lazy-load ExtractWithLlm to avoid pulling in heavy deps unless needed. */
|
|
@@ -166,6 +179,15 @@ export class UnifiedScrapeTool {
|
|
|
166
179
|
return this._extractWithLlm;
|
|
167
180
|
}
|
|
168
181
|
|
|
182
|
+
/** Lazy-load an ActionExecutor only when a screenshot is actually requested. */
|
|
183
|
+
async _getActionExecutor() {
|
|
184
|
+
if (!this._actionExecutor) {
|
|
185
|
+
const { default: ActionExecutor } = await import('../../core/ActionExecutor.js');
|
|
186
|
+
this._actionExecutor = new ActionExecutor({ enableLogging: false });
|
|
187
|
+
}
|
|
188
|
+
return this._actionExecutor;
|
|
189
|
+
}
|
|
190
|
+
|
|
169
191
|
/**
|
|
170
192
|
* Execute a unified scrape.
|
|
171
193
|
* @param {object} params - UnifiedScrapeSchema-compatible input
|
|
@@ -173,7 +195,7 @@ export class UnifiedScrapeTool {
|
|
|
173
195
|
*/
|
|
174
196
|
async execute(params) {
|
|
175
197
|
const validated = UnifiedScrapeSchema.parse(params);
|
|
176
|
-
const { url, formats, onlyMainContent, timeoutMs } = validated;
|
|
198
|
+
const { url, formats, onlyMainContent, timeoutMs, brandingOptions, screenshotOptions } = validated;
|
|
177
199
|
|
|
178
200
|
// Single fetch
|
|
179
201
|
let html, $, finalUrl;
|
|
@@ -291,10 +313,48 @@ export class UnifiedScrapeTool {
|
|
|
291
313
|
}
|
|
292
314
|
break;
|
|
293
315
|
|
|
316
|
+
case 'branding':
|
|
317
|
+
try {
|
|
318
|
+
const { extractBranding } = await import('./_brandingExtractor.js');
|
|
319
|
+
const branding = await extractBranding($, finalUrl, {
|
|
320
|
+
fetchLinkedCss: brandingOptions?.fetchLinkedCss ?? true,
|
|
321
|
+
maxStylesheets: brandingOptions?.maxStylesheets ?? 10
|
|
322
|
+
});
|
|
323
|
+
if (Array.isArray(branding.warnings)) {
|
|
324
|
+
warnings.push(...branding.warnings);
|
|
325
|
+
delete branding.warnings;
|
|
326
|
+
}
|
|
327
|
+
content.branding = branding;
|
|
328
|
+
} catch (err) {
|
|
329
|
+
content.branding = {};
|
|
330
|
+
warnings.push(`branding: ${err.message}`);
|
|
331
|
+
}
|
|
332
|
+
break;
|
|
333
|
+
|
|
294
334
|
case 'screenshot':
|
|
295
|
-
//
|
|
296
|
-
|
|
297
|
-
|
|
335
|
+
// Opt-in browser path: only launched when 'screenshot' is requested.
|
|
336
|
+
try {
|
|
337
|
+
const exec = await this._getActionExecutor();
|
|
338
|
+
const r = await exec.executeActionChain(
|
|
339
|
+
finalUrl,
|
|
340
|
+
{
|
|
341
|
+
actions: [{
|
|
342
|
+
type: 'screenshot',
|
|
343
|
+
fullPage: screenshotOptions?.fullPage ?? false,
|
|
344
|
+
format: screenshotOptions?.format ?? 'png',
|
|
345
|
+
...(screenshotOptions?.quality != null ? { quality: screenshotOptions.quality } : {})
|
|
346
|
+
}]
|
|
347
|
+
},
|
|
348
|
+
{ headless: true, timeout: 30000 }
|
|
349
|
+
);
|
|
350
|
+
content.screenshots = Array.isArray(r?.screenshots) ? r.screenshots : [];
|
|
351
|
+
if (content.screenshots.length === 0) {
|
|
352
|
+
warnings.push('screenshot: capture produced no image');
|
|
353
|
+
}
|
|
354
|
+
} catch (err) {
|
|
355
|
+
content.screenshots = [];
|
|
356
|
+
warnings.push(`screenshot: ${err.message}`);
|
|
357
|
+
}
|
|
298
358
|
break;
|
|
299
359
|
|
|
300
360
|
default:
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
9
|
import { TemplateRegistry } from './TemplateRegistry.js';
|
|
10
|
+
import { safeFetch } from '../../utils/ssrfGuard.js';
|
|
10
11
|
|
|
11
12
|
export class ScrapeTemplateTool {
|
|
12
13
|
constructor() {
|
|
@@ -39,7 +40,7 @@ export class ScrapeTemplateTool {
|
|
|
39
40
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
40
41
|
let html;
|
|
41
42
|
try {
|
|
42
|
-
const response = await
|
|
43
|
+
const response = await safeFetch(url, {
|
|
43
44
|
signal: controller.signal,
|
|
44
45
|
headers: {
|
|
45
46
|
'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
* URL content fetching and history/stat helper functions.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
import { safeFetch } from '../../../utils/ssrfGuard.js';
|
|
7
|
+
|
|
6
8
|
/**
|
|
7
9
|
* Default Jaccard similarity threshold below which a change is considered
|
|
8
10
|
* meaningful (i.e. worth flagging). 0.85 means content must be at least 85 %
|
|
@@ -42,7 +44,7 @@ export function calculateSimilarity(text1, text2) {
|
|
|
42
44
|
*/
|
|
43
45
|
export async function fetchContent(url) {
|
|
44
46
|
try {
|
|
45
|
-
const response = await
|
|
47
|
+
const response = await safeFetch(url, {
|
|
46
48
|
headers: {
|
|
47
49
|
'User-Agent': 'MCP-WebScraper-ChangeTracker/3.0',
|
|
48
50
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
@@ -17,6 +17,8 @@ import { EventEmitter } from 'events';
|
|
|
17
17
|
import ChangeTracker from '../../../core/ChangeTracker.js';
|
|
18
18
|
import SnapshotManager from '../../../core/SnapshotManager.js';
|
|
19
19
|
import CacheManager from '../../../core/cache/CacheManager.js';
|
|
20
|
+
import { MonitorStore } from '../../../core/MonitorStore.js';
|
|
21
|
+
import { MonitorScheduler } from '../../../core/MonitorScheduler.js';
|
|
20
22
|
import { TrackChangesSchema } from './schema.js';
|
|
21
23
|
import { fetchContent, mergeHistoryData, matchesSignificanceFilter, calculateAverageInterval, calculateSignificanceDistribution } from './differ.js';
|
|
22
24
|
import { performMonitoringCheck, stopMonitor } from './monitor.js';
|
|
@@ -56,9 +58,44 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
56
58
|
this.activeMonitors = new Map();
|
|
57
59
|
this.monitorStats = new Map();
|
|
58
60
|
|
|
61
|
+
// Scheduled-monitor subsystem (timers are NOT started here — only the
|
|
62
|
+
// single server-owned instance calls startScheduler()).
|
|
63
|
+
this._mcpServer = null;
|
|
64
|
+
this.monitorStore = new MonitorStore({ storageDir: this.options.monitorStorageDir || './monitors' });
|
|
65
|
+
this.scheduler = new MonitorScheduler({ tool: this, store: this.monitorStore });
|
|
66
|
+
|
|
59
67
|
this.initialize();
|
|
60
68
|
}
|
|
61
69
|
|
|
70
|
+
/** Wire the MCP server so the goal-judge can use SamplingClient (Ollama-first). */
|
|
71
|
+
setMcpServer(server) {
|
|
72
|
+
this._mcpServer = server;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/** Start the in-process scheduler (called once, by the server). */
|
|
76
|
+
async startScheduler() {
|
|
77
|
+
if (this._mcpServer && !this.scheduler.samplingClient) {
|
|
78
|
+
try {
|
|
79
|
+
const { SamplingClient } = await import('../../../core/SamplingClient.js');
|
|
80
|
+
this.scheduler.samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
|
|
81
|
+
} catch {
|
|
82
|
+
/* goal-judge will degrade to threshold mode */
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
await this.scheduler.start();
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** Fire every due monitor once and exit (the external-cron one-shot path). */
|
|
89
|
+
async runDueOnce() {
|
|
90
|
+
if (this._mcpServer && !this.scheduler.samplingClient) {
|
|
91
|
+
try {
|
|
92
|
+
const { SamplingClient } = await import('../../../core/SamplingClient.js');
|
|
93
|
+
this.scheduler.samplingClient = new SamplingClient({ mcpServer: this._mcpServer });
|
|
94
|
+
} catch { /* degrade */ }
|
|
95
|
+
}
|
|
96
|
+
return this.scheduler.runDueOnce();
|
|
97
|
+
}
|
|
98
|
+
|
|
62
99
|
async initialize() {
|
|
63
100
|
try {
|
|
64
101
|
await this.snapshotManager.initialize();
|
|
@@ -103,6 +140,7 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
103
140
|
case 'get_stats': return await this.getStatistics(validated);
|
|
104
141
|
case 'create_scheduled_monitor':return await this.createScheduledMonitor(validated);
|
|
105
142
|
case 'stop_scheduled_monitor': return await this.stopScheduledMonitor(validated);
|
|
143
|
+
case 'list_scheduled_monitors': return await this.listScheduledMonitors(validated);
|
|
106
144
|
case 'get_dashboard': return await this.getMonitoringDashboard(validated);
|
|
107
145
|
case 'export_history': return await this.exportHistoricalData(validated);
|
|
108
146
|
case 'create_alert_rule': return await this.createAlertRule(validated);
|
|
@@ -281,32 +319,37 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
281
319
|
}
|
|
282
320
|
|
|
283
321
|
async createScheduledMonitor(params) {
|
|
284
|
-
const { url, scheduledMonitorOptions, trackingOptions, notificationOptions } = params;
|
|
285
|
-
|
|
286
|
-
const
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
322
|
+
const { url, scheduledMonitorOptions, monitoringOptions, trackingOptions, notificationOptions } = params;
|
|
323
|
+
if (!url) throw new Error('create_scheduled_monitor requires a url');
|
|
324
|
+
const opts = scheduledMonitorOptions || {};
|
|
325
|
+
const monitor = await this.scheduler.createMonitor({
|
|
326
|
+
url,
|
|
327
|
+
interval: opts.interval ?? monitoringOptions?.interval,
|
|
328
|
+
schedule: opts.schedule,
|
|
329
|
+
goal: opts.goal,
|
|
330
|
+
notificationThreshold: opts.notificationThreshold || monitoringOptions?.notificationThreshold || 'moderate',
|
|
331
|
+
trackingOptions,
|
|
332
|
+
notificationOptions
|
|
294
333
|
});
|
|
295
|
-
return { success: true, operation: 'create_scheduled_monitor', url, monitor
|
|
334
|
+
return { success: true, operation: 'create_scheduled_monitor', url, monitor, firingGuarantee: monitor.firingGuarantee, timestamp: Date.now() };
|
|
296
335
|
}
|
|
297
336
|
|
|
298
337
|
async stopScheduledMonitor(params) {
|
|
299
|
-
const { url } = params;
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
monitor.status = 'stopped';
|
|
305
|
-
this.changeTracker.scheduledMonitors.delete(id);
|
|
306
|
-
stoppedMonitors++;
|
|
307
|
-
}
|
|
338
|
+
const { url, scheduledMonitorOptions } = params;
|
|
339
|
+
const monitorId = scheduledMonitorOptions?.monitorId;
|
|
340
|
+
if (monitorId) {
|
|
341
|
+
const result = await this.scheduler.stopMonitor(monitorId);
|
|
342
|
+
return { success: true, operation: 'stop_scheduled_monitor', monitorId, stopped: result.stopped, timestamp: Date.now() };
|
|
308
343
|
}
|
|
309
|
-
|
|
344
|
+
if (!url) throw new Error('stop_scheduled_monitor requires a url or scheduledMonitorOptions.monitorId');
|
|
345
|
+
const result = await this.scheduler.stopByUrl(url);
|
|
346
|
+
return { success: true, operation: 'stop_scheduled_monitor', url, stoppedMonitors: result.stopped, timestamp: Date.now() };
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
async listScheduledMonitors() {
|
|
350
|
+
if (!this.monitorStore._loaded) await this.monitorStore.load();
|
|
351
|
+
const monitors = this.scheduler.list();
|
|
352
|
+
return { success: true, operation: 'list_scheduled_monitors', monitors, count: monitors.length, timestamp: Date.now() };
|
|
310
353
|
}
|
|
311
354
|
|
|
312
355
|
async getMonitoringDashboard(params) {
|
|
@@ -379,11 +422,21 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
379
422
|
|
|
380
423
|
async shutdown() {
|
|
381
424
|
this.stopAllMonitoring();
|
|
425
|
+
this.scheduler?.stopAll();
|
|
382
426
|
await this.snapshotManager.shutdown();
|
|
383
427
|
await this.changeTracker.cleanup();
|
|
384
428
|
this.emit('shutdown');
|
|
385
429
|
}
|
|
386
430
|
|
|
431
|
+
/**
|
|
432
|
+
* Alias so the server's graceful-shutdown sweep (which filters tools by a
|
|
433
|
+
* `destroy`/`cleanup` method) actually tears this tool down — including the
|
|
434
|
+
* scheduler timers. Without this, scheduled-monitor intervals would leak.
|
|
435
|
+
*/
|
|
436
|
+
async cleanup() {
|
|
437
|
+
return this.shutdown();
|
|
438
|
+
}
|
|
439
|
+
|
|
387
440
|
// ── Private helpers ────────────────────────────────────────────────────────────
|
|
388
441
|
|
|
389
442
|
_getAggregatedMonitoringStats() {
|