crawlforge-mcp-server 3.0.17 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CLAUDE.md +2 -0
  2. package/README.md +1 -0
  3. package/package.json +6 -2
  4. package/server.js +192 -1277
  5. package/src/constants/config.js +2 -1
  6. package/src/core/ActionExecutor.js +2 -43
  7. package/src/core/AuthManager.js +230 -32
  8. package/src/core/BrowserContextPool.js +187 -0
  9. package/src/core/JobManager.js +7 -5
  10. package/src/core/LocalizationManager.js +14 -125
  11. package/src/core/ResearchOrchestrator.js +86 -5
  12. package/src/core/StealthBrowserManager.js +26 -18
  13. package/src/core/cache/CacheManager.js +4 -1
  14. package/src/core/crawlers/BFSCrawler.js +19 -5
  15. package/src/core/endpointGuard.js +37 -0
  16. package/src/observability/metrics.js +137 -0
  17. package/src/observability/tracing.js +74 -0
  18. package/src/server/auth/oauth.js +388 -0
  19. package/src/server/registerTool.js +41 -0
  20. package/src/server/schemas/common.js +29 -0
  21. package/src/server/transports/http.js +22 -0
  22. package/src/server/transports/stdio.js +16 -0
  23. package/src/server/transports/streamableHttp.js +226 -0
  24. package/src/server/withAuth.js +121 -0
  25. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  26. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  27. package/src/tools/advanced/batchScrape/index.js +328 -0
  28. package/src/tools/advanced/batchScrape/queue.js +91 -0
  29. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  30. package/src/tools/advanced/batchScrape/schema.js +37 -0
  31. package/src/tools/advanced/batchScrape/worker.js +179 -0
  32. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  33. package/src/tools/basic/_fetch.js +35 -0
  34. package/src/tools/basic/extractLinks.js +74 -0
  35. package/src/tools/basic/extractMetadata.js +74 -0
  36. package/src/tools/basic/extractText.js +46 -0
  37. package/src/tools/basic/fetchUrl.js +44 -0
  38. package/src/tools/basic/scrapeStructured.js +58 -0
  39. package/src/tools/crawl/_sessionContext.js +234 -0
  40. package/src/tools/crawl/crawlDeep.js +55 -5
  41. package/src/tools/crawl/mapSite.js +23 -2
  42. package/src/tools/extract/_fetchAndParse.js +57 -0
  43. package/src/tools/extract/extractStructured.js +3 -19
  44. package/src/tools/extract/extractWithLlm.js +295 -0
  45. package/src/tools/research/deepResearch.js +33 -8
  46. package/src/tools/search/providers/searxng.js +126 -0
  47. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  48. package/src/tools/search/ranking/ResultRanker.js +17 -10
  49. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  50. package/src/tools/search/searchWeb.js +112 -6
  51. package/src/tools/tracking/trackChanges/differ.js +98 -0
  52. package/src/tools/tracking/trackChanges/index.js +432 -0
  53. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  54. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  55. package/src/tools/tracking/trackChanges/schema.js +127 -0
  56. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -0,0 +1,234 @@
1
+ /**
2
+ * _sessionContext.js
3
+ *
4
+ * Lightweight in-memory cookie jar for crawl session reuse.
5
+ * Zero external runtime dependencies — Set-Cookie headers are parsed
6
+ * with a minimal hand-rolled implementation that handles the attributes
7
+ * needed for a single-host crawl session (name, value, path, domain,
8
+ * secure, httponly, max-age, expires).
9
+ *
10
+ * Rationale for not using set-cookie-parser / tough-cookie:
11
+ * - We only need same-origin cookie persistence within one crawl run.
12
+ * - The crawl never spans multiple registered domains in a way that
13
+ * requires full RFC 6265 compliance (partitioned jars, public suffix
14
+ * list, etc.).
15
+ * - Keeping zero new runtime deps satisfies the project constraint.
16
+ */
17
+
18
/**
 * Parse a single Set-Cookie header value into a cookie object.
 *
 * Returns null when the header is empty, has no `name=value` pair, the
 * request URL cannot be parsed, or the cookie's Domain attribute does not
 * cover the host that sent it (a response must not be able to plant
 * cookies for an unrelated domain — RFC 6265 §5.3 step 6).
 *
 * Attribute handling:
 *  - Domain:  leading dot stripped, lowercased; an empty value is ignored so
 *             the request host stays the default.
 *  - Path:    used only when it starts with '/'; otherwise defaults to '/'.
 *  - Secure:  flag, any value.
 *  - Max-Age: must be an integer; <= 0 yields `expires = 0` (delete).
 *  - Expires: parsed with Date.parse; Max-Age wins when both are present,
 *             regardless of attribute order.
 *  - HttpOnly and anything else: ignored (not relevant server-side).
 *
 * @param {string} header - Raw Set-Cookie header value
 * @param {string} requestUrl - URL that issued the Set-Cookie response
 * @returns {{ name: string, value: string, domain: string, path: string,
 *             secure: boolean, expires: number|null }|null}
 *          `expires` is an epoch-millisecond timestamp, or null for a
 *          session cookie.
 */
function parseSetCookie(header, requestUrl) {
  if (!header) return null;

  const parts = header.split(';').map(s => s.trim());

  // First segment is the mandatory name=value pair.
  const pair = parts[0];
  const eqIdx = pair.indexOf('=');
  if (eqIdx === -1) return null;

  const name = pair.slice(0, eqIdx).trim();
  const value = pair.slice(eqIdx + 1).trim();
  if (!name) return null;

  let requestHost;
  try {
    // URL.hostname is already lowercased for http(s) URLs.
    requestHost = new URL(requestUrl).hostname;
  } catch {
    return null;
  }

  // Defaults derived from the request URL
  let domain = requestHost;
  let path = '/';
  let secure = false;
  let maxAgeExpiry = null;
  let expiresAttr = null;

  for (const part of parts.slice(1)) {
    const sep = part.indexOf('=');
    const attrName = (sep === -1 ? part : part.slice(0, sep)).trim().toLowerCase();
    const attrValue = sep === -1 ? '' : part.slice(sep + 1).trim();

    switch (attrName) {
      case 'domain': {
        // Hostnames are case-insensitive; store lowercase so later
        // comparisons against URL.hostname succeed.
        const candidate = attrValue.replace(/^\./, '').toLowerCase();
        if (candidate) domain = candidate;
        break;
      }
      case 'path':
        path = attrValue.startsWith('/') ? attrValue : '/';
        break;
      case 'secure':
        secure = true;
        break;
      case 'max-age':
        // Reject values such as "10abc" instead of letting parseInt
        // silently truncate them.
        if (/^-?\d+$/.test(attrValue)) {
          const maxAge = Number.parseInt(attrValue, 10);
          maxAgeExpiry = maxAge <= 0 ? 0 : Date.now() + maxAge * 1000;
        }
        break;
      case 'expires': {
        const ts = Date.parse(attrValue);
        if (!Number.isNaN(ts)) expiresAttr = ts;
        break;
      }
      default:
        // httponly, samesite, etc. are intentionally ignored.
        break;
    }
  }

  // The sending host must equal the cookie domain or be a subdomain of it.
  if (requestHost !== domain && !requestHost.endsWith(`.${domain}`)) {
    return null;
  }

  // max-age takes precedence over expires; null = session cookie.
  const expires = maxAgeExpiry ?? expiresAttr;

  return { name, value, domain, path, secure, expires };
}
80
+
81
/**
 * Determine whether a stored cookie should be sent for the given URL.
 *
 * Checks, in order: expiry, domain (exact host or subdomain, compared
 * case-insensitively), the Secure flag, and the path.
 *
 * Path matching follows RFC 6265 §5.1.4: the cookie path must equal the
 * request path, or be a prefix of it that ends on a '/' boundary. A plain
 * prefix test would wrongly send a cookie scoped to `/foo` to `/foobar`.
 *
 * @param {object} cookie - Stored cookie object
 *        ({ name, value, domain, path, secure, expires })
 * @param {URL} urlObj - Parsed URL of the outgoing request
 * @returns {boolean} true when the cookie applies to the request
 */
function cookieMatchesUrl(cookie, urlObj) {
  // Honour expiry (null = session cookie, never expires within the crawl)
  if (cookie.expires !== null && Date.now() > cookie.expires) return false;

  // Domain: exact match or subdomain match. URL.hostname is lowercase, so
  // normalise the stored domain the same way before comparing.
  const host = urlObj.hostname;
  const cookieDomain = cookie.domain.toLowerCase();
  if (host !== cookieDomain && !host.endsWith(`.${cookieDomain}`)) return false;

  // Secure cookies only travel over https
  if (cookie.secure && urlObj.protocol !== 'https:') return false;

  // Path: identical, or a prefix that stops at a path-segment boundary
  const reqPath = urlObj.pathname || '/';
  const cookiePath = cookie.path;
  if (reqPath === cookiePath) return true;
  if (!reqPath.startsWith(cookiePath)) return false;
  return cookiePath.endsWith('/') || reqPath.charAt(cookiePath.length) === '/';
}
105
+
106
/**
 * SessionContext — holds the shared cookie jar and custom headers for one
 * crawl session. Passed into BFSCrawler so every page fetch participates
 * in the same session.
 */
export class SessionContext {
  /**
   * @param {object} [options]
   * @param {boolean} [options.persistCookies=true] - When false,
   *        recordCookies() is a no-op and the jar stays empty.
   * @param {Record<string,string>} [options.headers={}] - Headers merged
   *        into every request built through applyToHeaders().
   */
  constructor(options = {}) {
    this.persistCookies = options.persistCookies !== false; // default true
    this.headers = options.headers || {};
    /** @type {Array<{name,value,domain,path,secure,expires}>} */
    this._jar = [];
  }

  /**
   * Record cookies from a fetch Response into the jar.
   *
   * Uses `headers.getSetCookie()` when the runtime provides it (one array
   * entry per Set-Cookie header). Otherwise falls back to
   * `headers.get('set-cookie')`, which yields a single string; if the
   * runtime coalesced several cookies into that string only the first is
   * parsed reliably.
   *
   * A cookie whose expiry is already in the past (max-age=0 or a past
   * Expires date) removes any stored cookie with the same
   * name + domain + path instead of being stored.
   *
   * @param {Response} response - Native fetch Response
   * @param {string} requestUrl - URL the response came from
   */
  recordCookies(response, requestUrl) {
    if (!this.persistCookies) return;

    const rawHeaders = typeof response.headers.getSetCookie === 'function'
      ? response.headers.getSetCookie()
      : [response.headers.get('set-cookie')].filter(Boolean);

    for (const raw of rawHeaders) {
      const cookie = parseSetCookie(raw, requestUrl);
      if (!cookie) continue;

      // Upsert key: name + domain + path
      const idx = this._jar.findIndex(
        c => c.name === cookie.name && c.domain === cookie.domain && c.path === cookie.path
      );
      const alreadyExpired = cookie.expires !== null && Date.now() > cookie.expires;

      if (alreadyExpired) {
        // Explicit deletion (max-age=0 or past expires)
        if (idx !== -1) this._jar.splice(idx, 1);
      } else if (idx !== -1) {
        this._jar[idx] = cookie;
      } else {
        this._jar.push(cookie);
      }
    }
  }

  /**
   * Build the `Cookie` header string for outgoing requests to the given URL.
   *
   * @param {string} url
   * @returns {string} Cookie header value (`a=1; b=2`), or empty string
   *          when the URL is invalid or no stored cookie applies
   */
  getCookieHeader(url) {
    let urlObj;
    try {
      urlObj = new URL(url);
    } catch {
      return '';
    }

    return this._jar
      .filter(c => cookieMatchesUrl(c, urlObj))
      .map(c => `${c.name}=${c.value}`)
      .join('; ');
  }

  /**
   * Merge session headers + cookie header into a headers object.
   * The caller's own headers take priority over session headers.
   *
   * When the jar contributes cookies, every existing cookie header —
   * whatever its key casing — is folded into a single `Cookie` entry.
   * Leaving both `cookie` and `Cookie` keys in the object would make fetch
   * combine them with ', ', which is not a valid Cookie header.
   *
   * @param {string} url
   * @param {Record<string,string>} baseHeaders - Headers already built by the caller
   * @returns {Record<string,string>} A new object; inputs are not mutated
   */
  applyToHeaders(url, baseHeaders) {
    const merged = { ...this.headers, ...baseHeaders };
    const cookieHeader = this.getCookieHeader(url);
    if (!cookieHeader) return merged;

    // Collect and remove any pre-existing cookie header(s), case-insensitively
    const existing = [];
    for (const key of Object.keys(merged)) {
      if (key.toLowerCase() === 'cookie') {
        if (merged[key]) existing.push(merged[key]);
        delete merged[key];
      }
    }

    // Append jar cookies after caller-supplied ones rather than clobber
    merged['Cookie'] = [...existing, cookieHeader].join('; ');
    return merged;
  }

  /**
   * Perform an optional "initial request" (e.g. a login POST) and capture
   * any cookies it sets into the jar. Returns the final response's status
   * and body text.
   *
   * Redirects are followed manually so that cookies set on intermediate
   * responses are recorded. A login endpoint typically answers
   * `302 + Set-Cookie`; with `redirect: 'follow'` fetch hides that response
   * and the session cookie would be lost. Each hop's cookies are attributed
   * to the URL that actually produced them.
   *
   * Redirect semantics mirror fetch: 303 (non-GET/HEAD) and 301/302 (POST)
   * switch to a body-less GET; 307/308 replay method and body. Caller
   * supplied credential headers are dropped when a redirect leaves the
   * current origin. Relies on the runtime exposing real 3xx responses for
   * `redirect: 'manual'`, as Node's fetch does.
   *
   * @param {{ url: string, method?: string, headers?: Record<string,string>, body?: string }} req
   * @returns {Promise<{ status: number, body: string }>}
   * @throws {Error} When more than 20 redirects are encountered, or when
   *         the underlying fetch rejects
   */
  async performInitialRequest(req) {
    const { url, method = 'GET', headers: extraHeaders = {}, body } = req;

    const MAX_REDIRECTS = 20; // same ceiling the fetch spec uses
    const REDIRECT_STATUSES = [301, 302, 303, 307, 308];
    const BODY_HEADERS = [
      'content-type', 'content-length', 'content-encoding',
      'content-language', 'content-location'
    ];
    const CREDENTIAL_HEADERS = ['authorization', 'proxy-authorization', 'cookie'];

    let currentUrl = url;
    let currentMethod = method;
    let currentBody = body;
    let currentExtraHeaders = extraHeaders;

    for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
      // Rebuilt on every hop so cookies captured so far are sent along
      const requestHeaders = this.applyToHeaders(currentUrl, {
        'User-Agent': 'MCP-WebScraper/1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        ...currentExtraHeaders
      });

      const fetchOpts = {
        method: currentMethod,
        headers: requestHeaders,
        redirect: 'manual'
      };

      if (currentBody) {
        fetchOpts.body = currentBody;
      }

      const response = await fetch(currentUrl, fetchOpts);
      this.recordCookies(response, currentUrl);

      let nextUrl = null;
      if (REDIRECT_STATUSES.includes(response.status)) {
        const location = response.headers.get('location');
        if (location) {
          try {
            nextUrl = new URL(location, currentUrl);
          } catch {
            // Unparseable Location: treat this response as the final one
            nextUrl = null;
          }
        }
      }

      if (!nextUrl) {
        const text = await response.text().catch(() => '');
        return { status: response.status, body: text };
      }

      // Release the redirect response's body; its content is not needed
      try {
        await response.body?.cancel();
      } catch {
        // Best effort only — a failed cancel must not abort the login flow
      }

      const upperMethod = String(currentMethod).toUpperCase();
      const switchToGet =
        (response.status === 303 && upperMethod !== 'GET' && upperMethod !== 'HEAD') ||
        ((response.status === 301 || response.status === 302) && upperMethod === 'POST');
      const crossOrigin = nextUrl.origin !== new URL(currentUrl).origin;

      if (switchToGet || crossOrigin) {
        currentExtraHeaders = Object.fromEntries(
          Object.entries(currentExtraHeaders).filter(([key]) => {
            const lower = key.toLowerCase();
            if (switchToGet && BODY_HEADERS.includes(lower)) return false;
            if (crossOrigin && CREDENTIAL_HEADERS.includes(lower)) return false;
            return true;
          })
        );
      }

      if (switchToGet) {
        currentMethod = 'GET';
        currentBody = undefined;
      }

      currentUrl = nextUrl.href;
    }

    throw new Error(`Too many redirects (more than ${MAX_REDIRECTS}) while requesting ${url}`);
  }

  /** Number of cookies currently held in the jar (for diagnostics). */
  get cookieCount() {
    return this._jar.length;
  }
}
@@ -1,6 +1,8 @@
1
1
  import { z } from 'zod';
2
2
  import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
3
3
  import { DomainFilter } from '../../utils/domainFilter.js';
4
+ import { CacheManager } from '../../core/cache/CacheManager.js';
5
+ import { SessionContext } from './_sessionContext.js';
4
6
 
5
7
  const CrawlDeepSchema = z.object({
6
8
  url: z.string().url(),
@@ -56,24 +58,48 @@ const CrawlDeepSchema = z.object({
56
58
  concurrency: z.number().optional()
57
59
  })).optional().default({})
58
60
  }).optional(),
59
- import_filter_config: z.string().optional() // JSON string of exported config
61
+ import_filter_config: z.string().optional(), // JSON string of exported config
62
+ // Session reuse: when enabled, all page fetches share a cookie jar and
63
+ // consistent headers — enabling login-then-crawl workflows.
64
+ session: z.object({
65
+ enabled: z.boolean(),
66
+ persistCookies: z.boolean().optional().default(true),
67
+ headers: z.record(z.string()).optional().default({}),
68
+ initialRequest: z.object({
69
+ url: z.string().url(),
70
+ method: z.string().optional().default('GET'),
71
+ headers: z.record(z.string()).optional().default({}),
72
+ body: z.string().optional()
73
+ }).optional()
74
+ }).optional()
60
75
  });
61
76
 
62
77
  export class CrawlDeepTool {
63
78
  constructor(options = {}) {
64
79
  const {
65
80
  userAgent = 'MCP-WebScraper/1.0',
66
- timeout = 30000
81
+ timeout = 30000,
82
+ cacheEnabled = true,
83
+ cacheTTL = 3600000
67
84
  } = options;
68
85
 
69
86
  this.userAgent = userAgent;
70
87
  this.timeout = timeout;
88
+ // Per-session result cache: avoids redundant crawls of the same root URL
89
+ this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
71
90
  }
72
91
 
73
92
  async execute(params) {
74
93
  try {
75
94
  const validated = CrawlDeepSchema.parse(params);
76
-
95
+
96
+ // Cache dedup: skip re-crawling the same root URL within the TTL window
97
+ if (this.cache) {
98
+ const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
99
+ const cached = await this.cache.get(cacheKey);
100
+ if (cached) return cached;
101
+ }
102
+
77
103
  // Create domain filter if configuration provided
78
104
  let domainFilter = null;
79
105
  if (validated.import_filter_config) {
@@ -117,6 +143,20 @@ export class CrawlDeepTool {
117
143
  }
118
144
  }
119
145
 
146
+ // Set up session context when requested
147
+ let sessionContext = null;
148
+ if (validated.session?.enabled) {
149
+ sessionContext = new SessionContext({
150
+ persistCookies: validated.session.persistCookies,
151
+ headers: validated.session.headers || {}
152
+ });
153
+
154
+ // Perform optional login / pre-crawl request
155
+ if (validated.session.initialRequest) {
156
+ await sessionContext.performInitialRequest(validated.session.initialRequest);
157
+ }
158
+ }
159
+
120
160
  // Create crawler instance
121
161
  const crawler = new BFSCrawler({
122
162
  maxDepth: validated.max_depth,
@@ -128,7 +168,8 @@ export class CrawlDeepTool {
128
168
  concurrency: validated.concurrency,
129
169
  domainFilter: domainFilter,
130
170
  enableLinkAnalysis: validated.enable_link_analysis,
131
- linkAnalyzerOptions: validated.link_analysis_options
171
+ linkAnalyzerOptions: validated.link_analysis_options,
172
+ sessionContext
132
173
  });
133
174
 
134
175
  // Start crawling
@@ -154,9 +195,18 @@ export class CrawlDeepTool {
154
195
  stats: results.stats,
155
196
  site_structure: this.analyzeSiteStructure(results.urls),
156
197
  domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
157
- link_analysis: results.linkAnalysis
198
+ link_analysis: results.linkAnalysis,
199
+ session: sessionContext
200
+ ? { enabled: true, cookies_captured: sessionContext.cookieCount }
201
+ : { enabled: false }
158
202
  };
159
203
 
204
+ // Store in cache before returning
205
+ if (this.cache) {
206
+ const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
207
+ await this.cache.set(cacheKey, response);
208
+ }
209
+
160
210
  return response;
161
211
  } catch (error) {
162
212
  throw new Error(`Crawl failed: ${error.message}`);
@@ -2,6 +2,7 @@ import { z } from 'zod';
2
2
  import { load } from 'cheerio';
3
3
  import { DomainFilter } from '../../utils/domainFilter.js';
4
4
  import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
5
+ import { CacheManager } from '../../core/cache/CacheManager.js';
5
6
 
6
7
  const MapSiteSchema = z.object({
7
8
  url: z.string().url(),
@@ -23,16 +24,28 @@ export class MapSiteTool {
23
24
  constructor(options = {}) {
24
25
  const {
25
26
  userAgent = 'MCP-WebScraper/1.0',
26
- timeout = 10000
27
+ timeout = 10000,
28
+ cacheEnabled = true,
29
+ cacheTTL = 3600000
27
30
  } = options;
28
31
 
29
32
  this.userAgent = userAgent;
30
33
  this.timeout = timeout;
34
+ // Per-session result cache: avoids redundant site maps for the same root URL
35
+ this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
31
36
  }
32
37
 
33
38
  async execute(params) {
34
39
  try {
35
40
  const validated = MapSiteSchema.parse(params);
41
+
42
+ // Cache dedup: skip re-mapping the same site within the TTL window
43
+ if (this.cache) {
44
+ const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
45
+ const cached = await this.cache.get(cacheKey);
46
+ if (cached) return cached;
47
+ }
48
+
36
49
  const baseUrl = getBaseUrl(validated.url);
37
50
  const urls = new Set();
38
51
  const metadata = new Map();
@@ -94,7 +107,7 @@ export class MapSiteTool {
94
107
  ? this.groupByPath(urlArray)
95
108
  : urlArray;
96
109
 
97
- return {
110
+ const result = {
98
111
  base_url: baseUrl,
99
112
  total_urls: urlArray.length,
100
113
  urls: organized,
@@ -104,6 +117,14 @@ export class MapSiteTool {
104
117
  domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
105
118
  filter_stats: domainFilter ? domainFilter.getStats() : null
106
119
  };
120
+
121
+ // Store in cache before returning
122
+ if (this.cache) {
123
+ const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
124
+ await this.cache.set(cacheKey, result);
125
+ }
126
+
127
+ return result;
107
128
  } catch (error) {
108
129
  throw new Error(`Site mapping failed: ${error.message}`);
109
130
  }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * _fetchAndParse.js — shared fetch + HTML parse helper for extract tools.
3
+ *
4
+ * Used by:
5
+ * extractStructured.js
6
+ * extractContent.js (uses native fetch directly but can adopt this)
7
+ * processDocument.js (URL sources)
8
+ *
9
+ * Returns { html, $, textContent, finalUrl } so callers don't repeat
10
+ * the same fetch/cheerio/cleanup boilerplate.
11
+ */
12
+
13
+ import { load } from 'cheerio';
14
+
15
+ const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0)';
16
+ const DEFAULT_TIMEOUT_MS = 15000;
17
+
18
/**
 * Download a page and hand back both the raw markup and a Cheerio handle.
 *
 * The request carries a User-Agent and an HTML-preferring Accept header and
 * is aborted after `timeoutMs`. Non-2xx responses raise an Error. After
 * parsing, every element matching `stripTags` is removed from the Cheerio
 * document, and the remaining body text is whitespace-collapsed.
 *
 * Note: `html` is the unmodified response text; only `$` reflects the
 * tag stripping.
 *
 * @param {string} url - Page to fetch
 * @param {Object} [options]
 * @param {string} [options.userAgent] - Overrides the module default User-Agent
 * @param {number} [options.timeoutMs] - Overrides the module default timeout (ms)
 * @param {string[]} [options.stripTags] - Tag names to remove; when given it
 *        REPLACES the default list (script, style, noscript, iframe, svg).
 *        Pass [] to strip nothing.
 * @returns {Promise<{ html: string, $: import('cheerio').CheerioAPI, textContent: string, finalUrl: string }>}
 * @throws {Error} `HTTP <status>: <statusText>` when the response is not ok
 */
export async function fetchAndParse(url, options = {}) {
  const userAgent = options.userAgent !== undefined ? options.userAgent : DEFAULT_USER_AGENT;
  const timeoutMs = options.timeoutMs !== undefined ? options.timeoutMs : DEFAULT_TIMEOUT_MS;
  const stripTags = options.stripTags !== undefined
    ? options.stripTags
    : ['script', 'style', 'noscript', 'iframe', 'svg'];

  const requestInit = {
    headers: {
      'User-Agent': userAgent,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    },
    signal: AbortSignal.timeout(timeoutMs)
  };

  const response = await fetch(url, requestInit);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }

  const html = await response.text();
  const $ = load(html);

  // Drop non-content elements before extracting text
  if (stripTags.length > 0) {
    const selector = stripTags.join(', ');
    $(selector).remove();
  }

  const rawBodyText = $('body').text();
  const textContent = rawBodyText.replace(/\s+/g, ' ').trim();

  return { html, $, textContent, finalUrl: response.url };
}
@@ -7,6 +7,7 @@
7
7
  import { z } from 'zod';
8
8
  import { load } from 'cheerio';
9
9
  import { LLMManager } from '../../core/llm/LLMManager.js';
10
+ import { fetchAndParse } from './_fetchAndParse.js';
10
11
 
11
12
  const ExtractStructuredSchema = z.object({
12
13
  url: z.string().url(),
@@ -73,25 +74,8 @@ export class ExtractStructuredTool {
73
74
  const validated = ExtractStructuredSchema.parse(params);
74
75
  const { url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints } = validated;
75
76
 
76
- // Step 1: Fetch URL
77
- const response = await fetch(url, {
78
- headers: {
79
- 'User-Agent': this.userAgent,
80
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
81
- },
82
- signal: AbortSignal.timeout(15000)
83
- });
84
-
85
- if (!response.ok) {
86
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
87
- }
88
-
89
- const html = await response.text();
90
-
91
- // Step 2: Parse HTML with Cheerio, strip scripts/styles
92
- const $ = load(html);
93
- $('script, style, noscript, iframe, svg').remove();
94
- const textContent = $('body').text().replace(/\s+/g, ' ').trim();
77
+ // Step 1: Fetch and parse — shared helper strips scripts/styles/iframes/svgs
78
+ const { html, $, textContent } = await fetchAndParse(url, { userAgent: this.userAgent });
95
79
 
96
80
  // Step 3: Try LLM extraction first
97
81
  let extractionResult = null;