crawlforge-mcp-server 3.0.18 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/server.js +192 -1277
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +127 -14
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +365 -0
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* _sessionContext.js
|
|
3
|
+
*
|
|
4
|
+
* Lightweight in-memory cookie jar for crawl session reuse.
|
|
5
|
+
* Zero external runtime dependencies — Set-Cookie headers are parsed
|
|
6
|
+
* with a minimal hand-rolled implementation that handles the attributes
|
|
7
|
+
* needed for a single-host crawl session (name, value, path, domain,
|
|
8
|
+
* secure, httponly, max-age, expires).
|
|
9
|
+
*
|
|
10
|
+
* Rationale for not using set-cookie-parser / tough-cookie:
|
|
11
|
+
* - We only need same-origin cookie persistence within one crawl run.
|
|
12
|
+
* - The crawl never spans multiple registered domains in a way that
|
|
13
|
+
* requires full RFC 6265 compliance (partitioned jars, public suffix
|
|
14
|
+
* list, etc.).
|
|
15
|
+
* - Keeping zero new runtime deps satisfies the project constraint.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
 * Parse a single Set-Cookie header value into a cookie object.
 *
 * Only the attributes needed for a crawl session are interpreted:
 * Domain, Path, Secure, Max-Age and Expires. HttpOnly and every other
 * attribute are ignored.
 *
 * Returns null when:
 *   - the header is empty or not a string,
 *   - the first segment is not a `name=value` pair or the name is empty,
 *   - `requestUrl` cannot be parsed as a URL,
 *   - the Domain attribute names a domain that the request host does not
 *     belong to (a response may not set cookies for unrelated hosts).
 *
 * @param {string} header - Raw Set-Cookie header value
 * @param {string} requestUrl - URL that issued the Set-Cookie response
 * @returns {{ name: string, value: string, domain: string, path: string,
 *             secure: boolean, expires: number|null }|null}
 */
function parseSetCookie(header, requestUrl) {
  if (!header || typeof header !== 'string') return null;

  const [pair, ...attributes] = header.split(';').map((segment) => segment.trim());
  const eqIdx = pair.indexOf('=');
  if (eqIdx === -1) return null;

  const name = pair.slice(0, eqIdx).trim();
  const value = pair.slice(eqIdx + 1).trim();
  if (!name) return null;

  let requestHost;
  try {
    // URL#hostname is already lower-cased by the URL parser.
    requestHost = new URL(requestUrl).hostname;
  } catch {
    return null;
  }

  // Defaults derived from the request URL
  let domain = requestHost;
  let path = '/';
  let secure = false;
  let expires = null; // null = session cookie (lives until crawl ends)

  for (const attribute of attributes) {
    const lower = attribute.toLowerCase();

    if (lower.startsWith('domain=')) {
      // Strip the leading dot and normalise case so the value compares
      // equal to URL hostnames. An empty Domain attribute is ignored and
      // the request host stays in effect.
      const attrDomain = attribute
        .slice('domain='.length)
        .trim()
        .replace(/^\./, '')
        .toLowerCase();
      if (attrDomain) domain = attrDomain;
    } else if (lower.startsWith('path=')) {
      // A Path that is empty or does not start with "/" can never match a
      // request path, so fall back to the default.
      const attrPath = attribute.slice('path='.length).trim();
      path = attrPath.startsWith('/') ? attrPath : '/';
    } else if (lower === 'secure') {
      secure = true;
    } else if (lower.startsWith('max-age=')) {
      const maxAge = Number.parseInt(attribute.slice('max-age='.length), 10);
      if (!Number.isNaN(maxAge)) {
        // Zero or negative Max-Age means "expire immediately".
        expires = maxAge <= 0 ? 0 : Date.now() + maxAge * 1000;
      }
    } else if (lower.startsWith('expires=')) {
      const ts = Date.parse(attribute.slice('expires='.length).trim());
      // Max-Age takes precedence over Expires: only use Expires while no
      // expiry has been set yet (a later Max-Age still overwrites it).
      if (!Number.isNaN(ts) && expires === null) {
        expires = ts;
      }
    }
    // httponly is intentionally ignored — not relevant for a server-side crawler
  }

  // Reject cookies scoped to a domain the responding host is not part of;
  // otherwise any crawled host could plant cookies for any other host.
  if (domain !== requestHost && !requestHost.endsWith(`.${domain}`)) {
    return null;
  }

  return { name, value, domain, path, secure, expires };
}
|
|
80
|
+
|
|
81
|
+
/**
 * Determine whether a stored cookie should be sent for the given URL.
 *
 * A cookie is sent only when all of the following hold:
 *   - it has not expired (`expires` is null or still in the future),
 *   - the URL host equals the cookie domain or is a subdomain of it,
 *   - the cookie is not Secure, or the URL uses https,
 *   - the request path equals the cookie path, or lies beneath it on a
 *     "/" boundary (so "/foo" matches "/foo/bar" but not "/foobar").
 *
 * @param {object} cookie - Stored cookie object
 * @param {URL} urlObj - Parsed URL of the outgoing request
 * @returns {boolean}
 */
function cookieMatchesUrl(cookie, urlObj) {
  // Honour expiry
  if (cookie.expires !== null && Date.now() > cookie.expires) return false;

  // Domain: exact match or subdomain match. URL hostnames are lower-case,
  // so normalise the stored domain before comparing.
  const host = urlObj.hostname;
  const cookieDomain = String(cookie.domain).toLowerCase();
  if (host !== cookieDomain && !host.endsWith(`.${cookieDomain}`)) return false;

  // Secure flag
  if (cookie.secure && urlObj.protocol !== 'https:') return false;

  // Path: a bare prefix test is not enough — the prefix must end on a
  // path-segment boundary, otherwise "/foo" would leak to "/foobar".
  const reqPath = urlObj.pathname || '/';
  const cookiePath = cookie.path;
  if (reqPath === cookiePath) return true;
  if (!reqPath.startsWith(cookiePath)) return false;
  return cookiePath.endsWith('/') || reqPath[cookiePath.length] === '/';
}
|
|
105
|
+
|
|
106
|
+
/**
 * SessionContext — holds the shared cookie jar and custom headers for one
 * crawl session. Passed into BFSCrawler so every page fetch participates
 * in the same session.
 */
export class SessionContext {
  /**
   * @param {object} [options]
   * @param {boolean} [options.persistCookies=true] - When false, recordCookies() is a no-op
   * @param {Record<string,string>} [options.headers={}] - Headers added to every session request
   */
  constructor(options = {}) {
    this.persistCookies = options.persistCookies !== false; // default true
    this.headers = options.headers || {};
    /** @type {Array<{name,value,domain,path,secure,expires}>} */
    this._jar = [];
  }

  /**
   * Record cookies from a fetch Response.
   *
   * Uses `headers.getSetCookie()` when the runtime provides it (one array
   * entry per Set-Cookie header). Otherwise falls back to
   * `headers.get('set-cookie')`, which yields a single string that is
   * parsed as one cookie.
   *
   * A cookie whose expiry is already in the past removes the stored cookie
   * with the same name + domain + path; any other cookie is inserted or
   * replaces the stored one.
   *
   * @param {Response} response - Native fetch Response
   * @param {string} requestUrl - URL the response came from
   */
  recordCookies(response, requestUrl) {
    if (!this.persistCookies) return;

    const rawHeaders = typeof response.headers.getSetCookie === 'function'
      ? response.headers.getSetCookie()
      : [response.headers.get('set-cookie')].filter(Boolean);

    for (const raw of rawHeaders) {
      const cookie = parseSetCookie(raw, requestUrl);
      if (!cookie) continue;

      // Upsert: replace any existing cookie with same name+domain+path
      const idx = this._jar.findIndex(
        (c) => c.name === cookie.name && c.domain === cookie.domain && c.path === cookie.path
      );
      const alreadyExpired = cookie.expires !== null && Date.now() > cookie.expires;

      if (alreadyExpired) {
        // Explicit deletion (max-age=0 or past expires)
        if (idx !== -1) this._jar.splice(idx, 1);
      } else if (idx !== -1) {
        this._jar[idx] = cookie;
      } else {
        this._jar.push(cookie);
      }
    }
  }

  /**
   * Build the `Cookie` header string for outgoing requests to the given URL.
   *
   * @param {string} url
   * @returns {string} Cookie header value, or empty string when the URL is
   *   unparseable or no stored cookie applies
   */
  getCookieHeader(url) {
    let urlObj;
    try {
      urlObj = new URL(url);
    } catch {
      return '';
    }

    return this._jar
      .filter((c) => cookieMatchesUrl(c, urlObj))
      .map((c) => `${c.name}=${c.value}`)
      .join('; ');
  }

  /**
   * Merge session headers + cookie header into a headers object.
   * The caller's own headers take priority over session headers.
   *
   * When jar cookies apply to `url`, they are appended to any Cookie header
   * already present (in any letter case) and the result is stored under the
   * single key `Cookie`.
   *
   * @param {string} url
   * @param {Record<string,string>} baseHeaders - Headers already built by the caller
   * @returns {Record<string,string>}
   */
  applyToHeaders(url, baseHeaders) {
    return this._appendJarCookies(url, { ...this.headers, ...baseHeaders });
  }

  /**
   * Append the jar cookies for `url` to `headers` (mutated and returned).
   *
   * Header names are case-insensitive, so every case variant of "cookie" is
   * folded into one `Cookie` entry; leaving e.g. both `cookie` and `Cookie`
   * in the object would send the static value twice.
   *
   * @param {string} url
   * @param {Record<string,string>} headers
   * @returns {Record<string,string>} The same `headers` object
   */
  _appendJarCookies(url, headers) {
    const jarCookies = this.getCookieHeader(url);
    if (!jarCookies) return headers;

    const values = [];
    for (const key of Object.keys(headers)) {
      if (key.toLowerCase() !== 'cookie') continue;
      if (headers[key]) values.push(headers[key]);
      delete headers[key];
    }
    values.push(jarCookies);

    headers['Cookie'] = values.join('; ');
    return headers;
  }

  /**
   * Perform an optional "initial request" (e.g. a login POST) and capture
   * any cookies it sets into the jar. Returns the final response's status
   * and body text.
   *
   * Redirects are followed manually so that Set-Cookie headers on the
   * intermediate 3xx responses are captured and attributed to the URL that
   * issued them; an automatic follow would only expose the last response.
   * On a redirect:
   *   - 301/302 after POST, and 303 after anything but GET/HEAD, switch to
   *     GET and drop the body and its content headers;
   *   - crossing to another origin drops Authorization, Proxy-Authorization
   *     and statically configured Cookie headers (jar cookies that match
   *     the new URL are still sent).
   *
   * Header priority (lowest to highest): built-in User-Agent/Accept
   * defaults, session headers, `req.headers`.
   *
   * @param {{ url: string, method?: string, headers?: Record<string,string>, body?: string }} req
   * @returns {Promise<{ status: number, body: string }>}
   * @throws {Error} When more than 20 redirects occur or a redirect targets
   *   a non-HTTP(S) URL; fetch/URL errors propagate unchanged.
   */
  async performInitialRequest(req) {
    const { url, method = 'GET', headers: extraHeaders = {}, body } = req;

    const MAX_REDIRECTS = 20;
    const REDIRECT_STATUSES = [301, 302, 303, 307, 308];
    const BODY_HEADERS = [
      'content-type', 'content-length', 'content-encoding',
      'content-language', 'content-location'
    ];
    const CREDENTIAL_HEADERS = ['authorization', 'proxy-authorization', 'cookie'];

    // Lower-cased header names that must no longer be sent; grows as
    // redirects change the method or leave the current origin.
    const stripped = new Set();
    let currentUrl = url;
    let currentMethod = method;
    let currentBody = body;

    for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
      const fetchOpts = {
        method: currentMethod,
        headers: this._buildInitialRequestHeaders(currentUrl, extraHeaders, stripped),
        redirect: 'manual'
      };
      if (currentBody) {
        fetchOpts.body = currentBody;
      }

      const response = await fetch(currentUrl, fetchOpts);
      this.recordCookies(response, currentUrl);

      const location = REDIRECT_STATUSES.includes(response.status)
        ? response.headers.get('location')
        : null;

      if (!location) {
        // Body read is best-effort: the cookies are already recorded.
        const text = await response.text().catch(() => '');
        return { status: response.status, body: text };
      }

      // Release the connection held by the unread redirect body.
      try {
        await response.body?.cancel();
      } catch {
        // Nothing useful to do if the body cannot be cancelled.
      }

      const nextUrl = new URL(location, currentUrl);
      if (nextUrl.protocol !== 'http:' && nextUrl.protocol !== 'https:') {
        throw new Error(`Refusing redirect to non-HTTP(S) URL: ${nextUrl.href}`);
      }

      const upperMethod = String(currentMethod).toUpperCase();
      const switchToGet =
        ((response.status === 301 || response.status === 302) && upperMethod === 'POST') ||
        (response.status === 303 && upperMethod !== 'GET' && upperMethod !== 'HEAD');
      if (switchToGet) {
        currentMethod = 'GET';
        currentBody = undefined;
        BODY_HEADERS.forEach((headerName) => stripped.add(headerName));
      }

      if (nextUrl.origin !== new URL(currentUrl).origin) {
        CREDENTIAL_HEADERS.forEach((headerName) => stripped.add(headerName));
      }

      currentUrl = nextUrl.href;
    }

    throw new Error(`Too many redirects (more than ${MAX_REDIRECTS}) during initial request to ${url}`);
  }

  /**
   * Assemble the headers for one hop of the initial request.
   *
   * @param {string} url - URL of this hop
   * @param {Record<string,string>} extraHeaders - Headers supplied with the request
   * @param {Set<string>} stripped - Lower-cased header names to omit
   * @returns {Record<string,string>}
   */
  _buildInitialRequestHeaders(url, extraHeaders, stripped) {
    const headers = { ...this.headers, ...extraHeaders };

    // Built-in defaults only fill gaps; they must not override a header the
    // session or the request configured (compared case-insensitively).
    const defaults = {
      'User-Agent': 'MCP-WebScraper/1.0',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    };
    const present = new Set(Object.keys(headers).map((key) => key.toLowerCase()));
    for (const [key, value] of Object.entries(defaults)) {
      if (!present.has(key.toLowerCase())) headers[key] = value;
    }

    for (const key of Object.keys(headers)) {
      if (stripped.has(key.toLowerCase())) delete headers[key];
    }

    // Jar cookies are added after stripping so cookies that legitimately
    // belong to this URL are still sent.
    return this._appendJarCookies(url, headers);
  }

  /** Number of cookies currently held in the jar (for diagnostics). */
  get cookieCount() {
    return this._jar.length;
  }
}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
|
|
3
3
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
|
+
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
5
|
+
import { SessionContext } from './_sessionContext.js';
|
|
4
6
|
|
|
5
7
|
const CrawlDeepSchema = z.object({
|
|
6
8
|
url: z.string().url(),
|
|
@@ -56,24 +58,48 @@ const CrawlDeepSchema = z.object({
|
|
|
56
58
|
concurrency: z.number().optional()
|
|
57
59
|
})).optional().default({})
|
|
58
60
|
}).optional(),
|
|
59
|
-
import_filter_config: z.string().optional() // JSON string of exported config
|
|
61
|
+
import_filter_config: z.string().optional(), // JSON string of exported config
|
|
62
|
+
// Session reuse: when enabled, all page fetches share a cookie jar and
|
|
63
|
+
// consistent headers — enabling login-then-crawl workflows.
|
|
64
|
+
session: z.object({
|
|
65
|
+
enabled: z.boolean(),
|
|
66
|
+
persistCookies: z.boolean().optional().default(true),
|
|
67
|
+
headers: z.record(z.string()).optional().default({}),
|
|
68
|
+
initialRequest: z.object({
|
|
69
|
+
url: z.string().url(),
|
|
70
|
+
method: z.string().optional().default('GET'),
|
|
71
|
+
headers: z.record(z.string()).optional().default({}),
|
|
72
|
+
body: z.string().optional()
|
|
73
|
+
}).optional()
|
|
74
|
+
}).optional()
|
|
60
75
|
});
|
|
61
76
|
|
|
62
77
|
export class CrawlDeepTool {
|
|
63
78
|
constructor(options = {}) {
|
|
64
79
|
const {
|
|
65
80
|
userAgent = 'MCP-WebScraper/1.0',
|
|
66
|
-
timeout = 30000
|
|
81
|
+
timeout = 30000,
|
|
82
|
+
cacheEnabled = true,
|
|
83
|
+
cacheTTL = 3600000
|
|
67
84
|
} = options;
|
|
68
85
|
|
|
69
86
|
this.userAgent = userAgent;
|
|
70
87
|
this.timeout = timeout;
|
|
88
|
+
// Per-session result cache: avoids redundant crawls of the same root URL
|
|
89
|
+
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
71
90
|
}
|
|
72
91
|
|
|
73
92
|
async execute(params) {
|
|
74
93
|
try {
|
|
75
94
|
const validated = CrawlDeepSchema.parse(params);
|
|
76
|
-
|
|
95
|
+
|
|
96
|
+
// Cache dedup: skip re-crawling the same root URL within the TTL window
|
|
97
|
+
if (this.cache) {
|
|
98
|
+
const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
|
|
99
|
+
const cached = await this.cache.get(cacheKey);
|
|
100
|
+
if (cached) return cached;
|
|
101
|
+
}
|
|
102
|
+
|
|
77
103
|
// Create domain filter if configuration provided
|
|
78
104
|
let domainFilter = null;
|
|
79
105
|
if (validated.import_filter_config) {
|
|
@@ -117,6 +143,20 @@ export class CrawlDeepTool {
|
|
|
117
143
|
}
|
|
118
144
|
}
|
|
119
145
|
|
|
146
|
+
// Set up session context when requested
|
|
147
|
+
let sessionContext = null;
|
|
148
|
+
if (validated.session?.enabled) {
|
|
149
|
+
sessionContext = new SessionContext({
|
|
150
|
+
persistCookies: validated.session.persistCookies,
|
|
151
|
+
headers: validated.session.headers || {}
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
// Perform optional login / pre-crawl request
|
|
155
|
+
if (validated.session.initialRequest) {
|
|
156
|
+
await sessionContext.performInitialRequest(validated.session.initialRequest);
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
120
160
|
// Create crawler instance
|
|
121
161
|
const crawler = new BFSCrawler({
|
|
122
162
|
maxDepth: validated.max_depth,
|
|
@@ -128,7 +168,8 @@ export class CrawlDeepTool {
|
|
|
128
168
|
concurrency: validated.concurrency,
|
|
129
169
|
domainFilter: domainFilter,
|
|
130
170
|
enableLinkAnalysis: validated.enable_link_analysis,
|
|
131
|
-
linkAnalyzerOptions: validated.link_analysis_options
|
|
171
|
+
linkAnalyzerOptions: validated.link_analysis_options,
|
|
172
|
+
sessionContext
|
|
132
173
|
});
|
|
133
174
|
|
|
134
175
|
// Start crawling
|
|
@@ -154,9 +195,18 @@ export class CrawlDeepTool {
|
|
|
154
195
|
stats: results.stats,
|
|
155
196
|
site_structure: this.analyzeSiteStructure(results.urls),
|
|
156
197
|
domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
|
|
157
|
-
link_analysis: results.linkAnalysis
|
|
198
|
+
link_analysis: results.linkAnalysis,
|
|
199
|
+
session: sessionContext
|
|
200
|
+
? { enabled: true, cookies_captured: sessionContext.cookieCount }
|
|
201
|
+
: { enabled: false }
|
|
158
202
|
};
|
|
159
203
|
|
|
204
|
+
// Store in cache before returning
|
|
205
|
+
if (this.cache) {
|
|
206
|
+
const cacheKey = this.cache.generateKey('crawl_deep', { url: validated.url, depth: validated.max_depth, pages: validated.max_pages });
|
|
207
|
+
await this.cache.set(cacheKey, response);
|
|
208
|
+
}
|
|
209
|
+
|
|
160
210
|
return response;
|
|
161
211
|
} catch (error) {
|
|
162
212
|
throw new Error(`Crawl failed: ${error.message}`);
|
|
@@ -2,6 +2,7 @@ import { z } from 'zod';
|
|
|
2
2
|
import { load } from 'cheerio';
|
|
3
3
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
4
|
import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
|
+
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
5
6
|
|
|
6
7
|
const MapSiteSchema = z.object({
|
|
7
8
|
url: z.string().url(),
|
|
@@ -23,16 +24,28 @@ export class MapSiteTool {
|
|
|
23
24
|
constructor(options = {}) {
|
|
24
25
|
const {
|
|
25
26
|
userAgent = 'MCP-WebScraper/1.0',
|
|
26
|
-
timeout = 10000
|
|
27
|
+
timeout = 10000,
|
|
28
|
+
cacheEnabled = true,
|
|
29
|
+
cacheTTL = 3600000
|
|
27
30
|
} = options;
|
|
28
31
|
|
|
29
32
|
this.userAgent = userAgent;
|
|
30
33
|
this.timeout = timeout;
|
|
34
|
+
// Per-session result cache: avoids redundant site maps for the same root URL
|
|
35
|
+
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
31
36
|
}
|
|
32
37
|
|
|
33
38
|
async execute(params) {
|
|
34
39
|
try {
|
|
35
40
|
const validated = MapSiteSchema.parse(params);
|
|
41
|
+
|
|
42
|
+
// Cache dedup: skip re-mapping the same site within the TTL window
|
|
43
|
+
if (this.cache) {
|
|
44
|
+
const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
|
|
45
|
+
const cached = await this.cache.get(cacheKey);
|
|
46
|
+
if (cached) return cached;
|
|
47
|
+
}
|
|
48
|
+
|
|
36
49
|
const baseUrl = getBaseUrl(validated.url);
|
|
37
50
|
const urls = new Set();
|
|
38
51
|
const metadata = new Map();
|
|
@@ -94,7 +107,7 @@ export class MapSiteTool {
|
|
|
94
107
|
? this.groupByPath(urlArray)
|
|
95
108
|
: urlArray;
|
|
96
109
|
|
|
97
|
-
|
|
110
|
+
const result = {
|
|
98
111
|
base_url: baseUrl,
|
|
99
112
|
total_urls: urlArray.length,
|
|
100
113
|
urls: organized,
|
|
@@ -104,6 +117,14 @@ export class MapSiteTool {
|
|
|
104
117
|
domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
|
|
105
118
|
filter_stats: domainFilter ? domainFilter.getStats() : null
|
|
106
119
|
};
|
|
120
|
+
|
|
121
|
+
// Store in cache before returning
|
|
122
|
+
if (this.cache) {
|
|
123
|
+
const cacheKey = this.cache.generateKey('map_site', { url: validated.url, maxUrls: validated.max_urls });
|
|
124
|
+
await this.cache.set(cacheKey, result);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
return result;
|
|
107
128
|
} catch (error) {
|
|
108
129
|
throw new Error(`Site mapping failed: ${error.message}`);
|
|
109
130
|
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* _fetchAndParse.js — shared fetch + HTML parse helper for extract tools.
|
|
3
|
+
*
|
|
4
|
+
* Used by:
|
|
5
|
+
* extractStructured.js
|
|
6
|
+
* extractContent.js (uses native fetch directly but can adopt this)
|
|
7
|
+
* processDocument.js (URL sources)
|
|
8
|
+
*
|
|
9
|
+
* Returns { html, $, textContent, finalUrl } so callers don't repeat
|
|
10
|
+
* the same fetch/cheerio/cleanup boilerplate.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { load } from 'cheerio';
|
|
14
|
+
|
|
15
|
+
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0)';
|
|
16
|
+
const DEFAULT_TIMEOUT_MS = 15000;
|
|
17
|
+
|
|
18
|
+
/**
 * Download a page and hand back both the raw markup and a Cheerio handle.
 *
 * The request carries a User-Agent and a browser-like Accept header and is
 * aborted after `timeoutMs`. Any non-2xx response is turned into an Error.
 * Elements matching `stripTags` are removed from the parsed document before
 * the body text is collected and whitespace-collapsed; the returned `html`
 * string is the unmodified response body.
 *
 * @param {string} url
 * @param {Object} [options]
 * @param {string} [options.userAgent] - Overrides the default User-Agent
 * @param {number} [options.timeoutMs] - Abort timeout in milliseconds
 * @param {string[]} [options.stripTags] - Tag names to remove; replaces the
 *   default list (script, style, noscript, iframe, svg)
 * @returns {Promise<{ html: string, $: import('cheerio').CheerioAPI, textContent: string, finalUrl: string }>}
 * @throws {Error} `HTTP <status>: <statusText>` when the response is not ok
 */
export async function fetchAndParse(url, options = {}) {
  const {
    userAgent: agent = DEFAULT_USER_AGENT,
    timeoutMs: timeout = DEFAULT_TIMEOUT_MS,
    stripTags: unwantedTags = ['script', 'style', 'noscript', 'iframe', 'svg']
  } = options;

  const requestInit = {
    headers: {
      'User-Agent': agent,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    },
    signal: AbortSignal.timeout(timeout)
  };

  const response = await fetch(url, requestInit);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }

  const html = await response.text();
  const $ = load(html);

  // Drop non-content elements so they do not pollute the extracted text.
  if (unwantedTags.length > 0) {
    const selector = unwantedTags.join(', ');
    $(selector).remove();
  }

  const bodyText = $('body').text();
  const textContent = bodyText.replace(/\s+/g, ' ').trim();

  return { html, $, textContent, finalUrl: response.url };
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import { z } from 'zod';
|
|
8
8
|
import { load } from 'cheerio';
|
|
9
9
|
import { LLMManager } from '../../core/llm/LLMManager.js';
|
|
10
|
+
import { fetchAndParse } from './_fetchAndParse.js';
|
|
10
11
|
|
|
11
12
|
const ExtractStructuredSchema = z.object({
|
|
12
13
|
url: z.string().url(),
|
|
@@ -73,25 +74,8 @@ export class ExtractStructuredTool {
|
|
|
73
74
|
const validated = ExtractStructuredSchema.parse(params);
|
|
74
75
|
const { url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints } = validated;
|
|
75
76
|
|
|
76
|
-
// Step 1: Fetch
|
|
77
|
-
const
|
|
78
|
-
headers: {
|
|
79
|
-
'User-Agent': this.userAgent,
|
|
80
|
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
|
81
|
-
},
|
|
82
|
-
signal: AbortSignal.timeout(15000)
|
|
83
|
-
});
|
|
84
|
-
|
|
85
|
-
if (!response.ok) {
|
|
86
|
-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
const html = await response.text();
|
|
90
|
-
|
|
91
|
-
// Step 2: Parse HTML with Cheerio, strip scripts/styles
|
|
92
|
-
const $ = load(html);
|
|
93
|
-
$('script, style, noscript, iframe, svg').remove();
|
|
94
|
-
const textContent = $('body').text().replace(/\s+/g, ' ').trim();
|
|
77
|
+
// Step 1: Fetch and parse — shared helper strips scripts/styles/iframes/svgs
|
|
78
|
+
const { html, $, textContent } = await fetchAndParse(url, { userAgent: this.userAgent });
|
|
95
79
|
|
|
96
80
|
// Step 3: Try LLM extraction first
|
|
97
81
|
let extractionResult = null;
|