crawlforge-mcp-server 3.0.18 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/package.json +5 -2
  2. package/server.js +192 -1277
  3. package/src/core/ActionExecutor.js +2 -43
  4. package/src/core/AuthManager.js +127 -14
  5. package/src/core/BrowserContextPool.js +187 -0
  6. package/src/core/JobManager.js +7 -5
  7. package/src/core/LocalizationManager.js +14 -125
  8. package/src/core/StealthBrowserManager.js +26 -18
  9. package/src/core/cache/CacheManager.js +4 -1
  10. package/src/core/crawlers/BFSCrawler.js +19 -5
  11. package/src/observability/metrics.js +137 -0
  12. package/src/observability/tracing.js +74 -0
  13. package/src/server/auth/oauth.js +388 -0
  14. package/src/server/registerTool.js +41 -0
  15. package/src/server/schemas/common.js +29 -0
  16. package/src/server/transports/http.js +22 -0
  17. package/src/server/transports/stdio.js +16 -0
  18. package/src/server/transports/streamableHttp.js +226 -0
  19. package/src/server/withAuth.js +121 -0
  20. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  21. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  22. package/src/tools/advanced/batchScrape/index.js +328 -0
  23. package/src/tools/advanced/batchScrape/queue.js +91 -0
  24. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  25. package/src/tools/advanced/batchScrape/schema.js +37 -0
  26. package/src/tools/advanced/batchScrape/worker.js +179 -0
  27. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  28. package/src/tools/basic/_fetch.js +35 -0
  29. package/src/tools/basic/extractLinks.js +74 -0
  30. package/src/tools/basic/extractMetadata.js +74 -0
  31. package/src/tools/basic/extractText.js +46 -0
  32. package/src/tools/basic/fetchUrl.js +44 -0
  33. package/src/tools/basic/scrapeStructured.js +58 -0
  34. package/src/tools/crawl/_sessionContext.js +234 -0
  35. package/src/tools/crawl/crawlDeep.js +55 -5
  36. package/src/tools/crawl/mapSite.js +23 -2
  37. package/src/tools/extract/_fetchAndParse.js +57 -0
  38. package/src/tools/extract/extractStructured.js +3 -19
  39. package/src/tools/extract/extractWithLlm.js +295 -0
  40. package/src/tools/search/providers/searxng.js +126 -0
  41. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  42. package/src/tools/search/ranking/ResultRanker.js +17 -10
  43. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  44. package/src/tools/search/searchWeb.js +112 -6
  45. package/src/tools/tracking/trackChanges/differ.js +98 -0
  46. package/src/tools/tracking/trackChanges/index.js +432 -0
  47. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  48. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  49. package/src/tools/tracking/trackChanges/schema.js +127 -0
  50. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -0,0 +1,295 @@
1
+ /**
2
+ * Extract With LLM MCP Tool
3
+ * Natural-language extraction powered by OpenAI or Anthropic.
4
+ * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
5
+ *
6
+ * Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
7
+ * Gate: tool throws a clear error when neither key is present.
8
+ */
9
+
10
+ import { fetchAndParse } from './_fetchAndParse.js';
11
+
12
+ // ── Constants ─────────────────────────────────────────────────────────────────
13
+
14
+ const MAX_INPUT_CHARS = 50_000;
15
+
16
+ const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
17
+ const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
18
+
19
+ // Support test-time overrides so the test suite can stub endpoints.
/**
 * Resolve the OpenAI origin that callers append `/v1/...` paths to.
 *
 * Reads OPENAI_BASE_URL and falls back to the public endpoint when the
 * variable is unset, empty, or whitespace-only.
 *
 * The official OpenAI SDKs expect OPENAI_BASE_URL to already end in `/v1`,
 * while callers in this module append `/v1/chat/completions` themselves.
 * A trailing `/v1` segment is therefore dropped so a value written for the
 * SDK does not produce `/v1/v1/chat/completions`.
 *
 * @returns {string} Base URL without trailing slashes or a trailing `/v1`.
 */
function openaiBaseUrl() {
  const configured = (process.env.OPENAI_BASE_URL ?? '').trim() || 'https://api.openai.com';
  return configured
    .replace(/\/+$/, '') // all trailing slashes, not just one
    .replace(/\/+v1$/, ''); // SDK-style ".../v1" suffix
}
/**
 * Resolve the Anthropic origin that callers append `/v1/...` paths to.
 *
 * Reads ANTHROPIC_BASE_URL and falls back to the public endpoint when the
 * variable is unset, empty, or whitespace-only. Every trailing slash is
 * removed so that appending `/v1/messages` never yields a `//` in the path.
 *
 * @returns {string} Base URL without trailing slashes.
 */
function anthropicBaseUrl() {
  const configured = (process.env.ANTHROPIC_BASE_URL ?? '').trim() || 'https://api.anthropic.com';
  return configured.replace(/\/+$/, '');
}
26
+
27
+ // ── Helpers ───────────────────────────────────────────────────────────────────
28
+
/**
 * Resolve which LLM provider to use and the API key that goes with it.
 *
 * Keys are read from ANTHROPIC_API_KEY and OPENAI_API_KEY. In 'auto' mode
 * Anthropic is preferred when both keys are present. The provider name is
 * matched case-insensitively and a missing (null/undefined) provider is
 * treated as 'auto', so callers that forward an optional field unchanged do
 * not get an "unknown provider" error.
 *
 * @param {'openai'|'anthropic'|'auto'} [provider='auto']
 * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
 * @throws {Error} When the required key is not set or the provider is unknown.
 */
function resolveProvider(provider) {
  const requested = provider == null ? 'auto' : String(provider).trim().toLowerCase();
  const anthropicKey = process.env.ANTHROPIC_API_KEY;
  const openaiKey = process.env.OPENAI_API_KEY;

  switch (requested) {
    case 'auto':
      if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
      if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
      throw new Error(
        'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
      );

    case 'anthropic':
      if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
      return { provider: 'anthropic', apiKey: anthropicKey };

    case 'openai':
      if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
      return { provider: 'openai', apiKey: openaiKey };

    default:
      // Report the value exactly as the caller supplied it.
      throw new Error(`extract_with_llm: unknown provider "${provider}"`);
  }
}
58
+
/**
 * Build the user message text that goes to the LLM.
 *
 * The page text is capped at MAX_INPUT_CHARS UTF-16 code units and, when
 * cut, suffixed with a "[...truncated]" marker. Missing text (null or
 * undefined) is treated as an empty page instead of throwing, and the cut
 * point is moved back by one unit when it would otherwise split a surrogate
 * pair and leave a lone high surrogate in the request body.
 *
 * @param {string} userPrompt - Natural-language extraction instruction.
 * @param {string} text - Page text to extract from.
 * @param {Object} [schema] - Optional output hint; omitted when empty.
 * @returns {string} Prompt containing the instruction, optional schema hint, and page text.
 */
function buildUserMessage(userPrompt, text, schema) {
  const source = typeof text === 'string' ? text : String(text ?? '');

  let pageText = source;
  if (source.length > MAX_INPUT_CHARS) {
    let cut = MAX_INPUT_CHARS;
    const lastUnit = source.charCodeAt(cut - 1);
    // 0xD800-0xDBFF is the high-surrogate range: never end the slice on one.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
    pageText = `${source.slice(0, cut)}\n[...truncated]`;
  }

  const sections = [`Extraction instruction: ${userPrompt}\n\n`];
  if (schema && Object.keys(schema).length > 0) {
    sections.push(`Output schema hint:\n${JSON.stringify(schema, null, 2)}\n\n`);
  }
  sections.push(`Web page content:\n${pageText}\n\nReturn only valid JSON.`);
  return sections.join('');
}
71
+
/**
 * Parse JSON from an LLM response string defensively.
 *
 * Candidates are tried in order and the first one that parses wins:
 *   1. the trimmed response with a leading/trailing markdown fence removed;
 *   2. the body of the first fenced block found anywhere in the response
 *      (covers "Here is the JSON: ```json ... ```" style replies);
 *   3. the span from the first `{` or `[` to the last `}` or `]`
 *      (covers JSON wrapped in plain prose).
 *
 * The response is trimmed before the fence patterns are applied, so a reply
 * that starts with whitespace or a newline ahead of the fence still parses.
 *
 * @param {string} raw - Raw text returned by the model.
 * @returns {*} The parsed JSON value.
 * @throws {TypeError} When `raw` is not a string.
 * @throws {SyntaxError} When no candidate is valid JSON (the error from the first attempt).
 */
function parseJson(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('parseJson: LLM response is not a string');
  }

  const trimmed = raw.trim();
  const candidates = [
    trimmed
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '')
      .trim()
  ];

  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fenced) candidates.push(fenced[1].trim());

  const start = trimmed.search(/[{[]/);
  const end = Math.max(trimmed.lastIndexOf('}'), trimmed.lastIndexOf(']'));
  if (start !== -1 && end > start) candidates.push(trimmed.slice(start, end + 1));

  let firstError;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch (err) {
      // Keep the error from the most faithful candidate; later ones are fallbacks.
      firstError ??= err;
    }
  }
  throw firstError;
}
85
+
86
+ // ── OpenAI call ───────────────────────────────────────────────────────────────
87
+
/**
 * Send a single chat-completions request to OpenAI in JSON-object mode.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent as a Bearer token.
 * @param {string} args.model - Model identifier to request.
 * @param {string} args.systemMessage - Content of the system message.
 * @param {string} args.userMessage - Content of the user message.
 * @param {number} args.maxTokens - Forwarded as `max_tokens`.
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 *   First choice's message content (or '' when absent), token usage
 *   (0 when not reported), and the model echoed by the API or, failing
 *   that, the requested one.
 * @throws {Error} On a non-OK HTTP status; the message carries the status
 *   and up to 200 characters of the response body.
 */
async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const endpoint = `${openaiBaseUrl()}/v1/chat/completions`;
  const messages = [
    { role: 'system', content: systemMessage },
    { role: 'user', content: userMessage }
  ];
  const payload = {
    model,
    messages,
    max_tokens: maxTokens,
    response_format: { type: 'json_object' }
  };
  const headers = {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${apiKey}`
  };

  const response = await fetch(endpoint, {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(120_000) // give up after two minutes
  });

  if (response.ok) {
    const json = await response.json();
    const rawText = json.choices?.[0]?.message?.content ?? '';
    const promptTokens = json.usage?.prompt_tokens ?? 0;
    const completionTokens = json.usage?.completion_tokens ?? 0;
    return {
      rawText,
      usage: { input_tokens: promptTokens, output_tokens: completionTokens },
      model: json.model || model
    };
  }

  // Failure path: surface a bounded excerpt of the body for diagnosis.
  const errText = await response.text().catch(() => '');
  throw new Error(`OpenAI API error ${response.status}: ${errText.slice(0, 200)}`);
}
123
+
124
+ // ── Anthropic call ────────────────────────────────────────────────────────────
125
+
/**
 * Send a single Messages API request to Anthropic.
 *
 * The response `content` is an array of blocks. The text of every block
 * that carries a string `text` field is concatenated in order, instead of
 * reading only `content[0]`. That keeps the reply intact when the first
 * block is not a text block or the text is split across several blocks.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent in the `x-api-key` header.
 * @param {string} args.model - Model identifier to request.
 * @param {string} args.systemMessage - Sent as the top-level `system` field.
 * @param {string} args.userMessage - Content of the single user message.
 * @param {number} args.maxTokens - Forwarded as `max_tokens`.
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 *   Concatenated text (or '' when there is none), token usage (0 when not
 *   reported), and the model echoed by the API or, failing that, the
 *   requested one.
 * @throws {Error} On a non-OK HTTP status; the message carries the status
 *   and up to 200 characters of the response body.
 */
async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const url = `${anthropicBaseUrl()}/v1/messages`;
  const body = {
    model,
    system: systemMessage,
    messages: [{ role: 'user', content: userMessage }],
    max_tokens: maxTokens
  };

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01'
    },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(120_000)
  });

  if (!response.ok) {
    const errText = await response.text().catch(() => '');
    throw new Error(`Anthropic API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const json = await response.json();
  const blocks = Array.isArray(json.content) ? json.content : [];
  const rawText = blocks
    .map((block) => (typeof block?.text === 'string' ? block.text : ''))
    .join('');
  const usage = {
    input_tokens: json.usage?.input_tokens ?? 0,
    output_tokens: json.usage?.output_tokens ?? 0
  };
  return { rawText, usage, model: json.model || model };
}
159
+
160
+ // ── LLM dispatch ─────────────────────────────────────────────────────────────
161
+
/**
 * Route a completion request to the selected provider.
 *
 * 'openai' goes to callOpenAI; any other provider value goes to
 * callAnthropic. The provider's result is returned unchanged.
 *
 * @param {Object} args
 * @param {string} args.provider - Provider name chosen by the caller.
 * @param {string} args.apiKey
 * @param {string} args.model
 * @param {string} args.systemMessage
 * @param {string} args.userMessage
 * @param {number} args.maxTokens
 * @returns {Promise<Object>} Whatever the provider call resolves to.
 */
async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
  const request = { apiKey, model, systemMessage, userMessage, maxTokens };
  const send = provider === 'openai' ? callOpenAI : callAnthropic;
  return send(request);
}
168
+
169
+ // ── Tool class ────────────────────────────────────────────────────────────────
170
+
/**
 * MCP tool that extracts structured JSON from a page (or supplied text)
 * by prompting an LLM. `execute` never rejects for expected failures: it
 * resolves to `{ success: false, error }` instead.
 */
export class ExtractWithLlm {
  /**
   * @param {Object} [config] - Tool configuration, stored as-is on `this.config`.
   */
  constructor(config = {}) {
    this.config = config;
  }

  /**
   * Execute LLM-powered extraction.
   *
   * Steps: validate input, resolve provider and key, obtain the text
   * (fetching `url` when given), call the model, parse its reply as JSON,
   * and retry once with a stricter instruction if parsing fails.
   *
   * @param {Object} params
   * @param {string} [params.url] - URL to fetch. At least one of url/content
   *   is required; when both are given, `url` takes precedence.
   * @param {string} [params.content] - Pre-fetched text content
   * @param {string} params.prompt - Natural-language extraction instruction
   * @param {Object} [params.schema] - Optional JSON-schema-like output hint
   * @param {string} [params.provider] - 'openai' | 'anthropic' | 'auto'
   * @param {string} [params.model] - Override default model
   * @param {number} [params.maxTokens] - Max output tokens (default 4096)
   * @returns {Promise<Object>} On success
   *   `{ success: true, data, provider, model, usage }`; on failure
   *   `{ success: false, error }` (plus `raw` when the model's retry reply
   *   was not valid JSON).
   */
  async execute(params = {}) {
    const {
      url,
      content,
      prompt,
      schema,
      provider: providerParam = 'auto',
      model: modelParam,
      maxTokens = 4096
    } = params ?? {}; // also tolerate an explicit null

    // Validate: at least one of url or content must be provided
    if (!url && !content) {
      return {
        success: false,
        error: 'extract_with_llm: either "url" or "content" must be provided'
      };
    }
    if (!prompt) {
      return { success: false, error: 'extract_with_llm: "prompt" is required' };
    }

    // Resolve provider + API key (reported as a failed result, not a throw)
    let resolved;
    try {
      resolved = resolveProvider(providerParam);
    } catch (err) {
      return { success: false, error: err.message };
    }

    const { provider, apiKey } = resolved;
    const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
    const model = modelParam || defaultModel;

    // Step 1: Get text to extract from
    let text;
    try {
      if (url) {
        const { textContent } = await fetchAndParse(url);
        text = textContent;
      } else {
        text = content;
      }
    } catch (fetchErr) {
      return { success: false, error: `Failed to fetch content: ${fetchErr.message}` };
    }

    // A page with no extractable text (or non-string content) would make the
    // prompt builder throw out of execute(); report it as a normal failure
    // instead, and avoid spending an LLM call on empty input.
    if (typeof text !== 'string' || text.trim() === '') {
      return {
        success: false,
        error: 'extract_with_llm: no text content available to extract from'
      };
    }

    const systemMessage =
      'You extract structured data from web content per the user\'s instructions. Return JSON only.';

    const userMessage = buildUserMessage(prompt, text, schema);

    // Step 2: First LLM call
    let rawText;
    let usage;
    try {
      ({ rawText, usage } = await callLLM({
        provider, apiKey, model, systemMessage, userMessage, maxTokens
      }));
    } catch (llmErr) {
      return { success: false, error: `LLM call failed: ${llmErr.message}` };
    }

    // Step 3: Parse JSON; retry once with stricter prompt if it fails
    let parsed;
    try {
      parsed = parseJson(rawText);
    } catch {
      const retryUserMessage =
        `${userMessage}\n\nIMPORTANT: Your previous response was not valid JSON. ` +
        'Respond with ONLY a JSON object or array. No explanation, no markdown fences.';

      let retryRaw;
      try {
        const retry = await callLLM({
          provider, apiKey, model, systemMessage,
          userMessage: retryUserMessage, maxTokens
        });
        retryRaw = retry.rawText;
        // Report the combined cost of both attempts.
        usage = {
          input_tokens: usage.input_tokens + retry.usage.input_tokens,
          output_tokens: usage.output_tokens + retry.usage.output_tokens
        };
      } catch (retryLlmErr) {
        return { success: false, error: `LLM retry call failed: ${retryLlmErr.message}` };
      }

      try {
        parsed = parseJson(retryRaw);
      } catch {
        return {
          success: false,
          error: 'LLM did not return valid JSON after retry',
          raw: retryRaw.slice(0, 500)
        };
      }
    }

    return {
      success: true,
      data: parsed,
      provider,
      model,
      usage
    };
  }
}
294
+
295
+ export default ExtractWithLlm;
@@ -0,0 +1,126 @@
1
+ /**
2
+ * SearXNG Search Provider
3
+ *
4
+ * Executes searches against a self-hosted SearXNG instance via its JSON API.
5
+ * Instance URL is read from the CRAWLFORGE_SEARXNG_URL environment variable.
6
+ *
7
+ * SearXNG JSON API reference:
8
+ * https://docs.searxng.org/dev/search_api.html
9
+ *
10
+ * Result shape is normalised to match the CrawlForge/Google adapter format so
11
+ * the rest of the search pipeline (ranking, deduplication, caching) is unaffected.
12
+ */
13
+
/**
 * Map a single SearXNG result object to the internal item shape used
 * throughout the search pipeline.
 *
 *   SearXNG field  → internal field
 *   title          → title
 *   url            → link, displayLink (hostname), formattedUrl
 *   content        → snippet, htmlSnippet
 *   (all others)   → ignored / defaulted
 *
 * Missing or falsy fields become ''. Truthy non-string fields are coerced
 * to strings so every text field of the returned item is a string. A null
 * or non-object entry yields an item with empty text fields instead of
 * throwing, so one malformed entry cannot fail a whole result page.
 *
 * @param {Object} result - Raw SearXNG result entry
 * @returns {Object} Normalised item
 */
export function normalizeSearxngResult(result) {
  const entry = result !== null && typeof result === 'object' ? result : {};
  const asText = (value) => (value ? String(value) : '');

  const url = asText(entry.url);
  const title = asText(entry.title);
  const content = asText(entry.content);

  // Show just the hostname when the URL parses; otherwise show it verbatim.
  let displayLink = url;
  try {
    displayLink = new URL(url).hostname;
  } catch {
    // Not an absolute URL: keep the verbatim fallback assigned above.
  }

  return {
    title,
    link: url,
    snippet: content,
    displayLink,
    formattedUrl: url,
    htmlSnippet: content,
    pagemap: {},
    metadata: {
      mime: null,
      fileFormat: null,
      cacheId: null
    }
  };
}
51
+
/**
 * Build the `/search` endpoint URL for a SearXNG instance.
 *
 * The instance's own path is preserved, so an instance served under a
 * sub-path (e.g. `https://host/searxng/`) resolves to
 * `https://host/searxng/search` rather than `https://host/search`. A URL
 * that already ends in `/search` is used as-is. Any query string or
 * fragment on the instance URL is discarded.
 *
 * @param {string} instanceUrl - Base URL of the SearXNG instance.
 * @returns {URL} Endpoint URL with an empty query string.
 * @throws {Error} When `instanceUrl` is not a valid absolute URL.
 */
function buildSearxngSearchUrl(instanceUrl) {
  let endpoint;
  try {
    endpoint = new URL(instanceUrl);
  } catch (err) {
    throw new Error(`provider 'searxng': invalid instance URL "${instanceUrl}"`, { cause: err });
  }

  const basePath = endpoint.pathname.replace(/\/+$/, '');
  endpoint.pathname = basePath.endsWith('/search') ? basePath : `${basePath}/search`;
  endpoint.search = '';
  endpoint.hash = '';
  return endpoint;
}

/**
 * Fetch search results from a SearXNG instance.
 *
 * @param {Object} opts
 * @param {string} opts.query - Search query string (required, non-empty)
 * @param {number} [opts.limit=10] - Maximum number of results to return; negative values are treated as 0
 * @param {number} [opts.page=1] - Page number (1-based); values below 1 are treated as 1
 * @param {boolean} [opts.safeSearch=true] - Whether safe search is enabled
 * @param {string} [opts.language='en'] - Language code (e.g. 'en', 'de')
 * @param {string} [opts.instanceUrl] - Override for CRAWLFORGE_SEARXNG_URL
 * @param {number} [opts.timeoutMs=30000] - Abort the request after this many milliseconds; 0 or less disables the timeout
 * @returns {Promise<Object>} Results in the internal adapter format
 *   { items: Array, searchInformation: { totalResults, searchTime }, queries: {}, context: {} }
 * @throws {Error} When no instance URL is configured, the URL is invalid,
 *   the query is missing, the request fails or times out, the instance
 *   answers with a non-OK status, or the body is not valid JSON.
 */
export async function searchViaSearxng(opts = {}) {
  const options = opts ?? {};
  const instanceUrl = options.instanceUrl || process.env.CRAWLFORGE_SEARXNG_URL;

  if (!instanceUrl) {
    throw new Error(
      "provider 'searxng' requires CRAWLFORGE_SEARXNG_URL in environment"
    );
  }

  const {
    query,
    limit = 10,
    page = 1,
    safeSearch = true,
    language = 'en',
    timeoutMs = 30_000
  } = options;

  // Without this check a missing query is sent as the literal text "undefined".
  if (typeof query !== 'string' || query.trim() === '') {
    throw new Error("provider 'searxng' requires a non-empty query");
  }

  // A negative limit would make slice() count from the end of the list.
  const maxItems = Math.max(0, Number(limit) || 0);
  const pageNumber = Math.max(1, Math.trunc(Number(page)) || 1);

  // SearXNG safesearch: 0=off, 1=moderate, 2=strict
  const safesearch = safeSearch ? 1 : 0;

  const url = buildSearxngSearchUrl(instanceUrl);
  url.searchParams.set('q', query);
  url.searchParams.set('format', 'json');
  url.searchParams.set('pageno', String(pageNumber));
  url.searchParams.set('safesearch', String(safesearch));
  url.searchParams.set('language', language);

  const timeout = Number(timeoutMs);
  let response;
  try {
    response = await fetch(url.toString(), {
      headers: { Accept: 'application/json' },
      // Bound the wait so an unresponsive instance cannot hang the caller.
      signal: timeout > 0 ? AbortSignal.timeout(timeout) : undefined
    });
  } catch (err) {
    throw new Error(`SearXNG request failed: ${err.message}`, { cause: err });
  }

  if (!response.ok) {
    throw new Error(
      `SearXNG returned HTTP ${response.status}: ${response.statusText}`
    );
  }

  let data;
  try {
    data = await response.json();
  } catch (err) {
    throw new Error('SearXNG returned invalid JSON', { cause: err });
  }

  // `data` may legitimately be null (the JSON literal), hence the `?.`.
  const rawResults = Array.isArray(data?.results) ? data.results : [];
  const items = rawResults
    .slice(0, maxItems)
    .map((entry) => normalizeSearxngResult(entry));

  return {
    items,
    searchInformation: {
      totalResults: String(rawResults.length),
      // No timing is taken from the SearXNG response; always reported as 0.
      searchTime: 0
    },
    queries: {},
    context: {}
  };
}
@@ -1,10 +1,13 @@
1
1
  import { CacheManager } from '../../../core/cache/CacheManager.js';
2
2
 
3
3
  /**
4
- * Advanced search result deduplication system using multiple similarity algorithms
4
+ * Advanced search result deduplication system using multiple similarity algorithms.
5
+ * Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
6
+ * creating a duplicate CacheManager when used alongside ResultRanker.
5
7
  */
6
8
  export class ResultDeduplicator {
7
9
  constructor(options = {}) {
10
+ const { sharedCache, ...serializableOptions } = options;
8
11
  this.options = {
9
12
  // Similarity thresholds
10
13
  thresholds: {
@@ -13,7 +16,7 @@ export class ResultDeduplicator {
13
16
  content: 0.85, // Content similarity threshold
14
17
  combined: 0.8 // Combined similarity threshold for final decision
15
18
  },
16
-
19
+
17
20
  // Deduplication strategies
18
21
  strategies: {
19
22
  urlNormalization: true, // Normalize URLs for comparison
@@ -21,7 +24,7 @@ export class ResultDeduplicator {
21
24
  contentSimhash: true, // Use SimHash for content comparison
22
25
  domainClustering: true // Cluster results by domain
23
26
  },
24
-
27
+
25
28
  // URL normalization options
26
29
  urlNormalization: {
27
30
  removeProtocol: true, // Remove http/https difference
@@ -32,7 +35,7 @@ export class ResultDeduplicator {
32
35
  removeEmptyParams: true, // Remove empty query parameters
33
36
  lowercaseDomain: true // Convert domain to lowercase
34
37
  },
35
-
38
+
36
39
  // Content similarity options
37
40
  contentSimilarity: {
38
41
  minLength: 10, // Minimum content length to compare
@@ -40,7 +43,7 @@ export class ResultDeduplicator {
40
43
  simhashBits: 64, // SimHash bit size
41
44
  hammingThreshold: 16 // Hamming distance threshold for SimHash
42
45
  },
43
-
46
+
44
47
  // Merge strategy
45
48
  mergeStrategy: {
46
49
  preserveBestRank: true, // Keep the best ranking result as primary
@@ -48,17 +51,21 @@ export class ResultDeduplicator {
48
51
  preferHttps: true, // Prefer HTTPS URLs when merging
49
52
  preferShorterUrl: true // Prefer shorter, cleaner URLs
50
53
  },
51
-
54
+
52
55
  // Performance options
53
56
  cacheEnabled: true,
54
57
  cacheTTL: 3600000, // 1 hour
55
- ...options
58
+ ...serializableOptions
56
59
  };
57
60
 
58
- // Initialize cache for deduplication computation
59
- this.cache = this.options.cacheEnabled ?
60
- new CacheManager({ ttl: this.options.cacheTTL }) : null;
61
-
61
+ // Use shared cache if provided, otherwise create own CacheManager instance.
62
+ // sharedCache is held separately — never in this.options because it holds
63
+ // a setInterval Timer that would create a circular reference when the
64
+ // options object is JSON.stringify'd to build a cache key (see generateKey).
65
+ this.cache = sharedCache || (this.options.cacheEnabled
66
+ ? new CacheManager({ ttl: this.options.cacheTTL })
67
+ : null);
68
+
62
69
  // Statistics tracking
63
70
  this.stats = {
64
71
  totalProcessed: 0,
@@ -1,10 +1,13 @@
1
1
  import { CacheManager } from '../../../core/cache/CacheManager.js';
2
2
 
3
3
  /**
4
- * Advanced search result ranking system with multiple scoring algorithms
4
+ * Advanced search result ranking system with multiple scoring algorithms.
5
+ * Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
6
+ * creating a duplicate CacheManager when used alongside ResultDeduplicator.
5
7
  */
6
8
  export class ResultRanker {
7
9
  constructor(options = {}) {
10
+ const { sharedCache, ...serializableOptions } = options;
8
11
  this.options = {
9
12
  // Ranking weight configuration
10
13
  weights: {
@@ -13,13 +16,13 @@ export class ResultRanker {
13
16
  authority: 0.2, // URL/domain authority
14
17
  freshness: 0.1 // Content freshness
15
18
  },
16
-
19
+
17
20
  // BM25 parameters
18
21
  bm25: {
19
22
  k1: 1.5, // Term frequency saturation parameter
20
23
  b: 0.75 // Length normalization parameter
21
24
  },
22
-
25
+
23
26
  // Authority scoring parameters
24
27
  authority: {
25
28
  domainBoosts: { // Domain authority boosts
@@ -32,23 +35,27 @@ export class ResultRanker {
32
35
  httpsBoost: 0.1, // HTTPS boost
33
36
  pathDepthPenalty: 0.02 // Penalty per path segment
34
37
  },
35
-
38
+
36
39
  // Freshness parameters
37
40
  freshness: {
38
41
  maxAgeMonths: 24, // Content older than this gets 0 freshness score
39
42
  decayRate: 0.1 // Exponential decay rate per month
40
43
  },
41
-
44
+
42
45
  // Performance options
43
46
  cacheEnabled: true,
44
47
  cacheTTL: 3600000, // 1 hour
45
- ...options
48
+ ...serializableOptions
46
49
  };
47
50
 
48
- // Initialize cache for score computation
49
- this.cache = this.options.cacheEnabled ?
50
- new CacheManager({ ttl: this.options.cacheTTL }) : null;
51
-
51
+ // Use shared cache if provided, otherwise create own CacheManager instance.
52
+ // sharedCache is held separately — never in this.options because it holds
53
+ // a setInterval Timer that would create a circular reference when the
54
+ // options object is JSON.stringify'd to build a cache key (see generateKey).
55
+ this.cache = sharedCache || (this.options.cacheEnabled
56
+ ? new CacheManager({ ttl: this.options.cacheTTL })
57
+ : null);
58
+
52
59
  // Precompute domain authority scores
53
60
  this.domainAuthorityMap = new Map();
54
61
  this.initializeDomainAuthority();
@@ -0,0 +1,52 @@
1
+ /**
2
+ * SearchResultCache — unified cache layer for search ranking and deduplication.
3
+ *
4
+ * Both ResultRanker and ResultDeduplicator previously held separate CacheManager
5
+ * instances with identical TTL configuration. This module provides a single
6
+ * shared cache they can both use, halving the number of LRU cache instances
7
+ * created per SearchWebTool instantiation.
8
+ *
9
+ * Usage:
10
+ * const cache = new SearchResultCache({ ttl: 3600000 });
11
+ * // pass to ResultRanker and ResultDeduplicator via options.sharedCache
12
+ */
13
+
14
+ import { CacheManager } from '../../../core/cache/CacheManager.js';
15
+
/**
 * Thin wrapper around a single CacheManager so that several search
 * components can share one cache instance. When constructed with
 * `enabled: false` no CacheManager is created and every operation is a
 * no-op.
 */
export class SearchResultCache {
  /**
   * @param {Object} [options]
   * @param {number} [options.ttl=3600000] — TTL in milliseconds, forwarded to CacheManager
   * @param {boolean} [options.enabled=true] — pass false to skip creating a cache
   */
  constructor(options = {}) {
    const { ttl = 3600000, enabled = true } = options;
    this.enabled = enabled;
    this._cache = null;
    if (enabled) {
      this._cache = new CacheManager({ ttl });
    }
  }

  /**
   * Look up `key` in the underlying cache.
   * Resolves to undefined when caching is disabled; otherwise to whatever
   * the underlying cache's `get` returns.
   */
  async get(key) {
    const active = this.enabled && this._cache;
    return active ? this._cache.get(key) : undefined;
  }

  /**
   * Store `value` under `key`.
   * Resolves to undefined when caching is disabled; otherwise to whatever
   * the underlying cache's `set` returns.
   */
  async set(key, value) {
    const active = this.enabled && this._cache;
    return active ? this._cache.set(key, value) : undefined;
  }

  /**
   * Delegate cache-key generation to the underlying cache.
   * Returns null when no underlying cache exists.
   */
  generateKey(namespace, descriptor) {
    return this._cache ? this._cache.generateKey(namespace, descriptor) : null;
  }

  /** Stats reported by the underlying cache, or null when none exists. */
  getStats() {
    if (!this._cache) {
      return null;
    }
    return this._cache.getStats();
  }
}
51
+
52
+ export default SearchResultCache;