crawlforge-mcp-server 3.0.18 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/server.js +192 -1277
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +127 -14
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract With LLM MCP Tool
|
|
3
|
+
* Natural-language extraction powered by OpenAI or Anthropic.
|
|
4
|
+
* Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
|
|
5
|
+
*
|
|
6
|
+
* Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
|
|
7
|
+
* Gate: tool throws a clear error when neither key is present.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { fetchAndParse } from './_fetchAndParse.js';
|
|
11
|
+
|
|
12
|
+
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
const MAX_INPUT_CHARS = 50_000;
|
|
15
|
+
|
|
16
|
+
const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
|
|
17
|
+
const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
|
|
18
|
+
|
|
19
|
+
/**
 * Base URL for OpenAI requests, without trailing slashes or a `/v1` suffix.
 *
 * Reads OPENAI_BASE_URL so the test suite (or a proxy deployment) can redirect
 * requests; falls back to the public endpoint when the variable is unset or empty.
 * Callers append `/v1/...` themselves, so a configured value that already ends
 * in `/v1` is trimmed to avoid building `/v1/v1/...` request URLs.
 *
 * @returns {string} Normalised base URL
 */
function openaiBaseUrl() {
  const configured = process.env.OPENAI_BASE_URL || 'https://api.openai.com';
  return configured
    .replace(/\/+$/, '')
    .replace(/\/v1$/, '')
    .replace(/\/+$/, '');
}
|
|
23
|
+
/**
 * Base URL for Anthropic requests, without any trailing slashes.
 *
 * Reads ANTHROPIC_BASE_URL so the test suite can point requests at a stub;
 * falls back to the public endpoint when the variable is unset or empty.
 * All trailing slashes are removed so appending `/v1/...` never yields `//`.
 *
 * @returns {string} Normalised base URL
 */
function anthropicBaseUrl() {
  const configured = process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com';
  return configured.replace(/\/+$/, '');
}
|
|
26
|
+
|
|
27
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Pick the LLM provider and its API key from the environment.
 *
 * With 'auto', Anthropic is preferred when ANTHROPIC_API_KEY is set, then
 * OpenAI when OPENAI_API_KEY is set. An explicit provider requires its own key.
 *
 * @param {'openai'|'anthropic'|'auto'} provider
 * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
 * @throws {Error} When the required key is missing or the provider is unknown
 */
function resolveProvider(provider) {
  const keys = {
    anthropic: process.env.ANTHROPIC_API_KEY,
    openai: process.env.OPENAI_API_KEY
  };

  switch (provider) {
    case 'auto': {
      // Preference order: Anthropic first, then OpenAI.
      for (const candidate of ['anthropic', 'openai']) {
        if (keys[candidate]) {
          return { provider: candidate, apiKey: keys[candidate] };
        }
      }
      throw new Error(
        'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
      );
    }
    case 'anthropic': {
      if (keys.anthropic) {
        return { provider: 'anthropic', apiKey: keys.anthropic };
      }
      throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
    }
    case 'openai': {
      if (keys.openai) {
        return { provider: 'openai', apiKey: keys.openai };
      }
      throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
    }
    default:
      throw new Error(`extract_with_llm: unknown provider "${provider}"`);
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * Build the user message text that goes to the LLM.
 *
 * The message contains the extraction instruction, an optional pretty-printed
 * schema hint (only when `schema` has at least one key), the page text, and a
 * closing reminder to return JSON. Page text longer than `maxChars` is cut and
 * marked with "[...truncated]".
 *
 * @param {string} userPrompt - Natural-language extraction instruction
 * @param {string} text - Page text; null/undefined is treated as empty text
 * @param {Object} [schema] - Optional output schema hint
 * @param {number} [maxChars=MAX_INPUT_CHARS] - Maximum characters of page text to include
 * @returns {string} The assembled user message
 */
function buildUserMessage(userPrompt, text, schema, maxChars = MAX_INPUT_CHARS) {
  // Missing text must not crash message construction; treat it as empty content.
  const pageText = text == null ? '' : String(text);
  const truncated = pageText.length > maxChars
    ? `${pageText.slice(0, maxChars)}\n[...truncated]`
    : pageText;

  const sections = [`Extraction instruction: ${userPrompt}\n\n`];
  if (schema && Object.keys(schema).length > 0) {
    sections.push(`Output schema hint:\n${JSON.stringify(schema, null, 2)}\n\n`);
  }
  sections.push(`Web page content:\n${truncated}\n\nReturn only valid JSON.`);
  return sections.join('');
}
|
|
71
|
+
|
|
72
|
+
/**
 * Parse JSON from an LLM response string defensively.
 *
 * Accepts, in order of preference:
 *   1. bare JSON, or JSON wrapped in a single markdown code fence
 *      (surrounding whitespace is ignored);
 *   2. a fenced block embedded in surrounding prose.
 *
 * @param {string} raw - Raw LLM response text
 * @returns {*} The parsed JSON value
 * @throws {TypeError} When `raw` is not a string
 * @throws {SyntaxError} When no candidate parses as JSON
 */
function parseJson(raw) {
  if (typeof raw !== 'string') {
    throw new TypeError('parseJson expects a string response');
  }

  // Trim first: the fence patterns are anchored to the start and end of the text.
  const trimmed = raw.trim();
  const unfenced = trimmed
    .replace(/^```(?:json)?\s*/i, '')
    .replace(/\s*```$/, '')
    .trim();

  try {
    return JSON.parse(unfenced);
  } catch (primaryErr) {
    // Fallback: the model wrapped a fenced block in explanatory prose.
    const embedded = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
    if (!embedded) throw primaryErr;
    try {
      return JSON.parse(embedded[1].trim());
    } catch {
      // Report the failure for the full response, which is what the caller sent.
      throw primaryErr;
    }
  }
}
|
|
85
|
+
|
|
86
|
+
// ── OpenAI call ───────────────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
/**
 * Send one chat-completion request to OpenAI in JSON-object response mode.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent as a Bearer token
 * @param {string} args.model - Model name to request
 * @param {string} args.systemMessage - System prompt
 * @param {string} args.userMessage - User prompt
 * @param {number} args.maxTokens - Sent as `max_tokens`
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 * @throws {Error} On a non-2xx response (message carries the status and up to 200 chars of the body)
 */
async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const endpoint = `${openaiBaseUrl()}/v1/chat/completions`;
  const payload = JSON.stringify({
    model,
    messages: [
      { role: 'system', content: systemMessage },
      { role: 'user', content: userMessage }
    ],
    max_tokens: maxTokens,
    response_format: { type: 'json_object' }
  });

  const res = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`
    },
    body: payload,
    signal: AbortSignal.timeout(120_000)
  });

  if (!res.ok) {
    // Reading the error body is best-effort; an unreadable body becomes ''.
    const detail = await res.text().catch(() => '');
    throw new Error(`OpenAI API error ${res.status}: ${detail.slice(0, 200)}`);
  }

  const data = await res.json();
  return {
    rawText: data.choices?.[0]?.message?.content ?? '',
    usage: {
      input_tokens: data.usage?.prompt_tokens ?? 0,
      output_tokens: data.usage?.completion_tokens ?? 0
    },
    model: data.model || model
  };
}
|
|
123
|
+
|
|
124
|
+
// ── Anthropic call ────────────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
/**
 * Send one Messages API request to Anthropic.
 *
 * The response's `content` is an array of blocks. Every block that carries a
 * string `text` is concatenated, so the answer is still recovered when the
 * first block is not text or when the text is split over several blocks.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent in the `x-api-key` header
 * @param {string} args.model - Model name to request
 * @param {string} args.systemMessage - System prompt
 * @param {string} args.userMessage - User prompt
 * @param {number} args.maxTokens - Sent as `max_tokens`
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 * @throws {Error} On a non-2xx response (message carries the status and up to 200 chars of the body)
 */
async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const url = `${anthropicBaseUrl()}/v1/messages`;
  const body = {
    model,
    system: systemMessage,
    messages: [{ role: 'user', content: userMessage }],
    max_tokens: maxTokens
  };

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01'
    },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(120_000)
  });

  if (!response.ok) {
    // Reading the error body is best-effort; an unreadable body becomes ''.
    const errText = await response.text().catch(() => '');
    throw new Error(`Anthropic API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const json = await response.json();
  const blocks = Array.isArray(json.content) ? json.content : [];
  // Keep only blocks that actually carry text; other block kinds have no `text`.
  const content = blocks
    .filter((block) => typeof block?.text === 'string')
    .map((block) => block.text)
    .join('');
  const usage = {
    input_tokens: json.usage?.input_tokens ?? 0,
    output_tokens: json.usage?.output_tokens ?? 0
  };
  return { rawText: content, usage, model: json.model || model };
}
|
|
159
|
+
|
|
160
|
+
// ── LLM dispatch ─────────────────────────────────────────────────────────────
|
|
161
|
+
|
|
162
|
+
/**
 * Route an LLM request to the provider-specific caller.
 *
 * 'openai' goes to callOpenAI; every other provider value goes to callAnthropic.
 *
 * @param {Object} args
 * @param {string} args.provider - Resolved provider name
 * @param {string} args.apiKey
 * @param {string} args.model
 * @param {string} args.systemMessage
 * @param {string} args.userMessage
 * @param {number} args.maxTokens
 * @returns {Promise<Object>} Whatever the selected caller resolves with
 */
async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
  const request = { apiKey, model, systemMessage, userMessage, maxTokens };
  const send = provider === 'openai' ? callOpenAI : callAnthropic;
  return send(request);
}
|
|
168
|
+
|
|
169
|
+
// ── Tool class ────────────────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
/**
 * MCP tool that extracts structured JSON from a page (or supplied text) by
 * prompting an LLM. Failures are reported as `{ success: false, error }`
 * results rather than thrown.
 */
export class ExtractWithLlm {
  /**
   * @param {Object} [config] - Tool configuration; stored on the instance as-is.
   */
  constructor(config = {}) {
    this.config = config;
  }

  /**
   * Execute LLM-powered extraction.
   *
   * Flow: validate input → resolve provider/key → obtain text (fetch `url`, or
   * use `content`) → call the LLM → parse the reply as JSON, retrying once with
   * a stricter instruction when the first reply is not valid JSON.
   *
   * @param {Object} params
   * @param {string} [params.url] - URL to fetch (at least one of url/content required; url wins when both are given)
   * @param {string} [params.content] - Pre-fetched text content
   * @param {string} params.prompt - Natural-language extraction instruction
   * @param {Object} [params.schema] - Optional JSON-schema-like output hint
   * @param {string} [params.provider] - 'openai' | 'anthropic' | 'auto' (default 'auto')
   * @param {string} [params.model] - Override default model
   * @param {number} [params.maxTokens] - Max output tokens (default 4096)
   * @returns {Promise<Object>} `{ success: true, data, provider, model, usage }` on success,
   *   otherwise `{ success: false, error }` (plus `raw` when the retry reply was not JSON)
   */
  async execute(params = {}) {
    // `params ?? {}` also covers an explicit null, which default parameters do not.
    const {
      url,
      content,
      prompt,
      schema,
      provider: providerParam = 'auto',
      model: modelParam,
      maxTokens = 4096
    } = params ?? {};

    // Validate: at least one of url or content must be provided
    if (!url && !content) {
      return {
        success: false,
        error: 'extract_with_llm: either "url" or "content" must be provided'
      };
    }
    if (!prompt) {
      return { success: false, error: 'extract_with_llm: "prompt" is required' };
    }

    // Resolve provider + API key; a missing key becomes a structured error.
    let resolved;
    try {
      resolved = resolveProvider(providerParam);
    } catch (err) {
      return { success: false, error: err.message };
    }

    const { provider, apiKey } = resolved;
    const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
    const model = modelParam || defaultModel;

    // Step 1: Get text to extract from
    let text;
    try {
      if (url) {
        const { textContent } = await fetchAndParse(url);
        text = textContent;
      } else {
        text = content;
      }
    } catch (fetchErr) {
      return { success: false, error: `Failed to fetch content: ${fetchErr.message}` };
    }

    // A fetch that yields no text must fail cleanly instead of crashing the
    // message builder further down.
    if (text == null) {
      return { success: false, error: 'Failed to fetch content: no text content was returned' };
    }

    const systemMessage =
      "You extract structured data from web content per the user's instructions. Return JSON only.";

    const userMessage = buildUserMessage(prompt, text, schema);

    // Step 2: First LLM call
    let rawText;
    let usage;
    try {
      ({ rawText, usage } = await callLLM({
        provider, apiKey, model, systemMessage, userMessage, maxTokens
      }));
    } catch (llmErr) {
      return { success: false, error: `LLM call failed: ${llmErr.message}` };
    }

    // Step 3: Parse JSON; retry once with stricter prompt if it fails
    let parsed;
    try {
      parsed = parseJson(rawText);
    } catch (_parseErr) {
      const retryUserMessage =
        `${userMessage}\n\nIMPORTANT: Your previous response was not valid JSON. ` +
        'Respond with ONLY a JSON object or array. No explanation, no markdown fences.';

      let retryRaw;
      let retryUsage;
      try {
        ({ rawText: retryRaw, usage: retryUsage } = await callLLM({
          provider, apiKey, model, systemMessage,
          userMessage: retryUserMessage, maxTokens
        }));
        // Report the combined token usage of both attempts.
        usage = {
          input_tokens: usage.input_tokens + retryUsage.input_tokens,
          output_tokens: usage.output_tokens + retryUsage.output_tokens
        };
      } catch (retryLlmErr) {
        return { success: false, error: `LLM retry call failed: ${retryLlmErr.message}` };
      }

      try {
        parsed = parseJson(retryRaw);
      } catch (_retryParseErr) {
        return {
          success: false,
          error: 'LLM did not return valid JSON after retry',
          raw: retryRaw.slice(0, 500)
        };
      }
    }

    return {
      success: true,
      data: parsed,
      provider,
      model,
      usage
    };
  }
}

export default ExtractWithLlm;
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Executes searches against a self-hosted SearXNG instance via its JSON API.
|
|
5
|
+
* Instance URL is read from the CRAWLFORGE_SEARXNG_URL environment variable.
|
|
6
|
+
*
|
|
7
|
+
* SearXNG JSON API reference:
|
|
8
|
+
* https://docs.searxng.org/dev/search_api.html
|
|
9
|
+
*
|
|
10
|
+
* Result shape is normalised to match the CrawlForge/Google adapter format so
|
|
11
|
+
* the rest of the search pipeline (ranking, deduplication, caching) is unaffected.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Map a single SearXNG result object to the internal item shape used throughout
 * the search pipeline.
 *
 *   SearXNG field  →  internal field
 *   title          →  title
 *   url            →  link, formattedUrl, displayLink (hostname only)
 *   content        →  snippet, htmlSnippet
 *   (all others)   →  ignored / defaulted
 *
 * A missing or non-object entry yields an item with empty-string fields, so a
 * single malformed entry cannot abort normalisation of a whole result list.
 *
 * @param {Object} result - Raw SearXNG result entry
 * @returns {Object} Normalised item
 */
export function normalizeSearxngResult(result) {
  const entry = result !== null && typeof result === 'object' ? result : {};
  const url = entry.url || '';

  let displayLink;
  try {
    displayLink = new URL(url).hostname;
  } catch {
    // Not an absolute URL: fall back to showing the raw value.
    displayLink = url;
  }

  const snippet = entry.content || '';

  return {
    title: entry.title || '',
    link: url,
    snippet,
    displayLink,
    formattedUrl: url,
    htmlSnippet: snippet,
    pagemap: {},
    metadata: {
      mime: null,
      fileFormat: null,
      cacheId: null
    }
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Fetch search results from a SearXNG instance.
 *
 * The request goes to `<instance>/search` with `format=json`. An instance
 * mounted under a sub-path (e.g. `https://host/searxng`) keeps that prefix, and
 * an instance URL that already ends in `/search` is used as-is. Any query
 * string or fragment on the instance URL is dropped.
 *
 * @param {Object} opts
 * @param {string} opts.query - Search query string
 * @param {number} [opts.limit=10] - Maximum number of results to return
 * @param {number} [opts.page=1] - Page number (1-based)
 * @param {boolean} [opts.safeSearch=true] - Whether safe search is enabled
 * @param {string} [opts.language='en'] - Language code (e.g. 'en', 'de')
 * @param {string} [opts.instanceUrl] - Override for CRAWLFORGE_SEARXNG_URL
 * @param {number} [opts.timeoutMs=30000] - Abort the request after this many milliseconds
 * @returns {Promise<Object>} Results in the internal adapter format
 *   { items: Array, searchInformation: { totalResults, searchTime }, queries: {}, context: {} }
 * @throws {Error} When no instance URL is configured, the query is empty, the
 *   request fails or times out, the response is non-2xx, or the body is not JSON
 */
export async function searchViaSearxng(opts = {}) {
  const instanceUrl = opts.instanceUrl || process.env.CRAWLFORGE_SEARXNG_URL;

  if (!instanceUrl) {
    throw new Error(
      "provider 'searxng' requires CRAWLFORGE_SEARXNG_URL in environment"
    );
  }

  const {
    query,
    limit = 10,
    page = 1,
    safeSearch = true,
    language = 'en',
    timeoutMs = 30_000
  } = opts;

  // Without this guard a missing query would be sent as the literal "undefined".
  if (query == null || String(query).trim() === '') {
    throw new Error("provider 'searxng' requires a non-empty query");
  }

  // SearXNG safesearch: 0=off, 1=moderate, 2=strict
  const safesearch = safeSearch ? 1 : 0;

  // Append "search" to the instance path instead of resolving "/search" against
  // the origin, which would discard a sub-path mount.
  const url = new URL(instanceUrl);
  const basePath = url.pathname.replace(/\/+$/, '');
  url.pathname = basePath.endsWith('/search') ? basePath : `${basePath}/search`;
  url.search = '';
  url.hash = '';
  url.searchParams.set('q', String(query));
  url.searchParams.set('format', 'json');
  url.searchParams.set('pageno', String(page));
  url.searchParams.set('safesearch', String(safesearch));
  url.searchParams.set('language', language);

  let response;
  try {
    response = await fetch(url.toString(), {
      headers: { Accept: 'application/json' },
      // Bound the wait so an unresponsive instance cannot hang the search.
      signal: AbortSignal.timeout(timeoutMs)
    });
  } catch (err) {
    throw new Error(`SearXNG request failed: ${err.message}`, { cause: err });
  }

  if (!response.ok) {
    throw new Error(
      `SearXNG returned HTTP ${response.status}: ${response.statusText}`
    );
  }

  let data;
  try {
    data = await response.json();
  } catch (err) {
    throw new Error('SearXNG returned invalid JSON', { cause: err });
  }

  const rawResults = Array.isArray(data?.results) ? data.results : [];
  const items = rawResults.slice(0, limit).map(normalizeSearxngResult);

  return {
    items,
    searchInformation: {
      totalResults: String(rawResults.length),
      // No timing is read from the response; always reported as 0.
      searchTime: 0
    },
    queries: {},
    context: {}
  };
}
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Advanced search result deduplication system using multiple similarity algorithms
|
|
4
|
+
* Advanced search result deduplication system using multiple similarity algorithms.
|
|
5
|
+
* Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
|
|
6
|
+
* creating a duplicate CacheManager when used alongside ResultRanker.
|
|
5
7
|
*/
|
|
6
8
|
export class ResultDeduplicator {
|
|
7
9
|
constructor(options = {}) {
|
|
10
|
+
const { sharedCache, ...serializableOptions } = options;
|
|
8
11
|
this.options = {
|
|
9
12
|
// Similarity thresholds
|
|
10
13
|
thresholds: {
|
|
@@ -13,7 +16,7 @@ export class ResultDeduplicator {
|
|
|
13
16
|
content: 0.85, // Content similarity threshold
|
|
14
17
|
combined: 0.8 // Combined similarity threshold for final decision
|
|
15
18
|
},
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
// Deduplication strategies
|
|
18
21
|
strategies: {
|
|
19
22
|
urlNormalization: true, // Normalize URLs for comparison
|
|
@@ -21,7 +24,7 @@ export class ResultDeduplicator {
|
|
|
21
24
|
contentSimhash: true, // Use SimHash for content comparison
|
|
22
25
|
domainClustering: true // Cluster results by domain
|
|
23
26
|
},
|
|
24
|
-
|
|
27
|
+
|
|
25
28
|
// URL normalization options
|
|
26
29
|
urlNormalization: {
|
|
27
30
|
removeProtocol: true, // Remove http/https difference
|
|
@@ -32,7 +35,7 @@ export class ResultDeduplicator {
|
|
|
32
35
|
removeEmptyParams: true, // Remove empty query parameters
|
|
33
36
|
lowercaseDomain: true // Convert domain to lowercase
|
|
34
37
|
},
|
|
35
|
-
|
|
38
|
+
|
|
36
39
|
// Content similarity options
|
|
37
40
|
contentSimilarity: {
|
|
38
41
|
minLength: 10, // Minimum content length to compare
|
|
@@ -40,7 +43,7 @@ export class ResultDeduplicator {
|
|
|
40
43
|
simhashBits: 64, // SimHash bit size
|
|
41
44
|
hammingThreshold: 16 // Hamming distance threshold for SimHash
|
|
42
45
|
},
|
|
43
|
-
|
|
46
|
+
|
|
44
47
|
// Merge strategy
|
|
45
48
|
mergeStrategy: {
|
|
46
49
|
preserveBestRank: true, // Keep the best ranking result as primary
|
|
@@ -48,17 +51,21 @@ export class ResultDeduplicator {
|
|
|
48
51
|
preferHttps: true, // Prefer HTTPS URLs when merging
|
|
49
52
|
preferShorterUrl: true // Prefer shorter, cleaner URLs
|
|
50
53
|
},
|
|
51
|
-
|
|
54
|
+
|
|
52
55
|
// Performance options
|
|
53
56
|
cacheEnabled: true,
|
|
54
57
|
cacheTTL: 3600000, // 1 hour
|
|
55
|
-
...
|
|
58
|
+
...serializableOptions
|
|
56
59
|
};
|
|
57
60
|
|
|
58
|
-
//
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
// Use shared cache if provided, otherwise create own CacheManager instance.
|
|
62
|
+
// sharedCache is held separately — never in this.options — because it holds
|
|
63
|
+
// a setInterval Timer that would create a circular reference when the
|
|
64
|
+
// options object is JSON.stringify'd to build a cache key (see generateKey).
|
|
65
|
+
this.cache = sharedCache || (this.options.cacheEnabled
|
|
66
|
+
? new CacheManager({ ttl: this.options.cacheTTL })
|
|
67
|
+
: null);
|
|
68
|
+
|
|
62
69
|
// Statistics tracking
|
|
63
70
|
this.stats = {
|
|
64
71
|
totalProcessed: 0,
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Advanced search result ranking system with multiple scoring algorithms
|
|
4
|
+
* Advanced search result ranking system with multiple scoring algorithms.
|
|
5
|
+
* Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
|
|
6
|
+
* creating a duplicate CacheManager when used alongside ResultDeduplicator.
|
|
5
7
|
*/
|
|
6
8
|
export class ResultRanker {
|
|
7
9
|
constructor(options = {}) {
|
|
10
|
+
const { sharedCache, ...serializableOptions } = options;
|
|
8
11
|
this.options = {
|
|
9
12
|
// Ranking weight configuration
|
|
10
13
|
weights: {
|
|
@@ -13,13 +16,13 @@ export class ResultRanker {
|
|
|
13
16
|
authority: 0.2, // URL/domain authority
|
|
14
17
|
freshness: 0.1 // Content freshness
|
|
15
18
|
},
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
// BM25 parameters
|
|
18
21
|
bm25: {
|
|
19
22
|
k1: 1.5, // Term frequency saturation parameter
|
|
20
23
|
b: 0.75 // Length normalization parameter
|
|
21
24
|
},
|
|
22
|
-
|
|
25
|
+
|
|
23
26
|
// Authority scoring parameters
|
|
24
27
|
authority: {
|
|
25
28
|
domainBoosts: { // Domain authority boosts
|
|
@@ -32,23 +35,27 @@ export class ResultRanker {
|
|
|
32
35
|
httpsBoost: 0.1, // HTTPS boost
|
|
33
36
|
pathDepthPenalty: 0.02 // Penalty per path segment
|
|
34
37
|
},
|
|
35
|
-
|
|
38
|
+
|
|
36
39
|
// Freshness parameters
|
|
37
40
|
freshness: {
|
|
38
41
|
maxAgeMonths: 24, // Content older than this gets 0 freshness score
|
|
39
42
|
decayRate: 0.1 // Exponential decay rate per month
|
|
40
43
|
},
|
|
41
|
-
|
|
44
|
+
|
|
42
45
|
// Performance options
|
|
43
46
|
cacheEnabled: true,
|
|
44
47
|
cacheTTL: 3600000, // 1 hour
|
|
45
|
-
...
|
|
48
|
+
...serializableOptions
|
|
46
49
|
};
|
|
47
50
|
|
|
48
|
-
//
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
// Use shared cache if provided, otherwise create own CacheManager instance.
|
|
52
|
+
// sharedCache is held separately — never in this.options — because it holds
|
|
53
|
+
// a setInterval Timer that would create a circular reference when the
|
|
54
|
+
// options object is JSON.stringify'd to build a cache key (see generateKey).
|
|
55
|
+
this.cache = sharedCache || (this.options.cacheEnabled
|
|
56
|
+
? new CacheManager({ ttl: this.options.cacheTTL })
|
|
57
|
+
: null);
|
|
58
|
+
|
|
52
59
|
// Precompute domain authority scores
|
|
53
60
|
this.domainAuthorityMap = new Map();
|
|
54
61
|
this.initializeDomainAuthority();
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearchResultCache — unified cache layer for search ranking and deduplication.
|
|
3
|
+
*
|
|
4
|
+
* Both ResultRanker and ResultDeduplicator previously held separate CacheManager
|
|
5
|
+
* instances with identical TTL configuration. This module provides a single
|
|
6
|
+
* shared cache they can both use, halving the number of LRU cache instances
|
|
7
|
+
* created per SearchWebTool instantiation.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* const cache = new SearchResultCache({ ttl: 3600000 });
|
|
11
|
+
* // pass to ResultRanker and ResultDeduplicator via options.sharedCache
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Thin wrapper around a single CacheManager so the ranker and deduplicator can
 * share one cache. When constructed with `enabled: false` no CacheManager is
 * created and every operation is a no-op.
 */
export class SearchResultCache {
  /**
   * @param {Object} [options]
   * @param {number} [options.ttl=3600000] — cache TTL in milliseconds
   * @param {boolean} [options.enabled=true] — disable to skip caching
   */
  constructor(options = {}) {
    const { ttl = 3600000, enabled = true } = options;
    this.enabled = enabled;
    this._cache = null;
    if (enabled) {
      this._cache = new CacheManager({ ttl });
    }
  }

  /** Look up `key`; resolves to undefined when caching is disabled. */
  async get(key) {
    const active = this.enabled && this._cache;
    return active ? this._cache.get(key) : undefined;
  }

  /** Store `value` under `key`; does nothing when caching is disabled. */
  async set(key, value) {
    const active = this.enabled && this._cache;
    if (active) {
      return this._cache.set(key, value);
    }
    return undefined;
  }

  /** Delegate key generation to the underlying cache; null when there is none. */
  generateKey(namespace, descriptor) {
    return this._cache ? this._cache.generateKey(namespace, descriptor) : null;
  }

  /** Stats of the underlying cache, or null when there is none. */
  getStats() {
    if (!this._cache) {
      return null;
    }
    return this._cache.getStats();
  }
}

export default SearchResultCache;
|