crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract With LLM MCP Tool
|
|
3
|
+
* Natural-language extraction powered by OpenAI or Anthropic.
|
|
4
|
+
* Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
|
|
5
|
+
*
|
|
6
|
+
* Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
|
|
7
|
+
* Gate: the tool returns a clear `{ success: false, error }` result when neither key is present.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { fetchAndParse } from './_fetchAndParse.js';
|
|
11
|
+
|
|
12
|
+
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
const MAX_INPUT_CHARS = 50_000;
|
|
15
|
+
|
|
16
|
+
const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
|
|
17
|
+
const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
|
|
18
|
+
|
|
19
|
+
// Support test-time overrides so the test suite can stub endpoints.
|
|
20
|
+
/**
 * Base URL for OpenAI API requests, without a trailing slash.
 *
 * Reads OPENAI_BASE_URL and falls back to the public endpoint when the
 * variable is unset or empty. The request code appends the full "/v1/..."
 * path itself, so a trailing "/v1" segment is removed here: the official
 * OpenAI SDKs expect OPENAI_BASE_URL to already include "/v1", and keeping
 * it would yield ".../v1/v1/chat/completions".
 *
 * @returns {string} Base URL with no trailing slash and no trailing "/v1".
 */
function openaiBaseUrl() {
  const configured = process.env.OPENAI_BASE_URL || 'https://api.openai.com';
  return configured
    .replace(/\/+$/, '') // tolerate one or more trailing slashes
    .replace(/\/v1$/, ''); // SDK-style base URL → API root
}
|
|
23
|
+
/**
 * Base URL for Anthropic API requests.
 *
 * Uses ANTHROPIC_BASE_URL when it is set to a non-empty value, otherwise the
 * public endpoint. A single trailing slash is dropped so callers can append
 * a path that starts with "/".
 *
 * @returns {string}
 */
function anthropicBaseUrl() {
  const override = process.env.ANTHROPIC_BASE_URL;
  const base = override ? override : 'https://api.anthropic.com';
  return base.endsWith('/') ? base.slice(0, -1) : base;
}
|
|
26
|
+
|
|
27
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Pick the LLM provider and its API key from the environment.
 *
 * - 'auto': Anthropic when ANTHROPIC_API_KEY is set, otherwise OpenAI when
 *   OPENAI_API_KEY is set.
 * - 'anthropic' / 'openai': that provider, provided its key is set.
 *
 * @param {'openai'|'anthropic'|'auto'} provider
 * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
 * @throws {Error} When the required key is missing or the provider name is
 *   not one of the three accepted values.
 */
function resolveProvider(provider) {
  const credentials = {
    anthropic: process.env.ANTHROPIC_API_KEY,
    openai: process.env.OPENAI_API_KEY
  };

  switch (provider) {
    case 'auto': {
      // Listed in priority order: Anthropic wins when both keys exist.
      const available = ['anthropic', 'openai'].find((name) => credentials[name]);
      if (available === undefined) {
        throw new Error(
          'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
        );
      }
      return { provider: available, apiKey: credentials[available] };
    }

    case 'anthropic':
      if (!credentials.anthropic) {
        throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
      }
      return { provider: 'anthropic', apiKey: credentials.anthropic };

    case 'openai':
      if (!credentials.openai) {
        throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
      }
      return { provider: 'openai', apiKey: credentials.openai };

    default:
      throw new Error(`extract_with_llm: unknown provider "${provider}"`);
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * Build the user message text that goes to the LLM.
 *
 * Layout (sections separated by a blank line):
 *   1. the extraction instruction,
 *   2. an optional schema hint (only when `schema` has at least one key),
 *   3. the page content, cut to MAX_INPUT_CHARS with a "[...truncated]" marker,
 *   4. a closing "Return only valid JSON." reminder.
 *
 * @param {string} userPrompt - Natural-language extraction instruction.
 * @param {string} text - Page text; null/undefined is treated as empty text.
 * @param {Object} [schema] - Optional output schema hint, serialised as JSON.
 * @returns {string}
 */
function buildUserMessage(userPrompt, text, schema) {
  // `text.length` would throw on null/undefined; treat a missing body as
  // empty text so the caller gets a message instead of a TypeError.
  const source = typeof text === 'string' ? text : String(text ?? '');
  const pageContent =
    source.length > MAX_INPUT_CHARS
      ? `${source.slice(0, MAX_INPUT_CHARS)}\n[...truncated]`
      : source;

  const sections = [`Extraction instruction: ${userPrompt}`];
  if (schema && Object.keys(schema).length > 0) {
    sections.push(`Output schema hint:\n${JSON.stringify(schema, null, 2)}`);
  }
  sections.push(`Web page content:\n${pageContent}`, 'Return only valid JSON.');

  return sections.join('\n\n');
}
|
|
71
|
+
|
|
72
|
+
/**
 * Parse JSON from an LLM response string defensively.
 *
 * Candidates are tried in order:
 *   1. the trimmed response with a leading ```/```json fence and a trailing
 *      ``` fence removed (a plain JSON response passes through unchanged);
 *   2. the contents of the first fenced block found anywhere in the response,
 *      which covers replies that wrap the JSON in explanatory prose.
 *
 * Trimming happens before fence stripping so that a fenced reply preceded by
 * whitespace or a newline is still recognised.
 *
 * @param {string} raw - Raw LLM response text.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} The error from the first candidate when none parses.
 */
function parseJson(raw) {
  const text = String(raw ?? '').trim();

  const unfenced = text
    .replace(/^```(?:json)?\s*/i, '')
    .replace(/\s*```$/, '')
    .trim();
  const embedded = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i)?.[1];

  const candidates = embedded === undefined ? [unfenced] : [unfenced, embedded];

  let firstError;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch (err) {
      // Keep the earliest failure: it describes the response as a whole.
      if (firstError === undefined) firstError = err;
    }
  }
  throw firstError;
}
|
|
85
|
+
|
|
86
|
+
// ── OpenAI call ───────────────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
/**
 * Send one chat-completion request to OpenAI and return the reply text.
 *
 * The request asks for JSON output (`response_format: json_object`) and is
 * aborted after 120 seconds.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent as a Bearer token.
 * @param {string} args.model - Model name to request.
 * @param {string} args.systemMessage - System prompt.
 * @param {string} args.userMessage - User prompt.
 * @param {number} args.maxTokens - Output token limit.
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 *   `rawText` is '' when the response carries no message content; `model` is
 *   the model reported by the API, or the requested one when it reports none.
 * @throws {Error} On a non-2xx response (message includes the status and up
 *   to 200 characters of the response body).
 */
async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const endpoint = `${openaiBaseUrl()}/v1/chat/completions`;
  const payload = {
    model,
    messages: [
      { role: 'system', content: systemMessage },
      { role: 'user', content: userMessage }
    ],
    max_tokens: maxTokens,
    response_format: { type: 'json_object' }
  };
  const requestInit = {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`
    },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(120_000)
  };

  const res = await fetch(endpoint, requestInit);

  if (!res.ok) {
    // The body is best-effort diagnostics only; an unreadable body becomes ''.
    const detail = await res.text().catch(() => '');
    throw new Error(`OpenAI API error ${res.status}: ${detail.slice(0, 200)}`);
  }

  const data = await res.json();
  const reported = data.usage;

  return {
    rawText: data.choices?.[0]?.message?.content ?? '',
    usage: {
      input_tokens: reported?.prompt_tokens ?? 0,
      output_tokens: reported?.completion_tokens ?? 0
    },
    model: data.model || model
  };
}
|
|
123
|
+
|
|
124
|
+
// ── Anthropic call ────────────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
/**
 * Send one Messages API request to Anthropic and return the reply text.
 *
 * The request is aborted after 120 seconds.
 *
 * @param {Object} args
 * @param {string} args.apiKey - Sent in the `x-api-key` header.
 * @param {string} args.model - Model name to request.
 * @param {string} args.systemMessage - System prompt.
 * @param {string} args.userMessage - User prompt.
 * @param {number} args.maxTokens - Output token limit.
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 *   `rawText` is the concatenation of every content block that carries a
 *   string `text` ('' when there is none); `model` is the model reported by
 *   the API, or the requested one when it reports none.
 * @throws {Error} On a non-2xx response (message includes the status and up
 *   to 200 characters of the response body).
 */
async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
  const url = `${anthropicBaseUrl()}/v1/messages`;
  const body = {
    model,
    system: systemMessage,
    messages: [{ role: 'user', content: userMessage }],
    max_tokens: maxTokens
  };

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01'
    },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(120_000)
  });

  if (!response.ok) {
    // The body is best-effort diagnostics only; an unreadable body becomes ''.
    const errText = await response.text().catch(() => '');
    throw new Error(`Anthropic API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  const json = await response.json();

  // `content` is a list of blocks. Reading only content[0].text returned ''
  // whenever the first block was not a text block and dropped any later text
  // blocks, so gather every block that actually carries text.
  const blocks = Array.isArray(json.content) ? json.content : [];
  const content = blocks
    .filter((block) => typeof block?.text === 'string')
    .map((block) => block.text)
    .join('');

  const usage = {
    input_tokens: json.usage?.input_tokens ?? 0,
    output_tokens: json.usage?.output_tokens ?? 0
  };
  return { rawText: content, usage, model: json.model || model };
}
|
|
159
|
+
|
|
160
|
+
// ── LLM dispatch ─────────────────────────────────────────────────────────────
|
|
161
|
+
|
|
162
|
+
/**
 * Route a request to the provider-specific client.
 *
 * 'openai' goes to callOpenAI; every other provider value goes to
 * callAnthropic.
 *
 * @param {Object} args
 * @param {string} args.provider - Provider name.
 * @param {string} args.apiKey
 * @param {string} args.model
 * @param {string} args.systemMessage
 * @param {string} args.userMessage
 * @param {number} args.maxTokens
 * @returns {Promise<Object>} Whatever the selected client resolves to.
 */
async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
  const request = { apiKey, model, systemMessage, userMessage, maxTokens };
  const send = provider === 'openai' ? callOpenAI : callAnthropic;
  return send(request);
}
|
|
168
|
+
|
|
169
|
+
// ── Tool class ────────────────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
/**
 * MCP tool: extract structured JSON from a web page (or supplied text) by
 * describing the desired data in natural language.
 */
export class ExtractWithLlm {
  /**
   * @param {Object} [config] - Tool configuration; stored unchanged on `this.config`.
   */
  constructor(config = {}) {
    this.config = config;
  }

  /**
   * Execute LLM-powered extraction.
   *
   * Never rejects for expected failures: validation, provider, fetch, LLM and
   * parse problems are all reported as `{ success: false, error }`.
   *
   * @param {Object} params
   * @param {string} [params.url] - URL to fetch (at least one of url/content
   *   is required; `url` takes precedence when both are given)
   * @param {string} [params.content] - Pre-fetched text content
   * @param {string} params.prompt - Natural-language extraction instruction
   * @param {Object} [params.schema] - Optional JSON-schema-like output hint
   * @param {string} [params.provider] - 'openai' | 'anthropic' | 'auto' (default 'auto')
   * @param {string} [params.model] - Override default model
   * @param {number} [params.maxTokens] - Max output tokens (default 4096)
   * @returns {Promise<Object>} On success
   *   `{ success: true, data, provider, model, usage }`, where `model` is the
   *   requested model name and `usage` sums both calls when a retry happened.
   *   On failure `{ success: false, error }`; failures that occur after an
   *   LLM call also carry `usage`, and the final parse failure carries `raw`
   *   (first 500 characters of the last response).
   */
  async execute(params = {}) {
    const {
      url,
      content,
      prompt,
      schema,
      provider: providerParam = 'auto',
      model: modelParam,
      maxTokens = 4096
    } = params ?? {};

    // Validate: at least one of url or content must be provided.
    if (!url && !content) {
      return {
        success: false,
        error: 'extract_with_llm: either "url" or "content" must be provided'
      };
    }
    if (!prompt) {
      return { success: false, error: 'extract_with_llm: "prompt" is required' };
    }

    // Resolve provider + API key; a missing key is reported, not thrown.
    let resolved;
    try {
      resolved = resolveProvider(providerParam);
    } catch (err) {
      return { success: false, error: err.message };
    }

    const { provider, apiKey } = resolved;
    const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
    const model = modelParam || defaultModel;

    // Step 1: Get text to extract from
    let sourceText;
    try {
      if (url) {
        const { textContent } = await fetchAndParse(url);
        sourceText = textContent;
      } else {
        sourceText = content;
      }
    } catch (fetchErr) {
      return { success: false, error: `Failed to fetch content: ${fetchErr.message}` };
    }

    // A page without extractable text used to crash message building outside
    // any try/catch; report it instead, and skip a pointless LLM call.
    const text = sourceText == null ? '' : String(sourceText);
    if (text.trim() === '') {
      return {
        success: false,
        error: 'extract_with_llm: no text content available to extract from'
      };
    }

    const systemMessage =
      'You extract structured data from web content per the user\'s instructions. Return JSON only.';

    const userMessage = buildUserMessage(prompt, text, schema);

    // Step 2: First LLM call
    let rawText, usage;
    try {
      ({ rawText, usage } = await callLLM({
        provider, apiKey, model, systemMessage, userMessage, maxTokens
      }));
    } catch (llmErr) {
      return { success: false, error: `LLM call failed: ${llmErr.message}` };
    }

    // Step 3: Parse JSON; retry once with stricter prompt if it fails
    let parsed;
    try {
      parsed = parseJson(rawText);
    } catch (_parseErr) {
      const retryUserMessage =
        `${userMessage}\n\nIMPORTANT: Your previous response was not valid JSON. ` +
        'Respond with ONLY a JSON object or array. No explanation, no markdown fences.';
      let retryRaw, retryUsage;
      try {
        ({ rawText: retryRaw, usage: retryUsage } = await callLLM({
          provider, apiKey, model, systemMessage,
          userMessage: retryUserMessage, maxTokens
        }));
        // Report the combined cost of both calls.
        usage = {
          input_tokens: usage.input_tokens + retryUsage.input_tokens,
          output_tokens: usage.output_tokens + retryUsage.output_tokens
        };
      } catch (retryLlmErr) {
        // The first call was still billed, so keep its usage in the result.
        return {
          success: false,
          error: `LLM retry call failed: ${retryLlmErr.message}`,
          usage
        };
      }

      try {
        parsed = parseJson(retryRaw);
      } catch (_retryParseErr) {
        return {
          success: false,
          error: 'LLM did not return valid JSON after retry',
          raw: retryRaw.slice(0, 500),
          usage
        };
      }
    }

    return {
      success: true,
      data: parsed,
      provider,
      model,
      usage
    };
  }
}
|
|
294
|
+
|
|
295
|
+
export default ExtractWithLlm;
|
|
@@ -208,11 +208,20 @@ export class DeepResearchTool {
|
|
|
208
208
|
baseConfig.llmConfig = params.llmConfig;
|
|
209
209
|
}
|
|
210
210
|
|
|
211
|
-
//
|
|
211
|
+
// Every approach must propagate the user's scope params (maxUrls,
|
|
212
|
+
// timeLimit, concurrency) — only `broad` did before, so non-broad
|
|
213
|
+
// approaches silently fell back to orchestrator defaults.
|
|
214
|
+
const scopeConfig = {
|
|
215
|
+
maxUrls: params.maxUrls,
|
|
216
|
+
timeLimit: params.timeLimit,
|
|
217
|
+
concurrency: params.concurrency
|
|
218
|
+
};
|
|
219
|
+
|
|
212
220
|
switch (params.researchApproach) {
|
|
213
221
|
case 'academic':
|
|
214
222
|
return {
|
|
215
223
|
...baseConfig,
|
|
224
|
+
...scopeConfig,
|
|
216
225
|
maxDepth: Math.min(params.maxDepth, 8),
|
|
217
226
|
enableSourceVerification: true,
|
|
218
227
|
searchConfig: {
|
|
@@ -225,10 +234,11 @@ export class DeepResearchTool {
|
|
|
225
234
|
}
|
|
226
235
|
}
|
|
227
236
|
};
|
|
228
|
-
|
|
237
|
+
|
|
229
238
|
case 'current_events':
|
|
230
239
|
return {
|
|
231
240
|
...baseConfig,
|
|
241
|
+
...scopeConfig,
|
|
232
242
|
maxDepth: Math.min(params.maxDepth, 6),
|
|
233
243
|
searchConfig: {
|
|
234
244
|
enableRanking: true,
|
|
@@ -240,18 +250,20 @@ export class DeepResearchTool {
|
|
|
240
250
|
}
|
|
241
251
|
}
|
|
242
252
|
};
|
|
243
|
-
|
|
253
|
+
|
|
244
254
|
case 'focused':
|
|
245
255
|
return {
|
|
246
256
|
...baseConfig,
|
|
257
|
+
...scopeConfig,
|
|
247
258
|
maxDepth: Math.min(params.maxDepth, 4),
|
|
248
259
|
maxUrls: Math.min(params.maxUrls, 30),
|
|
249
260
|
concurrency: Math.min(params.concurrency, 3)
|
|
250
261
|
};
|
|
251
|
-
|
|
262
|
+
|
|
252
263
|
case 'comparative':
|
|
253
264
|
return {
|
|
254
265
|
...baseConfig,
|
|
266
|
+
...scopeConfig,
|
|
255
267
|
enableConflictDetection: true,
|
|
256
268
|
maxDepth: params.maxDepth,
|
|
257
269
|
searchConfig: {
|
|
@@ -263,14 +275,13 @@ export class DeepResearchTool {
|
|
|
263
275
|
}
|
|
264
276
|
}
|
|
265
277
|
};
|
|
266
|
-
|
|
278
|
+
|
|
267
279
|
case 'broad':
|
|
268
280
|
default:
|
|
269
281
|
return {
|
|
270
282
|
...baseConfig,
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
timeLimit: params.timeLimit
|
|
283
|
+
...scopeConfig,
|
|
284
|
+
maxDepth: params.maxDepth
|
|
274
285
|
};
|
|
275
286
|
}
|
|
276
287
|
}
|
|
@@ -334,6 +345,20 @@ export class DeepResearchTool {
|
|
|
334
345
|
* Format research results according to output preferences
|
|
335
346
|
*/
|
|
336
347
|
formatResults(results, params) {
|
|
348
|
+
// Raw evidence mode (no LLM configured): pass through the clean shape
|
|
349
|
+
// designed for the calling LLM to synthesize.
|
|
350
|
+
if (results.synthesisMode === 'raw_evidence') {
|
|
351
|
+
return {
|
|
352
|
+
synthesisMode: 'raw_evidence',
|
|
353
|
+
note: results.note,
|
|
354
|
+
sources: results.sources,
|
|
355
|
+
researchSummary: results.researchSummary,
|
|
356
|
+
metadata: results.metadata,
|
|
357
|
+
performance: results.performance,
|
|
358
|
+
activityLog: params.includeActivityLog ? results.activityLog : undefined
|
|
359
|
+
};
|
|
360
|
+
}
|
|
361
|
+
|
|
337
362
|
const formatted = {
|
|
338
363
|
researchSummary: results.researchSummary,
|
|
339
364
|
metadata: results.metadata
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Executes searches against a self-hosted SearXNG instance via its JSON API.
|
|
5
|
+
* Instance URL is read from the CRAWLFORGE_SEARXNG_URL environment variable.
|
|
6
|
+
*
|
|
7
|
+
* SearXNG JSON API reference:
|
|
8
|
+
* https://docs.searxng.org/dev/search_api.html
|
|
9
|
+
*
|
|
10
|
+
* Result shape is normalised to match the CrawlForge/Google adapter format so
|
|
11
|
+
* the rest of the search pipeline (ranking, deduplication, caching) is unaffected.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Convert one raw SearXNG result into the item shape used by the rest of the
 * search pipeline.
 *
 *   SearXNG field   → internal field(s)
 *   title           → title
 *   url             → link, formattedUrl, and displayLink (hostname only)
 *   content         → snippet, htmlSnippet
 *
 * Missing or empty fields become ''. When `url` cannot be parsed as a URL,
 * displayLink falls back to the raw url value. `pagemap` is always an empty
 * object and the `metadata` fields are always null.
 *
 * @param {Object} result - Raw SearXNG result entry
 * @returns {Object} Normalised item
 */
export function normalizeSearxngResult(result) {
  const link = result.url || '';

  const hostOrRaw = (value) => {
    try {
      return new URL(value).hostname;
    } catch {
      return value;
    }
  };
  const displayLink = hostOrRaw(link);

  const title = result.title || '';
  const snippet = result.content || '';

  return {
    title,
    link,
    snippet,
    displayLink,
    formattedUrl: link,
    htmlSnippet: snippet,
    pagemap: {},
    metadata: { mime: null, fileFormat: null, cacheId: null }
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Fetch search results from a SearXNG instance.
 *
 * The search endpoint is resolved relative to the instance URL, so an
 * instance mounted under a sub-path (e.g. https://host/searxng/) is queried
 * at https://host/searxng/search. An instance URL that already ends in
 * "/search" is used as the endpoint as-is. Any query string or fragment on
 * the instance URL is discarded.
 *
 * @param {Object} opts
 * @param {string} opts.query                - Search query string (required, non-empty)
 * @param {number} [opts.limit=10]           - Maximum number of results to return
 * @param {number} [opts.page=1]             - Page number (1-based)
 * @param {boolean} [opts.safeSearch=true]   - Whether safe search is enabled
 * @param {string} [opts.language='en']      - Language code (e.g. 'en', 'de')
 * @param {string} [opts.instanceUrl]        - Override for CRAWLFORGE_SEARXNG_URL
 * @param {number} [opts.timeoutMs=30000]    - Request timeout in milliseconds;
 *   a non-positive or non-finite value disables the timeout
 * @returns {Promise<Object>} Results in the internal adapter format
 *   { items: Array, searchInformation: { totalResults, searchTime }, queries: {}, context: {} }
 * @throws {Error} When no instance URL is configured, the query is missing,
 *   the request fails (original error on `cause`), the instance answers with
 *   a non-2xx status, or the body is not valid JSON.
 */
export async function searchViaSearxng(opts = {}) {
  const instanceUrl = opts.instanceUrl || process.env.CRAWLFORGE_SEARXNG_URL;

  if (!instanceUrl) {
    throw new Error(
      "provider 'searxng' requires CRAWLFORGE_SEARXNG_URL in environment"
    );
  }

  const {
    query,
    limit = 10,
    page = 1,
    safeSearch = true,
    language = 'en',
    timeoutMs = 30_000
  } = opts;

  // Without this check a missing query is sent to the instance as "q=undefined".
  if (typeof query !== 'string' || query.trim() === '') {
    throw new Error('SearXNG search requires a non-empty "query" string');
  }

  // SearXNG safesearch: 0=off, 1=moderate, 2=strict
  const safesearch = safeSearch ? 1 : 0;

  // Resolving '/search' against the instance URL would drop a sub-path mount,
  // so extend the instance's own path instead.
  const url = new URL(instanceUrl);
  const basePath = url.pathname.replace(/\/+$/, '');
  url.pathname = basePath.endsWith('/search') ? basePath : `${basePath}/search`;
  url.search = '';
  url.hash = '';

  url.searchParams.set('q', query);
  url.searchParams.set('format', 'json');
  url.searchParams.set('pageno', String(page));
  url.searchParams.set('safesearch', String(safesearch));
  url.searchParams.set('language', language);

  const requestInit = { headers: { Accept: 'application/json' } };
  if (Number.isFinite(timeoutMs) && timeoutMs > 0) {
    // Keep an unresponsive instance from stalling the caller indefinitely.
    requestInit.signal = AbortSignal.timeout(timeoutMs);
  }

  let response;
  try {
    response = await fetch(url.toString(), requestInit);
  } catch (err) {
    throw new Error(`SearXNG request failed: ${err.message}`, { cause: err });
  }

  if (!response.ok) {
    throw new Error(
      `SearXNG returned HTTP ${response.status}: ${response.statusText}`
    );
  }

  let data;
  try {
    data = await response.json();
  } catch (err) {
    throw new Error('SearXNG returned invalid JSON', { cause: err });
  }

  const rawResults = Array.isArray(data?.results) ? data.results : [];
  const items = rawResults.slice(0, limit).map(normalizeSearxngResult);

  return {
    items,
    searchInformation: {
      // Count of results on this page, before `limit` is applied.
      totalResults: String(rawResults.length),
      // No timing is derived from the response; always reported as 0.
      searchTime: 0
    },
    queries: {},
    context: {}
  };
}
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Advanced search result deduplication system using multiple similarity algorithms
|
|
4
|
+
* Advanced search result deduplication system using multiple similarity algorithms.
|
|
5
|
+
* Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
|
|
6
|
+
* creating a duplicate CacheManager when used alongside ResultRanker.
|
|
5
7
|
*/
|
|
6
8
|
export class ResultDeduplicator {
|
|
7
9
|
constructor(options = {}) {
|
|
10
|
+
const { sharedCache, ...serializableOptions } = options;
|
|
8
11
|
this.options = {
|
|
9
12
|
// Similarity thresholds
|
|
10
13
|
thresholds: {
|
|
@@ -13,7 +16,7 @@ export class ResultDeduplicator {
|
|
|
13
16
|
content: 0.85, // Content similarity threshold
|
|
14
17
|
combined: 0.8 // Combined similarity threshold for final decision
|
|
15
18
|
},
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
// Deduplication strategies
|
|
18
21
|
strategies: {
|
|
19
22
|
urlNormalization: true, // Normalize URLs for comparison
|
|
@@ -21,7 +24,7 @@ export class ResultDeduplicator {
|
|
|
21
24
|
contentSimhash: true, // Use SimHash for content comparison
|
|
22
25
|
domainClustering: true // Cluster results by domain
|
|
23
26
|
},
|
|
24
|
-
|
|
27
|
+
|
|
25
28
|
// URL normalization options
|
|
26
29
|
urlNormalization: {
|
|
27
30
|
removeProtocol: true, // Remove http/https difference
|
|
@@ -32,7 +35,7 @@ export class ResultDeduplicator {
|
|
|
32
35
|
removeEmptyParams: true, // Remove empty query parameters
|
|
33
36
|
lowercaseDomain: true // Convert domain to lowercase
|
|
34
37
|
},
|
|
35
|
-
|
|
38
|
+
|
|
36
39
|
// Content similarity options
|
|
37
40
|
contentSimilarity: {
|
|
38
41
|
minLength: 10, // Minimum content length to compare
|
|
@@ -40,7 +43,7 @@ export class ResultDeduplicator {
|
|
|
40
43
|
simhashBits: 64, // SimHash bit size
|
|
41
44
|
hammingThreshold: 16 // Hamming distance threshold for SimHash
|
|
42
45
|
},
|
|
43
|
-
|
|
46
|
+
|
|
44
47
|
// Merge strategy
|
|
45
48
|
mergeStrategy: {
|
|
46
49
|
preserveBestRank: true, // Keep the best ranking result as primary
|
|
@@ -48,17 +51,21 @@ export class ResultDeduplicator {
|
|
|
48
51
|
preferHttps: true, // Prefer HTTPS URLs when merging
|
|
49
52
|
preferShorterUrl: true // Prefer shorter, cleaner URLs
|
|
50
53
|
},
|
|
51
|
-
|
|
54
|
+
|
|
52
55
|
// Performance options
|
|
53
56
|
cacheEnabled: true,
|
|
54
57
|
cacheTTL: 3600000, // 1 hour
|
|
55
|
-
...
|
|
58
|
+
...serializableOptions
|
|
56
59
|
};
|
|
57
60
|
|
|
58
|
-
//
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
// Use shared cache if provided, otherwise create own CacheManager instance.
|
|
62
|
+
// sharedCache is held separately — never in this.options — because it holds
|
|
63
|
+
// a setInterval Timer that would create a circular reference when the
|
|
64
|
+
// options object is JSON.stringify'd to build a cache key (see generateKey).
|
|
65
|
+
this.cache = sharedCache || (this.options.cacheEnabled
|
|
66
|
+
? new CacheManager({ ttl: this.options.cacheTTL })
|
|
67
|
+
: null);
|
|
68
|
+
|
|
62
69
|
// Statistics tracking
|
|
63
70
|
this.stats = {
|
|
64
71
|
totalProcessed: 0,
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Advanced search result ranking system with multiple scoring algorithms
|
|
4
|
+
* Advanced search result ranking system with multiple scoring algorithms.
|
|
5
|
+
* Accepts an optional `sharedCache` (SearchResultCache instance) to avoid
|
|
6
|
+
* creating a duplicate CacheManager when used alongside ResultDeduplicator.
|
|
5
7
|
*/
|
|
6
8
|
export class ResultRanker {
|
|
7
9
|
constructor(options = {}) {
|
|
10
|
+
const { sharedCache, ...serializableOptions } = options;
|
|
8
11
|
this.options = {
|
|
9
12
|
// Ranking weight configuration
|
|
10
13
|
weights: {
|
|
@@ -13,13 +16,13 @@ export class ResultRanker {
|
|
|
13
16
|
authority: 0.2, // URL/domain authority
|
|
14
17
|
freshness: 0.1 // Content freshness
|
|
15
18
|
},
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
// BM25 parameters
|
|
18
21
|
bm25: {
|
|
19
22
|
k1: 1.5, // Term frequency saturation parameter
|
|
20
23
|
b: 0.75 // Length normalization parameter
|
|
21
24
|
},
|
|
22
|
-
|
|
25
|
+
|
|
23
26
|
// Authority scoring parameters
|
|
24
27
|
authority: {
|
|
25
28
|
domainBoosts: { // Domain authority boosts
|
|
@@ -32,23 +35,27 @@ export class ResultRanker {
|
|
|
32
35
|
httpsBoost: 0.1, // HTTPS boost
|
|
33
36
|
pathDepthPenalty: 0.02 // Penalty per path segment
|
|
34
37
|
},
|
|
35
|
-
|
|
38
|
+
|
|
36
39
|
// Freshness parameters
|
|
37
40
|
freshness: {
|
|
38
41
|
maxAgeMonths: 24, // Content older than this gets 0 freshness score
|
|
39
42
|
decayRate: 0.1 // Exponential decay rate per month
|
|
40
43
|
},
|
|
41
|
-
|
|
44
|
+
|
|
42
45
|
// Performance options
|
|
43
46
|
cacheEnabled: true,
|
|
44
47
|
cacheTTL: 3600000, // 1 hour
|
|
45
|
-
...
|
|
48
|
+
...serializableOptions
|
|
46
49
|
};
|
|
47
50
|
|
|
48
|
-
//
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
// Use shared cache if provided, otherwise create own CacheManager instance.
|
|
52
|
+
// sharedCache is held separately — never in this.options — because it holds
|
|
53
|
+
// a setInterval Timer that would create a circular reference when the
|
|
54
|
+
// options object is JSON.stringify'd to build a cache key (see generateKey).
|
|
55
|
+
this.cache = sharedCache || (this.options.cacheEnabled
|
|
56
|
+
? new CacheManager({ ttl: this.options.cacheTTL })
|
|
57
|
+
: null);
|
|
58
|
+
|
|
52
59
|
// Precompute domain authority scores
|
|
53
60
|
this.domainAuthorityMap = new Map();
|
|
54
61
|
this.initializeDomainAuthority();
|