searchfetch 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -13
- package/index.js +895 -197
- package/package.json +5 -4
- package/templates/crates-package.json +37 -0
- package/templates/docs-page.json +31 -0
- package/templates/duckduckgo-search.json +67 -0
- package/templates/github-issue.json +69 -0
- package/templates/github-repo.json +36 -0
- package/templates/google-search.json +75 -0
- package/templates/npm-package.json +49 -0
- package/templates/pypi-package.json +43 -0
package/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
+
// === STDOUT/STDERR REDIRECTION ===========================================
|
|
3
4
|
const originalStdoutWrite = process.stdout.write.bind(process.stdout);
|
|
4
5
|
process.stdout.write = (chunk, encoding, callback) => {
|
|
5
6
|
return process.stderr.write(chunk, encoding, callback);
|
|
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
|
|
|
7
8
|
console.log = (...args) => console.error(...args);
|
|
8
9
|
console.info = (...args) => console.error(...args);
|
|
9
10
|
|
|
11
|
+
// === IMPORTS =============================================================
|
|
12
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
13
|
+
import { dirname, join } from "node:path";
|
|
14
|
+
import { fileURLToPath } from "node:url";
|
|
10
15
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
11
16
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
12
17
|
import { z } from "zod";
|
|
@@ -14,6 +19,7 @@ import { launch, ensureBinary } from "cloakbrowser";
|
|
|
14
19
|
import * as cheerio from "cheerio";
|
|
15
20
|
import TurndownService from "turndown";
|
|
16
21
|
|
|
22
|
+
// === BROWSER MANAGER =====================================================
|
|
17
23
|
class BrowserManager {
|
|
18
24
|
constructor() {
|
|
19
25
|
this.browser = null;
|
|
@@ -67,225 +73,856 @@ const cleanup = async () => {
|
|
|
67
73
|
process.on("SIGINT", cleanup);
|
|
68
74
|
process.on("SIGTERM", cleanup);
|
|
69
75
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
76
|
+
// === TURNDOWN ============================================================
|
|
77
|
+
const turndown = new TurndownService({
|
|
78
|
+
headingStyle: "atx",
|
|
79
|
+
codeBlockStyle: "fenced",
|
|
80
|
+
emDelimiter: "*",
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
// === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
|
|
84
|
+
|
|
85
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
86
|
+
const __dirname = dirname(__filename);
|
|
87
|
+
const TEMPLATES_DIR = join(__dirname, "templates");
|
|
88
|
+
|
|
89
|
+
/**
 * Load every built-in template definition found in TEMPLATES_DIR.
 *
 * Reads each `*.json` file (in sorted filename order), parses it, checks that
 * it is an object with a non-empty string `name`, and returns the templates
 * sorted by their numeric `order` field (templates without a usable `order`
 * sort as 999).
 *
 * @returns {object[]} Parsed templates, ordered for URL-pattern matching.
 * @throws {Error} If the directory cannot be read, contains no JSON files, or
 *   a file is unreadable, is not valid JSON, or lacks a valid "name".
 */
function loadBuiltinTemplates() {
  let files;
  try {
    files = readdirSync(TEMPLATES_DIR);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${TEMPLATES_DIR}': ${err.message}`,
      { cause: err },
    );
  }

  const jsonFiles = files.filter((f) => f.endsWith(".json")).sort();
  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${TEMPLATES_DIR}'`);
  }

  const templates = jsonFiles.map((file) => {
    const filePath = join(TEMPLATES_DIR, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    // `null` is valid JSON, so guard before touching `.name`; otherwise the
    // property access throws a bare TypeError that does not name the file.
    const hasValidName =
      template !== null &&
      typeof template === "object" &&
      Boolean(template.name) &&
      typeof template.name === "string";
    if (!hasValidName) {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }
    return template;
  });

  // Sort by "order" field for deterministic URL-pattern matching. A value
  // that does not coerce to a finite number falls back to 999 so the
  // comparator never returns NaN.
  const orderOf = (template) => {
    const order = Number(template.order ?? 999);
    return Number.isFinite(order) ? order : 999;
  };
  templates.sort((a, b) => orderOf(a) - orderOf(b));

  return templates;
}
|
|
134
|
+
|
|
135
|
+
const BUILTIN_TEMPLATES = loadBuiltinTemplates();
|
|
136
|
+
|
|
137
|
+
// === TEMPLATE LOOKUP =====================================================
|
|
138
|
+
// Index of built-in templates keyed by their "name" field; when two
// templates share a name, the later one in BUILTIN_TEMPLATES wins.
const TEMPLATE_MAP = new Map(
  BUILTIN_TEMPLATES.map((template) => [template.name, template]),
);
|
|
142
|
+
|
|
143
|
+
/**
 * Look up a built-in template by its exact name.
 *
 * @param {string} name - Template name to look up in TEMPLATE_MAP.
 * @returns {object} The matching template.
 * @throws {Error} If no template has that name; the message lists the
 *   available names.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) {
    return found;
  }

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
|
|
153
|
+
|
|
154
|
+
/**
 * Pick the first built-in template whose `url_patterns` match a URL.
 *
 * Templates are tried in BUILTIN_TEMPLATES order and each pattern is compiled
 * as a regular expression; a pattern that fails to compile counts as a
 * non-match.
 *
 * @param {string} url - URL to test.
 * @returns {object|null} The first matching template, or null when none match.
 */
function detectTemplateByUrl(url) {
  const matchesUrl = (pattern) => {
    try {
      return new RegExp(pattern).test(url);
    } catch (_) {
      // Skip invalid regex
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (patterns && [...patterns].some((pattern) => matchesUrl(pattern))) {
      return candidate;
    }
  }
  return null;
}
|
|
169
|
+
|
|
170
|
+
// === URL TEMPLATE RESOLUTION =============================================
|
|
171
|
+
|
|
172
|
+
/**
 * Expand the `{placeholder}` tokens in a template's `url_template`.
 *
 * For each placeholder the value is taken from `providedParams` (when present
 * and not null/undefined), otherwise from the parameter's `default` in
 * `template.url_params`, otherwise it is an empty string. A parameter
 * declared with `encode: "url"` is passed through `encodeURIComponent`.
 *
 * All placeholders are substituted in a single pass over the original
 * template string. Adjacent placeholders, empty values, and values that
 * contain `$&` or `{word}` therefore cannot disturb the resolution of later
 * placeholders, and a missing required parameter is always detected.
 *
 * @param {object} template - Template with `name`, `url_template` and
 *   optional `url_params`.
 * @param {object} [providedParams] - Caller-supplied parameter values.
 * @returns {string|null} The resolved URL, or null when the template has no
 *   `url_template`.
 * @throws {Error} If a parameter marked `required` has neither a provided
 *   value nor a default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const provided = providedParams || {};

  // A function replacer inserts its return value literally, so `$`-patterns
  // in user input are not interpreted as replacement directives.
  const resolved = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    const def = urlParams[name] || {};
    const supplied = Object.prototype.hasOwnProperty.call(provided, name)
      ? provided[name]
      : undefined;

    let value;
    if (supplied !== null && supplied !== undefined) {
      value = String(supplied);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by parameters that resolved to "".
  return resolved.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
|
|
212
|
+
|
|
213
|
+
// === SEARCH PARAM MAPPING ================================================
|
|
214
|
+
|
|
215
|
+
/**
 * Translate a search-engine alias into the name of its search template.
 *
 * @param {string} engine - "duckduckgo", "google", or any other value.
 * @returns {string} The mapped template name; unknown values are returned
 *   unchanged.
 */
function resolveEngineToTemplateName(engine) {
  switch (engine) {
    case "duckduckgo":
      return "duckduckgo-search";
    case "google":
      return "google-search";
    default:
      return engine;
  }
}
|
|
220
|
+
|
|
221
|
+
/**
 * Build the URL-parameter object for a search request.
 *
 * Always includes `query`. For the DuckDuckGo template, `region` is passed as
 * `kl` and a boolean `safeSearch` becomes `kp` ("1" when true, "-2" when
 * false). For the Google template, `region` is split on "-" into `hl` (first
 * part) and `gl` (second part, or the first when there is only one).
 *
 * @param {string} engine - Engine alias or template name.
 * @param {string} query - Search query.
 * @param {string|null|undefined} region - Optional region code.
 * @param {boolean|null|undefined} safeSearch - Optional safe-search switch.
 * @returns {object} Parameter values keyed by URL parameter name.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);
  const hasRegion = region !== null && region !== undefined;
  const mapped = { query };

  switch (templateName) {
    case "duckduckgo-search":
      if (hasRegion) {
        mapped.kl = region;
      }
      if (typeof safeSearch === "boolean") {
        mapped.kp = safeSearch ? "1" : "-2";
      }
      break;

    case "google-search":
      if (hasRegion) {
        const [language, country = language] = region.split("-");
        mapped.hl = language;
        mapped.gl = country;
      }
      break;

    default:
      break;
  }

  return mapped;
}
|
|
244
|
+
|
|
245
|
+
// === FETCH ===============================================================
|
|
246
|
+
|
|
247
|
+
/**
 * Heuristically decide whether a loaded document is a CAPTCHA or block page.
 *
 * The lower-cased <title> is checked against a list of block-page phrases.
 * Failing that, the whitespace-collapsed, lower-cased body text is checked
 * against a second list, but only when it is shorter than 1200 characters.
 *
 * @param {Function} $ - Selector function for the loaded document; the
 *   objects it returns for "title" and "body" must expose `text()`.
 * @returns {boolean} True when the page looks like an access-denied page.
 */
function isAccessDenied($) {
  const pageTitle = ($("title").text() || "").toLowerCase();
  const pageBody = ($("body").text() || "")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();

  const blockedTitleMarkers = [
    "captcha",
    "are you a robot",
    "access denied",
    "blocked",
    "forbidden",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
    "one more step",
    "security check",
    "ddos protection",
    "cloudflare",
  ];
  for (const marker of blockedTitleMarkers) {
    if (pageTitle.includes(marker)) {
      return true;
    }
  }

  // Long bodies are treated as real content even if they mention a phrase.
  if (pageBody.length >= 1200) {
    return false;
  }

  const blockedBodyMarkers = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];
  return blockedBodyMarkers.some((marker) => pageBody.includes(marker));
}
|
|
76
281
|
|
|
77
|
-
async function
|
|
282
|
+
/**
 * Load a URL in a fresh browser context and return the rendered HTML.
 *
 * Steps: add the template's cookies (if any) to the context, optionally block
 * resource types (the template's `block_resources`, or image/media/font by
 * default), navigate with a 15 s "networkidle" wait, reject on HTTP
 * 401/403/429, and reject when the page looks like a CAPTCHA or block page.
 * The page and the context are always closed.
 *
 * A navigation timeout is tolerated so that whatever has rendered so far can
 * still be returned. Any other navigation error (for example a `net::ERR_*`
 * failure) is rethrown so the caller's retry logic can see it, instead of
 * silently returning an empty document.
 *
 * NOTE(review): the context/page/route calls look like the Playwright API as
 * exposed by `cloakbrowser`; confirm the timeout error's `name`/message
 * against that package.
 *
 * @param {string} url - URL to load.
 * @param {object|null} template - Optional template supplying `cookies` and
 *   `block_resources`.
 * @param {boolean} blockMedia - Whether to abort blocked resource types.
 * @returns {Promise<string>} The page's HTML content.
 * @throws {Error} On access-denied status codes, detected block pages, or
 *   non-timeout navigation failures.
 */
async function fetchHtml(url, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    // Pre-load cookies from template
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    try {
      // Route blocked resource types
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];

        if (blockedTypes.length > 0) {
          // Return the promise so a failure inside abort()/continue() is
          // handled by the router rather than becoming an unhandled rejection.
          await page.route("**/*", (route) =>
            blockedTypes.includes(route.request().resourceType())
              ? route.abort()
              : route.continue(),
          );
        }
      }

      const isNavigationTimeout = (err) =>
        Boolean(err) &&
        (err.name === "TimeoutError" || /timeout/i.test(String(err.message)));

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "networkidle",
          timeout: 15000,
        });
      } catch (navError) {
        // Allow partial rendering on timeout; anything else is a real failure.
        if (!isNavigationTimeout(navError)) {
          throw navError;
        }
      }

      // Check HTTP status for access failures
      if (response) {
        const status = response.status();
        if ([401, 403, 429].includes(status)) {
          throw new Error(
            `Access denied: HTTP ${status} when fetching ${url}`,
          );
        }
      }

      const pageContent = await page.content();

      // Check for CAPTCHA / access-denied pages
      const $ = cheerio.load(pageContent);
      if (isAccessDenied($)) {
        throw new Error(
          `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
        );
      }

      return pageContent;
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}
|
|
352
|
+
|
|
353
|
+
/**
 * Call fetchHtml, retrying exactly once when the first attempt fails with
 * what looks like a network error.
 *
 * An error counts as a network error when its message contains "net::",
 * "ERR_" or "Navigation failed". Any other first-attempt error, and any
 * second-attempt error, is rethrown unchanged.
 *
 * @param {string} url - URL to load.
 * @param {object|null} template - Passed through to fetchHtml.
 * @param {boolean} blockMedia - Passed through to fetchHtml.
 * @returns {Promise<string>} The HTML returned by fetchHtml.
 */
async function fetchHtmlWithRetry(url, template, blockMedia) {
  const networkErrorMarkers = ["net::", "ERR_", "Navigation failed"];

  try {
    return await fetchHtml(url, template, blockMedia);
  } catch (firstError) {
    const looksLikeNetworkError = networkErrorMarkers.some((marker) =>
      firstError.message.includes(marker),
    );
    if (!looksLikeNetworkError) {
      throw firstError;
    }
  }

  // Network error — retry once
  return fetchHtml(url, template, blockMedia);
}
|
|
374
|
+
|
|
375
|
+
// === HTML CLEANUP ========================================================
|
|
376
|
+
|
|
377
|
+
// Selectors removed from a page when the template does not list its own.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "svg",
  "nav",
  "footer",
  "noscript",
  "iframe",
  ".advertisement",
];

/**
 * Strip unwanted elements and attributes from a loaded document, in place.
 *
 * Removes every element matching the template's `remove` selectors (or
 * DEFAULT_REMOVE_SELECTORS when the template has none), then drops all
 * `style` attributes and any `src` attribute holding a `data:image` URI.
 *
 * @param {Function} $ - Selector function for the loaded document.
 * @param {object|null} template - Optional template with a `remove` list.
 * @returns {void}
 */
function applyRemove($, template) {
  const hasCustomList = Boolean(
    template && template.remove && template.remove.length > 0,
  );
  const selectorsToDrop = hasCustomList
    ? template.remove
    : DEFAULT_REMOVE_SELECTORS;

  for (const selector of selectorsToDrop) {
    try {
      $(selector).remove();
    } catch (_) {
      // Skip invalid selectors
    }
  }

  // Strip style attributes and data:image src
  $("[style]").removeAttr("style");
  $("*").each((_index, node) => {
    const source = $(node).attr("src");
    if (!source || !source.startsWith("data:image")) {
      return;
    }
    $(node).removeAttr("src");
  });
}
|
|
116
405
|
|
|
117
|
-
|
|
118
|
-
$("h3").each((i, el) => {
|
|
119
|
-
if (results.length >= maxResults) return;
|
|
406
|
+
// === EXTRACTION ENGINE ===================================================
|
|
120
407
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
408
|
+
/**
 * Locate elements for `selector` relative to `$parent`.
 *
 * Lookup order, returning the first non-empty result:
 *   1. descendants of `$parent`;
 *   2. the closest match for the selector via `$parent.closest`;
 *   3. the subtrees of up to four successive ancestors of `$parent`.
 * A blank selector yields `$parent` itself. When nothing matches, an empty
 * selection (the result of a lookup that cannot match) is returned.
 */
function findScoped($parent, selector) {
  if (!selector || selector.trim() === "") {
    return $parent;
  }

  // 1. Descendants
  const below = $parent.find(selector);
  if (below.length > 0) return below;

  // 2. Closest ancestor matching selector
  const enclosing = $parent.closest(selector);
  if (enclosing.length > 0) return enclosing;

  // 3. Ancestor subtrees (up to 4 levels up)
  let $ancestor = $parent.parent();
  let climbed = 0;
  while (climbed < 4 && $ancestor.length > 0) {
    const nearby = $ancestor.find(selector);
    if (nearby.length > 0) return nearby;
    $ancestor = $ancestor.parent();
    climbed += 1;
  }

  return $parent.find("__nonexistent__");
}
|
|
144
435
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
436
|
+
/**
 * Try comma-separated selectors in order; first match wins.
 *
 * Each alternative is resolved with findScoped relative to `$parent`. An
 * alternative whose lookup throws (for example because the selector is
 * syntactically invalid) is skipped, so the remaining alternatives are still
 * tried — the same treatment that top-level and removal selectors get.
 *
 * @param {object} $parent - Selection to search relative to.
 * @param {string} selectorStr - One or more selectors separated by commas.
 * @returns {object} The first non-empty selection, `$parent` itself for a
 *   blank selector string, or an empty selection when nothing matches.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $parent;
  }

  const selectors = selectorStr
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);

  for (const sel of selectors) {
    let matches;
    try {
      matches = findScoped($parent, sel);
    } catch (_) {
      // Skip invalid selectors and fall through to the next alternative.
      continue;
    }
    if (matches.length > 0) return matches;
  }

  return $parent.find("__nonexistent__");
}
|
|
456
|
+
|
|
457
|
+
/**
 * Pick the document-level elements for a section.
 *
 * The selector string is a comma-separated list of alternatives, tried left
 * to right against the whole document; the first alternative that matches
 * anything is returned. Alternatives that throw are ignored. A blank selector
 * string selects `body`; no match at all yields an empty selection.
 */
function resolveTopElements($, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $("body");
  }

  const alternatives = selectorStr
    .split(",")
    .map((part) => part.trim())
    .filter((part) => part !== "");

  for (const alternative of alternatives) {
    try {
      const found = $(alternative);
      if (found.length > 0) {
        return found;
      }
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $();
}
|
|
481
|
+
|
|
482
|
+
// === TRANSFORMS ==========================================================
|
|
483
|
+
|
|
484
|
+
/**
 * Apply one transform, or a list of transforms in order, to an extracted
 * string.
 *
 * Supported transforms:
 *   - "strip": trim surrounding whitespace.
 *   - "decode_google_url": unwrap a `/url?q=<encoded>&...` redirect link.
 *   - "decode_ddg_url": unwrap a `.../l/?uddg=<encoded>&...` redirect link.
 *   - "json_parse": pretty-print a JSON string with 2-space indentation.
 *   - "resolve_href": make a root-relative path ("/x", not "//x") absolute
 *     against `origin`.
 * Unknown transform names are ignored. A transform that fails leaves the
 * value unchanged, and once the value is empty the remaining transforms are
 * skipped.
 *
 * @param {string} value - Value to transform.
 * @param {string|string[]} transform - Transform name or list of names.
 * @param {string} [origin] - Base URL used by "resolve_href".
 * @returns {string} The transformed value.
 */
function applyTransform(value, transform, origin) {
  const transforms = Array.isArray(transform) ? transform : [transform];
  let result = value;

  for (const t of transforms) {
    if (!result) continue;

    switch (t) {
      case "strip":
        result = result.trim();
        break;

      case "decode_google_url":
        if (result.startsWith("/url?q=")) {
          try {
            const urlPart = result.slice("/url?q=".length).split("&")[0];
            result = decodeURIComponent(urlPart);
          } catch (_) {
            // Malformed percent-encoding: leave as-is
          }
        }
        break;

      case "decode_ddg_url":
        if (result.includes("/l/?uddg=")) {
          const queryString = result.slice(result.indexOf("?") + 1);
          // URLSearchParams already percent-decodes the value. Decoding it a
          // second time would corrupt targets that contain an encoded "%".
          const uddg = new URLSearchParams(queryString).get("uddg");
          if (uddg) result = uddg;
        }
        break;

      case "json_parse":
        try {
          result = JSON.stringify(JSON.parse(result), null, 2);
        } catch (_) {
          // Not valid JSON: leave as-is
        }
        break;

      case "resolve_href":
        if (origin && result.startsWith("/") && !result.startsWith("//")) {
          try {
            result = new URL(result, origin).href;
          } catch (_) {
            // Invalid origin: leave as-is
          }
        }
        break;

      default:
        break;
    }
  }

  return result;
}
|
|
541
|
+
|
|
542
|
+
// === EXTRACTION ==========================================================
|
|
543
|
+
|
|
544
|
+
/**
 * Read a single value from an element according to the section's `format`.
 *
 *   - "markdown": the element's inner HTML converted by the shared turndown
 *     instance, with runs of 3+ newlines collapsed to 2 and trimmed;
 *   - "attribute": the attribute named by `section.attribute`, or "";
 *   - "html": the element's inner HTML, or "";
 *   - "text" or anything else: the text with whitespace runs collapsed to a
 *     single space and trimmed.
 *
 * When the section declares a `transform` and the value is non-empty, the
 * value is passed through applyTransform.
 *
 * @param {object} $el - Selection to read from.
 * @param {object} section - Section definition (`format`, `attribute`,
 *   `transform`).
 * @param {string} origin - Base URL forwarded to applyTransform.
 * @returns {string} The extracted (and possibly transformed) value.
 */
function extractValue($el, section, origin) {
  let extracted;

  if (section.format === "markdown") {
    const innerHtml = $el.html() || "";
    const converted = turndown.turndown(innerHtml);
    extracted = converted.replace(/\n{3,}/g, "\n\n").trim();
  } else if (section.format === "attribute") {
    extracted = $el.attr(section.attribute) || "";
  } else if (section.format === "html") {
    extracted = $el.html() || "";
  } else {
    // "text" and every unrecognised format share the plain-text path.
    extracted = $el.text().replace(/\s+/g, " ").trim();
  }

  if (!section.transform || !extracted) {
    return extracted;
  }
  return applyTransform(extracted, section.transform, origin);
}
|
|
579
|
+
|
|
580
|
+
/**
 * Extract a child section's value relative to a parent element.
 *
 * The child's selector is resolved with findFirstMatch and only the first
 * matched element is read.
 *
 * @returns {{type: "value", text: string}|null} The value wrapper, or null
 *   when nothing matched and the child is not required.
 * @throws {Error} When nothing matched and the child is marked `required`.
 */
function extractChildSection($, $parentEl, section, origin) {
  const matched = findFirstMatch($parentEl, section.selector);
  const foundSomething = Boolean(matched) && matched.length !== 0;

  if (foundSomething) {
    const text = extractValue(matched.eq(0), section, origin);
    return { type: "value", text };
  }

  if (section.required) {
    throw new Error(
      `Required section '${section.name}' not found on page.`,
    );
  }
  return null;
}
|
|
188
600
|
|
|
189
|
-
|
|
190
|
-
|
|
601
|
+
/**
 * Extract one top-level section from the whole document.
 *
 * Result shapes, all carrying the originating `section`:
 *   - `{ type: "value", text }`            single element, no children;
 *   - `{ type: "children", items }`        single element, named child values;
 *   - `{ type: "list", items }`            `multiple`, no children (trimmed,
 *                                          non-blank strings);
 *   - `{ type: "children-multiple", items }` `multiple` with children (one
 *                                          object per element that yielded at
 *                                          least one child value).
 *
 * For `multiple` sections the number of elements visited is capped by
 * `max_items`. For the first `multiple` section with children in a web
 * search it is further capped by `context.maxResultsOverride`; this sets
 * `context._maxResultsConsumed` so later sections are not capped again.
 *
 * @returns {object|null} The section result, or null when nothing matched
 *   and the section is not required.
 * @throws {Error} When nothing matched and the section is marked `required`.
 */
function extractSection($, section, context) {
  const matched = resolveTopElements($, section.selector);

  if (!matched || matched.length === 0) {
    if (section.required) {
      throw new Error(
        `Required section '${section.name}' not found on page.`,
      );
    }
    return null;
  }

  const hasChildren = Boolean(section.children && section.children.length > 0);

  // Collect the named child values found under one parent element.
  const readChildren = ($parent) => {
    const values = {};
    for (const child of section.children) {
      const outcome = extractChildSection($, $parent, child, context.origin);
      const usable =
        outcome &&
        outcome.type === "value" &&
        outcome.text !== null &&
        outcome.text !== undefined;
      if (usable) {
        values[child.name] = outcome.text;
      }
    }
    return values;
  };

  if (!section.multiple) {
    // Single parent; when it has children the parent's own format is ignored.
    const $first = matched.eq(0);
    if (hasChildren) {
      return { section, type: "children", items: readChildren($first) };
    }
    return {
      section,
      type: "value",
      text: extractValue($first, section, context.origin),
    };
  }

  // Determine limit
  let limit = matched.length;
  if (section.max_items) {
    limit = Math.min(limit, section.max_items);
  }
  // Override max_items with max_results for first multiple+children section
  const applySearchCap =
    context.isWebsearch &&
    context.maxResultsOverride &&
    !context._maxResultsConsumed &&
    hasChildren;
  if (applySearchCap) {
    limit = Math.min(limit, context.maxResultsOverride);
    context._maxResultsConsumed = true;
  }

  const items = [];
  for (let index = 0; index < limit; index += 1) {
    const $item = matched.eq(index);

    if (hasChildren) {
      const values = readChildren($item);
      if (Object.keys(values).length > 0) {
        items.push(values);
      }
      continue;
    }

    const text = extractValue($item, section, context.origin);
    if (text && text.trim()) {
      items.push(text.trim());
    }
  }

  return { section, type: hasChildren ? "children-multiple" : "list", items };
}
|
|
213
687
|
|
|
214
|
-
|
|
215
|
-
const
|
|
216
|
-
const context = await browser.newContext();
|
|
217
|
-
const page = await context.newPage();
|
|
688
|
+
/**
 * Run every section of a template against the document.
 *
 * Sections that produce null are omitted. An error whose message contains
 * "Required section" aborts the extraction; any other per-section error is
 * dropped and that section is skipped.
 *
 * @param {Function} $ - Selector function for the loaded document.
 * @param {object} template - Template with a `sections` list.
 * @param {object} context - Extraction context passed to extractSection.
 * @returns {object[]} Section results in template order.
 */
function extractTemplate($, template, context) {
  const collected = [];

  for (const section of template.sections) {
    let outcome;
    try {
      outcome = extractSection($, section, context);
    } catch (err) {
      const isRequiredFailure =
        err.message && err.message.includes("Required section");
      if (isRequiredFailure) {
        throw err;
      }
      // Non-required failures are silently skipped
      continue;
    }

    if (outcome !== null) {
      collected.push(outcome);
    }
  }

  return collected;
}
|
|
710
|
+
|
|
711
|
+
// === COMPOSITION: WEBFETCH ===============================================
|
|
712
|
+
|
|
713
|
+
/**
 * Report whether a multi-item result looks like a comment thread.
 *
 * Only the first item is inspected. Its keys, compared case-insensitively,
 * must include an author-like key ("author" or "user") and a text-like key
 * ("comment" or "body").
 *
 * @param {object} result - Section result with an `items` array.
 * @returns {boolean} True when the first item has both kinds of key.
 */
function isCommentStyle(result) {
  if (!result.items || result.items.length === 0) return false;

  const fieldNames = new Set(
    Object.keys(result.items[0]).map((key) => key.toLowerCase()),
  );
  const namesAuthor = fieldNames.has("author") || fieldNames.has("user");
  const carriesText = fieldNames.has("comment") || fieldNames.has("body");

  return namesAuthor && carriesText;
}
|
|
722
|
+
|
|
723
|
+
/**
 * Render extracted section results as one text document and return the
 * requested character window of it, followed by a metadata footer.
 *
 * Each non-empty result becomes a "## <name>" block; blocks are joined with a
 * "---" rule. Single-parent child values each get their own heading.
 * Multi-item results with children are rendered either as a comment thread
 * (when isCommentStyle says so) or as indented "key: value" groups. When
 * nothing produced output, a fixed placeholder string is returned instead.
 *
 * @param {object[]} extracted - Section results from extractTemplate.
 * @param {object|null} template - Template whose name goes in the footer
 *   ("auto" when null).
 * @param {number} startIndex - First character of the window.
 * @param {number} maxLength - Maximum window length.
 * @returns {string} The windowed text plus the "[webfetch: ...]" footer.
 */
function composeSections(extracted, template, startIndex, maxLength) {
  const hasText = (candidate) => Boolean(candidate && String(candidate).trim());
  const underHeading = (title, body) => `## ${title}\n\n${body}`;
  const firstTruthy = (record, keys) => {
    for (const key of keys) {
      if (record[key]) return record[key];
    }
    return "";
  };

  // One entry per item that has an author or, failing that, a comment.
  const renderComments = (items) =>
    items.flatMap((item) => {
      const author = firstTruthy(item, ["Author", "author", "User", "user"]);
      const comment = firstTruthy(item, ["Comment", "Body", "comment", "body"]);
      if (author) return [`**${author}:**\n\n${comment}`];
      return comment ? [comment] : [];
    });

  // One "key: value" group per item, skipping blank values and empty groups.
  const renderRecords = (items) =>
    items
      .map((item) =>
        Object.entries(item)
          .filter(([, value]) => hasText(value))
          .map(([key, value]) => ` ${key}: ${String(value).trim()}`)
          .join("\n"),
      )
      .filter((group) => group.length > 0);

  const parts = [];
  for (const result of extracted) {
    const { type, items } = result;

    if (type === "value") {
      if (hasText(result.text)) {
        parts.push(underHeading(result.section.name, String(result.text).trim()));
      }
    } else if (type === "list") {
      if (items && items.length > 0) {
        const bullets = items.map((item) => `- ${item}`).join("\n");
        parts.push(underHeading(result.section.name, bullets));
      }
    } else if (type === "children") {
      if (items) {
        for (const [childName, value] of Object.entries(items)) {
          if (hasText(value)) {
            parts.push(underHeading(childName, String(value).trim()));
          }
        }
      }
    } else if (type === "children-multiple" && items && items.length > 0) {
      const asComments = isCommentStyle(result);
      const groups = asComments ? renderComments(items) : renderRecords(items);
      if (groups.length > 0) {
        const separator = asComments ? "\n\n---\n\n" : "\n\n";
        parts.push(underHeading(result.section.name, groups.join(separator)));
      }
    }
  }

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  const document = parts.join("\n\n---\n\n");
  const windowText = document.substring(startIndex, startIndex + maxLength);
  const windowEnd = startIndex + windowText.length;
  const nextStart = startIndex + maxLength;
  const shownName = template ? template.name : "auto";

  const footer = [
    `\n\n---\n[webfetch: template="${shownName}", showing characters ${startIndex} to ${windowEnd} of ${document.length} total.`,
    nextStart < document.length ? ` Use start_index=${nextStart} to read more.` : "",
    `]`,
  ].join("");

  return windowText + footer;
}
|
|
803
|
+
|
|
804
|
+
// === COMPOSITION: WEBSEARCH ==============================================
|
|
805
|
+
|
|
806
|
+
/**
 * Render web-search results as a numbered list.
 *
 * Uses the first "children-multiple" result as the result list. Each item
 * contributes "[n] title" plus optional URL and snippet lines; n is the
 * item's 1-based position in the list, so skipped items leave gaps. Items
 * without a title are skipped. A URL is shown only when it starts with
 * "http" and is not a Google search or support link. When no result list
 * exists, the output falls back to composeSections.
 *
 * @param {object[]} extracted - Section results from extractTemplate.
 * @returns {string} The rendered results, or a placeholder when every item
 *   was skipped.
 */
function composeSearchResults(extracted) {
  // Find the search results section (first children-multiple)
  const hits = extracted.find((entry) => entry.type === "children-multiple");

  if (!hits || !hits.items || hits.items.length === 0) {
    // Fall back to section-based output
    return composeSections(extracted, null, 0, Infinity);
  }

  // Keep only http(s) links that are not Google-internal.
  const isExternalWebUrl = (candidate) =>
    candidate.startsWith("http") &&
    !candidate.includes("google.com/search") &&
    !candidate.includes("support.google.com");

  const blocks = [];
  hits.items.forEach((item, position) => {
    const title =
      item["Title"] || item["title"] || Object.values(item)[0] || "";
    if (!title) return;

    const rawUrl = item["URL"] || item["url"] || item["Url"] || "";
    const shownUrl = rawUrl && isExternalWebUrl(rawUrl) ? rawUrl : "";
    const snippet = item["Snippet"] || item["snippet"] || "";

    const entry = [`[${position + 1}] ${title}`];
    if (shownUrl) entry.push(` URL: ${shownUrl}`);
    if (snippet) entry.push(` Snippet: ${snippet}`);
    blocks.push(entry.join("\n"));
  });

  if (blocks.length === 0) {
    return "(No content extracted from this page.)";
  }

  return `## ${hits.section.name}\n\n${blocks.join("\n\n")}`;
}
|
|
857
|
+
|
|
858
|
+
// === GENERIC FALLBACK ====================================================
|
|
859
|
+
|
|
860
|
+
/**
 * Convert the page body to Markdown and return one page of it, followed by a
 * pagination footer. Used when no template is available for the page.
 *
 * The requested window is clamped to the bounds of the Markdown text, so a
 * negative or past-the-end `startIndex` (or a negative `maxLength`) yields an
 * empty or shortened page with a footer that reports the real range, rather
 * than text from an unrelated offset.
 *
 * @param {Function} $ - Cheerio root of the loaded page; passed to `applyRemove` before the body is read.
 * @param {number} startIndex - Character offset into the Markdown where the page starts.
 * @param {number} maxLength - Maximum number of characters to return.
 * @returns {string} The Markdown slice plus a "[webfetch: ...]" footer, or a placeholder when the body produces no Markdown.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const bodyHtml = $("body").html() || "";
  // `turndown` is the module-level HTML-to-Markdown converter
  // (presumably a TurndownService instance; confirm at its definition).
  const markdown = turndown
    .turndown(bodyHtml)
    .replace(/\n{3,}/g, "\n\n") // collapse runs of blank lines
    .trim();

  if (markdown.length === 0) {
    return "(No content extracted from this page.)";
  }

  const totalLength = markdown.length;

  // Clamp the window so the slice and the footer always describe the same,
  // valid range of the document.
  const start = Math.min(Math.max(startIndex, 0), totalLength);
  const end = Math.min(start + Math.max(maxLength, 0), totalLength);
  const paginated = markdown.slice(start, end);

  let metadata = `\n\n---\n[webfetch: template="auto (fallback)", showing characters ${start} to ${end} of ${totalLength} total.`;
  if (end < totalLength) {
    metadata += ` Use start_index=${end} to read more.`;
  }
  metadata += `]`;

  return paginated + metadata;
}
|
|
884
|
+
|
|
885
|
+
// === SEARCH TEMPLATE RESOLUTION ==========================================
|
|
886
|
+
|
|
887
|
+
/**
 * Resolve the search template and the final request URL for a websearch call.
 *
 * `resolveEngineToTemplateName` turns `engine` into either a template name or
 * an inline JSON template (a string starting with "{"). The template must
 * define `url_template`; the URL is then built by `resolveUrlTemplate` from
 * the parameters produced by `mapSearchParams`.
 *
 * @param {string} engine - Engine identifier supplied to the websearch tool.
 * @param {string} query - Search query string.
 * @param {string|null} region - Region/language code, or null.
 * @param {boolean|null} safeSearch - Safe-search flag, or null.
 * @returns {{template: object, url: string}} The resolved template and the URL to fetch.
 * @throws {Error} If the inline JSON cannot be parsed, or the template is
 *   missing or has no `url_template`.
 */
function resolveSearchTemplate(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);
  const isInline = templateName.startsWith("{");

  let template;
  if (isInline) {
    try {
      template = JSON.parse(templateName);
    } catch (e) {
      throw new Error(`Invalid inline JSON template: ${e.message}`, {
        cause: e,
      });
    }
  } else {
    template = getTemplateByName(templateName);
  }

  // Also covers a missing template, so the caller gets a descriptive error
  // instead of a TypeError from reading a property of null/undefined.
  if (!template || !template.url_template) {
    const label =
      (template && template.name) || (isInline ? "(inline)" : templateName);
    throw new Error(
      `Template '${label}' is not a search template (no url_template).`,
    );
  }

  const params = mapSearchParams(engine, query, region, safeSearch);
  let url = resolveUrlTemplate(template, params);

  // Google safe_search: append safe=active to URL
  const isGoogle = engine === "google" || templateName === "google-search";
  if (isGoogle && safeSearch === true) {
    // Start the query string if the resolved URL does not have one yet.
    const separator = url.includes("?") ? "&" : "?";
    url += `${separator}safe=active`;
  }

  return { template, url };
}
|
|
287
920
|
|
|
288
|
-
|
|
921
|
+
// === MCP SERVER & TOOLS ==================================================
|
|
922
|
+
|
|
923
|
+
// MCP server instance for this package; the "websearch" and "webfetch" tools are registered on it via server.registerTool.
const server = new McpServer({ name: "searchfetch", version: "3.0.0" });
|
|
924
|
+
|
|
925
|
+
// --- websearch tool ---
|
|
289
926
|
|
|
290
927
|
server.registerTool(
|
|
291
928
|
"websearch",
|
|
@@ -296,38 +933,67 @@ server.registerTool(
|
|
|
296
933
|
inputSchema: z.object({
|
|
297
934
|
query: z.string().describe("The search query string."),
|
|
298
935
|
engine: z
|
|
299
|
-
.
|
|
936
|
+
.string()
|
|
300
937
|
.default("duckduckgo")
|
|
301
938
|
.describe(
|
|
302
939
|
"Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
|
|
303
940
|
),
|
|
304
|
-
max_results: z
|
|
305
|
-
.number()
|
|
306
|
-
.default(10)
|
|
307
|
-
.describe("Maximum number of results to return. Default is 10."),
|
|
308
941
|
region: z
|
|
309
942
|
.string()
|
|
310
|
-
.
|
|
943
|
+
.nullable()
|
|
944
|
+
.default(null)
|
|
311
945
|
.describe(
|
|
312
|
-
"Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is
|
|
946
|
+
"Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
|
|
313
947
|
),
|
|
314
948
|
safe_search: z
|
|
315
|
-
.
|
|
316
|
-
.
|
|
949
|
+
.boolean()
|
|
950
|
+
.nullable()
|
|
951
|
+
.default(null)
|
|
952
|
+
.describe(
|
|
953
|
+
"Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
|
|
954
|
+
),
|
|
955
|
+
max_results: z
|
|
956
|
+
.number()
|
|
957
|
+
.default(10)
|
|
958
|
+
.describe("Maximum number of search results to return. Default is 10."),
|
|
959
|
+
block_media: z
|
|
960
|
+
.boolean()
|
|
961
|
+
.default(true)
|
|
317
962
|
.describe(
|
|
318
|
-
"
|
|
963
|
+
"Block images, videos, and fonts entirely at the network layer. Default is true.",
|
|
319
964
|
),
|
|
320
965
|
}),
|
|
321
966
|
},
|
|
322
|
-
async ({ query,
|
|
967
|
+
async ({ query, engine, region, safe_search, max_results, block_media }) => {
|
|
323
968
|
try {
|
|
324
|
-
|
|
969
|
+
// 1. Resolve search template (+ url_params mapping + url building)
|
|
970
|
+
const { template, url } = resolveSearchTemplate(
|
|
971
|
+
engine,
|
|
325
972
|
query,
|
|
326
|
-
max_results,
|
|
327
973
|
region,
|
|
328
974
|
safe_search,
|
|
329
|
-
engine,
|
|
330
975
|
);
|
|
976
|
+
|
|
977
|
+
// 2. Fetch
|
|
978
|
+
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
979
|
+
|
|
980
|
+
// 3. Extract
|
|
981
|
+
const $ = cheerio.load(html);
|
|
982
|
+
applyRemove($, template);
|
|
983
|
+
|
|
984
|
+
const pageOrigin = new URL(url).origin;
|
|
985
|
+
const context = {
|
|
986
|
+
origin: pageOrigin,
|
|
987
|
+
isWebsearch: true,
|
|
988
|
+
maxResultsOverride: max_results,
|
|
989
|
+
_maxResultsConsumed: false,
|
|
990
|
+
};
|
|
991
|
+
|
|
992
|
+
const extracted = extractTemplate($, template, context);
|
|
993
|
+
|
|
994
|
+
// 4. Compose
|
|
995
|
+
const result = composeSearchResults(extracted);
|
|
996
|
+
|
|
331
997
|
return { content: [{ type: "text", text: result }] };
|
|
332
998
|
} catch (err) {
|
|
333
999
|
return {
|
|
@@ -338,6 +1004,8 @@ server.registerTool(
|
|
|
338
1004
|
},
|
|
339
1005
|
);
|
|
340
1006
|
|
|
1007
|
+
// --- webfetch tool ---
|
|
1008
|
+
|
|
341
1009
|
server.registerTool(
|
|
342
1010
|
"webfetch",
|
|
343
1011
|
{
|
|
@@ -345,22 +1013,20 @@ server.registerTool(
|
|
|
345
1013
|
description:
|
|
346
1014
|
"Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
|
|
347
1015
|
inputSchema: z.object({
|
|
348
|
-
url: z
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
)
|
|
353
|
-
|
|
354
|
-
.enum(["markdown", "clean_html", "raw_html"])
|
|
355
|
-
.default("markdown")
|
|
1016
|
+
url: z.string().describe(
|
|
1017
|
+
"The full URL of the webpage to fetch (must start with http/https).",
|
|
1018
|
+
),
|
|
1019
|
+
template: z
|
|
1020
|
+
.string()
|
|
1021
|
+
.default("auto")
|
|
356
1022
|
.describe(
|
|
357
|
-
"
|
|
1023
|
+
"Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
|
|
358
1024
|
),
|
|
359
1025
|
start_index: z
|
|
360
1026
|
.number()
|
|
361
1027
|
.default(0)
|
|
362
1028
|
.describe(
|
|
363
|
-
"Character offset
|
|
1029
|
+
"Character offset for pagination. Default: 0.",
|
|
364
1030
|
),
|
|
365
1031
|
max_length: z
|
|
366
1032
|
.number()
|
|
@@ -372,20 +1038,50 @@ server.registerTool(
|
|
|
372
1038
|
.boolean()
|
|
373
1039
|
.default(true)
|
|
374
1040
|
.describe(
|
|
375
|
-
"Block images, videos, and fonts entirely at the network layer
|
|
1041
|
+
"Block images, videos, and fonts entirely at the network layer. Default is true.",
|
|
376
1042
|
),
|
|
377
1043
|
}),
|
|
378
1044
|
},
|
|
379
|
-
async ({ url,
|
|
1045
|
+
async ({ url, template: templateParam, start_index, max_length, block_media }) => {
|
|
380
1046
|
try {
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
1047
|
+
// 1. Resolve template
|
|
1048
|
+
let template;
|
|
1049
|
+
|
|
1050
|
+
if (templateParam.startsWith("{")) {
|
|
1051
|
+
try {
|
|
1052
|
+
template = JSON.parse(templateParam);
|
|
1053
|
+
} catch (e) {
|
|
1054
|
+
throw new Error(`Invalid inline JSON template: ${e.message}`);
|
|
1055
|
+
}
|
|
1056
|
+
} else if (templateParam === "auto") {
|
|
1057
|
+
template = detectTemplateByUrl(url);
|
|
1058
|
+
} else {
|
|
1059
|
+
template = getTemplateByName(templateParam);
|
|
1060
|
+
}
|
|
1061
|
+
|
|
1062
|
+
// 2. Fetch
|
|
1063
|
+
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
1064
|
+
|
|
1065
|
+
// 3. Extract and compose
|
|
1066
|
+
const $ = cheerio.load(html);
|
|
1067
|
+
|
|
1068
|
+
if (template) {
|
|
1069
|
+
applyRemove($, template);
|
|
1070
|
+
|
|
1071
|
+
const pageOrigin = new URL(url).origin;
|
|
1072
|
+
const context = {
|
|
1073
|
+
origin: pageOrigin,
|
|
1074
|
+
isWebsearch: false,
|
|
1075
|
+
};
|
|
1076
|
+
|
|
1077
|
+
const extracted = extractTemplate($, template, context);
|
|
1078
|
+
const result = composeSections(extracted, template, start_index, max_length);
|
|
1079
|
+
return { content: [{ type: "text", text: result }] };
|
|
1080
|
+
} else {
|
|
1081
|
+
// Generic fallback
|
|
1082
|
+
const result = genericFallback($, start_index, max_length);
|
|
1083
|
+
return { content: [{ type: "text", text: result }] };
|
|
1084
|
+
}
|
|
389
1085
|
} catch (err) {
|
|
390
1086
|
return {
|
|
391
1087
|
content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
|
|
@@ -395,6 +1091,8 @@ server.registerTool(
|
|
|
395
1091
|
},
|
|
396
1092
|
);
|
|
397
1093
|
|
|
1094
|
+
// === MAIN =================================================================
|
|
1095
|
+
|
|
398
1096
|
async function main() {
|
|
399
1097
|
await ensureBinary();
|
|
400
1098
|
process.stdout.write = originalStdoutWrite;
|