searchfetch 2.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -13
- package/index.js +931 -197
- package/package.json +5 -4
- package/templates/crates-package.json +37 -0
- package/templates/docs-page.json +31 -0
- package/templates/duckduckgo-search.json +67 -0
- package/templates/github-issue.json +69 -0
- package/templates/github-repo.json +36 -0
- package/templates/google-search.json +75 -0
- package/templates/npm-package.json +49 -0
- package/templates/pypi-package.json +43 -0
package/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
+
// === STDOUT/STDERR REDIRECTION ===========================================
|
|
3
4
|
const originalStdoutWrite = process.stdout.write.bind(process.stdout);
|
|
4
5
|
process.stdout.write = (chunk, encoding, callback) => {
|
|
5
6
|
return process.stderr.write(chunk, encoding, callback);
|
|
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
|
|
|
7
8
|
console.log = (...args) => console.error(...args);
|
|
8
9
|
console.info = (...args) => console.error(...args);
|
|
9
10
|
|
|
11
|
+
// === IMPORTS =============================================================
|
|
12
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
13
|
+
import { dirname, join } from "node:path";
|
|
14
|
+
import { fileURLToPath } from "node:url";
|
|
10
15
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
11
16
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
12
17
|
import { z } from "zod";
|
|
@@ -14,6 +19,7 @@ import { launch, ensureBinary } from "cloakbrowser";
|
|
|
14
19
|
import * as cheerio from "cheerio";
|
|
15
20
|
import TurndownService from "turndown";
|
|
16
21
|
|
|
22
|
+
// === BROWSER MANAGER =====================================================
|
|
17
23
|
class BrowserManager {
|
|
18
24
|
constructor() {
|
|
19
25
|
this.browser = null;
|
|
@@ -67,225 +73,892 @@ const cleanup = async () => {
|
|
|
67
73
|
process.on("SIGINT", cleanup);
|
|
68
74
|
process.on("SIGTERM", cleanup);
|
|
69
75
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
76
|
+
// === TURNDOWN ============================================================
|
|
77
|
+
// Shared HTML -> Markdown converter used by every markdown-producing code path.
// Options: "#"-style (ATX) headings, fenced code blocks, and "*" for emphasis.
const turndown = new TurndownService({
  emDelimiter: "*",
  codeBlockStyle: "fenced",
  headingStyle: "atx",
});
|
|
82
|
+
|
|
83
|
+
// === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
|
|
84
|
+
|
|
85
|
+
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Built-in templates ship next to this module in ./templates.
const TEMPLATES_DIR = join(__dirname, "templates");

/**
 * Load and validate every `*.json` template found in a directory.
 *
 * Files are read in lexicographic filename order and then sorted by their
 * numeric `order` field (missing `order` counts as 999), so URL-pattern
 * matching is deterministic.
 *
 * @param {string} [dir=TEMPLATES_DIR] - Directory to scan for template files.
 * @returns {object[]} Parsed template objects, sorted by `order`.
 * @throws {Error} If the directory or a file cannot be read, if no `.json`
 *   file exists, if a file is not valid JSON, or if a template is not an
 *   object with a non-empty string `name`.
 */
function loadBuiltinTemplates(dir = TEMPLATES_DIR) {
  let files;
  try {
    files = readdirSync(dir);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${dir}': ${err.message}`,
      { cause: err },
    );
  }

  const jsonFiles = files.filter((f) => f.endsWith(".json")).sort();
  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${dir}'`);
  }

  const templates = jsonFiles.map((file) => {
    const filePath = join(dir, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    // JSON.parse may legally yield null, a primitive or an array; reject those
    // with the same message as a missing name instead of a raw TypeError.
    const isRecord =
      template !== null && typeof template === "object" && !Array.isArray(template);
    if (!isRecord || !template.name || typeof template.name !== "string") {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }
    return template;
  });

  // Sort by "order" field for deterministic URL-pattern matching
  templates.sort((a, b) => (a.order ?? 999) - (b.order ?? 999));

  return templates;
}
|
|
134
|
+
|
|
135
|
+
// Templates are loaded once at module start-up; a load failure aborts start-up.
const BUILTIN_TEMPLATES = loadBuiltinTemplates();

// === TEMPLATE LOOKUP =====================================================
// Name -> template index. When two templates share a name, the later one in
// BUILTIN_TEMPLATES order replaces the earlier one.
const TEMPLATE_MAP = new Map(
  BUILTIN_TEMPLATES.map((entry) => [entry.name, entry]),
);
|
|
142
|
+
|
|
143
|
+
/**
 * Look up a built-in template by its exact name.
 *
 * @param {string} name - Template name as registered in TEMPLATE_MAP.
 * @returns {object} The matching template.
 * @throws {Error} If no template has that name; the message lists all known names.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) return found;

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
|
|
153
|
+
|
|
154
|
+
/**
 * Find the first built-in template whose `url_patterns` contains a regular
 * expression matching the URL. Templates are tried in BUILTIN_TEMPLATES order;
 * templates without patterns and patterns that fail to compile are skipped.
 *
 * @param {string} url - URL to classify.
 * @returns {object|null} The first matching template, or null if none match.
 */
function detectTemplateByUrl(url) {
  const patternMatches = (source) => {
    try {
      return new RegExp(source).test(url);
    } catch (_) {
      // Skip invalid regex
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (!patterns) continue;
    for (const source of patterns) {
      if (patternMatches(source)) return candidate;
    }
  }
  return null;
}
|
|
169
|
+
|
|
170
|
+
// === URL TEMPLATE RESOLUTION =============================================
|
|
171
|
+
|
|
172
|
+
/**
 * Expand the `{name}` placeholders of `template.url_template`.
 *
 * For each placeholder the value is taken, in order of preference, from
 * `providedParams[name]` (if not null/undefined), from
 * `template.url_params[name].default`, or the empty string. A placeholder
 * whose definition has `required: true` and no value or default raises.
 * Values whose definition has `encode: "url"` are passed through
 * `encodeURIComponent`. Afterwards runs of `&` are collapsed and a leading
 * `?&` becomes `?`.
 *
 * All placeholders are substituted in a single pass with a replacer
 * function. This avoids skipping placeholders when a substitution changes
 * the string length, and keeps `$&` / `$1` in a value from being treated as
 * a replacement pattern.
 *
 * @param {object} template - Template with optional `url_template` and `url_params`.
 * @param {object} [providedParams] - Caller-supplied placeholder values.
 * @returns {string|null} The resolved URL, or null if the template has no `url_template`.
 * @throws {Error} If a required placeholder has neither a value nor a default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const provided = providedParams ?? {};
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const resolved = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    const def = (hasOwn(urlParams, name) && urlParams[name]) || {};

    let value;
    if (hasOwn(provided, name) && provided[name] !== null && provided[name] !== undefined) {
      value = String(provided[name]);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by empty optional parameters.
  return resolved.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
|
|
212
|
+
|
|
213
|
+
// === SEARCH PARAM MAPPING ================================================
|
|
214
|
+
|
|
215
|
+
/**
 * Translate a search-engine alias into the name of its search template.
 * Unknown values are returned unchanged.
 *
 * @param {string} engine - Engine alias ("duckduckgo", "google") or any other value.
 * @returns {string} The template name for a known alias, otherwise `engine` itself.
 */
function resolveEngineToTemplateName(engine) {
  switch (engine) {
    case "duckduckgo":
      return "duckduckgo-search";
    case "google":
      return "google-search";
    default:
      return engine;
  }
}
|
|
220
|
+
|
|
221
|
+
/**
 * Build the URL-template parameters for a search request.
 *
 * - duckduckgo-search: `kl` = region as given; `kp` = "1" when safeSearch is
 *   true, "-2" when it is false, omitted otherwise.
 * - google-search: region is split on "-"; `hl` = first part, `gl` = second
 *   part (or the first part when there is no second). safeSearch is not mapped.
 * - any other engine: only `query` is returned.
 *
 * NOTE(review): the same `region` string feeds DuckDuckGo's `kl` and Google's
 * `hl`/`gl` split — confirm the expected region format with the caller.
 *
 * @param {string} engine - Engine alias or template name.
 * @param {string} query - Search query.
 * @param {string|null|undefined} region - Optional region code.
 * @param {boolean|null|undefined} safeSearch - Optional safe-search switch.
 * @returns {object} Parameter map, always containing `query`.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const params = { query };
  const hasRegion = region !== null && region !== undefined;

  switch (resolveEngineToTemplateName(engine)) {
    case "duckduckgo-search":
      if (hasRegion) params.kl = region;
      if (typeof safeSearch === "boolean") {
        params.kp = safeSearch ? "1" : "-2";
      }
      break;

    case "google-search":
      if (hasRegion) {
        const [language, country = language] = region.split("-");
        params.hl = language;
        params.gl = country;
      }
      break;

    default:
      break;
  }

  return params;
}
|
|
76
244
|
|
|
77
|
-
|
|
245
|
+
// === FETCH ===============================================================
|
|
246
|
+
|
|
247
|
+
// Total number of fetch attempts (first try included).
const FETCH_MAX_ATTEMPTS = 2;
// Wait before retrying an HTTP 429 when no usable Retry-After value is available.
const HTTP_429_RETRY_DELAY_MS = 2000;

/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
253
|
+
|
|
254
|
+
/**
 * Convert a Retry-After header value into a delay in milliseconds.
 *
 * Accepts either a non-negative number of seconds or a date string. The
 * result is capped at 30 seconds; a date in the past yields 0. Missing or
 * unparseable values fall back to HTTP_429_RETRY_DELAY_MS.
 *
 * @param {string|undefined|null} value - Raw header value.
 * @returns {number} Delay in milliseconds.
 */
function parseRetryAfterMs(value) {
  if (!value) return HTTP_429_RETRY_DELAY_MS;

  const asSeconds = Number(value);
  if (Number.isFinite(asSeconds) && asSeconds >= 0) {
    return Math.min(asSeconds * 1000, 30000);
  }

  const asTimestamp = Date.parse(value);
  if (!Number.isFinite(asTimestamp)) return HTTP_429_RETRY_DELAY_MS;

  const untilThen = Math.max(asTimestamp - Date.now(), 0);
  return Math.min(untilThen, 30000);
}
|
|
266
|
+
|
|
267
|
+
/**
 * Create the error raised for a rejected HTTP response.
 *
 * @param {number} status - HTTP status code; stored on the error as `httpStatus`.
 * @param {string} url - URL that was being fetched (used in the message).
 * @param {number|null} [retryAfterMs=null] - Suggested retry delay; stored as `retryAfterMs`.
 * @returns {Error} Error carrying `httpStatus` and `retryAfterMs` properties.
 */
function makeHttpStatusError(status, url, retryAfterMs = null) {
  const failure = new Error(`Access denied: HTTP ${status} when fetching ${url}`);
  return Object.assign(failure, { httpStatus: status, retryAfterMs });
}
|
|
273
|
+
|
|
274
|
+
/**
 * Heuristically decide whether a loaded page is a CAPTCHA / block page
 * rather than real content.
 *
 * Three signals are used, all case-insensitive substring checks:
 *  1. Specific block phrases in the <title> (e.g. "access denied",
 *     "verify you are human") — always treated as a block.
 *  2. Generic words in the <title> ("blocked", "forbidden", "cloudflare") —
 *     treated as a block only when the body is short. These words also occur
 *     in ordinary titles (e.g. a vendor blog or a documentation page), so on
 *     their own they would reject legitimate content.
 *  3. Block phrases in the body text — only when the body is short.
 *
 * "Short" means fewer than 1200 characters of whitespace-collapsed body text.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @returns {boolean} True when the page looks like a block/CAPTCHA page.
 */
function isAccessDenied($) {
  const title = ($("title").text() || "").toLowerCase();
  const bodyText = ($("body").text() || "").replace(/\s+/g, " ").trim().toLowerCase();
  const isShortBody = bodyText.length < 1200;

  const titleDenyPatterns = [
    "captcha",
    "are you a robot",
    "access denied",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
    "one more step",
    "security check",
    "ddos protection",
  ];
  if (titleDenyPatterns.some((pattern) => title.includes(pattern))) return true;

  // Generic words: only meaningful on an interstitial-sized page.
  const ambiguousTitlePatterns = ["blocked", "forbidden", "cloudflare"];
  if (isShortBody && ambiguousTitlePatterns.some((pattern) => title.includes(pattern))) {
    return true;
  }

  const bodyDenyPatterns = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];
  return isShortBody && bodyDenyPatterns.some((pattern) => bodyText.includes(pattern));
}
|
|
308
|
+
|
|
309
|
+
/**
 * Load a URL in a fresh browser context and return the rendered HTML.
 *
 * Steps: create a context, pre-load `template.cookies` (if any), optionally
 * block resource types, navigate with a 15 s budget waiting for network
 * idle, reject 401/403/429 responses, reject CAPTCHA/block pages, and return
 * `page.content()`. The page and the context are always closed.
 *
 * Only a navigation *timeout* is tolerated (the partially rendered page is
 * still returned). Any other navigation failure — DNS, connection refused,
 * TLS, ... — is rethrown so the caller can retry or report it instead of
 * receiving a blank document.
 *
 * @param {string} url - Address to load.
 * @param {object|null} template - Optional template; `cookies` and `block_resources` are read.
 * @param {boolean} blockMedia - When true, abort requests whose resource type is
 *   in `template.block_resources` (default: image, media, font).
 * @returns {Promise<string>} Serialized HTML of the loaded page.
 * @throws {Error} On 401/403/429 (error from makeHttpStatusError), on a detected
 *   block page, or on a non-timeout navigation failure.
 */
async function fetchHtml(url, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    // Pre-load cookies from template
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    try {
      // Route blocked resource types
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];

        if (blockedTypes.length > 0) {
          await page.route("**/*", async (route) => {
            try {
              if (blockedTypes.includes(route.request().resourceType())) {
                await route.abort();
              } else {
                await route.continue();
              }
            } catch (_) {
              // The request was already handled or the page closed mid-flight;
              // nothing useful can be done for this single request.
            }
          });
        }
      }

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "networkidle",
          timeout: 15000,
        });
      } catch (navError) {
        // Allow partial rendering on timeout; surface every other failure.
        const isTimeout =
          navError?.name === "TimeoutError" ||
          /timeout/i.test(String(navError?.message ?? ""));
        if (!isTimeout) throw navError;
      }

      // Check HTTP status for access failures
      if (response) {
        const status = response.status();
        if ([401, 403, 429].includes(status)) {
          throw makeHttpStatusError(
            status,
            url,
            status === 429 ? parseRetryAfterMs(response.headers()["retry-after"]) : null,
          );
        }
      }

      const pageContent = await page.content();

      // Check for CAPTCHA / access-denied pages
      const $ = cheerio.load(pageContent);
      if (isAccessDenied($)) {
        throw new Error(
          `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
        );
      }

      return pageContent;
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}
|
|
381
|
+
|
|
382
|
+
/**
 * Call fetchHtml up to FETCH_MAX_ATTEMPTS times.
 *
 * A failed attempt is retried only while attempts remain and the error is
 * either an HTTP 429 (waits `err.retryAfterMs`, or HTTP_429_RETRY_DELAY_MS
 * when that is null/undefined) or a network-level failure whose message
 * contains "net::", "ERR_" or "Navigation failed" (waits 500 ms). Every
 * other error, and the error of the final attempt, is rethrown unchanged.
 *
 * @param {string} url - Address to load.
 * @param {object|null} template - Forwarded to fetchHtml.
 * @param {boolean} blockMedia - Forwarded to fetchHtml.
 * @returns {Promise<string>} HTML returned by the first successful attempt.
 */
async function fetchHtmlWithRetry(url, template, blockMedia) {
  const transientMarkers = ["net::", "ERR_", "Navigation failed"];
  let lastError;

  for (let attempt = 0; attempt < FETCH_MAX_ATTEMPTS; attempt++) {
    try {
      return await fetchHtml(url, template, blockMedia);
    } catch (err) {
      lastError = err;

      const attemptsRemain = attempt < FETCH_MAX_ATTEMPTS - 1;
      if (!attemptsRemain) throw err;

      let waitMs;
      if (err.httpStatus === 429) {
        waitMs = err.retryAfterMs ?? HTTP_429_RETRY_DELAY_MS;
      } else if (transientMarkers.some((marker) => err.message.includes(marker))) {
        waitMs = 500;
      } else {
        throw err;
      }

      await sleep(waitMs);
    }
  }

  throw lastError;
}
|
|
113
410
|
|
|
114
|
-
|
|
115
|
-
const $ = cheerio.load(pageContent);
|
|
411
|
+
// === HTML CLEANUP ========================================================
|
|
116
412
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
413
|
+
// Elements stripped from a page when the template does not list its own.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "svg",
  "nav",
  "footer",
  "noscript",
  "iframe",
  ".advertisement",
];

/**
 * Strip noise from a loaded document, in place.
 *
 * Removes every element matching the template's `remove` selectors (or
 * DEFAULT_REMOVE_SELECTORS when the template has none), then drops all
 * `style` attributes and any `src` attribute holding an inline `data:image`
 * payload. Selectors that make the selector engine throw are skipped.
 *
 * @param {Function} $ - Cheerio-style selector function bound to the document.
 * @param {object|null} template - Optional template; only `remove` is read.
 * @returns {void}
 */
function applyRemove($, template) {
  const custom = template && template.remove;
  const selectorsToDrop =
    custom && custom.length > 0 ? custom : DEFAULT_REMOVE_SELECTORS;

  for (const selector of selectorsToDrop) {
    try {
      $(selector).remove();
    } catch (_) {
      // Skip invalid selectors
    }
  }

  // Strip style attributes and data:image src
  $("[style]").removeAttr("style");
  $("*").each((_index, node) => {
    const wrapped = $(node);
    const source = wrapped.attr("src");
    if (source && source.startsWith("data:image")) {
      wrapped.removeAttr("src");
    }
  });
}
|
|
135
441
|
|
|
136
|
-
|
|
137
|
-
try {
|
|
138
|
-
link = decodeURIComponent(link.split("/url?q=")[1].split("&")[0]);
|
|
139
|
-
} catch (e) {}
|
|
140
|
-
}
|
|
442
|
+
// === EXTRACTION ENGINE ===================================================
|
|
141
443
|
|
|
142
|
-
|
|
143
|
-
|
|
444
|
+
/**
 * Locate elements for `selector` relative to `$parent`.
 *
 * Lookup order, first non-empty result wins:
 *   1. descendants of `$parent`;
 *   2. `$parent.closest(selector)`;
 *   3. descendants of each ancestor, walking up at most 4 levels.
 * A blank selector returns `$parent` itself. When nothing matches, an empty
 * selection is returned.
 */
function findScoped($parent, selector) {
  if (!selector || selector.trim() === "") return $parent;

  // 1. Descendants
  const below = $parent.find(selector);
  if (below.length > 0) return below;

  // 2. Closest ancestor matching selector
  const enclosing = $parent.closest(selector);
  if (enclosing.length > 0) return enclosing;

  // 3. Ancestor subtrees (up to 4 levels up)
  let scope = $parent.parent();
  let levelsClimbed = 0;
  while (levelsClimbed < 4 && scope.length > 0) {
    const nearby = scope.find(selector);
    if (nearby.length > 0) return nearby;
    scope = scope.parent();
    levelsClimbed += 1;
  }

  // Nothing matched: produce an empty selection.
  return $parent.find("__nonexistent__");
}
|
|
471
|
+
|
|
472
|
+
/**
 * Try comma-separated selectors in order; first match wins.
 *
 * Each alternative is resolved with findScoped relative to `$parent`. A
 * blank selector string returns `$parent` itself. An alternative that makes
 * the selector engine throw (e.g. invalid syntax) is skipped so the
 * remaining alternatives are still tried, matching how top-level selectors
 * are resolved. When nothing matches, an empty selection is returned.
 *
 * Note: the string is split on every comma, so a selector that itself
 * contains a comma (e.g. inside `:is(...)`) is treated as two alternatives.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $parent;
  }

  const selectors = selectorStr
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);

  for (const sel of selectors) {
    try {
      const matches = findScoped($parent, sel);
      if (matches.length > 0) return matches;
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $parent.find("__nonexistent__");
}
|
|
492
|
+
|
|
493
|
+
/**
 * Resolve the document-level elements for a section.
 *
 * The selector string is a comma-separated list of alternatives that are
 * tried in order against the whole document; the first alternative with at
 * least one match wins. A blank selector string selects `body`. Alternatives
 * that make the selector engine throw are skipped. With no match an empty
 * selection is returned.
 */
function resolveTopElements($, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") return $("body");

  for (const rawAlternative of selectorStr.split(",")) {
    const alternative = rawAlternative.trim();
    if (!alternative) continue;

    try {
      const hits = $(alternative);
      if (hits.length > 0) return hits;
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $();
}
|
|
517
|
+
|
|
518
|
+
// === TRANSFORMS ==========================================================
|
|
519
|
+
|
|
520
|
+
/**
 * Apply one transform, or a list of transforms in order, to an extracted string.
 *
 * Supported transform names:
 *  - "strip": trim surrounding whitespace.
 *  - "decode_google_url": for values starting with "/url?q=", take the `q`
 *    value up to the next "&" and percent-decode it.
 *  - "decode_ddg_url": for values containing "/l/?uddg=", replace the value
 *    with the `uddg` query parameter.
 *  - "json_parse": re-serialize valid JSON with 2-space indentation.
 *  - "resolve_href": resolve a root-relative path ("/x", not "//x") against `origin`.
 * Unknown names are ignored. A transform that fails leaves the value as it
 * was, and transforms are skipped once the value is empty/falsy.
 *
 * @param {string} value - Extracted value.
 * @param {string|string[]} transform - Transform name or ordered list of names.
 * @param {string} [origin] - Base URL used by "resolve_href".
 * @returns {string} The transformed value.
 */
function applyTransform(value, transform, origin) {
  const transforms = Array.isArray(transform) ? transform : [transform];
  let result = value;

  for (const t of transforms) {
    if (!result) continue;

    switch (t) {
      case "strip":
        result = result.trim();
        break;

      case "decode_google_url":
        if (result.startsWith("/url?q=")) {
          try {
            const urlPart = result.split("/url?q=")[1].split("&")[0];
            result = decodeURIComponent(urlPart);
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      case "decode_ddg_url":
        if (result.includes("/l/?uddg=")) {
          try {
            const queryString = result.split("?")[1] || "";
            // URLSearchParams already percent-decodes values. Decoding a second
            // time would corrupt escapes that belong to the target URL
            // (e.g. "%2520" -> "%20" -> " ") or throw on a literal "%".
            const uddg = new URLSearchParams(queryString).get("uddg");
            if (uddg) result = uddg;
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      case "json_parse":
        try {
          result = JSON.stringify(JSON.parse(result), null, 2);
        } catch (_) {
          // Leave as-is
        }
        break;

      case "resolve_href":
        if (origin && result.startsWith("/") && !result.startsWith("//")) {
          try {
            result = new URL(result, origin).href;
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      default:
        // Unknown transform: leave the value untouched.
        break;
    }
  }

  return result;
}
|
|
577
|
+
|
|
578
|
+
// === EXTRACTION ==========================================================
|
|
579
|
+
|
|
580
|
+
/**
 * Read a single value from an element according to `section.format`.
 *
 *  - "markdown": inner HTML converted with turndown, runs of 3+ newlines
 *    collapsed to 2, trimmed.
 *  - "attribute": value of `section.attribute`, or "" when absent.
 *  - "html": inner HTML, or "" when absent.
 *  - "text" and any other/missing format: text content with whitespace
 *    collapsed to single spaces and trimmed.
 *
 * When the section declares a `transform` and the value is non-empty, the
 * value is passed through applyTransform.
 *
 * @param {object} $el - Cheerio-style selection for the element.
 * @param {object} section - Section definition (`format`, `attribute`, `transform`).
 * @param {string} [origin] - Base URL forwarded to applyTransform.
 * @returns {string} The extracted (and possibly transformed) value.
 */
function extractValue($el, section, origin) {
  const { format, transform } = section;

  let raw;
  if (format === "markdown") {
    const markup = $el.html() || "";
    raw = turndown.turndown(markup).replace(/\n{3,}/g, "\n\n").trim();
  } else if (format === "attribute") {
    raw = $el.attr(section.attribute) || "";
  } else if (format === "html") {
    raw = $el.html() || "";
  } else {
    // "text" and every unrecognised format share the plain-text path.
    raw = $el.text().replace(/\s+/g, " ").trim();
  }

  return transform && raw ? applyTransform(raw, transform, origin) : raw;
}
|
|
615
|
+
|
|
616
|
+
/**
 * Extract a single child section relative to `$parentEl`.
 *
 * The child's selector is resolved with findFirstMatch and the value is read
 * from the first matching element.
 *
 * Returns `{ type: "value", text }`, or null when nothing matches and the
 * child is optional. Throws when nothing matches and the child is required.
 */
function extractChildSection($, $parentEl, section, origin) {
  const found = findFirstMatch($parentEl, section.selector);
  const nothingFound = !found || found.length === 0;

  if (nothingFound && section.required) {
    throw new Error(
      `Required section '${section.name}' not found on page.`,
    );
  }
  if (nothingFound) return null;

  return { type: "value", text: extractValue(found.eq(0), section, origin) };
}
|
|
636
|
+
|
|
637
|
+
/**
 * Extract a top-level section from the document.
 *
 * Result shapes:
 *  - `{ section, type: "value", text }`              single element, no children
 *  - `{ section, type: "children", items: {...} }`   single element with children
 *  - `{ section, type: "list", items: [...] }`       multiple elements, no children
 *  - `{ section, type: "children-multiple", items }` multiple elements with children
 *
 * For `multiple` sections the number of returned items is capped by
 * `section.max_items` and — for the first multiple+children section of a
 * web search — additionally by `context.maxResultsOverride` (this sets
 * `context._maxResultsConsumed`). The cap counts *emitted* items: elements
 * that yield nothing (blank text, no child values) do not use up the quota,
 * so later matching elements can still fill it.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @param {object} section - Section definition from the template.
 * @param {object} context - Extraction context (`origin`, `isWebsearch`,
 *   `maxResultsOverride`, `_maxResultsConsumed`); may be mutated.
 * @returns {object|null} A section result, or null when an optional section has no match.
 * @throws {Error} When a required section (or required child) is not found.
 */
function extractSection($, section, context) {
  const elements = resolveTopElements($, section.selector);

  if (!elements || elements.length === 0) {
    if (section.required) {
      throw new Error(
        `Required section '${section.name}' not found on page.`,
      );
    }
    return null;
  }

  const hasChildren = Boolean(section.children && section.children.length > 0);

  // Collect the non-null values of every child section for one parent element.
  const collectChildValues = (el) => {
    const childValues = {};
    for (const child of section.children) {
      const cr = extractChildSection($, el, child, context.origin);
      if (cr && cr.type === "value" && cr.text !== null && cr.text !== undefined) {
        childValues[child.name] = cr.text;
      }
    }
    return childValues;
  };

  if (!section.multiple) {
    // Single parent; with children the parent's own format is ignored.
    const el = elements.eq(0);
    return hasChildren
      ? { section, type: "children", items: collectChildValues(el) }
      : { section, type: "value", text: extractValue(el, section, context.origin) };
  }

  // Determine limit
  let limit = elements.length;
  if (section.max_items) {
    limit = Math.min(limit, section.max_items);
  }
  // Cap with max_results for the first multiple+children section of a search
  if (
    context.isWebsearch &&
    context.maxResultsOverride &&
    !context._maxResultsConsumed &&
    hasChildren
  ) {
    limit = Math.min(limit, context.maxResultsOverride);
    context._maxResultsConsumed = true;
  }

  const items = [];
  for (let i = 0; i < elements.length && items.length < limit; i++) {
    const el = elements.eq(i);

    if (hasChildren) {
      const childValues = collectChildValues(el);
      if (Object.keys(childValues).length > 0) {
        items.push(childValues);
      }
    } else {
      const value = extractValue(el, section, context.origin);
      if (value && value.trim()) {
        items.push(value.trim());
      }
    }
  }

  return { section, type: hasChildren ? "children-multiple" : "list", items };
}
|
|
213
723
|
|
|
214
|
-
|
|
215
|
-
const
|
|
216
|
-
const context = await browser.newContext();
|
|
217
|
-
const page = await context.newPage();
|
|
724
|
+
/**
 * Run every section of a template against the document.
 *
 * Sections that produce null are omitted. An error whose message contains
 * "Required section" aborts the whole extraction; any other error only
 * skips the section that raised it.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @param {object} template - Template whose `sections` are extracted in order.
 * @param {object} context - Extraction context forwarded to extractSection.
 * @returns {object[]} Section results in template order.
 */
function extractTemplate($, template, context) {
  const collected = [];

  for (const section of template.sections) {
    let outcome;
    try {
      outcome = extractSection($, section, context);
    } catch (err) {
      const isRequiredFailure =
        err.message && err.message.includes("Required section");
      if (isRequiredFailure) throw err;
      // Non-required failures are silently skipped
      continue;
    }

    if (outcome !== null) collected.push(outcome);
  }

  return collected;
}
|
|
746
|
+
|
|
747
|
+
// === COMPOSITION: WEBFETCH ===============================================
|
|
748
|
+
|
|
749
|
+
/**
 * Decide whether a multi-item result looks like a comment thread.
 *
 * Only the first item is inspected. It qualifies when its keys (compared
 * case-insensitively) include "author" or "user" together with "comment"
 * or "body".
 *
 * @param {object} result - Section result with an `items` array of objects.
 * @returns {boolean} True for comment-shaped items, false otherwise or when empty.
 */
function isCommentStyle(result) {
  if (!result.items || result.items.length === 0) return false;

  const fieldNames = new Set(
    Object.keys(result.items[0]).map((key) => key.toLowerCase()),
  );
  const hasSpeaker = fieldNames.has("author") || fieldNames.has("user");
  const hasMessage = fieldNames.has("comment") || fieldNames.has("body");

  return hasSpeaker && hasMessage;
}
|
|
758
|
+
|
|
759
|
+
/**
 * Render extracted section results as Markdown and return one page of it.
 *
 * Rendering per result type:
 *  - "value":             `## <section name>` followed by the trimmed text.
 *  - "list":              `## <section name>` followed by a `- item` bullet list.
 *  - "children":          one `## <child name>` block per non-blank child value.
 *  - "children-multiple": comment-shaped items (see isCommentStyle) are
 *    rendered as `**author:**` + comment blocks separated by `---`; all other
 *    items as ` key: value` lines, items separated by a blank line.
 * Blocks are joined with `---` rules. The joined text is sliced to
 * `[startIndex, startIndex + maxLength)` and a metadata footer describing the
 * slice (and how to request the next one) is appended.
 *
 * Comment fields are looked up by their usual spellings first and then
 * case-insensitively, so items are not dropped when a template uses a
 * different capitalisation than "Author"/"author" etc.
 *
 * @param {object[]} extracted - Section results from extractTemplate.
 * @param {object|null} template - Template used (only `name` is read); null renders as "auto".
 * @param {number} startIndex - Offset of the first character to return.
 * @param {number} maxLength - Maximum number of content characters to return.
 * @returns {string} The page of Markdown plus footer, or a placeholder when nothing was extracted.
 */
function composeSections(extracted, template, startIndex, maxLength) {
  const hasText = (value) => Boolean(value) && String(value).trim() !== "";

  // First truthy value among the exact keys, then among keys that match
  // those names case-insensitively; "" when none is present.
  const pickField = (item, exactKeys) => {
    for (const key of exactKeys) {
      if (item[key]) return item[key];
    }
    const wanted = new Set(exactKeys.map((key) => key.toLowerCase()));
    for (const [key, value] of Object.entries(item)) {
      if (value && wanted.has(key.toLowerCase())) return value;
    }
    return "";
  };

  const renderComments = (result) => {
    const commentParts = [];
    for (const item of result.items) {
      const author = pickField(item, ["Author", "author", "User", "user"]);
      const comment = pickField(item, ["Comment", "Body", "comment", "body"]);
      if (author) {
        commentParts.push(`**${author}:**\n\n${comment}`);
      } else if (comment) {
        commentParts.push(comment);
      }
    }
    return commentParts.join("\n\n---\n\n");
  };

  const renderRecords = (result) => {
    const itemParts = [];
    for (const item of result.items) {
      const lines = [];
      for (const [key, value] of Object.entries(item)) {
        if (hasText(value)) {
          lines.push(` ${key}: ${String(value).trim()}`);
        }
      }
      if (lines.length > 0) itemParts.push(lines.join("\n"));
    }
    return itemParts.join("\n\n");
  };

  const parts = [];

  for (const result of extracted) {
    if (result.type === "value") {
      if (hasText(result.text)) {
        parts.push(`## ${result.section.name}\n\n${String(result.text).trim()}`);
      }
    } else if (result.type === "list") {
      if (result.items && result.items.length > 0) {
        const listText = result.items.map((item) => `- ${item}`).join("\n");
        parts.push(`## ${result.section.name}\n\n${listText}`);
      }
    } else if (result.type === "children") {
      if (result.items) {
        for (const [childName, value] of Object.entries(result.items)) {
          if (hasText(value)) {
            parts.push(`## ${childName}\n\n${String(value).trim()}`);
          }
        }
      }
    } else if (result.type === "children-multiple") {
      if (result.items && result.items.length > 0) {
        const body = isCommentStyle(result) ? renderComments(result) : renderRecords(result);
        if (body) {
          parts.push(`## ${result.section.name}\n\n${body}`);
        }
      }
    }
  }

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  const full = parts.join("\n\n---\n\n");
  const totalLength = full.length;
  const paginated = full.substring(startIndex, startIndex + maxLength);

  const templateName = template ? template.name : "auto";
  let metadata = `\n\n---\n[webfetch: template="${templateName}", showing characters ${startIndex} to ${startIndex + paginated.length} of ${totalLength} total.`;
  if (startIndex + maxLength < totalLength) {
    metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
  }
  metadata += `]`;

  return paginated + metadata;
}
|
|
839
|
+
|
|
840
|
+
// === COMPOSITION: WEBSEARCH ==============================================
|
|
841
|
+
|
|
842
|
+
/**
 * Render search results as a numbered list.
 *
 * The first "children-multiple" result is taken as the result list. When
 * there is none (or it is empty) the output falls back to
 * `composeSections(extracted, null, 0, Infinity)`.
 *
 * Per item: the title is `Title`/`title` or, failing that, the item's first
 * value; items without a title are skipped. The URL (`URL`/`url`/`Url`) is
 * shown only when it starts with "http" and is not a Google search or
 * support link. The snippet (`Snippet`/`snippet`) is shown when present.
 *
 * Numbering is sequential over the results actually shown, so skipped
 * entries do not leave gaps in the list.
 *
 * @param {object[]} extracted - Section results from extractTemplate.
 * @returns {string} Markdown list of results, or a placeholder when nothing is shown.
 */
function composeSearchResults(extracted) {
  // Find the search results section (first children-multiple)
  const searchSection = extracted.find((r) => r.type === "children-multiple");

  if (!searchSection || !searchSection.items || searchSection.items.length === 0) {
    // Fall back to section-based output
    return composeSections(extracted, null, 0, Infinity);
  }

  const isGoogleInternal = (link) =>
    link.includes("google.com/search") || link.includes("support.google.com");

  const parts = [];

  for (const item of searchSection.items) {
    const title = item["Title"] || item["title"] || Object.values(item)[0] || "";
    if (!title) continue;

    const url = item["URL"] || item["url"] || item["Url"] || "";
    const snippet = item["Snippet"] || item["snippet"] || "";

    // Keep only absolute web URLs that are not Google-internal links.
    const cleanUrl = url && url.startsWith("http") && !isGoogleInternal(url) ? url : "";

    const lines = [`[${parts.length + 1}] ${title}`];
    if (cleanUrl) lines.push(` URL: ${cleanUrl}`);
    if (snippet) lines.push(` Snippet: ${snippet}`);

    parts.push(lines.join("\n"));
  }

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  return `## ${searchSection.section.name}\n\n${parts.join("\n\n")}`;
}
|
|
893
|
+
|
|
894
|
+
// === GENERIC FALLBACK ====================================================
|
|
895
|
+
|
|
896
|
+
/**
 * Template-less extraction path: convert the page body to Markdown and return
 * one page of it together with a pagination footer.
 *
 * `applyRemove` is run with a null template first, then the remaining body
 * HTML is passed through `turndown`. Runs of three or more newlines are
 * collapsed to a single blank line and the result is trimmed.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {number} startIndex - Character offset at which the page starts.
 * @param {number} maxLength - Maximum number of characters in the page.
 * @returns {string} The requested slice plus a metadata footer, or a
 *   placeholder message when the conversion produced no text.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const bodyMarkup = $("body").html() || "";
  const converted = turndown.turndown(bodyMarkup);
  const markdown = converted.replace(/\n{3,}/g, "\n\n").trim();

  // Already trimmed, so an empty string is the only "no content" case.
  if (markdown.length === 0) {
    return "(No content extracted from this page.)";
  }

  const totalLength = markdown.length;
  const page = markdown.substring(startIndex, startIndex + maxLength);

  // Only advertise a follow-up offset when text remains past this page.
  const readMoreHint =
    startIndex + maxLength < totalLength
      ? ` Use start_index=${startIndex + maxLength} to read more.`
      : "";

  return `${page}\n\n---\n[webfetch: template="auto (fallback)", showing characters ${startIndex} to ${startIndex + page.length} of ${totalLength} total.${readMoreHint}]`;
}
|
|
920
|
+
|
|
921
|
+
// === SEARCH TEMPLATE RESOLUTION ==========================================
|
|
922
|
+
|
|
923
|
+
/**
 * Resolve the template for a search engine and build the search URL.
 *
 * The engine is first mapped to a template name via
 * `resolveEngineToTemplateName`. A name beginning with "{" is parsed as an
 * inline JSON template; any other name is looked up with `getTemplateByName`.
 * The URL is then built by `resolveUrlTemplate` from the parameters returned
 * by `mapSearchParams`.
 *
 * @param {string} engine - Engine identifier supplied by the caller.
 * @param {string} query - Search query.
 * @param {string|null} region - Region code, forwarded to `mapSearchParams`.
 * @param {boolean|null} safeSearch - Safe-search flag, forwarded to
 *   `mapSearchParams`; `true` additionally adds `safe=active` for Google.
 * @returns {{template: object, url: string}} The resolved template and URL.
 * @throws {Error} When inline JSON cannot be parsed, or when the resolved
 *   template is missing or has no `url_template`.
 */
function resolveSearchTemplate(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);
  const isInline = templateName.startsWith("{");

  let template;
  if (isInline) {
    try {
      template = JSON.parse(templateName);
    } catch (e) {
      throw new Error(`Invalid inline JSON template: ${e.message}`);
    }
  } else {
    template = getTemplateByName(templateName);
  }

  // Guard a missing template too, so an unknown name yields a readable error
  // instead of a TypeError on the property access.
  if (!template || !template.url_template) {
    let label = isInline ? "(inline)" : templateName;
    if (template && template.name) {
      label = template.name;
    }
    throw new Error(
      `Template '${label}' is not a search template (no url_template).`,
    );
  }

  const params = mapSearchParams(engine, query, region, safeSearch);
  let url = resolveUrlTemplate(template, params);

  // Google safe_search: append safe=active to URL
  if (
    (engine === "google" || templateName === "google-search") &&
    safeSearch === true
  ) {
    // Pick the separator from the URL so one without a query string stays valid.
    const separator = url.includes("?") ? "&" : "?";
    url += `${separator}safe=active`;
  }

  return { template, url };
}
|
|
287
956
|
|
|
288
|
-
|
|
957
|
+
// === MCP SERVER & TOOLS ==================================================
|
|
958
|
+
|
|
959
|
+
// MCP server instance that the tool registrations in this section attach to.
// The name and version reported by the server are hard-coded here.
const server = new McpServer({ name: "searchfetch", version: "3.0.1" });
|
|
960
|
+
|
|
961
|
+
// --- websearch tool ---
|
|
289
962
|
|
|
290
963
|
server.registerTool(
|
|
291
964
|
"websearch",
|
|
@@ -296,38 +969,67 @@ server.registerTool(
|
|
|
296
969
|
inputSchema: z.object({
|
|
297
970
|
query: z.string().describe("The search query string."),
|
|
298
971
|
engine: z
|
|
299
|
-
.
|
|
972
|
+
.string()
|
|
300
973
|
.default("duckduckgo")
|
|
301
974
|
.describe(
|
|
302
975
|
"Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
|
|
303
976
|
),
|
|
304
|
-
max_results: z
|
|
305
|
-
.number()
|
|
306
|
-
.default(10)
|
|
307
|
-
.describe("Maximum number of results to return. Default is 10."),
|
|
308
977
|
region: z
|
|
309
978
|
.string()
|
|
310
|
-
.
|
|
979
|
+
.nullable()
|
|
980
|
+
.default(null)
|
|
311
981
|
.describe(
|
|
312
|
-
"Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is
|
|
982
|
+
"Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
|
|
313
983
|
),
|
|
314
984
|
safe_search: z
|
|
315
|
-
.
|
|
316
|
-
.
|
|
985
|
+
.boolean()
|
|
986
|
+
.nullable()
|
|
987
|
+
.default(null)
|
|
988
|
+
.describe(
|
|
989
|
+
"Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
|
|
990
|
+
),
|
|
991
|
+
max_results: z
|
|
992
|
+
.number()
|
|
993
|
+
.default(10)
|
|
994
|
+
.describe("Maximum number of search results to return. Default is 10."),
|
|
995
|
+
block_media: z
|
|
996
|
+
.boolean()
|
|
997
|
+
.default(true)
|
|
317
998
|
.describe(
|
|
318
|
-
"
|
|
999
|
+
"Block images, videos, and fonts entirely at the network layer. Default is true.",
|
|
319
1000
|
),
|
|
320
1001
|
}),
|
|
321
1002
|
},
|
|
322
|
-
async ({ query,
|
|
1003
|
+
async ({ query, engine, region, safe_search, max_results, block_media }) => {
|
|
323
1004
|
try {
|
|
324
|
-
|
|
1005
|
+
// 1. Resolve search template (+ url_params mapping + url building)
|
|
1006
|
+
const { template, url } = resolveSearchTemplate(
|
|
1007
|
+
engine,
|
|
325
1008
|
query,
|
|
326
|
-
max_results,
|
|
327
1009
|
region,
|
|
328
1010
|
safe_search,
|
|
329
|
-
engine,
|
|
330
1011
|
);
|
|
1012
|
+
|
|
1013
|
+
// 2. Fetch
|
|
1014
|
+
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
1015
|
+
|
|
1016
|
+
// 3. Extract
|
|
1017
|
+
const $ = cheerio.load(html);
|
|
1018
|
+
applyRemove($, template);
|
|
1019
|
+
|
|
1020
|
+
const pageOrigin = new URL(url).origin;
|
|
1021
|
+
const context = {
|
|
1022
|
+
origin: pageOrigin,
|
|
1023
|
+
isWebsearch: true,
|
|
1024
|
+
maxResultsOverride: max_results,
|
|
1025
|
+
_maxResultsConsumed: false,
|
|
1026
|
+
};
|
|
1027
|
+
|
|
1028
|
+
const extracted = extractTemplate($, template, context);
|
|
1029
|
+
|
|
1030
|
+
// 4. Compose
|
|
1031
|
+
const result = composeSearchResults(extracted);
|
|
1032
|
+
|
|
331
1033
|
return { content: [{ type: "text", text: result }] };
|
|
332
1034
|
} catch (err) {
|
|
333
1035
|
return {
|
|
@@ -338,6 +1040,8 @@ server.registerTool(
|
|
|
338
1040
|
},
|
|
339
1041
|
);
|
|
340
1042
|
|
|
1043
|
+
// --- webfetch tool ---
|
|
1044
|
+
|
|
341
1045
|
server.registerTool(
|
|
342
1046
|
"webfetch",
|
|
343
1047
|
{
|
|
@@ -345,22 +1049,20 @@ server.registerTool(
|
|
|
345
1049
|
description:
|
|
346
1050
|
"Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
|
|
347
1051
|
inputSchema: z.object({
|
|
348
|
-
url: z
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
)
|
|
353
|
-
|
|
354
|
-
.enum(["markdown", "clean_html", "raw_html"])
|
|
355
|
-
.default("markdown")
|
|
1052
|
+
url: z.string().describe(
|
|
1053
|
+
"The full URL of the webpage to fetch (must start with http/https).",
|
|
1054
|
+
),
|
|
1055
|
+
template: z
|
|
1056
|
+
.string()
|
|
1057
|
+
.default("auto")
|
|
356
1058
|
.describe(
|
|
357
|
-
"
|
|
1059
|
+
"Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
|
|
358
1060
|
),
|
|
359
1061
|
start_index: z
|
|
360
1062
|
.number()
|
|
361
1063
|
.default(0)
|
|
362
1064
|
.describe(
|
|
363
|
-
"Character offset
|
|
1065
|
+
"Character offset for pagination. Default: 0.",
|
|
364
1066
|
),
|
|
365
1067
|
max_length: z
|
|
366
1068
|
.number()
|
|
@@ -372,20 +1074,50 @@ server.registerTool(
|
|
|
372
1074
|
.boolean()
|
|
373
1075
|
.default(true)
|
|
374
1076
|
.describe(
|
|
375
|
-
"Block images, videos, and fonts entirely at the network layer
|
|
1077
|
+
"Block images, videos, and fonts entirely at the network layer. Default is true.",
|
|
376
1078
|
),
|
|
377
1079
|
}),
|
|
378
1080
|
},
|
|
379
|
-
async ({ url,
|
|
1081
|
+
async ({ url, template: templateParam, start_index, max_length, block_media }) => {
|
|
380
1082
|
try {
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
1083
|
+
// 1. Resolve template
|
|
1084
|
+
let template;
|
|
1085
|
+
|
|
1086
|
+
if (templateParam.startsWith("{")) {
|
|
1087
|
+
try {
|
|
1088
|
+
template = JSON.parse(templateParam);
|
|
1089
|
+
} catch (e) {
|
|
1090
|
+
throw new Error(`Invalid inline JSON template: ${e.message}`);
|
|
1091
|
+
}
|
|
1092
|
+
} else if (templateParam === "auto") {
|
|
1093
|
+
template = detectTemplateByUrl(url);
|
|
1094
|
+
} else {
|
|
1095
|
+
template = getTemplateByName(templateParam);
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1098
|
+
// 2. Fetch
|
|
1099
|
+
const html = await fetchHtmlWithRetry(url, template, block_media);
|
|
1100
|
+
|
|
1101
|
+
// 3. Extract and compose
|
|
1102
|
+
const $ = cheerio.load(html);
|
|
1103
|
+
|
|
1104
|
+
if (template) {
|
|
1105
|
+
applyRemove($, template);
|
|
1106
|
+
|
|
1107
|
+
const pageOrigin = new URL(url).origin;
|
|
1108
|
+
const context = {
|
|
1109
|
+
origin: pageOrigin,
|
|
1110
|
+
isWebsearch: false,
|
|
1111
|
+
};
|
|
1112
|
+
|
|
1113
|
+
const extracted = extractTemplate($, template, context);
|
|
1114
|
+
const result = composeSections(extracted, template, start_index, max_length);
|
|
1115
|
+
return { content: [{ type: "text", text: result }] };
|
|
1116
|
+
} else {
|
|
1117
|
+
// Generic fallback
|
|
1118
|
+
const result = genericFallback($, start_index, max_length);
|
|
1119
|
+
return { content: [{ type: "text", text: result }] };
|
|
1120
|
+
}
|
|
389
1121
|
} catch (err) {
|
|
390
1122
|
return {
|
|
391
1123
|
content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
|
|
@@ -395,6 +1127,8 @@ server.registerTool(
|
|
|
395
1127
|
},
|
|
396
1128
|
);
|
|
397
1129
|
|
|
1130
|
+
// === MAIN =================================================================
|
|
1131
|
+
|
|
398
1132
|
async function main() {
|
|
399
1133
|
await ensureBinary();
|
|
400
1134
|
process.stdout.write = originalStdoutWrite;
|