searchfetch 1.0.2 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +68 -53
- package/index.js +961 -265
- package/package.json +7 -10
- package/templates/crates-package.json +37 -0
- package/templates/docs-page.json +31 -0
- package/templates/duckduckgo-search.json +67 -0
- package/templates/github-issue.json +69 -0
- package/templates/github-repo.json +36 -0
- package/templates/google-search.json +75 -0
- package/templates/npm-package.json +49 -0
- package/templates/pypi-package.json +43 -0
package/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
+
// === STDOUT/STDERR REDIRECTION ===========================================
|
|
3
4
|
const originalStdoutWrite = process.stdout.write.bind(process.stdout);
|
|
4
5
|
process.stdout.write = (chunk, encoding, callback) => {
|
|
5
6
|
return process.stderr.write(chunk, encoding, callback);
|
|
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
|
|
|
7
8
|
console.log = (...args) => console.error(...args);
|
|
8
9
|
console.info = (...args) => console.error(...args);
|
|
9
10
|
|
|
11
|
+
// === IMPORTS =============================================================
|
|
12
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
13
|
+
import { dirname, join } from "node:path";
|
|
14
|
+
import { fileURLToPath } from "node:url";
|
|
10
15
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
11
16
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
12
17
|
import { z } from "zod";
|
|
@@ -14,37 +19,47 @@ import { launch, ensureBinary } from "cloakbrowser";
|
|
|
14
19
|
import * as cheerio from "cheerio";
|
|
15
20
|
import TurndownService from "turndown";
|
|
16
21
|
|
|
17
|
-
|
|
18
|
-
info: (msg) => console.error(`[INFO] ${msg}`),
|
|
19
|
-
warn: (msg) => console.error(`[WARN] ${msg}`),
|
|
20
|
-
error: (msg, err) => console.error(`[ERROR] ${msg}`, err || ""),
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
// ==========================================
|
|
24
|
-
// BROWSER LIFECYCLE MANAGEMENT
|
|
25
|
-
// ==========================================
|
|
22
|
+
// === BROWSER MANAGER =====================================================
|
|
26
23
|
/**
 * Owns the single shared browser instance used by every fetch.
 *
 * The browser is launched lazily on first use. Callers that arrive while a
 * launch is still in flight share the same pending promise instead of
 * starting a second browser.
 */
class BrowserManager {
  constructor() {
    // Currently connected browser, or null when none is running.
    this.browser = null;
    // Pending launch() promise while a launch is in flight, otherwise null.
    this.launchPromise = null;
  }

  /**
   * Return the connected browser, launching one if necessary.
   *
   * @returns {Promise<object>} Resolves with the browser produced by launch().
   *   Rejects with the launch error when the browser cannot be started.
   */
  async getBrowser() {
    if (this.browser && this.browser.isConnected()) return this.browser;
    if (this.launchPromise) return this.launchPromise;

    this.launchPromise = launch({
      headless: true,
      humanize: true,
      args: [
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
      ],
    })
      .then((browser) => {
        this.browser = browser;
        browser.on("disconnected", () => {
          // Only forget the browser that actually disconnected: a late event
          // from a previous instance must not discard its replacement.
          if (this.browser === browser) {
            this.browser = null;
          }
        });
        return browser;
      })
      .finally(() => {
        // Success or failure, the next call must be free to launch again.
        this.launchPromise = null;
      });

    return this.launchPromise;
  }

  /**
   * Close the browser if one is running (or still being launched).
   *
   * @returns {Promise<void>} Rejects if the underlying browser.close() fails.
   */
  async close() {
    if (this.launchPromise) {
      try {
        // Wait for an in-flight launch so the new browser is not leaked.
        await this.launchPromise;
      } catch {
        // The launch failed, so there is nothing to close.
      }
    }

    const browser = this.browser;
    this.browser = null;
    if (browser) {
      await browser.close();
    }
  }
}
|
|
@@ -52,358 +67,1039 @@ class BrowserManager {
|
|
|
52
67
|
const browserManager = new BrowserManager();
|
|
53
68
|
|
|
54
69
|
/**
 * Signal handler: close the shared browser, then terminate the process.
 *
 * The exit happens in `finally` so that a failing browser shutdown is
 * reported and the process still terminates, instead of the rejection
 * escaping the signal handler unhandled.
 */
const cleanup = async () => {
  let exitCode = 0;
  try {
    await browserManager.close();
  } catch (err) {
    exitCode = 1;
    console.error("[ERROR] Failed to close browser during shutdown:", err);
  } finally {
    process.exit(exitCode);
  }
};
|
|
59
73
|
process.on("SIGINT", cleanup);
|
|
60
74
|
process.on("SIGTERM", cleanup);
|
|
61
75
|
|
|
62
|
-
//
|
|
63
|
-
|
|
64
|
-
|
|
76
|
+
// === TURNDOWN ============================================================
|
|
77
|
+
const turndown = new TurndownService({
|
|
78
|
+
headingStyle: "atx",
|
|
79
|
+
codeBlockStyle: "fenced",
|
|
80
|
+
emDelimiter: "*",
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
// === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
|
|
65
84
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
85
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
86
|
+
const __dirname = dirname(__filename);
|
|
87
|
+
const TEMPLATES_DIR = join(__dirname, "templates");
|
|
88
|
+
|
|
89
|
+
/**
 * Load every `*.json` template definition from a directory.
 *
 * Files are read in alphabetical order and the resulting templates are then
 * sorted by their numeric `order` field (templates without a usable `order`
 * sort as 999), which fixes the order in which URL patterns are tried.
 *
 * @param {string} [dir=TEMPLATES_DIR] Directory containing the template files.
 * @returns {object[]} Parsed template objects, each with a non-empty string `name`.
 * @throws {Error} When the directory or a file cannot be read, when no
 *   `.json` file exists, when a file is not valid JSON, or when a template
 *   has no valid `name`.
 */
function loadBuiltinTemplates(dir = TEMPLATES_DIR) {
  let files;
  try {
    files = readdirSync(dir);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${dir}': ${err.message}`,
    );
  }

  const jsonFiles = files.filter((f) => f.endsWith(".json")).sort();
  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${dir}'`);
  }

  const templates = jsonFiles.map((file) => {
    const filePath = join(dir, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
      );
    }

    // `template` may legitimately parse to null or a primitive; treat that
    // the same as a missing name instead of crashing on property access.
    const name = template?.name;
    if (!name || typeof name !== "string") {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }
    return template;
  });

  // A non-numeric "order" would make the comparator return NaN and leave the
  // sort order unspecified, so such values fall back to the default rank.
  const rank = (template) => {
    const value = Number(template.order ?? 999);
    return Number.isFinite(value) ? value : 999;
  };
  templates.sort((a, b) => rank(a) - rank(b));

  return templates;
}
|
|
134
|
+
|
|
135
|
+
const BUILTIN_TEMPLATES = loadBuiltinTemplates();
|
|
136
|
+
|
|
137
|
+
// === TEMPLATE LOOKUP =====================================================
|
|
138
|
+
const TEMPLATE_MAP = new Map();
|
|
139
|
+
for (const t of BUILTIN_TEMPLATES) {
|
|
140
|
+
TEMPLATE_MAP.set(t.name, t);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/**
 * Look up a built-in template by its exact name.
 *
 * @param {string} name Template name as registered in TEMPLATE_MAP.
 * @returns {object} The matching template.
 * @throws {Error} When no template has that name; the message lists every
 *   registered name.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) {
    return found;
  }

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
|
|
153
|
+
|
|
154
|
+
/**
 * Find the first built-in template with a URL pattern that matches `url`.
 *
 * Templates are tried in BUILTIN_TEMPLATES order and each template's
 * `url_patterns` entries are compiled as regular expressions. A pattern that
 * fails to compile is ignored.
 *
 * @param {string} url URL to classify.
 * @returns {object|null} The first matching template, or null when none match.
 */
function detectTemplateByUrl(url) {
  const matchesUrl = (pattern) => {
    try {
      return new RegExp(pattern).test(url);
    } catch (_) {
      // An invalid pattern counts as "no match".
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (!patterns) {
      continue;
    }
    for (const pattern of patterns) {
      if (matchesUrl(pattern)) {
        return candidate;
      }
    }
  }

  return null;
}
|
|
169
|
+
|
|
170
|
+
// === URL TEMPLATE RESOLUTION =============================================
|
|
171
|
+
|
|
172
|
+
/**
 * Expand the `{name}` placeholders of a template's `url_template`.
 *
 * Each placeholder is resolved, in this order, from:
 *   1. `providedParams[name]` when present and not null/undefined;
 *   2. the `default` declared in `template.url_params[name]`;
 *   3. an empty string, unless the parameter is declared `required`.
 * Values whose definition has `encode: "url"` are percent-encoded.
 *
 * All placeholders are substituted in a single pass with a replacer
 * function. Scanning with a stateful /g regex while rewriting the same
 * string skips placeholders that follow a short value, and a plain string
 * replacement would interpret `$&`-style sequences inside values.
 *
 * @param {object} template Template with `url_template`, optional `url_params` and `name`.
 * @param {object} [providedParams] Caller-supplied values keyed by placeholder name.
 * @returns {string|null} The resolved URL, or null when the template has no `url_template`.
 * @throws {Error} When a required parameter has neither a value nor a default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const provided = providedParams || {};
  const hasProvided = (name) =>
    Object.prototype.hasOwnProperty.call(provided, name) &&
    provided[name] !== null &&
    provided[name] !== undefined;

  const resolved = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    const def = urlParams[name] || {};

    let value;
    if (hasProvided(name)) {
      value = String(provided[name]);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by placeholders that resolved to nothing.
  return resolved.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
|
|
212
|
+
|
|
213
|
+
// === SEARCH PARAM MAPPING ================================================
|
|
214
|
+
|
|
215
|
+
/**
 * Translate a search-engine alias into the name of its search template.
 *
 * @param {string} engine Engine alias or template name.
 * @returns {string} The template name for a known alias; any other value is
 *   returned unchanged.
 */
function resolveEngineToTemplateName(engine) {
  switch (engine) {
    case "duckduckgo":
      return "duckduckgo-search";
    case "google":
      return "google-search";
    default:
      return engine;
  }
}
|
|
220
|
+
|
|
221
|
+
/**
 * Build the URL parameters for a search request.
 *
 * The result always carries `query`. For the DuckDuckGo template, `region`
 * is passed through as `kl` and `safeSearch` maps to `kp` ("1" for true,
 * "-2" for false, omitted otherwise). For the Google template, `region` is
 * split on "-" into `hl` (first part) and `gl` (second part, or the first
 * part again when there is no second).
 *
 * @param {string} engine Engine alias or template name.
 * @param {string} query Search query.
 * @param {string|null|undefined} region Region code; ignored when null/undefined.
 * @param {boolean|null|undefined} safeSearch Safe-search preference.
 * @returns {object} Parameter object keyed by placeholder name.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const mapped = { query };
  const hasRegion = region !== null && region !== undefined;

  switch (resolveEngineToTemplateName(engine)) {
    case "duckduckgo-search":
      if (hasRegion) {
        mapped.kl = region;
      }
      if (safeSearch === true) {
        mapped.kp = "1";
      } else if (safeSearch === false) {
        mapped.kp = "-2";
      }
      break;

    case "google-search":
      if (hasRegion) {
        const [language, country = language] = region.split("-");
        mapped.hl = language;
        mapped.gl = country;
      }
      break;

    default:
      break;
  }

  return mapped;
}
|
|
244
|
+
|
|
245
|
+
// === FETCH ===============================================================
|
|
246
|
+
|
|
247
|
+
/**
 * Heuristically decide whether a loaded document is a CAPTCHA or block page
 * rather than real content.
 *
 * Three tiers are checked:
 *   1. Title phrases that only block pages use: always a denial.
 *   2. Generic title words ("cloudflare", "blocked", "forbidden", ...): a
 *      denial only when the page body is short, because ordinary pages such
 *      as documentation or issue threads also carry these words in their
 *      titles.
 *   3. Known block-page sentences in a short body.
 *
 * @param {Function} $ Cheerio-style root function for the loaded document.
 * @returns {boolean} true when the page looks like an access-denied page.
 */
function isAccessDenied($) {
  const normalize = (text) =>
    (text || "").replace(/\s+/g, " ").trim().toLowerCase();

  // Only the first <title> is the document title; later matches are
  // typically <title> elements nested in inline SVG.
  const title = normalize($("title").first().text());
  const bodyText = normalize($("body").text());
  const isShortPage = bodyText.length < 1200;

  const definitiveTitlePatterns = [
    "are you a robot",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
  ];
  if (definitiveTitlePatterns.some((pattern) => title.includes(pattern))) {
    return true;
  }

  // Everything below is only trusted on short pages.
  if (!isShortPage) return false;

  const ambiguousTitlePatterns = [
    "captcha",
    "access denied",
    "blocked",
    "forbidden",
    "one more step",
    "security check",
    "ddos protection",
    "cloudflare",
  ];
  if (ambiguousTitlePatterns.some((pattern) => title.includes(pattern))) {
    return true;
  }

  const bodyDenyPatterns = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];
  return bodyDenyPatterns.some((pattern) => bodyText.includes(pattern));
}
|
|
281
|
+
|
|
282
|
+
/**
 * Load `url` in a fresh browser context and return the rendered HTML.
 *
 * @param {string} url Page to fetch.
 * @param {object|null} template Optional template; its `cookies` are added
 *   to the context and its `block_resources` replaces the default list of
 *   blocked resource types.
 * @param {boolean} blockMedia When truthy, requests of the blocked resource
 *   types are aborted.
 * @returns {Promise<string>} The page's HTML content.
 * @throws {Error} On a network-level navigation failure (`net::ERR_*`), on
 *   HTTP 401/403/429, or when the content looks like a CAPTCHA/block page.
 */
async function fetchHtml(url, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    // Pre-load cookies from template
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    try {
      // Route blocked resource types
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];

        if (blockedTypes.length > 0) {
          await page.route("**/*", (route) => {
            const type = route.request().resourceType();
            const decision = blockedTypes.includes(type)
              ? route.abort()
              : route.continue();
            // Requests still in flight when the page closes reject here.
            // That is expected and must not become an unhandled rejection.
            return Promise.resolve(decision).catch(() => {});
          });
        }
      }

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "networkidle",
          timeout: 15000,
        });
      } catch (navError) {
        // A network-level failure means no document was loaded at all.
        // Propagate it so the caller can retry instead of returning the
        // browser's error page as content.
        if (/net::ERR_/.test(String(navError?.message ?? ""))) {
          throw navError;
        }
        // Anything else (typically the navigation timeout) is tolerated so
        // that a partially rendered page can still be used.
      }

      // Check HTTP status for access failures
      if (response) {
        const status = response.status();
        if ([401, 403, 429].includes(status)) {
          throw new Error(
            `Access denied: HTTP ${status} when fetching ${url}`,
          );
        }
      }

      const pageContent = await page.content();

      // Check for CAPTCHA / access-denied pages
      const $ = cheerio.load(pageContent);
      if (isAccessDenied($)) {
        throw new Error(
          `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
        );
      }

      return pageContent;
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}
|
|
105
352
|
|
|
353
|
+
/**
 * Fetch a page, retrying once when the first attempt fails with what looks
 * like a network error.
 *
 * A failure is considered a network error when its message contains
 * "net::", "ERR_" or "Navigation failed". Every other failure, and a second
 * network failure, is rethrown unchanged.
 *
 * @param {string} url Page to fetch.
 * @param {object|null} template Passed through to fetchHtml.
 * @param {boolean} blockMedia Passed through to fetchHtml.
 * @returns {Promise<string>} The page HTML.
 */
async function fetchHtmlWithRetry(url, template, blockMedia) {
  const maxAttempts = 2;

  for (let attempt = 0; ; attempt++) {
    try {
      return await fetchHtml(url, template, blockMedia);
    } catch (err) {
      // A rejection is not guaranteed to be an Error; read the message
      // defensively so the original failure is never masked by a TypeError.
      const message = String(err?.message ?? err ?? "");
      const isNetworkError =
        message.includes("net::") ||
        message.includes("ERR_") ||
        message.includes("Navigation failed");
      const hasAttemptsLeft = attempt < maxAttempts - 1;

      if (!isNetworkError || !hasAttemptsLeft) {
        throw err;
      }
      // Network error — retry once
    }
  }
}
|
|
116
374
|
|
|
117
|
-
|
|
118
|
-
const $ = cheerio.load(pageContent);
|
|
375
|
+
// === HTML CLEANUP ========================================================
|
|
119
376
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
377
|
+
const DEFAULT_REMOVE_SELECTORS = [
|
|
378
|
+
"script", "style", "svg", "nav", "footer", "noscript", "iframe",
|
|
379
|
+
".advertisement",
|
|
380
|
+
];
|
|
124
381
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
382
|
+
/**
 * Strip unwanted markup from the loaded document, in place.
 *
 * Removes every element matching the template's `remove` selectors (or
 * DEFAULT_REMOVE_SELECTORS when the template supplies none), then drops all
 * `style` attributes and any `src` attribute holding a `data:image` URI.
 * Selectors that make the query throw are ignored.
 *
 * @param {Function} $ Cheerio-style root function for the document.
 * @param {object|null} template Optional template providing `remove`.
 */
function applyRemove($, template) {
  let selectors = DEFAULT_REMOVE_SELECTORS;
  if (template && template.remove && template.remove.length > 0) {
    selectors = template.remove;
  }

  for (const selector of selectors) {
    try {
      $(selector).remove();
    } catch (_) {
      // Skip invalid selectors
    }
  }

  // Strip style attributes and data:image src
  $("[style]").removeAttr("style");
  $("*").each((_index, node) => {
    const wrapped = $(node);
    const src = wrapped.attr("src");
    if (!src || !src.startsWith("data:image")) {
      return;
    }
    wrapped.removeAttr("src");
  });
}
|
|
138
405
|
|
|
139
|
-
|
|
140
|
-
const cloned = $(el).clone();
|
|
141
|
-
cloned.find("h3, a, script, style, cite").remove();
|
|
142
|
-
const snippet = cloned.text().replace(/\s+/g, " ").trim();
|
|
406
|
+
// === EXTRACTION ENGINE ===================================================
|
|
143
407
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
408
|
+
/**
 * Locate elements for `selector` relative to `$parent`.
 *
 * Lookup order, stopping at the first non-empty result:
 *   1. descendants of `$parent`;
 *   2. `$parent.closest(selector)`;
 *   3. descendants of up to four successive ancestors, nearest first.
 *
 * A blank selector yields `$parent` itself. When nothing matches, an empty
 * selection is returned.
 */
function findScoped($parent, selector) {
  if (!selector || selector.trim() === "") {
    return $parent;
  }

  const below = $parent.find(selector);
  if (below.length > 0) {
    return below;
  }

  const nearest = $parent.closest(selector);
  if (nearest.length > 0) {
    return nearest;
  }

  let scope = $parent.parent();
  let climbed = 0;
  while (climbed < 4 && scope.length > 0) {
    const hits = scope.find(selector);
    if (hits.length > 0) {
      return hits;
    }
    scope = scope.parent();
    climbed += 1;
  }

  // Querying a tag name that cannot exist produces an empty selection.
  return $parent.find("__nonexistent__");
}
|
|
152
435
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
436
|
+
/**
 * Try comma-separated selectors in order; first match wins.
 *
 * Each alternative is resolved with findScoped relative to `$parent`. An
 * alternative that makes the lookup throw is skipped so that the remaining
 * alternatives are still tried, which is the same policy
 * resolveTopElements applies. This matters because splitting on "," can
 * produce invalid fragments from selectors such as `:is(a, b)`.
 *
 * @param {object} $parent Selection that scopes the lookup.
 * @param {string} selectorStr Comma-separated selector alternatives.
 * @returns {object} `$parent` for a blank selector, the first non-empty
 *   match otherwise, or an empty selection when nothing matches.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $parent;
  }

  const selectors = selectorStr
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);

  for (const sel of selectors) {
    try {
      const matches = findScoped($parent, sel);
      if (matches.length > 0) return matches;
    } catch (_) {
      // Skip invalid selectors and fall through to the next alternative.
    }
  }

  return $parent.find("__nonexistent__");
}
|
|
159
456
|
|
|
160
|
-
|
|
457
|
+
/**
 * Pick the document-level elements for a section.
 *
 * `selectorStr` is a comma-separated list of alternatives that are queried
 * against the whole document in order; the first one with at least one
 * match is returned. Alternatives that make the query throw are ignored.
 * A blank selector selects `body`. An empty selection is returned when no
 * alternative matches.
 */
function resolveTopElements($, selectorStr) {
  const isBlank = !selectorStr || selectorStr.trim() === "";
  if (isBlank) {
    return $("body");
  }

  const candidates = selectorStr
    .split(",")
    .map((part) => part.trim())
    .filter((part) => part.length > 0);

  for (const candidate of candidates) {
    try {
      const found = $(candidate);
      if (found.length > 0) {
        return found;
      }
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $();
}
|
|
481
|
+
|
|
482
|
+
// === TRANSFORMS ==========================================================
|
|
483
|
+
|
|
484
|
+
/**
 * Apply one or more named transforms to an extracted string.
 *
 * Supported transforms:
 *   - "strip":             trim surrounding whitespace.
 *   - "decode_google_url": unwrap a relative `/url?q=<target>&...` redirect.
 *   - "decode_ddg_url":    unwrap a `/l/?uddg=<target>` redirect.
 *   - "json_parse":        re-serialise valid JSON with 2-space indentation.
 *   - "resolve_href":      resolve a root-relative path against `origin`.
 * Unknown transform names are ignored. A transform that cannot be applied
 * leaves the value as it was, and once the value is empty the remaining
 * transforms are skipped.
 *
 * @param {string} value Extracted value.
 * @param {string|string[]} transform Transform name, or names applied in order.
 * @param {string} [origin] Base URL used by "resolve_href".
 * @returns {string} The transformed value.
 */
function applyTransform(value, transform, origin) {
  const transforms = Array.isArray(transform) ? transform : [transform];
  let result = value;

  for (const t of transforms) {
    if (!result) continue;

    switch (t) {
      case "strip":
        result = result.trim();
        break;

      case "decode_google_url":
        if (result.startsWith("/url?q=")) {
          try {
            const urlPart = result.split("/url?q=")[1].split("&")[0];
            result = decodeURIComponent(urlPart);
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      case "decode_ddg_url":
        if (result.includes("/l/?uddg=")) {
          const queryString = result.slice(result.indexOf("?") + 1);
          // URLSearchParams already percent-decodes the value. Decoding a
          // second time would corrupt targets containing encoded characters
          // (e.g. "%25" or "%20") and throw on a literal "%".
          const uddg = new URLSearchParams(queryString).get("uddg");
          if (uddg) result = uddg;
        }
        break;

      case "json_parse":
        try {
          result = JSON.stringify(JSON.parse(result), null, 2);
        } catch (_) {
          // Leave as-is
        }
        break;

      case "resolve_href":
        // Only root-relative paths are resolved; protocol-relative ("//")
        // and absolute URLs pass through untouched.
        if (origin && result.startsWith("/") && !result.startsWith("//")) {
          try {
            result = new URL(result, origin).href;
          } catch (_) {
            // Leave as-is
          }
        }
        break;
    }
  }

  return result;
}
|
|
541
|
+
|
|
542
|
+
// === EXTRACTION ==========================================================
|
|
543
|
+
|
|
544
|
+
/**
 * Read a section's value from an element according to `section.format`.
 *
 *   - "markdown":  inner HTML converted by turndown, with runs of three or
 *                  more newlines collapsed to two, then trimmed.
 *   - "attribute": the attribute named by `section.attribute`, or "".
 *   - "html":      inner HTML, or "".
 *   - "text" and anything else: text content with whitespace collapsed.
 *
 * A non-empty value is then passed through `section.transform`, if any.
 *
 * @param {object} $el Selection to read from.
 * @param {object} section Section definition.
 * @param {string} [origin] Base URL forwarded to applyTransform.
 * @returns {string} The extracted (and possibly transformed) value.
 */
function extractValue($el, section, origin) {
  const { format } = section;
  let extracted;

  if (format === "markdown") {
    const markup = $el.html() || "";
    const converted = turndown.turndown(markup);
    extracted = converted.replace(/\n{3,}/g, "\n\n").trim();
  } else if (format === "attribute") {
    extracted = $el.attr(section.attribute) || "";
  } else if (format === "html") {
    extracted = $el.html() || "";
  } else {
    // "text" and every unrecognised format share the plain-text path.
    extracted = $el.text().replace(/\s+/g, " ").trim();
  }

  if (!section.transform || !extracted) {
    return extracted;
  }
  return applyTransform(extracted, section.transform, origin);
}
|
|
194
579
|
|
|
195
|
-
|
|
196
|
-
|
|
580
|
+
/**
 * Extract a single child section relative to its parent element.
 *
 * The child's selector is resolved with findFirstMatch and the value is read
 * from the first matching element.
 *
 * @returns {{type: "value", text: string}|null} The extracted value, or null
 *   when nothing matched and the section is optional.
 * @throws {Error} When nothing matched and the section is marked `required`.
 */
function extractChildSection($, $parentEl, section, origin) {
  const matched = findFirstMatch($parentEl, section.selector);
  const hasMatch = Boolean(matched) && matched.length !== 0;

  if (hasMatch) {
    const text = extractValue(matched.eq(0), section, origin);
    return { type: "value", text };
  }

  if (section.required) {
    throw new Error(
      `Required section '${section.name}' not found on page.`,
    );
  }
  return null;
}
|
|
600
|
+
|
|
601
|
+
/**
 * Extract one top-level section of a template from the document.
 *
 * Result shapes, all carrying the originating `section`:
 *   - `{ type: "value", text }`               single element, no children
 *   - `{ type: "children", items: {..} }`     single element with children
 *   - `{ type: "list", items: [..] }`         multiple elements, no children
 *   - `{ type: "children-multiple", items }`  multiple elements with children
 *
 * For `multiple` sections the number of processed elements is capped by
 * `section.max_items`. In websearch mode, the first `multiple` section with
 * children is additionally capped by `context.maxResultsOverride`, after
 * which `context._maxResultsConsumed` is set so later sections are not
 * affected.
 *
 * @returns {object|null} The section result, or null when nothing matched
 *   and the section is optional.
 * @throws {Error} When nothing matched and the section is `required`.
 */
function extractSection($, section, context) {
  const elements = resolveTopElements($, section.selector);

  if (!elements || elements.length === 0) {
    if (section.required) {
      throw new Error(
        `Required section '${section.name}' not found on page.`,
      );
    }
    return null;
  }

  const hasChildren = Boolean(section.children && section.children.length > 0);

  // Collect the named child values found under one parent element.
  const readChildren = ($el) => {
    const values = {};
    for (const child of section.children) {
      const extracted = extractChildSection($, $el, child, context.origin);
      const usable =
        extracted &&
        extracted.type === "value" &&
        extracted.text !== null &&
        extracted.text !== undefined;
      if (usable) {
        values[child.name] = extracted.text;
      }
    }
    return values;
  };

  // Determine limit
  let limit = elements.length;
  if (section.multiple && section.max_items) {
    limit = Math.min(limit, section.max_items);
  }
  // Apply max_results to the first multiple+children section only
  const appliesMaxResults =
    context.isWebsearch &&
    context.maxResultsOverride &&
    !context._maxResultsConsumed &&
    section.multiple &&
    hasChildren;
  if (appliesMaxResults) {
    limit = Math.min(limit, context.maxResultsOverride);
    context._maxResultsConsumed = true;
  }

  if (!section.multiple) {
    // Single parent; with children the parent's own format is ignored.
    const first = elements.eq(0);
    if (hasChildren) {
      return { section, type: "children", items: readChildren(first) };
    }
    return {
      section,
      type: "value",
      text: extractValue(first, section, context.origin),
    };
  }

  const items = [];
  for (let index = 0; index < limit; index++) {
    const current = elements.eq(index);

    if (hasChildren) {
      const values = readChildren(current);
      if (Object.keys(values).length > 0) {
        items.push(values);
      }
      continue;
    }

    const text = extractValue(current, section, context.origin);
    if (text && text.trim()) {
      items.push(text.trim());
    }
  }

  return { section, type: hasChildren ? "children-multiple" : "list", items };
}
|
|
687
|
+
|
|
688
|
+
/**
 * Run every section of a template against the document.
 *
 * Sections that yield null are omitted. An error whose message contains
 * "Required section" aborts the extraction; any other error only causes the
 * failing section to be skipped.
 *
 * @param {Function} $ Cheerio-style root function for the document.
 * @param {object} template Template whose `sections` are extracted in order.
 * @param {object} context Extraction context passed to extractSection.
 * @returns {object[]} Section results in template order.
 */
function extractTemplate($, template, context) {
  const collected = [];

  for (const section of template.sections) {
    let outcome;
    try {
      outcome = extractSection($, section, context);
    } catch (err) {
      const isRequiredFailure =
        err.message && err.message.includes("Required section");
      if (isRequiredFailure) {
        throw err;
      }
      // Non-required failures are silently skipped
      continue;
    }

    if (outcome !== null) {
      collected.push(outcome);
    }
  }

  return collected;
}
|
|
242
710
|
|
|
243
|
-
|
|
244
|
-
$("*").each((i, el) => {
|
|
245
|
-
const src = $(el).attr("src");
|
|
246
|
-
if (src && src.startsWith("data:image")) $(el).removeAttr("src");
|
|
247
|
-
});
|
|
711
|
+
// === COMPOSITION: WEBFETCH ===============================================
|
|
248
712
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
713
|
+
/**
 * Tell whether a multi-item result looks like a comment thread.
 *
 * Only the first item is inspected, and its keys are compared
 * case-insensitively: it must have an "author" or "user" key together with
 * a "comment" or "body" key.
 *
 * @param {{items?: object[]}} result Section result to inspect.
 * @returns {boolean} true for comment-shaped items, false otherwise
 *   (including when there are no items).
 */
function isCommentStyle(result) {
  if (!result.items || result.items.length === 0) {
    return false;
  }

  const fieldNames = new Set(
    Object.keys(result.items[0]).map((key) => key.toLowerCase()),
  );
  const namesAuthor = fieldNames.has("author") || fieldNames.has("user");
  const carriesText = fieldNames.has("comment") || fieldNames.has("body");

  return namesAuthor && carriesText;
}
|
|
722
|
+
|
|
723
|
+
function composeSections(extracted, template, startIndex, maxLength) {
|
|
724
|
+
const parts = [];
|
|
725
|
+
|
|
726
|
+
for (const result of extracted) {
|
|
727
|
+
if (result.type === "value") {
|
|
728
|
+
const text = result.text;
|
|
729
|
+
if (text && String(text).trim()) {
|
|
730
|
+
parts.push(`## ${result.section.name}\n\n${String(text).trim()}`);
|
|
731
|
+
}
|
|
732
|
+
} else if (result.type === "list") {
|
|
733
|
+
if (result.items && result.items.length > 0) {
|
|
734
|
+
const listText = result.items.map((item) => `- ${item}`).join("\n");
|
|
735
|
+
parts.push(`## ${result.section.name}\n\n${listText}`);
|
|
736
|
+
}
|
|
737
|
+
} else if (result.type === "children") {
|
|
738
|
+
if (result.items && Object.keys(result.items).length > 0) {
|
|
739
|
+
for (const [childName, value] of Object.entries(result.items)) {
|
|
740
|
+
if (value && String(value).trim()) {
|
|
741
|
+
parts.push(`## ${childName}\n\n${String(value).trim()}`);
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
}
|
|
745
|
+
} else if (result.type === "children-multiple") {
|
|
746
|
+
if (result.items && result.items.length > 0) {
|
|
747
|
+
if (isCommentStyle(result)) {
|
|
748
|
+
const commentParts = [];
|
|
749
|
+
for (const item of result.items) {
|
|
750
|
+
const author =
|
|
751
|
+
item["Author"] || item["author"] || item["User"] || item["user"] || "";
|
|
752
|
+
const comment =
|
|
753
|
+
item["Comment"] || item["Body"] || item["comment"] || item["body"] || "";
|
|
754
|
+
if (author) {
|
|
755
|
+
commentParts.push(`**${author}:**\n\n${comment}`);
|
|
756
|
+
} else if (comment) {
|
|
757
|
+
commentParts.push(comment);
|
|
758
|
+
}
|
|
759
|
+
}
|
|
760
|
+
if (commentParts.length > 0) {
|
|
761
|
+
parts.push(
|
|
762
|
+
`## ${result.section.name}\n\n${commentParts.join("\n\n---\n\n")}`,
|
|
763
|
+
);
|
|
764
|
+
}
|
|
765
|
+
} else {
|
|
766
|
+
const itemParts = [];
|
|
767
|
+
for (const item of result.items) {
|
|
768
|
+
const lines = [];
|
|
769
|
+
for (const [key, value] of Object.entries(item)) {
|
|
770
|
+
if (value && String(value).trim()) {
|
|
771
|
+
lines.push(` ${key}: ${String(value).trim()}`);
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
if (lines.length > 0) itemParts.push(lines.join("\n"));
|
|
775
|
+
}
|
|
776
|
+
if (itemParts.length > 0) {
|
|
777
|
+
parts.push(
|
|
778
|
+
`## ${result.section.name}\n\n${itemParts.join("\n\n")}`,
|
|
779
|
+
);
|
|
780
|
+
}
|
|
781
|
+
}
|
|
258
782
|
}
|
|
259
783
|
}
|
|
784
|
+
}
|
|
260
785
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
786
|
+
if (parts.length === 0) {
|
|
787
|
+
return "(No content extracted from this page.)";
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
const full = parts.join("\n\n---\n\n");
|
|
791
|
+
const totalLength = full.length;
|
|
792
|
+
const paginated = full.substring(startIndex, startIndex + maxLength);
|
|
793
|
+
|
|
794
|
+
const templateName = template ? template.name : "auto";
|
|
795
|
+
let metadata = `\n\n---\n[webfetch: template="${templateName}", showing characters ${startIndex} to ${startIndex + paginated.length} of ${totalLength} total.`;
|
|
796
|
+
if (startIndex + maxLength < totalLength) {
|
|
797
|
+
metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
|
|
798
|
+
}
|
|
799
|
+
metadata += `]`;
|
|
800
|
+
|
|
801
|
+
return paginated + metadata;
|
|
802
|
+
}
|
|
267
803
|
|
|
268
|
-
|
|
269
|
-
startIndex + paginatedText.length
|
|
270
|
-
} of ${totalLength} total.`;
|
|
804
|
+
// === COMPOSITION: WEBSEARCH ==============================================
|
|
271
805
|
|
|
272
|
-
|
|
273
|
-
|
|
806
|
+
/**
 * Render extracted search results as a numbered plain-text list.
 *
 * The first entry of `extracted` whose `type` is "children-multiple" is
 * treated as the result list. Each item contributes a `[n] title` line plus
 * optional ` URL:` and ` Snippet:` lines; `n` is the item's 1-based position
 * in the list, so numbers may skip when an item has no usable title.
 *
 * When no such entry exists (or it has no items) the function defers to
 * `composeSections(extracted, null, 0, Infinity)`.
 *
 * @param {Array<object>} extracted - Extraction results (objects with `type`,
 *   `section.name` and, for lists, `items`).
 * @returns {string} The composed text, or a placeholder when nothing usable
 *   was extracted.
 */
function composeSearchResults(extracted) {
  /**
   * Return `rawUrl` (trimmed) when it is an absolute http(s) URL that does not
   * point back into Google's own search/help pages; otherwise return "".
   *
   * The URL is parsed instead of substring-matched so that unrelated links
   * which merely contain "google.com/search" in a path or query string are
   * kept, and look-alike schemes such as "httpx:" are rejected.
   */
  function toDisplayUrl(rawUrl) {
    if (typeof rawUrl !== "string") return "";
    const candidate = rawUrl.trim();

    let parsed;
    try {
      parsed = new URL(candidate);
    } catch {
      return ""; // Relative or otherwise unparsable: not a usable web link.
    }

    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      return ""; // Skip internal/non-web URLs
    }

    const host = parsed.hostname;
    const isGoogleHost = host === "google.com" || host.endsWith(".google.com");
    const isGoogleSearch = isGoogleHost && parsed.pathname.startsWith("/search");
    if (isGoogleSearch || host === "support.google.com") {
      return ""; // Skip google internal links
    }

    return candidate;
  }

  // Find the search results section (first children-multiple)
  const searchSection = extracted.find((r) => r.type === "children-multiple");

  if (!searchSection || !searchSection.items || searchSection.items.length === 0) {
    // Fall back to section-based output
    return composeSections(extracted, null, 0, Infinity);
  }

  const parts = [];

  searchSection.items.forEach((item, index) => {
    // Field names vary in case between templates; the first value of the item
    // is the last-resort title.
    const title =
      item["Title"] || item["title"] || Object.values(item)[0] || "";
    if (!title) return;

    const cleanUrl = toDisplayUrl(item["URL"] || item["url"] || item["Url"] || "");
    const snippet = item["Snippet"] || item["snippet"] || "";

    const lines = [`[${index + 1}] ${title}`];
    if (cleanUrl) lines.push(` URL: ${cleanUrl}`);
    if (snippet) lines.push(` Snippet: ${snippet}`);

    parts.push(lines.join("\n"));
  });

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  return `## ${searchSection.section.name}\n\n${parts.join("\n\n")}`;
}
|
|
283
857
|
|
|
284
|
-
//
|
|
285
|
-
// MCP SERVER INIT & TOOL REGISTRATION
|
|
286
|
-
// ==========================================
|
|
858
|
+
// === GENERIC FALLBACK ====================================================
|
|
287
859
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
860
|
+
/**
 * Template-less extraction: convert the whole <body> to Markdown and return
 * one page of it followed by a pagination footer.
 *
 * `applyRemove($, null)` is run first (defined elsewhere; presumably strips
 * default noise elements when no template is given — confirm against its
 * definition).
 *
 * The requested window is normalised before slicing so that the footer always
 * describes the text actually returned: offsets and lengths are truncated to
 * integers, negative values become 0, and the start offset is capped at the
 * document length.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {number} startIndex - Character offset at which to start.
 * @param {number} maxLength - Maximum number of characters to return.
 * @returns {string} The Markdown slice plus footer, or a placeholder when the
 *   page produced no text.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const bodyHtml = $("body").html() || "";
  const markdown = turndown
    .turndown(bodyHtml)
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  if (markdown.length === 0) {
    return "(No content extracted from this page.)";
  }

  const totalLength = markdown.length;

  // NaN falls back to 0; Infinity is preserved so "no limit" keeps working.
  const toCount = (value) => Math.max(0, Math.trunc(Number(value)) || 0);
  const start = Math.min(toCount(startIndex), totalLength);
  const end = Math.min(totalLength, start + toCount(maxLength));
  const paginated = markdown.slice(start, end);

  let metadata = `\n\n---\n[webfetch: template="auto (fallback)", showing characters ${start} to ${end} of ${totalLength} total.`;
  // Only advertise a next page when this call actually advanced; otherwise the
  // hint would point back at the same offset.
  if (end < totalLength && end > start) {
    metadata += ` Use start_index=${end} to read more.`;
  }
  metadata += `]`;

  return paginated + metadata;
}
|
|
884
|
+
|
|
885
|
+
// === SEARCH TEMPLATE RESOLUTION ==========================================
|
|
886
|
+
|
|
887
|
+
/**
 * Resolve the template for a search engine and build the search URL.
 *
 * `engine` is mapped to a template name via `resolveEngineToTemplateName`.
 * A value starting with "{" is parsed as an inline JSON template; anything
 * else is looked up with `getTemplateByName`.
 *
 * @param {string} engine - Engine identifier (or inline JSON template).
 * @param {string} query - Search query.
 * @param {string|null} region - Region/language code, or null.
 * @param {boolean|null} safeSearch - Safe-search flag, or null.
 * @returns {{ template: object, url: string }} The resolved template and URL.
 * @throws {Error} When inline JSON is invalid, or when no template with a
 *   `url_template` could be resolved.
 */
function resolveSearchTemplate(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);
  const isInline = templateName.startsWith("{");

  let template;
  if (isInline) {
    try {
      template = JSON.parse(templateName);
    } catch (e) {
      throw new Error(`Invalid inline JSON template: ${e.message}`, { cause: e });
    }
  } else {
    template = getTemplateByName(templateName);
  }

  // Guard against a missing template as well as a missing url_template so the
  // caller sees a descriptive message instead of a TypeError.
  // NOTE(review): getTemplateByName may already throw for unknown names — confirm.
  if (!template || !template.url_template) {
    const label =
      (template && template.name) || (isInline ? "(inline)" : templateName);
    throw new Error(
      `Template '${label}' is not a search template (no url_template).`,
    );
  }

  const params = mapSearchParams(engine, query, region, safeSearch);
  let url = resolveUrlTemplate(template, params);

  // Google safe_search: append safe=active to URL
  if (
    (engine === "google" || templateName === "google-search") &&
    safeSearch === true
  ) {
    // Pick the separator from the URL's current shape and keep any fragment
    // last, so the result stays well-formed even without an existing query.
    const hashAt = url.indexOf("#");
    const base = hashAt === -1 ? url : url.slice(0, hashAt);
    const fragment = hashAt === -1 ? "" : url.slice(hashAt);

    let separator = "?";
    if (base.includes("?")) {
      separator = base.endsWith("?") || base.endsWith("&") ? "" : "&";
    }
    url = `${base}${separator}safe=active${fragment}`;
  }

  return { template, url };
}
|
|
920
|
+
|
|
921
|
+
// === MCP SERVER & TOOLS ==================================================
|
|
922
|
+
|
|
923
|
+
// MCP server instance on which the websearch and webfetch tools are registered; main() connects it to the stdio transport.
const server = new McpServer({ name: "searchfetch", version: "3.0.0" });
|
|
924
|
+
|
|
925
|
+
// --- websearch tool ---
|
|
926
|
+
|
|
927
|
+
server.registerTool(
  "websearch",
  {
    title: "Web Search",
    description:
      "Search the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.",
    inputSchema: z.object({
      query: z.string().describe("The search query string."),
      engine: z.string().default("duckduckgo").describe(
        "Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
      ),
      region: z.string().nullable().default(null).describe(
        "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
      ),
      safe_search: z.boolean().nullable().default(null).describe(
        "Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
      ),
      max_results: z.number().default(10).describe(
        "Maximum number of search results to return. Default is 10.",
      ),
      block_media: z.boolean().default(true).describe(
        "Block images, videos, and fonts entirely at the network layer. Default is true.",
      ),
    }),
  },
  /**
   * websearch handler: resolve the engine's template, fetch the results page,
   * extract it with that template and render a numbered result list.
   * Any failure is reported to the client as an error result rather than thrown.
   */
  async ({ query, engine, region, safe_search, max_results, block_media }) => {
    try {
      // Template lookup also yields the fully built search URL.
      const resolved = resolveSearchTemplate(engine, query, region, safe_search);
      const searchTemplate = resolved.template;
      const searchUrl = resolved.url;

      const pageHtml = await fetchHtmlWithRetry(searchUrl, searchTemplate, block_media);

      const $ = cheerio.load(pageHtml);
      applyRemove($, searchTemplate);

      // The same context object is handed to the extractor, which receives the
      // caller's max_results override through it.
      const extracted = extractTemplate($, searchTemplate, {
        origin: new URL(searchUrl).origin,
        isWebsearch: true,
        maxResultsOverride: max_results,
        _maxResultsConsumed: false,
      });

      const text = composeSearchResults(extracted);
      return { content: [{ type: "text", text }] };
    } catch (err) {
      const text = `Search Error: ${err.message}`;
      return { content: [{ type: "text", text }], isError: true };
    }
  },
);
|
|
336
1006
|
|
|
337
|
-
|
|
1007
|
+
// --- webfetch tool ---
|
|
1008
|
+
|
|
1009
|
+
server.registerTool(
  "webfetch",
  {
    title: "Web Fetch",
    description:
      "Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
    inputSchema: z.object({
      url: z.string().describe(
        "The full URL of the webpage to fetch (must start with http/https).",
      ),
      template: z
        .string()
        .default("auto")
        .describe(
          "Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
        ),
      start_index: z
        .number()
        .default(0)
        .describe(
          "Character offset for pagination. Default: 0.",
        ),
      max_length: z
        .number()
        .default(10000)
        .describe(
          "Maximum characters to return per request. Default is 10000.",
        ),
      block_media: z
        .boolean()
        .default(true)
        .describe(
          "Block images, videos, and fonts entirely at the network layer. Default is true.",
        ),
    }),
  },
  /**
   * webfetch handler: validate the URL, resolve a template (inline JSON,
   * auto-detected, or by name), fetch the page and return one page of the
   * composed text. Falls back to whole-body Markdown when no template applies.
   * Any failure is reported to the client as an error result rather than thrown.
   */
  async ({ url, template: templateParam, start_index, max_length, block_media }) => {
    try {
      // 0. Validate input. The URL comes from the model/client, so enforce the
      //    documented http/https contract before handing it to the browser;
      //    this keeps file:, chrome:, data: etc. from being loaded.
      let pageUrl;
      try {
        pageUrl = new URL(url);
      } catch (e) {
        throw new Error(`Invalid URL: ${url}`, { cause: e });
      }
      if (pageUrl.protocol !== "http:" && pageUrl.protocol !== "https:") {
        throw new Error(
          `Unsupported URL scheme '${pageUrl.protocol}' (only http and https are allowed).`,
        );
      }

      // Pagination arguments are plain numbers in the schema; normalise them to
      // non-negative integers so slicing and the reported offsets agree.
      const startIndex = Math.max(0, Math.trunc(start_index));
      const maxLength = Math.max(0, Math.trunc(max_length));

      // 1. Resolve template
      let template;

      if (templateParam.startsWith("{")) {
        try {
          template = JSON.parse(templateParam);
        } catch (e) {
          throw new Error(`Invalid inline JSON template: ${e.message}`, { cause: e });
        }
      } else if (templateParam === "auto") {
        template = detectTemplateByUrl(url);
      } else {
        template = getTemplateByName(templateParam);
      }

      // 2. Fetch
      const html = await fetchHtmlWithRetry(url, template, block_media);

      // 3. Extract and compose
      const $ = cheerio.load(html);

      if (template) {
        applyRemove($, template);

        const context = {
          origin: pageUrl.origin,
          isWebsearch: false,
        };

        const extracted = extractTemplate($, template, context);
        const result = composeSections(extracted, template, startIndex, maxLength);
        return { content: [{ type: "text", text: result }] };
      } else {
        // Generic fallback
        const result = genericFallback($, startIndex, maxLength);
        return { content: [{ type: "text", text: result }] };
      }
    } catch (err) {
      return {
        content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
        isError: true,
      };
    }
  },
);
|
|
387
1093
|
|
|
388
|
-
//
|
|
389
|
-
// BOOTSTRAP
|
|
390
|
-
// ==========================================
|
|
1094
|
+
// === MAIN =================================================================
|
|
391
1095
|
|
|
392
1096
|
/**
 * Start the server: make sure the browser binary is available, restore the
 * real stdout writer, then connect the MCP server over stdio.
 *
 * stdout is redirected to stderr at the top of the file; it is restored only
 * here, immediately before the transport is created, so that anything printed
 * while the binary is being prepared does not land on the protocol stream.
 */
async function main() {
  await ensureBinary();
  process.stdout.write = originalStdoutWrite;
  const transport = new StdioServerTransport();
  await server.connect(transport);
}

main().catch((err) => {
  // Report the failure on stderr before exiting; without this a startup
  // failure ends the process with no indication of the cause.
  console.error("searchfetch: fatal error during startup:", err);
  process.exit(1);
});
|