searchfetch 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
 
3
+ // === STDOUT/STDERR REDIRECTION ===========================================
3
4
  const originalStdoutWrite = process.stdout.write.bind(process.stdout);
4
5
  process.stdout.write = (chunk, encoding, callback) => {
5
6
  return process.stderr.write(chunk, encoding, callback);
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
7
8
  console.log = (...args) => console.error(...args);
8
9
  console.info = (...args) => console.error(...args);
9
10
 
11
+ // === IMPORTS =============================================================
12
+ import { readdirSync, readFileSync } from "node:fs";
13
+ import { dirname, join } from "node:path";
14
+ import { fileURLToPath } from "node:url";
10
15
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
11
16
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
12
17
  import { z } from "zod";
@@ -14,6 +19,7 @@ import { launch, ensureBinary } from "cloakbrowser";
14
19
  import * as cheerio from "cheerio";
15
20
  import TurndownService from "turndown";
16
21
 
22
+ // === BROWSER MANAGER =====================================================
17
23
  class BrowserManager {
18
24
  constructor() {
19
25
  this.browser = null;
@@ -67,225 +73,856 @@ const cleanup = async () => {
67
73
  process.on("SIGINT", cleanup);
68
74
  process.on("SIGTERM", cleanup);
69
75
 
70
- function getGoogleRegionParams(region) {
71
- if (!region || region === "wt-wt") return "hl=en&gl=us";
72
- const parts = region.split("-");
73
- if (parts.length === 2) return `gl=${parts[0]}&hl=${parts[1]}`;
74
- return `gl=${region}&hl=en`;
76
+ // === TURNDOWN ============================================================
77
+ const turndown = new TurndownService({
78
+ headingStyle: "atx",
79
+ codeBlockStyle: "fenced",
80
+ emDelimiter: "*",
81
+ });
82
+
83
+ // === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
84
+
85
+ const __filename = fileURLToPath(import.meta.url);
86
+ const __dirname = dirname(__filename);
87
+ const TEMPLATES_DIR = join(__dirname, "templates");
88
+
89
/**
 * Load every built-in template definition from TEMPLATES_DIR.
 *
 * Reads each `*.json` file in the directory (in sorted filename order),
 * parses it, and validates that it is a JSON object with a non-empty string
 * `name`. The result is sorted by the optional `order` field (missing or
 * non-numeric `order` counts as 999) so URL-pattern matching is deterministic.
 *
 * @returns {object[]} Parsed template objects, sorted by `order`.
 * @throws {Error} If the directory or a file cannot be read, no JSON files
 *   exist, a file is not valid JSON, or a template lacks a valid `name`.
 */
function loadBuiltinTemplates() {
  let files;
  try {
    files = readdirSync(TEMPLATES_DIR);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${TEMPLATES_DIR}': ${err.message}`,
      { cause: err },
    );
  }

  const jsonFiles = files.filter((f) => f.endsWith(".json")).sort();

  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${TEMPLATES_DIR}'`);
  }

  const templates = jsonFiles.map((file) => {
    const filePath = join(TEMPLATES_DIR, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    // JSON.parse can legally return null, an array or a primitive; guard
    // before touching `.name` so the caller gets the descriptive error
    // instead of a bare TypeError.
    const isPlainObject =
      template !== null &&
      typeof template === "object" &&
      !Array.isArray(template);
    if (!isPlainObject || !template.name || typeof template.name !== "string") {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }

    return template;
  });

  // A non-numeric "order" would make the comparator return NaN and leave the
  // sort order unspecified; treat it like a missing value instead.
  const orderOf = (template) => {
    const order = Number(template.order ?? 999);
    return Number.isNaN(order) ? 999 : order;
  };

  // Sort by "order" field for deterministic URL-pattern matching
  templates.sort((a, b) => orderOf(a) - orderOf(b));

  return templates;
}
134
+
135
+ const BUILTIN_TEMPLATES = loadBuiltinTemplates();
136
+
137
+ // === TEMPLATE LOOKUP =====================================================
138
// Name → template index over the built-in templates. When two templates
// share a name, the one appearing later in BUILTIN_TEMPLATES wins.
const TEMPLATE_MAP = new Map(
  BUILTIN_TEMPLATES.map((template) => [template.name, template]),
);
142
+
143
/**
 * Return the built-in template registered under `name`.
 *
 * @param {string} name - Exact template name.
 * @returns {object} The matching template.
 * @throws {Error} If no template has that name; the message lists every
 *   registered name.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) return found;

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
153
+
154
/**
 * Find the first built-in template with a `url_patterns` entry that matches
 * `url`. Templates are tried in BUILTIN_TEMPLATES order; each pattern is
 * compiled as a regular expression, and patterns that fail to compile are
 * ignored.
 *
 * @param {string} url - URL to classify.
 * @returns {object|null} The matching template, or null when none matches.
 */
function detectTemplateByUrl(url) {
  const matchesUrl = (pattern) => {
    try {
      return new RegExp(pattern).test(url);
    } catch (_) {
      // Skip invalid regex
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (!patterns) continue;
    for (const pattern of patterns) {
      if (matchesUrl(pattern)) return candidate;
    }
  }

  return null;
}
169
+
170
+ // === URL TEMPLATE RESOLUTION =============================================
171
+
172
/**
 * Expand the `{name}` placeholders in `template.url_template`.
 *
 * For each placeholder the value is taken, in order of precedence, from
 * `providedParams[name]` (when not null/undefined), then from
 * `template.url_params[name].default`. A placeholder with neither is an error
 * when its definition has `required`, otherwise it expands to "". Values are
 * passed through `encodeURIComponent` when the definition has
 * `encode: "url"`.
 *
 * All placeholders are substituted in a single pass over the ORIGINAL
 * template string. Substituting into a string that is being scanned by a
 * stateful `/g` regex makes short values skip the following placeholder and
 * lets substituted text be re-interpreted as placeholders or `$`-patterns.
 *
 * @param {object} template - Template with `url_template`, optional
 *   `url_params`, and `name` (used in error messages).
 * @param {object} [providedParams] - Caller-supplied values keyed by
 *   placeholder name.
 * @returns {string|null} The resolved URL, or null when the template has no
 *   `url_template`.
 * @throws {Error} If a required parameter has no value and no default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const supplied = providedParams || {};

  const expanded = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    // Own-property checks keep names like "constructor" from resolving to
    // inherited Object.prototype members.
    const def = (Object.hasOwn(urlParams, name) && urlParams[name]) || {};
    const provided = Object.hasOwn(supplied, name) ? supplied[name] : undefined;

    let value;
    if (provided !== null && provided !== undefined) {
      value = String(provided);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by parameters that expanded to nothing.
  return expanded.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
212
+
213
+ // === SEARCH PARAM MAPPING ================================================
214
+
215
/**
 * Translate a search-engine alias into its built-in template name.
 * Anything other than the two known aliases is returned unchanged.
 *
 * @param {string} engine - "duckduckgo", "google", or another identifier.
 * @returns {string} The template name for the alias, or `engine` itself.
 */
function resolveEngineToTemplateName(engine) {
  switch (engine) {
    case "duckduckgo":
      return "duckduckgo-search";
    case "google":
      return "google-search";
    default:
      return engine;
  }
}
220
+
221
/**
 * Map the generic websearch arguments onto engine-specific URL parameters.
 *
 * DuckDuckGo: `region` is passed through as `kl`; `safeSearch` true/false
 * becomes `kp` "1" / "-2" (null leaves `kp` unset).
 *
 * Google: `region` uses the tool's documented `country-language` form
 * (e.g. "us-en"), so the first part is the country (`gl`) and the second the
 * language (`hl`). A single-part region is used for both. The DuckDuckGo
 * "no region" code "wt-wt" carries no Google meaning and is ignored.
 *
 * @param {string} engine - Engine alias or template name.
 * @param {string} query - Search query.
 * @param {string|null|undefined} region - Region code, or null/undefined for
 *   none.
 * @param {boolean|null|undefined} safeSearch - Safe-search preference.
 * @returns {object} Parameters keyed by URL placeholder name; always
 *   contains `query`.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const params = { query };
  const resolved = resolveEngineToTemplateName(engine);
  const hasRegion = region !== null && region !== undefined;

  if (resolved === "duckduckgo-search") {
    if (hasRegion) {
      params.kl = region;
    }
    if (safeSearch === true) {
      params.kp = "1";
    } else if (safeSearch === false) {
      params.kp = "-2";
    }
  } else if (resolved === "google-search") {
    if (hasRegion && region !== "wt-wt") {
      const [country, language] = String(region).split("-");
      params.gl = country;
      params.hl = language !== undefined ? language : country;
    }
  }

  return params;
}
244
+
245
+ // === FETCH ===============================================================
246
+
247
/**
 * Heuristically decide whether a loaded page is a CAPTCHA / block page.
 *
 * Returns true when the lower-cased <title> contains any title marker, or
 * when the whitespace-collapsed, lower-cased body text is shorter than 1200
 * characters and contains any body marker.
 *
 * @param {Function} $ - Selector function whose results expose `.text()`
 *   (the caller passes a loaded cheerio document).
 * @returns {boolean} True if the page looks like an access-denied page.
 */
function isAccessDenied($) {
  const TITLE_MARKERS = [
    "captcha",
    "are you a robot",
    "access denied",
    "blocked",
    "forbidden",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
    "one more step",
    "security check",
    "ddos protection",
    "cloudflare",
  ];
  const BODY_MARKERS = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];
  // Body markers only count on short pages, so long articles that merely
  // mention these phrases are not flagged.
  const SHORT_BODY_LIMIT = 1200;

  const containsAny = (haystack, needles) =>
    needles.some((needle) => haystack.includes(needle));

  const pageTitle = ($("title").text() || "").toLowerCase();
  const pageBody = ($("body").text() || "")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();

  if (containsAny(pageTitle, TITLE_MARKERS)) {
    return true;
  }

  return pageBody.length < SHORT_BODY_LIMIT && containsAny(pageBody, BODY_MARKERS);
}
76
281
 
77
- async function executeSearch(query, maxResults, region, safeSearch, engine) {
282
/**
 * Load `url` in a fresh browser context and return the rendered page HTML.
 *
 * Steps: add the template's cookies (if any) to the context, optionally
 * abort requests for blocked resource types, navigate with a 15 s
 * "networkidle" wait, then reject obvious access failures (HTTP 401/403/429
 * or a CAPTCHA/block page as judged by isAccessDenied). The page and the
 * context are always closed.
 *
 * A navigation TIMEOUT is tolerated so partially rendered pages can still be
 * used. Any other navigation failure (e.g. `net::ERR_*`) is rethrown so the
 * caller's retry logic can see it instead of silently receiving an empty
 * document.
 *
 * @param {string} url - Page to load.
 * @param {object|null} template - Optional template supplying `cookies`
 *   and `block_resources`.
 * @param {boolean} blockMedia - When true, abort requests whose resource
 *   type is in `template.block_resources` (default image/media/font).
 * @returns {Promise<string>} The page's serialized HTML.
 * @throws {Error} On access denial or a non-timeout navigation failure.
 */
async function fetchHtml(url, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    // Pre-load cookies from template
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    try {
      // Route blocked resource types
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];

        if (blockedTypes.length > 0) {
          await page.route("**/*", (route) => {
            const type = route.request().resourceType();
            const handled = blockedTypes.includes(type)
              ? route.abort()
              : route.continue();
            // The request may already be gone when the page is closed while
            // it is in flight; that must not surface as an unhandled
            // rejection.
            handled.catch(() => {});
          });
        }
      }

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "networkidle",
          timeout: 15000,
        });
      } catch (navError) {
        // Allow partial rendering on timeout; anything else is a real
        // failure the caller needs to see.
        const isTimeout =
          Boolean(navError) &&
          (navError.name === "TimeoutError" ||
            /timeout/i.test(String(navError.message)));
        if (!isTimeout) {
          throw navError;
        }
      }

      // Check HTTP status for access failures
      if (response) {
        const status = response.status();
        if ([401, 403, 429].includes(status)) {
          throw new Error(
            `Access denied: HTTP ${status} when fetching ${url}`,
          );
        }
      }

      const pageContent = await page.content();

      // Check for CAPTCHA / access-denied pages
      const $ = cheerio.load(pageContent);
      if (isAccessDenied($)) {
        throw new Error(
          `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
        );
      }

      return pageContent;
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}
352
+
353
/**
 * Call fetchHtml, retrying exactly once when the first attempt fails with
 * what looks like a network-level error (message contains "net::", "ERR_"
 * or "Navigation failed"). Any other failure, and any failure of the second
 * attempt, is rethrown unchanged.
 *
 * @param {string} url - Page to load.
 * @param {object|null} template - Passed through to fetchHtml.
 * @param {boolean} blockMedia - Passed through to fetchHtml.
 * @returns {Promise<string>} The page HTML.
 */
async function fetchHtmlWithRetry(url, template, blockMedia) {
  const isNetworkError = (err) => {
    // Rejections are not guaranteed to be Error objects; a missing message
    // must not turn the original failure into a TypeError.
    const message = typeof err?.message === "string" ? err.message : "";
    return (
      message.includes("net::") ||
      message.includes("ERR_") ||
      message.includes("Navigation failed")
    );
  };

  try {
    return await fetchHtml(url, template, blockMedia);
  } catch (err) {
    if (!isNetworkError(err)) {
      throw err;
    }
    // Network error — retry once
  }

  return fetchHtml(url, template, blockMedia);
}
374
+
375
+ // === HTML CLEANUP ========================================================
376
+
377
+ const DEFAULT_REMOVE_SELECTORS = [
378
+ "script", "style", "svg", "nav", "footer", "noscript", "iframe",
379
+ ".advertisement",
380
+ ];
381
+
382
+ function applyRemove($, template) {
383
+ const removeSelectors =
384
+ template && template.remove && template.remove.length > 0
385
+ ? template.remove
386
+ : DEFAULT_REMOVE_SELECTORS;
387
+
388
+ for (const selector of removeSelectors) {
389
+ try {
390
+ $(selector).remove();
391
+ } catch (_) {
392
+ // Skip invalid selectors
112
393
  }
394
+ }
113
395
 
114
- const pageContent = await page.content();
115
- const $ = cheerio.load(pageContent);
396
+ // Strip style attributes and data:image src
397
+ $("[style]").removeAttr("style");
398
+ $("*").each((_i, el) => {
399
+ const src = $(el).attr("src");
400
+ if (src && src.startsWith("data:image")) {
401
+ $(el).removeAttr("src");
402
+ }
403
+ });
404
+ }
116
405
 
117
- if (engine === "google") {
118
- $("h3").each((i, el) => {
119
- if (results.length >= maxResults) return;
406
+ // === EXTRACTION ENGINE ===================================================
120
407
 
121
- const h3 = $(el);
122
- let linkEl = h3.closest("a");
123
- if (!linkEl.length) linkEl = h3.find("a");
124
- if (!linkEl.length) return;
408
/**
 * Locate elements for `selector`, starting from `$parent` and widening the
 * search until something matches:
 *   1. `$parent.find(selector)`
 *   2. `$parent.closest(selector)`
 *   3. `.find(selector)` on each of up to four successive `.parent()`s
 * A blank selector returns `$parent` itself. When nothing matches, an empty
 * selection (from a selector that matches nothing) is returned.
 */
function findScoped($parent, selector) {
  const isBlank = !selector || selector.trim() === "";
  if (isBlank) return $parent;

  const below = $parent.find(selector);
  if (below.length > 0) return below;

  const enclosing = $parent.closest(selector);
  if (enclosing.length > 0) return enclosing;

  let level = 0;
  let ancestor = $parent.parent();
  while (level < 4 && ancestor.length > 0) {
    const nearby = ancestor.find(selector);
    if (nearby.length > 0) return nearby;
    ancestor = ancestor.parent();
    level += 1;
  }

  return $parent.find("__nonexistent__");
}
144
435
 
145
- let snippet = "";
146
- let parent = h3.parent();
147
- while (parent.length && parent.prop("tagName") !== "BODY") {
148
- const snippetEl = parent.find(
149
- "div.VwiC3b, div[style*='-webkit-line-clamp'], div.yXK7lf, div.Uroaid",
150
- );
151
- if (snippetEl.length) {
152
- snippet = snippetEl.first().text().replace(/\s+/g, " ").trim();
153
- break;
436
/**
 * Resolve a comma-separated list of alternative selectors against `$parent`.
 * Alternatives are tried left to right through findScoped and the first one
 * that yields any element wins. A blank list returns `$parent`; no match
 * returns an empty selection. The list is split on every comma.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $parent;
  }

  for (const piece of selectorStr.split(",")) {
    const candidate = piece.trim();
    if (!candidate) continue;

    const found = findScoped($parent, candidate);
    if (found.length > 0) return found;
  }

  return $parent.find("__nonexistent__");
}
456
+
457
/**
 * Resolve a section's top-level elements against the whole document.
 * The comma-separated alternatives are tried in order and the first with at
 * least one match is returned; alternatives the selector engine rejects are
 * skipped. A blank selector resolves to `body`; no match yields `$()`.
 */
function resolveTopElements($, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $("body");
  }

  for (const piece of selectorStr.split(",")) {
    const candidate = piece.trim();
    if (!candidate) continue;

    try {
      const found = $(candidate);
      if (found.length > 0) return found;
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $();
}
481
+
482
+ // === TRANSFORMS ==========================================================
483
+
484
/**
 * Apply one transform, or an ordered list of transforms, to an extracted
 * string. Unknown transform names are ignored, and once the value becomes
 * empty the remaining transforms are skipped.
 *
 * Supported transforms:
 *  - "strip":             trim surrounding whitespace.
 *  - "decode_google_url": unwrap a `/url?q=<encoded>&...` redirect link.
 *  - "decode_ddg_url":    unwrap a `.../l/?uddg=<encoded>` redirect link.
 *  - "json_parse":        pretty-print the value if it is valid JSON.
 *  - "resolve_href":      make a root-relative path absolute against
 *                         `origin`.
 * A transform that cannot be applied leaves the value unchanged.
 *
 * @param {string} value - Extracted value.
 * @param {string|string[]} transform - Transform name(s), applied in order.
 * @param {string} [origin] - Base URL used by "resolve_href".
 * @returns {string} The transformed value.
 */
function applyTransform(value, transform, origin) {
  const transforms = Array.isArray(transform) ? transform : [transform];
  let result = value;

  for (const t of transforms) {
    if (!result) continue;
    switch (t) {
      case "strip":
        result = result.trim();
        break;

      case "decode_google_url":
        if (result.startsWith("/url?q=")) {
          try {
            const urlPart = result.split("/url?q=")[1].split("&")[0];
            result = decodeURIComponent(urlPart);
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      case "decode_ddg_url":
        if (result.includes("/l/?uddg=")) {
          const queryString = result.slice(result.indexOf("?") + 1);
          // URLSearchParams already percent-decodes the value. Decoding it a
          // second time corrupts targets that contain an encoded "%"
          // (e.g. "%2520") or throws on a bare "%".
          const uddg = new URLSearchParams(queryString).get("uddg");
          if (uddg) result = uddg;
        }
        break;

      case "json_parse":
        try {
          result = JSON.stringify(JSON.parse(result), null, 2);
        } catch (_) {
          // Leave as-is
        }
        break;

      case "resolve_href":
        if (origin && result.startsWith("/") && !result.startsWith("//")) {
          try {
            result = new URL(result, origin).href;
          } catch (_) {
            // Leave as-is
          }
        }
        break;
    }
  }

  return result;
}
541
+
542
+ // === EXTRACTION ==========================================================
543
+
544
/**
 * Read a single value from `$el` in the representation named by
 * `section.format`:
 *  - "markdown":  inner HTML converted by the shared turndown instance, with
 *                 runs of 3+ newlines collapsed to two.
 *  - "attribute": the attribute named by `section.attribute` ("" if absent).
 *  - "html":      inner HTML ("" if absent).
 *  - "text" or anything else: text content with whitespace collapsed.
 * A non-empty value is then passed through `section.transform`, if set.
 *
 * @param {object} $el - Element selection exposing text()/html()/attr().
 * @param {object} section - Section definition (format, attribute, transform).
 * @param {string} [origin] - Base URL forwarded to the transforms.
 * @returns {string} The extracted value.
 */
function extractValue($el, section, origin) {
  const collapsedText = () => $el.text().replace(/\s+/g, " ").trim();

  let extracted;
  if (section.format === "markdown") {
    const innerHtml = $el.html() || "";
    const converted = turndown.turndown(innerHtml);
    extracted = converted.replace(/\n{3,}/g, "\n\n").trim();
  } else if (section.format === "attribute") {
    extracted = $el.attr(section.attribute) || "";
  } else if (section.format === "html") {
    extracted = $el.html() || "";
  } else {
    // "text" and every unrecognised format share the same plain-text path.
    extracted = collapsedText();
  }

  if (section.transform && extracted) {
    extracted = applyTransform(extracted, section.transform, origin);
  }

  return extracted;
}
579
+
580
/**
 * Extract a child section's value relative to `$parentEl`.
 * The child's selector is resolved with findFirstMatch and the value is read
 * from the first matching element.
 * Returns `{ type: "value", text }`, or null when nothing matched and the
 * child is optional; throws when a required child is missing.
 */
function extractChildSection($, $parentEl, section, origin) {
  const matched = findFirstMatch($parentEl, section.selector);
  const isMissing = !matched || matched.length === 0;

  if (isMissing && section.required) {
    throw new Error(`Required section '${section.name}' not found on page.`);
  }
  if (isMissing) {
    return null;
  }

  const text = extractValue(matched.eq(0), section, origin);
  return { type: "value", text };
}
188
600
 
189
- if (title && link.startsWith("http")) {
190
- results.push({ position: results.length + 1, title, link, snippet });
601
/**
 * Extract one top-level section from the document.
 *
 * Result shape depends on the section definition:
 *  - single, no children   → { section, type: "value", text }
 *  - single, with children → { section, type: "children", items: {name: text} }
 *  - multiple, no children → { section, type: "list", items: string[] }
 *  - multiple, children    → { section, type: "children-multiple", items: object[] }
 * For `multiple` sections the element count is capped by `max_items`. For
 * websearch, the first multiple+children section is also capped by
 * `context.maxResultsOverride`, and `context._maxResultsConsumed` is set so
 * later sections are not capped again. Empty list entries and child objects
 * with no fields are dropped.
 *
 * Returns null when nothing matches and the section is optional; throws when
 * a required section is missing.
 */
function extractSection($, section, context) {
  const matched = resolveTopElements($, section.selector);

  if (!matched || matched.length === 0) {
    if (section.required) {
      throw new Error(`Required section '${section.name}' not found on page.`);
    }
    return null;
  }

  const hasChildren = Boolean(section.children && section.children.length > 0);

  // Collect { childName: text } for every child that produced a value.
  const readChildren = (parentEl) => {
    const fields = {};
    for (const child of section.children) {
      const extracted = extractChildSection($, parentEl, child, context.origin);
      if (
        extracted &&
        extracted.type === "value" &&
        extracted.text !== null &&
        extracted.text !== undefined
      ) {
        fields[child.name] = extracted.text;
      }
    }
    return fields;
  };

  if (!section.multiple) {
    // Single parent; when it has children the parent's own format is ignored.
    const first = matched.eq(0);
    if (hasChildren) {
      return { section, type: "children", items: readChildren(first) };
    }
    return {
      section,
      type: "value",
      text: extractValue(first, section, context.origin),
    };
  }

  let limit = matched.length;
  if (section.max_items) {
    limit = Math.min(limit, section.max_items);
  }

  // max_results applies once, to the first multiple+children section.
  const useMaxResults =
    context.isWebsearch &&
    context.maxResultsOverride &&
    !context._maxResultsConsumed &&
    hasChildren;
  if (useMaxResults) {
    limit = Math.min(limit, context.maxResultsOverride);
    context._maxResultsConsumed = true;
  }

  const items = [];
  for (let index = 0; index < limit; index += 1) {
    const el = matched.eq(index);

    if (hasChildren) {
      const fields = readChildren(el);
      if (Object.keys(fields).length > 0) {
        items.push(fields);
      }
      continue;
    }

    const value = extractValue(el, section, context.origin);
    if (value && value.trim()) {
      items.push(value.trim());
    }
  }

  return { section, type: hasChildren ? "children-multiple" : "list", items };
}
213
687
 
214
- async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
215
- const browser = await browserManager.getBrowser();
216
- const context = await browser.newContext();
217
- const page = await context.newPage();
688
/**
 * Run every section of `template` against the document and collect the
 * non-null results in template order.
 *
 * An error whose message contains "Required section" aborts the extraction
 * and is rethrown; any other per-section failure is skipped.
 *
 * @param {Function} $ - Document selector function.
 * @param {object} template - Template with a `sections` array.
 * @param {object} context - Shared extraction context passed to each section.
 * @returns {object[]} Section results.
 */
function extractTemplate($, template, context) {
  const collected = [];

  for (const section of template.sections) {
    let outcome = null;
    try {
      outcome = extractSection($, section, context);
    } catch (err) {
      const isRequiredFailure =
        err.message && err.message.includes("Required section");
      if (isRequiredFailure) {
        throw err;
      }
      // Non-required failures are silently skipped
    }

    if (outcome !== null) {
      collected.push(outcome);
    }
  }

  return collected;
}
710
+
711
+ // === COMPOSITION: WEBFETCH ===============================================
712
+
713
/**
 * Decide whether a children-multiple result looks like a comment thread.
 * Only the first item is inspected: it must have (case-insensitively) an
 * "author" or "user" field together with a "comment" or "body" field.
 *
 * @param {object} result - Section result with an `items` array.
 * @returns {boolean} True when the items should be rendered as comments.
 */
function isCommentStyle(result) {
  if (!result.items || result.items.length === 0) {
    return false;
  }

  const fieldNames = new Set(
    Object.keys(result.items[0]).map((key) => key.toLowerCase()),
  );
  const hasWho = fieldNames.has("author") || fieldNames.has("user");
  const hasWhat = fieldNames.has("comment") || fieldNames.has("body");

  return hasWho && hasWhat;
}
722
+
723
+ function composeSections(extracted, template, startIndex, maxLength) {
724
+ const parts = [];
725
+
726
+ for (const result of extracted) {
727
+ if (result.type === "value") {
728
+ const text = result.text;
729
+ if (text && String(text).trim()) {
730
+ parts.push(`## ${result.section.name}\n\n${String(text).trim()}`);
731
+ }
732
+ } else if (result.type === "list") {
733
+ if (result.items && result.items.length > 0) {
734
+ const listText = result.items.map((item) => `- ${item}`).join("\n");
735
+ parts.push(`## ${result.section.name}\n\n${listText}`);
736
+ }
737
+ } else if (result.type === "children") {
738
+ if (result.items && Object.keys(result.items).length > 0) {
739
+ for (const [childName, value] of Object.entries(result.items)) {
740
+ if (value && String(value).trim()) {
741
+ parts.push(`## ${childName}\n\n${String(value).trim()}`);
742
+ }
743
+ }
744
+ }
745
+ } else if (result.type === "children-multiple") {
746
+ if (result.items && result.items.length > 0) {
747
+ if (isCommentStyle(result)) {
748
+ const commentParts = [];
749
+ for (const item of result.items) {
750
+ const author =
751
+ item["Author"] || item["author"] || item["User"] || item["user"] || "";
752
+ const comment =
753
+ item["Comment"] || item["Body"] || item["comment"] || item["body"] || "";
754
+ if (author) {
755
+ commentParts.push(`**${author}:**\n\n${comment}`);
756
+ } else if (comment) {
757
+ commentParts.push(comment);
758
+ }
759
+ }
760
+ if (commentParts.length > 0) {
761
+ parts.push(
762
+ `## ${result.section.name}\n\n${commentParts.join("\n\n---\n\n")}`,
763
+ );
764
+ }
225
765
  } else {
226
- route.continue();
766
+ const itemParts = [];
767
+ for (const item of result.items) {
768
+ const lines = [];
769
+ for (const [key, value] of Object.entries(item)) {
770
+ if (value && String(value).trim()) {
771
+ lines.push(` ${key}: ${String(value).trim()}`);
772
+ }
773
+ }
774
+ if (lines.length > 0) itemParts.push(lines.join("\n"));
775
+ }
776
+ if (itemParts.length > 0) {
777
+ parts.push(
778
+ `## ${result.section.name}\n\n${itemParts.join("\n\n")}`,
779
+ );
780
+ }
227
781
  }
228
- });
782
+ }
229
783
  }
784
+ }
230
785
 
231
- try {
232
- await page.goto(url, { waitUntil: "networkidle", timeout: 15000 });
233
- } catch (navError) {
234
- // Allow partial rendering on timeout
786
+ if (parts.length === 0) {
787
+ return "(No content extracted from this page.)";
788
+ }
789
+
790
+ const full = parts.join("\n\n---\n\n");
791
+ const totalLength = full.length;
792
+ const paginated = full.substring(startIndex, startIndex + maxLength);
793
+
794
+ const templateName = template ? template.name : "auto";
795
+ let metadata = `\n\n---\n[webfetch: template="${templateName}", showing characters ${startIndex} to ${startIndex + paginated.length} of ${totalLength} total.`;
796
+ if (startIndex + maxLength < totalLength) {
797
+ metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
798
+ }
799
+ metadata += `]`;
800
+
801
+ return paginated + metadata;
802
+ }
803
+
804
+ // === COMPOSITION: WEBSEARCH ==============================================
805
+
806
/**
 * Render websearch results as a numbered list.
 *
 * The first "children-multiple" result is treated as the search results
 * section. Each item contributes "[n] title" plus optional " URL: ..." and
 * " Snippet: ..." lines. Items without a title are dropped, and numbering
 * counts only the items actually shown, so the list never has gaps. URLs
 * that do not start with "http", or that point at Google's own search or
 * support pages, are omitted from the entry.
 *
 * When no such section exists (or it is empty), the output falls back to the
 * generic section rendering of composeSections.
 *
 * @param {object[]} extracted - Section results from the search template.
 * @returns {string} Markdown search results.
 */
function composeSearchResults(extracted) {
  // Find the search results section (first children-multiple)
  const searchSection = extracted.find((r) => r.type === "children-multiple");

  if (!searchSection || !searchSection.items || searchSection.items.length === 0) {
    // Fall back to section-based output
    return composeSections(extracted, null, 0, Infinity);
  }

  const entries = [];

  for (const item of searchSection.items) {
    const title =
      item["Title"] || item["title"] || Object.values(item)[0] || "";
    if (!title) continue;

    const url = item["URL"] || item["url"] || item["Url"] || "";
    const snippet = item["Snippet"] || item["snippet"] || "";

    // Keep only absolute web URLs that are not Google-internal links.
    const isWebUrl =
      typeof url === "string" &&
      url.startsWith("http") &&
      !url.includes("google.com/search") &&
      !url.includes("support.google.com");

    // Number by position in the OUTPUT, not in the raw item list.
    const lines = [`[${entries.length + 1}] ${title}`];
    if (isWebUrl) lines.push(` URL: ${url}`);
    if (snippet) lines.push(` Snippet: ${snippet}`);

    entries.push(lines.join("\n"));
  }

  if (entries.length === 0) {
    return "(No content extracted from this page.)";
  }

  return `## ${searchSection.section.name}\n\n${entries.join("\n\n")}`;
}
857
+
858
+ // === GENERIC FALLBACK ====================================================
859
+
860
/**
 * Template-less rendering: clean the document with the default removal
 * rules, convert the whole <body> to Markdown, and return the requested
 * character window followed by a pagination footer. An empty conversion
 * yields a fixed placeholder string without a footer.
 *
 * @param {Function} $ - Document selector function; mutated by the cleanup.
 * @param {number} startIndex - First character of the window.
 * @param {number} maxLength - Maximum window length.
 * @returns {string} Windowed Markdown plus footer.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const converted = turndown.turndown($("body").html() || "");
  const markdown = converted.replace(/\n{3,}/g, "\n\n").trim();

  if (markdown === "") {
    return "(No content extracted from this page.)";
  }

  const windowEnd = startIndex + maxLength;
  const paginated = markdown.substring(startIndex, windowEnd);

  const footer = [
    `\n\n---\n[webfetch: template="auto (fallback)", showing characters ${startIndex} to ${startIndex + paginated.length} of ${markdown.length} total.`,
    windowEnd < markdown.length ? ` Use start_index=${windowEnd} to read more.` : "",
    `]`,
  ].join("");

  return paginated + footer;
}
884
+
885
+ // === SEARCH TEMPLATE RESOLUTION ==========================================
886
+
887
/**
 * Pick the search template for `engine` and build the concrete search URL.
 *
 * `engine` may be a known alias ("duckduckgo", "google"), a built-in template
 * name, or an inline JSON template (a string starting with "{"). The template
 * must define `url_template`. For Google with safe search explicitly enabled,
 * "&safe=active" is appended to the resolved URL.
 *
 * @param {string} engine - Engine alias, template name, or inline JSON.
 * @param {string} query - Search query.
 * @param {string|null} region - Region code, or null.
 * @param {boolean|null} safeSearch - Safe-search preference, or null.
 * @returns {{template: object, url: string}} The template and resolved URL.
 * @throws {Error} On invalid inline JSON, an unknown template name, or a
 *   template without `url_template`.
 */
function resolveSearchTemplate(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);

  const parseInline = (json) => {
    try {
      return JSON.parse(json);
    } catch (e) {
      throw new Error(`Invalid inline JSON template: ${e.message}`);
    }
  };

  const template = templateName.startsWith("{")
    ? parseInline(templateName)
    : getTemplateByName(templateName);

  if (!template.url_template) {
    throw new Error(
      `Template '${template.name}' is not a search template (no url_template).`,
    );
  }

  const params = mapSearchParams(engine, query, region, safeSearch);
  const baseUrl = resolveUrlTemplate(template, params);

  // Google safe_search: append safe=active to URL
  const isGoogle = engine === "google" || templateName === "google-search";
  const url = isGoogle && safeSearch === true ? `${baseUrl}&safe=active` : baseUrl;

  return { template, url };
}
287
920
 
288
- const server = new McpServer({ name: "searchfetch", version: "2.0.0" });
921
+ // === MCP SERVER & TOOLS ==================================================
922
+
923
+ const server = new McpServer({ name: "searchfetch", version: "3.0.0" });
924
+
925
+ // --- websearch tool ---
289
926
 
290
927
  server.registerTool(
291
928
  "websearch",
@@ -296,38 +933,67 @@ server.registerTool(
296
933
  inputSchema: z.object({
297
934
  query: z.string().describe("The search query string."),
298
935
  engine: z
299
- .enum(["duckduckgo", "google"])
936
+ .string()
300
937
  .default("duckduckgo")
301
938
  .describe(
302
939
  "Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
303
940
  ),
304
- max_results: z
305
- .number()
306
- .default(10)
307
- .describe("Maximum number of results to return. Default is 10."),
308
941
  region: z
309
942
  .string()
310
- .default("wt-wt")
943
+ .nullable()
944
+ .default(null)
311
945
  .describe(
312
- "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is 'wt-wt' (global/US English).",
946
+ "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
313
947
  ),
314
948
  safe_search: z
315
- .string()
316
- .default("-1")
949
+ .boolean()
950
+ .nullable()
951
+ .default(null)
952
+ .describe(
953
+ "Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
954
+ ),
955
+ max_results: z
956
+ .number()
957
+ .default(10)
958
+ .describe("Maximum number of search results to return. Default is 10."),
959
+ block_media: z
960
+ .boolean()
961
+ .default(true)
317
962
  .describe(
318
- "Safe search filtering mode. '-1' for Moderate, '1' for Strict, '-2' for Off. Default is '-1'. Note: Only applies to DuckDuckGo.",
963
+ "Block images, videos, and fonts entirely at the network layer. Default is true.",
319
964
  ),
320
965
  }),
321
966
  },
322
- async ({ query, max_results, region, safe_search, engine }) => {
967
+ async ({ query, engine, region, safe_search, max_results, block_media }) => {
323
968
  try {
324
- const result = await executeSearch(
969
+ // 1. Resolve search template (+ url_params mapping + url building)
970
+ const { template, url } = resolveSearchTemplate(
971
+ engine,
325
972
  query,
326
- max_results,
327
973
  region,
328
974
  safe_search,
329
- engine,
330
975
  );
976
+
977
+ // 2. Fetch
978
+ const html = await fetchHtmlWithRetry(url, template, block_media);
979
+
980
+ // 3. Extract
981
+ const $ = cheerio.load(html);
982
+ applyRemove($, template);
983
+
984
+ const pageOrigin = new URL(url).origin;
985
+ const context = {
986
+ origin: pageOrigin,
987
+ isWebsearch: true,
988
+ maxResultsOverride: max_results,
989
+ _maxResultsConsumed: false,
990
+ };
991
+
992
+ const extracted = extractTemplate($, template, context);
993
+
994
+ // 4. Compose
995
+ const result = composeSearchResults(extracted);
996
+
331
997
  return { content: [{ type: "text", text: result }] };
332
998
  } catch (err) {
333
999
  return {
@@ -338,6 +1004,8 @@ server.registerTool(
338
1004
  },
339
1005
  );
340
1006
 
1007
+ // --- webfetch tool ---
1008
+
341
1009
  server.registerTool(
342
1010
  "webfetch",
343
1011
  {
@@ -345,22 +1013,20 @@ server.registerTool(
345
1013
  description:
346
1014
  "Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
347
1015
  inputSchema: z.object({
348
- url: z
349
- .url()
350
- .describe(
351
- "The full URL of the webpage to fetch (must start with http/https).",
352
- ),
353
- format: z
354
- .enum(["markdown", "clean_html", "raw_html"])
355
- .default("markdown")
1016
+ url: z.string().describe(
1017
+ "The full URL of the webpage to fetch (must start with http/https).",
1018
+ ),
1019
+ template: z
1020
+ .string()
1021
+ .default("auto")
356
1022
  .describe(
357
- "Output format. Set to 'markdown', 'clean_html', or 'raw_html'. Default is 'markdown' (highly recommended to save context tokens).",
1023
+ "Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
358
1024
  ),
359
1025
  start_index: z
360
1026
  .number()
361
1027
  .default(0)
362
1028
  .describe(
363
- "Character offset to start reading from for pagination. Use this if a document is too large to fit in the context window. Default is 0.",
1029
+ "Character offset for pagination. Default: 0.",
364
1030
  ),
365
1031
  max_length: z
366
1032
  .number()
@@ -372,20 +1038,50 @@ server.registerTool(
372
1038
  .boolean()
373
1039
  .default(true)
374
1040
  .describe(
375
- "Block images, videos, and fonts entirely at the network layer to drastically speed up page loads and dodge tracking pixels. Default is true.",
1041
+ "Block images, videos, and fonts entirely at the network layer. Default is true.",
376
1042
  ),
377
1043
  }),
378
1044
  },
379
- async ({ url, format, start_index, max_length, block_media }) => {
1045
+ async ({ url, template: templateParam, start_index, max_length, block_media }) => {
380
1046
  try {
381
- const result = await executeFetch(
382
- url,
383
- format,
384
- start_index,
385
- max_length,
386
- block_media,
387
- );
388
- return { content: [{ type: "text", text: result }] };
1047
+ // 1. Resolve template
1048
+ let template;
1049
+
1050
+ if (templateParam.startsWith("{")) {
1051
+ try {
1052
+ template = JSON.parse(templateParam);
1053
+ } catch (e) {
1054
+ throw new Error(`Invalid inline JSON template: ${e.message}`);
1055
+ }
1056
+ } else if (templateParam === "auto") {
1057
+ template = detectTemplateByUrl(url);
1058
+ } else {
1059
+ template = getTemplateByName(templateParam);
1060
+ }
1061
+
1062
+ // 2. Fetch
1063
+ const html = await fetchHtmlWithRetry(url, template, block_media);
1064
+
1065
+ // 3. Extract and compose
1066
+ const $ = cheerio.load(html);
1067
+
1068
+ if (template) {
1069
+ applyRemove($, template);
1070
+
1071
+ const pageOrigin = new URL(url).origin;
1072
+ const context = {
1073
+ origin: pageOrigin,
1074
+ isWebsearch: false,
1075
+ };
1076
+
1077
+ const extracted = extractTemplate($, template, context);
1078
+ const result = composeSections(extracted, template, start_index, max_length);
1079
+ return { content: [{ type: "text", text: result }] };
1080
+ } else {
1081
+ // Generic fallback
1082
+ const result = genericFallback($, start_index, max_length);
1083
+ return { content: [{ type: "text", text: result }] };
1084
+ }
389
1085
  } catch (err) {
390
1086
  return {
391
1087
  content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
@@ -395,6 +1091,8 @@ server.registerTool(
395
1091
  },
396
1092
  );
397
1093
 
1094
+ // === MAIN =================================================================
1095
+
398
1096
  async function main() {
399
1097
  await ensureBinary();
400
1098
  process.stdout.write = originalStdoutWrite;