searchfetch 2.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
 
3
+ // === STDOUT/STDERR REDIRECTION ===========================================
3
4
  const originalStdoutWrite = process.stdout.write.bind(process.stdout);
4
5
  process.stdout.write = (chunk, encoding, callback) => {
5
6
  return process.stderr.write(chunk, encoding, callback);
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
7
8
  console.log = (...args) => console.error(...args);
8
9
  console.info = (...args) => console.error(...args);
9
10
 
11
+ // === IMPORTS =============================================================
12
+ import { readdirSync, readFileSync } from "node:fs";
13
+ import { dirname, join } from "node:path";
14
+ import { fileURLToPath } from "node:url";
10
15
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
11
16
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
12
17
  import { z } from "zod";
@@ -14,6 +19,7 @@ import { launch, ensureBinary } from "cloakbrowser";
14
19
  import * as cheerio from "cheerio";
15
20
  import TurndownService from "turndown";
16
21
 
22
+ // === BROWSER MANAGER =====================================================
17
23
  class BrowserManager {
18
24
  constructor() {
19
25
  this.browser = null;
@@ -67,225 +73,892 @@ const cleanup = async () => {
67
73
  process.on("SIGINT", cleanup);
68
74
  process.on("SIGTERM", cleanup);
69
75
 
70
- function getGoogleRegionParams(region) {
71
- if (!region || region === "wt-wt") return "hl=en&gl=us";
72
- const parts = region.split("-");
73
- if (parts.length === 2) return `gl=${parts[0]}&hl=${parts[1]}`;
74
- return `gl=${region}&hl=en`;
76
+ // === TURNDOWN ============================================================
77
+ const turndown = new TurndownService({
78
+ headingStyle: "atx",
79
+ codeBlockStyle: "fenced",
80
+ emDelimiter: "*",
81
+ });
82
+
83
+ // === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
84
+
85
+ const __filename = fileURLToPath(import.meta.url);
86
+ const __dirname = dirname(__filename);
87
+ const TEMPLATES_DIR = join(__dirname, "templates");
88
+
89
/**
 * Load every `*.json` template definition found in a directory.
 *
 * File names are processed in sorted order, then the parsed templates are
 * sorted by their optional `order` field (templates without one sort as 999)
 * so that URL-pattern matching is deterministic.
 *
 * @param {string} [dir=TEMPLATES_DIR] Directory to scan for template files.
 * @returns {object[]} Parsed template objects, each carrying a string `name`.
 * @throws {Error} If the directory cannot be read, holds no `.json` files, or
 *   a file is unreadable, is not valid JSON, or lacks a valid `name`.
 */
function loadBuiltinTemplates(dir = TEMPLATES_DIR) {
  let files;
  try {
    files = readdirSync(dir);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${dir}': ${err.message}`,
    );
  }

  const jsonFiles = files.filter((f) => f.endsWith(".json")).sort();

  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${dir}'`);
  }

  const templates = [];
  for (const file of jsonFiles) {
    const filePath = join(dir, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      // Name the offending file instead of surfacing a bare fs error.
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
      );
    }

    // `null` and primitives are valid JSON; reject them here rather than
    // crashing with a TypeError on the property access below.
    const isObject = template !== null && typeof template === "object";
    if (!isObject || !template.name || typeof template.name !== "string") {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }
    templates.push(template);
  }

  // Sort by "order" field for deterministic URL-pattern matching
  templates.sort((a, b) => (a.order ?? 999) - (b.order ?? 999));

  return templates;
}
134
+
135
+ const BUILTIN_TEMPLATES = loadBuiltinTemplates();
136
+
137
+ // === TEMPLATE LOOKUP =====================================================
138
+ const TEMPLATE_MAP = new Map();
139
+ for (const t of BUILTIN_TEMPLATES) {
140
+ TEMPLATE_MAP.set(t.name, t);
141
+ }
142
+
143
/**
 * Look up a built-in template by its exact name.
 *
 * @param {string} name Template name to resolve.
 * @returns {object} The registered template.
 * @throws {Error} If no template is registered under `name`; the message
 *   lists every available template name.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) {
    return found;
  }

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
153
+
154
/**
 * Find the first built-in template with a `url_patterns` entry matching `url`.
 *
 * Templates are tried in the order they appear in BUILTIN_TEMPLATES, and each
 * pattern is compiled as a regular expression. Patterns that fail to compile
 * are ignored.
 *
 * @param {string} url URL to classify.
 * @returns {object|null} The matching template, or null when none matches.
 */
function detectTemplateByUrl(url) {
  const matchesUrl = (pattern) => {
    try {
      return new RegExp(pattern).test(url);
    } catch (_) {
      // Skip invalid regex
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (!patterns) continue;

    for (const pattern of patterns) {
      if (matchesUrl(pattern)) {
        return candidate;
      }
    }
  }

  return null;
}
169
+
170
+ // === URL TEMPLATE RESOLUTION =============================================
171
+
172
/**
 * Expand the `{name}` placeholders of a template's `url_template`.
 *
 * For each placeholder the value is chosen in this order: the entry in
 * `providedParams` (unless null/undefined), the `default` declared in
 * `template.url_params[name]`, otherwise an empty string. A parameter marked
 * `required` with neither a provided value nor a default raises an error.
 * Values whose definition has `encode: "url"` are passed through
 * `encodeURIComponent`.
 *
 * All placeholders are expanded in a single pass over the original template
 * string, so a short substituted value can never cause a later placeholder to
 * be skipped, and substituted values are inserted literally (no `$&` / `$1`
 * replacement-pattern expansion, no re-scanning of user text).
 *
 * @param {object} template Template with `url_template`, optional `url_params`
 *   and `name`.
 * @param {object} [providedParams] Caller-supplied placeholder values.
 * @returns {string|null} The resolved URL, or null when the template has no
 *   `url_template`.
 * @throws {Error} If a required parameter has no value and no default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const provided = providedParams || {};

  const expanded = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    const def = urlParams[name] || {};
    const supplied = Object.prototype.hasOwnProperty.call(provided, name)
      ? provided[name]
      : undefined;

    let value;
    if (supplied !== null && supplied !== undefined) {
      value = String(supplied);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by empty optional values.
  return expanded.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
212
+
213
+ // === SEARCH PARAM MAPPING ================================================
214
+
215
/**
 * Translate a search-engine alias into the name of its built-in template.
 *
 * @param {string} engine "duckduckgo", "google", or any other value.
 * @returns {string} The template name for a known alias; any other value is
 *   returned unchanged.
 */
function resolveEngineToTemplateName(engine) {
  if (engine === "duckduckgo") return "duckduckgo-search";
  if (engine === "google") return "google-search";
  return engine;
}

/**
 * Build the URL-template parameters for a search request.
 *
 * DuckDuckGo: `region` is passed through as `kl`; `safeSearch` true/false
 * becomes `kp` "1" / "-2" (null leaves `kp` unset).
 *
 * Google: `region` is "<country>-<language>" (e.g. "us-en"), so the first
 * part is sent as `gl` and the second as `hl`. A region without a dash is
 * used for both. "wt-wt" is the "no region" sentinel and sets nothing, so
 * the template's own defaults apply.
 *
 * Other engines receive only `query`.
 *
 * @param {string} engine Engine alias or template name.
 * @param {string} query Search query text.
 * @param {string|null|undefined} region Region code, or null for the default.
 * @param {boolean|null|undefined} safeSearch Safe-search switch, or null.
 * @returns {object} Parameter map keyed by URL-template placeholder name.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const params = { query };
  const resolved = resolveEngineToTemplateName(engine);
  const hasRegion = region !== null && region !== undefined;

  if (resolved === "duckduckgo-search") {
    if (hasRegion) {
      params.kl = region;
    }
    if (safeSearch === true) {
      params.kp = "1";
    } else if (safeSearch === false) {
      params.kp = "-2";
    }
  } else if (resolved === "google-search") {
    if (hasRegion && region !== "wt-wt") {
      // Region codes are country-first ("us-en"): gl = country, hl = language.
      const [country, language] = region.split("-");
      params.gl = country;
      params.hl = language ?? country;
    }
  }

  return params;
}
76
244
 
77
- async function executeSearch(query, maxResults, region, safeSearch, engine) {
245
+ // === FETCH ===============================================================
246
+
247
+ const FETCH_MAX_ATTEMPTS = 2;
248
+ const HTTP_429_RETRY_DELAY_MS = 2000;
249
+
250
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms Delay passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
253
+
254
/**
 * Convert a `Retry-After` header value into a delay in milliseconds.
 *
 * Accepts either a number of seconds or a date string. The result is capped
 * at 30 seconds, and a date in the past yields 0. A missing, blank, or
 * unparseable value yields `fallbackMs`; blank values are checked explicitly
 * because `Number("  ")` is 0 and would otherwise mean "retry immediately".
 *
 * @param {string|null|undefined} value Raw header value.
 * @param {number} [fallbackMs=HTTP_429_RETRY_DELAY_MS] Delay used when the
 *   header gives no usable hint.
 * @returns {number} Delay in milliseconds.
 */
function parseRetryAfterMs(value, fallbackMs = HTTP_429_RETRY_DELAY_MS) {
  const MAX_DELAY_MS = 30000;

  if (!value) return fallbackMs;

  const raw = String(value).trim();
  if (raw === "") return fallbackMs;

  const seconds = Number(raw);
  if (Number.isFinite(seconds) && seconds >= 0) {
    return Math.min(seconds * 1000, MAX_DELAY_MS);
  }

  const dateMs = Date.parse(raw);
  if (Number.isFinite(dateMs)) {
    return Math.min(Math.max(dateMs - Date.now(), 0), MAX_DELAY_MS);
  }

  return fallbackMs;
}
266
+
267
/**
 * Create the error thrown when a fetch is refused with an HTTP status.
 *
 * @param {number} status HTTP status code received.
 * @param {string} url URL that was being fetched.
 * @param {number|null} [retryAfterMs=null] Suggested wait before retrying.
 * @returns {Error} Error with `httpStatus` and `retryAfterMs` properties set.
 */
function makeHttpStatusError(status, url, retryAfterMs = null) {
  const message = `Access denied: HTTP ${status} when fetching ${url}`;
  return Object.assign(new Error(message), {
    httpStatus: status,
    retryAfterMs,
  });
}
273
+
274
/**
 * Heuristically decide whether a loaded page is a CAPTCHA or block page.
 *
 * The lower-cased <title> is checked against a list of block-page phrases.
 * The whitespace-collapsed, lower-cased body text is checked against a
 * second list, but only when the body is shorter than 1200 characters.
 *
 * @param {Function} $ Document query function (as returned by `cheerio.load`
 *   at the call site); only `$(selector).text()` is used.
 * @returns {boolean} True when the page looks like an access-denied page.
 */
function isAccessDenied($) {
  const containsAny = (haystack, needles) =>
    needles.some((needle) => haystack.includes(needle));

  const pageTitle = ($("title").text() || "").toLowerCase();
  const pageBody = ($("body").text() || "")
    .replace(/\s+/g, " ")
    .trim()
    .toLowerCase();

  const blockedTitlePhrases = [
    "captcha",
    "are you a robot",
    "access denied",
    "blocked",
    "forbidden",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
    "one more step",
    "security check",
    "ddos protection",
    "cloudflare",
  ];
  if (containsAny(pageTitle, blockedTitlePhrases)) {
    return true;
  }

  const blockedBodyPhrases = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];
  // Only short bodies are scanned.
  const isShortBody = pageBody.length < 1200;
  return isShortBody && containsAny(pageBody, blockedBodyPhrases);
}
308
+
309
+ async function fetchHtml(url, template, blockMedia) {
78
310
  const browser = await browserManager.getBrowser();
79
311
  const context = await browser.newContext();
80
312
 
81
- await context.addCookies([
82
- {
83
- name: "CONSENT",
84
- value: "YES+cb.20250101-01-p0.en+FX+999",
85
- domain: ".google.com",
86
- path: "/",
87
- },
88
- ]);
313
+ try {
314
+ // Pre-load cookies from template
315
+ if (template && template.cookies && template.cookies.length > 0) {
316
+ await context.addCookies(template.cookies);
317
+ }
318
+
319
+ const page = await context.newPage();
89
320
 
90
- const page = await context.newPage();
321
+ try {
322
+ // Route blocked resource types
323
+ if (blockMedia) {
324
+ const blockedTypes =
325
+ template && template.block_resources
326
+ ? template.block_resources
327
+ : ["image", "media", "font"];
91
328
 
92
- try {
93
- await page.route("**/*", (route) => {
94
- const type = route.request().resourceType();
95
- if (["image", "media", "font", "stylesheet"].includes(type)) {
96
- route.abort();
97
- } else {
98
- route.continue();
329
+ if (blockedTypes.length > 0) {
330
+ await page.route("**/*", (route) => {
331
+ const type = route.request().resourceType();
332
+ if (blockedTypes.includes(type)) {
333
+ route.abort();
334
+ } else {
335
+ route.continue();
336
+ }
337
+ });
338
+ }
339
+ }
340
+
341
+ let response;
342
+ try {
343
+ response = await page.goto(url, {
344
+ waitUntil: "networkidle",
345
+ timeout: 15000,
346
+ });
347
+ } catch (_navError) {
348
+ // Allow partial rendering on timeout
349
+ }
350
+
351
+ // Check HTTP status for access failures
352
+ if (response) {
353
+ const status = response.status();
354
+ if ([401, 403, 429].includes(status)) {
355
+ throw makeHttpStatusError(
356
+ status,
357
+ url,
358
+ status === 429 ? parseRetryAfterMs(response.headers()["retry-after"]) : null,
359
+ );
360
+ }
99
361
  }
100
- });
101
362
 
102
- const results = [];
103
- const searchUrl =
104
- engine === "google"
105
- ? `https://www.google.com/search?udm=web&udm=14&q=${encodeURIComponent(query)}&${getGoogleRegionParams(region)}`
106
- : `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}&kl=${encodeURIComponent(region)}&kp=${encodeURIComponent(safeSearch)}`;
363
+ const pageContent = await page.content();
364
+
365
+ // Check for CAPTCHA / access-denied pages
366
+ const $ = cheerio.load(pageContent);
367
+ if (isAccessDenied($)) {
368
+ throw new Error(
369
+ `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
370
+ );
371
+ }
107
372
 
373
+ return pageContent;
374
+ } finally {
375
+ await page.close();
376
+ }
377
+ } finally {
378
+ await context.close();
379
+ }
380
+ }
381
+
382
+ async function fetchHtmlWithRetry(url, template, blockMedia) {
383
+ let lastError;
384
+ for (let attempt = 0; attempt < FETCH_MAX_ATTEMPTS; attempt++) {
108
385
  try {
109
- await page.goto(searchUrl, { waitUntil: "networkidle", timeout: 15000 });
110
- } catch (e) {
111
- // Allow partial rendering on timeout
386
+ return await fetchHtml(url, template, blockMedia);
387
+ } catch (err) {
388
+ lastError = err;
389
+ if (
390
+ attempt < FETCH_MAX_ATTEMPTS - 1 &&
391
+ err.httpStatus === 429
392
+ ) {
393
+ await sleep(err.retryAfterMs ?? HTTP_429_RETRY_DELAY_MS);
394
+ continue;
395
+ }
396
+ if (
397
+ attempt < FETCH_MAX_ATTEMPTS - 1 &&
398
+ (err.message.includes("net::") ||
399
+ err.message.includes("ERR_") ||
400
+ err.message.includes("Navigation failed"))
401
+ ) {
402
+ await sleep(500);
403
+ continue;
404
+ }
405
+ throw err;
112
406
  }
407
+ }
408
+ throw lastError;
409
+ }
113
410
 
114
- const pageContent = await page.content();
115
- const $ = cheerio.load(pageContent);
411
+ // === HTML CLEANUP ========================================================
116
412
 
117
- if (engine === "google") {
118
- $("h3").each((i, el) => {
119
- if (results.length >= maxResults) return;
413
+ const DEFAULT_REMOVE_SELECTORS = [
414
+ "script", "style", "svg", "nav", "footer", "noscript", "iframe",
415
+ ".advertisement",
416
+ ];
120
417
 
121
- const h3 = $(el);
122
- let linkEl = h3.closest("a");
123
- if (!linkEl.length) linkEl = h3.find("a");
124
- if (!linkEl.length) return;
418
+ function applyRemove($, template) {
419
+ const removeSelectors =
420
+ template && template.remove && template.remove.length > 0
421
+ ? template.remove
422
+ : DEFAULT_REMOVE_SELECTORS;
125
423
 
126
- let link = linkEl.attr("href") || "";
424
+ for (const selector of removeSelectors) {
425
+ try {
426
+ $(selector).remove();
427
+ } catch (_) {
428
+ // Skip invalid selectors
429
+ }
430
+ }
127
431
 
128
- if (!link || (link.startsWith("/") && !link.startsWith("/url?q=")))
129
- return;
130
- if (
131
- link.includes("google.com/search") ||
132
- link.includes("support.google.com")
133
- )
134
- return;
432
+ // Strip style attributes and data:image src
433
+ $("[style]").removeAttr("style");
434
+ $("*").each((_i, el) => {
435
+ const src = $(el).attr("src");
436
+ if (src && src.startsWith("data:image")) {
437
+ $(el).removeAttr("src");
438
+ }
439
+ });
440
+ }
135
441
 
136
- if (link.startsWith("/url?q=")) {
137
- try {
138
- link = decodeURIComponent(link.split("/url?q=")[1].split("&")[0]);
139
- } catch (e) {}
140
- }
442
+ // === EXTRACTION ENGINE ===================================================
141
443
 
142
- const title = h3.text().trim();
143
- if (!title) return;
444
+ /**
445
+ * Find elements matching selector, scoped to $parent.
446
+ * Search order: descendants → closest ancestor → ancestor subtrees (up to 4 levels).
447
+ */
448
+ function findScoped($parent, selector) {
449
+ if (!selector || selector.trim() === "") {
450
+ return $parent;
451
+ }
144
452
 
145
- let snippet = "";
146
- let parent = h3.parent();
147
- while (parent.length && parent.prop("tagName") !== "BODY") {
148
- const snippetEl = parent.find(
149
- "div.VwiC3b, div[style*='-webkit-line-clamp'], div.yXK7lf, div.Uroaid",
150
- );
151
- if (snippetEl.length) {
152
- snippet = snippetEl.first().text().replace(/\s+/g, " ").trim();
153
- break;
453
+ // 1. Descendants
454
+ let result = $parent.find(selector);
455
+ if (result.length > 0) return result;
456
+
457
+ // 2. Closest ancestor matching selector
458
+ result = $parent.closest(selector);
459
+ if (result.length > 0) return result;
460
+
461
+ // 3. Ancestor subtrees (up to 4 levels up)
462
+ let ancestor = $parent.parent();
463
+ for (let i = 0; i < 4 && ancestor.length > 0; i++) {
464
+ result = ancestor.find(selector);
465
+ if (result.length > 0) return result;
466
+ ancestor = ancestor.parent();
467
+ }
468
+
469
+ return $parent.find("__nonexistent__");
470
+ }
471
+
472
/**
 * Resolve a comma-separated list of selectors against `$parent`, trying each
 * in the order written and returning the matches of the first selector that
 * finds anything (each selector is resolved through `findScoped`).
 *
 * A missing or blank selector string returns `$parent` itself. When nothing
 * matches, an empty selection is returned.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") {
    return $parent;
  }

  const candidates = [];
  for (const piece of selectorStr.split(",")) {
    const trimmed = piece.trim();
    if (trimmed) {
      candidates.push(trimmed);
    }
  }

  for (const candidate of candidates) {
    const found = findScoped($parent, candidate);
    if (found.length > 0) {
      return found;
    }
  }

  return $parent.find("__nonexistent__");
}
492
+
493
/**
 * Resolve a section's top-level elements against the whole document.
 *
 * The selector string is split on commas and each part is queried in order;
 * the first part that matches at least one element wins. Parts that make the
 * query throw are skipped. A missing or blank selector string selects
 * `body`; when no part matches, an empty selection is returned.
 */
function resolveTopElements($, selectorStr) {
  const isBlank = !selectorStr || selectorStr.trim() === "";
  if (isBlank) {
    return $("body");
  }

  const candidates = selectorStr
    .split(",")
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);

  for (const candidate of candidates) {
    let found;
    try {
      found = $(candidate);
    } catch (_) {
      // Skip invalid selectors
      continue;
    }
    if (found.length > 0) {
      return found;
    }
  }

  return $();
}
517
+
518
+ // === TRANSFORMS ==========================================================
519
+
520
+ function applyTransform(value, transform, origin) {
521
+ const transforms = Array.isArray(transform) ? transform : [transform];
522
+ let result = value;
523
+
524
+ for (const t of transforms) {
525
+ if (!result) continue;
526
+ switch (t) {
527
+ case "strip":
528
+ result = result.trim();
529
+ break;
530
+
531
+ case "decode_google_url":
532
+ if (result.startsWith("/url?q=")) {
533
+ try {
534
+ const urlPart = result.split("/url?q=")[1].split("&")[0];
535
+ result = decodeURIComponent(urlPart);
536
+ } catch (_) {
537
+ // Leave as-is
154
538
  }
155
- parent = parent.parent();
156
539
  }
540
+ break;
157
541
 
158
- if (link.startsWith("http")) {
159
- if (!results.some((r) => r.link === link)) {
160
- results.push({
161
- position: results.length + 1,
162
- title,
163
- link,
164
- snippet,
165
- });
542
+ case "decode_ddg_url":
543
+ if (result.includes("/l/?uddg=")) {
544
+ try {
545
+ const queryString = result.split("?")[1] || "";
546
+ const params = new URLSearchParams(queryString);
547
+ const uddg = params.get("uddg");
548
+ if (uddg) result = decodeURIComponent(uddg);
549
+ } catch (_) {
550
+ // Leave as-is
166
551
  }
167
552
  }
168
- });
169
- } else {
170
- $(".result").each((i, el) => {
171
- if (results.length >= maxResults) return;
172
- const titleEl = $(el).find(".result__title a");
173
- let link = titleEl.attr("href") || "";
553
+ break;
554
+
555
+ case "json_parse":
556
+ try {
557
+ result = JSON.stringify(JSON.parse(result), null, 2);
558
+ } catch (_) {
559
+ // Leave as-is
560
+ }
561
+ break;
174
562
 
175
- if (link.includes("/l/?uddg=")) {
563
+ case "resolve_href":
564
+ if (origin && result.startsWith("/") && !result.startsWith("//")) {
176
565
  try {
177
- const urlParams = new URLSearchParams(link.split("?")[1]);
178
- link = decodeURIComponent(urlParams.get("uddg") || link);
179
- } catch (e) {}
566
+ result = new URL(result, origin).href;
567
+ } catch (_) {
568
+ // Leave as-is
569
+ }
180
570
  }
571
+ break;
572
+ }
573
+ }
574
+
575
+ return result;
576
+ }
577
+
578
+ // === EXTRACTION ==========================================================
579
+
580
+ function extractValue($el, section, origin) {
581
+ let value;
582
+
583
+ switch (section.format) {
584
+ case "text":
585
+ value = $el.text().replace(/\s+/g, " ").trim();
586
+ break;
587
+
588
+ case "markdown": {
589
+ const html = $el.html() || "";
590
+ value = turndown
591
+ .turndown(html)
592
+ .replace(/\n{3,}/g, "\n\n")
593
+ .trim();
594
+ break;
595
+ }
596
+
597
+ case "attribute":
598
+ value = $el.attr(section.attribute) || "";
599
+ break;
181
600
 
182
- const title = titleEl.text().trim();
183
- const snippet = $(el)
184
- .find(".result__snippet")
185
- .text()
186
- .replace(/\s+/g, " ")
187
- .trim();
601
+ case "html":
602
+ value = $el.html() || "";
603
+ break;
188
604
 
189
- if (title && link.startsWith("http")) {
190
- results.push({ position: results.length + 1, title, link, snippet });
605
+ default:
606
+ value = $el.text().replace(/\s+/g, " ").trim();
607
+ }
608
+
609
+ if (section.transform && value) {
610
+ value = applyTransform(value, section.transform, origin);
611
+ }
612
+
613
+ return value;
614
+ }
615
+
616
/**
 * Extract a single child section relative to `$parentEl`.
 *
 * The child's selector is resolved with `findFirstMatch`, and the value is
 * read from the first matched element with `extractValue`.
 *
 * Returns `{ type: "value", text }` on a match and null when nothing matches.
 * Throws when nothing matches and the section is marked `required`.
 */
function extractChildSection($, $parentEl, section, origin) {
  const matched = findFirstMatch($parentEl, section.selector);
  const hasMatch = Boolean(matched) && matched.length !== 0;

  if (hasMatch) {
    const text = extractValue(matched.eq(0), section, origin);
    return { type: "value", text };
  }

  if (section.required) {
    throw new Error(`Required section '${section.name}' not found on page.`);
  }
  return null;
}
636
+
637
+ /**
638
+ * Extract a top-level section from the document.
639
+ * Returns a SectionResult or null.
640
+ */
641
+ function extractSection($, section, context) {
642
+ const elements = resolveTopElements($, section.selector);
643
+
644
+ if (!elements || elements.length === 0) {
645
+ if (section.required) {
646
+ throw new Error(
647
+ `Required section '${section.name}' not found on page.`,
648
+ );
649
+ }
650
+ return null;
651
+ }
652
+
653
+ // Determine limit
654
+ let limit = elements.length;
655
+ if (section.multiple && section.max_items) {
656
+ limit = Math.min(limit, section.max_items);
657
+ }
658
+ // Override max_items with max_results for first multiple+children section
659
+ if (
660
+ context.isWebsearch &&
661
+ context.maxResultsOverride &&
662
+ !context._maxResultsConsumed &&
663
+ section.multiple &&
664
+ section.children &&
665
+ section.children.length > 0
666
+ ) {
667
+ limit = Math.min(limit, context.maxResultsOverride);
668
+ context._maxResultsConsumed = true;
669
+ }
670
+
671
+ if (section.multiple) {
672
+ const items = [];
673
+
674
+ for (let i = 0; i < limit; i++) {
675
+ const el = elements.eq(i);
676
+
677
+ if (section.children && section.children.length > 0) {
678
+ // Multiple parents, each with children
679
+ const childValues = {};
680
+ for (const child of section.children) {
681
+ const cr = extractChildSection($, el, child, context.origin);
682
+ if (cr && cr.type === "value" && cr.text !== null && cr.text !== undefined) {
683
+ childValues[child.name] = cr.text;
684
+ }
191
685
  }
192
- });
686
+ if (Object.keys(childValues).length > 0) {
687
+ items.push(childValues);
688
+ }
689
+ } else {
690
+ // Multiple parents, no children
691
+ const value = extractValue(el, section, context.origin);
692
+ if (value && value.trim()) {
693
+ items.push(value.trim());
694
+ }
695
+ }
193
696
  }
194
697
 
195
- if (results.length === 0) {
196
- return `No results found on ${engine}. The engine may have shown a captcha, or the query returned nothing.`;
698
+ if (section.children && section.children.length > 0) {
699
+ return { section, type: "children-multiple", items };
700
+ } else {
701
+ return { section, type: "list", items };
197
702
  }
703
+ } else {
704
+ // Single parent
705
+ const el = elements.eq(0);
198
706
 
199
- return (
200
- `Found ${results.length} search results on ${engine}:\n\n` +
201
- results
202
- .map(
203
- (r) =>
204
- `[${r.position}] ${r.title}\n URL: ${r.link}\n Summary: ${r.snippet}`,
205
- )
206
- .join("\n\n")
207
- );
208
- } finally {
209
- await page.close();
210
- await context.close();
707
+ if (section.children && section.children.length > 0) {
708
+ // Single parent with children parent format ignored
709
+ const childValues = {};
710
+ for (const child of section.children) {
711
+ const cr = extractChildSection($, el, child, context.origin);
712
+ if (cr && cr.type === "value" && cr.text !== null && cr.text !== undefined) {
713
+ childValues[child.name] = cr.text;
714
+ }
715
+ }
716
+ return { section, type: "children", items: childValues };
717
+ } else {
718
+ const value = extractValue(el, section, context.origin);
719
+ return { section, type: "value", text: value };
720
+ }
211
721
  }
212
722
  }
213
723
 
214
- async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
215
- const browser = await browserManager.getBrowser();
216
- const context = await browser.newContext();
217
- const page = await context.newPage();
724
+ function extractTemplate($, template, context) {
725
+ const results = [];
218
726
 
219
- try {
220
- if (blockMedia) {
221
- await page.route("**/*", (route) => {
222
- const type = route.request().resourceType();
223
- if (["image", "media", "font"].includes(type)) {
224
- route.abort();
727
+ for (const section of template.sections) {
728
+ try {
729
+ const result = extractSection($, section, context);
730
+ if (result !== null) {
731
+ results.push(result);
732
+ }
733
+ } catch (err) {
734
+ if (
735
+ err.message &&
736
+ err.message.includes("Required section")
737
+ ) {
738
+ throw err;
739
+ }
740
+ // Non-required failures are silently skipped
741
+ }
742
+ }
743
+
744
+ return results;
745
+ }
746
+
747
+ // === COMPOSITION: WEBFETCH ===============================================
748
+
749
/**
 * Tell whether a multi-item result looks like a comment thread.
 *
 * Only the first item is inspected: its keys (compared case-insensitively)
 * must include an author-like field ("author" or "user") and a text-like
 * field ("comment" or "body").
 *
 * @param {{items?: object[]}} result Extraction result to inspect.
 * @returns {boolean} True when the first item has both kinds of field.
 */
function isCommentStyle(result) {
  const items = result.items;
  if (!items || items.length === 0) {
    return false;
  }

  const fieldNames = new Set(
    Object.keys(items[0]).map((key) => key.toLowerCase()),
  );
  const hasWriter = fieldNames.has("author") || fieldNames.has("user");
  const hasText = fieldNames.has("comment") || fieldNames.has("body");

  return hasWriter && hasText;
}
758
+
759
+ function composeSections(extracted, template, startIndex, maxLength) {
760
+ const parts = [];
761
+
762
+ for (const result of extracted) {
763
+ if (result.type === "value") {
764
+ const text = result.text;
765
+ if (text && String(text).trim()) {
766
+ parts.push(`## ${result.section.name}\n\n${String(text).trim()}`);
767
+ }
768
+ } else if (result.type === "list") {
769
+ if (result.items && result.items.length > 0) {
770
+ const listText = result.items.map((item) => `- ${item}`).join("\n");
771
+ parts.push(`## ${result.section.name}\n\n${listText}`);
772
+ }
773
+ } else if (result.type === "children") {
774
+ if (result.items && Object.keys(result.items).length > 0) {
775
+ for (const [childName, value] of Object.entries(result.items)) {
776
+ if (value && String(value).trim()) {
777
+ parts.push(`## ${childName}\n\n${String(value).trim()}`);
778
+ }
779
+ }
780
+ }
781
+ } else if (result.type === "children-multiple") {
782
+ if (result.items && result.items.length > 0) {
783
+ if (isCommentStyle(result)) {
784
+ const commentParts = [];
785
+ for (const item of result.items) {
786
+ const author =
787
+ item["Author"] || item["author"] || item["User"] || item["user"] || "";
788
+ const comment =
789
+ item["Comment"] || item["Body"] || item["comment"] || item["body"] || "";
790
+ if (author) {
791
+ commentParts.push(`**${author}:**\n\n${comment}`);
792
+ } else if (comment) {
793
+ commentParts.push(comment);
794
+ }
795
+ }
796
+ if (commentParts.length > 0) {
797
+ parts.push(
798
+ `## ${result.section.name}\n\n${commentParts.join("\n\n---\n\n")}`,
799
+ );
800
+ }
225
801
  } else {
226
- route.continue();
802
+ const itemParts = [];
803
+ for (const item of result.items) {
804
+ const lines = [];
805
+ for (const [key, value] of Object.entries(item)) {
806
+ if (value && String(value).trim()) {
807
+ lines.push(` ${key}: ${String(value).trim()}`);
808
+ }
809
+ }
810
+ if (lines.length > 0) itemParts.push(lines.join("\n"));
811
+ }
812
+ if (itemParts.length > 0) {
813
+ parts.push(
814
+ `## ${result.section.name}\n\n${itemParts.join("\n\n")}`,
815
+ );
816
+ }
227
817
  }
228
- });
818
+ }
229
819
  }
820
+ }
230
821
 
231
- try {
232
- await page.goto(url, { waitUntil: "networkidle", timeout: 15000 });
233
- } catch (navError) {
234
- // Allow partial rendering on timeout
822
+ if (parts.length === 0) {
823
+ return "(No content extracted from this page.)";
824
+ }
825
+
826
+ const full = parts.join("\n\n---\n\n");
827
+ const totalLength = full.length;
828
+ const paginated = full.substring(startIndex, startIndex + maxLength);
829
+
830
+ const templateName = template ? template.name : "auto";
831
+ let metadata = `\n\n---\n[webfetch: template="${templateName}", showing characters ${startIndex} to ${startIndex + paginated.length} of ${totalLength} total.`;
832
+ if (startIndex + maxLength < totalLength) {
833
+ metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
834
+ }
835
+ metadata += `]`;
836
+
837
+ return paginated + metadata;
838
+ }
839
+
840
+ // === COMPOSITION: WEBSEARCH ==============================================
841
+
842
+ function composeSearchResults(extracted) {
843
+ // Find the search results section (first children-multiple)
844
+ const searchSection = extracted.find((r) => r.type === "children-multiple");
845
+
846
+ if (!searchSection || !searchSection.items || searchSection.items.length === 0) {
847
+ // Fall back to section-based output
848
+ return composeSections(extracted, null, 0, Infinity);
849
+ }
850
+
851
+ const items = searchSection.items;
852
+ const parts = [];
853
+
854
+ for (let i = 0; i < items.length; i++) {
855
+ const item = items[i];
856
+ const num = i + 1;
857
+
858
+ const title =
859
+ item["Title"] || item["title"] || Object.values(item)[0] || "";
860
+ const url =
861
+ item["URL"] || item["url"] || item["Url"] || "";
862
+ const snippet =
863
+ item["Snippet"] || item["snippet"] || "";
864
+
865
+ // Filter out non-http URLs and google internal links
866
+ let cleanUrl = url;
867
+ if (cleanUrl && !cleanUrl.startsWith("http")) {
868
+ cleanUrl = ""; // Skip internal/non-web URLs
869
+ }
870
+ if (
871
+ cleanUrl &&
872
+ (cleanUrl.includes("google.com/search") ||
873
+ cleanUrl.includes("support.google.com"))
874
+ ) {
875
+ cleanUrl = ""; // Skip google internal links
235
876
  }
236
877
 
237
- const pageContent = await page.content();
238
- let finalContent = "";
878
+ if (!title) continue;
239
879
 
240
- if (format === "raw_html") {
241
- finalContent = pageContent;
242
- } else {
243
- const $ = cheerio.load(pageContent);
880
+ const lines = [`[${num}] ${title}`];
881
+ if (cleanUrl) lines.push(` URL: ${cleanUrl}`);
882
+ if (snippet) lines.push(` Snippet: ${snippet}`);
244
883
 
245
- $(
246
- "script, style, nav, header, footer, noscript, iframe, svg, aside, .advertisement, img, picture, video, audio, canvas, map, area, dialog",
247
- ).remove();
248
- $("*")
249
- .removeAttr("style")
250
- .each((i, el) => {
251
- const src = $(el).attr("src");
252
- if (src && src.startsWith("data:image")) $(el).removeAttr("src");
253
- });
884
+ parts.push(lines.join("\n"));
885
+ }
254
886
 
255
- if (format === "clean_html") {
256
- finalContent = $.html();
257
- } else {
258
- const turndownService = new TurndownService({
259
- headingStyle: "atx",
260
- codeBlockStyle: "fenced",
261
- });
262
- finalContent = turndownService
263
- .turndown($.html())
264
- .replace(/\n{3,}/g, "\n\n")
265
- .trim();
266
- }
887
+ if (parts.length === 0) {
888
+ return "(No content extracted from this page.)";
889
+ }
890
+
891
+ return `## ${searchSection.section.name}\n\n${parts.join("\n\n")}`;
892
+ }
893
+
894
+ // === GENERIC FALLBACK ====================================================
895
+
896
/**
 * Produce paginated Markdown for a page that has no matching template.
 *
 * The document is cleaned in place with the default removal rules, then the
 * remaining <body> HTML is converted to Markdown. The requested character
 * window is returned, followed by a footer describing the window and, when
 * more text remains, the `start_index` to request next.
 *
 * @param {Function} $ Loaded document; it is mutated by the cleanup step.
 * @param {number} startIndex Offset of the first character to return.
 * @param {number} maxLength Maximum number of characters to return.
 * @returns {string} The Markdown window plus footer, or a placeholder message
 *   when the page produced no content.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const bodyHtml = $("body").html() || "";
  const markdown = turndown
    .turndown(bodyHtml)
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  if (markdown.length === 0) {
    return "(No content extracted from this page.)";
  }

  const totalLength = markdown.length;
  const endIndex = startIndex + maxLength;
  const windowText = markdown.substring(startIndex, endIndex);

  const footer = [
    `\n\n---\n[webfetch: template="auto (fallback)", showing characters ${startIndex} to ${startIndex + windowText.length} of ${totalLength} total.`,
  ];
  if (endIndex < totalLength) {
    footer.push(` Use start_index=${endIndex} to read more.`);
  }
  footer.push(`]`);

  return windowText + footer.join("");
}
920
+
921
+ // === SEARCH TEMPLATE RESOLUTION ==========================================
922
+
923
+ function resolveSearchTemplate(engine, query, region, safeSearch) {
924
+ const templateName = resolveEngineToTemplateName(engine);
925
+
926
+ let template;
927
+ if (templateName.startsWith("{")) {
928
+ try {
929
+ template = JSON.parse(templateName);
930
+ } catch (e) {
931
+ throw new Error(`Invalid inline JSON template: ${e.message}`);
267
932
  }
933
+ } else {
934
+ template = getTemplateByName(templateName);
935
+ }
268
936
 
269
- const totalLength = finalContent.length;
270
- let paginatedText = finalContent.substring(
271
- startIndex,
272
- startIndex + maxLength,
937
+ if (!template.url_template) {
938
+ throw new Error(
939
+ `Template '${template.name}' is not a search template (no url_template).`,
273
940
  );
941
+ }
274
942
 
275
- let metadata = `\n\n---\n[Document Info: Showing characters ${startIndex} to ${startIndex + paginatedText.length} of ${totalLength} total.`;
276
- if (startIndex + maxLength < totalLength) {
277
- metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
278
- }
279
- metadata += `]`;
943
+ const params = mapSearchParams(engine, query, region, safeSearch);
944
+ let url = resolveUrlTemplate(template, params);
280
945
 
281
- return paginatedText + metadata;
282
- } finally {
283
- await page.close();
284
- await context.close();
946
+ // Google safe_search: append safe=active to URL
947
+ if (
948
+ (engine === "google" || templateName === "google-search") &&
949
+ safeSearch === true
950
+ ) {
951
+ url += "&safe=active";
285
952
  }
953
+
954
+ return { template, url };
286
955
  }
287
956
 
288
- const server = new McpServer({ name: "searchfetch", version: "2.0.0" });
957
+ // === MCP SERVER & TOOLS ==================================================
958
+
959
+ const server = new McpServer({ name: "searchfetch", version: "3.0.1" });
960
+
961
+ // --- websearch tool ---
289
962
 
290
963
  server.registerTool(
291
964
  "websearch",
@@ -296,38 +969,67 @@ server.registerTool(
296
969
  inputSchema: z.object({
297
970
  query: z.string().describe("The search query string."),
298
971
  engine: z
299
- .enum(["duckduckgo", "google"])
972
+ .string()
300
973
  .default("duckduckgo")
301
974
  .describe(
302
975
  "Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
303
976
  ),
304
- max_results: z
305
- .number()
306
- .default(10)
307
- .describe("Maximum number of results to return. Default is 10."),
308
977
  region: z
309
978
  .string()
310
- .default("wt-wt")
979
+ .nullable()
980
+ .default(null)
311
981
  .describe(
312
- "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is 'wt-wt' (global/US English).",
982
+ "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
313
983
  ),
314
984
  safe_search: z
315
- .string()
316
- .default("-1")
985
+ .boolean()
986
+ .nullable()
987
+ .default(null)
988
+ .describe(
989
+ "Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
990
+ ),
991
+ max_results: z
992
+ .number()
993
+ .default(10)
994
+ .describe("Maximum number of search results to return. Default is 10."),
995
+ block_media: z
996
+ .boolean()
997
+ .default(true)
317
998
  .describe(
318
- "Safe search filtering mode. '-1' for Moderate, '1' for Strict, '-2' for Off. Default is '-1'. Note: Only applies to DuckDuckGo.",
999
+ "Block images, videos, and fonts entirely at the network layer. Default is true.",
319
1000
  ),
320
1001
  }),
321
1002
  },
322
- async ({ query, max_results, region, safe_search, engine }) => {
1003
+ async ({ query, engine, region, safe_search, max_results, block_media }) => {
323
1004
  try {
324
- const result = await executeSearch(
1005
+ // 1. Resolve search template (+ url_params mapping + url building)
1006
+ const { template, url } = resolveSearchTemplate(
1007
+ engine,
325
1008
  query,
326
- max_results,
327
1009
  region,
328
1010
  safe_search,
329
- engine,
330
1011
  );
1012
+
1013
+ // 2. Fetch
1014
+ const html = await fetchHtmlWithRetry(url, template, block_media);
1015
+
1016
+ // 3. Extract
1017
+ const $ = cheerio.load(html);
1018
+ applyRemove($, template);
1019
+
1020
+ const pageOrigin = new URL(url).origin;
1021
+ const context = {
1022
+ origin: pageOrigin,
1023
+ isWebsearch: true,
1024
+ maxResultsOverride: max_results,
1025
+ _maxResultsConsumed: false,
1026
+ };
1027
+
1028
+ const extracted = extractTemplate($, template, context);
1029
+
1030
+ // 4. Compose
1031
+ const result = composeSearchResults(extracted);
1032
+
331
1033
  return { content: [{ type: "text", text: result }] };
332
1034
  } catch (err) {
333
1035
  return {
@@ -338,6 +1040,8 @@ server.registerTool(
338
1040
  },
339
1041
  );
340
1042
 
1043
+ // --- webfetch tool ---
1044
+
341
1045
  server.registerTool(
342
1046
  "webfetch",
343
1047
  {
@@ -345,22 +1049,20 @@ server.registerTool(
345
1049
  description:
346
1050
  "Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
347
1051
  inputSchema: z.object({
348
- url: z
349
- .url()
350
- .describe(
351
- "The full URL of the webpage to fetch (must start with http/https).",
352
- ),
353
- format: z
354
- .enum(["markdown", "clean_html", "raw_html"])
355
- .default("markdown")
1052
+ url: z.string().describe(
1053
+ "The full URL of the webpage to fetch (must start with http/https).",
1054
+ ),
1055
+ template: z
1056
+ .string()
1057
+ .default("auto")
356
1058
  .describe(
357
- "Output format. Set to 'markdown', 'clean_html', or 'raw_html'. Default is 'markdown' (highly recommended to save context tokens).",
1059
+ "Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
358
1060
  ),
359
1061
  start_index: z
360
1062
  .number()
361
1063
  .default(0)
362
1064
  .describe(
363
- "Character offset to start reading from for pagination. Use this if a document is too large to fit in the context window. Default is 0.",
1065
+ "Character offset for pagination. Default: 0.",
364
1066
  ),
365
1067
  max_length: z
366
1068
  .number()
@@ -372,20 +1074,50 @@ server.registerTool(
372
1074
  .boolean()
373
1075
  .default(true)
374
1076
  .describe(
375
- "Block images, videos, and fonts entirely at the network layer to drastically speed up page loads and dodge tracking pixels. Default is true.",
1077
+ "Block images, videos, and fonts entirely at the network layer. Default is true.",
376
1078
  ),
377
1079
  }),
378
1080
  },
379
- async ({ url, format, start_index, max_length, block_media }) => {
1081
+ async ({ url, template: templateParam, start_index, max_length, block_media }) => {
380
1082
  try {
381
- const result = await executeFetch(
382
- url,
383
- format,
384
- start_index,
385
- max_length,
386
- block_media,
387
- );
388
- return { content: [{ type: "text", text: result }] };
1083
+ // 1. Resolve template
1084
+ let template;
1085
+
1086
+ if (templateParam.startsWith("{")) {
1087
+ try {
1088
+ template = JSON.parse(templateParam);
1089
+ } catch (e) {
1090
+ throw new Error(`Invalid inline JSON template: ${e.message}`);
1091
+ }
1092
+ } else if (templateParam === "auto") {
1093
+ template = detectTemplateByUrl(url);
1094
+ } else {
1095
+ template = getTemplateByName(templateParam);
1096
+ }
1097
+
1098
+ // 2. Fetch
1099
+ const html = await fetchHtmlWithRetry(url, template, block_media);
1100
+
1101
+ // 3. Extract and compose
1102
+ const $ = cheerio.load(html);
1103
+
1104
+ if (template) {
1105
+ applyRemove($, template);
1106
+
1107
+ const pageOrigin = new URL(url).origin;
1108
+ const context = {
1109
+ origin: pageOrigin,
1110
+ isWebsearch: false,
1111
+ };
1112
+
1113
+ const extracted = extractTemplate($, template, context);
1114
+ const result = composeSections(extracted, template, start_index, max_length);
1115
+ return { content: [{ type: "text", text: result }] };
1116
+ } else {
1117
+ // Generic fallback
1118
+ const result = genericFallback($, start_index, max_length);
1119
+ return { content: [{ type: "text", text: result }] };
1120
+ }
389
1121
  } catch (err) {
390
1122
  return {
391
1123
  content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
@@ -395,6 +1127,8 @@ server.registerTool(
395
1127
  },
396
1128
  );
397
1129
 
1130
+ // === MAIN =================================================================
1131
+
398
1132
  async function main() {
399
1133
  await ensureBinary();
400
1134
  process.stdout.write = originalStdoutWrite;