searchfetch 1.0.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
 
3
+ // === STDOUT/STDERR REDIRECTION ===========================================
3
4
  const originalStdoutWrite = process.stdout.write.bind(process.stdout);
4
5
  process.stdout.write = (chunk, encoding, callback) => {
5
6
  return process.stderr.write(chunk, encoding, callback);
@@ -7,6 +8,10 @@ process.stdout.write = (chunk, encoding, callback) => {
7
8
// Send console.log / console.info output through console.error instead.
// NOTE(review): presumably this keeps stdout free for the MCP stdio transport — confirm.
for (const method of ["log", "info"]) {
  console[method] = (...args) => console.error(...args);
}
9
10
 
11
+ // === IMPORTS =============================================================
12
+ import { readdirSync, readFileSync } from "node:fs";
13
+ import { dirname, join } from "node:path";
14
+ import { fileURLToPath } from "node:url";
10
15
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
11
16
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
12
17
  import { z } from "zod";
@@ -14,37 +19,47 @@ import { launch, ensureBinary } from "cloakbrowser";
14
19
  import * as cheerio from "cheerio";
15
20
  import TurndownService from "turndown";
16
21
 
17
- const logger = {
18
- info: (msg) => console.error(`[INFO] ${msg}`),
19
- warn: (msg) => console.error(`[WARN] ${msg}`),
20
- error: (msg, err) => console.error(`[ERROR] ${msg}`, err || ""),
21
- };
22
-
23
- // ==========================================
24
- // BROWSER LIFECYCLE MANAGEMENT
25
- // ==========================================
22
+ // === BROWSER MANAGER =====================================================
26
23
/**
 * Lazily launches a single browser instance and hands it out to callers.
 *
 * - `browser` holds the current instance, or null when none is available.
 * - `launchPromise` holds the in-flight launch so concurrent callers share
 *   one launch instead of starting several browsers.
 */
class BrowserManager {
  constructor() {
    this.browser = null;
    this.launchPromise = null;
  }

  /**
   * Return the connected browser, launching one if necessary.
   *
   * @returns {Promise<object>} the browser instance produced by `launch`.
   * @throws whatever `launch` rejects with; the failed launch is not cached,
   *   so the next call starts a fresh attempt.
   */
  async getBrowser() {
    if (this.browser && this.browser.isConnected()) return this.browser;
    if (this.launchPromise) return this.launchPromise;

    this.launchPromise = launch({
      headless: true,
      humanize: true,
      args: [
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
      ],
    })
      .then((browser) => {
        this.browser = browser;
        browser.on("disconnected", () => {
          // Only forget the instance this handler belongs to; a stale
          // handler must not clear a browser that replaced it.
          if (this.browser === browser) {
            this.browser = null;
          }
        });
        return browser;
      })
      .finally(() => {
        // Success or failure, the launch is no longer in flight.
        this.launchPromise = null;
      });

    return this.launchPromise;
  }

  /**
   * Close the browser if there is one. A launch that is still in flight is
   * awaited first so the browser it produces is closed rather than leaked.
   */
  async close() {
    if (this.launchPromise) {
      try {
        await this.launchPromise;
      } catch {
        // The launch failed, so there is nothing to close.
      }
    }

    const browser = this.browser;
    if (browser) {
      this.browser = null;
      await browser.close();
    }
  }
}
@@ -52,358 +67,1039 @@ class BrowserManager {
52
67
// Module-wide browser manager instance.
const browserManager = new BrowserManager();

// Termination handler: close the browser, then exit with status 0.
const cleanup = () => browserManager.close().then(() => process.exit(0));

for (const signal of ["SIGINT", "SIGTERM"]) {
  process.on(signal, cleanup);
}
61
75
 
62
- // ==========================================
63
- // CORE LOGIC: SEARCH & FETCH
64
- // ==========================================
76
+ // === TURNDOWN ============================================================
77
// Shared HTML-to-Markdown converter: ATX headings, fenced code blocks, "*" emphasis.
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced", emDelimiter: "*" });
82
+
83
+ // === BUILT-IN TEMPLATES (loaded from templates/*.json) ====================
65
84
 
66
- async function executeSearch(query, maxResults, region, safeSearch, engine) {
67
- logger.info(
68
- `Searching ${engine.toUpperCase()} via Stealth Browser for: "${query}"`,
69
- );
85
// Derive this module's file path and directory from its URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Template JSON files are read from a "templates" folder next to this module.
const TEMPLATES_DIR = join(__dirname, "templates");
88
+
89
/**
 * Load every `*.json` template found in TEMPLATES_DIR.
 *
 * Files are read in sorted filename order, then ordered by their numeric
 * "order" field (missing order counts as 999) so URL-pattern matching is
 * deterministic.
 *
 * @returns {object[]} the parsed templates, each with a string `name`.
 * @throws {Error} if the directory cannot be read, contains no JSON files,
 *   or a file is unreadable, is not valid JSON, or is not an object with a
 *   non-empty string `name`.
 */
function loadBuiltinTemplates() {
  let entries;
  try {
    entries = readdirSync(TEMPLATES_DIR);
  } catch (err) {
    throw new Error(
      `Cannot read templates directory '${TEMPLATES_DIR}': ${err.message}`,
      { cause: err },
    );
  }

  const jsonFiles = entries.filter((f) => f.endsWith(".json")).sort();
  if (jsonFiles.length === 0) {
    throw new Error(`No template JSON files found in '${TEMPLATES_DIR}'`);
  }

  const templates = jsonFiles.map((file) => {
    const filePath = join(TEMPLATES_DIR, file);

    let content;
    try {
      content = readFileSync(filePath, "utf-8");
    } catch (err) {
      throw new Error(
        `Cannot read template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    let template;
    try {
      template = JSON.parse(content);
    } catch (err) {
      throw new Error(
        `Invalid JSON in template file '${filePath}': ${err.message}`,
        { cause: err },
      );
    }

    // JSON such as `null`, `42` or `"text"` parses fine but is not a template.
    const isObject = template !== null && typeof template === "object";
    if (!isObject || !template.name || typeof template.name !== "string") {
      throw new Error(
        `Template file '${filePath}' is missing a valid "name" field`,
      );
    }
    return template;
  });

  // Sort by "order" field for deterministic URL-pattern matching
  templates.sort((a, b) => (a.order ?? 999) - (b.order ?? 999));

  return templates;
}
134
+
135
// Templates are loaded once, when this module is evaluated.
const BUILTIN_TEMPLATES = loadBuiltinTemplates();

// === TEMPLATE LOOKUP =====================================================
// Index of templates by name; if two share a name, the later one wins.
const TEMPLATE_MAP = new Map(BUILTIN_TEMPLATES.map((t) => [t.name, t]));
142
+
143
/**
 * Look up a template by its exact name.
 *
 * @param {string} name - template name to find in TEMPLATE_MAP.
 * @returns {object} the matching template.
 * @throws {Error} listing the available names when `name` is unknown.
 */
function getTemplateByName(name) {
  const found = TEMPLATE_MAP.get(name);
  if (found) return found;

  const available = Array.from(TEMPLATE_MAP.keys()).join(", ");
  throw new Error(`Unknown template '${name}'. Available: ${available}`);
}
153
+
154
/**
 * Find the first built-in template with a `url_patterns` entry matching `url`.
 *
 * Templates are tried in BUILTIN_TEMPLATES order. Each pattern is compiled as
 * a regular expression; patterns that fail to compile are skipped.
 *
 * @param {string} url - URL to test.
 * @returns {object|null} the first matching template, or null.
 */
function detectTemplateByUrl(url) {
  const matchesUrl = (pattern) => {
    try {
      return new RegExp(pattern).test(url);
    } catch (_) {
      // Skip invalid regex
      return false;
    }
  };

  for (const candidate of BUILTIN_TEMPLATES) {
    const patterns = candidate.url_patterns;
    if (!patterns) continue;
    for (const pattern of patterns) {
      if (matchesUrl(pattern)) return candidate;
    }
  }
  return null;
}
169
+
170
+ // === URL TEMPLATE RESOLUTION =============================================
171
+
172
/**
 * Expand the `{name}` placeholders in `template.url_template`.
 *
 * For each placeholder the value is taken, in order of preference, from
 * `providedParams[name]` (when not null/undefined), from the parameter's
 * `default` in `template.url_params`, or "" when the parameter is optional.
 * Values of parameters declared with `encode: "url"` are passed through
 * encodeURIComponent.
 *
 * All placeholders are substituted in one pass with a replacer function, so
 * a substituted value is never rescanned as template text and `$` sequences
 * in values are inserted literally.
 *
 * @param {object} template - template with `name`, `url_template` and
 *   optional `url_params` ({ default, required, encode } per parameter).
 * @param {object} providedParams - caller-supplied parameter values.
 * @returns {string|null} the resolved URL, or null when the template has no
 *   `url_template`.
 * @throws {Error} when a required parameter has neither a value nor a default.
 */
function resolveUrlTemplate(template, providedParams) {
  const urlTemplate = template.url_template;
  if (!urlTemplate) return null;

  const urlParams = template.url_params || {};
  const provided = providedParams || {};

  const resolved = urlTemplate.replace(/\{(\w+)\}/g, (_placeholder, name) => {
    const def = urlParams[name] || {};
    const supplied = Object.prototype.hasOwnProperty.call(provided, name)
      ? provided[name]
      : undefined;

    let value;
    if (supplied !== null && supplied !== undefined) {
      value = String(supplied);
    } else if (def.default !== undefined) {
      value = String(def.default);
    } else if (def.required) {
      throw new Error(
        `Required URL parameter '${name}' not provided for template '${template.name}'.`,
      );
    } else {
      value = "";
    }

    return def.encode === "url" ? encodeURIComponent(value) : value;
  });

  // Tidy separators left behind by parameters that resolved to nothing.
  return resolved.replace(/&{2,}/g, "&").replace(/\?&/, "?");
}
212
+
213
+ // === SEARCH PARAM MAPPING ================================================
214
+
215
/**
 * Translate a search-engine alias into its template name.
 * Values other than "duckduckgo" and "google" are returned unchanged.
 */
function resolveEngineToTemplateName(engine) {
  switch (engine) {
    case "duckduckgo":
      return "duckduckgo-search";
    case "google":
      return "google-search";
    default:
      return engine;
  }
}
220
+
221
/**
 * Build the URL-parameter object for a search request.
 *
 * Always contains `query`. For the DuckDuckGo template, `region` becomes `kl`
 * and `safeSearch` true/false becomes `kp` "1" / "-2". For the Google
 * template, `region` is split on "-" into `hl` (first part) and `gl` (second
 * part, or the first when there is only one).
 *
 * @returns {object} parameters keyed by template placeholder name.
 */
function mapSearchParams(engine, query, region, safeSearch) {
  const params = { query };
  const hasRegion = region !== null && region !== undefined;

  switch (resolveEngineToTemplateName(engine)) {
    case "duckduckgo-search":
      if (hasRegion) params.kl = region;
      if (safeSearch === true) {
        params.kp = "1";
      } else if (safeSearch === false) {
        params.kp = "-2";
      }
      break;

    case "google-search":
      if (hasRegion) {
        const [first, second = first] = region.split("-");
        params.hl = first;
        params.gl = second;
      }
      break;

    default:
      break;
  }

  return params;
}
244
+
245
+ // === FETCH ===============================================================
246
+
247
/**
 * Heuristically decide whether a loaded document is a CAPTCHA or block page.
 *
 * The page counts as denied when its lower-cased <title> contains any title
 * marker, or when its whitespace-collapsed body text is shorter than 1200
 * characters and contains any body marker.
 *
 * @param {Function} $ - cheerio-style selector function for the document.
 * @returns {boolean}
 */
function isAccessDenied($) {
  const TITLE_MARKERS = [
    "captcha",
    "are you a robot",
    "access denied",
    "blocked",
    "forbidden",
    "unusual traffic",
    "sorry, you have been blocked",
    "verify you are human",
    "one more step",
    "security check",
    "ddos protection",
    "cloudflare",
  ];
  const BODY_MARKERS = [
    "to continue, please type the characters",
    "our systems have detected unusual traffic",
    "verify you are human",
    "are you a robot",
    "sorry, you have been blocked",
    "access denied",
  ];

  const pageTitle = ($("title").text() || "").toLowerCase();
  const visibleText = ($("body").text() || "").replace(/\s+/g, " ").trim().toLowerCase();

  for (const marker of TITLE_MARKERS) {
    if (pageTitle.includes(marker)) return true;
  }

  // Body markers are only trusted on short pages.
  if (visibleText.length >= 1200) return false;
  return BODY_MARKERS.some((marker) => visibleText.includes(marker));
}
281
+
282
/**
 * Load `url` in a fresh browser context and return the page's HTML.
 *
 * @param {string} url - address to navigate to.
 * @param {object|null} template - optional template; its `cookies` are added
 *   to the context and its `block_resources` overrides the default blocked
 *   resource types.
 * @param {boolean} blockMedia - when true, requests whose resource type is
 *   blocked (default: image, media, font) are aborted.
 * @returns {Promise<string>} the page content after navigation.
 * @throws {Error} "Access denied…" on HTTP 401/403/429 or when the page looks
 *   like a CAPTCHA/block page; any navigation error other than a timeout.
 */
async function fetchHtml(url, template, blockMedia) {
  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  try {
    // Pre-load cookies from template
    if (template && template.cookies && template.cookies.length > 0) {
      await context.addCookies(template.cookies);
    }

    const page = await context.newPage();

    try {
      // Route blocked resource types
      if (blockMedia) {
        const blockedTypes =
          template && template.block_resources
            ? template.block_resources
            : ["image", "media", "font"];

        if (blockedTypes.length > 0) {
          await page.route("**/*", (route) => {
            const type = route.request().resourceType();
            if (blockedTypes.includes(type)) {
              route.abort();
            } else {
              route.continue();
            }
          });
        }
      }

      let response;
      try {
        response = await page.goto(url, {
          waitUntil: "networkidle",
          timeout: 15000,
        });
      } catch (navError) {
        // A timeout still leaves a partially rendered page worth reading.
        // Any other failure must propagate so the caller can retry or
        // report it instead of receiving an empty document.
        if (!navError || navError.name !== "TimeoutError") {
          throw navError;
        }
      }

      // Check HTTP status for access failures
      if (response) {
        const status = response.status();
        if ([401, 403, 429].includes(status)) {
          throw new Error(
            `Access denied: HTTP ${status} when fetching ${url}`,
          );
        }
      }

      const pageContent = await page.content();

      // Check for CAPTCHA / access-denied pages
      const $ = cheerio.load(pageContent);
      if (isAccessDenied($)) {
        throw new Error(
          `Access denied: CAPTCHA or block page detected at ${url}. The site is blocking automated access.`,
        );
      }

      return pageContent;
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}
105
352
 
353
/**
 * Call fetchHtml, retrying once when the first attempt fails with what looks
 * like a network error (message contains "net::", "ERR_" or
 * "Navigation failed"). Any other failure, or a failed retry, is rethrown.
 */
async function fetchHtmlWithRetry(url, template, blockMedia) {
  const NETWORK_MARKERS = ["net::", "ERR_", "Navigation failed"];

  try {
    return await fetchHtml(url, template, blockMedia);
  } catch (firstError) {
    const looksLikeNetworkError = NETWORK_MARKERS.some((marker) =>
      firstError.message.includes(marker),
    );
    if (!looksLikeNetworkError) throw firstError;
  }

  // Network error — retry once
  return fetchHtml(url, template, blockMedia);
}
116
374
 
117
- const pageContent = await page.content();
118
- const $ = cheerio.load(pageContent);
375
+ // === HTML CLEANUP ========================================================
119
376
 
120
- if (engine === "google") {
121
- // Google's core organic result selector
122
- $("div.g").each((i, el) => {
123
- if (results.length >= maxResults) return;
377
// Selectors stripped from a document when the template supplies none.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "svg",
  "nav",
  "footer",
  "noscript",
  "iframe",
  ".advertisement",
];

/**
 * Remove unwanted nodes and attributes from the document, in place.
 *
 * Uses the template's non-empty `remove` list, or DEFAULT_REMOVE_SELECTORS
 * otherwise. Afterwards every `style` attribute and every `src` attribute
 * starting with "data:image" is dropped.
 *
 * @param {Function} $ - cheerio-style selector function for the document.
 * @param {object|null} template - optional template providing `remove`.
 */
function applyRemove($, template) {
  const custom = template?.remove;
  const selectors = custom?.length > 0 ? custom : DEFAULT_REMOVE_SELECTORS;

  selectors.forEach((selector) => {
    try {
      $(selector).remove();
    } catch (_) {
      // Skip invalid selectors
    }
  });

  // Strip style attributes and data:image src
  $("[style]").removeAttr("style");
  $("*").each((_i, node) => {
    const $node = $(node);
    const src = $node.attr("src");
    if (src && src.startsWith("data:image")) {
      $node.removeAttr("src");
    }
  });
}
138
405
 
139
- // Isolate snippet text safely
140
- const cloned = $(el).clone();
141
- cloned.find("h3, a, script, style, cite").remove();
142
- const snippet = cloned.text().replace(/\s+/g, " ").trim();
406
+ // === EXTRACTION ENGINE ===================================================
143
407
 
144
- if (title && link && link.startsWith("http")) {
145
- results.push({ position: results.length + 1, title, link, snippet });
146
- }
147
- });
148
- } else {
149
- // DuckDuckGo selector
150
- $(".result").each((i, el) => {
151
- if (results.length >= maxResults) return;
408
/**
 * Locate elements for `selector` relative to `$parent`.
 *
 * Lookup order: descendants of `$parent`, then the closest ancestor matching
 * the selector, then the subtrees of up to four successive ancestors. A blank
 * selector returns `$parent` itself; when nothing matches, an empty selection
 * is returned.
 */
function findScoped($parent, selector) {
  if (!selector || selector.trim() === "") return $parent;

  // 1. Descendants
  const below = $parent.find(selector);
  if (below.length > 0) return below;

  // 2. Closest ancestor matching selector
  const above = $parent.closest(selector);
  if (above.length > 0) return above;

  // 3. Ancestor subtrees (up to 4 levels up)
  let levelsLeft = 4;
  let scope = $parent.parent();
  while (levelsLeft > 0 && scope.length > 0) {
    const nearby = scope.find(selector);
    if (nearby.length > 0) return nearby;
    scope = scope.parent();
    levelsLeft -= 1;
  }

  return $parent.find("__nonexistent__");
}
152
435
 
153
- const titleEl = $(el).find(".result__title a");
154
- const snippetEl = $(el).find(".result__snippet");
155
- if (!titleEl.length) return;
436
/**
 * Resolve a comma-separated list of selectors against `$parent`, trying each
 * with findScoped in the order written; the first selector that yields
 * elements wins. A blank list returns `$parent`; no match returns an empty
 * selection.
 */
function findFirstMatch($parent, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") return $parent;

  for (const piece of selectorStr.split(",")) {
    const candidate = piece.trim();
    if (!candidate) continue;

    const found = findScoped($parent, candidate);
    if (found.length > 0) return found;
  }

  return $parent.find("__nonexistent__");
}
159
456
 
160
- if (link.includes("/l/?uddg=")) {
457
/**
 * Resolve the elements for a top-level section across the whole document.
 *
 * The comma-separated selectors are tried in order and the first one that
 * matches anything is used; selectors that throw are skipped. A blank
 * selector string selects <body>; no match returns an empty selection.
 */
function resolveTopElements($, selectorStr) {
  if (!selectorStr || selectorStr.trim() === "") return $("body");

  for (const piece of selectorStr.split(",")) {
    const candidate = piece.trim();
    if (!candidate) continue;

    try {
      const found = $(candidate);
      if (found.length > 0) return found;
    } catch (_) {
      // Skip invalid selectors
    }
  }

  return $();
}
481
+
482
+ // === TRANSFORMS ==========================================================
483
+
484
/**
 * Apply one transform, or an array of transforms in order, to a string.
 *
 * Supported transforms:
 * - "strip": trim surrounding whitespace.
 * - "decode_google_url": unwrap a "/url?q=<encoded>&…" redirect link.
 * - "decode_ddg_url": unwrap a "…/l/?uddg=<encoded>…" redirect link.
 * - "json_parse": re-serialize valid JSON with 2-space indentation.
 * - "resolve_href": make a root-relative path absolute against `origin`.
 * Unknown transforms are ignored, and a transform that cannot be applied
 * leaves the value unchanged.
 *
 * @param {string} value - the extracted value.
 * @param {string|string[]} transform - transform name(s).
 * @param {string} [origin] - base URL used by "resolve_href".
 * @returns {string} the transformed value.
 */
function applyTransform(value, transform, origin) {
  const transforms = Array.isArray(transform) ? transform : [transform];
  let result = value;

  for (const t of transforms) {
    // Nothing left to transform once the value is empty.
    if (!result) continue;

    switch (t) {
      case "strip":
        result = result.trim();
        break;

      case "decode_google_url":
        if (result.startsWith("/url?q=")) {
          try {
            const urlPart = result.split("/url?q=")[1].split("&")[0];
            result = decodeURIComponent(urlPart);
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      case "decode_ddg_url":
        if (result.includes("/l/?uddg=")) {
          const queryString = result.slice(result.indexOf("?") + 1);
          // URLSearchParams already percent-decodes the value. Decoding it a
          // second time would corrupt targets that contain "%xx" sequences.
          const uddg = new URLSearchParams(queryString).get("uddg");
          if (uddg) result = uddg;
        }
        break;

      case "json_parse":
        try {
          result = JSON.stringify(JSON.parse(result), null, 2);
        } catch (_) {
          // Leave as-is
        }
        break;

      case "resolve_href":
        // Only root-relative paths ("/x"); protocol-relative "//host" is kept.
        if (origin && result.startsWith("/") && !result.startsWith("//")) {
          try {
            result = new URL(result, origin).href;
          } catch (_) {
            // Leave as-is
          }
        }
        break;

      default:
        break;
    }
  }

  return result;
}
541
+
542
+ // === EXTRACTION ==========================================================
543
+
544
/**
 * Read one value from an element according to `section.format`.
 *
 * - "markdown": the element's inner HTML converted with turndown, runs of
 *   three or more newlines collapsed to two, trimmed.
 * - "attribute": the attribute named by `section.attribute`, or "".
 * - "html": the element's inner HTML, or "".
 * - "text" and anything else: the text with whitespace collapsed and trimmed.
 *
 * A non-empty value is then passed through `section.transform`, if set.
 */
function extractValue($el, section, origin) {
  const { format } = section;
  let extracted;

  if (format === "markdown") {
    const markup = $el.html() || "";
    extracted = turndown.turndown(markup).replace(/\n{3,}/g, "\n\n").trim();
  } else if (format === "attribute") {
    extracted = $el.attr(section.attribute) || "";
  } else if (format === "html") {
    extracted = $el.html() || "";
  } else {
    extracted = $el.text().replace(/\s+/g, " ").trim();
  }

  if (!section.transform || !extracted) return extracted;
  return applyTransform(extracted, section.transform, origin);
}
194
579
 
195
- async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
196
- logger.info(`Fetching URL: ${url} | Format: ${format}`);
580
/**
 * Extract a single child section relative to `$parentEl`.
 *
 * @returns {{type: "value", text: string}|null} the value read from the first
 *   matching element, or null when an optional section matches nothing.
 * @throws {Error} when a required section matches nothing.
 */
function extractChildSection($, $parentEl, section, origin) {
  const matched = findFirstMatch($parentEl, section.selector);
  const hasMatch = Boolean(matched) && matched.length !== 0;

  if (hasMatch) {
    const text = extractValue(matched.eq(0), section, origin);
    return { type: "value", text };
  }

  if (section.required) {
    throw new Error(
      `Required section '${section.name}' not found on page.`,
    );
  }
  return null;
}
600
+
601
/**
 * Extract one top-level section from the document.
 *
 * Result shapes, all carrying the originating `section`:
 * - `{ type: "value", text }` for a single element without children
 * - `{ type: "children", items: {name: text} }` for a single element with children
 * - `{ type: "list", items: string[] }` for multiple elements without children
 * - `{ type: "children-multiple", items: object[] }` for multiple elements with children
 *
 * For `multiple` sections the element count is capped by `section.max_items`.
 * On a websearch context, the first multiple-with-children section is also
 * capped by `context.maxResultsOverride`, and `context._maxResultsConsumed`
 * is then set so later sections are not affected.
 *
 * @returns {object|null} a result as above, or null when an optional section
 *   matches nothing.
 * @throws {Error} when a required section matches nothing.
 */
function extractSection($, section, context) {
  const matched = resolveTopElements($, section.selector);

  if (!matched || matched.length === 0) {
    if (section.required) {
      throw new Error(
        `Required section '${section.name}' not found on page.`,
      );
    }
    return null;
  }

  const hasChildren = Boolean(section.children && section.children.length > 0);

  // Gather the child values for one parent element, keyed by child name.
  const collectChildren = (parentEl) => {
    const values = {};
    for (const child of section.children) {
      const extracted = extractChildSection($, parentEl, child, context.origin);
      if (
        extracted &&
        extracted.type === "value" &&
        extracted.text !== null &&
        extracted.text !== undefined
      ) {
        values[child.name] = extracted.text;
      }
    }
    return values;
  };

  // Determine limit
  let limit = matched.length;
  if (section.multiple && section.max_items) {
    limit = Math.min(limit, section.max_items);
  }
  // Override max_items with max_results for first multiple+children section
  const applyOverride =
    context.isWebsearch &&
    context.maxResultsOverride &&
    !context._maxResultsConsumed &&
    section.multiple &&
    hasChildren;
  if (applyOverride) {
    limit = Math.min(limit, context.maxResultsOverride);
    context._maxResultsConsumed = true;
  }

  if (!section.multiple) {
    // Single parent
    const first = matched.eq(0);
    if (hasChildren) {
      // Single parent with children — parent format ignored
      return { section, type: "children", items: collectChildren(first) };
    }
    return { section, type: "value", text: extractValue(first, section, context.origin) };
  }

  const items = [];
  for (let index = 0; index < limit; index++) {
    const current = matched.eq(index);

    if (hasChildren) {
      // Multiple parents, each with children
      const childValues = collectChildren(current);
      if (Object.keys(childValues).length > 0) {
        items.push(childValues);
      }
      continue;
    }

    // Multiple parents, no children
    const text = extractValue(current, section, context.origin);
    if (text && text.trim()) {
      items.push(text.trim());
    }
  }

  return { section, type: hasChildren ? "children-multiple" : "list", items };
}
687
+
688
/**
 * Run extractSection for every section of a template, in order.
 *
 * Sections that produce null are omitted. An error whose message contains
 * "Required section" aborts the extraction; any other section error is
 * ignored and that section is skipped.
 *
 * @returns {object[]} the section results that were produced.
 */
function extractTemplate($, template, context) {
  const collected = [];

  for (const section of template.sections) {
    let outcome;
    try {
      outcome = extractSection($, section, context);
    } catch (err) {
      const isRequiredFailure =
        err.message && err.message.includes("Required section");
      if (isRequiredFailure) throw err;
      // Non-required failures are silently skipped
      continue;
    }

    if (outcome !== null) {
      collected.push(outcome);
    }
  }

  return collected;
}
242
710
 
243
- // Remove data URIs anywhere else in the document
244
- $("*").each((i, el) => {
245
- const src = $(el).attr("src");
246
- if (src && src.startsWith("data:image")) $(el).removeAttr("src");
247
- });
711
+ // === COMPOSITION: WEBFETCH ===============================================
248
712
 
249
- if (format === "clean_html") {
250
- finalContent = $.html();
251
- } else if (format === "markdown") {
252
- const turndownService = new TurndownService({
253
- headingStyle: "atx",
254
- codeBlockStyle: "fenced",
255
- });
256
- finalContent = turndownService.turndown($.html());
257
- finalContent = finalContent.replace(/\n{3,}/g, "\n\n").trim();
713
/**
 * Tell whether a multi-item result looks like a list of comments: its first
 * item has (case-insensitively) an "author" or "user" key together with a
 * "comment" or "body" key.
 */
function isCommentStyle(result) {
  if (!result.items || result.items.length === 0) return false;

  const keys = new Set(
    Object.keys(result.items[0]).map((key) => key.toLowerCase()),
  );
  const hasWho = keys.has("author") || keys.has("user");
  const hasWhat = keys.has("comment") || keys.has("body");
  return hasWho && hasWhat;
}
722
+
723
/**
 * Render extracted section results as Markdown and return one page of it.
 *
 * Each non-empty result becomes a "## <name>" block; blocks are joined with
 * "---" rules. The combined text is sliced to `maxLength` characters starting
 * at `startIndex`, and a footer is appended that reports the template name
 * ("auto" when `template` is falsy), the slice shown and, when more text
 * remains, the `start_index` to continue from.
 *
 * @returns {string} the page plus footer, or a fixed placeholder when nothing
 *   was extracted.
 */
function composeSections(extracted, template, startIndex, maxLength) {
  const block = (title, bodyText) => `## ${title}\n\n${bodyText}`;
  // Trimmed string form of a value, or "" when it is empty or blank.
  const cleaned = (raw) => (raw && String(raw).trim() ? String(raw).trim() : "");

  // Comment-style items render as "**author:**" followed by the comment.
  const renderComments = (result) => {
    const entries = [];
    for (const item of result.items) {
      const author =
        item["Author"] || item["author"] || item["User"] || item["user"] || "";
      const comment =
        item["Comment"] || item["Body"] || item["comment"] || item["body"] || "";
      if (author) {
        entries.push(`**${author}:**\n\n${comment}`);
      } else if (comment) {
        entries.push(comment);
      }
    }
    return entries.length > 0
      ? [block(result.section.name, entries.join("\n\n---\n\n"))]
      : [];
  };

  // Any other item renders as "key: value" lines, one group per item.
  const renderRecords = (result) => {
    const groups = [];
    for (const item of result.items) {
      const lines = [];
      for (const [key, raw] of Object.entries(item)) {
        const text = cleaned(raw);
        if (text) lines.push(` ${key}: ${text}`);
      }
      if (lines.length > 0) groups.push(lines.join("\n"));
    }
    return groups.length > 0
      ? [block(result.section.name, groups.join("\n\n"))]
      : [];
  };

  const parts = [];
  for (const result of extracted) {
    switch (result.type) {
      case "value": {
        const text = cleaned(result.text);
        if (text) parts.push(block(result.section.name, text));
        break;
      }
      case "list":
        if (result.items && result.items.length > 0) {
          const bullets = result.items.map((item) => `- ${item}`).join("\n");
          parts.push(block(result.section.name, bullets));
        }
        break;
      case "children":
        if (result.items) {
          for (const [childName, raw] of Object.entries(result.items)) {
            const text = cleaned(raw);
            if (text) parts.push(block(childName, text));
          }
        }
        break;
      case "children-multiple":
        if (result.items && result.items.length > 0) {
          const rendered = isCommentStyle(result)
            ? renderComments(result)
            : renderRecords(result);
          parts.push(...rendered);
        }
        break;
      default:
        break;
    }
  }

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  const full = parts.join("\n\n---\n\n");
  const totalLength = full.length;
  const endIndex = startIndex + maxLength;
  const paginated = full.substring(startIndex, endIndex);

  const templateName = template ? template.name : "auto";
  const more =
    endIndex < totalLength ? ` Use start_index=${endIndex} to read more.` : "";

  return (
    paginated +
    `\n\n---\n[webfetch: template="${templateName}", showing characters ${startIndex} to ${startIndex + paginated.length} of ${totalLength} total.` +
    more +
    `]`
  );
}
267
803
 
268
- let metadata = `\n\n---\n[Document Info: Showing characters ${startIndex} to ${
269
- startIndex + paginatedText.length
270
- } of ${totalLength} total.`;
804
+ // === COMPOSITION: WEBSEARCH ==============================================
271
805
 
272
- if (isTruncated) {
273
- metadata += ` Use start_index=${startIndex + maxLength} to paginate and read more.`;
806
/**
 * Render search results as a numbered list under a "## <section name>" heading.
 *
 * The results are taken from the first "children-multiple" section; when
 * there is none (or it is empty) the output falls back to composeSections.
 * Per item, the title comes from "Title"/"title" or else the item's first
 * value. Items without a title are skipped. The URL line is omitted for
 * non-http links and for Google search/support links.
 *
 * Results are numbered consecutively in output order, so a skipped item does
 * not leave a gap in the numbering.
 *
 * @param {object[]} extracted - section results from extractTemplate.
 * @returns {string} the formatted results, or a fixed placeholder when no
 *   item has a title.
 */
function composeSearchResults(extracted) {
  // Find the search results section (first children-multiple)
  const searchSection = extracted.find((r) => r.type === "children-multiple");

  if (!searchSection || !searchSection.items || searchSection.items.length === 0) {
    // Fall back to section-based output
    return composeSections(extracted, null, 0, Infinity);
  }

  const parts = [];

  for (const item of searchSection.items) {
    const title =
      item["Title"] || item["title"] || Object.values(item)[0] || "";
    if (!title) continue;

    const url = item["URL"] || item["url"] || item["Url"] || "";
    const snippet = item["Snippet"] || item["snippet"] || "";

    // Filter out non-http URLs and google internal links
    const isWebUrl = Boolean(url) && url.startsWith("http");
    const isGoogleInternal =
      isWebUrl &&
      (url.includes("google.com/search") || url.includes("support.google.com"));
    const cleanUrl = isWebUrl && !isGoogleInternal ? url : "";

    // Number by emitted results, not by raw item index.
    const lines = [`[${parts.length + 1}] ${title}`];
    if (cleanUrl) lines.push(` URL: ${cleanUrl}`);
    if (snippet) lines.push(` Snippet: ${snippet}`);

    parts.push(lines.join("\n"));
  }

  if (parts.length === 0) {
    return "(No content extracted from this page.)";
  }

  return `## ${searchSection.section.name}\n\n${parts.join("\n\n")}`;
}
283
857
 
284
- // ==========================================
285
- // MCP SERVER INIT & TOOL REGISTRATION
286
- // ==========================================
858
+ // === GENERIC FALLBACK ====================================================
287
859
 
288
- const server = new McpServer({
289
- name: "searchfetch",
290
- version: "1.3.0",
291
- });
860
/**
 * Template-less extraction: convert the whole <body> to Markdown and return
 * one page of it followed by a pagination footer.
 *
 * @param {Function} $ - Loaded document handle; `$("body").html()` is read
 *   after `applyRemove($, null)` has been applied to it.
 * @param {number} startIndex - Character offset of the first character to return.
 * @param {number} maxLength - Maximum number of characters in the returned slice.
 * @returns {string} The slice plus a bracketed footer, or a placeholder when
 *   the converted body is empty.
 */
function genericFallback($, startIndex, maxLength) {
  applyRemove($, null);

  const rawBody = $("body").html() || "";
  const converted = turndown.turndown(rawBody);
  // Collapse runs of three or more newlines down to a single blank line.
  const text = converted.replace(/\n{3,}/g, "\n\n").trim();

  if (text.length === 0) {
    return "(No content extracted from this page.)";
  }

  const endIndex = startIndex + maxLength;
  const slice = text.substring(startIndex, endIndex);
  const hasMore = endIndex < text.length;

  const footer = [
    `\n\n---\n[webfetch: template="auto (fallback)", showing characters ${startIndex} to ${startIndex + slice.length} of ${text.length} total.`,
    hasMore ? ` Use start_index=${endIndex} to read more.` : "",
    `]`,
  ].join("");

  return slice + footer;
}
884
+
885
+ // === SEARCH TEMPLATE RESOLUTION ==========================================
886
+
887
/**
 * Resolve the search template for `engine` and build the concrete search URL.
 *
 * `engine` is first mapped through `resolveEngineToTemplateName`. A result
 * that starts with "{" is parsed as an inline JSON template; anything else is
 * looked up with `getTemplateByName`.
 *
 * @param {string} engine - Engine name (or inline JSON template) from the caller.
 * @param {string} query - Search query text.
 * @param {string|null} region - Region code forwarded to `mapSearchParams`.
 * @param {boolean|null} safeSearch - Safe-search flag forwarded to
 *   `mapSearchParams`; `true` additionally adds `safe=active` for Google.
 * @returns {{template: object, url: string}} The resolved template and URL.
 * @throws {Error} When inline JSON is malformed, or when the template is
 *   missing or has no `url_template`.
 */
function resolveSearchTemplate(engine, query, region, safeSearch) {
  const templateName = resolveEngineToTemplateName(engine);
  const isInline = templateName.startsWith("{");

  let template;
  if (isInline) {
    try {
      template = JSON.parse(templateName);
    } catch (e) {
      throw new Error(`Invalid inline JSON template: ${e.message}`);
    }
  } else {
    template = getTemplateByName(templateName);
  }

  // Guard against a missing template as well as a non-search one, so the
  // caller gets a descriptive error instead of a TypeError on `undefined`.
  // NOTE(review): getTemplateByName may already throw for unknown names — confirm.
  if (!template || !template.url_template) {
    const label = template?.name ?? (isInline ? "(inline JSON)" : templateName);
    throw new Error(
      `Template '${label}' is not a search template (no url_template).`,
    );
  }

  const params = mapSearchParams(engine, query, region, safeSearch);
  let url = resolveUrlTemplate(template, params);

  // Google safe_search: add safe=active to the query string.
  if (
    (engine === "google" || templateName === "google-search") &&
    safeSearch === true
  ) {
    // Insert before any fragment and choose the right separator, so the
    // parameter lands in the query string whatever shape the URL has.
    const hashAt = url.indexOf("#");
    const base = hashAt === -1 ? url : url.slice(0, hashAt);
    const fragment = hashAt === -1 ? "" : url.slice(hashAt);

    let separator = base.includes("?") ? "&" : "?";
    if (base.endsWith("?") || base.endsWith("&")) separator = "";

    url = `${base}${separator}safe=active${fragment}`;
  }

  return { template, url };
}
920
+
921
+ // === MCP SERVER & TOOLS ==================================================
922
+
923
const server = new McpServer({
  name: "searchfetch",
  version: "3.0.0",
});

// --- websearch tool ---

// Input schema for the `websearch` tool.
const websearchInputSchema = z.object({
  query: z.string().describe("The search query string."),
  engine: z
    .string()
    .default("duckduckgo")
    .describe(
      "Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
    ),
  region: z
    .string()
    .nullable()
    .default(null)
    .describe(
      "Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is null (uses template default).",
    ),
  safe_search: z
    .boolean()
    .nullable()
    .default(null)
    .describe(
      "Enable safe search filtering. null = use template default. Applies to both DuckDuckGo and Google.",
    ),
  max_results: z
    .number()
    .default(10)
    .describe("Maximum number of search results to return. Default is 10."),
  block_media: z
    .boolean()
    .default(true)
    .describe(
      "Block images, videos, and fonts entirely at the network layer. Default is true.",
    ),
});

/**
 * Handler for the `websearch` tool: resolve the search template and URL,
 * fetch the page, extract it with the template, and format the results.
 * Any thrown error is turned into an `isError` tool response.
 */
const handleWebsearch = async ({
  query,
  engine,
  region,
  safe_search,
  max_results,
  block_media,
}) => {
  try {
    // Template lookup, url_params mapping and URL building in one step.
    const resolved = resolveSearchTemplate(engine, query, region, safe_search);

    const pageHtml = await fetchHtmlWithRetry(
      resolved.url,
      resolved.template,
      block_media,
    );

    const $ = cheerio.load(pageHtml);
    applyRemove($, resolved.template);

    const extractionContext = {
      origin: new URL(resolved.url).origin,
      isWebsearch: true,
      maxResultsOverride: max_results,
      _maxResultsConsumed: false,
    };
    const sections = extractTemplate($, resolved.template, extractionContext);

    const text = composeSearchResults(sections);
    return { content: [{ type: "text", text }] };
  } catch (err) {
    return {
      content: [{ type: "text", text: `Search Error: ${err.message}` }],
      isError: true,
    };
  }
};

server.registerTool(
  "websearch",
  {
    title: "Web Search",
    description:
      "Search the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.",
    inputSchema: websearchInputSchema,
  },
  handleWebsearch,
);
336
1006
 
337
- server.tool(
1007
+ // --- webfetch tool ---
1008
+
1009
// Registers the `webfetch` tool: fetch a page, extract it with a template
// (auto-detected, built-in by name, or inline JSON) and return one page of
// text. Without a template the generic body-to-Markdown fallback is used.
server.registerTool(
  "webfetch",
  {
    title: "Web Fetch",
    description:
      "Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
    inputSchema: z.object({
      url: z.string().describe(
        "The full URL of the webpage to fetch (must start with http/https).",
      ),
      template: z
        .string()
        .default("auto")
        .describe(
          "Template to use: 'auto' (auto-detect from URL), a built-in name, or inline JSON.",
        ),
      start_index: z
        .number()
        .default(0)
        .describe(
          "Character offset for pagination. Default: 0.",
        ),
      max_length: z
        .number()
        .default(10000)
        .describe(
          "Maximum characters to return per request. Default is 10000.",
        ),
      block_media: z
        .boolean()
        .default(true)
        .describe(
          "Block images, videos, and fonts entirely at the network layer. Default is true.",
        ),
    }),
  },
  async ({ url, template: templateParam, start_index, max_length, block_media }) => {
    try {
      // 0. Validate the target before anything is fetched. The schema accepts
      //    any string, so enforce the documented http/https requirement here;
      //    otherwise file: and other schemes would be handed to the browser.
      let target;
      try {
        target = new URL(url);
      } catch {
        throw new Error(`Invalid URL: ${url}`);
      }
      if (target.protocol !== "http:" && target.protocol !== "https:") {
        throw new Error(
          `Unsupported URL scheme '${target.protocol}' (only http/https are allowed).`,
        );
      }

      // Normalize pagination so the slice and the reported offsets agree:
      // the schema accepts any number, including negatives and fractions.
      const startIndex = Math.max(0, Math.floor(start_index));
      const maxLength = Math.max(1, Math.floor(max_length));

      // 1. Resolve template
      const templateSpec = templateParam.trim();
      let template;

      if (templateSpec.startsWith("{")) {
        try {
          template = JSON.parse(templateSpec);
        } catch (e) {
          throw new Error(`Invalid inline JSON template: ${e.message}`);
        }
      } else if (templateSpec === "auto") {
        template = detectTemplateByUrl(url);
      } else {
        template = getTemplateByName(templateSpec);
      }

      // 2. Fetch
      const html = await fetchHtmlWithRetry(url, template, block_media);

      // 3. Extract and compose
      const $ = cheerio.load(html);

      if (template) {
        applyRemove($, template);

        const context = {
          origin: target.origin,
          isWebsearch: false,
        };

        const extracted = extractTemplate($, template, context);
        const result = composeSections(extracted, template, startIndex, maxLength);
        return { content: [{ type: "text", text: result }] };
      }

      // Generic fallback when no template applies
      const result = genericFallback($, startIndex, maxLength);
      return { content: [{ type: "text", text: result }] };
    } catch (err) {
      return {
        content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
        isError: true,
      };
    }
  },
);
387
1093
 
388
- // ==========================================
389
- // BOOTSTRAP
390
- // ==========================================
1094
+ // === MAIN =================================================================
391
1095
 
392
1096
/**
 * Bootstrap the MCP server on stdio.
 *
 * `ensureBinary()` runs first, while `process.stdout.write` is still
 * redirected to stderr (set up at the top of the file). The original stdout
 * writer is restored just before the stdio transport is connected.
 *
 * @returns {Promise<void>} Resolves once the server is connected.
 */
async function main() {
  await ensureBinary();

  // Re-enable stdout right before the protocol transport is attached.
  process.stdout.write = originalStdoutWrite;

  const transport = new StdioServerTransport();
  await server.connect(transport);
}

main().catch((err) => {
  // Report the failure on stderr so a startup crash is not a silent exit 1.
  console.error("Fatal error during startup:", err);
  process.exit(1);
});