searchfetch 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +81 -0
  2. package/index.js +409 -0
  3. package/package.json +36 -0
package/README.md ADDED
@@ -0,0 +1,81 @@
1
+ # SearchFetch
2
+
3
+ A fault-tolerant, stealth-enabled Model Context Protocol (MCP) server for web searching and content fetching. Built specifically for AI agents, it bypasses Google's GDPR consent screens and Cloudflare Turnstile, and converts heavy HTML into clean, token-optimized Markdown.
4
+
5
+ ## Features
6
+ * **Multi-Engine Search:** Natively supports DuckDuckGo and Google parsing out of the box. DuckDuckGo is set as the preferred default.
7
+ * **Aggressive Base64 / Image Scrubber:** Implements "nuclear" DOM scrubbing prior to parsing. Guaranteed to NEVER pollute your LLM's context window with giant base64 image strings (`data:image/...`).
8
+ * **Stealth CloakBrowser:** Avoids FingerprintJS, reCAPTCHA, and Cloudflare using Chromium C++ patches and humanized mouse movements natively.
9
+ * **SPA & React Support:** Waits for network idle to ensure modern Single Page Applications fully execute JavaScript and render before extracting content.
10
+ * **Fault Tolerant:** Extracts whatever DOM was successfully loaded even if a massive, clunky page times out mid-render.
11
+ * **Pagination Support:** Fetches massive webpages iteratively via `start_index` and `max_length` without exceeding the AI's context-token limit.
12
+
13
+ ## Installation
14
+
15
+ 1. Clone or copy the directory.
16
+ ```bash
17
+ git clone https://github.com/maxylev/searchfetch
18
+ ```
19
+ 2. Install dependencies:
20
+ ```bash
21
+ npm install
22
+ ```
23
+ 3. Make the main script executable:
24
+ ```bash
25
+ chmod +x index.js
26
+ ```
27
+ 4. Link it globally to your system:
28
+ ```bash
29
+ npm link
30
+ ```
31
+
32
+ ## Configuration
33
+
34
+ Configure your AI tool/IDE (Cursor, Claude Desktop, Opencode, etc.) to point to this server.
35
+
36
+ ### Example `config.json` (Opencode, Cursor):
37
+ ```json
38
+ {
39
+ "mcp": {
40
+ "searchfetch": {
41
+ "type": "local",
42
+ "command":["npx", "searchfetch"],
43
+ "enabled": true
44
+ }
45
+ }
46
+ }
47
+ ```
48
+
49
+ ### Example `claude_desktop_config.json`:
50
+ ```json
51
+ {
52
+ "mcpServers": {
53
+ "searchfetch": {
54
+ "command": "npx",
55
+ "args": ["searchfetch"]
56
+ }
57
+ }
58
+ }
59
+ ```
60
+
61
+ ## Available Tools
62
+
63
+ ### 1. `websearch`
64
+ Searches the web via Google or DuckDuckGo and returns structured snippets.
65
+ * **`query`** (string): Your search query.
66
+ * **`engine`** (string): `"google"` or `"duckduckgo"` (default `"duckduckgo"`).
67
+ * **`max_results`** (number): Number of results to return (default `10`).
68
+
69
+ ### 2. `webfetch`
70
+ Visits a URL as a stealthy human, waits for the JS to render, completely scrubs visual assets/inline styles to save tokens, and returns the markdown content.
71
+ * **`url`** (string): Full HTTP/HTTPS link.
72
+ * **`format`** (string): Set to `"markdown"` (default), `"clean_html"`, or `"raw_html"`.
73
+ * **`start_index`** (number): Character offset to start reading from when paginating (default `0`).
74
+ * **`max_length`** (number): Maximum character length to return per call (default `10000`).
75
+ * **`block_media`** (boolean): Speeds up page loads by ignoring images, videos, and fonts entirely at the network layer (default `true`).
76
+
77
+ ## Debugging
78
+ If you want to debug JSON-RPC shapes locally:
79
+ ```bash
80
+ npm run inspector
81
+ ```
package/index.js ADDED
@@ -0,0 +1,409 @@
1
+ #!/usr/bin/env node
2
+
3
// Keep a bound handle to the genuine stdout writer; main() assigns it back to
// process.stdout.write just before the stdio transport is attached.
const originalStdoutWrite = process.stdout.write.bind(process.stdout);

// Until then, anything written to stdout is diverted to stderr.
// NOTE(review): static `import` declarations are hoisted, so imported modules
// are evaluated before these statements run — output produced while the
// dependencies load is not covered by this redirect. Confirm whether it matters.
process.stdout.write = function divertToStderr(chunk, encoding, callback) {
  return process.stderr.write(chunk, encoding, callback);
};

// console.log and console.info are routed through console.error as well.
for (const method of ["log", "info"]) {
  console[method] = (...args) => console.error(...args);
}
9
+
10
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
11
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
12
+ import { z } from "zod";
13
+ import { launch, ensureBinary } from "cloakbrowser";
14
+ import * as cheerio from "cheerio";
15
+ import TurndownService from "turndown";
16
+
17
// Builds the "[LEVEL] message" prefix shared by every log line.
const formatLogLine = (level, msg) => `[${level}] ${msg}`;

/**
 * Minimal levelled logger. Every level is emitted through console.error.
 */
const logger = {
  /** @param {string} msg */
  info(msg) {
    console.error(formatLogLine("INFO", msg));
  },
  /** @param {string} msg */
  warn(msg) {
    console.error(formatLogLine("WARN", msg));
  },
  /**
   * @param {string} msg
   * @param {unknown} [err] - Optional error; an empty string is logged in its place when falsy.
   */
  error(msg, err) {
    const detail = err ? err : "";
    console.error(formatLogLine("ERROR", msg), detail);
  },
};
22
+
23
+ // ==========================================
24
+ // BROWSER LIFECYCLE MANAGEMENT
25
+ // ==========================================
26
/**
 * Lazily launches one CloakBrowser instance and hands the same instance to
 * every caller until it is closed.
 */
class BrowserManager {
  constructor() {
    // The launched browser, or null when none is running.
    this.browser = null;
    // Promise of a launch that is still in flight, or null. Sharing it stops
    // concurrent getBrowser() calls from each starting their own browser.
    this.pendingLaunch = null;
  }

  /**
   * Returns the shared browser, launching it on first use.
   *
   * @returns {Promise<object>} Whatever `launch()` resolves to.
   * @throws Rejects with the launch error; the next call retries the launch.
   */
  async getBrowser() {
    // Drop a cached browser that reports itself disconnected (e.g. after a
    // crash) so the next request gets a fresh one instead of failing forever.
    // The typeof guard keeps this safe if the object has no isConnected().
    if (
      this.browser &&
      typeof this.browser.isConnected === "function" &&
      !this.browser.isConnected()
    ) {
      logger.warn("Cached browser is no longer connected. Relaunching...");
      this.browser = null;
    }

    if (this.browser) {
      return this.browser;
    }

    if (!this.pendingLaunch) {
      logger.info("Launching stealth CloakBrowser instance...");
      const pending = (async () => {
        const browser = await launch({
          headless: true,
          humanize: true, // Native C++ bot-bypass patches + human behavior
          args: ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
        });
        this.browser = browser;
        return browser;
      })();
      this.pendingLaunch = pending;

      // Forget the in-flight promise once it settles, on success or failure,
      // so that a failed launch can be retried by a later call.
      const forget = () => {
        if (this.pendingLaunch === pending) {
          this.pendingLaunch = null;
        }
      };
      pending.then(forget, forget);
    }

    return this.pendingLaunch;
  }

  /**
   * Closes the shared browser if one is running (or still starting).
   *
   * @returns {Promise<void>}
   */
  async close() {
    if (this.pendingLaunch) {
      try {
        // Wait for a launch in progress so the browser it produces is closed
        // below rather than orphaned.
        await this.pendingLaunch;
      } catch (err) {
        // The launch failed, so there is nothing to close; the caller that
        // requested the browser already receives this error.
      }
    }

    if (this.browser) {
      const browser = this.browser;
      // Clear the reference first so a failing close() does not leave a
      // half-closed browser cached.
      this.browser = null;
      await browser.close();
      logger.info("Browser instance securely closed.");
    }
  }
}
51
+
52
// Single shared manager used by both tools.
const browserManager = new BrowserManager();

/**
 * Signal handler: closes the shared browser, then terminates the process.
 * Exits with 0 after a clean shutdown and 1 when closing the browser failed,
 * so a failing close() is logged instead of becoming an unhandled rejection.
 *
 * @returns {Promise<void>}
 */
const cleanup = async () => {
  logger.info("Received termination signal. Shutting down browser...");
  let exitCode = 0;
  try {
    await browserManager.close();
  } catch (err) {
    logger.error("Failed to close browser during shutdown:", err);
    exitCode = 1;
  }
  process.exit(exitCode);
};
process.on("SIGINT", cleanup);
process.on("SIGTERM", cleanup);
61
+
62
+ // ==========================================
63
+ // CORE LOGIC: SEARCH & FETCH
64
+ // ==========================================
65
+
66
// Resource types that contribute nothing to a text-only results page.
const SEARCH_BLOCKED_RESOURCE_TYPES = ["image", "media", "font", "stylesheet"];

/**
 * Unwraps Google's relative "/url?q=<target>&..." redirect links.
 * Any other href is returned unchanged.
 */
function unwrapGoogleRedirect(href) {
  if (!href.startsWith("/url?q=")) {
    return href;
  }
  try {
    return decodeURIComponent(href.split("/url?q=")[1].split("&")[0]);
  } catch (err) {
    // Malformed percent-escape: keep the raw href. It is not absolute, so the
    // caller's "starts with http" check discards the result.
    logger.warn(`Could not decode Google redirect link: ${href}`);
    return href;
  }
}

/**
 * Unwraps DuckDuckGo's "/l/?uddg=<target>" redirect links.
 * Any other href is returned unchanged.
 */
function unwrapDuckDuckGoRedirect(href) {
  if (!href.includes("/l/?uddg=")) {
    return href;
  }
  const queryString = href.slice(href.indexOf("?") + 1);
  // URLSearchParams.get() already percent-decodes the value. Decoding it a
  // second time would corrupt targets that carry their own escapes
  // (e.g. "%20" or "%25") or throw on them.
  const target = new URLSearchParams(queryString).get("uddg");
  return target || href;
}

/**
 * Runs a web search in the shared browser and returns the results as text.
 *
 * @param {string} query - Search terms.
 * @param {number} maxResults - Upper bound on the number of results collected.
 * @param {string} region - DuckDuckGo `kl` value; ignored for Google.
 * @param {string} safeSearch - DuckDuckGo `kp` value; ignored for Google.
 * @param {string} engine - "google" selects Google; anything else uses DuckDuckGo.
 * @returns {Promise<string>} A numbered list of results, or a "no results" notice.
 * @throws Rethrows navigation errors other than a timeout.
 */
async function executeSearch(query, maxResults, region, safeSearch, engine) {
  logger.info(
    `Searching ${engine.toUpperCase()} via Stealth Browser for: "${query}"`,
  );

  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  // Everything after the context exists runs inside the try, so the context is
  // released even when cookie injection or page creation fails.
  try {
    // Inject Google Consent cookie to universally bypass GDPR popups blocking the DOM
    await context.addCookies([
      {
        name: "CONSENT",
        value: "YES+cb.20250101-01-p0.en+FX+999",
        domain: ".google.com",
        path: "/",
      },
    ]);

    const page = await context.newPage();

    // Optimization: block heavy/unnecessary resources to keep searches fast.
    await page.route("**/*", (route) => {
      const blocked = SEARCH_BLOCKED_RESOURCE_TYPES.includes(
        route.request().resourceType(),
      );
      const handled = blocked ? route.abort() : route.continue();
      // The page can be closed while a request is still in flight; ignore that
      // rejection rather than leaving it unhandled.
      return handled.catch(() => {});
    });

    const searchUrl =
      engine === "google"
        ? `https://www.google.com/search?q=${encodeURIComponent(query)}&hl=en&gl=us`
        : `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}&kl=${encodeURIComponent(region)}&kp=${encodeURIComponent(safeSearch)}`;

    try {
      // Use networkidle to ensure JavaScript fully renders organic results or follows hidden redirects
      await page.goto(searchUrl, { waitUntil: "networkidle", timeout: 15000 });
    } catch (e) {
      if (e.name === "TimeoutError") {
        // A timeout still leaves whatever was rendered; parse that instead of failing.
        logger.warn(`Network idle timeout on search. Extracting loaded DOM...`);
      } else {
        throw e;
      }
    }

    const $ = cheerio.load(await page.content());
    const results = [];

    // Keeps only entries with a title and an absolute http(s) link.
    const addResult = (title, link, snippet) => {
      if (title && link && link.startsWith("http")) {
        results.push({ position: results.length + 1, title, link, snippet });
      }
    };

    if (engine === "google") {
      // Google's core organic result selector
      $("div.g").each((_index, el) => {
        // Returning false stops the iteration once enough results are collected.
        if (results.length >= maxResults) return false;

        const titleEl = $(el).find("h3").first();
        const linkEl = $(el).find("a").first();
        if (titleEl.length && linkEl.length) {
          // Isolate snippet text: drop the title, links and non-content nodes
          // from a copy of the result and keep the remaining text.
          const cloned = $(el).clone();
          cloned.find("h3, a, script, style, cite").remove();
          addResult(
            titleEl.text().trim(),
            unwrapGoogleRedirect(linkEl.attr("href") || ""),
            cloned.text().replace(/\s+/g, " ").trim(),
          );
        }
        return true;
      });
    } else {
      // DuckDuckGo selector
      $(".result").each((_index, el) => {
        if (results.length >= maxResults) return false;

        const titleEl = $(el).find(".result__title a");
        if (titleEl.length) {
          const snippetEl = $(el).find(".result__snippet");
          addResult(
            titleEl.text().trim(),
            unwrapDuckDuckGoRedirect(titleEl.attr("href") || ""),
            snippetEl.text().replace(/\s+/g, " ").trim(),
          );
        }
        return true;
      });
    }

    if (results.length === 0) {
      const pageText = $("body").text().replace(/\s+/g, " ").substring(0, 300);
      logger.warn(`No results found. DOM Sample: ${pageText}`);
      return `No results found on ${engine}. The search engine might be showing a captcha/consent screen, or the query returned nothing. Try rephrasing or switching engines.`;
    }

    return (
      `Found ${results.length} search results on ${engine}:\n\n` +
      results
        .map(
          (r) =>
            `[${r.position}] ${r.title}\n URL: ${r.link}\n Summary: ${r.snippet}`,
        )
        .join("\n\n")
    );
  } finally {
    // Closing the context also disposes of the page opened in it. A failure
    // here is logged so it cannot replace the search result or original error.
    await context.close().catch((err) => {
      logger.warn(`Failed to close search context: ${err.message}`);
    });
  }
}
194
+
195
// Resource types skipped at the network layer when media blocking is on.
const FETCH_BLOCKED_RESOURCE_TYPES = ["image", "media", "font"];

// Elements removed before conversion: they either harbor base64 payloads or
// add tokens without adding readable content.
const FETCH_STRIPPED_SELECTOR =
  "script, style, nav, header, footer, noscript, iframe, svg, aside, .advertisement, img, picture, video, audio, canvas, map, area, dialog";

/**
 * Strips token-heavy markup from a loaded cheerio document, in place.
 */
function scrubDocument($) {
  $(FETCH_STRIPPED_SELECTOR).remove();

  // Remove inline styles from EVERY element to prevent background-image base64 leaks
  $("*").removeAttr("style");

  // Remove data URIs from any attribute (src, href, srcset, poster, ...), not
  // just `src="data:image..."`.
  $("*").each((_index, el) => {
    for (const [name, value] of Object.entries(el.attribs || {})) {
      if (/^\s*data:/i.test(value)) {
        $(el).removeAttr(name);
      }
    }
  });
}

/**
 * Cuts one page out of `content` and appends the pagination footer.
 * The offset is clamped into [0, content.length] so the footer always reports
 * the range that was actually returned.
 */
function paginateContent(content, startIndex, maxLength) {
  const totalLength = content.length;
  const offset = Math.min(
    Math.max(Math.trunc(startIndex) || 0, 0),
    totalLength,
  );
  const end = Math.min(offset + Math.trunc(maxLength), totalLength);
  const paginatedText = content.substring(offset, end);

  let metadata = `\n\n---\n[Document Info: Showing characters ${offset} to ${end} of ${totalLength} total.`;
  if (end < totalLength) {
    metadata += ` Use start_index=${end} to paginate and read more.`;
  }
  metadata += `]`;

  return paginatedText + metadata;
}

/**
 * Loads a page in the shared browser and returns one page of its content.
 *
 * @param {string} url - Absolute http or https URL.
 * @param {string} format - "markdown", "clean_html" or "raw_html".
 * @param {number} startIndex - Character offset of the first character returned.
 * @param {number} maxLength - Maximum number of characters returned; at least 1.
 * @param {boolean} blockMedia - When true, image/media/font requests are aborted.
 * @returns {Promise<string>} The requested slice followed by a "[Document Info: ...]" footer.
 * @throws {TypeError} When `url` cannot be parsed.
 * @throws {Error} When `url` is not http(s); navigation errors other than a timeout are rethrown.
 * @throws {RangeError} When `maxLength` is not a number >= 1.
 */
async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
  // Validate before touching the browser. Restricting the scheme keeps
  // file:// and similar URLs from being loaded through this tool.
  const { protocol } = new URL(url);
  if (protocol !== "http:" && protocol !== "https:") {
    throw new Error(
      `Unsupported URL protocol "${protocol}". Only http and https URLs can be fetched.`,
    );
  }
  // Written as a negated comparison so NaN is rejected too.
  if (!(maxLength >= 1)) {
    throw new RangeError(
      `max_length must be at least 1 (received ${maxLength}).`,
    );
  }

  logger.info(`Fetching URL: ${url} | Format: ${format}`);

  const browser = await browserManager.getBrowser();
  const context = await browser.newContext();

  // Everything after the context exists runs inside the try, so the context is
  // released even when page creation fails.
  try {
    const page = await context.newPage();

    if (blockMedia) {
      await page.route("**/*", (route) => {
        const blocked = FETCH_BLOCKED_RESOURCE_TYPES.includes(
          route.request().resourceType(),
        );
        const handled = blocked ? route.abort() : route.continue();
        // The page can be closed while a request is still in flight; ignore
        // that rejection rather than leaving it unhandled.
        return handled.catch(() => {});
      });
    }

    try {
      await page.goto(url, { waitUntil: "networkidle", timeout: 15000 });
    } catch (navError) {
      if (navError.name === "TimeoutError") {
        // A timeout still leaves whatever was rendered; extract that.
        logger.warn(
          `Network idle timeout on ${url}. Extracting partial DOM...`,
        );
      } else {
        throw navError;
      }
    }

    const pageContent = await page.content();
    let finalContent = "";

    if (format === "raw_html") {
      finalContent = pageContent;
    } else {
      const $ = cheerio.load(pageContent);
      scrubDocument($);

      if (format === "clean_html") {
        finalContent = $.html();
      } else if (format === "markdown") {
        const turndownService = new TurndownService({
          headingStyle: "atx",
          codeBlockStyle: "fenced",
        });
        finalContent = turndownService.turndown($.html());
        // Collapse runs of blank lines left behind by removed elements.
        finalContent = finalContent.replace(/\n{3,}/g, "\n\n").trim();
      }
    }

    return paginateContent(finalContent, startIndex, maxLength);
  } finally {
    // Closing the context also disposes of the page opened in it. A failure
    // here is logged so it cannot replace the fetch result or original error.
    await context.close().catch((err) => {
      logger.warn(`Failed to close fetch context: ${err.message}`);
    });
  }
}
283
+
284
+ // ==========================================
285
+ // MCP SERVER INIT & TOOL REGISTRATION
286
+ // ==========================================
287
+
288
// Server identity passed to McpServer. The version matches the "version"
// field in package.json (previously it read "1.3.0", which no release carries).
const server = new McpServer({
  name: "searchfetch",
  version: "1.0.2",
});
292
+
293
// Registers the "websearch" tool: validates the arguments, delegates to
// executeSearch, and reports failures as an MCP error result instead of throwing.
server.tool(
  "websearch",
  "Search the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.",
  {
    query: z.string().describe("The search query string."),
    engine: z
      .enum(["duckduckgo", "google"])
      .default("duckduckgo")
      .describe("Search engine to use (default: duckduckgo)."),
    // Must be a whole number >= 1: zero or a negative value would always end
    // in the misleading "No results found" message.
    max_results: z
      .number()
      .int()
      .min(1)
      .default(10)
      .describe("Maximum number of results to return (default: 10)."),
    region: z
      .string()
      .default("wt-wt")
      .describe("Region code (e.g., 'us-en'). Only applies to DuckDuckGo."),
    safe_search: z
      .string()
      .default("-1")
      .describe(
        "'-1' for Moderate, '1' for Strict, '-2' for Off. Only applies to DuckDuckGo.",
      ),
  },
  async ({ query, engine, max_results, region, safe_search }) => {
    try {
      const result = await executeSearch(
        query,
        max_results,
        region,
        safe_search,
        engine,
      );
      return { content: [{ type: "text", text: result }] };
    } catch (error) {
      logger.error("Search Tool failed:", error);
      // Non-Error throwables have no .message; stringify them instead of
      // reporting "undefined".
      const reason = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: "text", text: `Search Error: ${reason}` }],
        isError: true,
      };
    }
  },
);
336
+
337
// Registers the "webfetch" tool: validates the arguments, delegates to
// executeFetch, and reports failures as an MCP error result instead of throwing.
server.tool(
  "webfetch",
  "Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
  {
    // .url() alone accepts any scheme (file:, javascript:, ...); the regex
    // enforces the http/https requirement stated in the description.
    url: z
      .string()
      .url()
      .regex(/^https?:\/\//i, "URL must start with http:// or https://")
      .describe(
        "The full URL of the webpage to fetch (must start with http/https).",
      ),
    format: z
      .enum(["markdown", "clean_html", "raw_html"])
      .default("markdown")
      .describe(
        "Output format. Markdown is highly recommended to save context tokens.",
      ),
    // Offsets and lengths are character counts, so only whole, in-range
    // numbers make sense.
    start_index: z
      .number()
      .int()
      .min(0)
      .default(0)
      .describe("Character offset to start reading from for pagination."),
    max_length: z
      .number()
      .int()
      .min(1)
      .default(10000)
      .describe("Maximum characters to return per request (default: 10000)."),
    block_media: z
      .boolean()
      .default(true)
      .describe(
        "Block images/videos/fonts to drastically speed up rendering (default: true).",
      ),
  },
  async ({ url, format, start_index, max_length, block_media }) => {
    try {
      const result = await executeFetch(
        url,
        format,
        start_index,
        max_length,
        block_media,
      );
      return { content: [{ type: "text", text: result }] };
    } catch (error) {
      logger.error(`Fetch Tool failed on ${url}:`, error);
      // Non-Error throwables have no .message; stringify them instead of
      // reporting "undefined".
      const reason = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: "text", text: `Fetch Error: ${reason}` }],
        isError: true,
      };
    }
  },
);
387
+
388
+ // ==========================================
389
+ // BOOTSTRAP
390
+ // ==========================================
391
+
392
/**
 * Logs a startup failure and terminates the process with exit code 1.
 */
const reportFatalStartupError = (err) => {
  logger.error("Fatal error during startup:", err);
  process.exit(1);
};

/**
 * Boot sequence: ensure the browser binary, restore the real stdout writer,
 * then connect the MCP server over stdio.
 *
 * @returns {Promise<void>}
 */
async function main() {
  logger.info("Initializing MCP Server...");

  await ensureBinary();

  // Re-enable STDOUT right before protocol hook-in
  process.stdout.write = originalStdoutWrite;

  const stdioTransport = new StdioServerTransport();
  await server.connect(stdioTransport);

  logger.info("searchfetch successfully connected and listening for requests.");
}

main().catch(reportFatalStartupError);
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "searchfetch",
3
+ "version": "1.0.2",
4
+ "description": "Fault-tolerant MCP Server for Stealth Web Search and Fetching",
5
+ "type": "module",
6
+ "bin": {
7
+ "searchfetch": "./index.js"
8
+ },
9
+ "files": [
10
+ "index.js"
11
+ ],
12
+ "publishConfig": {
13
+ "access": "public"
14
+ },
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+https://github.com/maxylev/searchfetch.git"
18
+ },
19
+ "scripts": {
20
+ "start": "node ./index.js",
21
+ "inspector": "npx @modelcontextprotocol/inspector node ./index.js"
22
+ },
23
+ "dependencies": {
24
+ "@modelcontextprotocol/sdk": "^1.29.0",
25
+ "cheerio": "^1.2.0",
26
+ "cloakbrowser": "^0.3.27",
27
+ "playwright-core": "^1.59.1",
28
+ "turndown": "^7.2.4",
29
+ "zod": "^4.4.3"
30
+ },
31
+ "devDependencies": {
32
+ "@types/cheerio": "^0.22.35",
33
+ "@types/node": "^25.6.2",
34
+ "@types/turndown": "^5.0.6"
35
+ }
36
+ }