searchfetch 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +68 -53
- package/index.js +169 -171
- package/package.json +3 -7
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Max
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,81 +1,96 @@
|
|
|
1
|
-
# SearchFetch
|
|
1
|
+
# SearchFetch (MCP Server)
|
|
2
2
|
|
|
3
|
-
A fault-tolerant, stealth-enabled Model Context Protocol (MCP) server for web searching and content fetching. Built specifically for AI Agents, it bypasses
|
|
3
|
+
A maximally fault-tolerant, stealth-enabled Model Context Protocol (MCP) server for web searching and content fetching. Built specifically for AI Agents (Cursor, Claude Desktop, OpenHands), it completely bypasses bot detection (Cloudflare Turnstile, Datadome), dynamically handles SPAs/React, and converts bloated page markup into token-optimized Markdown.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
|
-
* **
|
|
7
|
-
* **
|
|
8
|
-
* **
|
|
9
|
-
* **
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
## Installation
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
npm install
|
|
22
|
-
```
|
|
23
|
-
3. Make the main script executable:
|
|
24
|
-
```bash
|
|
25
|
-
chmod +x index.js
|
|
26
|
-
```
|
|
27
|
-
4. Link it globally to your system:
|
|
28
|
-
```bash
|
|
29
|
-
npm link
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
## Configuration
|
|
33
|
-
|
|
34
|
-
Configure your AI tool/IDE (Cursor, Claude Desktop, Opencode, etc.) to point to this server.
|
|
35
|
-
|
|
36
|
-
### Example `config.json` (Opencode, Cursor):
|
|
6
|
+
* **Maximum Fault Tolerance:** Implements auto-healing browser sessions, grace-period timeouts for clunky SPAs, and network-level aborting of tracking scripts and media.
|
|
7
|
+
* **Stealth Engine:** Powered by CloakBrowser C++ patches + `humanize` logic. Antibot systems score it as a normal browser because it mathematically moves and renders exactly like one.
|
|
8
|
+
* **Nuclear Token Scrubber:** Strips Base64 images, SVGs, scripts, and inline styles out of the DOM *before* Markdown conversion, guaranteeing your LLM context window won't blow out.
|
|
9
|
+
* **Dual Execution Paths:** Natively supports zero-install execution via both Python (`uvx`) and Node.js (`npx`).
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Usage & Installation
|
|
14
|
+
|
|
15
|
+
You do not need to install this repository manually. Configure your agent to use the zero-install commands `npx` or `uvx` depending on your environment.
|
|
16
|
+
|
|
17
|
+
### Claude Desktop Configuration
|
|
18
|
+
Add the following to your config:
|
|
19
|
+
|
|
20
|
+
**Option A: Using Python (`uvx` - Recommended)**
|
|
37
21
|
```json
|
|
38
22
|
{
|
|
39
|
-
"
|
|
23
|
+
"mcpServers": {
|
|
40
24
|
"searchfetch": {
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"enabled": true
|
|
25
|
+
"command": "uvx",
|
|
26
|
+
"args": ["searchfetch"]
|
|
44
27
|
}
|
|
45
28
|
}
|
|
46
29
|
}
|
|
47
30
|
```
|
|
48
31
|
|
|
49
|
-
|
|
32
|
+
**Option B: Using Node.js (`npx`)**
|
|
50
33
|
```json
|
|
51
34
|
{
|
|
52
35
|
"mcpServers": {
|
|
53
36
|
"searchfetch": {
|
|
54
37
|
"command": "npx",
|
|
55
|
-
"args": ["searchfetch"]
|
|
38
|
+
"args": ["-y", "searchfetch"]
|
|
56
39
|
}
|
|
57
40
|
}
|
|
58
41
|
}
|
|
59
42
|
```
|
|
60
43
|
|
|
44
|
+
### Cursor / IDE Configuration
|
|
45
|
+
Add it via the **MCP panel** in Cursor settings:
|
|
46
|
+
* **Type:** `command`
|
|
47
|
+
* **Command:** `uvx searchfetch` (or `npx -y searchfetch`)
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
61
51
|
## Available Tools
|
|
62
52
|
|
|
63
53
|
### 1. `websearch`
|
|
64
|
-
Searches the web
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
* **`
|
|
54
|
+
Searches the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.
|
|
55
|
+
|
|
56
|
+
**Parameters:**
|
|
57
|
+
* **`query`** *(string, required)*: The search query string.
|
|
58
|
+
* **`engine`** *(string, optional)*: Search engine to use. Can be `"duckduckgo"` or `"google"`. Default is `"duckduckgo"`.
|
|
59
|
+
* **`max_results`** *(number, optional)*: Maximum number of results to return. Default is `10`.
|
|
60
|
+
* **`region`** *(string, optional)*: Region and language code to localize search results.
|
|
61
|
+
* Examples: `"us-en"`, `"uk-en"`, `"de-de"`.
|
|
62
|
+
* For DuckDuckGo, it maps directly.
|
|
63
|
+
* For Google, it maps to the `gl` (country) and `hl` (language) query parameters automatically.
|
|
64
|
+
* Default is `"wt-wt"` (global/US English).
|
|
65
|
+
* **`safe_search`** *(string, optional)*: Safe search filtering mode.
|
|
66
|
+
* `"-1"` for Moderate.
|
|
67
|
+
* `"1"` for Strict.
|
|
68
|
+
* `"-2"` for Off.
|
|
69
|
+
* Default is `"-1"`.
|
|
70
|
+
* *Note: Only applies to DuckDuckGo.*
|
|
68
71
|
|
|
69
72
|
### 2. `webfetch`
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
* **`
|
|
74
|
-
* **`
|
|
75
|
-
* **`
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
73
|
+
Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.
|
|
74
|
+
|
|
75
|
+
**Parameters:**
|
|
76
|
+
* **`url`** *(string, required)*: The full URL of the webpage to fetch (must start with http/https).
|
|
77
|
+
* **`format`** *(string, optional)*: Output format. Set to `"markdown"`, `"clean_html"`, or `"raw_html"`. Default is `"markdown"` (highly recommended to save context tokens).
|
|
78
|
+
* **`start_index`** *(number, optional)*: Character offset to start reading from for pagination. Use this if a document is too large to fit in the context window. Default is `0`.
|
|
79
|
+
* **`max_length`** *(number, optional)*: Maximum characters to return per request. Default is `10000`.
|
|
80
|
+
* **`block_media`** *(boolean, optional)*: Block images, videos, and fonts entirely at the network layer to drastically speed up page loads and dodge tracking pixels. Default is `true`.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Architecture & Contributions
|
|
85
|
+
This repository utilizes a flat dual-manifest file structure (`package.json` and `pyproject.toml` in the root). When committing changes, ensure parity between `index.js` and `server.py` logic.
|
|
86
|
+
|
|
87
|
+
### Local Development
|
|
79
88
|
```bash
|
|
80
|
-
|
|
89
|
+
# Node.js Testing
|
|
90
|
+
npm i
|
|
91
|
+
npm run inspector-js
|
|
92
|
+
|
|
93
|
+
# Python Testing
|
|
94
|
+
pip install -e .
|
|
95
|
+
npm run inspector-py
|
|
81
96
|
```
|
package/index.js
CHANGED
|
@@ -14,37 +14,46 @@ import { launch, ensureBinary } from "cloakbrowser";
|
|
|
14
14
|
import * as cheerio from "cheerio";
|
|
15
15
|
import TurndownService from "turndown";
|
|
16
16
|
|
|
17
|
-
const logger = {
|
|
18
|
-
info: (msg) => console.error(`[INFO] ${msg}`),
|
|
19
|
-
warn: (msg) => console.error(`[WARN] ${msg}`),
|
|
20
|
-
error: (msg, err) => console.error(`[ERROR] ${msg}`, err || ""),
|
|
21
|
-
};
|
|
22
|
-
|
|
23
|
-
// ==========================================
|
|
24
|
-
// BROWSER LIFECYCLE MANAGEMENT
|
|
25
|
-
// ==========================================
|
|
26
17
|
class BrowserManager {
|
|
27
18
|
constructor() {
|
|
28
19
|
this.browser = null;
|
|
20
|
+
this.launchPromise = null;
|
|
29
21
|
}
|
|
30
22
|
|
|
31
23
|
async getBrowser() {
|
|
32
|
-
if (
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
24
|
+
if (this.browser && this.browser.isConnected()) return this.browser;
|
|
25
|
+
if (this.launchPromise) return this.launchPromise;
|
|
26
|
+
|
|
27
|
+
this.launchPromise = launch({
|
|
28
|
+
headless: true,
|
|
29
|
+
humanize: true,
|
|
30
|
+
args: [
|
|
31
|
+
"--disable-blink-features=AutomationControlled",
|
|
32
|
+
"--no-sandbox",
|
|
33
|
+
"--disable-dev-shm-usage",
|
|
34
|
+
],
|
|
35
|
+
})
|
|
36
|
+
.then((browser) => {
|
|
37
|
+
this.browser = browser;
|
|
38
|
+
this.browser.on("disconnected", () => {
|
|
39
|
+
this.browser = null;
|
|
40
|
+
});
|
|
41
|
+
return browser;
|
|
42
|
+
})
|
|
43
|
+
.catch((err) => {
|
|
44
|
+
throw err;
|
|
45
|
+
})
|
|
46
|
+
.finally(() => {
|
|
47
|
+
this.launchPromise = null;
|
|
38
48
|
});
|
|
39
|
-
|
|
40
|
-
return this.
|
|
49
|
+
|
|
50
|
+
return this.launchPromise;
|
|
41
51
|
}
|
|
42
52
|
|
|
43
53
|
async close() {
|
|
44
54
|
if (this.browser) {
|
|
45
55
|
await this.browser.close();
|
|
46
56
|
this.browser = null;
|
|
47
|
-
logger.info("Browser instance securely closed.");
|
|
48
57
|
}
|
|
49
58
|
}
|
|
50
59
|
}
|
|
@@ -52,26 +61,23 @@ class BrowserManager {
|
|
|
52
61
|
const browserManager = new BrowserManager();
|
|
53
62
|
|
|
54
63
|
const cleanup = async () => {
|
|
55
|
-
logger.info("Received termination signal. Shutting down browser...");
|
|
56
64
|
await browserManager.close();
|
|
57
65
|
process.exit(0);
|
|
58
66
|
};
|
|
59
67
|
process.on("SIGINT", cleanup);
|
|
60
68
|
process.on("SIGTERM", cleanup);
|
|
61
69
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
70
|
+
function getGoogleRegionParams(region) {
|
|
71
|
+
if (!region || region === "wt-wt") return "hl=en&gl=us";
|
|
72
|
+
const parts = region.split("-");
|
|
73
|
+
if (parts.length === 2) return `gl=${parts[0]}&hl=${parts[1]}`;
|
|
74
|
+
return `gl=${region}&hl=en`;
|
|
75
|
+
}
|
|
65
76
|
|
|
66
77
|
async function executeSearch(query, maxResults, region, safeSearch, engine) {
|
|
67
|
-
logger.info(
|
|
68
|
-
`Searching ${engine.toUpperCase()} via Stealth Browser for: "${query}"`,
|
|
69
|
-
);
|
|
70
|
-
|
|
71
78
|
const browser = await browserManager.getBrowser();
|
|
72
79
|
const context = await browser.newContext();
|
|
73
80
|
|
|
74
|
-
// Inject Google Consent cookie to universally bypass GDPR popups blocking the DOM
|
|
75
81
|
await context.addCookies([
|
|
76
82
|
{
|
|
77
83
|
name: "CONSENT",
|
|
@@ -84,7 +90,6 @@ async function executeSearch(query, maxResults, region, safeSearch, engine) {
|
|
|
84
90
|
const page = await context.newPage();
|
|
85
91
|
|
|
86
92
|
try {
|
|
87
|
-
// Optimization: Block heavy/unnecessary resources to make searches lightning fast
|
|
88
93
|
await page.route("**/*", (route) => {
|
|
89
94
|
const type = route.request().resourceType();
|
|
90
95
|
if (["image", "media", "font", "stylesheet"].includes(type)) {
|
|
@@ -95,66 +100,76 @@ async function executeSearch(query, maxResults, region, safeSearch, engine) {
|
|
|
95
100
|
});
|
|
96
101
|
|
|
97
102
|
const results = [];
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
} else {
|
|
103
|
-
searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}&kl=${encodeURIComponent(region)}&kp=${encodeURIComponent(safeSearch)}`;
|
|
104
|
-
}
|
|
103
|
+
const searchUrl =
|
|
104
|
+
engine === "google"
|
|
105
|
+
? `https://www.google.com/search?udm=web&udm=14&q=${encodeURIComponent(query)}&${getGoogleRegionParams(region)}`
|
|
106
|
+
: `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}&kl=${encodeURIComponent(region)}&kp=${encodeURIComponent(safeSearch)}`;
|
|
105
107
|
|
|
106
108
|
try {
|
|
107
|
-
// Use networkidle to ensure JavaScript fully renders organic results or follows hidden redirects
|
|
108
109
|
await page.goto(searchUrl, { waitUntil: "networkidle", timeout: 15000 });
|
|
109
110
|
} catch (e) {
|
|
110
|
-
|
|
111
|
-
logger.warn(`Network idle timeout on search. Extracting loaded DOM...`);
|
|
112
|
-
} else {
|
|
113
|
-
throw e;
|
|
114
|
-
}
|
|
111
|
+
// Best-effort navigation: any page.goto error (not only a timeout) is swallowed here so whatever DOM has loaded can still be parsed
|
|
115
112
|
}
|
|
116
113
|
|
|
117
114
|
const pageContent = await page.content();
|
|
118
115
|
const $ = cheerio.load(pageContent);
|
|
119
116
|
|
|
120
117
|
if (engine === "google") {
|
|
121
|
-
|
|
122
|
-
$("div.g").each((i, el) => {
|
|
118
|
+
$("h3").each((i, el) => {
|
|
123
119
|
if (results.length >= maxResults) return;
|
|
124
120
|
|
|
125
|
-
const
|
|
126
|
-
|
|
127
|
-
if (!
|
|
121
|
+
const h3 = $(el);
|
|
122
|
+
let linkEl = h3.closest("a");
|
|
123
|
+
if (!linkEl.length) linkEl = h3.find("a");
|
|
124
|
+
if (!linkEl.length) return;
|
|
128
125
|
|
|
129
|
-
const title = titleEl.text().trim();
|
|
130
126
|
let link = linkEl.attr("href") || "";
|
|
131
127
|
|
|
132
|
-
|
|
128
|
+
if (!link || (link.startsWith("/") && !link.startsWith("/url?q=")))
|
|
129
|
+
return;
|
|
130
|
+
if (
|
|
131
|
+
link.includes("google.com/search") ||
|
|
132
|
+
link.includes("support.google.com")
|
|
133
|
+
)
|
|
134
|
+
return;
|
|
135
|
+
|
|
133
136
|
if (link.startsWith("/url?q=")) {
|
|
134
137
|
try {
|
|
135
138
|
link = decodeURIComponent(link.split("/url?q=")[1].split("&")[0]);
|
|
136
139
|
} catch (e) {}
|
|
137
140
|
}
|
|
138
141
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
142
|
+
const title = h3.text().trim();
|
|
143
|
+
if (!title) return;
|
|
144
|
+
|
|
145
|
+
let snippet = "";
|
|
146
|
+
let parent = h3.parent();
|
|
147
|
+
while (parent.length && parent.prop("tagName") !== "BODY") {
|
|
148
|
+
const snippetEl = parent.find(
|
|
149
|
+
"div.VwiC3b, div[style*='-webkit-line-clamp'], div.yXK7lf, div.Uroaid",
|
|
150
|
+
);
|
|
151
|
+
if (snippetEl.length) {
|
|
152
|
+
snippet = snippetEl.first().text().replace(/\s+/g, " ").trim();
|
|
153
|
+
break;
|
|
154
|
+
}
|
|
155
|
+
parent = parent.parent();
|
|
156
|
+
}
|
|
143
157
|
|
|
144
|
-
if (
|
|
145
|
-
results.
|
|
158
|
+
if (link.startsWith("http")) {
|
|
159
|
+
if (!results.some((r) => r.link === link)) {
|
|
160
|
+
results.push({
|
|
161
|
+
position: results.length + 1,
|
|
162
|
+
title,
|
|
163
|
+
link,
|
|
164
|
+
snippet,
|
|
165
|
+
});
|
|
166
|
+
}
|
|
146
167
|
}
|
|
147
168
|
});
|
|
148
169
|
} else {
|
|
149
|
-
// DuckDuckGo selector
|
|
150
170
|
$(".result").each((i, el) => {
|
|
151
171
|
if (results.length >= maxResults) return;
|
|
152
|
-
|
|
153
172
|
const titleEl = $(el).find(".result__title a");
|
|
154
|
-
const snippetEl = $(el).find(".result__snippet");
|
|
155
|
-
if (!titleEl.length) return;
|
|
156
|
-
|
|
157
|
-
const title = titleEl.text().trim();
|
|
158
173
|
let link = titleEl.attr("href") || "";
|
|
159
174
|
|
|
160
175
|
if (link.includes("/l/?uddg=")) {
|
|
@@ -164,17 +179,21 @@ async function executeSearch(query, maxResults, region, safeSearch, engine) {
|
|
|
164
179
|
} catch (e) {}
|
|
165
180
|
}
|
|
166
181
|
|
|
167
|
-
const
|
|
168
|
-
|
|
182
|
+
const title = titleEl.text().trim();
|
|
183
|
+
const snippet = $(el)
|
|
184
|
+
.find(".result__snippet")
|
|
185
|
+
.text()
|
|
186
|
+
.replace(/\s+/g, " ")
|
|
187
|
+
.trim();
|
|
188
|
+
|
|
189
|
+
if (title && link.startsWith("http")) {
|
|
169
190
|
results.push({ position: results.length + 1, title, link, snippet });
|
|
170
191
|
}
|
|
171
192
|
});
|
|
172
193
|
}
|
|
173
194
|
|
|
174
195
|
if (results.length === 0) {
|
|
175
|
-
|
|
176
|
-
logger.warn(`No results found. DOM Sample: ${pageText}`);
|
|
177
|
-
return `No results found on ${engine}. The search engine might be showing a captcha/consent screen, or the query returned nothing. Try rephrasing or switching engines.`;
|
|
196
|
+
return `No results found on ${engine}. The engine may have shown a captcha, or the query returned nothing.`;
|
|
178
197
|
}
|
|
179
198
|
|
|
180
199
|
return (
|
|
@@ -193,8 +212,6 @@ async function executeSearch(query, maxResults, region, safeSearch, engine) {
|
|
|
193
212
|
}
|
|
194
213
|
|
|
195
214
|
async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
|
|
196
|
-
logger.info(`Fetching URL: ${url} | Format: ${format}`);
|
|
197
|
-
|
|
198
215
|
const browser = await browserManager.getBrowser();
|
|
199
216
|
const context = await browser.newContext();
|
|
200
217
|
const page = await context.newPage();
|
|
@@ -214,13 +231,7 @@ async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
|
|
|
214
231
|
try {
|
|
215
232
|
await page.goto(url, { waitUntil: "networkidle", timeout: 15000 });
|
|
216
233
|
} catch (navError) {
|
|
217
|
-
|
|
218
|
-
logger.warn(
|
|
219
|
-
`Network idle timeout on ${url}. Extracting partial DOM...`,
|
|
220
|
-
);
|
|
221
|
-
} else {
|
|
222
|
-
throw navError;
|
|
223
|
-
}
|
|
234
|
+
// Best-effort navigation: any page.goto error (not only a timeout) is swallowed here so whatever DOM has loaded can still be extracted
|
|
224
235
|
}
|
|
225
236
|
|
|
226
237
|
const pageContent = await page.content();
|
|
@@ -231,30 +242,27 @@ async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
|
|
|
231
242
|
} else {
|
|
232
243
|
const $ = cheerio.load(pageContent);
|
|
233
244
|
|
|
234
|
-
// 🚀 NUCLEAR OPTION FOR BASE64 AND TOKENS 🚀
|
|
235
|
-
// Physically scrub out all tags that harbor base64 strings or waste tokens
|
|
236
245
|
$(
|
|
237
246
|
"script, style, nav, header, footer, noscript, iframe, svg, aside, .advertisement, img, picture, video, audio, canvas, map, area, dialog",
|
|
238
247
|
).remove();
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
const src = $(el).attr("src");
|
|
246
|
-
if (src && src.startsWith("data:image")) $(el).removeAttr("src");
|
|
247
|
-
});
|
|
248
|
+
$("*")
|
|
249
|
+
.removeAttr("style")
|
|
250
|
+
.each((i, el) => {
|
|
251
|
+
const src = $(el).attr("src");
|
|
252
|
+
if (src && src.startsWith("data:image")) $(el).removeAttr("src");
|
|
253
|
+
});
|
|
248
254
|
|
|
249
255
|
if (format === "clean_html") {
|
|
250
256
|
finalContent = $.html();
|
|
251
|
-
} else
|
|
257
|
+
} else {
|
|
252
258
|
const turndownService = new TurndownService({
|
|
253
259
|
headingStyle: "atx",
|
|
254
260
|
codeBlockStyle: "fenced",
|
|
255
261
|
});
|
|
256
|
-
finalContent = turndownService
|
|
257
|
-
|
|
262
|
+
finalContent = turndownService
|
|
263
|
+
.turndown($.html())
|
|
264
|
+
.replace(/\n{3,}/g, "\n\n")
|
|
265
|
+
.trim();
|
|
258
266
|
}
|
|
259
267
|
}
|
|
260
268
|
|
|
@@ -263,14 +271,10 @@ async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
|
|
|
263
271
|
startIndex,
|
|
264
272
|
startIndex + maxLength,
|
|
265
273
|
);
|
|
266
|
-
const isTruncated = startIndex + maxLength < totalLength;
|
|
267
|
-
|
|
268
|
-
let metadata = `\n\n---\n[Document Info: Showing characters ${startIndex} to ${
|
|
269
|
-
startIndex + paginatedText.length
|
|
270
|
-
} of ${totalLength} total.`;
|
|
271
274
|
|
|
272
|
-
|
|
273
|
-
|
|
275
|
+
let metadata = `\n\n---\n[Document Info: Showing characters ${startIndex} to ${startIndex + paginatedText.length} of ${totalLength} total.`;
|
|
276
|
+
if (startIndex + maxLength < totalLength) {
|
|
277
|
+
metadata += ` Use start_index=${startIndex + maxLength} to read more.`;
|
|
274
278
|
}
|
|
275
279
|
metadata += `]`;
|
|
276
280
|
|
|
@@ -281,40 +285,41 @@ async function executeFetch(url, format, startIndex, maxLength, blockMedia) {
|
|
|
281
285
|
}
|
|
282
286
|
}
|
|
283
287
|
|
|
284
|
-
|
|
285
|
-
// MCP SERVER INIT & TOOL REGISTRATION
|
|
286
|
-
// ==========================================
|
|
288
|
+
const server = new McpServer({ name: "searchfetch", version: "2.0.0" });
|
|
287
289
|
|
|
288
|
-
|
|
289
|
-
name: "searchfetch",
|
|
290
|
-
version: "1.3.0",
|
|
291
|
-
});
|
|
292
|
-
|
|
293
|
-
server.tool(
|
|
290
|
+
server.registerTool(
|
|
294
291
|
"websearch",
|
|
295
|
-
"Search the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.",
|
|
296
292
|
{
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
.
|
|
300
|
-
|
|
301
|
-
.describe("
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
"
|
|
315
|
-
|
|
293
|
+
title: "Web Search",
|
|
294
|
+
description:
|
|
295
|
+
"Search the web using DuckDuckGo or Google. Returns a clean list of titles, URLs, and snippets. Excellent for researching general knowledge, news, and finding URLs.",
|
|
296
|
+
inputSchema: z.object({
|
|
297
|
+
query: z.string().describe("The search query string."),
|
|
298
|
+
engine: z
|
|
299
|
+
.enum(["duckduckgo", "google"])
|
|
300
|
+
.default("duckduckgo")
|
|
301
|
+
.describe(
|
|
302
|
+
"Search engine to use. Can be 'duckduckgo' or 'google'. Default is 'duckduckgo'.",
|
|
303
|
+
),
|
|
304
|
+
max_results: z
|
|
305
|
+
.number()
|
|
306
|
+
.default(10)
|
|
307
|
+
.describe("Maximum number of results to return. Default is 10."),
|
|
308
|
+
region: z
|
|
309
|
+
.string()
|
|
310
|
+
.default("wt-wt")
|
|
311
|
+
.describe(
|
|
312
|
+
"Region and language code to localize search results (e.g., 'us-en', 'uk-en', 'de-de'). For DuckDuckGo it maps directly. For Google, 'us' is country code and 'en' is language. Default is 'wt-wt' (global/US English).",
|
|
313
|
+
),
|
|
314
|
+
safe_search: z
|
|
315
|
+
.string()
|
|
316
|
+
.default("-1")
|
|
317
|
+
.describe(
|
|
318
|
+
"Safe search filtering mode. '-1' for Moderate, '1' for Strict, '-2' for Off. Default is '-1'. Note: Only applies to DuckDuckGo.",
|
|
319
|
+
),
|
|
320
|
+
}),
|
|
316
321
|
},
|
|
317
|
-
async ({ query,
|
|
322
|
+
async ({ query, max_results, region, safe_search, engine }) => {
|
|
318
323
|
try {
|
|
319
324
|
const result = await executeSearch(
|
|
320
325
|
query,
|
|
@@ -324,46 +329,52 @@ server.tool(
|
|
|
324
329
|
engine,
|
|
325
330
|
);
|
|
326
331
|
return { content: [{ type: "text", text: result }] };
|
|
327
|
-
} catch (
|
|
328
|
-
logger.error("Search Tool failed:", error);
|
|
332
|
+
} catch (err) {
|
|
329
333
|
return {
|
|
330
|
-
content: [{ type: "text", text: `Search Error: ${
|
|
334
|
+
content: [{ type: "text", text: `Search Error: ${err.message}` }],
|
|
331
335
|
isError: true,
|
|
332
336
|
};
|
|
333
337
|
}
|
|
334
338
|
},
|
|
335
339
|
);
|
|
336
340
|
|
|
337
|
-
server.
|
|
341
|
+
server.registerTool(
|
|
338
342
|
"webfetch",
|
|
339
|
-
"Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
|
|
340
343
|
{
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
.
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
"
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
344
|
+
title: "Web Fetch",
|
|
345
|
+
description:
|
|
346
|
+
"Fetch and extract the main text content from any webpage. Fully executes JavaScript to load React/SPAs and aggressively strips images/media (including base64) to save context tokens.",
|
|
347
|
+
inputSchema: z.object({
|
|
348
|
+
url: z
|
|
349
|
+
.url()
|
|
350
|
+
.describe(
|
|
351
|
+
"The full URL of the webpage to fetch (must start with http/https).",
|
|
352
|
+
),
|
|
353
|
+
format: z
|
|
354
|
+
.enum(["markdown", "clean_html", "raw_html"])
|
|
355
|
+
.default("markdown")
|
|
356
|
+
.describe(
|
|
357
|
+
"Output format. Set to 'markdown', 'clean_html', or 'raw_html'. Default is 'markdown' (highly recommended to save context tokens).",
|
|
358
|
+
),
|
|
359
|
+
start_index: z
|
|
360
|
+
.number()
|
|
361
|
+
.default(0)
|
|
362
|
+
.describe(
|
|
363
|
+
"Character offset to start reading from for pagination. Use this if a document is too large to fit in the context window. Default is 0.",
|
|
364
|
+
),
|
|
365
|
+
max_length: z
|
|
366
|
+
.number()
|
|
367
|
+
.default(10000)
|
|
368
|
+
.describe(
|
|
369
|
+
"Maximum characters to return per request. Default is 10000.",
|
|
370
|
+
),
|
|
371
|
+
block_media: z
|
|
372
|
+
.boolean()
|
|
373
|
+
.default(true)
|
|
374
|
+
.describe(
|
|
375
|
+
"Block images, videos, and fonts entirely at the network layer to drastically speed up page loads and dodge tracking pixels. Default is true.",
|
|
376
|
+
),
|
|
377
|
+
}),
|
|
367
378
|
},
|
|
368
379
|
async ({ url, format, start_index, max_length, block_media }) => {
|
|
369
380
|
try {
|
|
@@ -375,35 +386,22 @@ server.tool(
|
|
|
375
386
|
block_media,
|
|
376
387
|
);
|
|
377
388
|
return { content: [{ type: "text", text: result }] };
|
|
378
|
-
} catch (
|
|
379
|
-
logger.error(`Fetch Tool failed on ${url}:`, error);
|
|
389
|
+
} catch (err) {
|
|
380
390
|
return {
|
|
381
|
-
content: [{ type: "text", text: `Fetch Error: ${
|
|
391
|
+
content: [{ type: "text", text: `Fetch Error: ${err.message}` }],
|
|
382
392
|
isError: true,
|
|
383
393
|
};
|
|
384
394
|
}
|
|
385
395
|
},
|
|
386
396
|
);
|
|
387
397
|
|
|
388
|
-
// ==========================================
|
|
389
|
-
// BOOTSTRAP
|
|
390
|
-
// ==========================================
|
|
391
|
-
|
|
392
398
|
async function main() {
|
|
393
|
-
logger.info("Initializing MCP Server...");
|
|
394
|
-
|
|
395
399
|
await ensureBinary();
|
|
396
|
-
|
|
397
|
-
// Re-enable STDOUT right before protocol hook-in
|
|
398
400
|
process.stdout.write = originalStdoutWrite;
|
|
399
|
-
|
|
400
401
|
const transport = new StdioServerTransport();
|
|
401
402
|
await server.connect(transport);
|
|
402
|
-
|
|
403
|
-
logger.info("searchfetch successfully connected and listening for requests.");
|
|
404
403
|
}
|
|
405
404
|
|
|
406
405
|
main().catch((err) => {
|
|
407
|
-
logger.error("Fatal error during startup:", err);
|
|
408
406
|
process.exit(1);
|
|
409
407
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "searchfetch",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "Fault-tolerant MCP Server for Stealth Web Search and Fetching",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
},
|
|
19
19
|
"scripts": {
|
|
20
20
|
"start": "node ./index.js",
|
|
21
|
-
"inspector": "npx @modelcontextprotocol/inspector node ./index.js"
|
|
21
|
+
"inspector-js": "npx @modelcontextprotocol/inspector node ./index.js",
|
|
22
|
+
"inspector-py": "npx @modelcontextprotocol/inspector uv run python ./server.py"
|
|
22
23
|
},
|
|
23
24
|
"dependencies": {
|
|
24
25
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
@@ -27,10 +28,5 @@
|
|
|
27
28
|
"playwright-core": "^1.59.1",
|
|
28
29
|
"turndown": "^7.2.4",
|
|
29
30
|
"zod": "^4.4.3"
|
|
30
|
-
},
|
|
31
|
-
"devDependencies": {
|
|
32
|
-
"@types/cheerio": "^0.22.35",
|
|
33
|
-
"@types/node": "^25.6.2",
|
|
34
|
-
"@types/turndown": "^5.0.6"
|
|
35
31
|
}
|
|
36
32
|
}
|