@zhafron/mcp-web-search 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +140 -0
- package/dist/src/chrome.js +66 -0
- package/dist/src/engines.js +164 -0
- package/dist/src/extract.js +187 -0
- package/dist/src/extractors/markdown.js +40 -0
- package/dist/src/extractors/readability-alt.js +110 -0
- package/dist/src/extractors/truncation.js +246 -0
- package/dist/src/extractors/types.js +7 -0
- package/dist/src/server.js +92 -0
- package/dist/src/wikipedia.js +102 -0
- package/package.json +64 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 tickernelz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# mcp-web-search
|
|
2
|
+
|
|
3
|
+
MCP server: web search, Wikipedia summaries, and URL content extraction. No API keys required.
|
|
4
|
+
|
|
5
|
+
Version: 1.0.0
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- search_web - Two-tier web search (fast tier: DuckDuckGo HTML; deep tier: Bing via Puppeteer)
|
|
10
|
+
- fetch_url - Extract content from URLs with semantic truncation
|
|
11
|
+
- summarize_url - Fetch and summarize URL content
|
|
12
|
+
- wiki_get - Wikipedia summary by language
|
|
13
|
+
- wiki_multi - Wikipedia summaries in multiple languages
|
|
14
|
+
|
|
15
|
+
## Requirements
|
|
16
|
+
|
|
17
|
+
- Node.js 18+
|
|
18
|
+
- Windows/macOS/Linux
|
|
19
|
+
- Chrome/Chromium (for deep search mode)
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npm install
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Chrome Installation
|
|
28
|
+
|
|
29
|
+
| OS | Command |
|
|
30
|
+
|----|---------|
|
|
31
|
+
| Ubuntu/Debian | sudo apt install chromium-browser |
|
|
32
|
+
| Fedora | sudo dnf install chromium |
|
|
33
|
+
| Arch | sudo pacman -S chromium |
|
|
34
|
+
| macOS | brew install --cask google-chrome |
|
|
35
|
+
|
|
36
|
+
Custom path: `export CHROME_PATH=/path/to/chrome`
|
|
37
|
+
|
|
38
|
+
## Commands
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
npm run dev # Development
|
|
42
|
+
npm run build # Build
|
|
43
|
+
npm run start # Production
|
|
44
|
+
npm test # Run tests
|
|
45
|
+
npm run format # Format code
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Environment Variables
|
|
49
|
+
|
|
50
|
+
| Variable | Default | Description |
|
|
51
|
+
|----------|---------|-------------|
|
|
52
|
+
| USER_AGENT | mcp-web-search/1.0 | User agent string |
|
|
53
|
+
| HTTP_TIMEOUT | 15000 | Request timeout (ms) |
|
|
54
|
+
| MAX_RESULTS | 10 | Default search limit |
|
|
55
|
+
| LANG_DEFAULT | en | Default language |
|
|
56
|
+
| MAX_BYTES | 20971520 | Max download size (bytes) |
|
|
57
|
+
| CHROME_PATH | auto-detect | Chrome executable path |
|
|
58
|
+
|
|
59
|
+
SSRF Protection: Blocks localhost, 127.0.0.1, ::1, and .local/.localhost domains.
|
|
60
|
+
|
|
61
|
+
## Tool Reference
|
|
62
|
+
|
|
63
|
+
### search_web
|
|
64
|
+
|
|
65
|
+
Two-tier web search.
|
|
66
|
+
|
|
67
|
+
Input: `{ q: string, limit?: number, lang?: string, mode?: "fast"|"deep"|"auto" }`
|
|
68
|
+
|
|
69
|
+
Output: `{ items: Array<{ title, url, snippet?, source }>, modeUsed, enginesUsed, escalated }`
|
|
70
|
+
|
|
71
|
+
Example: `{ "q": "Node.js LTS", "mode": "fast", "limit": 5 }`
|
|
72
|
+
|
|
73
|
+
### fetch_url
|
|
74
|
+
|
|
75
|
+
Extract content with intelligent truncation.
|
|
76
|
+
|
|
77
|
+
Input: `{ url: string, mode?: "compact"|"standard"|"full", max_length?: number, format?: "markdown"|"text"|"html" }`
|
|
78
|
+
|
|
79
|
+
| Mode | Characters | Tokens | Use Case |
|
|
80
|
+
|------|------------|--------|----------|
|
|
81
|
+
| compact | ~3000 | ~750 | Quick summaries |
|
|
82
|
+
| standard | ~8000 | ~2000 | Balanced (default) |
|
|
83
|
+
| full | unlimited | - | Full content |
|
|
84
|
+
|
|
85
|
+
max_length: Exact character limit (1000-100000), overrides mode.
|
|
86
|
+
|
|
87
|
+
format: Output format (markdown, text, html). Default: markdown.
|
|
88
|
+
|
|
89
|
+
Truncation: Semantic chunking prioritizes headings, code blocks, conclusions.
|
|
90
|
+
|
|
91
|
+
Output: `{ markdown?, text?, format, url, title?, truncated?, original_length?, truncation_ratio? }`
|
|
92
|
+
|
|
93
|
+
Examples:
|
|
94
|
+
- `{ "url": "https://example.com", "mode": "compact" }`
|
|
95
|
+
- `{ "url": "https://example.com", "format": "text" }`
|
|
96
|
+
- `{ "url": "https://example.com", "format": "markdown" }`
|
|
97
|
+
- `{ "url": "https://example.com", "max_length": 5000 }`
|
|
98
|
+
|
|
99
|
+
### summarize_url
|
|
100
|
+
|
|
101
|
+
Fetch and summarize URL content.
|
|
102
|
+
|
|
103
|
+
Input: `{ url: string }`
|
|
104
|
+
|
|
105
|
+
### wiki_get
|
|
106
|
+
|
|
107
|
+
Wikipedia summary by language.
|
|
108
|
+
|
|
109
|
+
Input: `{ title: string, lang?: string }`
|
|
110
|
+
|
|
111
|
+
Output: `{ lang, title, url, description?, extract?, thumbnailUrl? }`
|
|
112
|
+
|
|
113
|
+
### wiki_multi
|
|
114
|
+
|
|
115
|
+
Wikipedia summaries in multiple languages.
|
|
116
|
+
|
|
117
|
+
Input: `{ term: string, baseLang?: string, langs?: string[] }`
|
|
118
|
+
|
|
119
|
+
## Quick Examples
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
search_web: { "q": "App Intents", "mode": "deep", "limit": 5 }
|
|
123
|
+
fetch_url: { "url": "https://example.com", "mode": "compact" }
|
|
124
|
+
summarize_url: { "url": "https://python.org/pep-8" }
|
|
125
|
+
wiki_get: { "title": "Lambda calculus", "lang": "en" }
|
|
126
|
+
wiki_multi: { "term": "AI", "langs": ["en", "es", "fr"] }
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Troubleshooting
|
|
130
|
+
|
|
131
|
+
| Issue | Solution |
|
|
132
|
+
|-------|----------|
|
|
133
|
+
| Chrome not found | Install Chrome or set CHROME_PATH |
|
|
134
|
+
| CAPTCHA/blocks | Reduce frequency, use fast mode |
|
|
135
|
+
| Timeout | Increase HTTP_TIMEOUT, check MAX_BYTES |
|
|
136
|
+
| Blocked URL | SSRF protection, public URLs only |
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { existsSync } from "fs";
|
|
2
|
+
import { platform } from "os";
|
|
3
|
+
/**
 * Error raised when no usable Chrome/Chromium executable can be located.
 * Carries the distinguishing `name` "ChromeNotFoundError".
 */
export class ChromeNotFoundError extends Error {
  name = "ChromeNotFoundError";

  /**
   * @param {string} message - Human-readable explanation.
   */
  constructor(message) {
    super(message);
  }
}
|
|
9
|
+
/**
 * List the well-known install locations of Chrome/Chromium for a platform,
 * in the order they should be probed.
 *
 * Paths that depend on an environment variable are only produced when that
 * variable is set. (Concatenating an unset variable would otherwise yield a
 * bogus, truthy string such as "undefined\\Google\\...".)
 *
 * @param {string} [plat] - Platform identifier as returned by `os.platform()`.
 * @param {Record<string, string | undefined>} [env] - Environment to read roots from.
 * @returns {string[]} Candidate executable paths.
 */
function getDefaultChromePaths(plat = platform(), env = process.env) {
  if (plat === "win32") {
    const roots = [env.LOCALAPPDATA, env.PROGRAMFILES, env["PROGRAMFILES(X86)"]].filter(Boolean);
    const suffixes = [
      "\\Google\\Chrome\\Application\\chrome.exe",
      "\\Chromium\\Application\\chrome.exe"
    ];
    // All Google Chrome locations first, then all Chromium locations.
    return suffixes.flatMap(suffix => roots.map(root => root + suffix));
  }
  if (plat === "darwin") {
    const bundles = [
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
      "/Applications/Chromium.app/Contents/MacOS/Chromium"
    ];
    // Per-user installs live under $HOME/Applications; skip them if HOME is unset.
    const userBundles = env.HOME ? bundles.map(bundle => env.HOME + bundle) : [];
    return [...bundles, ...userBundles];
  }
  // Everything else is treated as Linux-like.
  return [
    "/usr/bin/google-chrome",
    "/usr/bin/google-chrome-stable",
    "/usr/bin/chromium",
    "/usr/bin/chromium-browser",
    "/snap/bin/chromium",
    "/usr/local/bin/chrome",
    "/usr/local/bin/chromium"
  ];
}
|
|
39
|
+
/**
 * Resolve the path of a Chrome/Chromium executable.
 *
 * An explicit CHROME_PATH wins and must exist; otherwise the platform's
 * default locations are probed in order.
 *
 * @returns {string} Path of an existing executable.
 * @throws {ChromeNotFoundError} When CHROME_PATH points nowhere, or nothing is
 *   found in the default locations (the message includes install hints).
 */
export function findChrome() {
  const override = process.env.CHROME_PATH;
  if (override) {
    if (!existsSync(override)) {
      throw new ChromeNotFoundError(`Chrome not found at CHROME_PATH: ${override}`);
    }
    return override;
  }

  const detected = getDefaultChromePaths().find(candidate => existsSync(candidate));
  if (detected !== undefined) {
    return detected;
  }

  // Nothing found: pick platform-specific installation advice for the error.
  let installInstructions;
  switch (platform()) {
    case "win32":
      installInstructions = "Download from: https://www.google.com/chrome/";
      break;
    case "darwin":
      installInstructions =
        "Install via: brew install --cask google-chrome\nOr download from: https://www.google.com/chrome/";
      break;
    default:
      installInstructions =
        "Install via:\n  Ubuntu/Debian: sudo apt install chromium-browser\n  Fedora: sudo dnf install chromium\n  Arch: sudo pacman -S chromium\nOr download from: https://www.google.com/chrome/";
  }
  throw new ChromeNotFoundError(`Chrome/Chromium not found on system.\n\n${installInstructions}\n\nAlternatively, set CHROME_PATH environment variable to your Chrome executable.`);
}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom";
|
|
2
|
+
import puppeteer from "puppeteer-core";
|
|
3
|
+
import { findChrome } from "./chrome.js";
|
|
4
|
+
/**
 * Build the request headers shared by the search engines.
 *
 * @param {string} [lang] - Preferred language; defaults to LANG_DEFAULT or "en".
 * @returns {{"User-Agent": string, "Accept-Language": string}}
 */
function uaHeaders(lang = process.env.LANG_DEFAULT || "en") {
  const headers = { "User-Agent": process.env.USER_AGENT || "mcp-web-search/1.0" };
  // English gets the regional variant first; other languages fall back to English.
  if (lang === "en") {
    headers["Accept-Language"] = "en-US,en;q=0.9";
  } else {
    headers["Accept-Language"] = `${lang};q=0.9,en;q=0.8`;
  }
  return headers;
}
|
|
9
|
+
/**
 * Parse a numeric setting, falling back when it is missing or unusable.
 *
 * @param {unknown} env - Raw value (typically an environment variable string).
 * @param {number} def - Value returned when `env` is not a positive finite number.
 * @returns {number}
 */
function toMs(env, def) {
  const parsed = Number(env);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return def;
  }
  return parsed;
}
|
|
13
|
+
/**
 * `fetch` that aborts if no response arrives within `timeoutMs`.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
 * the response only, not the later reading of the body. Any `signal` in
 * `init` is replaced by the internal one.
 *
 * @param {string | URL} input - Request target.
 * @param {object} [init] - Options forwarded to `fetch`.
 * @param {number} [timeoutMs] - Milliseconds before the request is aborted.
 * @returns {Promise<Response>}
 */
async function fetchWithTimeout(input, init = {}, timeoutMs = 15000) {
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), timeoutMs);
  const request = { ...init, signal: aborter.signal };
  try {
    const response = await fetch(input, request);
    return response;
  } finally {
    clearTimeout(timer);
  }
}
|
|
23
|
+
// Timeout in milliseconds passed to fetchWithTimeout by this module; read from the HTTP_TIMEOUT env var, default 15000.
const HTTP_TIMEOUT = toMs(process.env.HTTP_TIMEOUT, 15000);
|
|
24
|
+
/**
 * Unwrap a DuckDuckGo redirect link (`duckduckgo.com/l/?uddg=<target>`).
 *
 * @param {string} href - Link as found in the result page; may be relative or
 *   protocol-relative, in which case it is resolved against duckduckgo.com.
 * @returns {string} The redirect target if `href` is a redirect link, the
 *   absolute form of `href` otherwise, or `href` untouched if it cannot be parsed.
 */
function decodeDuckDuckGoRedirect(href) {
  try {
    const u = new URL(href, "https://duckduckgo.com/");
    if (u.hostname === "duckduckgo.com" && u.pathname.startsWith("/l/")) {
      // searchParams.get() already percent-decodes the value. Decoding a second
      // time would corrupt targets that legitimately contain "%25" and throw
      // on a bare "%".
      const real = u.searchParams.get("uddg");
      if (real)
        return real;
    }
    return u.toString();
  }
  catch {
    return href;
  }
}
|
|
38
|
+
/**
 * Fast tier: query the DuckDuckGo HTML endpoint and scrape the result list.
 *
 * @param {string} q - Search query.
 * @param {number} limit - Maximum number of items to return.
 * @param {string} lang - Language used for the Accept-Language header.
 * @returns {Promise<Array<{title: string, url: string, snippet?: string, source: "ddg_html"}>>}
 * @throws {Error} When the endpoint answers with a non-2xx status.
 */
async function ddgHtmlSearch(q, limit, lang) {
  const url = new URL("https://html.duckduckgo.com/html/");
  url.searchParams.set("q", q);
  const res = await fetchWithTimeout(url, { headers: uaHeaders(lang) }, HTTP_TIMEOUT);
  if (!res.ok)
    throw new Error(`DuckDuckGo HTML ${res.status}`);
  const html = await res.text();
  const dom = new JSDOM(html, { url: "https://duckduckgo.com/?q=" + encodeURIComponent(q) });
  const doc = dom.window.document;
  const anchors = Array.from(doc.querySelectorAll("a.result__a"));
  // Positional list, used only when a result has no enclosing container.
  const snippets = Array.from(doc.querySelectorAll(".result__snippet"));
  const items = [];
  for (let i = 0; i < anchors.length && items.length < limit; i++) {
    const a = anchors[i];
    const title = (a.textContent || "").trim();
    const href = decodeDuckDuckGoRedirect(a.getAttribute("href") || "");
    if (!title || !href)
      continue;
    // Look the snippet up inside this result's own container. Pairing anchors
    // and snippets by index misattributes every later snippet as soon as one
    // result has none.
    // NOTE(review): assumes each result is wrapped in an element with class
    // "result"; confirm against current DuckDuckGo markup.
    const container = a.closest(".result");
    const snippetNode = container ? container.querySelector(".result__snippet") : snippets[i];
    const sn = (snippetNode?.textContent || "").trim() || undefined;
    try {
      const u = new URL(href);
      items.push({ title, url: u.toString(), snippet: sn, source: "ddg_html" });
    }
    catch {
      // Deliberately skipped: href is not an absolute URL, so it is not a usable result.
    }
  }
  return items;
}
|
|
65
|
+
/**
 * Deep tier: load a Bing results page in headless Chrome and scrape it.
 *
 * @param {string} q - Search query.
 * @param {number} limit - Maximum number of items to collect.
 * @param {string} lang - Language for Accept-Language and Bing's `setlang`.
 * @returns {Promise<Array<{title: string, url: string, snippet?: string, source: "bing_puppeteer"}>>}
 * @throws Whatever `findChrome`, `puppeteer.launch` or page navigation throws.
 */
async function bingPuppeteerSearch(q, limit, lang) {
  const launchFlags = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--no-first-run",
    "--no-zygote",
    "--disable-gpu"
  ];
  const browser = await puppeteer.launch({
    executablePath: findChrome(),
    headless: true,
    args: launchFlags
  });

  // Runs inside the page, so it must not reference anything from this module.
  const scrape = maxResults => {
    const collected = [];
    for (const card of Array.from(document.querySelectorAll("li.b_algo"))) {
      const link = card.querySelector("h2 a");
      if (!link)
        continue;
      const title = link.textContent?.trim() || "";
      const href = link.getAttribute("href");
      if (!href || !title)
        continue;
      // Prefer the caption paragraph; fall back to the snippet div.
      const snippetSource = card.querySelector("div.b_caption p") ?? card.querySelector("div.b_snippet");
      const snippet = snippetSource?.textContent?.trim() || "";
      try {
        new URL(href); // keep absolute URLs only
        collected.push({ title, url: href, snippet: snippet || undefined });
      }
      catch { }
      if (collected.length >= maxResults)
        break;
    }
    return collected;
  };

  try {
    const page = await browser.newPage();
    await page.setUserAgent(`${process.env.USER_AGENT || "mcp-web-search/1.0"} Puppeteer`);
    const acceptLanguage = lang === "en" ? "en-US,en;q=0.9" : `${lang};q=0.9,en;q=0.8`;
    await page.setExtraHTTPHeaders({ "Accept-Language": acceptLanguage });

    const target = new URL("https://www.bing.com/search");
    target.searchParams.set("q", q);
    if (lang) {
      target.searchParams.set("setlang", lang);
    }
    await page.goto(target.toString(), { waitUntil: "domcontentloaded", timeout: 30000 });

    const hits = await page.evaluate(scrape, limit);
    return hits.map(hit => ({ ...hit, source: "bing_puppeteer" }));
  } finally {
    // Always shut the browser down, even if navigation or scraping failed.
    await browser.close();
  }
}
|
|
129
|
+
/**
 * Run a web search using the fast tier (DuckDuckGo HTML), the deep tier
 * (Bing via Puppeteer), or both.
 *
 * Modes:
 *  - "fast": DuckDuckGo only.
 *  - "deep": Bing only.
 *  - "auto" (default): DuckDuckGo first; escalate to Bing when it yields fewer
 *    than min(3, limit) results, then merge both lists.
 *
 * @param {{q: string, limit?: number, lang?: string, mode?: "fast"|"deep"|"auto"}} opts
 * @returns {Promise<{items: Array<object>, modeUsed: string, enginesUsed: string[], escalated: boolean, diagnostics: object}>}
 * @throws Errors from the engines. In auto mode a deep-tier failure is only
 *   rethrown when the fast tier produced nothing to fall back on.
 */
export async function runTwoTierSearch(opts) {
  const { q } = opts;
  // A non-numeric limit used to turn into NaN, which silently produced zero
  // results; fall back to the configured default instead.
  const defaultLimit = Number(process.env.MAX_RESULTS) || 10;
  const requested = Number(opts.limit ?? defaultLimit);
  const limit = Math.trunc(Math.max(1, Math.min(Number.isNaN(requested) ? defaultLimit : requested, 50)));
  const lang = opts.lang ?? (process.env.LANG_DEFAULT || "en");
  const mode = opts.mode ?? "auto";
  const enginesUsed = [];
  const diagnostics = {};
  if (mode === "fast") {
    const fast = await ddgHtmlSearch(q, limit, lang);
    enginesUsed.push("ddg_html");
    diagnostics["fastCount"] = fast.length;
    return { items: fast, modeUsed: "fast", enginesUsed, escalated: false, diagnostics };
  }
  if (mode === "deep") {
    const deep = await bingPuppeteerSearch(q, limit, lang);
    enginesUsed.push("bing_puppeteer");
    diagnostics["deepCount"] = deep.length;
    return { items: deep, modeUsed: "deep", enginesUsed, escalated: false, diagnostics };
  }
  const fast = await ddgHtmlSearch(q, limit, lang);
  enginesUsed.push("ddg_html");
  diagnostics["fastCount"] = fast.length;
  if (fast.length >= Math.min(3, limit)) {
    return { items: fast, modeUsed: "auto", enginesUsed, escalated: false, diagnostics };
  }
  let deep;
  try {
    deep = await bingPuppeteerSearch(q, limit, lang);
  }
  catch (err) {
    // With nothing from the fast tier the caller needs to see the failure
    // (e.g. Chrome missing). Otherwise keep the results we already have.
    if (fast.length === 0)
      throw err;
    diagnostics["deepError"] = err instanceof Error ? err.message : String(err);
    return { items: fast, modeUsed: "auto", enginesUsed, escalated: false, diagnostics };
  }
  enginesUsed.push("bing_puppeteer");
  diagnostics["deepCount"] = deep.length;
  // Both engines often return the same page; keep the first occurrence of each URL.
  const seen = new Set();
  const merged = [];
  for (const item of [...fast, ...deep]) {
    if (seen.has(item.url))
      continue;
    seen.add(item.url);
    merged.push(item);
  }
  return {
    items: merged.slice(0, limit),
    modeUsed: "auto",
    enginesUsed,
    escalated: true,
    diagnostics
  };
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom";
|
|
2
|
+
import { Readability } from "@mozilla/readability";
|
|
3
|
+
import { extractWithReadabilityAlt } from "./extractors/readability-alt.js";
|
|
4
|
+
import { htmlToMarkdown } from "./extractors/markdown.js";
|
|
5
|
+
import { applySmartTruncation } from "./extractors/truncation.js";
|
|
6
|
+
/**
 * Build the headers sent when fetching a page for extraction.
 * Reads USER_AGENT and LANG_DEFAULT from the environment.
 *
 * @returns {{"User-Agent": string, "Accept-Language": string}}
 */
function uaHeaders() {
  const language = process.env.LANG_DEFAULT || "en";
  const isEnglish = language === "en";
  return {
    "User-Agent": process.env.USER_AGENT || "mcp-web-search/1.0",
    "Accept-Language": isEnglish ? "en-US,en;q=0.9" : `${language};q=0.9,en;q=0.8`
  };
}
|
|
12
|
+
/**
 * Convert a raw setting to a number, using `def` unless the result is a
 * positive finite value.
 *
 * @param {unknown} env - Raw value, usually an environment variable string.
 * @param {number} def - Fallback.
 * @returns {number}
 */
function toMs(env, def) {
  const value = Number(env);
  const usable = Number.isFinite(value) && value > 0;
  if (usable) {
    return value;
  }
  return def;
}
|
|
16
|
+
/**
 * Perform a `fetch` that is aborted when `timeoutMs` elapses before the
 * response arrives. The abort timer is cancelled once `fetch` settles, so
 * reading the body afterwards is not covered by the timeout. A `signal`
 * supplied in `init` is overridden.
 *
 * @param {string | URL} input - Request target.
 * @param {object} [init] - Options forwarded to `fetch`.
 * @param {number} [timeoutMs] - Abort delay in milliseconds.
 * @returns {Promise<Response>}
 */
async function fetchWithTimeout(input, init = {}, timeoutMs = 15000) {
  const abortController = new AbortController();
  const { signal } = abortController;
  const handle = setTimeout(() => {
    abortController.abort();
  }, timeoutMs);
  try {
    return await fetch(input, { ...init, signal });
  } finally {
    clearTimeout(handle);
  }
}
|
|
26
|
+
// Timeout in milliseconds for the page fetch; read from the HTTP_TIMEOUT env var, default 15000.
const HTTP_TIMEOUT = toMs(process.env.HTTP_TIMEOUT, 15000);
|
|
27
|
+
// Largest response body accepted, in bytes (default 20 MiB). toMs is reused here only as a
// "positive finite number or default" parser; the value is not a duration.
const MAX_BYTES = toMs(process.env.MAX_BYTES, 20 * 1024 * 1024);
|
|
28
|
+
/**
 * SSRF guard: decide whether a hostname refers to the local machine or a
 * private/link-local network and must not be fetched.
 *
 * Blocks `localhost`, `*.localhost`, `*.local`, and literal IP addresses in
 * loopback, unspecified, RFC 1918 private, link-local and IPv6 unique-local
 * ranges (including IPv4-mapped IPv6).
 *
 * Only the literal hostname is inspected; names that resolve to private
 * addresses via DNS are not detected here.
 *
 * @param {string} hostname - Host as given by `URL.hostname` (IPv6 literals
 *   arrive wrapped in brackets) or a bare host/IP.
 * @returns {boolean} True when the host must be rejected.
 */
function isBlockedHost(hostname) {
  // URL.hostname yields "[::1]" for IPv6 literals; strip the brackets and any
  // trailing root dot so the comparisons below can match.
  const host = hostname.toLowerCase().replace(/^\[|\]$/g, "").replace(/\.$/, "");
  if (host === "localhost" || host.endsWith(".localhost") || host.endsWith(".local"))
    return true;

  const v4 = host.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
  if (v4) {
    const a = Number(v4[1]);
    const b = Number(v4[2]);
    return (a === 0 || // "this network", incl. 0.0.0.0
      a === 10 || // 10.0.0.0/8
      a === 127 || // loopback 127.0.0.0/8
      (a === 169 && b === 254) || // link-local, incl. cloud metadata endpoints
      (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
      (a === 192 && b === 168)); // 192.168.0.0/16
  }

  if (host.includes(":")) {
    if (host === "::1" || host === "::")
      return true; // loopback / unspecified
    if (/^f[cd][0-9a-f]{0,2}:/.test(host))
      return true; // fc00::/7 unique local
    if (/^fe[89ab][0-9a-f]?:/.test(host))
      return true; // fe80::/10 link-local
    // IPv4-mapped IPv6, either dotted (::ffff:127.0.0.1) or in the hex form
    // the URL parser normalises to (::ffff:7f00:1).
    const dotted = host.match(/^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/);
    if (dotted)
      return isBlockedHost(dotted[1]);
    const hex = host.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/);
    if (hex) {
      const hi = parseInt(hex[1], 16);
      const lo = parseInt(hex[2], 16);
      return isBlockedHost(`${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`);
    }
  }
  return false;
}
|
|
36
|
+
/**
 * Last-resort extraction using Mozilla Readability, with the raw body text
 * as a final fallback. Never throws: any failure yields `{ text: "" }`.
 *
 * @param {string} html - Page markup.
 * @param {string} url - Page URL, handed to JSDOM as the document URL.
 * @returns {{text: string, title?: string, byline?: string, siteName?: string}}
 */
function fallbackExtraction(html, url) {
  try {
    const { document } = new JSDOM(html, { url }).window;
    const article = new Readability(document).parse();
    if (!article) {
      // Readability found no article: return the body's plain text.
      return { text: document.body.textContent || "", title: document.title };
    }
    const { title, byline, siteName, textContent } = article;
    return {
      title: title ?? undefined,
      byline: byline ?? undefined,
      siteName: siteName ?? undefined,
      text: textContent ?? ""
    };
  } catch {
    return { text: "" };
  }
}
|
|
56
|
+
/**
 * Append the truncation bookkeeping fields to a result object.
 *
 * @param {object} fields - Leading result fields (content, url, title, ...).
 * @param {{truncated: boolean, original_length: number, final_length: number}} truncation
 * @returns {object} `fields` plus `truncated`, `original_length`, `truncation_ratio`.
 */
function withTruncationInfo(fields, truncation) {
  return {
    ...fields,
    truncated: truncation.truncated,
    original_length: truncation.original_length,
    truncation_ratio: truncation.truncated
      ? truncation.final_length / truncation.original_length
      : undefined
  };
}
/**
 * Download a URL and extract its readable content (HTML page or PDF),
 * truncated according to `options`.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {{mode?: string, max_length?: number, format?: "markdown"|"text"|"html"}} [options]
 *   Forwarded to `applySmartTruncation`; `format` selects the output (default "markdown").
 * @returns {Promise<object>} Result with `markdown` or `text`, plus `url`,
 *   `format`, optional `title`/`length`/`byline`/`siteName`, and truncation info.
 * @throws {Error} For blocked hosts (before or after redirects), non-2xx
 *   responses, or bodies larger than MAX_BYTES.
 */
export async function fetchAndExtract(url, options) {
  const u = new URL(url);
  if (isBlockedHost(u.hostname)) {
    throw new Error("Blocked localhost/private URL");
  }
  const res = await fetchWithTimeout(u.toString(), { redirect: "follow", headers: uaHeaders() }, HTTP_TIMEOUT);
  // Redirects are followed automatically, so a public URL can land on a
  // blocked host; refuse to hand back content from such a destination.
  if (res.redirected && res.url && isBlockedHost(new URL(res.url).hostname)) {
    throw new Error("Blocked localhost/private URL");
  }
  if (!res.ok)
    throw new Error(`Fetch ${res.status} for ${url}`);
  // Reject early on a declared size, then re-check what was actually received.
  const len = Number(res.headers.get("content-length") || "0");
  if (len > 0 && len > MAX_BYTES)
    throw new Error(`Content too large: ${len} bytes`);
  const ct = res.headers.get("content-type") || "";
  const buf = Buffer.from(await res.arrayBuffer());
  if (buf.byteLength > MAX_BYTES)
    throw new Error(`Content too large (downloaded)`);

  if (ct.includes("application/pdf") || u.pathname.toLowerCase().endsWith(".pdf")) {
    const pdfParse = (await import("pdf-parse")).default;
    const data = await pdfParse(buf);
    const truncation = applySmartTruncation(data.text || "", "text", options);
    return withTruncationInfo({
      text: truncation.content,
      url,
      title: data.info?.Title,
      length: data.numpages,
      format: "text"
    }, truncation);
  }

  const html = buf.toString("utf8");
  const extracted = extractWithReadabilityAlt(html, url);
  const requestedFormat = options?.format || "markdown";

  if (extracted && extracted.textContent && extracted.textContent.length > 0) {
    const markdown = htmlToMarkdown(extracted.content);
    const title = extracted.title || undefined;
    const asText = () => {
      const truncation = applySmartTruncation(extracted.textContent, "text", options);
      return withTruncationInfo({ title, text: truncation.content, url, length: extracted.length, format: "text" }, truncation);
    };
    const asMarkdown = (source) => {
      const truncation = applySmartTruncation(source, "markdown", options);
      return withTruncationInfo({ title, markdown: truncation.content, url, length: extracted.length, format: "markdown" }, truncation);
    };
    if (requestedFormat === "text")
      return asText();
    if (requestedFormat === "html" && extracted.content) {
      // NOTE(review): HTML is returned under the `markdown` key with
      // format "markdown" and truncated as markdown. This is kept as-is
      // because consumers are not visible here; confirm whether a distinct
      // "html" format is expected.
      return asMarkdown(extracted.content);
    }
    // Markdown is the default; fall back to plain text when conversion failed.
    return markdown ? asMarkdown(markdown) : asText();
  }

  const fallback = fallbackExtraction(html, url);
  const truncation = applySmartTruncation(fallback.text, "text", options);
  return withTruncationInfo({
    title: fallback.title,
    byline: fallback.byline,
    siteName: fallback.siteName,
    text: truncation.content,
    url,
    format: "text"
  }, truncation);
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
// Shared HTML-to-Markdown converter: ATX headings, fenced code, "-" bullets, inline links.
const turndownService = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  bulletListMarker: "-",
  emDelimiter: "*",
  strongDelimiter: "**",
  linkStyle: "inlined"
});
// Drop elements that carry no text. Void content elements (IMG/BR/HR) are
// kept, and so are wrappers containing an image (e.g. <a><img></a> or
// <p><img></p>), which would otherwise be discarded together with the image.
turndownService.addRule("removeEmptyElements", {
  filter: (node) => {
    if (["IMG", "BR", "HR"].includes(node.nodeName))
      return false;
    if (node.textContent?.trim() !== "")
      return false;
    return !node.querySelector?.("img");
  },
  replacement: () => ""
});
// Render <pre> as a fenced block and stand-alone <code> as inline code.
turndownService.addRule("preserveCodeBlocks", {
  filter: ["pre", "code"],
  replacement: (content, node) => {
    if (node.nodeName === "PRE") {
      const code = node.querySelector("code");
      const lang = code?.className.match(/language-(\w+)/)?.[1] || "";
      // Use the raw text. `content` has already been converted, so a nested
      // <code> would arrive wrapped in backticks inside the fence.
      const raw = (node.textContent ?? "").replace(/\n$/, "");
      return `\n\`\`\`${lang}\n${raw}\n\`\`\`\n`;
    }
    if (node.parentNode?.nodeName === "PRE") {
      // The enclosing <pre> emits the fence from raw text; add no backticks here.
      return content;
    }
    return `\`${content}\``;
  }
});
|
|
27
|
+
/**
 * Convert an HTML fragment to Markdown.
 *
 * @param {string} html - Markup to convert.
 * @returns {string | null} Trimmed Markdown, or null when the input or the
 *   output is empty/whitespace, or when conversion throws (the error is logged).
 */
export function htmlToMarkdown(html) {
  const isBlank = (value) => !value || value.trim().length === 0;
  try {
    if (isBlank(html)) {
      return null;
    }
    const converted = turndownService.turndown(html);
    return isBlank(converted) ? null : converted.trim();
  } catch (error) {
    console.error("Markdown conversion failed:", error);
    return null;
  }
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom";
|
|
2
|
+
import { DEFAULT_CONFIG } from "./types.js";
|
|
3
|
+
/**
 * Heuristic "how likely is this the main content" score for a DOM element.
 *
 * The score grows with trimmed text length (scaled by a per-tag boost), text
 * density per child element, and the number of descendant <p> elements.
 *
 * @param {Element} node - Candidate element.
 * @param {{minTextLength: number, tagBoosts: Object<string, number>, ignoreClasses: RegExp}} config
 * @returns {number} 0 when the text is shorter than `config.minTextLength` or
 *   the class/id matches `config.ignoreClasses`; otherwise a positive score.
 */
function scoreNode(node, config) {
  const textLength = (node.textContent || "").trim().length;
  if (textLength < config.minTextLength) {
    return 0;
  }

  const tagName = node.tagName.toLowerCase();
  const boost = config.tagBoosts[tagName] || 1.0;

  const isIgnored = config.ignoreClasses.test(node.className || "") || config.ignoreClasses.test(node.id || "");
  if (isIgnored) {
    return 0;
  }

  const weightedLength = textLength * 0.1 * boost;
  const density = textLength / (node.children.length + 1);
  const paragraphCount = node.querySelectorAll("p").length;
  return weightedLength + density * 0.5 + paragraphCount * 5;
}
|
|
24
|
+
/**
 * Strip non-content elements from a subtree, in place.
 *
 * Removes script/style/noscript/iframe/object/embed elements, then any
 * element whose class contains "ads", "advertisement", "sponsor" or "promo",
 * or whose id contains "ads" or "advertisement". Matching is a
 * case-insensitive substring test.
 *
 * @param {Element} node - Root of the subtree to clean.
 * @returns {void}
 */
function cleanNode(node) {
  for (const tag of ["script", "style", "noscript", "iframe", "object", "embed"]) {
    for (const el of node.querySelectorAll(tag)) {
      el.remove();
    }
  }

  const adClassPattern = /ads|advertisement|sponsor|promo/i;
  const adIdPattern = /ads|advertisement/i;
  for (const el of node.querySelectorAll("*")) {
    const looksLikeAd = adClassPattern.test(el.className || "") || adIdPattern.test(el.id || "");
    if (looksLikeAd) {
      el.remove();
    }
  }
}
|
|
37
|
+
/**
 * Collect the visible text of a subtree as a single line.
 *
 * Text nodes are trimmed and joined with spaces; a separator is also emitted
 * before P, DIV, BR and H1-H6 elements. All whitespace runs (including those
 * separators) are finally collapsed to one space, so the result contains no
 * line breaks.
 *
 * @param {Node} node - Root of the subtree.
 * @returns {string} Whitespace-normalised text.
 */
function extractTextContent(node) {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;
  const blockTags = ["P", "DIV", "BR", "H1", "H2", "H3", "H4", "H5", "H6"];
  const pieces = [];

  const visit = (current) => {
    if (current.nodeType === TEXT_NODE) {
      const trimmed = current.textContent?.trim();
      if (trimmed) {
        pieces.push(trimmed + " ");
      }
      return;
    }
    if (current.nodeType !== ELEMENT_NODE) {
      return;
    }
    if (blockTags.includes(current.tagName)) {
      pieces.push("\n");
    }
    current.childNodes.forEach(child => visit(child));
  };

  visit(node);
  return pieces.join("").replace(/\s+/g, " ").trim();
}
|
|
56
|
+
/**
 * Extract the main content of an HTML page with a scoring heuristic.
 *
 * Every article/section/main/div/[role='main'] element is scored with
 * `scoreNode`; the first element with the highest positive score is cleaned
 * and returned. When nothing scores above zero, the whole <body> is used.
 *
 * @param {string} html - Page markup.
 * @param {string} url - Page URL, handed to JSDOM as the document URL.
 * @param {object} [config] - Scoring configuration passed to `scoreNode`.
 * @returns {{title: string, textContent: string, content: string, length: number} | null}
 *   Extraction result, or null when there is no body or an error occurred
 *   (the error is logged).
 */
export function extractWithReadabilityAlt(html, url, config = DEFAULT_CONFIG) {
  try {
    const { document: doc } = new JSDOM(html, { url }).window;
    const title = doc.title || "";

    // Clean a root element in place and package it as a result.
    const describe = (root) => {
      cleanNode(root);
      const textContent = extractTextContent(root);
      return {
        title,
        textContent,
        content: root.innerHTML,
        length: textContent.length
      };
    };

    let winner = null;
    let topScore = 0;
    const selector = "article, section, main, div, [role='main']";
    for (const element of Array.from(doc.querySelectorAll(selector))) {
      const score = scoreNode(element, config);
      if (score > topScore) {
        topScore = score;
        winner = element;
      }
    }

    // No candidate (or none with a positive score): fall back to the body.
    const root = winner ?? doc.body;
    if (!root) {
      return null;
    }
    return describe(root);
  } catch (error) {
    console.error("Content extraction failed:", error);
    return null;
  }
}
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
// Character budget for each truncation mode; "full" means no limit.
const MODE_LIMITS = { compact: 3000, standard: 8000, full: Infinity };
// Keyword list for this module; its consumers are not visible in this chunk.
const KEYWORDS = ["summary", "conclusion", "important", "overview", "introduction", "key", "main", "abstract"];
|
|
16
|
+
/**
 * Truncate content to the budget implied by `options`.
 *
 * The limit is `options.max_length` when it is a positive number, otherwise
 * the budget of `options.mode` ("compact" | "standard" | "full"), otherwise
 * the "standard" budget.
 *
 * @param {string} content - Text or Markdown to truncate.
 * @param {string} format - "markdown" selects Markdown-aware truncation; any
 *   other value is treated as plain text.
 * @param {{mode?: string, max_length?: number}} [options]
 * @returns {{content: string, truncated: boolean, original_length: number, final_length: number}}
 *   The (possibly shortened) content and bookkeeping; truncating helpers may add more fields.
 */
export function applySmartTruncation(content, format, options) {
  const mode = options?.mode || "standard";
  // Look the mode up as an own key only: names such as "constructor" would
  // otherwise resolve to inherited Object members and be used as the limit.
  const modeLimit = Object.hasOwn(MODE_LIMITS, mode) ? MODE_LIMITS[mode] : MODE_LIMITS.standard;
  // Ignore missing, zero, negative or non-numeric max_length values.
  const requested = Number(options?.max_length);
  const maxLength = requested > 0 ? requested : modeLimit;
  if (maxLength === Infinity || content.length <= maxLength) {
    return {
      content,
      truncated: false,
      original_length: content.length,
      final_length: content.length
    };
  }
  if (format === "markdown") {
    return truncateMarkdown(content, maxLength);
  }
  else {
    return truncateText(content, maxLength);
  }
}
|
|
34
|
+
/**
 * Truncates Markdown by splitting it into chunks, scoring them, and keeping
 * the best-scoring chunks that fit into `maxLength`.
 *
 * Falls back to balancedTruncate when the document yields no chunks or when
 * no chunk fits the budget, so the result is never empty for non-empty input.
 *
 * @param {string} content - Markdown source.
 * @param {number} maxLength - Character budget.
 * @returns {object} `content`, `truncated` (always true), `original_length`,
 *   `final_length`, and - when chunks were parsed - `chunks_selected` and `chunks_total`.
 */
export function truncateMarkdown(content, maxLength) {
    const chunks = parseMarkdownChunks(content);
    if (chunks.length === 0) {
        const fallback = balancedTruncate(content, maxLength);
        return {
            content: fallback,
            truncated: true,
            original_length: content.length,
            // Report the length of what is actually returned.
            final_length: fallback.length
        };
    }
    chunks.forEach((chunk, idx) => {
        // The position must be set before scoring: scoreChunk reads
        // chunk.position to award the start/end-of-document bonus.
        chunk.position = idx;
        chunk.score = scoreChunk(chunk, chunks.length);
    });
    const selected = selectChunks(chunks, maxLength);
    const assembled = selected.length > 0
        ? assembleChunks(selected)
        : balancedTruncate(content, maxLength);
    return {
        content: assembled,
        truncated: true,
        original_length: content.length,
        final_length: assembled.length,
        chunks_selected: selected.length,
        chunks_total: chunks.length
    };
}
|
|
59
|
+
/**
 * Truncates plain text by splitting it into sentences, scoring them, and
 * keeping the best-scoring sentences that fit into `maxLength`.
 *
 * Falls back to balancedTruncate when no sentences are found or when no
 * sentence fits the budget, so the result is never empty for non-empty input.
 *
 * @param {string} content - Plain text.
 * @param {number} maxLength - Character budget.
 * @returns {object} `content`, `truncated` (always true), `original_length`,
 *   `final_length`, and - when sentences were parsed - `chunks_selected` and `chunks_total`.
 */
export function truncateText(content, maxLength) {
    const chunks = parseSentences(content);
    if (chunks.length === 0) {
        const fallback = balancedTruncate(content, maxLength);
        return {
            content: fallback,
            truncated: true,
            original_length: content.length,
            // Report the length of what is actually returned.
            final_length: fallback.length
        };
    }
    chunks.forEach((chunk, idx) => {
        // The position must be set before scoring: scoreChunk reads
        // chunk.position to award the start/end-of-document bonus.
        chunk.position = idx;
        chunk.score = scoreChunk(chunk, chunks.length);
    });
    const selected = selectChunks(chunks, maxLength);
    const assembled = selected.length > 0
        ? assembleChunks(selected)
        : balancedTruncate(content, maxLength);
    return {
        content: assembled,
        truncated: true,
        original_length: content.length,
        final_length: assembled.length,
        chunks_selected: selected.length,
        chunks_total: chunks.length
    };
}
|
|
84
|
+
/**
 * Splits Markdown into typed chunks: "heading", "code", "list" or "paragraph".
 *
 * - A line starting with ``` toggles a fenced code block; the fence lines are
 *   part of the code chunk and everything inside is kept verbatim.
 * - Each ATX heading line (`#` .. `######` followed by whitespace) is its own chunk.
 * - Consecutive list-item lines form one list chunk; a blank line or a
 *   non-list line ends it.
 * - All other prose up to the next heading/list/fence forms one paragraph
 *   chunk. Blank lines inside that prose are preserved (collapsed to a single
 *   blank line) so that separate paragraphs are not fused together when the
 *   chunk is emitted again.
 *
 * @param {string} markdown - Markdown source.
 * @returns {Array<{content: string, type: string, position: number, score: number, length: number}>}
 *   Chunks in document order; `position` and `score` are initialised to 0.
 */
export function parseMarkdownChunks(markdown) {
    const chunks = [];
    const lines = markdown.split("\n");
    let currentChunk = [];
    let currentType = "paragraph";
    let inCodeBlock = false;
    // Emits the buffered lines as one chunk (if they contain anything) and resets the buffer.
    const flushChunk = () => {
        if (currentChunk.length > 0) {
            const content = currentChunk.join("\n").trim();
            if (content) {
                chunks.push({
                    content,
                    type: currentType,
                    position: 0,
                    score: 0,
                    length: content.length
                });
            }
            currentChunk = [];
        }
    };
    for (const line of lines) {
        if (line.startsWith("```")) {
            if (inCodeBlock) {
                // Closing fence: it belongs to the code chunk.
                currentChunk.push(line);
                flushChunk();
                inCodeBlock = false;
                currentType = "paragraph";
            }
            else {
                // Opening fence: finish whatever came before, then start a code chunk.
                flushChunk();
                inCodeBlock = true;
                currentType = "code";
                currentChunk.push(line);
            }
            continue;
        }
        if (inCodeBlock) {
            currentChunk.push(line);
            continue;
        }
        if (line.match(/^#{1,6}\s/)) {
            flushChunk();
            currentType = "heading";
            currentChunk.push(line);
            flushChunk();
            currentType = "paragraph";
        }
        else if (line.match(/^[\s]*[-*+]\s/) || line.match(/^[\s]*\d+\.\s/)) {
            if (currentType !== "list") {
                flushChunk();
                currentType = "list";
            }
            currentChunk.push(line);
        }
        else if (line.trim() === "") {
            if (currentType === "list") {
                flushChunk();
                currentType = "paragraph";
            }
            else if (currentChunk.length > 0 && currentChunk[currentChunk.length - 1] !== "") {
                // Keep one blank line as the paragraph separator; leading and
                // trailing blanks are removed by the trim() in flushChunk.
                currentChunk.push("");
            }
        }
        else {
            if (currentType === "list") {
                flushChunk();
                currentType = "paragraph";
            }
            currentChunk.push(line);
        }
    }
    flushChunk();
    return chunks;
}
|
|
156
|
+
/**
 * Splits plain text into sentence chunks.
 *
 * A sentence is a run of non-terminator characters followed by one or more
 * of `.`, `!`, `?`. Text that follows the last terminated sentence is kept as
 * a final chunk instead of being dropped.
 *
 * When the text contains no terminated sentence at all, an empty array is
 * returned so callers can fall back to character-based truncation.
 *
 * @param {string} text - Plain text.
 * @returns {Array<{content: string, type: string, position: number, score: number, length: number}>}
 *   Sentences in document order; `position` and `score` are initialised to 0.
 */
export function parseSentences(text) {
    const pieces = [];
    let consumedUpTo = 0;
    for (const match of text.matchAll(/[^.!?]+[.!?]+/g)) {
        pieces.push(match[0]);
        consumedUpTo = match.index + match[0].length;
    }
    if (pieces.length > 0) {
        // Keep the unterminated tail (e.g. a final line without a period).
        const tail = text.slice(consumedUpTo);
        if (tail.trim() !== "") {
            pieces.push(tail);
        }
    }
    return pieces.map((piece) => {
        const content = piece.trim();
        return {
            content,
            type: "text",
            position: 0,
            score: 0,
            length: content.length
        };
    });
}
|
|
166
|
+
/**
 * Computes a relevance score for a chunk from its type, relative position,
 * size and keyword hits.
 *
 * @param {{type: string, position: number, length: number, content: string}} chunk - Chunk to score.
 * @param {number} totalChunks - Number of chunks in the document.
 * @returns {number} The score; it can be negative for very short chunks.
 */
export function scoreChunk(chunk, totalChunks) {
    const size = chunk.length;
    let total = 0;
    // Structural bonus by chunk type.
    switch (chunk.type) {
        case "heading":
            total += 20;
            break;
        case "code":
            total += 10;
            break;
        case "list":
            total += 5;
            break;
        default:
            break;
    }
    // Chunks in the first or last 15% of the document get a bonus.
    const relative = chunk.position / Math.max(totalChunks - 1, 1);
    if (relative <= 0.15) {
        total += 15;
    }
    else if (relative >= 0.85) {
        total += 12;
    }
    // Size bonus: the second branch is only reachable for sizes above 1000,
    // because 100..1000 is handled by the first branch.
    const isMidSized = size >= 100 && size <= 1000;
    if (isMidSized) {
        total += 5;
    }
    else if (size > 300) {
        total += 3;
    }
    // Five points for every keyword that appears as a substring.
    const haystack = chunk.content.toLowerCase();
    const hits = KEYWORDS.filter((term) => haystack.includes(term)).length;
    total += hits * 5;
    // Penalise fragments shorter than 50 characters.
    if (size < 50) {
        total -= 10;
    }
    return total;
}
|
|
201
|
+
/**
 * Greedily picks chunks that fit into `maxLength`.
 *
 * The first heading (if any) is always kept. The remaining chunks are visited
 * in descending score order and added whenever they still fit; every chunk is
 * charged its length plus 5 characters of overhead. The result is returned in
 * document order (ascending `position`).
 *
 * @param {Array<{type: string, length: number, score: number, position: number}>} chunks - Scored chunks.
 * @param {number} maxLength - Character budget.
 * @returns {Array} The selected chunks, sorted by position.
 */
export function selectChunks(chunks, maxLength) {
    const picked = [];
    let used = 0;
    const leadHeading = chunks.find((candidate) => candidate.type === "heading");
    if (leadHeading) {
        picked.push(leadHeading);
        used = leadHeading.length + 5;
    }
    const byScore = chunks
        .filter((candidate) => !picked.includes(candidate))
        .sort((left, right) => right.score - left.score);
    byScore.forEach((candidate) => {
        const cost = candidate.length + 5;
        if (used + cost > maxLength) {
            return;
        }
        picked.push(candidate);
        used += cost;
    });
    picked.sort((left, right) => left.position - right.position);
    return picked;
}
|
|
221
|
+
/**
 * Joins chunks with blank lines, inserting a "[...]" marker wherever two
 * neighbouring chunks are not adjacent in the original document.
 *
 * @param {Array<{content: string, position: number}>} chunks - Chunks in output order.
 * @returns {string} The assembled text; "" for an empty list.
 */
export function assembleChunks(chunks) {
    if (chunks.length === 0) {
        return "";
    }
    const pieces = chunks.flatMap((chunk, index) => {
        const previous = index === 0 ? -1 : chunks[index - 1].position;
        const skipped = previous >= 0 && chunk.position > previous + 1;
        return skipped ? ["[...]", chunk.content] : [chunk.content];
    });
    return pieces.join("\n\n");
}
|
|
235
|
+
/**
 * Shortens `text` to at most `maxLength` characters by keeping an excerpt
 * from the start, the middle and the end, joined by "[...]" markers.
 *
 * The two markers are paid for out of the budget, so the result never exceeds
 * `maxLength`. When the budget is too small to hold the markers plus a
 * meaningful amount of text, the text is simply cut at `maxLength`.
 *
 * @param {string} text - Text to shorten.
 * @param {number} maxLength - Maximum length of the returned string.
 * @returns {string} `text` unchanged when it already fits, otherwise the shortened form.
 */
export function balancedTruncate(text, maxLength) {
    if (text.length <= maxLength) {
        return text;
    }
    const separator = "\n[...]\n";
    // Characters left for actual content after reserving both separators.
    const budget = Math.floor(maxLength) - 2 * separator.length;
    if (budget < 10) {
        // Too small for a three-part excerpt; a plain prefix is the best we can do.
        return text.slice(0, Math.max(0, Math.floor(maxLength)));
    }
    const startLen = Math.floor(budget * 0.4);
    const middleLen = Math.floor(budget * 0.3);
    // The remainder goes to the tail; it is always positive because budget >= 10.
    const endLen = budget - startLen - middleLen;
    const start = text.slice(0, startLen);
    const middleStart = Math.floor((text.length - middleLen) / 2);
    const middle = text.slice(middleStart, middleStart + middleLen);
    const end = text.slice(text.length - endLen);
    return `${start}${separator}${middle}${separator}${end}`;
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
import { runTwoTierSearch } from "./engines.js";
|
|
5
|
+
import { fetchAndExtract } from "./extract.js";
|
|
6
|
+
/**
 * Parses `v` as a positive integer.
 *
 * The value is floored before the positivity check, so fractional input below
 * 1 (for example "0.5") yields the default rather than 0.
 *
 * @param {*} v - Value to parse (typically an environment variable string).
 * @param {number} def - Value returned when `v` is not a finite number >= 1.
 * @returns {number} The floored value, or `def`.
 */
const toInt = (v, def) => {
    const n = Math.floor(Number(v));
    return Number.isFinite(n) && n > 0 ? n : def;
};
|
|
10
|
+
// Default number of search results; overridable through the MAX_RESULTS environment variable.
const DEFAULT_LIMIT = toInt(process.env.MAX_RESULTS, 10);
const server = new McpServer({ name: "mcp-web-search", version: "1.0.0" });
// Tool "search_web": runs runTwoTierSearch and returns the result as pretty-printed JSON text.
server.registerTool("search_web", {
    title: "Web Search (Fast: DuckDuckGo, Deep: Puppeteer/Bing)",
    description: "Two-tier web search: runs fast DuckDuckGo HTML search by default, escalates to Puppeteer/Bing if results are insufficient. No API keys required.",
    inputSchema: {
        q: z.string(),
        limit: z.number().int().min(1).max(50).default(DEFAULT_LIMIT).optional(),
        lang: z.string().default("en").optional(),
        mode: z.enum(["fast", "deep", "auto"]).default("auto").optional()
    }
}, async ({ q, limit = DEFAULT_LIMIT, lang = "en", mode = "auto" }) => {
    // Clamp once and use the same bound for the search request and for the
    // slice, so an out-of-range default cannot yield a mismatched result count.
    const boundedLimit = Math.min(Math.max(1, limit), 50);
    const res = await runTwoTierSearch({ q, limit: boundedLimit, lang, mode });
    const payload = { ...res, items: res.items.slice(0, boundedLimit) };
    return { content: [{ type: "text", text: JSON.stringify(payload, null, 2) }] };
});
|
|
26
|
+
// Tool "fetch_url": forwards the URL and truncation/format options to
// fetchAndExtract and returns the extracted document as pretty-printed JSON text.
server.registerTool("fetch_url", {
    title: "Fetch and Extract URL Content",
    description: "Fetches content from a URL (HTML/PDF) and extracts readable text. Supports truncation modes: compact (~3000 chars), standard (~8000 chars, default), full (no truncation). Output formats: markdown (default), text, html.",
    inputSchema: {
        url: z.string().url(),
        mode: z.enum(["compact", "standard", "full"]).optional(),
        max_length: z.number().int().min(1000).max(100000).optional(),
        format: z.enum(["markdown", "text", "html"]).optional()
    }
}, async (args) => {
    const { url, mode, max_length, format } = args;
    const extracted = await fetchAndExtract(url, { mode, max_length, format });
    const text = JSON.stringify(extracted, null, 2);
    return { content: [{ type: "text", text }] };
});
|
|
39
|
+
// Tool "summarize_url": extracts the page, then asks for a summary through
// server.server.createMessage. If that request fails, the first 2000
// characters of the extracted content are returned instead.
server.registerTool("summarize_url", {
    title: "Summarize URL Content",
    description: "Fetches content from a URL and generates a concise summary.",
    inputSchema: { url: z.string().url() }
}, async ({ url }) => {
    // Fetched outside the try block on purpose: an extraction failure must
    // propagate, because without a document there is nothing to fall back to.
    const doc = await fetchAndExtract(url);
    try {
        const content = doc.markdown || doc.text || "";
        const prompt = `Provide a concise summary (<=10 sentences) of the following content:\n\nTitle: ${doc.title || "(none)"}\nURL: ${doc.url}\n\n--- Content ---\n${content.slice(0, 12000)}`;
        const resp = await server.server.createMessage({
            messages: [{ role: "user", content: { type: "text", text: prompt } }],
            maxTokens: 800
        });
        const text = resp.content && resp.content.type === "text"
            ? resp.content.text
            : "(unable to generate summary)";
        return { content: [{ type: "text", text }] };
    }
    catch (error) {
        // Best-effort fallback, but leave a trace on stderr (stdout carries the
        // stdio transport) so the failure is diagnosable.
        console.error("summarize_url: summary request failed, returning raw excerpt:", error);
        const fallback = (doc.markdown || doc.text || "").slice(0, 2000);
        return { content: [{ type: "text", text: fallback || "(no content to summarize)" }] };
    }
});
|
|
62
|
+
// Tool "wiki_get": loads ./wikipedia.js on first use and returns the summary
// for one title as pretty-printed JSON text.
server.registerTool("wiki_get", {
    title: "Wikipedia: Get Summary",
    description: "Retrieves a Wikipedia summary for a given title. Supports multiple languages (default: en).",
    inputSchema: { title: z.string(), lang: z.string().default("en").optional() }
}, async ({ title, lang = "en" }) => {
    const wikipedia = await import("./wikipedia.js");
    const result = await wikipedia.wikiGetSummary(title, lang);
    const text = JSON.stringify(result, null, 2);
    return { content: [{ type: "text", text }] };
});
|
|
71
|
+
// Tool "wiki_multi": loads ./wikipedia.js on first use and returns the
// summaries for one term in several languages as pretty-printed JSON text.
server.registerTool("wiki_multi", {
    title: "Wikipedia: Multi-Language Summary",
    description: "Retrieves Wikipedia summaries in multiple languages for a given term. Uses langlinks to map titles accurately across languages.",
    inputSchema: {
        term: z.string(),
        baseLang: z.string().default("en").optional(),
        langs: z.array(z.string()).default(["en"]).optional()
    }
}, async ({ term, baseLang = "en", langs = ["en"] }) => {
    const wikipedia = await import("./wikipedia.js");
    const result = await wikipedia.wikiGetMultiSummary(term, baseLang, langs);
    const text = JSON.stringify(result, null, 2);
    return { content: [{ type: "text", text }] };
});
|
|
84
|
+
/**
 * Connects the server to a stdio transport and announces readiness on stderr.
 */
async function main() {
    await server.connect(new StdioServerTransport());
    console.error("mcp-web-search ready (stdio)...");
}
// A startup failure is logged and the process exits with status 1.
main().catch((startupError) => {
    console.error(startupError);
    process.exit(1);
});
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
 * Builds the request headers sent with Wikipedia calls.
 *
 * @param {string} [lang] - Language code; defaults to the LANG_DEFAULT environment variable, then "en".
 * @returns {{"User-Agent": string, "Accept-Language": string}} Header map. The
 *   user agent comes from the USER_AGENT environment variable when it is set and non-empty.
 */
function uaHeaders(lang = process.env.LANG_DEFAULT || "en") {
    const headers = {
        "User-Agent": process.env.USER_AGENT || "mcp-web-search/1.0",
        "Accept-Language": "en-US,en;q=0.9"
    };
    if (lang !== "en") {
        // Prefer the requested language, with English as the fallback.
        headers["Accept-Language"] = `${lang};q=0.9,en;q=0.8`;
    }
    return headers;
}
|
|
6
|
+
/**
 * Parses a timeout setting.
 *
 * @param {*} env - Raw value, typically an environment variable string.
 * @param {number} def - Fallback used when `env` is not a finite number greater than 0.
 * @returns {number} The parsed number (not rounded), or `def`.
 */
function toMs(env, def) {
    const parsed = Number(env);
    if (!Number.isFinite(parsed) || parsed <= 0) {
        return def;
    }
    return parsed;
}
|
|
10
|
+
/**
 * Calls fetch with an abort signal that fires after `timeoutMs` milliseconds.
 *
 * The timer is cleared as soon as the fetch promise settles, whether it
 * resolved or rejected. Any `signal` in `init` is replaced by the timeout signal.
 *
 * @param {*} input - First argument passed to fetch.
 * @param {object} [init={}] - Fetch options.
 * @param {number} [timeoutMs=15000] - Delay before the request is aborted.
 * @returns {Promise<Response>} The fetch response.
 */
async function fetchWithTimeout(input, init = {}, timeoutMs = 15000) {
    const aborter = new AbortController();
    const timer = setTimeout(() => {
        aborter.abort();
    }, timeoutMs);
    const request = { ...init, signal: aborter.signal };
    try {
        const response = await fetch(input, request);
        return response;
    }
    finally {
        clearTimeout(timer);
    }
}
|
|
20
|
+
/**
 * Fetches the summary of a Wikipedia page from the REST `page/summary` endpoint.
 *
 * Network failures and non-OK responses are treated as "no summary": the
 * function then resolves with only `lang`, `title` and a best-guess page URL.
 *
 * @param {string} title - Page title.
 * @param {string} [lang="en"] - Wikipedia language code; it becomes the host
 *   name prefix, so it must consist of letters, digits and single hyphens only.
 * @returns {Promise<object>} `{lang, title, url}` plus `description`, `extract`
 *   and `thumbnailUrl` when the summary could be loaded.
 * @throws {Error} When `lang` is not a syntactically valid language code.
 */
export async function wikiGetSummary(title, lang = "en") {
    // `lang` is caller-supplied and ends up in the host name; reject anything
    // that could change the target host (dots, slashes, "#", "@", whitespace ...).
    if (typeof lang !== "string" || !/^[a-z0-9]+(?:-[a-z0-9]+)*$/i.test(lang)) {
        throw new Error(`Invalid Wikipedia language code: ${JSON.stringify(lang)}`);
    }
    const base = `https://${lang}.wikipedia.org`;
    // Minimal answer used whenever the summary cannot be retrieved.
    const fallback = { lang, title, url: `${base}/wiki/${encodeURIComponent(title)}` };
    const sumUrl = new URL(`${base}/api/rest_v1/page/summary/${encodeURIComponent(title)}`);
    try {
        const sres = await fetchWithTimeout(sumUrl, { headers: uaHeaders(lang) }, toMs(process.env.HTTP_TIMEOUT, 15000));
        if (!sres.ok) {
            return fallback;
        }
        const s = (await sres.json());
        return {
            lang,
            title: s.title ?? title,
            url: s.content_urls?.desktop?.page ?? fallback.url,
            description: s.description,
            extract: s.extract,
            thumbnailUrl: s.thumbnail?.source
        };
    }
    catch {
        // Deliberately best-effort: any transport or parsing error degrades to the fallback.
        return fallback;
    }
}
|
|
42
|
+
/**
 * Looks up the inter-language links of a page through the MediaWiki Action
 * API (`action=query&prop=langlinks`).
 *
 * Best-effort: an invalid language code, a failed request, a non-OK response
 * or an unexpected payload all yield an empty map.
 *
 * @param {string} baseTitle - Page title in the base language.
 * @param {string} baseLang - Language code of the wiki to query; it becomes the host name prefix.
 * @returns {Promise<Object<string, string>>} Map from language code to the page title in that language.
 */
async function wikiGetLanglinks(baseTitle, baseLang) {
    // `baseLang` ends up in the host name; refuse anything that is not a plain
    // language code instead of sending the request to an arbitrary host.
    if (typeof baseLang !== "string" || !/^[a-z0-9]+(?:-[a-z0-9]+)*$/i.test(baseLang)) {
        return {};
    }
    const url = new URL(`https://${baseLang}.wikipedia.org/w/api.php`);
    url.searchParams.set("action", "query");
    url.searchParams.set("titles", baseTitle);
    url.searchParams.set("prop", "langlinks");
    url.searchParams.set("lllimit", "max");
    url.searchParams.set("format", "json");
    try {
        const res = await fetchWithTimeout(url, { headers: uaHeaders(baseLang) }, toMs(process.env.HTTP_TIMEOUT, 15000));
        if (!res.ok) {
            return {};
        }
        const data = (await res.json());
        const pages = data?.query?.pages;
        // Only one title was requested, so only the first page entry is inspected.
        const first = pages && Object.values(pages)[0];
        const ll = first?.langlinks || [];
        const map = {};
        for (const item of ll) {
            // In this response format the linked title is stored under the "*" key.
            map[item.lang] = item["*"];
        }
        return map;
    }
    catch {
        return {};
    }
}
|
|
67
|
+
/**
 * Fetches Wikipedia summaries for one term in several languages.
 *
 * The base-language summary is loaded first; its title is used to look up
 * inter-language links. For each other requested language the linked title
 * is used when available ("langlinks"), otherwise the original term is tried
 * directly ("direct"). The other languages are fetched concurrently.
 *
 * Language codes, including `baseLang`, are trimmed and lower-cased, so
 * "EN" and "en" refer to the same entry and are requested only once.
 *
 * @param {string} term - Search term / page title in the base language.
 * @param {string} [baseLang="en"] - Language used to resolve the term.
 * @param {string[]} [langs=["en"]] - Languages to return in addition to the base language.
 * @returns {Promise<{baseLang: string, base: object, items: object, resolved: object}>}
 *   `items` maps language code to summary (or null when the lookup threw);
 *   `resolved` maps language code to `{title, source}`. Keys appear with the
 *   base language first, then in the order of `langs`.
 */
export async function wikiGetMultiSummary(term, baseLang = "en", langs = ["en"]) {
    const normalize = (code) => String(code).trim().toLowerCase();
    const baseCode = normalize(baseLang);
    const otherCodes = [...new Set(langs.map(normalize).filter(Boolean))]
        .filter((code) => code !== baseCode);
    const base = await wikiGetSummary(term, baseCode);
    const langlinks = await wikiGetLanglinks(base.title, baseCode);
    const lookups = otherCodes.map(async (code) => {
        const linkedTitle = langlinks[code];
        const title = linkedTitle || term;
        const source = linkedTitle ? "langlinks" : "direct";
        try {
            const summary = await wikiGetSummary(title, code);
            return { code, item: summary, resolution: { title: summary.title, source } };
        }
        catch {
            // Best-effort per language: one failing lookup must not fail the whole call.
            return { code, item: null, resolution: { title, source: "none" } };
        }
    });
    const outcomes = await Promise.all(lookups);
    const items = { [baseCode]: base };
    const resolved = { [baseCode]: { title: base.title, source: "base" } };
    // Filled after all lookups settle so key order follows the request order
    // rather than the order in which responses happened to arrive.
    for (const outcome of outcomes) {
        items[outcome.code] = outcome.item;
        resolved[outcome.code] = outcome.resolution;
    }
    return { baseLang: baseCode, base, items, resolved };
}
|
package/package.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@zhafron/mcp-web-search",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "MCP server: DuckDuckGo HTML search, Wikipedia summaries, and URL content extraction — no API keys required.",
|
|
6
|
+
"main": "dist/src/server.js",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"build": "tsc -p .",
|
|
9
|
+
"start": "node dist/src/server.js",
|
|
10
|
+
"dev": "tsx src/server.ts",
|
|
11
|
+
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
|
|
12
|
+
"test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
|
|
13
|
+
"format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"",
|
|
14
|
+
"format:check": "prettier --check \"src/**/*.ts\" \"test/**/*.ts\""
|
|
15
|
+
},
|
|
16
|
+
"keywords": [
|
|
17
|
+
"mcp",
|
|
18
|
+
"modelcontextprotocol",
|
|
19
|
+
"web",
|
|
20
|
+
"search",
|
|
21
|
+
"extraction",
|
|
22
|
+
"html",
|
|
23
|
+
"pdf",
|
|
24
|
+
"readability",
|
|
25
|
+
"turndown",
|
|
26
|
+
"ai"
|
|
27
|
+
],
|
|
28
|
+
"author": "tickernelz",
|
|
29
|
+
"license": "MIT",
|
|
30
|
+
"repository": {
|
|
31
|
+
"type": "git",
|
|
32
|
+
"url": "git+https://github.com/tickernelz/mcp-web-search.git"
|
|
33
|
+
},
|
|
34
|
+
"publishConfig": {
|
|
35
|
+
"access": "public"
|
|
36
|
+
},
|
|
37
|
+
"files": [
|
|
38
|
+
"dist",
|
|
39
|
+
"package.json",
|
|
40
|
+
"README.md",
|
|
41
|
+
"LICENSE"
|
|
42
|
+
],
|
|
43
|
+
"dependencies": {
|
|
44
|
+
"@modelcontextprotocol/sdk": "^1.17.0",
|
|
45
|
+
"@mozilla/readability": "^0.6.0",
|
|
46
|
+
"jsdom": "^24.1.0",
|
|
47
|
+
"pdf-parse": "^1.1.1",
|
|
48
|
+
"puppeteer-core": "^23.11.1",
|
|
49
|
+
"turndown": "^7.2.2",
|
|
50
|
+
"zod": "^3.23.8"
|
|
51
|
+
},
|
|
52
|
+
"devDependencies": {
|
|
53
|
+
"@jest/globals": "^29.7.0",
|
|
54
|
+
"@types/jest": "^29.5.14",
|
|
55
|
+
"@types/jsdom": "^21.1.7",
|
|
56
|
+
"@types/node": "^20.11.30",
|
|
57
|
+
"@types/turndown": "^5.0.6",
|
|
58
|
+
"jest": "^29.7.0",
|
|
59
|
+
"prettier": "^3.8.0",
|
|
60
|
+
"ts-jest": "^29.2.5",
|
|
61
|
+
"tsx": "^4.19.0",
|
|
62
|
+
"typescript": "^5.5.4"
|
|
63
|
+
}
|
|
64
|
+
}
|