lean-reader 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +55 -0
- package/lib/core.js +352 -0
- package/package.json +45 -0
- package/src/server.js +41 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 AIMento
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Lean Reader
|
|
2
|
+
|
|
3
|
+
Turn any URL into **token-minimized clean text for LLMs**, with a token-savings receipt on every call. MCP server + library.
|
|
4
|
+
|
|
5
|
+
LLMs don't need your nav bar, your cookie banner, your `<script>` tags, or 200 KB of inlined SVG — but raw page HTML makes them pay for all of it. Lean Reader strips a page down to the article and tells you exactly how many tokens (and dollars) you just saved.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
231,276 → 15,735 tokens (93% saved · 14.7× vs raw HTML · ~$0.54 on gpt-4o) · cleaned by lean reader
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Use as an MCP server
|
|
12
|
+
|
|
13
|
+
Add to your client's MCP config (Claude Desktop/Code, Cursor, …):
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"mcpServers": {
|
|
18
|
+
"lean-reader": { "command": "npx", "args": ["-y", "lean-reader"] }
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Then the `lean_read(url, format?)` tool returns clean text plus the receipt.
|
|
24
|
+
|
|
25
|
+
## Use as a library
|
|
26
|
+
|
|
27
|
+
```js
|
|
28
|
+
import { leanRead } from 'lean-reader/lib/core.js';
|
|
29
|
+
|
|
30
|
+
const r = await leanRead('https://example.com/article', { format: 'markdown' });
|
|
31
|
+
console.log(r.content); // token-minimized text
|
|
32
|
+
console.log(r.receipt); // { beforeTokens, afterTokens, savedPct, ratio, estCostSavedUsd, ... }
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## How much does it save?
|
|
36
|
+
|
|
37
|
+
Measured, not marketed — the [open benchmark](https://github.com/AIMento/lean-reader-bench) ships the corpus, the tokenizer, and every raw output, and flags the cases where Lean Reader **loses**:
|
|
38
|
+
|
|
39
|
+
- **~32% fewer tokens than Mozilla Readability** (the standard extractor) at the median, while keeping ~99% of the body text. Be honest about where that edge comes from: it's the `minimize` post-pass (link/image/footnote/whitespace strip), not smarter extraction — run both through `minimize` and they're roughly par. Lean actually runs Readability as one of its two extractors (see Honest limits), so it doesn't lose to it.
|
|
40
|
+
- Versus **raw page HTML** the multiple is much larger (median ~15×, 100×+ on script-heavy docs) — but that's HTML nobody feeds an LLM, so read it as "don't dump raw pages," not as a competitive claim.
|
|
41
|
+
- Versus **Jina Reader** (measured, anonymous tier): ~1.6× fewer tokens on a like-for-like body, ~4.8× if you count the nav and reference dumps Jina also returns. Firecrawl is not yet measured (needs an API key).
|
|
42
|
+
|
|
43
|
+
The receipt uses the `o200k_base` tokenizer (GPT-4o/4.1 class); the model and tokenizer are always shown, and counts are vs the raw page HTML so you can check the math.
|
|
44
|
+
|
|
45
|
+
## Honest limits
|
|
46
|
+
|
|
47
|
+
- **Static HTML only (v1).** Pages whose body is client-rendered (some SPAs, GitHub repo landing pages) return little — Lean Reader flags `partial` instead of emitting empty text. Jina/Firecrawl render JS and will beat us there.
|
|
48
|
+
- **Two extractors, body-max selection.** Defuddle and Mozilla Readability each silently drop the body on *different* pages (Defuddle on some large Wikipedia articles, Readability on some docs/SPAs). Lean runs both and keeps whichever recovers more body, so neither's blind spot becomes a silent content drop. A real ROUGE-L ground-truth fidelity pass is still the next step (see the bench repo).
|
|
49
|
+
- Token counts are `o200k_base`; Claude/Gemini tokenize differently.
|
|
50
|
+
|
|
51
|
+
## Open-core
|
|
52
|
+
|
|
53
|
+
The extraction + token-minimization core (`lib/`) and the MCP server (`src/`) are MIT. Hosted service, sharing UI, and metering are separate.
|
|
54
|
+
|
|
55
|
+
MIT © 2026 AIMento
|
package/lib/core.js
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
// Lean Reader core — single source of truth shared by the MCP server and the Vercel API routes (open-core, MIT).
|
|
2
|
+
// Pipeline: SSRF-safe fetch (size-capped) → linkedom → [Defuddle ‖ Readability] pick-the-fuller-body → minimize → token receipt.
|
|
3
|
+
// Dual extraction: Defuddle sometimes drops entire body sections on certain pages (large wiki articles), while Readability
|
|
4
|
+
// drops content on other pages (some SPAs/docs). We run both and keep whichever yields more body text, guaranteeing fidelity
|
|
5
|
+
// (honesty: never silently return a worse extraction).
|
|
6
|
+
import { Defuddle } from 'defuddle/node';
|
|
7
|
+
import { parseHTML } from 'linkedom';
|
|
8
|
+
import { Readability } from '@mozilla/readability';
|
|
9
|
+
import TurndownService from 'turndown';
|
|
10
|
+
import { getEncoding } from 'js-tiktoken';
|
|
11
|
+
import { Agent } from 'undici';
|
|
12
|
+
import dns from 'node:dns';
|
|
13
|
+
import net from 'node:net';
|
|
14
|
+
import ipaddr from 'ipaddr.js';
|
|
15
|
+
|
|
16
|
+
// Shared o200k_base encoder, created once at module load and reused by countTokens.
const enc = getEncoding('o200k_base');
|
|
17
|
+
|
|
18
|
+
// Errors follow a message=code convention (web/MCP map the message to a user-facing message).
|
|
19
|
+
const codeErr = (code) => new Error(code);
|
|
20
|
+
|
|
21
|
+
// Honest bot UA. No Chrome spoofing (honesty) — the "compatible;" form is the convention for legitimate bots, and it
|
|
22
|
+
// includes a contact URL.
|
|
23
|
+
const UA = 'Mozilla/5.0 (compatible; LeanReaderBot/0.1; +https://github.com/AIMento/lean-reader)';
|
|
24
|
+
|
|
25
|
+
const MAX_BYTES = 3_000_000; // Hard cap on response body (after decompression) — decompression-bomb defense
|
|
26
|
+
const TOKENIZE_MAX = 400_000; // Upper bound (chars) on input we'll exact-encode
|
|
27
|
+
|
|
28
|
+
// ---- Token counting (DoS guard) -------------------------------------------------
|
|
29
|
+
// js-tiktoken BPE is O(n^2) on low-entropy/repetitive input (10KB of 'A' ≈ 16s, attacker-controllable).
|
|
30
|
+
// Regardless of entropy, only exact-encode inputs known to be safe; fall back to a chars/4 estimate for
|
|
31
|
+
// pathological/oversized input. Real article text always takes the exact path.
|
|
32
|
+
/** Cheap fallback estimate: one token per four UTF-16 code units of `s`, rounded up. */
const estimateTokens = (s) => {
  const approx = s.length / 4;
  return Math.ceil(approx);
};
|
|
33
|
+
|
|
34
|
+
/**
 * Heuristic check used by countTokens to decide whether exact encoding should be skipped.
 *
 * @param {string} s - Text about to be tokenized.
 * @returns {boolean} true when either
 *   - some character is immediately followed by 300 or more repeats of itself
 *     (`.` does not match line terminators, so runs of newlines are not counted), or
 *   - the first 4000 code units are longer than 200 and contain fewer than 12 distinct characters.
 */
function isPathological(s) {
  const longRun = /(.)\1{300,}/;
  if (longRun.test(s)) {
    return true;
  }

  // Only the head is sampled, so the diversity check stays cheap on large inputs.
  const sample = s.slice(0, 4000);
  const distinctChars = new Set(sample).size;
  return sample.length > 200 && distinctChars < 12;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Count tokens in `s` with the o200k_base encoder.
 *
 * Empty or missing input counts as 0. Input longer than TOKENIZE_MAX characters, input flagged by
 * isPathological, and input on which the encoder throws all fall back to estimateTokens instead of
 * an exact count.
 *
 * @param {string} s - Text to measure.
 * @returns {number} Exact token count when the exact path is taken, otherwise the estimate.
 */
export function countTokens(s) {
  if (!s) {
    return 0;
  }

  const skipExact = s.length > TOKENIZE_MAX || isPathological(s);
  if (!skipExact) {
    try {
      return enc.encode(s).length;
    } catch {
      // Encoder failure is not fatal: fall through to the estimate below.
    }
  }
  return estimateTokens(s);
}
|
|
51
|
+
|
|
52
|
+
// Input price in $/1M tokens. The receipt names the reference model to prevent inflated claims.
|
|
53
|
+
/** Reference input prices in USD per one million tokens, keyed by model name (read by leanRead for the receipt). */
export const PRICING = {
  'gpt-4o': 2.5,
  'gpt-4o-mini': 0.15,
  'claude-sonnet': 3.0,
  'claude-haiku': 0.8,
};
|
|
59
|
+
|
|
60
|
+
// ---- SSRF defense: resolve → validate every IP → pin to the validated IP (undici connect.lookup) ----------
|
|
61
|
+
// A string-based blocklist (the old approach) was vulnerable to 172.16/12, IPv6 ULA, decimal/hex IPs, DNS rebinding,
|
|
62
|
+
// and redirect bypasses. The undici dispatcher's connect.lookup fires on every connection (including redirects) →
|
|
63
|
+
// it validates every hop and pins the connection to the validated IP (blocking rebinding). SNI keeps the original
|
|
64
|
+
// hostname → TLS stays valid.
|
|
65
|
+
/**
 * Decide whether an IP address string is acceptable as a fetch target.
 *
 * - A string ipaddr.js cannot parse is rejected.
 * - An IPv4-mapped IPv6 address is judged by its embedded IPv4 address.
 * - Anything else is accepted only when ipaddr.js classifies its range as 'unicast'.
 *
 * @param {string} ipStr - Address in textual form.
 * @returns {boolean} true when the address is accepted.
 */
function ipIsPublic(ipStr) {
  let parsed;
  try {
    parsed = ipaddr.parse(ipStr);
  } catch {
    return false;
  }

  const isMappedV4 = parsed.kind() === 'ipv6' && parsed.isIPv4MappedAddress();
  if (isMappedV4) {
    return ipIsPublic(parsed.toIPv4Address().toString());
  }
  return parsed.range() === 'unicast';
}
|
|
78
|
+
|
|
79
|
+
// IP-literal normalization (undici skips lookup for strict IP literals and connects directly, so we validate
|
|
80
|
+
// them ourselves up front).
|
|
81
|
+
// Reduce strict v4/v6 + decimal 32-bit (http://2130706433) + hex (http://0x7f000001) to dotted-quad IPv4.
|
|
82
|
+
// Shorthand/octal forms (e.g. 127.1) yield net.isIP=0 and are treated as hostnames → the dispatcher lookup resolves
|
|
83
|
+
// and validates them.
|
|
84
|
+
/**
 * Normalize a URL hostname to an IP-literal string when it is one.
 *
 * Handles, in order: a bracketed or bare address accepted by net.isIP (returned without brackets),
 * an all-digit decimal integer in the 32-bit range, and a `0x`-prefixed hex integer in the 32-bit
 * range; the two integer forms are returned as dotted-quad IPv4.
 *
 * @param {string} host - Hostname, e.g. `URL#hostname`.
 * @returns {string|null} The IP literal, or null when `host` is empty or not one of the forms above.
 */
function literalIp(host) {
  if (!host) return null;

  const bare = host.startsWith('[') && host.endsWith(']') ? host.slice(1, -1) : host;
  if (net.isIP(bare)) return bare;

  const toDottedQuad = (value) => [24, 16, 8, 0].map((shift) => (value >>> shift) & 255).join('.');

  let numeric = null;
  if (/^[0-9]+$/.test(bare)) {
    const dec = Number(bare);
    // Very long digit strings become non-integers (or Infinity) and are not treated as addresses.
    if (Number.isInteger(dec)) numeric = dec;
  } else if (/^0x[0-9a-f]+$/i.test(bare)) {
    numeric = parseInt(bare, 16);
  }

  const inRange = numeric !== null && numeric >= 0 && numeric <= 0xffffffff;
  return inRange ? toDottedQuad(numeric) : null;
}
|
|
101
|
+
|
|
102
|
+
/**
 * `dns.lookup`-compatible resolver that refuses any hostname with a non-public address.
 *
 * Always resolves with `all: true`, rejects with `blocked_host` if any resolved address fails
 * ipIsPublic, and otherwise answers in the shape the caller asked for: the full list when
 * `options.all` is set, else the first address and its family.
 *
 * @param {string} hostname - Name to resolve.
 * @param {object|Function} options - Lookup options, or the callback when options are omitted.
 * @param {Function} [callback] - Node-style callback `(err, address, family)` or `(err, list)`.
 */
function safeLookup(hostname, options, callback) {
  const optionsOmitted = typeof options === 'function';
  const opts = optionsOmitted ? {} : options || {};
  const done = optionsOmitted ? options : callback;

  dns.lookup(hostname, { all: true, family: opts.family || 0 }, (err, addresses) => {
    if (err) {
      done(err);
      return;
    }

    const resolved = Array.isArray(addresses) ? addresses : [{ address: addresses, family: opts.family || 4 }];

    // One bad record poisons the whole answer: the connection must never be able to pick it.
    const hasBlocked = resolved.some((entry) => !ipIsPublic(entry.address));
    if (hasBlocked) {
      done(codeErr('blocked_host'));
      return;
    }

    if (opts.all) {
      done(null, resolved);
      return;
    }
    const first = resolved[0];
    done(null, first.address, first.family);
  });
}
|
|
115
|
+
|
|
116
|
+
// Shared undici Agent whose connect-time lookup is safeLookup; fetchSafe passes it as the `dispatcher`.
const safeAgent = new Agent({ connect: { lookup: safeLookup } });
|
|
117
|
+
|
|
118
|
+
// Follow redirects manually so every hop is validated. (Automatic following would let a redirect → IP-literal hop
|
|
119
|
+
// bypass both Layer 1 [which only checks the initial URL] and the dispatcher lookup [which skips IP literals], so we
|
|
120
|
+
// walk the chain ourselves.)
|
|
121
|
+
/**
 * Fetch `startUrl`, following redirects by hand (at most 6 requests) so every hop is re-validated.
 *
 * Per hop: the scheme must be http(s); an IP-literal host must pass ipIsPublic before any request
 * is made; the request itself goes through safeAgent with `redirect: 'manual'`. The timeout signal
 * is created per hop, so `timeoutMs` bounds each request rather than the whole chain.
 *
 * @param {string} startUrl - Absolute URL to fetch (the caller is expected to have validated it).
 * @param {{ fetchImpl: Function, timeoutMs: number }} options - fetch implementation and per-hop timeout in ms.
 * @returns {Promise<object>} The first response from `fetchImpl` that is not a 3xx carrying a Location header.
 * @throws {Error} With message `unsupported_protocol`, `blocked_host`, `timeout`, `fetch_failed`
 *   (network failure or an unparseable redirect target) or `too_many_redirects`.
 */
async function fetchSafe(startUrl, { fetchImpl, timeoutMs }) {
  let current = startUrl;
  for (let hop = 0; hop < 6; hop++) {
    const u = new URL(current);
    if (!/^https?:$/.test(u.protocol)) throw codeErr('unsupported_protocol');
    const lit = literalIp(u.hostname);
    if (lit && !ipIsPublic(lit)) throw codeErr('blocked_host');

    let res;
    try {
      res = await fetchImpl(current, {
        headers: { 'user-agent': UA, accept: 'text/html,application/xhtml+xml,*/*;q=0.8' },
        redirect: 'manual',
        signal: AbortSignal.timeout(timeoutMs),
        dispatcher: safeAgent,
      });
    } catch (e) {
      // The lookup's blocked_host error may arrive wrapped as `cause`, so check that message first.
      const msg = ((e && e.cause && e.cause.message) || (e && e.message) || '') + '';
      if (/blocked_host/.test(msg)) throw codeErr('blocked_host');
      if ((e && e.name === 'TimeoutError') || /timed?\s?out|timeout|aborted/i.test(msg)) throw codeErr('timeout');
      throw codeErr('fetch_failed');
    }

    const location = res.status >= 300 && res.status < 400 ? res.headers.get('location') : null;
    if (!location) return res;

    // Release the redirect response before moving on (or failing) so the connection is not left open.
    if (res.body && res.body.cancel) {
      try {
        await res.body.cancel();
      } catch {}
    }

    // A malformed Location must surface as a coded error, not a raw TypeError from the URL parser.
    try {
      current = new URL(location, current).toString();
    } catch {
      throw codeErr('fetch_failed');
    }
  }
  throw codeErr('too_many_redirects');
}
|
|
157
|
+
|
|
158
|
+
/**
 * Read a response body as UTF-8 text, refusing bodies larger than MAX_BYTES.
 *
 * When the body exposes a stream reader, chunks are counted as they arrive and the read is
 * cancelled as soon as the running total exceeds the cap. When it does not, the body is read with
 * `res.text()` and the same cap is applied to the result afterwards.
 *
 * @param {object} res - Response-like object with `body` and/or `text()`.
 * @returns {Promise<string>} The decoded body.
 * @throws {Error} With message `response_too_large` when the cap is exceeded.
 */
async function readCapped(res) {
  const reader = res.body && res.body.getReader ? res.body.getReader() : null;

  if (!reader) {
    const text = await res.text();
    // UTF-8 never encodes N UTF-16 code units in fewer than N bytes, so a length above the cap
    // proves the body was over the cap; a body within the cap can never be rejected here.
    if (text.length > MAX_BYTES) throw codeErr('response_too_large');
    return text;
  }

  const decoder = new TextDecoder('utf-8');
  let out = '';
  let total = 0;
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    total += value.byteLength;
    if (total > MAX_BYTES) {
      try {
        await reader.cancel();
      } catch {}
      throw codeErr('response_too_large');
    }
    // stream: true keeps multi-byte sequences that straddle chunk boundaries intact.
    out += decoder.decode(value, { stream: true });
  }
  out += decoder.decode();
  return out;
}
|
|
180
|
+
|
|
181
|
+
/**
 * Token-minimize a markdown string while leaving fenced code blocks untouched.
 *
 * Outside of fences it removes image markdown, empty-text links, link targets (keeping the display
 * text), `[edit]` markers, footnote definitions and inline footnote markers, trailing whitespace,
 * and runs of three or more newlines.
 *
 * @param {string} md - Markdown to minimize; a missing value is treated as ''.
 * @returns {string} The minimized markdown, trimmed.
 */
export function minimize(md) {
  // NUL delimits the fence placeholders below, so any NUL already in the input is dropped first.
  let s = (md || '').replace(/\u0000/g, '');

  // 1) Park fenced code blocks behind placeholders so no rule below touches their contents.
  //    The sentinels are written as escapes on purpose: a bare `F<n>` placeholder would collide
  //    with ordinary prose such as "F16" when fences are restored.
  const fences = [];
  s = s.replace(/```[\s\S]*?```/g, (block) => {
    fences.push(block);
    return `\u0000F${fences.length - 1}\u0000`;
  });

  s = s
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // strip image markdown
    .replace(/\[\s*\]\([^)]*\)/g, '') // strip empty-text links (zero information)
    .replace(/\[([^\]]+)\]\([^()]*(?:\([^()]*\)[^()]*)*\)/g, '$1') // keep only the link's display text
    .replace(/[ \t]*\[edit\]/gi, '') // wiki section-edit link markers
    .replace(/^\[\^[^\]]+\]:.*$/gm, '') // strip footnote-definition lines
    .replace(/\[\^[^\]]+\]/g, '') // strip inline footnote markers
    .replace(/[ \t]+\n/g, '\n') // strip trailing whitespace
    .replace(/\n{3,}/g, '\n\n'); // collapse runs of blank lines

  // 2) Restore fences. An index with no stored fence is left as-is rather than becoming "undefined".
  s = s.replace(/\u0000F(\d+)\u0000/g, (placeholder, i) => fences[Number(i)] ?? placeholder);
  return s.trim();
}
|
|
207
|
+
|
|
208
|
+
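// Shared Turndown converter (fenced code blocks, ATX headings, '-' bullets) used by htmlToMarkdown below.
// (toPlainText, further down, does the markdown → plain text rough symbol stripping used when format='text'.)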
|
|
209
|
+
const turndown = new TurndownService({ codeBlockStyle: 'fenced', headingStyle: 'atx', bulletListMarker: '-' });
|
|
210
|
+
// HTML→markdown for the Readability fallback (Defuddle emits its own markdown; Readability emits article.content HTML).
|
|
211
|
+
/**
 * Convert an HTML fragment to markdown with the shared Turndown instance.
 *
 * @param {string} html - HTML to convert; a missing value is treated as ''.
 * @returns {string} Markdown, or '' when conversion throws.
 */
function htmlToMarkdown(html) {
  const source = html || '';
  let markdown;
  try {
    markdown = turndown.turndown(source);
  } catch {
    markdown = '';
  }
  return markdown;
}
|
|
218
|
+
// Word count for comparing body volume (the extractor-selection criterion).
|
|
219
|
+
/** Number of whitespace-separated words in `s`; empty or missing input counts as 0. */
const wordsOf = (s) => {
  if (!s) return 0;
  const pieces = s.trim().split(/\s+/);
  return pieces.filter(Boolean).length;
};
|
|
220
|
+
|
|
221
|
+
/**
 * Rough markdown → plain text conversion.
 *
 * Removes leading ATX heading markers, every `**`, `__`, `*`, `_` and backtick (including ones
 * inside words), leading `>` quote markers, and collapses runs of three or more newlines.
 *
 * @param {string} md - Markdown text.
 * @returns {string} The stripped text, trimmed.
 */
function toPlainText(md) {
  const noHeadings = md.replace(/^#{1,6}\s+/gm, '');
  const noEmphasis = noHeadings.replace(/(\*\*|__|\*|_|`)/g, '');
  const noQuotes = noEmphasis.replace(/^>\s?/gm, '');
  const collapsed = noQuotes.replace(/\n{3,}/g, '\n\n');
  return collapsed.trim();
}
|
|
229
|
+
|
|
230
|
+
/**
 * Fetch `url` and run both extractors (Defuddle and Readability) over the parsed page.
 *
 * The fetch goes through fetchSafe and readCapped, and the response content type is checked before
 * the body is read. Readability runs on a clone of the parsed document taken before Defuddle runs;
 * if cloning or Readability fails, `readabilityMd` is simply ''.
 *
 * @param {string} url - Absolute http(s) URL.
 * @param {{ fetchImpl?: Function, timeoutMs?: number }} [options] - fetch implementation (default: global fetch)
 *   and per-request timeout in ms (default: 8000).
 * @returns {Promise<{ html: string, defuddle: object, defuddleMd: string, readabilityMd: string }>}
 *   Raw page text, the Defuddle result object, and each extractor's markdown.
 * @throws {Error} With message `invalid_url`, `unsupported_protocol`, `fetch_<status>`,
 *   `unsupported_content_type`, `parse_failed`, `empty_or_non_html`, `extract_failed`, or any code
 *   thrown by fetchSafe / readCapped.
 */
export async function extract(url, { fetchImpl = fetch, timeoutMs = 8000 } = {}) {
  let parsedUrl;
  try {
    parsedUrl = new URL(url);
  } catch {
    throw codeErr('invalid_url');
  }
  const isHttp = /^https?:$/.test(parsedUrl.protocol);
  if (!isHttp) {
    throw codeErr('unsupported_protocol');
  }

  const res = await fetchSafe(url, { fetchImpl, timeoutMs });
  if (!res.ok) {
    throw codeErr('fetch_' + res.status);
  }

  // A missing content-type header is tolerated; a present one must look like HTML, plain text or XML.
  const acceptedType = /text\/html|application\/xhtml|text\/plain|\+xml/i;
  const ctype = res.headers.get('content-type') || '';
  if (ctype && !acceptedType.test(ctype)) {
    throw codeErr('unsupported_content_type');
  }

  const html = await readCapped(res);

  let document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    throw codeErr('parse_failed');
  }
  if (!document || !document.documentElement) {
    throw codeErr('empty_or_non_html');
  }

  // Take Readability's private copy before any extractor runs on the document.
  let readabilityDoc;
  try {
    readabilityDoc = document.cloneNode(true);
  } catch {
    readabilityDoc = null; // no clone → Defuddle only, no second extractor
  }

  let defuddle;
  try {
    defuddle = await Defuddle(document, url, { markdown: true });
  } catch {
    throw codeErr('extract_failed');
  }
  const defuddleMd = defuddle.contentMarkdown ?? defuddle.content ?? '';

  // Second extractor: its failure is never fatal, it just leaves readabilityMd empty.
  let readabilityMd = '';
  if (readabilityDoc) {
    try {
      const article = new Readability(readabilityDoc).parse();
      const hasContent = Boolean(article && article.content);
      if (hasContent) {
        readabilityMd = htmlToMarkdown(article.content);
      }
    } catch {
      readabilityMd = '';
    }
  }

  return { html, defuddle, defuddleMd, readabilityMd };
}
|
|
288
|
+
|
|
289
|
+
/**
 * The full pipeline: extract → pick the fuller body → minimize → token receipt.
 * A single entry point shared by the MCP tool, the HTTP API, and the CLI.
 *
 * @param {string} url - Absolute http(s) URL to read.
 * @param {{ format?: 'markdown'|'text', model?: string, includeFooter?: boolean }} [options]
 *   `format` selects markdown or stripped plain text; `model` selects the PRICING row used for the
 *   cost estimate (unknown names fall back to 'gpt-4o'); `includeFooter` appends an attribution footer.
 * @returns {Promise<{ url: string, title: string, wordCount: number, content: string, receipt: object, partial: boolean, extractor: string }>}
 * @throws {Error} Any coded error thrown by extract.
 */
export async function leanRead(url, { format = 'markdown', model = 'gpt-4o', includeFooter = false } = {}) {
  const { html, defuddle, defuddleMd, readabilityMd } = await extract(url);

  // Body-fidelity-first selection: minimize both outputs and compare word counts. Defuddle is kept
  // unless Readability recovers more than 15% more words.
  const defuddleMin = minimize(defuddleMd);
  const readabilityMin = readabilityMd ? minimize(readabilityMd) : '';
  const useReadability = wordsOf(readabilityMin) > wordsOf(defuddleMin) * 1.15;
  const extractor = useReadability ? 'readability' : 'defuddle';
  const md = useReadability ? readabilityMd : defuddleMd; // raw (un-minimized) markdown for the integrity guard
  let content = useReadability ? readabilityMin : defuddleMin;

  // Integrity guard: output dominated by footnote definitions counts as a failed body extraction.
  const footnoteDefs = md.match(/^\[\^[^\]]+\]:.*$/gm) || [];
  const footnoteChars = footnoteDefs.join('\n').length;
  const citationHeavy = md.length > 0 && footnoteChars / md.length > 0.5 && footnoteDefs.length > 20;

  if (format === 'text') content = toPlainText(content);

  // Decided before the footer is appended so the footer cannot push a near-empty result over the threshold.
  const partial = content.length < 200 || citationHeavy;

  if (includeFooter) content += `\n\n---\ncleaned by lean reader — lean.tld/${url}`;

  const beforeTokens = countTokens(html);
  const afterTokens = countTokens(content);
  const savedTokens = Math.max(0, beforeTokens - afterTokens);

  // Own-property check: a plain truthiness lookup would accept inherited keys such as 'constructor'
  // or 'toString', yielding a function as the price and NaN in the receipt. Unsupported names are
  // priced (and labelled) as gpt-4o so label and price always agree.
  const pricedModel = Object.hasOwn(PRICING, model) ? model : 'gpt-4o';
  const price = PRICING[pricedModel];

  const receipt = {
    tokenizer: 'o200k_base',
    model: pricedModel,
    beforeTokens,
    afterTokens,
    savedTokens,
    savedPct: beforeTokens ? Math.round((savedTokens / beforeTokens) * 100) : 0,
    ratio: afterTokens ? Number((beforeTokens / afterTokens).toFixed(1)) : null, // null when nothing was returned
    estCostSavedUsd: Number(((savedTokens / 1e6) * price).toFixed(4)),
  };

  // Counted on the content actually returned, so the reported number matches what the caller gets.
  const wordCount = wordsOf(content);

  return {
    url,
    title: defuddle.title || '',
    wordCount,
    content,
    receipt,
    partial,
    extractor,
  };
}
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "lean-reader",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"private": false,
|
|
5
|
+
"type": "module",
|
|
6
|
+
"description": "Token-minimized URL-to-clean-text reader for LLMs, with a token-savings receipt. MCP server + library.",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"bin": {
|
|
9
|
+
"lean-reader": "src/server.js"
|
|
10
|
+
},
|
|
11
|
+
"mcpName": "io.github.AIMento/lean-reader",
|
|
12
|
+
"engines": {
|
|
13
|
+
"node": ">=20"
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"src",
|
|
17
|
+
"lib"
|
|
18
|
+
],
|
|
19
|
+
"keywords": [
|
|
20
|
+
"mcp",
|
|
21
|
+
"mcp-server",
|
|
22
|
+
"llm",
|
|
23
|
+
"tokens",
|
|
24
|
+
"scraper",
|
|
25
|
+
"readability",
|
|
26
|
+
"markdown"
|
|
27
|
+
],
|
|
28
|
+
"scripts": {
|
|
29
|
+
"start": "node src/server.js",
|
|
30
|
+
"test:core": "node scripts/test-core.mjs",
|
|
31
|
+
"test:mcp": "node scripts/test-mcp.mjs",
|
|
32
|
+
"measure": "node measure.mjs"
|
|
33
|
+
},
|
|
34
|
+
"dependencies": {
|
|
35
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
36
|
+
"@mozilla/readability": "^0.6.0",
|
|
37
|
+
"defuddle": "^0.19.0",
|
|
38
|
+
"ipaddr.js": "^2.4.0",
|
|
39
|
+
"js-tiktoken": "^1.0.21",
|
|
40
|
+
"linkedom": "^0.18.12",
|
|
41
|
+
"turndown": "^7.2.4",
|
|
42
|
+
"undici": "^6.27.0",
|
|
43
|
+
"zod": "^4.4.3"
|
|
44
|
+
}
|
|
45
|
+
}
|
package/src/server.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Lean Reader MCP server (stdio). Shares the core (lib/core.js) with the Vercel API.
|
|
3
|
+
// Run: npx lean-reader / client mcp.json: { "command": "npx", "args": ["-y", "lean-reader"] }
|
|
4
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
5
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import { leanRead } from '../lib/core.js';
|
|
8
|
+
|
|
9
|
+
// MCP server instance; the single `lean_read` tool below is its whole surface.
const server = new McpServer({ name: 'lean-reader', version: '0.1.0' });

server.registerTool(
  'lean_read',
  {
    title: 'Lean Reader',
    description:
      'Fetch a URL and return token-minimized clean text plus a token-savings receipt. Strips nav/scripts/boilerplate so an LLM reads the article, not the page. The receipt counts tokens vs the raw page HTML (typically ~15x fewer, but it ranges from ~1.5x on already-clean pages to 100x+ on script-heavy docs). Two extractors (Defuddle + Readability), body-max selection, so it does not silently drop the article body. Static HTML only — JS-rendered pages may come back partial.',
    inputSchema: {
      url: z.string().url().describe('The URL to fetch and clean'),
      format: z.enum(['markdown', 'text']).optional().describe('Output format (default: markdown)'),
    },
  },
  /**
   * Tool handler: runs leanRead and returns one text item made of an optional title heading,
   * the receipt line, an optional partial-extraction note, and the cleaned content.
   * Failures are returned as an `isError` result rather than thrown.
   */
  async ({ url, format }) => {
    try {
      const r = await leanRead(url, { format: format ?? 'markdown' });
      const c = r.receipt;
      // ratio is null when the cleaned output has zero tokens; drop the segment instead of printing "nullx".
      const ratioPart = c.ratio == null ? '' : ` · ${c.ratio}x`;
      const receiptLine =
        `> ${c.beforeTokens.toLocaleString()} → ${c.afterTokens.toLocaleString()} tokens ` +
        `(${c.savedPct}% saved${ratioPart} · ~$${c.estCostSavedUsd} on ${c.model}, ${c.tokenizer}) · cleaned by lean reader`;
      const header = (r.title ? `# ${r.title}\n\n` : '') + receiptLine + '\n\n';
      const note = r.partial
        ? '[lean reader] This page looks JS-rendered; static extraction returned little. v1 supports static HTML only.\n\n'
        : '';
      return { content: [{ type: 'text', text: header + note + r.content }] };
    } catch (e) {
      // A non-Error rejection has no `.message`; stringify it so the client never sees "undefined".
      const reason = e instanceof Error ? e.message : String(e);
      return { content: [{ type: 'text', text: `lean_read error: ${reason}` }], isError: true };
    }
  }
);

// Serve over stdio; top-level await keeps the module alive until the transport is connected.
const transport = new StdioServerTransport();
await server.connect(transport);
|