@just-every/mcp-read-website-fast 0.1.21 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +6 -5
- package/dist/internal/fetchMarkdown.js +9 -5
- package/dist/internal/pinnedDnsDispatcher.d.ts +5 -0
- package/dist/internal/pinnedDnsDispatcher.js +29 -0
- package/dist/internal/secureCrawl.d.ts +3 -0
- package/dist/internal/secureCrawl.js +164 -0
- package/dist/internal/secureFetchHtml.d.ts +10 -0
- package/dist/internal/secureFetchHtml.js +53 -0
- package/dist/internal/turndownPluginGfmCompat.js +12 -2
- package/dist/utils/urlPolicy.d.ts +14 -0
- package/dist/utils/urlPolicy.js +172 -0
- package/package.json +2 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { Command } from 'commander';
|
|
3
3
|
import { fetchMarkdown } from './internal/fetchMarkdown.js';
|
|
4
|
-
import {
|
|
4
|
+
import { secureCrawl } from './internal/secureCrawl.js';
|
|
5
5
|
import { readFileSync } from 'fs';
|
|
6
6
|
import { fileURLToPath } from 'url';
|
|
7
7
|
import { dirname, join } from 'path';
|
|
@@ -43,8 +43,7 @@ program
|
|
|
43
43
|
}
|
|
44
44
|
console.error(`Fetching ${url}...`);
|
|
45
45
|
if (options.output === 'json') {
|
|
46
|
-
const
|
|
47
|
-
const results = await fetch(url, crawlOptions);
|
|
46
|
+
const results = await secureCrawl(url, crawlOptions);
|
|
48
47
|
console.log(JSON.stringify(results, null, 2));
|
|
49
48
|
}
|
|
50
49
|
else if (options.output === 'markdown') {
|
|
@@ -57,11 +56,13 @@ program
|
|
|
57
56
|
}
|
|
58
57
|
if (result.error) {
|
|
59
58
|
console.error(`Error: ${result.error}`);
|
|
59
|
+
if (!result.markdown) {
|
|
60
|
+
process.exit(1);
|
|
61
|
+
}
|
|
60
62
|
}
|
|
61
63
|
}
|
|
62
64
|
else if (options.output === 'both') {
|
|
63
|
-
const
|
|
64
|
-
const results = await fetch(url, crawlOptions);
|
|
65
|
+
const results = await secureCrawl(url, crawlOptions);
|
|
65
66
|
results.forEach((result) => {
|
|
66
67
|
console.log(`\n## URL: ${result.url}\n`);
|
|
67
68
|
if (result.markdown) {
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { extractMarkdownLinks, filterSameOriginLinks, } from '../utils/extractMarkdownLinks.js';
|
|
2
|
+
import { assertPublicHttpUrl } from '../utils/urlPolicy.js';
|
|
3
|
+
import { secureCrawl } from './secureCrawl.js';
|
|
3
4
|
export async function fetchMarkdown(url, options = {}) {
|
|
4
5
|
try {
|
|
5
6
|
const maxPages = options.maxPages ?? 1;
|
|
@@ -11,6 +12,7 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
11
12
|
if (visited.has(currentUrl))
|
|
12
13
|
continue;
|
|
13
14
|
visited.add(currentUrl);
|
|
15
|
+
await assertPublicHttpUrl(currentUrl);
|
|
14
16
|
const crawlOptions = {
|
|
15
17
|
depth: 0,
|
|
16
18
|
maxConcurrency: options.maxConcurrency ?? 3,
|
|
@@ -23,8 +25,7 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
23
25
|
if (options.cookiesFile) {
|
|
24
26
|
crawlOptions.cookiesFile = options.cookiesFile;
|
|
25
27
|
}
|
|
26
|
-
const
|
|
27
|
-
const results = await fetch(currentUrl, crawlOptions);
|
|
28
|
+
const results = await secureCrawl(currentUrl, crawlOptions);
|
|
28
29
|
if (results && results.length > 0) {
|
|
29
30
|
const result = results[0];
|
|
30
31
|
allResults.push(result);
|
|
@@ -67,7 +68,10 @@ export async function fetchMarkdown(url, options = {}) {
|
|
|
67
68
|
title: pagesToReturn[0].title,
|
|
68
69
|
links: pagesToReturn.flatMap(r => r.links || []),
|
|
69
70
|
error: pagesToReturn.some(r => r.error)
|
|
70
|
-
? `Some pages had errors: ${pagesToReturn
|
|
71
|
+
? `Some pages had errors: ${pagesToReturn
|
|
72
|
+
.filter(r => r.error)
|
|
73
|
+
.map(r => r.url)
|
|
74
|
+
.join(', ')}`
|
|
71
75
|
: undefined,
|
|
72
76
|
};
|
|
73
77
|
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { LookupFunction } from 'node:net';
|
|
2
|
+
import { Agent } from 'undici';
|
|
3
|
+
import type { PublicHttpUrlResolution } from '../utils/urlPolicy.js';
|
|
4
|
+
/**
 * Builds an undici `Agent` whose connection-time hostname lookup is answered
 * from the addresses recorded in `resolution` rather than by a fresh DNS query.
 */
export declare function createPinnedDnsAgent(resolution: PublicHttpUrlResolution): Agent;
|
|
5
|
+
/**
 * Creates a lookup callback that answers only for `resolution.hostname`,
 * returning the addresses stored in `resolution`; a lookup for any other
 * hostname is reported to the callback as an error.
 */
export declare function createPinnedLookup(resolution: PublicHttpUrlResolution): LookupFunction;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { Agent } from 'undici';
|
|
2
|
+
/**
 * Creates an undici Agent whose sockets resolve hostnames through the pinned
 * lookup built from `resolution`, so connections use the pre-resolved addresses.
 *
 * @param resolution Hostname and address list the agent is pinned to.
 * @returns A new Agent configured with the pinned lookup.
 */
export function createPinnedDnsAgent(resolution) {
    const pinnedLookup = createPinnedLookup(resolution);
    const agentOptions = { connect: { lookup: pinnedLookup } };
    return new Agent(agentOptions);
}
|
|
9
|
+
export function createPinnedLookup(resolution) {
|
|
10
|
+
return (hostname, options, callback) => {
|
|
11
|
+
const normalizedHostname = normalizeLookupHostname(hostname);
|
|
12
|
+
if (normalizedHostname !== resolution.hostname) {
|
|
13
|
+
callback(new Error(`Unexpected hostname lookup: ${normalizedHostname}`), '');
|
|
14
|
+
return;
|
|
15
|
+
}
|
|
16
|
+
if (options.all) {
|
|
17
|
+
callback(null, resolution.addresses);
|
|
18
|
+
return;
|
|
19
|
+
}
|
|
20
|
+
const address = resolution.addresses[0];
|
|
21
|
+
callback(null, address.address, address.family);
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
function normalizeLookupHostname(hostname) {
|
|
25
|
+
if (hostname.startsWith('[') && hostname.endsWith(']')) {
|
|
26
|
+
return hostname.slice(1, -1);
|
|
27
|
+
}
|
|
28
|
+
return hostname;
|
|
29
|
+
}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import pLimit from 'p-limit';
|
|
2
|
+
import { DiskCache } from '@just-every/crawl/dist/cache/disk.js';
|
|
3
|
+
import { normalizeUrl, isSameOrigin, } from '@just-every/crawl/dist/cache/normalize.js';
|
|
4
|
+
import { parseNetscapeCookieFile, buildCookieHeaderForUrl, } from '@just-every/crawl/dist/crawler/cookies.js';
|
|
5
|
+
import { isAllowedByRobots, getCrawlDelay, } from '@just-every/crawl/dist/crawler/robots.js';
|
|
6
|
+
import { htmlToDom, extractLinks } from '@just-every/crawl/dist/parser/dom.js';
|
|
7
|
+
import { extractArticle } from '@just-every/crawl/dist/parser/article.js';
|
|
8
|
+
import { formatArticleMarkdown } from '@just-every/crawl/dist/parser/markdown.js';
|
|
9
|
+
import { assertPublicHttpUrl } from '../utils/urlPolicy.js';
|
|
10
|
+
import { secureFetchHtml } from './secureFetchHtml.js';
|
|
11
|
+
/**
 * Crawls `startUrl` with a fresh SecureCrawler and resolves to its results.
 *
 * @param startUrl URL to start crawling from.
 * @param options Crawl options forwarded to the SecureCrawler constructor.
 * @returns The array of per-page results collected by the crawler.
 */
export async function secureCrawl(startUrl, options = {}) {
    const session = new SecureCrawler(options);
    await session.init();
    const pages = session.crawl(startUrl);
    return pages;
}
|
|
16
|
+
/**
 * Level-by-level crawler that checks every URL with `assertPublicHttpUrl`
 * before it is fetched or queued, and downloads pages via `secureFetchHtml`.
 *
 * Each processed URL contributes one entry to `results`: either
 * `{ url, markdown, title, links? }` or `{ url, markdown: '', error }`.
 */
class SecureCrawler {
    // Normalized URLs that processUrl has already claimed.
    visited = new Set();
    // URLs waiting to be processed at the next depth level.
    queue = [];
    // p-limit instance bounding concurrent processUrl calls.
    limit;
    // DiskCache used to reuse previously converted pages.
    cache;
    // Options with defaults applied (see constructor).
    options;
    // Accumulated per-page results returned by crawl().
    results = [];
    // Parsed cookie file, set only when options.cookiesFile was supplied.
    cookieJar;
    /**
     * @param options Crawl options; unspecified fields default to depth 0,
     *   maxConcurrency 3, respectRobots true, sameOriginOnly true,
     *   cacheDir '.cache' and timeout 30000.
     */
    constructor(options = {}) {
        this.options = {
            depth: options.depth ?? 0,
            maxConcurrency: options.maxConcurrency ?? 3,
            respectRobots: options.respectRobots ?? true,
            sameOriginOnly: options.sameOriginOnly ?? true,
            userAgent: options.userAgent,
            cacheDir: options.cacheDir ?? '.cache',
            timeout: options.timeout ?? 30000,
            cookiesFile: options.cookiesFile,
        };
        this.limit = pLimit(this.options.maxConcurrency);
        this.cache = new DiskCache(this.options.cacheDir);
        if (options.cookiesFile) {
            this.cookieJar = parseNetscapeCookieFile(options.cookiesFile);
        }
    }
    /** Initializes the disk cache; must complete before crawl() is called. */
    async init() {
        await this.cache.init();
    }
    /**
     * Crawls from `startUrl` and returns the collected results.
     * Rejects if the start URL fails the public-URL policy.
     */
    async crawl(startUrl) {
        const normalizedUrl = normalizeUrl(startUrl);
        await assertPublicHttpUrl(normalizedUrl);
        this.queue.push(normalizedUrl);
        await this.processQueue(0);
        return this.results;
    }
    /**
     * Processes every queued URL at `currentDepth` (bounded by `limit`), then
     * recurses into the links those pages queued, up to `options.depth`.
     */
    async processQueue(currentDepth) {
        if (currentDepth > this.options.depth) {
            return;
        }
        // Snapshot and reset so links discovered now land in the next level.
        const urls = [...this.queue];
        this.queue = [];
        const tasks = urls.map(url => this.limit(() => this.processUrl(url, currentDepth)));
        await Promise.all(tasks);
        if (this.queue.length > 0) {
            await this.processQueue(currentDepth + 1);
        }
    }
    /**
     * Fetches one URL, converts it to markdown and records the outcome in
     * `results`. Failures are recorded as error results rather than thrown.
     */
    async processUrl(url, depth) {
        const normalizedUrl = normalizeUrl(url);
        if (this.visited.has(normalizedUrl)) {
            return;
        }
        this.visited.add(normalizedUrl);
        try {
            await assertPublicHttpUrl(normalizedUrl);
            // NOTE(review): entries are stored under finalUrl below but looked up
            // by normalizedUrl here, so a redirected page may never hit the
            // cache - confirm this is intended.
            const cached = await this.cache.get(normalizedUrl);
            if (cached) {
                this.results.push({
                    url: normalizedUrl,
                    markdown: cached.markdown,
                    title: cached.title,
                });
                return;
            }
            if (this.options.respectRobots) {
                const allowed = await isAllowedByRobots(normalizedUrl, this.options.userAgent);
                if (!allowed) {
                    this.results.push({
                        url: normalizedUrl,
                        markdown: '',
                        error: 'Blocked by robots.txt',
                    });
                    return;
                }
                const delay = await getCrawlDelay(normalizedUrl, this.options.userAgent);
                if (delay > 0) {
                    // The delay is multiplied by 1000 to get the setTimeout value in milliseconds.
                    await new Promise(resolve => setTimeout(resolve, delay * 1000));
                }
            }
            const { html, finalUrl } = await secureFetchHtml(normalizedUrl, {
                userAgent: this.options.userAgent,
                timeout: this.options.timeout,
                cookieHeaderForUrl: redirectedUrl => this.cookieJar
                    ? buildCookieHeaderForUrl(redirectedUrl, this.cookieJar)
                    : undefined,
            });
            if (!html.trim()) {
                this.results.push({
                    url: finalUrl,
                    markdown: '',
                    error: 'Empty response from server',
                });
                return;
            }
            const dom = htmlToDom(html, finalUrl);
            const article = extractArticle(dom);
            if (!article) {
                this.results.push({
                    url: finalUrl,
                    markdown: '',
                    error: 'Failed to extract article content',
                });
                return;
            }
            if (!article.content || article.content.trim().length < 50) {
                this.results.push({
                    url: finalUrl,
                    markdown: `# ${article.title || 'Page Content'}\n\n` +
                        '*Note: This page appears to be JavaScript-rendered. Limited content extracted.*\n\n' +
                        (article.textContent
                            ? `${article.textContent.substring(0, 1000)}...`
                            : 'No text content available'),
                    title: article.title || finalUrl,
                    error: 'Limited content extracted (JavaScript-rendered page)',
                });
                return;
            }
            const markdown = formatArticleMarkdown(article);
            await this.cache.put(finalUrl, markdown, article.title);
            let links = [];
            if (depth < this.options.depth) {
                links = extractLinks(dom);
                if (this.options.sameOriginOnly) {
                    links = links.filter(link => isSameOrigin(finalUrl, link));
                }
                for (const link of links) {
                    // A single malformed, non-public or unresolvable link must
                    // not discard the page that was just extracted: skip it.
                    try {
                        const normalized = normalizeUrl(link);
                        if (this.visited.has(normalized)) {
                            continue;
                        }
                        await assertPublicHttpUrl(normalized);
                        this.queue.push(normalized);
                    }
                    catch {
                        // Link rejected by normalization or URL policy; not queued.
                    }
                }
            }
            this.results.push({
                url: finalUrl,
                markdown,
                title: article.title,
                links: links.length > 0 ? links : undefined,
            });
        }
        catch (error) {
            this.results.push({
                url: normalizedUrl,
                markdown: '',
                error: error instanceof Error ? error.message : 'Unknown error',
            });
        }
    }
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/** Options accepted by `secureFetchHtml`. */
export interface SecureFetchHtmlOptions {
    /** Value sent as the `User-Agent` request header; a built-in default is used when omitted. */
    userAgent?: string;
    /** Timeout in milliseconds applied to each request (default 30000). */
    timeout?: number;
    /** Maximum number of redirects to follow (default 5). */
    maxRedirections?: number;
    /** Returns the `Cookie` header value to send for the given URL, or `undefined` to send none. */
    cookieHeaderForUrl?: (url: string) => string | undefined;
}
|
|
7
|
+
/**
 * Fetches `url` as HTML, following redirects manually and re-validating each
 * hop against the public-URL policy.
 *
 * Resolves to the response body and the URL it was finally served from.
 * Rejects on policy violations, non-OK statuses, non-HTML content types,
 * redirects without a `Location` header, or too many redirects.
 */
export declare function secureFetchHtml(url: string, options?: SecureFetchHtmlOptions): Promise<{
    html: string;
    finalUrl: string;
}>;
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { fetch } from 'undici';
|
|
2
|
+
import { resolvePublicHttpUrl } from '../utils/urlPolicy.js';
|
|
3
|
+
import { createPinnedDnsAgent } from './pinnedDnsDispatcher.js';
|
|
4
|
+
// User-Agent header value sent when the caller does not provide `options.userAgent`.
const DEFAULT_USER_AGENT = 'MCP/0.1 (+https://github.com/just-every/mcp-read-website-fast)';
|
|
5
|
+
/**
 * Fetches `url` as HTML while pinning DNS for every hop.
 *
 * Each hop is resolved with `resolvePublicHttpUrl` and requested through an
 * agent pinned to that resolution; redirects are followed manually so the
 * redirect target goes through the same validation.
 *
 * @param url URL to fetch.
 * @param options Optional `userAgent`, `timeout` (ms, default 30000),
 *   `maxRedirections` (default 5) and `cookieHeaderForUrl` callback.
 * @returns `{ html, finalUrl }` where `finalUrl` is the URL that produced the body.
 * @throws Error when a redirect lacks a Location header, the final status is
 *   not OK, the content type is not HTML/XHTML, or the redirect limit is
 *   exceeded; errors from URL resolution and the fetch itself propagate.
 */
export async function secureFetchHtml(url, options = {}) {
    const maxRedirections = options.maxRedirections ?? 5;
    let currentUrl = url;
    // `<=` allows the initial request plus up to `maxRedirections` redirects.
    for (let redirectCount = 0; redirectCount <= maxRedirections; redirectCount += 1) {
        const resolution = await resolvePublicHttpUrl(currentUrl);
        const dispatcher = createPinnedDnsAgent(resolution);
        try {
            // Evaluate the cookie callback once per hop.
            const cookieHeader = options.cookieHeaderForUrl?.(currentUrl);
            const response = await fetch(currentUrl, {
                dispatcher,
                headers: {
                    'User-Agent': options.userAgent ?? DEFAULT_USER_AGENT,
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    DNT: '1',
                    Connection: 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    ...(cookieHeader ? { Cookie: cookieHeader } : {}),
                },
                redirect: 'manual',
                signal: AbortSignal.timeout(options.timeout ?? 30000),
            });
            if (response.status >= 300 && response.status < 400) {
                const location = response.headers.get('location');
                await response.body?.cancel();
                if (!location) {
                    throw new Error(`Redirect without Location header for ${currentUrl}`);
                }
                currentUrl = new URL(location, currentUrl).href;
                continue;
            }
            if (!response.ok) {
                // Discard the unread body so closing the dispatcher in `finally`
                // does not have to wait for it.
                await response.body?.cancel().catch(() => undefined);
                throw new Error(`HTTP ${response.status} for ${currentUrl}`);
            }
            const contentType = response.headers.get('content-type');
            // Media types are case-insensitive; compare in lower case.
            const normalizedContentType = contentType?.toLowerCase();
            if (normalizedContentType &&
                !normalizedContentType.includes('text/html') &&
                !normalizedContentType.includes('application/xhtml+xml')) {
                await response.body?.cancel().catch(() => undefined);
                throw new Error(`Non-HTML content type: ${contentType} for ${currentUrl}`);
            }
            return { html: await response.text(), finalUrl: currentUrl };
        }
        finally {
            // One agent per hop: release its sockets before moving on.
            await dispatcher.close();
        }
    }
    throw new Error(`Too many redirects for ${url}`);
}
|
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
import { createRequire } from 'node:module';
|
|
2
|
-
const
|
|
3
|
-
|
|
2
|
+
// `require` function bound to this module's URL, so package names resolve relative to this file.
const requireFromHere = createRequire(import.meta.url);
|
|
3
|
+
/**
 * Loads the `turndown-plugin-gfm` module.
 *
 * Tries to resolve it from this package first; if that fails for any reason,
 * resolves it relative to the installed `@just-every/crawl` package instead.
 *
 * @returns The loaded module object.
 */
function loadTurndownPluginGfmModule() {
    let loaded;
    try {
        loaded = requireFromHere('turndown-plugin-gfm');
    }
    catch {
        // Fall back to the copy visible from @just-every/crawl's location.
        const crawlManifestPath = requireFromHere.resolve('@just-every/crawl/package.json');
        loaded = createRequire(crawlManifestPath)('turndown-plugin-gfm');
    }
    return loaded;
}
|
|
13
|
+
const turndownPluginGfmModule = loadTurndownPluginGfmModule();
|
|
4
14
|
export const gfm = turndownPluginGfmModule.gfm ?? turndownPluginGfmModule.default?.gfm;
|
|
5
15
|
if (typeof gfm !== 'function') {
|
|
6
16
|
throw new Error('turndown-plugin-gfm did not provide a usable gfm export');
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/** One IP address that passed the public-address check. */
export interface PublicResolvedAddress {
    /** The IP address in textual form. */
    address: string;
    /** IP family of `address`: 4 for IPv4, 6 for IPv6. */
    family: 4 | 6;
}
|
|
5
|
+
/** Result of validating a URL and resolving its host to public addresses. */
export interface PublicHttpUrlResolution {
    /** The parsed URL that was validated. */
    url: URL;
    /** Host of `url`, with the square brackets removed for IPv6 literals. */
    hostname: string;
    /** Addresses for `hostname`; for an IP-literal host this is that single address. */
    addresses: PublicResolvedAddress[];
}
|
|
10
|
+
/** Error thrown when a URL or one of its resolved addresses violates the URL policy. */
export declare class UrlPolicyError extends Error {
    constructor(message: string);
}
|
|
13
|
+
/**
 * Validates `url` via `resolvePublicHttpUrl` and resolves to the parsed URL;
 * rejects with `UrlPolicyError` when the URL is not allowed.
 */
export declare function assertPublicHttpUrl(url: string): Promise<URL>;
|
|
14
|
+
/**
 * Parses `url`, requires an http/https scheme without credentials, resolves
 * the host and checks every resulting address against the blocked ranges.
 * Rejects with `UrlPolicyError` on any violation or resolution failure.
 */
export declare function resolvePublicHttpUrl(url: string): Promise<PublicHttpUrlResolution>;
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { lookup } from 'node:dns/promises';
|
|
2
|
+
import { isIP } from 'node:net';
|
|
3
|
+
// Blocked IPv4 networks as [network, netmask] pairs of 32-bit BigInts.
// An address is blocked when (address & netmask) === network.
const IPV4_BLOCKED_RANGES = [
    [0x00000000n, 0xff000000n], // 0.0.0.0/8       "this network"
    [0x0a000000n, 0xff000000n], // 10.0.0.0/8      private
    [0x64400000n, 0xffc00000n], // 100.64.0.0/10   shared address space (CGNAT)
    [0x7f000000n, 0xff000000n], // 127.0.0.0/8     loopback
    [0xa9fe0000n, 0xffff0000n], // 169.254.0.0/16  link-local
    [0xac100000n, 0xfff00000n], // 172.16.0.0/12   private
    [0xc0000000n, 0xffffff00n], // 192.0.0.0/24    IETF protocol assignments
    [0xc0000200n, 0xffffff00n], // 192.0.2.0/24    documentation (TEST-NET-1)
    [0xc0586300n, 0xffffff00n], // 192.88.99.0/24  6to4 relay anycast
    [0xc0a80000n, 0xffff0000n], // 192.168.0.0/16  private
    [0xc6120000n, 0xfffe0000n], // 198.18.0.0/15   benchmarking
    [0xc6336400n, 0xffffff00n], // 198.51.100.0/24 documentation (TEST-NET-2)
    [0xcb007100n, 0xffffff00n], // 203.0.113.0/24  documentation (TEST-NET-3)
    [0xe0000000n, 0xf0000000n], // 224.0.0.0/4     multicast
    [0xf0000000n, 0xf0000000n], // 240.0.0.0/4     reserved, includes 255.255.255.255
];
|
|
20
|
+
// Blocked IPv6 networks as [network, netmask] pairs of 128-bit BigInts.
// An address is blocked when (address & netmask) === network.
const IPV6_BLOCKED_RANGES = [
    [0n, (1n << 128n) - 1n], // ::/128          unspecified address (exact match)
    [1n, (1n << 128n) - 1n], // ::1/128         loopback (exact match)
    [0xffffn << 32n, ((1n << 96n) - 1n) << 32n], // ::ffff:0:0/96 IPv4-mapped
    [0x0064ff9b000000000000000000000000n, prefixMask(96)], // 64:ff9b::/96 NAT64
    [0x01000000000000000000000000000000n, prefixMask(64)], // 100::/64     discard-only
    [0x20010000000000000000000000000000n, prefixMask(23)], // 2001::/23    IETF protocol assignments
    [0x20020000000000000000000000000000n, prefixMask(16)], // 2002::/16    6to4
    [0xfc000000000000000000000000000000n, prefixMask(7)], // fc00::/7      unique local
    [0xfe800000000000000000000000000000n, prefixMask(10)], // fe80::/10    link-local
    [0xff000000000000000000000000000000n, prefixMask(8)], // ff00::/8      multicast
];
|
|
32
|
+
/**
 * Error type raised for every URL-policy violation, so callers can tell
 * policy rejections apart from other failures via `instanceof` or `name`.
 */
export class UrlPolicyError extends Error {
    // Own property set on each instance right after the Error constructor runs.
    name = 'UrlPolicyError';
    /** @param message Human-readable description of the violation. */
    constructor(message) {
        super(message);
    }
}
|
|
38
|
+
/**
 * Runs the full public-URL check on `url` and hands back only the parsed URL.
 *
 * @param url URL string to validate.
 * @returns The parsed `URL` produced by `resolvePublicHttpUrl`.
 * @throws UrlPolicyError propagated from `resolvePublicHttpUrl`.
 */
export async function assertPublicHttpUrl(url) {
    const { url: validatedUrl } = await resolvePublicHttpUrl(url);
    return validatedUrl;
}
|
|
41
|
+
/**
 * Validates a URL and resolves its host to a list of public IP addresses.
 *
 * The URL must parse, use http or https, and carry no username/password.
 * IP-literal hosts are checked directly; other hosts are resolved with
 * `dns.lookup` and every returned address must pass `assertPublicIp`.
 *
 * @param url URL string to validate.
 * @returns `{ url, hostname, addresses }` for the validated URL.
 * @throws UrlPolicyError on any violation or when resolution fails.
 */
export async function resolvePublicHttpUrl(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        throw new UrlPolicyError(`Invalid URL: ${url}`);
    }
    const schemeAllowed = target.protocol === 'http:' || target.protocol === 'https:';
    if (!schemeAllowed) {
        throw new UrlPolicyError(`URL scheme is not allowed: ${target.protocol}`);
    }
    const hasCredentials = Boolean(target.username || target.password);
    if (hasCredentials) {
        throw new UrlPolicyError('URL credentials are not allowed');
    }
    const hostname = normalizeHostname(target.hostname);
    const literalVersion = isIP(hostname);
    if (literalVersion !== 0) {
        // Host is already an IP literal: validate it and skip DNS entirely.
        const literalFamily = ipVersionToFamily(literalVersion);
        assertPublicIp(hostname, literalFamily);
        const literalAddresses = [{ address: hostname, family: literalFamily }];
        return { url: target, hostname, addresses: literalAddresses };
    }
    let resolved;
    try {
        resolved = await lookup(hostname, { all: true, verbatim: true });
    }
    catch (error) {
        throw new UrlPolicyError(`Unable to resolve hostname "${hostname}": ${error instanceof Error ? error.message : 'Unknown DNS error'}`);
    }
    if (resolved.length === 0) {
        throw new UrlPolicyError(`Hostname "${hostname}" did not resolve`);
    }
    // Every resolved address must be public; the first failure aborts.
    const publicAddresses = [];
    for (const entry of resolved) {
        const family = ipVersionToFamily(entry.family);
        assertPublicIp(entry.address, family);
        publicAddresses.push({ address: entry.address, family });
    }
    return { url: target, hostname, addresses: publicAddresses };
}
|
|
83
|
+
/**
 * Removes the enclosing square brackets from a bracketed host such as
 * `[::1]`; hosts without both brackets are returned untouched.
 */
function normalizeHostname(hostname) {
    const isBracketed = hostname.startsWith('[') && hostname.endsWith(']');
    return isBracketed ? hostname.slice(1, -1) : hostname;
}
|
|
89
|
+
/**
 * Throws unless `address` is allowed by the blocked-range tables.
 *
 * IPv6 addresses written as `::ffff:a.b.c.d` are checked as the embedded
 * IPv4 address instead of against the IPv6 table.
 *
 * @param address IP address in textual form.
 * @param family 4 or 6.
 * @throws UrlPolicyError when the address is blocked or the family is unknown.
 */
function assertPublicIp(address, family) {
    if (family !== 4 && family !== 6) {
        throw new UrlPolicyError(`Unknown IP address family for ${address}`);
    }
    if (family === 6) {
        const embeddedV4 = /^::ffff:(\d+\.\d+\.\d+\.\d+)$/.exec(address.toLowerCase());
        if (embeddedV4) {
            assertPublicIp(embeddedV4[1], 4);
            return;
        }
    }
    const blocked = family === 4 ? isBlockedIpv4(address) : isBlockedIpv6(address);
    if (blocked) {
        throw new UrlPolicyError(`IP address is not allowed: ${address}`);
    }
}
|
|
111
|
+
/**
 * Passes through an IP family of 4 or 6 and rejects anything else.
 *
 * @throws UrlPolicyError when `family` is neither 4 nor 6.
 */
function ipVersionToFamily(family) {
    switch (family) {
        case 4:
        case 6:
            return family;
        default:
            throw new UrlPolicyError(`Unknown IP address family: ${family}`);
    }
}
|
|
117
|
+
/**
 * Reports whether a dotted-quad IPv4 address falls inside any entry of
 * IPV4_BLOCKED_RANGES.
 */
function isBlockedIpv4(address) {
    const value = ipv4ToNumber(address);
    for (const [network, netmask] of IPV4_BLOCKED_RANGES) {
        if ((value & netmask) === network) {
            return true;
        }
    }
    return false;
}
|
|
121
|
+
/**
 * Converts a dotted-quad IPv4 address to its 32-bit value as a BigInt.
 *
 * Each octet must consist of one to three decimal digits and be at most 255.
 * Forms that `Number()` would coerce (empty, hex, exponent or
 * whitespace-padded octets) are rejected rather than silently accepted.
 *
 * @param address IPv4 address such as `192.0.2.1`.
 * @returns The address as an unsigned 32-bit BigInt.
 * @throws UrlPolicyError when the text is not a valid dotted quad.
 */
function ipv4ToNumber(address) {
    const octetPattern = /^\d{1,3}$/;
    const octets = address.split('.');
    const wellFormed = octets.length === 4 &&
        octets.every(octet => octetPattern.test(octet) && Number(octet) <= 255);
    if (!wellFormed) {
        throw new UrlPolicyError(`Invalid IPv4 address: ${address}`);
    }
    return octets.reduce((acc, octet) => (acc << 8n) + BigInt(octet), 0n);
}
|
|
132
|
+
/**
 * Reports whether an IPv6 address falls inside any entry of
 * IPV6_BLOCKED_RANGES.
 */
function isBlockedIpv6(address) {
    const value = ipv6ToBigInt(address);
    for (const [network, netmask] of IPV6_BLOCKED_RANGES) {
        if ((value & netmask) === network) {
            return true;
        }
    }
    return false;
}
|
|
136
|
+
/**
 * Converts a textual IPv6 address to its 128-bit value as a BigInt.
 *
 * Supports `::` compression (at most once) and a trailing dotted-quad IPv4
 * suffix such as `::ffff:192.0.2.1`, which is expanded into two hextets
 * before parsing.
 *
 * @param address IPv6 address without brackets.
 * @returns The address as an unsigned 128-bit BigInt.
 * @throws UrlPolicyError when the text is not a valid IPv6 address.
 */
function ipv6ToBigInt(address) {
    let lower = address.toLowerCase();
    // Expand an embedded IPv4 suffix; parsing it as a hextet would stop at the
    // first '.' and yield the wrong value.
    const lastColon = lower.lastIndexOf(':');
    const suffix = lower.slice(lastColon + 1);
    if (suffix.includes('.')) {
        const octets = suffix.split('.');
        const validQuad = lastColon >= 0 &&
            octets.length === 4 &&
            octets.every(octet => /^\d{1,3}$/.test(octet) && Number(octet) <= 255);
        if (!validQuad) {
            throw new UrlPolicyError(`Invalid IPv6 address: ${address}`);
        }
        const [a, b, c, d] = octets.map(Number);
        const high = ((a << 8) | b).toString(16);
        const low = ((c << 8) | d).toString(16);
        lower = `${lower.slice(0, lastColon + 1)}${high}:${low}`;
    }
    const segments = lower.split('::');
    // '::' may appear at most once; extra occurrences used to be dropped silently.
    if (segments.length > 2) {
        throw new UrlPolicyError(`Invalid IPv6 address: ${address}`);
    }
    const compressed = segments.length === 2;
    const [head = '', tail = ''] = segments;
    const headParts = parseIpv6Parts(head);
    const tailParts = parseIpv6Parts(tail);
    // Number of zero groups that '::' stands for.
    const missing = 8 - headParts.length - tailParts.length;
    if (compressed) {
        if (missing < 1) {
            throw new UrlPolicyError(`Invalid IPv6 address: ${address}`);
        }
    }
    else if (headParts.length !== 8) {
        throw new UrlPolicyError(`Invalid IPv6 address: ${address}`);
    }
    const parts = compressed
        ? [...headParts, ...Array(missing).fill(0), ...tailParts]
        : headParts;
    return parts.reduce((acc, part) => (acc << 16n) + BigInt(part), 0n);
}
|
|
155
|
+
/**
 * Parses a colon-separated run of IPv6 groups into their numeric values.
 *
 * Each group must be one to four hexadecimal digits. Groups with trailing
 * garbage (`1g`, `1%eth0`, `127.0.0.1`) are rejected; `Number.parseInt`
 * alone would accept their leading digits.
 *
 * @param value Groups such as `fe80:0:1`; an empty string yields `[]`.
 * @returns The groups as numbers in the range 0..0xffff.
 * @throws UrlPolicyError when any group is malformed.
 */
function parseIpv6Parts(value) {
    if (!value) {
        return [];
    }
    const hextetPattern = /^[0-9a-f]{1,4}$/i;
    return value.split(':').map(part => {
        if (!hextetPattern.test(part)) {
            throw new UrlPolicyError(`Invalid IPv6 address part: ${part}`);
        }
        return Number.parseInt(part, 16);
    });
}
|
|
170
|
+
/**
 * Builds the 128-bit netmask for an IPv6 prefix: `prefixLength` one-bits
 * followed by zero-bits.
 *
 * @param prefixLength Number of leading one-bits (0..128).
 * @returns The mask as a BigInt.
 */
function prefixMask(prefixLength) {
    const ones = (1n << BigInt(prefixLength)) - 1n;
    const hostBits = BigInt(128 - prefixLength);
    return ones << hostBits;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@just-every/mcp-read-website-fast",
|
|
3
|
-
"version": "0.1.21",
|
|
3
|
+
"version": "0.1.23",
|
|
4
4
|
"description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -53,6 +53,7 @@
|
|
|
53
53
|
"@just-every/crawl": "^1.0.8",
|
|
54
54
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
55
55
|
"commander": "^14.0.3",
|
|
56
|
+
"turndown-plugin-gfm": "^1.0.2",
|
|
56
57
|
"uuid": "^13.0.0"
|
|
57
58
|
},
|
|
58
59
|
"devDependencies": {
|