@just-every/mcp-read-website-fast 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +165 -0
- package/bin/mcp-read-website.js +49 -0
- package/dist/cache/disk.d.ts +12 -0
- package/dist/cache/disk.js +54 -0
- package/dist/cache/normalize.d.ts +2 -0
- package/dist/cache/normalize.js +31 -0
- package/dist/crawler/fetch.d.ts +8 -0
- package/dist/crawler/fetch.js +42 -0
- package/dist/crawler/queue.d.ts +14 -0
- package/dist/crawler/queue.js +142 -0
- package/dist/crawler/robots.d.ts +8 -0
- package/dist/crawler/robots.js +47 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +99 -0
- package/dist/internal/fetchMarkdown.d.ts +16 -0
- package/dist/internal/fetchMarkdown.js +36 -0
- package/dist/parser/article.d.ts +4 -0
- package/dist/parser/article.js +115 -0
- package/dist/parser/dom.d.ts +3 -0
- package/dist/parser/dom.js +53 -0
- package/dist/parser/markdown.d.ts +9 -0
- package/dist/parser/markdown.js +134 -0
- package/dist/serve.d.ts +2 -0
- package/dist/serve.js +171 -0
- package/dist/utils/chunker.d.ts +26 -0
- package/dist/utils/chunker.js +146 -0
- package/dist/utils/logger.d.ts +18 -0
- package/dist/utils/logger.js +52 -0
- package/package.json +71 -0
- package/tsconfig.json +24 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/** Matches an ATX-style Markdown heading line: one or more `#` followed by whitespace. */
const HEADING_PATTERN = /^#+\s/;

/** Matches the marker that opens or closes a fenced code block (``` or ~~~, indented at most 3 spaces). */
const FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})/;

/** Matches a blank-line paragraph separator, with either LF or CRLF line endings. */
const PARAGRAPH_BREAK_PATTERN = /(?:\r?\n){2,}/;

/**
 * Matches the last punctuation mark of a sentence: `.`, `!` or `?` directly
 * followed by whitespace or the end of the input. Requiring the whitespace
 * keeps tokens such as `3.14` or `example.com` in one piece.
 */
const SENTENCE_END_PATTERN = /[.!?](?=\s|$)/g;

/**
 * Appends a chunk to `chunks` unless its trimmed content is empty.
 *
 * @param {Array<object>} chunks - Result list, mutated in place.
 * @param {string} text - Raw chunk text; it is trimmed before being stored.
 * @param {object} [metadata] - Optional metadata stored under `metadata`.
 */
function appendChunk(chunks, text, metadata) {
    const content = text.trim();
    if (content.length === 0) {
        return;
    }
    const chunk = { content, index: chunks.length };
    if (metadata !== undefined) {
        chunk.metadata = metadata;
    }
    chunks.push(chunk);
}

/**
 * Computes the fenced-code-block state after reading `line`.
 *
 * @param {string|null} openFence - Marker of the currently open fence, or null when outside a fence.
 * @param {string} line - The line being read.
 * @returns {string|null} The marker of the fence that is open after `line`, or null.
 */
function nextFenceState(openFence, line) {
    const marker = FENCE_PATTERN.exec(line);
    if (marker === null) {
        return openFence;
    }
    const fence = marker[1];
    const rest = line.slice(marker[0].length);
    if (openFence === null) {
        // A backtick fence whose info string contains a backtick is inline code, not a fence.
        const isInlineCode = fence[0] === '`' && rest.includes('`');
        return isInlineCode ? null : fence;
    }
    // A closing fence uses the same character, is at least as long, and carries no info string.
    const closes = fence[0] === openFence[0]
        && fence.length >= openFence.length
        && rest.trim() === '';
    return closes ? null : openFence;
}

/**
 * Splits text into sentence-sized pieces without dropping any character:
 * concatenating the returned pieces reproduces `text` exactly.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Pieces ending at sentence boundaries, plus any unterminated tail.
 */
function splitSentences(text) {
    const pieces = [];
    let start = 0;
    for (const match of text.matchAll(SENTENCE_END_PATTERN)) {
        const end = match.index + 1;
        pieces.push(text.slice(start, end));
        start = end;
    }
    if (start < text.length) {
        pieces.push(text.slice(start));
    }
    return pieces;
}

/**
 * Splits Markdown text into chunks of bounded size.
 *
 * Every chunk is an object `{ content, index }`; chunks produced by heading
 * mode additionally carry `metadata: { headings, startLine, endLine }` where
 * the line numbers are 0-based indexes into the input split on `\n`.
 * Chunks whose trimmed content is empty are never emitted.
 */
export class MarkdownChunker {
    options;

    /**
     * @param {object} [options]
     * @param {number} [options.maxTokens=0] - Stored on `options`; no chunking mode enforces it.
     * @param {number} [options.maxChars=4000] - Size threshold, in characters, that triggers a split.
     * @param {string} [options.splitOn='heading'] - 'heading', 'paragraph' or 'sentence'.
     * @param {number} [options.overlap=200] - Approximate number of trailing characters
     *   repeated at the start of the next chunk (heading mode only).
     */
    constructor(options = {}) {
        this.options = {
            maxTokens: options.maxTokens ?? 0,
            maxChars: options.maxChars ?? 4000,
            splitOn: options.splitOn ?? 'heading',
            overlap: options.overlap ?? 200
        };
    }

    /**
     * Chunks `markdown` with the strategy selected by `options.splitOn`.
     * Unknown strategies fall back to heading mode.
     *
     * @param {string} markdown - Markdown source.
     * @returns {Array<object>} The chunks, in document order.
     */
    chunk(markdown) {
        switch (this.options.splitOn) {
            case 'paragraph':
                return this.chunkByParagraph(markdown);
            case 'sentence':
                return this.chunkBySentence(markdown);
            default:
                return this.chunkByHeading(markdown);
        }
    }

    /**
     * Splits before every heading line and whenever the accumulated lines
     * exceed `maxChars`. Each new chunk starts with the overlap lines of the
     * previous one. `#` lines inside fenced code blocks are not headings.
     *
     * @param {string} markdown - Markdown source.
     * @returns {Array<object>} Chunks with `headings`, `startLine` and `endLine` metadata.
     */
    chunkByHeading(markdown) {
        const chunks = [];
        const lines = markdown.split('\n');
        let currentChunk = [];
        let currentHeadings = [];
        let startLine = 0;
        // Lines in currentChunk that no emitted chunk contains yet. A chunk
        // holding only carried-over overlap lines would be a pure duplicate.
        let freshLines = 0;
        let openFence = null;

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            const insideCode = openFence !== null;
            openFence = nextFenceState(openFence, line);
            const isHeading = !insideCode && HEADING_PATTERN.test(line);

            if (isHeading && freshLines > 0) {
                appendChunk(chunks, currentChunk.join('\n'), {
                    headings: [...currentHeadings],
                    startLine,
                    endLine: i - 1
                });
                const overlapLines = this.getOverlapLines(currentChunk);
                currentChunk = [...overlapLines, line];
                currentHeadings = [line];
                startLine = i - overlapLines.length;
                freshLines = 1;
            }
            else {
                currentChunk.push(line);
                freshLines++;
                if (isHeading) {
                    currentHeadings.push(line);
                }
            }

            const currentSize = currentChunk.join('\n').length;
            if (currentSize > this.options.maxChars) {
                appendChunk(chunks, currentChunk.join('\n'), {
                    headings: [...currentHeadings],
                    startLine,
                    endLine: i
                });
                const overlapLines = this.getOverlapLines(currentChunk);
                currentChunk = [...overlapLines];
                currentHeadings = [];
                startLine = i - overlapLines.length + 1;
                freshLines = 0;
            }
        }

        if (freshLines > 0) {
            appendChunk(chunks, currentChunk.join('\n'), {
                headings: [...currentHeadings],
                startLine,
                endLine: lines.length - 1
            });
        }
        return chunks;
    }

    /**
     * Groups blank-line-separated paragraphs into chunks whose joined length
     * (including the `\n\n` separators) stays within `maxChars`. A single
     * paragraph longer than `maxChars` is emitted as its own chunk.
     *
     * @param {string} markdown - Markdown source.
     * @returns {Array<object>} Chunks without metadata.
     */
    chunkByParagraph(markdown) {
        const chunks = [];
        const separator = '\n\n';
        let pending = [];
        let pendingLength = 0;

        for (const paragraph of markdown.split(PARAGRAPH_BREAK_PATTERN)) {
            if (paragraph.trim() === '') {
                continue;
            }
            const joinedLength = pending.length === 0
                ? paragraph.length
                : pendingLength + separator.length + paragraph.length;
            if (pending.length > 0 && joinedLength > this.options.maxChars) {
                appendChunk(chunks, pending.join(separator));
                pending = [paragraph];
                pendingLength = paragraph.length;
            }
            else {
                pending.push(paragraph);
                pendingLength = joinedLength;
            }
        }

        appendChunk(chunks, pending.join(separator));
        return chunks;
    }

    /**
     * Groups sentences into chunks of at most `maxChars` characters. The text
     * between sentences (spaces, newlines) is kept as written, and text after
     * the last sentence terminator is kept as a final piece. A single sentence
     * longer than `maxChars` is emitted as its own chunk.
     *
     * @param {string} markdown - Markdown source.
     * @returns {Array<object>} Chunks without metadata.
     */
    chunkBySentence(markdown) {
        const chunks = [];
        let buffer = '';

        for (const piece of splitSentences(markdown)) {
            if (buffer.length > 0 && buffer.length + piece.length > this.options.maxChars) {
                appendChunk(chunks, buffer);
                buffer = '';
            }
            buffer += piece;
        }

        appendChunk(chunks, buffer);
        return chunks;
    }

    /**
     * Returns the trailing lines of `lines` to repeat at the start of the next
     * chunk: whole lines are taken from the end until about `overlap`
     * characters are covered. The result never exceeds `maxChars` on its own,
     * so one oversized line is not re-emitted in every following chunk.
     *
     * @param {string[]} lines - Lines of the chunk that was just emitted.
     * @returns {string[]} Overlap lines, in document order (possibly empty).
     */
    getOverlapLines(lines) {
        if (this.options.overlap <= 0) {
            return [];
        }
        // Counts one separator per line, i.e. joined length + 1 when non-empty.
        let overlapChars = 0;
        const overlapLines = [];
        for (let i = lines.length - 1; i >= 0; i--) {
            overlapLines.unshift(lines[i]);
            overlapChars += lines[i].length + 1;
            if (overlapChars >= this.options.overlap) {
                break;
            }
        }
        while (overlapLines.length > 0 && overlapChars - 1 > this.options.maxChars) {
            overlapChars -= overlapLines[0].length + 1;
            overlapLines.shift();
        }
        return overlapLines;
    }

    /**
     * Rough token estimate: one token per four characters, rounded up.
     *
     * @param {string} text - Text to measure.
     * @returns {number} Estimated token count.
     */
    estimateTokens(text) {
        return Math.ceil(text.length / 4);
    }
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * Log severity levels, ordered from most severe (ERROR = 0) to least severe
 * (DEBUG = 3). A Logger emits a message only when the message's level is
 * numerically less than or equal to the logger's configured level.
 */
export declare enum LogLevel {
|
|
2
|
+
ERROR = 0,
|
|
3
|
+
WARN = 1,
|
|
4
|
+
INFO = 2,
|
|
5
|
+
DEBUG = 3
|
|
6
|
+
}
|
|
7
|
+
/**
 * Named console logger with a severity threshold.
 *
 * Each emitted message is prefixed with an ISO timestamp, the level name and
 * the logger name. `level` is optional in the constructor; the implementation
 * defaults it to LogLevel.INFO. `error`, `warn`, `info` and `debug` log at the
 * matching level, and `setLevel` replaces the threshold.
 */
export declare class Logger {
|
|
8
|
+
private level;
|
|
9
|
+
private name;
|
|
10
|
+
constructor(name: string, level?: LogLevel);
|
|
11
|
+
private log;
|
|
12
|
+
error(message: string, ...args: any[]): void;
|
|
13
|
+
warn(message: string, ...args: any[]): void;
|
|
14
|
+
info(message: string, ...args: any[]): void;
|
|
15
|
+
debug(message: string, ...args: any[]): void;
|
|
16
|
+
setLevel(level: LogLevel): void;
|
|
17
|
+
}
|
|
18
|
+
/** Shared module-level Logger instance (created with the name 'MCP'). */
export declare const logger: Logger;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
 * Log severity levels, from most severe (ERROR = 0) to least severe (DEBUG = 3).
 *
 * This is the compiled form of a TypeScript enum: the object maps each name
 * to its number AND each number back to its name (`LogLevel[2] === 'INFO'`).
 */
export var LogLevel;
(function (LogLevel) {
    LogLevel[LogLevel["ERROR"] = 0] = "ERROR";
    LogLevel[LogLevel["WARN"] = 1] = "WARN";
    LogLevel[LogLevel["INFO"] = 2] = "INFO";
    LogLevel[LogLevel["DEBUG"] = 3] = "DEBUG";
})(LogLevel || (LogLevel = {}));

/**
 * Translates a user-supplied level (a level name such as `'DEBUG'`, or its
 * number as a string such as `'3'`) into the numeric LogLevel value.
 *
 * The enum object also contains number-to-name reverse mappings, so a plain
 * `key in LogLevel` lookup would turn `'2'` into the string `'INFO'`; a
 * string threshold makes every numeric comparison false and disables
 * filtering. This helper always returns a number or undefined.
 *
 * @param {string} value - Upper-cased level name or numeric string.
 * @returns {number|undefined} The numeric level, or undefined when `value` is not a level.
 */
function resolveLogLevel(value) {
    if (!Object.hasOwn(LogLevel, value)) {
        return undefined;
    }
    const entry = LogLevel[value];
    // Name key -> number; numeric key -> name, which maps back to the number.
    return typeof entry === 'number' ? entry : LogLevel[entry];
}

/**
 * Named console logger with a severity threshold.
 *
 * All output is written to stderr. stdout is left untouched because an MCP
 * server using the stdio transport must only write protocol messages there.
 */
export class Logger {
    level;
    name;

    /**
     * @param {string} name - Label included in every message prefix.
     * @param {number} [level=LogLevel.INFO] - Least severe level that is still emitted.
     */
    constructor(name, level = LogLevel.INFO) {
        this.name = name;
        this.level = level;
    }

    /**
     * Emits `message` when `level` is at least as severe as the threshold.
     * The message is prefixed with `[ISO timestamp] [LEVEL] [name]`.
     *
     * @param {number} level - Severity of this message.
     * @param {string} message - Message text.
     * @param {...*} args - Extra values passed through to the console call.
     */
    log(level, message, ...args) {
        if (level > this.level) {
            return;
        }
        const timestamp = new Date().toISOString();
        const levelName = LogLevel[level];
        const prefix = `[${timestamp}] [${levelName}] [${this.name}]`;
        if (level === LogLevel.WARN) {
            console.warn(prefix, message, ...args);
            return;
        }
        // console.error writes to stderr; console.log would write INFO/DEBUG to stdout.
        console.error(prefix, message, ...args);
    }

    /** Logs at ERROR level. */
    error(message, ...args) {
        this.log(LogLevel.ERROR, message, ...args);
    }

    /** Logs at WARN level. */
    warn(message, ...args) {
        this.log(LogLevel.WARN, message, ...args);
    }

    /** Logs at INFO level. */
    info(message, ...args) {
        this.log(LogLevel.INFO, message, ...args);
    }

    /** Logs at DEBUG level. */
    debug(message, ...args) {
        this.log(LogLevel.DEBUG, message, ...args);
    }

    /**
     * Replaces the severity threshold.
     *
     * @param {number} level - New least severe level to emit.
     */
    setLevel(level) {
        this.level = level;
    }
}

/** Shared logger instance; its level can be set with the LOG_LEVEL environment variable. */
export const logger = new Logger('MCP');

const envLevel = process.env.LOG_LEVEL?.trim().toUpperCase();
if (envLevel) {
    const resolvedLevel = resolveLogLevel(envLevel);
    if (resolvedLevel !== undefined) {
        logger.setLevel(resolvedLevel);
    }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@just-every/mcp-read-website-fast",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"mcp-read-website-fast": "bin/mcp-read-website.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"build": "tsc -p tsconfig.prod.json",
|
|
11
|
+
"build:dev": "tsc",
|
|
12
|
+
"dev": "tsx src/index.ts",
|
|
13
|
+
"start": "node dist/index.js",
|
|
14
|
+
"serve": "tsx src/serve.ts",
|
|
15
|
+
"serve:dev": "tsx src/serve.ts",
|
|
16
|
+
"test": "vitest",
|
|
17
|
+
"test:deploy": "vitest run test/deployment.test.ts",
|
|
18
|
+
"lint": "eslint src",
|
|
19
|
+
"typecheck": "tsc --noEmit"
|
|
20
|
+
},
|
|
21
|
+
"keywords": [
|
|
22
|
+
"mcp",
|
|
23
|
+
"mcp-server",
|
|
24
|
+
"model-context-protocol",
|
|
25
|
+
"markdown",
|
|
26
|
+
"web-scraper",
|
|
27
|
+
"readability",
|
|
28
|
+
"content-extraction",
|
|
29
|
+
"rag",
|
|
30
|
+
"llm",
|
|
31
|
+
"claude",
|
|
32
|
+
"cursor",
|
|
33
|
+
"vscode"
|
|
34
|
+
],
|
|
35
|
+
"author": "Just Every",
|
|
36
|
+
"repository": {
|
|
37
|
+
"type": "git",
|
|
38
|
+
"url": "git+https://github.com/just-every/mcp-read-website-fast.git"
|
|
39
|
+
},
|
|
40
|
+
"bugs": {
|
|
41
|
+
"url": "https://github.com/just-every/mcp-read-website-fast/issues"
|
|
42
|
+
},
|
|
43
|
+
"homepage": "https://github.com/just-every/mcp-read-website-fast#readme",
|
|
44
|
+
"license": "MIT",
|
|
45
|
+
"dependencies": {
|
|
46
|
+
"@modelcontextprotocol/sdk": "^1.12.1",
|
|
47
|
+
"@mozilla/readability": "^0.6.0",
|
|
48
|
+
"commander": "^14.0.0",
|
|
49
|
+
"jsdom": "^26.1.0",
|
|
50
|
+
"p-limit": "^6.2.0",
|
|
51
|
+
"robots-parser": "^3.0.1",
|
|
52
|
+
"turndown": "^7.1.3",
|
|
53
|
+
"turndown-plugin-gfm": "^1.0.2",
|
|
54
|
+
"undici": "^7.10.0"
|
|
55
|
+
},
|
|
56
|
+
"devDependencies": {
|
|
57
|
+
"@types/jsdom": "^21.1.6",
|
|
58
|
+
"@types/node": "^24.0.0",
|
|
59
|
+
"@types/turndown": "^5.0.4",
|
|
60
|
+
"@typescript-eslint/eslint-plugin": "^8.34.0",
|
|
61
|
+
"@typescript-eslint/parser": "^8.34.0",
|
|
62
|
+
"eslint": "^9.28.0",
|
|
63
|
+
"tsx": "^4.7.0",
|
|
64
|
+
"typescript": "^5.3.3",
|
|
65
|
+
"vitest": "^3.2.3"
|
|
66
|
+
},
|
|
67
|
+
"engines": {
|
|
68
|
+
"node": ">=20.0.0"
|
|
69
|
+
},
|
|
70
|
+
"type": "module"
|
|
71
|
+
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2022",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"moduleResolution": "NodeNext",
|
|
6
|
+
"lib": ["ES2022"],
|
|
7
|
+
"outDir": "./dist",
|
|
8
|
+
"rootDir": "./src",
|
|
9
|
+
"strict": true,
|
|
10
|
+
"esModuleInterop": true,
|
|
11
|
+
"skipLibCheck": true,
|
|
12
|
+
"forceConsistentCasingInFileNames": true,
|
|
13
|
+
"resolveJsonModule": true,
|
|
14
|
+
"declaration": true,
|
|
15
|
+
"declarationMap": true,
|
|
16
|
+
"sourceMap": true,
|
|
17
|
+
"noUnusedLocals": true,
|
|
18
|
+
"noUnusedParameters": true,
|
|
19
|
+
"noImplicitReturns": true,
|
|
20
|
+
"noFallthroughCasesInSwitch": true
|
|
21
|
+
},
|
|
22
|
+
"include": ["src/**/*"],
|
|
23
|
+
"exclude": ["node_modules", "dist", "src/test-*.ts", "src/serve-*.ts", "src/trace-*.ts"]
|
|
24
|
+
}
|