@j0hanz/superfetch 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -38
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +10 -3
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/mcp-routes.js +2 -2
- package/dist/http/mcp-sessions.d.ts +3 -5
- package/dist/http/mcp-sessions.js +8 -8
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.d.ts +0 -10
- package/dist/http/server.js +33 -333
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/server.js +20 -5
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +28 -2
- package/dist/services/fetcher.d.ts +2 -0
- package/dist/services/fetcher.js +35 -14
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-url.tool.js +8 -6
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +3 -5
- package/dist/tools/utils/content-transform.js +35 -148
- package/dist/tools/utils/raw-markdown.js +15 -1
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown.d.ts +4 -1
- package/dist/transformers/markdown.js +182 -53
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language.d.ts +0 -9
- package/dist/utils/code-language.js +5 -5
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +8 -5
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +8 -7
|
@@ -1,2 +1,5 @@
|
|
|
1
1
|
import type { MetadataBlock } from '../config/types/content.js';
|
|
2
|
-
export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock
|
|
2
|
+
export declare function htmlToMarkdown(html: string, metadata?: MetadataBlock, options?: {
|
|
3
|
+
url?: string;
|
|
4
|
+
signal?: AbortSignal;
|
|
5
|
+
}): string;
|
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
import
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
2
3
|
import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
|
|
4
|
+
import { FetchError } from '../errors/app-error.js';
|
|
5
|
+
import { endTransformStage, startTransformStage, } from '../services/telemetry.js';
|
|
6
|
+
import { throwIfAborted } from '../utils/cancellation.js';
|
|
3
7
|
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
4
8
|
import { isRecord } from '../utils/guards.js';
|
|
5
9
|
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
|
|
@@ -43,6 +47,9 @@ function buildFrontmatter(metadata) {
|
|
|
43
47
|
const lines = [FRONTMATTER_DELIMITER];
|
|
44
48
|
appendFrontmatterField(lines, 'title', metadata.title);
|
|
45
49
|
appendFrontmatterField(lines, 'source', metadata.url);
|
|
50
|
+
appendFrontmatterField(lines, 'author', metadata.author);
|
|
51
|
+
appendFrontmatterField(lines, 'description', metadata.description);
|
|
52
|
+
appendFrontmatterField(lines, 'fetchedAt', metadata.fetchedAt);
|
|
46
53
|
lines.push(FRONTMATTER_DELIMITER);
|
|
47
54
|
return joinLines(lines);
|
|
48
55
|
}
|
|
@@ -51,36 +58,6 @@ function isElement(node) {
|
|
|
51
58
|
'getAttribute' in node &&
|
|
52
59
|
typeof node.getAttribute === 'function');
|
|
53
60
|
}
|
|
54
|
-
function isFencedCodeBlock(node, options) {
|
|
55
|
-
return (options.codeBlockStyle === 'fenced' &&
|
|
56
|
-
node.nodeName === 'PRE' &&
|
|
57
|
-
node.firstChild?.nodeName === 'CODE');
|
|
58
|
-
}
|
|
59
|
-
function formatFencedCodeBlock(node) {
|
|
60
|
-
const codeNode = node.firstChild;
|
|
61
|
-
if (!isElement(codeNode))
|
|
62
|
-
return '';
|
|
63
|
-
const code = codeNode.textContent || '';
|
|
64
|
-
const language = resolveCodeLanguage(codeNode, code);
|
|
65
|
-
return CODE_BLOCK.format(code, language);
|
|
66
|
-
}
|
|
67
|
-
function resolveCodeLanguage(codeNode, code) {
|
|
68
|
-
const { className, dataLanguage } = readCodeAttributes(codeNode);
|
|
69
|
-
const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
|
|
70
|
-
return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
|
|
71
|
-
}
|
|
72
|
-
function readCodeAttributes(codeNode) {
|
|
73
|
-
return {
|
|
74
|
-
className: codeNode.getAttribute('class') ?? '',
|
|
75
|
-
dataLanguage: codeNode.getAttribute('data-language') ?? '',
|
|
76
|
-
};
|
|
77
|
-
}
|
|
78
|
-
function addFencedCodeRule(instance) {
|
|
79
|
-
instance.addRule('fencedCodeBlockWithLanguage', {
|
|
80
|
-
filter: (node, options) => isFencedCodeBlock(node, options),
|
|
81
|
-
replacement: (_content, node) => formatFencedCodeBlock(node),
|
|
82
|
-
});
|
|
83
|
-
}
|
|
84
61
|
const STRUCTURAL_TAGS = new Set([
|
|
85
62
|
'script',
|
|
86
63
|
'style',
|
|
@@ -109,6 +86,67 @@ const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|s
|
|
|
109
86
|
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
110
87
|
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
111
88
|
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
89
|
+
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
90
|
+
const NOISE_MARKERS = [
|
|
91
|
+
'<script',
|
|
92
|
+
'<style',
|
|
93
|
+
'<noscript',
|
|
94
|
+
'<iframe',
|
|
95
|
+
'<nav',
|
|
96
|
+
'<footer',
|
|
97
|
+
'<aside',
|
|
98
|
+
'<header',
|
|
99
|
+
'<form',
|
|
100
|
+
'<button',
|
|
101
|
+
'<input',
|
|
102
|
+
'<select',
|
|
103
|
+
'<textarea',
|
|
104
|
+
'<svg',
|
|
105
|
+
'<canvas',
|
|
106
|
+
' aria-hidden="true"',
|
|
107
|
+
" aria-hidden='true'",
|
|
108
|
+
' hidden',
|
|
109
|
+
' role="navigation"',
|
|
110
|
+
" role='navigation'",
|
|
111
|
+
' role="banner"',
|
|
112
|
+
" role='banner'",
|
|
113
|
+
' role="complementary"',
|
|
114
|
+
" role='complementary'",
|
|
115
|
+
' role="contentinfo"',
|
|
116
|
+
" role='contentinfo'",
|
|
117
|
+
' role="tree"',
|
|
118
|
+
" role='tree'",
|
|
119
|
+
' role="menubar"',
|
|
120
|
+
" role='menubar'",
|
|
121
|
+
' role="menu"',
|
|
122
|
+
" role='menu'",
|
|
123
|
+
' banner',
|
|
124
|
+
' promo',
|
|
125
|
+
' announcement',
|
|
126
|
+
' cta',
|
|
127
|
+
' callout',
|
|
128
|
+
' advert',
|
|
129
|
+
' newsletter',
|
|
130
|
+
' subscribe',
|
|
131
|
+
' cookie',
|
|
132
|
+
' consent',
|
|
133
|
+
' popup',
|
|
134
|
+
' modal',
|
|
135
|
+
' overlay',
|
|
136
|
+
' toast',
|
|
137
|
+
' fixed',
|
|
138
|
+
' sticky',
|
|
139
|
+
' z-50',
|
|
140
|
+
' z-4',
|
|
141
|
+
' isolate',
|
|
142
|
+
];
|
|
143
|
+
/**
 * Cheap substring pre-check used before parsing HTML.
 *
 * Lowercases the input once and reports whether any entry of NOISE_MARKERS
 * occurs in it. A hit only means noise *may* be present; the markers are
 * plain substrings, so ordinary text can match as well.
 *
 * @param html Raw HTML string to scan.
 * @returns true when at least one marker substring is found.
 */
function mayContainNoise(html) {
  const lowered = html.toLowerCase();
  for (const marker of NOISE_MARKERS) {
    if (lowered.includes(marker)) {
      return true;
    }
  }
  return false;
}
|
|
147
|
+
/**
 * Reports whether the input looks like a complete HTML document rather than
 * a fragment, i.e. whether HTML_DOCUMENT_MARKERS matches anywhere in it.
 *
 * @param html Raw HTML string to inspect.
 * @returns true when the pattern is found.
 */
function isFullDocumentHtml(html) {
  const matched = HTML_DOCUMENT_MARKERS.test(html);
  return matched;
}
|
|
112
150
|
function isStructuralNoiseTag(tagName) {
|
|
113
151
|
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
114
152
|
}
|
|
@@ -146,40 +184,131 @@ function isNoiseElement(node) {
|
|
|
146
184
|
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
147
185
|
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
148
186
|
}
|
|
149
|
-
function
|
|
150
|
-
|
|
187
|
+
/**
 * Removes noise elements from an HTML string before Markdown conversion.
 *
 * The input is returned untouched when it neither looks like a full document
 * nor contains any noise marker. Otherwise it is parsed with linkedom; when a
 * noise marker was seen, every element for which isNoiseElement() is true is
 * removed. The result is the first non-empty value among body.innerHTML,
 * document.toString() and documentElement.outerHTML, falling back to the
 * original string.
 *
 * @param html Raw HTML string.
 * @returns Cleaned markup, or the original string when nothing could be
 *   produced or parsing/serialization threw.
 */
function removeNoiseFromHtml(html) {
  // Scan for markers once: the answer decides both whether parsing is needed
  // and whether elements have to be pruned after parsing.
  const shouldRemove = mayContainNoise(html);
  if (!shouldRemove && !isFullDocumentHtml(html)) {
    return html;
  }
  try {
    const { document } = parseHTML(html);
    if (shouldRemove) {
      // Walk a static snapshot from last to first so removals do not disturb
      // the entries that are still to be visited.
      const nodes = Array.from(document.querySelectorAll('*'));
      for (let index = nodes.length - 1; index >= 0; index -= 1) {
        const node = nodes[index];
        if (!node) {
          continue;
        }
        if (isElement(node) && isNoiseElement(node)) {
          node.remove();
        }
      }
    }
    const { body } = document;
    if (body?.innerHTML) {
      return body.innerHTML;
    }
    if (typeof document.toString === 'function') {
      return document.toString();
    }
    const { documentElement } = document;
    if (documentElement?.outerHTML) {
      return documentElement.outerHTML;
    }
    return html;
  } catch {
    // Best effort: on any parser/serializer failure hand back the input as-is.
    return html;
  }
}
|
|
221
|
+
/**
 * Wraps text in a Markdown inline code span.
 *
 * The delimiter is one backtick longer than the longest backtick run inside
 * the content, so the content can never terminate the span early. When the
 * content contains backticks (delimiter longer than one character) a single
 * space is inserted on both sides of the content.
 *
 * @param content Text to place inside the code span.
 * @returns The delimited code span.
 */
function buildInlineCode(content) {
  // Find the longest run with a single pass instead of sorting every match.
  let longestRun = 0;
  for (const run of content.match(/`+/g) ?? []) {
    if (run.length > longestRun) {
      longestRun = run.length;
    }
  }
  const delimiter = '`'.repeat(longestRun + 1);
  const padding = longestRun > 0 ? ' ' : '';
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
}
|
|
152
|
-
function
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
228
|
+
/**
 * Tells whether a parent node is a code-block container.
 *
 * @param parent Candidate parent node; anything that is not a record, or
 *   whose tagName is not a string, yields false.
 * @returns true when the tag name, compared case-insensitively, is PRE or
 *   WRAPPED-PRE.
 */
function isCodeBlock(parent) {
  if (!isRecord(parent) || typeof parent.tagName !== 'string') {
    return false;
  }
  const upperTag = parent.tagName.toUpperCase();
  return upperTag === 'PRE' || upperTag === 'WRAPPED-PRE';
}
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
234
|
+
/**
 * Builds the custom translator map passed to NodeHtmlMarkdown for `code`
 * elements.
 *
 * A `code` element whose parent is not a code block (see isCodeBlock) is
 * rendered as an inline code span via buildInlineCode. Otherwise it is
 * rendered as a fenced block through CODE_BLOCK.format, using the language
 * resolved from the element's `class` / `data-language` attributes and
 * falling back to detectLanguageFromCode on the block content.
 *
 * @returns An object with a single `code` translator factory.
 */
function createCodeTranslator() {
  // Fresh config object per call for the inline (non-block) case.
  const inlineCode = () => ({
    spaceIfRepeatingChar: true,
    noEscape: true,
    postprocess: ({ content }) => buildInlineCode(content),
  });
  const translateCode = (ctx) => {
    if (!isRecord(ctx)) {
      return inlineCode();
    }
    const { node, parent, visitor } = ctx;
    if (!isCodeBlock(parent)) {
      return inlineCode();
    }
    // Attribute lookup that degrades to '' when the node has no getAttribute.
    const readAttribute = (name) => isRecord(node) && typeof node.getAttribute === 'function'
      ? node.getAttribute(name) ?? ''
      : '';
    const attributeLanguage = resolveLanguageFromAttributes(readAttribute('class'), readAttribute('data-language'));
    // Reuse the converter's own code-block child translators when exposed.
    const instance = isRecord(visitor) ? visitor.instance : null;
    const codeBlockTranslators = isRecord(instance) && isRecord(instance.codeBlockTranslators)
      ? instance.codeBlockTranslators
      : null;
    return {
      noEscape: true,
      preserveWhitespace: true,
      ...(codeBlockTranslators ? { childTranslators: codeBlockTranslators } : {}),
      postprocess: ({ content }) => {
        const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
        return CODE_BLOCK.format(content, language);
      },
    };
  };
  return { code: translateCode };
}
|
|
278
|
+
let markdownInstance = null;
|
|
279
|
+
function createMarkdownInstance() {
|
|
280
|
+
return new NodeHtmlMarkdown({
|
|
281
|
+
codeFence: CODE_BLOCK.fence,
|
|
162
282
|
codeBlockStyle: 'fenced',
|
|
163
283
|
emDelimiter: '_',
|
|
164
|
-
|
|
165
|
-
});
|
|
166
|
-
addNoiseRule(instance);
|
|
167
|
-
addFencedCodeRule(instance);
|
|
168
|
-
return instance;
|
|
284
|
+
bulletMarker: '-',
|
|
285
|
+
}, createCodeTranslator());
|
|
169
286
|
}
|
|
170
|
-
function
|
|
171
|
-
|
|
172
|
-
return
|
|
287
|
+
/**
 * Returns the module-wide Markdown converter, creating it on first use and
 * reusing the same instance afterwards.
 */
function getMarkdownConverter() {
  if (markdownInstance === null || markdownInstance === undefined) {
    markdownInstance = createMarkdownInstance();
  }
  return markdownInstance;
}
|
|
174
|
-
export function htmlToMarkdown(html, metadata) {
|
|
291
|
+
export function htmlToMarkdown(html, metadata, options) {
|
|
292
|
+
const url = options?.url ?? metadata?.url ?? '';
|
|
175
293
|
const frontmatter = buildFrontmatter(metadata);
|
|
176
294
|
if (!html)
|
|
177
295
|
return frontmatter;
|
|
178
296
|
try {
|
|
179
|
-
|
|
297
|
+
throwIfAborted(options?.signal, url, 'markdown:begin');
|
|
298
|
+
const noiseStage = startTransformStage(url, 'markdown:noise');
|
|
299
|
+
const cleanedHtml = removeNoiseFromHtml(html);
|
|
300
|
+
endTransformStage(noiseStage);
|
|
301
|
+
throwIfAborted(options?.signal, url, 'markdown:cleaned');
|
|
302
|
+
const translateStage = startTransformStage(url, 'markdown:translate');
|
|
303
|
+
const content = getMarkdownConverter().translate(cleanedHtml).trim();
|
|
304
|
+
endTransformStage(translateStage);
|
|
305
|
+
throwIfAborted(options?.signal, url, 'markdown:translated');
|
|
180
306
|
return frontmatter ? `${frontmatter}\n${content}` : content;
|
|
181
307
|
}
|
|
182
|
-
catch {
|
|
308
|
+
catch (error) {
|
|
309
|
+
if (error instanceof FetchError) {
|
|
310
|
+
throw error;
|
|
311
|
+
}
|
|
183
312
|
return frontmatter;
|
|
184
313
|
}
|
|
185
314
|
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function throwIfAborted(signal: AbortSignal | undefined, url: string, stage: string): void;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { FetchError } from '../errors/app-error.js';
|
|
2
|
+
/**
 * Checks whether an abort reason represents a timeout.
 *
 * @param reason Value taken from AbortSignal#reason.
 * @returns true only for Error instances whose name is 'TimeoutError'.
 */
function isTimeoutReason(reason) {
  if (!(reason instanceof Error)) {
    return false;
  }
  return reason.name === 'TimeoutError';
}
|
|
5
|
+
/**
 * Throws a FetchError when the given signal has already been aborted.
 *
 * Does nothing when no signal is supplied or it is not aborted. A timeout
 * reason (see isTimeoutReason) produces status 504 with reason 'timeout';
 * any other abort produces status 499 with reason 'aborted'. The stage label
 * is attached to the error details in both cases.
 *
 * @param signal Optional abort signal to inspect.
 * @param url URL recorded on the thrown error.
 * @param stage Label of the processing stage performing the check.
 * @throws FetchError when the signal is aborted.
 */
export function throwIfAborted(signal, url, stage) {
  if (!signal || !signal.aborted) {
    return;
  }
  const timedOut = isTimeoutReason(signal.reason);
  const message = timedOut ? 'Request timeout' : 'Request was canceled';
  const statusCode = timedOut ? 504 : 499;
  throw new FetchError(message, url, statusCode, {
    reason: timedOut ? 'timeout' : 'aborted',
    stage,
  });
}
|
|
@@ -1,11 +1,2 @@
|
|
|
1
|
-
export declare function containsJsxTag(code: string): boolean;
|
|
2
|
-
export declare function containsWord(source: string, word: string): boolean;
|
|
3
|
-
export declare function splitLines(content: string): string[];
|
|
4
|
-
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
5
|
-
export declare function resolveLanguageFromDataAttribute(dataLang: string): string | undefined;
|
|
6
|
-
export interface CodeDetector {
|
|
7
|
-
language: string;
|
|
8
|
-
detect: (code: string) => boolean;
|
|
9
|
-
}
|
|
10
1
|
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
11
2
|
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
function containsJsxTag(code) {
|
|
2
2
|
for (let index = 0; index < code.length - 1; index += 1) {
|
|
3
3
|
if (code[index] !== '<')
|
|
4
4
|
continue;
|
|
@@ -10,7 +10,7 @@ export function containsJsxTag(code) {
|
|
|
10
10
|
}
|
|
11
11
|
return false;
|
|
12
12
|
}
|
|
13
|
-
|
|
13
|
+
function containsWord(source, word) {
|
|
14
14
|
let startIndex = source.indexOf(word);
|
|
15
15
|
while (startIndex !== -1) {
|
|
16
16
|
const before = startIndex === 0 ? '' : source[startIndex - 1];
|
|
@@ -22,10 +22,10 @@ export function containsWord(source, word) {
|
|
|
22
22
|
}
|
|
23
23
|
return false;
|
|
24
24
|
}
|
|
25
|
-
|
|
25
|
+
function splitLines(content) {
|
|
26
26
|
return content.split('\n');
|
|
27
27
|
}
|
|
28
|
-
|
|
28
|
+
function extractLanguageFromClassName(className) {
|
|
29
29
|
const tokens = className.match(/\S+/g);
|
|
30
30
|
if (!tokens)
|
|
31
31
|
return undefined;
|
|
@@ -41,7 +41,7 @@ export function extractLanguageFromClassName(className) {
|
|
|
41
41
|
}
|
|
42
42
|
return undefined;
|
|
43
43
|
}
|
|
44
|
-
|
|
44
|
+
function resolveLanguageFromDataAttribute(dataLang) {
|
|
45
45
|
const trimmed = dataLang.trim();
|
|
46
46
|
if (!trimmed)
|
|
47
47
|
return undefined;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function normalizeHost(value: string): string | null;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { isIP } from 'node:net';
|
|
2
|
+
/**
 * Returns the first entry of a comma-separated header value, trimmed,
 * or null when that entry is empty.
 */
function takeFirstHostValue(value) {
  const first = value.split(',')[0];
  if (!first) {
    return null;
  }
  const trimmed = first.trim();
  return trimmed ? trimmed : null;
}
/**
 * Extracts the text between a leading '[' and the first ']' (for example
 * "[::1]:3000" yields "::1"). Returns null when the value does not start
 * with '[' or has no closing bracket.
 */
function stripIpv6Brackets(value) {
  if (!value.startsWith('[')) {
    return null;
  }
  const end = value.indexOf(']');
  if (end === -1) {
    return null;
  }
  return value.slice(1, end);
}
/** Drops everything from the first ':' onwards; values without ':' pass through. */
function stripPortIfPresent(value) {
  const colonIndex = value.indexOf(':');
  if (colonIndex === -1) {
    return value;
  }
  return value.slice(0, colonIndex);
}
/**
 * Normalizes a host value to a bare, lowercase host name or IP literal.
 *
 * Steps: trim and lowercase, keep only the first comma-separated entry,
 * unwrap a bracketed literal ("[::1]:80" becomes "::1"), keep a bare IPv6
 * address unchanged, otherwise strip a trailing ":port".
 *
 * @param value Raw host value, possibly with port or several comma-separated entries.
 * @returns The normalized host, or null when nothing usable remains
 *   (blank input, an empty first entry, or a value such as ":8080" that
 *   has a port but no host).
 */
export function normalizeHost(value) {
  const first = takeFirstHostValue(value.trim().toLowerCase());
  if (!first) {
    return null;
  }
  const ipv6 = stripIpv6Brackets(first);
  if (ipv6) {
    return ipv6;
  }
  // A bare IPv6 literal contains ':' itself, so it must not go through port stripping.
  if (isIP(first) === 6) {
    return first;
  }
  const host = stripPortIfPresent(first);
  // Honor the `string | null` contract: never hand back an empty host.
  return host ? host : null;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function redactUrl(rawUrl: string): string;
|
|
@@ -78,6 +78,8 @@ export function normalizeUrl(urlString) {
|
|
|
78
78
|
assertNoCredentials(url);
|
|
79
79
|
const hostname = normalizeHostname(url);
|
|
80
80
|
assertHostnameAllowed(hostname);
|
|
81
|
+
// Canonicalize hostname to avoid trailing-dot variants and keep url.href consistent.
|
|
82
|
+
url.hostname = hostname;
|
|
81
83
|
return { normalizedUrl: url.href, hostname };
|
|
82
84
|
}
|
|
83
85
|
export function validateAndNormalizeUrl(urlString) {
|
|
@@ -103,12 +105,10 @@ function assertUrlLength(url) {
|
|
|
103
105
|
throw createValidationError(`URL exceeds maximum length of ${config.constants.maxUrlLength} characters`);
|
|
104
106
|
}
|
|
105
107
|
function parseUrl(urlString) {
|
|
106
|
-
|
|
107
|
-
return new URL(urlString);
|
|
108
|
-
}
|
|
109
|
-
catch {
|
|
108
|
+
if (!URL.canParse(urlString)) {
|
|
110
109
|
throw createValidationError('Invalid URL format');
|
|
111
110
|
}
|
|
111
|
+
return new URL(urlString);
|
|
112
112
|
}
|
|
113
113
|
function assertHttpProtocol(url) {
|
|
114
114
|
if (url.protocol === 'http:' || url.protocol === 'https:')
|
|
@@ -121,7 +121,10 @@ function assertNoCredentials(url) {
|
|
|
121
121
|
throw createValidationError('URLs with embedded credentials are not allowed');
|
|
122
122
|
}
|
|
123
123
|
function normalizeHostname(url) {
|
|
124
|
-
|
|
124
|
+
let hostname = url.hostname.toLowerCase();
|
|
125
|
+
while (hostname.endsWith('.')) {
|
|
126
|
+
hostname = hostname.slice(0, -1);
|
|
127
|
+
}
|
|
125
128
|
if (!hostname) {
|
|
126
129
|
throw createValidationError('URL must have a valid hostname');
|
|
127
130
|
}
|
|
@@ -1,50 +1,94 @@
|
|
|
1
1
|
import { parentPort } from 'node:worker_threads';
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
return (typeof record.id === 'number' &&
|
|
8
|
-
typeof record.mode === 'string' &&
|
|
9
|
-
typeof record.html === 'string' &&
|
|
10
|
-
typeof record.url === 'string');
|
|
2
|
+
import { FetchError, getErrorMessage } from '../errors.js';
|
|
3
|
+
import { transformHtmlToMarkdownInProcess } from '../transform.js';
|
|
4
|
+
const controllers = new Map();
|
|
5
|
+
function isRecord(value) {
|
|
6
|
+
return typeof value === 'object' && value !== null;
|
|
11
7
|
}
|
|
12
|
-
function
|
|
13
|
-
|
|
14
|
-
|
|
8
|
+
function post(message) {
|
|
9
|
+
parentPort?.postMessage(message);
|
|
10
|
+
}
|
|
11
|
+
function handleTransform(message) {
|
|
12
|
+
const controller = new AbortController();
|
|
13
|
+
controllers.set(message.id, controller);
|
|
14
|
+
try {
|
|
15
|
+
const result = transformHtmlToMarkdownInProcess(message.html, message.url, {
|
|
16
|
+
includeMetadata: message.includeMetadata,
|
|
17
|
+
signal: controller.signal,
|
|
18
|
+
});
|
|
19
|
+
post({
|
|
20
|
+
type: 'result',
|
|
21
|
+
id: message.id,
|
|
22
|
+
result: {
|
|
23
|
+
markdown: result.markdown,
|
|
24
|
+
...(result.title === undefined ? {} : { title: result.title }),
|
|
25
|
+
truncated: result.truncated,
|
|
26
|
+
},
|
|
27
|
+
});
|
|
15
28
|
}
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
29
|
+
catch (error) {
|
|
30
|
+
if (error instanceof FetchError) {
|
|
31
|
+
post({
|
|
32
|
+
type: 'error',
|
|
33
|
+
id: message.id,
|
|
34
|
+
error: {
|
|
35
|
+
name: error.name,
|
|
36
|
+
message: error.message,
|
|
37
|
+
url: error.url,
|
|
38
|
+
statusCode: error.statusCode,
|
|
39
|
+
details: { ...error.details },
|
|
40
|
+
},
|
|
41
|
+
});
|
|
42
|
+
return;
|
|
43
|
+
}
|
|
44
|
+
post({
|
|
45
|
+
type: 'error',
|
|
46
|
+
id: message.id,
|
|
47
|
+
error: {
|
|
48
|
+
name: error instanceof Error ? error.name : 'Error',
|
|
49
|
+
message: getErrorMessage(error),
|
|
50
|
+
url: message.url,
|
|
51
|
+
},
|
|
20
52
|
});
|
|
21
53
|
}
|
|
22
|
-
|
|
54
|
+
finally {
|
|
55
|
+
controllers.delete(message.id);
|
|
56
|
+
}
|
|
23
57
|
}
|
|
24
|
-
function
|
|
25
|
-
|
|
58
|
+
function handleCancel(message) {
|
|
59
|
+
const controller = controllers.get(message.id);
|
|
60
|
+
if (!controller)
|
|
26
61
|
return;
|
|
27
|
-
|
|
62
|
+
controller.abort(new Error('Canceled'));
|
|
28
63
|
}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
64
|
+
if (!parentPort) {
|
|
65
|
+
throw new Error('transform-worker started without parentPort');
|
|
66
|
+
}
|
|
67
|
+
parentPort.on('message', (raw) => {
|
|
68
|
+
if (!isRecord(raw))
|
|
69
|
+
return;
|
|
70
|
+
const { type } = raw;
|
|
71
|
+
if (type === 'cancel') {
|
|
72
|
+
if (typeof raw.id !== 'string')
|
|
73
|
+
return;
|
|
74
|
+
handleCancel({ type: 'cancel', id: raw.id });
|
|
36
75
|
return;
|
|
37
76
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
77
|
+
if (type === 'transform') {
|
|
78
|
+
if (typeof raw.id !== 'string')
|
|
79
|
+
return;
|
|
80
|
+
if (typeof raw.html !== 'string')
|
|
81
|
+
return;
|
|
82
|
+
if (typeof raw.url !== 'string')
|
|
83
|
+
return;
|
|
84
|
+
if (typeof raw.includeMetadata !== 'boolean')
|
|
85
|
+
return;
|
|
86
|
+
handleTransform({
|
|
87
|
+
type: 'transform',
|
|
88
|
+
id: raw.id,
|
|
89
|
+
html: raw.html,
|
|
90
|
+
url: raw.url,
|
|
91
|
+
includeMetadata: raw.includeMetadata,
|
|
47
92
|
});
|
|
48
93
|
}
|
|
49
|
-
}
|
|
50
|
-
parentPort?.on('message', handleMessage);
|
|
94
|
+
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/superfetch",
|
|
3
|
-
"version": "2.0
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"mcpName": "io.github.j0hanz/superfetch",
|
|
5
5
|
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
|
|
6
6
|
"type": "module",
|
|
@@ -40,6 +40,8 @@
|
|
|
40
40
|
"start": "node dist/index.js",
|
|
41
41
|
"format": "prettier --write .",
|
|
42
42
|
"type-check": "tsc --noEmit",
|
|
43
|
+
"type-check:diagnostics": "tsc --noEmit --extendedDiagnostics",
|
|
44
|
+
"type-check:trace": "node -e \"require('fs').rmSync('.ts-trace',{recursive:true,force:true})\" && tsc --noEmit --generateTrace .ts-trace",
|
|
43
45
|
"lint": "eslint .",
|
|
44
46
|
"lint:fix": "eslint . --fix",
|
|
45
47
|
"test": "npm run build --silent && node --test --experimental-transform-types",
|
|
@@ -54,29 +56,28 @@
|
|
|
54
56
|
"@mozilla/readability": "^0.6.0",
|
|
55
57
|
"express": "^5.2.1",
|
|
56
58
|
"linkedom": "^0.18.12",
|
|
57
|
-
"
|
|
58
|
-
"undici": "^
|
|
59
|
+
"node-html-markdown": "^2.0.0",
|
|
60
|
+
"undici": "^7.18.2",
|
|
59
61
|
"zod": "^4.3.5"
|
|
60
62
|
},
|
|
61
63
|
"devDependencies": {
|
|
62
64
|
"@eslint/js": "^9.39.2",
|
|
63
65
|
"@trivago/prettier-plugin-sort-imports": "^6.0.2",
|
|
64
66
|
"@types/express": "^5.0.6",
|
|
65
|
-
"@types/node": "^22.19.
|
|
66
|
-
"@types/turndown": "^5.0.6",
|
|
67
|
+
"@types/node": "^22.19.5",
|
|
67
68
|
"eslint": "^9.23.2",
|
|
68
69
|
"eslint-config-prettier": "^10.1.8",
|
|
69
70
|
"eslint-plugin-de-morgan": "^2.0.0",
|
|
70
71
|
"eslint-plugin-depend": "^1.4.0",
|
|
71
72
|
"eslint-plugin-sonarjs": "^3.0.5",
|
|
72
73
|
"eslint-plugin-unused-imports": "^4.3.0",
|
|
73
|
-
"knip": "^5.80.
|
|
74
|
+
"knip": "^5.80.2",
|
|
74
75
|
"prettier": "^3.7.4",
|
|
75
76
|
"tsx": "^4.21.0",
|
|
76
77
|
"typescript": "^5.9.3",
|
|
77
78
|
"typescript-eslint": "^8.52.0"
|
|
78
79
|
},
|
|
79
80
|
"engines": {
|
|
80
|
-
"node": ">=20.
|
|
81
|
+
"node": ">=20.18.1"
|
|
81
82
|
}
|
|
82
83
|
}
|