@j0hanz/superfetch 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -46
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +20 -8
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/auth.js +161 -2
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +41 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.js +206 -9
- package/dist/http/session-cleanup.js +8 -5
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +21 -6
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +77 -40
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +23 -0
- package/dist/services/fetcher.js +553 -13
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +56 -12
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +2 -1
- package/dist/tools/utils/content-transform.js +37 -136
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +149 -0
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +5 -0
- package/dist/transformers/markdown.js +314 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +35 -20
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +13 -10
|
@@ -5,19 +5,29 @@ import { logDebug } from '../../services/logger.js';
|
|
|
5
5
|
import { isRecord } from '../../utils/guards.js';
|
|
6
6
|
import { transformToRawUrl } from '../../utils/url-transformer.js';
|
|
7
7
|
import { normalizeUrl } from '../../utils/url-validator.js';
|
|
8
|
-
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
8
|
+
/**
 * Tries to satisfy a request from the cache.
 *
 * Returns `null` when there is no cache key, no cached entry, no
 * `deserialize` callback, or when `deserialize` yields `undefined`. The last
 * two cases are reported through `logCacheMiss`. On success the deserialized
 * data is wrapped in a result flagged `fromCache: true` that carries the
 * entry's own `fetchedAt` value.
 */
function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
    const entry = cacheKey ? cache.get(cacheKey) : undefined;
    if (!entry)
        return null;
    // Without a deserializer the entry cannot be used; treat it like a failed decode.
    const data = deserialize ? deserialize(entry.content) : undefined;
    if (data === undefined) {
        const reason = deserialize ? 'deserialize failure' : 'missing deserializer';
        logCacheMiss(reason, cacheNamespace, normalizedUrl);
        return null;
    }
    logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
    return {
        data,
        fromCache: true,
        url: normalizedUrl,
        fetchedAt: entry.fetchedAt,
        cacheKey,
    };
}
|
|
22
32
|
function resolveNormalizedUrl(url) {
|
|
23
33
|
const { normalizedUrl: validatedUrl } = normalizeUrl(url);
|
|
@@ -27,44 +37,44 @@ function resolveNormalizedUrl(url) {
|
|
|
27
37
|
/**
 * Runs the fetch pipeline for `options.url`.
 *
 * Steps, in order: resolve/normalize the URL, build the cache key, return the
 * cached result if `attemptCacheRetrieval` produces one, otherwise fetch the
 * URL (forwarding `options.signal` when it is defined), run
 * `options.transform`, persist the result when `cache.isEnabled()` is true,
 * and return a result flagged `fromCache: false` with a fresh ISO timestamp.
 */
export async function executeFetchPipeline(options) {
    const resolved = resolveNormalizedUrl(options.url);
    logRawUrlTransformation(resolved);
    const { normalizedUrl } = resolved;
    const cacheKey = createCacheKey(options.cacheNamespace, normalizedUrl, options.cacheVary);
    const cacheHit = attemptCacheRetrieval({
        cacheKey,
        deserialize: options.deserialize,
        cacheNamespace: options.cacheNamespace,
        normalizedUrl,
    });
    if (cacheHit) {
        return cacheHit;
    }
    logDebug('Fetching URL', { url: normalizedUrl });
    // Only forward `signal` when the caller supplied one.
    const fetchOptions = options.signal === undefined ? {} : { signal: options.signal };
    const html = await fetchNormalizedUrl(normalizedUrl, fetchOptions);
    const data = await options.transform(html, normalizedUrl);
    if (cache.isEnabled()) {
        persistCache({ cacheKey, data, serialize: options.serialize, normalizedUrl });
    }
    const fetchedAt = new Date().toISOString();
    return { data, fromCache: false, url: normalizedUrl, fetchedAt, cacheKey };
}
|
|
63
|
-
function persistCache(cacheKey, data, serialize, normalizedUrl) {
|
|
69
|
+
/**
 * Stores `data` in the cache under `cacheKey`; does nothing when the key is
 * falsy.
 *
 * The payload is produced by `serialize`, falling back to `JSON.stringify`.
 * The stored metadata always contains `url`, and contains `title` only when
 * `extractTitle(data)` returns something other than `undefined`.
 */
function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
    if (cacheKey) {
        const toText = serialize ?? JSON.stringify;
        const title = extractTitle(data);
        const metadata = title === undefined
            ? { url: normalizedUrl }
            : { url: normalizedUrl, title };
        cache.set(cacheKey, toText(data), metadata);
    }
}
|
|
70
80
|
function extractTitle(value) {
|
|
@@ -78,7 +88,6 @@ function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
|
|
|
78
88
|
namespace: cacheNamespace,
|
|
79
89
|
url: normalizedUrl,
|
|
80
90
|
});
|
|
81
|
-
return null;
|
|
82
91
|
}
|
|
83
92
|
function logRawUrlTransformation(resolvedUrl) {
|
|
84
93
|
if (!resolvedUrl.transformed)
|
|
@@ -87,21 +96,3 @@ function logRawUrlTransformation(resolvedUrl) {
|
|
|
87
96
|
original: resolvedUrl.originalUrl,
|
|
88
97
|
});
|
|
89
98
|
}
|
|
90
|
-
function buildCacheHitResult(data, fetchedAt, url, cacheKey) {
|
|
91
|
-
return {
|
|
92
|
-
data,
|
|
93
|
-
fromCache: true,
|
|
94
|
-
url,
|
|
95
|
-
fetchedAt,
|
|
96
|
-
cacheKey,
|
|
97
|
-
};
|
|
98
|
-
}
|
|
99
|
-
function buildPipelineResult(url, data, cacheKey) {
|
|
100
|
-
return {
|
|
101
|
-
data,
|
|
102
|
-
fromCache: false,
|
|
103
|
-
url,
|
|
104
|
-
fetchedAt: new Date().toISOString(),
|
|
105
|
-
cacheKey,
|
|
106
|
-
};
|
|
107
|
-
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Picks the line terminator used to split and re-join `content`: CRLF if the
 * text contains any CRLF pair, otherwise LF.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}
/**
 * Splits `content` into lines and locates a frontmatter block that opens on
 * the very first line.
 *
 * @returns `{ lineEnding, lines, endIndex }`, where `endIndex` is the index of
 * the closing `---` line; `null` when the first line is not exactly `---` or
 * no later line is exactly `---`.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    if (lines[0] !== '---')
        return null;
    const endIndex = lines.indexOf('---', 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, endIndex };
}
/**
 * Trims `value` and, when it is wrapped in a matching pair of single or
 * double quotes, removes the quotes and trims again.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}
/**
 * Parses one `key: value` frontmatter line.
 *
 * @returns `{ key, value }` with the key trimmed and lower-cased and the
 * value left untrimmed; `null` for blank lines and for lines whose first `:`
 * is missing or at position 0.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}
/** True for the frontmatter keys treated as a document title: `title` or `name`. */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}
/**
 * Escapes `value` for embedding inside a YAML double-quoted scalar.
 * Backslash is the escape character there and `"` ends the scalar, so both
 * must be prefixed with a backslash.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value).replace(/[\\"]/g, '\\$&');
}
/**
 * Returns the first `title`/`name` value found in the leading frontmatter of
 * `content`, with optional surrounding quotes removed.
 *
 * @returns the title, or `undefined` when there is no frontmatter, no such
 * key, or the value is empty.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, endIndex } = frontmatter;
    const entry = lines
        .slice(1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}
/**
 * Ensures the markdown carries a `source: "<url>"` frontmatter entry.
 *
 * - No frontmatter: a new block is prepended, using the document's own line
 *   terminator so the result can be re-parsed by `findFrontmatterLines`.
 * - Frontmatter that already has a line starting with `source:` (ignoring
 *   leading whitespace and case): `content` is returned unchanged.
 * - Otherwise the entry is appended just before the closing `---`.
 *
 * The URL is escaped so that `\` and `"` cannot break the quoted scalar.
 */
export function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        const eol = detectLineEnding(content);
        return ['---', sourceLine, '---', '', content].join(eol);
    }
    const { lineEnding, lines, endIndex } = frontmatter;
    const bodyLines = lines.slice(1, endIndex);
    const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        lines[0],
        ...bodyLines,
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}
/**
 * Cheap prefix test: true when `trimmed` starts with a `---` line terminated
 * by LF or CRLF. The caller is expected to pass already-trimmed text.
 */
export function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Heuristic test for whether `content` appears to be Markdown.
 *
 * The accompanying implementation returns `true` when the text contains a
 * line starting with an ATX heading marker (`#` to `######` followed by
 * whitespace), a line starting with a `-`, `*` or `+` bullet followed by
 * whitespace, or at least two triple-backtick fences.
 */
export declare function looksLikeMarkdown(content: string): boolean;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Matches any line that begins with 1-6 `#` characters followed by whitespace. */
const HEADING_PATTERN = /^#{1,6}\s/m;
/** Matches any line that begins with a `-`, `*` or `+` bullet followed by whitespace. */
const LIST_PATTERN = /^(?:[-*+])\s/m;
/**
 * Heuristic: reports whether `content` shows at least one Markdown signal,
 * namely a heading line, a bullet-list line, or a pair of code fences.
 */
export function looksLikeMarkdown(content) {
    if (containsMarkdownHeading(content))
        return true;
    if (containsMarkdownList(content))
        return true;
    return containsFencedCodeBlock(content);
}
/** True when some line of `content` matches `HEADING_PATTERN`. */
function containsMarkdownHeading(content) {
    return HEADING_PATTERN.test(content);
}
/** True when some line of `content` matches `LIST_PATTERN`. */
function containsMarkdownList(content) {
    return LIST_PATTERN.test(content);
}
/** True when `content` holds two non-overlapping triple-backtick sequences. */
function containsFencedCodeBlock(content) {
    const fence = '```';
    const opening = content.indexOf(fence);
    return opening !== -1 && content.indexOf(fence, opening + fence.length) !== -1;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Heuristic test for whether `content` appears to be Markdown.
 *
 * The accompanying implementation returns `true` when the text contains a
 * line starting with an ATX heading marker (`#` to `######` followed by
 * whitespace), a line starting with a `-`, `*` or `+` bullet followed by
 * whitespace, or at least two triple-backtick fences.
 */
export declare function looksLikeMarkdown(content: string): boolean;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Matches any line that begins with 1-6 `#` characters followed by whitespace. */
const HEADING_PATTERN = /^#{1,6}\s/m;
/** Matches any line that begins with a `-`, `*` or `+` bullet followed by whitespace. */
const LIST_PATTERN = /^(?:[-*+])\s/m;
/**
 * Heuristic: reports whether `content` shows at least one Markdown signal,
 * namely a heading line, a bullet-list line, or a pair of code fences.
 */
export function looksLikeMarkdown(content) {
    if (containsMarkdownHeading(content))
        return true;
    if (containsMarkdownList(content))
        return true;
    return containsFencedCodeBlock(content);
}
/** True when some line of `content` matches `HEADING_PATTERN`. */
function containsMarkdownHeading(content) {
    return HEADING_PATTERN.test(content);
}
/** True when some line of `content` matches `LIST_PATTERN`. */
function containsMarkdownList(content) {
    return LIST_PATTERN.test(content);
}
/** True when `content` holds two non-overlapping triple-backtick sequences. */
function containsFencedCodeBlock(content) {
    const fence = '```';
    const opening = content.indexOf(fence);
    return opening !== -1 && content.indexOf(fence, opening + fence.length) !== -1;
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Picks the line terminator used to split and re-join `content`: CRLF if the
 * text contains any CRLF pair, otherwise LF.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}
/**
 * Splits `content` into lines and locates a frontmatter block that opens on
 * the very first line.
 *
 * @returns `{ lineEnding, lines, endIndex }`, where `endIndex` is the index of
 * the closing `---` line; `null` when the first line is not exactly `---` or
 * no later line is exactly `---`.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    if (lines[0] !== '---')
        return null;
    const endIndex = lines.indexOf('---', 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, endIndex };
}
/**
 * Trims `value` and, when it is wrapped in a matching pair of single or
 * double quotes, removes the quotes and trims again.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}
/**
 * Parses one `key: value` frontmatter line.
 *
 * @returns `{ key, value }` with the key trimmed and lower-cased and the
 * value left untrimmed; `null` for blank lines and for lines whose first `:`
 * is missing or at position 0.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}
/** True for the frontmatter keys treated as a document title: `title` or `name`. */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}
/**
 * Escapes `value` for embedding inside a YAML double-quoted scalar.
 * Backslash is the escape character there and `"` ends the scalar, so both
 * must be prefixed with a backslash.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value).replace(/[\\"]/g, '\\$&');
}
/**
 * Returns the first `title`/`name` value found in the leading frontmatter of
 * `content`, with optional surrounding quotes removed.
 *
 * @returns the title, or `undefined` when there is no frontmatter, no such
 * key, or the value is empty.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, endIndex } = frontmatter;
    const entry = lines
        .slice(1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}
/**
 * Ensures the markdown carries a `source: "<url>"` frontmatter entry.
 *
 * - No frontmatter: a new block is prepended, using the document's own line
 *   terminator so the result can be re-parsed by `findFrontmatterLines`.
 * - Frontmatter that already has a line starting with `source:` (ignoring
 *   leading whitespace and case): `content` is returned unchanged.
 * - Otherwise the entry is appended just before the closing `---`.
 *
 * The URL is escaped so that `\` and `"` cannot break the quoted scalar.
 */
export function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        const eol = detectLineEnding(content);
        return ['---', sourceLine, '---', '', content].join(eol);
    }
    const { lineEnding, lines, endIndex } = frontmatter;
    const bodyLines = lines.slice(1, endIndex);
    const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        lines[0],
        ...bodyLines,
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}
/**
 * Cheap prefix test: true when `trimmed` starts with a `---` line terminated
 * by LF or CRLF. The caller is expected to pass already-trimmed text.
 */
export function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import { logDebug } from '../../services/logger.js';
|
|
2
|
+
import { isRawTextContentUrl } from '../../utils/url-transformer.js';
|
|
3
|
+
/** Matches any line that begins with 1-6 `#` characters followed by whitespace. */
const HEADING_PATTERN = /^#{1,6}\s/m;
/** Matches any line that begins with a `-`, `*` or `+` bullet followed by whitespace. */
const LIST_PATTERN = /^(?:[-*+])\s/m;
/** Matches text that starts with `<!doctype` or `<html` (case-insensitive). */
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
/** Matches the opening of the structural HTML tags counted by `countCommonHtmlTags`. */
const COMMON_HTML_TAG_PATTERN = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
/** More than this many common HTML tags means the text is treated as HTML. */
const MAX_COMMON_HTML_TAGS = 2;
/** True when some line of `content` matches `HEADING_PATTERN`. */
function containsMarkdownHeading(content) {
    return HEADING_PATTERN.test(content);
}
/** True when some line of `content` matches `LIST_PATTERN`. */
function containsMarkdownList(content) {
    return LIST_PATTERN.test(content);
}
/** True when `content` holds two non-overlapping triple-backtick sequences. */
function containsFencedCodeBlock(content) {
    const first = content.indexOf('```');
    if (first === -1)
        return false;
    return content.includes('```', first + 3);
}
/** Heuristic: true when `content` has a heading line, a bullet line, or a fence pair. */
function looksLikeMarkdown(content) {
    return (containsMarkdownHeading(content) ||
        containsMarkdownList(content) ||
        containsFencedCodeBlock(content));
}
/**
 * Picks the line terminator used to split and re-join `content`: CRLF if the
 * text contains any CRLF pair, otherwise LF.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}
/**
 * Splits `content` into lines and locates a frontmatter block that opens on
 * the very first line.
 *
 * @returns `{ lineEnding, lines, endIndex }`, where `endIndex` is the index of
 * the closing `---` line; `null` when the first line is not exactly `---` or
 * no later line is exactly `---`.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    if (lines[0] !== '---')
        return null;
    const endIndex = lines.indexOf('---', 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, endIndex };
}
/**
 * Trims `value` and, when it is wrapped in a matching pair of single or
 * double quotes, removes the quotes and trims again.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}
/**
 * Parses one `key: value` frontmatter line.
 *
 * @returns `{ key, value }` with the key trimmed and lower-cased and the
 * value left untrimmed; `null` for blank lines and for lines whose first `:`
 * is missing or at position 0.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}
/** True for the frontmatter keys treated as a document title: `title` or `name`. */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}
/**
 * Escapes `value` for embedding inside a YAML double-quoted scalar.
 * Backslash is the escape character there and `"` ends the scalar, so both
 * must be prefixed with a backslash.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value).replace(/[\\"]/g, '\\$&');
}
/**
 * Returns the first `title`/`name` value found in the leading frontmatter of
 * `content`, with optional surrounding quotes removed; `undefined` when there
 * is no frontmatter, no such key, or the value is empty.
 */
function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, endIndex } = frontmatter;
    const entry = lines
        .slice(1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}
/**
 * Ensures the markdown carries a `source: "<url>"` frontmatter entry.
 *
 * - No frontmatter: a new block is prepended, using the document's own line
 *   terminator so the result can be re-parsed by `findFrontmatterLines`.
 * - Frontmatter that already has a line starting with `source:` (ignoring
 *   leading whitespace and case): `content` is returned unchanged.
 * - Otherwise the entry is appended just before the closing `---`.
 *
 * The URL is escaped so that `\` and `"` cannot break the quoted scalar.
 */
function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        const eol = detectLineEnding(content);
        return ['---', sourceLine, '---', '', content].join(eol);
    }
    const { lineEnding, lines, endIndex } = frontmatter;
    const bodyLines = lines.slice(1, endIndex);
    const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        lines[0],
        ...bodyLines,
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}
/**
 * Cheap prefix test: true when `trimmed` starts with a `---` line terminated
 * by LF or CRLF. The caller is expected to pass already-trimmed text.
 */
function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
/** True when `trimmed` begins with `<!doctype` or `<html` (case-insensitive). */
function looksLikeHtmlDocument(trimmed) {
    return HTML_DOCUMENT_PATTERN.test(trimmed);
}
/** Counts opening occurrences of the tags listed in `COMMON_HTML_TAG_PATTERN`. */
function countCommonHtmlTags(content) {
    // String.prototype.match resets lastIndex for global patterns, so sharing the regex is safe.
    const matches = content.match(COMMON_HTML_TAG_PATTERN) ?? [];
    return matches.length;
}
/**
 * Decides from the text alone whether `content` should be kept as raw
 * text/markdown. It must not look like an HTML document, and it must either
 * start with frontmatter or look like Markdown while containing at most
 * `MAX_COMMON_HTML_TAGS` common HTML tags.
 */
function isRawTextContent(content) {
    const trimmed = content.trim();
    const isHtmlDocument = looksLikeHtmlDocument(trimmed);
    const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
    const hasTooManyHtmlTags = countCommonHtmlTags(content) > MAX_COMMON_HTML_TAGS;
    const isMarkdown = looksLikeMarkdown(content);
    return (!isHtmlDocument &&
        (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
}
/**
 * True when non-blank `content` either starts like an HTML document or
 * contains more than `MAX_COMMON_HTML_TAGS` common HTML tags.
 */
function isLikelyHtmlContent(content) {
    const trimmed = content.trim();
    if (!trimmed)
        return false;
    if (looksLikeHtmlDocument(trimmed))
        return true;
    return countCommonHtmlTags(content) > MAX_COMMON_HTML_TAGS;
}
|
|
121
|
+
/**
 * Decides whether fetched `content` should bypass HTML conversion.
 *
 * URLs accepted by `isRawTextContentUrl` are preserved unless the body is
 * likely HTML; all other URLs are preserved only when the body itself passes
 * `isRawTextContent`.
 */
function shouldPreserveRawContent(url, content) {
    return isRawTextContentUrl(url)
        ? !isLikelyHtmlContent(content)
        : isRawTextContent(content);
}
|
|
127
|
+
/**
 * Builds the `{ content, title }` pair for raw markdown.
 *
 * The title is read from the untouched `rawContent`. The content is
 * `rawContent` itself, or, when `includeMetadata` is truthy, the result of
 * `addSourceToMarkdown(rawContent, url)`.
 */
function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
    const title = extractTitleFromRawMarkdown(rawContent);
    if (!includeMetadata) {
        return { content: rawContent, title };
    }
    return { content: addSourceToMarkdown(rawContent, url), title };
}
|
|
134
|
+
/**
 * Returns the fetched body unchanged as markdown when it qualifies as raw
 * text content, otherwise `null`.
 *
 * @returns `{ markdown, title, truncated: false }` when
 * `shouldPreserveRawContent(url, html)` is true, otherwise `null`. The debug
 * log entry carries only the first 80 characters of the URL.
 */
export function tryTransformRawContent({ html, url, includeMetadata, }) {
    const preserve = shouldPreserveRawContent(url, html);
    if (!preserve) {
        return null;
    }
    logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
    const payload = buildRawMarkdownPayload({ rawContent: html, url, includeMetadata });
    return {
        markdown: payload.content,
        title: payload.title,
        truncated: false,
    };
}
|
package/dist/tools.d.ts
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
|
+
import { type MarkdownTransformResult } from './transform.js';
|
|
3
|
+
/** Input accepted by `fetchUrlToolHandler`. */
export interface FetchUrlInput {
    /** URL to fetch. */
    url: string;
}
|
|
6
|
+
/** Text content block of a tool response. */
export interface ToolContentBlock {
    type: 'text';
    text: string;
}
/** Content block that links to a resource by URI instead of embedding it. */
export interface ToolContentResourceLinkBlock {
    type: 'resource_link';
    uri: string;
    name: string;
    title?: string;
    description?: string;
    mimeType?: string;
}
/** Content block that embeds a resource's text together with its URI. */
export interface ToolContentResourceBlock {
    type: 'resource';
    resource: {
        uri: string;
        mimeType?: string;
        text: string;
    };
}
/** Any content block a tool response may carry, discriminated by `type`. */
export type ToolContentBlockUnion = ToolContentBlock | ToolContentResourceLinkBlock | ToolContentResourceBlock;
/**
 * Tool response for a failed call: `isError` is always `true` and
 * `structuredContent` carries the error message and the URL involved.
 */
export interface ToolErrorResponse {
    /** Open index signature: additional properties of any type are allowed. */
    [x: string]: unknown;
    content: ToolContentBlockUnion[];
    structuredContent: {
        error: string;
        url: string;
    };
    isError: true;
}
/** General tool response shape; `structuredContent` and `isError` are optional. */
export interface ToolResponseBase {
    /** Open index signature: additional properties of any type are allowed. */
    [x: string]: unknown;
    content: ToolContentBlockUnion[];
    structuredContent?: Record<string, unknown>;
    isError?: boolean;
}
|
|
42
|
+
/** Options for `executeFetchPipeline`; `T` is the transformed result type. */
export interface FetchPipelineOptions<T> {
    /** URL to fetch */
    url: string;
    /** Cache namespace (e.g., 'markdown') */
    cacheNamespace: string;
    /** Optional: AbortSignal for request cancellation */
    signal?: AbortSignal;
    /** Optional: cache variation input for headers/flags */
    cacheVary?: Record<string, unknown> | string;
    /** Transform function to process HTML into desired format */
    transform: (html: string, url: string) => T | Promise<T>;
    /** Optional: serialize result for caching (defaults to JSON.stringify) */
    serialize?: (result: T) => string;
    /** Optional: deserialize cached content */
    deserialize?: (cached: string) => T | undefined;
}
/** Outcome of a pipeline run. */
export interface PipelineResult<T> {
    /** Transformed (or deserialized, on a cache hit) payload. */
    data: T;
    /** Whether `data` was served from the cache. */
    fromCache: boolean;
    url: string;
    /** Timestamp string for when the data was fetched. */
    fetchedAt: string;
    /** Key the result is cached under, if any. */
    cacheKey?: string | null;
}
|
|
65
|
+
/** Name string of the fetch tool: `"fetch-url"`. */
export declare const FETCH_URL_TOOL_NAME = "fetch-url";
|
|
66
|
+
/** Human-readable description string of the fetch tool. */
export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format";
|
|
67
|
+
/**
 * Result of applying the inline content limit. Only `contentSize` is always
 * present.
 * NOTE(review): the meaning of the optional fields is inferred from their
 * names only — confirm against the implementation of `applyInlineContentLimit`.
 */
interface InlineContentResult {
    content?: string;
    contentSize: number;
    resourceUri?: string;
    resourceMimeType?: string;
    error?: string;
    truncated?: boolean;
}
/** Module-private helper; declared here only so `InlineResult` can be derived from it. */
declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
/** Public alias for the return type of `applyInlineContentLimit`. */
export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
|
|
77
|
+
/**
 * Runs the fetch pipeline described by `options` and resolves with a
 * `PipelineResult<T>`, whose `fromCache` flag tells whether the data came
 * from the cache.
 */
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
78
|
+
/** Options for `performSharedFetch`; `T` must expose a string `content` field. */
interface SharedFetchOptions<T extends { content: string }> {
    readonly url: string;
    readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
    readonly serialize?: (result: T) => string;
    readonly deserialize?: (cached: string) => T | undefined;
}
/** Optional replacements for collaborators used by `performSharedFetch`. */
interface SharedFetchDeps {
    /** Alternative pipeline implementation with the same signature as `executeFetchPipeline`. */
    readonly executeFetchPipeline?: typeof executeFetchPipeline;
}
/**
 * Resolves with both the pipeline result and the inline-content result for
 * `options.url`.
 * NOTE(review): how `deps` and the inline limit are applied is not visible
 * from this declaration — confirm against the implementation.
 */
export declare function performSharedFetch<T extends { content: string }>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
    pipeline: PipelineResult<T>;
    inlineResult: InlineResult;
}>;
|
|
95
|
+
/** Builds a `ToolErrorResponse` from an error message and the URL involved. */
export declare function createToolErrorResponse(message: string, url: string): ToolErrorResponse;
|
|
96
|
+
/**
 * Converts an arbitrary thrown value into a `ToolErrorResponse` for `url`.
 * NOTE(review): `fallbackMessage` is presumably used when no message can be
 * derived from `error` — confirm against the implementation.
 */
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string): ToolErrorResponse;
|
|
97
|
+
/** A markdown transform result that additionally exposes a read-only string `content`. */
type MarkdownPipelineResult = MarkdownTransformResult & {
    readonly content: string;
};
/**
 * Parses a cached string back into a `MarkdownPipelineResult`.
 *
 * @returns the parsed result, or `undefined` when it cannot be produced.
 */
export declare function parseCachedMarkdownResult(cached: string): MarkdownPipelineResult | undefined;
|
|
101
|
+
/** Handler for the fetch tool: takes a `FetchUrlInput` and resolves with a tool response. */
export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<ToolResponseBase>;
|
|
102
|
+
/**
 * Wraps `handler` and returns a function with the same parameter and result
 * types.
 * NOTE(review): the name suggests the wrapper establishes a request context
 * only when none is active — confirm against the implementation.
 */
export declare function withRequestContextIfMissing<TParams, TResult>(handler: (params: TParams) => Promise<TResult>): (params: TParams) => Promise<TResult>;
|
|
103
|
+
/** Registers this module's tools on the given MCP server instance. */
export declare function registerTools(server: McpServer): void;
|
|
104
|
+
export {};
|