@j0hanz/superfetch 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -38
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +10 -3
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/mcp-routes.js +2 -2
- package/dist/http/mcp-sessions.d.ts +3 -5
- package/dist/http/mcp-sessions.js +8 -8
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.d.ts +0 -10
- package/dist/http/server.js +33 -333
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/server.js +20 -5
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +28 -2
- package/dist/services/fetcher.d.ts +2 -0
- package/dist/services/fetcher.js +35 -14
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-url.tool.js +8 -6
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +3 -5
- package/dist/tools/utils/content-transform.js +35 -148
- package/dist/tools/utils/raw-markdown.js +15 -1
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown.d.ts +4 -1
- package/dist/transformers/markdown.js +182 -53
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language.d.ts +0 -9
- package/dist/utils/code-language.js +5 -5
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +8 -5
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +8 -7
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import { extractContent } from '../../services/extractor.js';
|
|
2
|
+
import { logDebug } from '../../services/logger.js';
|
|
3
|
+
import { endTransformStage, startTransformStage, } from '../../services/telemetry.js';
|
|
4
|
+
import { throwIfAborted } from '../../utils/cancellation.js';
|
|
5
|
+
import { htmlToMarkdown } from '../../transformers/markdown.js';
|
|
6
|
+
import { tryTransformRawContent } from './raw-markdown.js';
|
|
7
|
+
const MIN_CONTENT_RATIO = 0.3;
|
|
8
|
+
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
9
|
+
/**
 * Removes everything between `<` and `>` from a string of markup and returns
 * the characters found outside of tags.
 *
 * This is a lightweight scanner, not an HTML parser: entities are not decoded
 * and comments / script / style bodies are not treated specially.
 *
 * @param {string} html - Markup to strip.
 * @returns {string} The text that appears outside of tags.
 */
function stripHtmlTags(html) {
    const kept = [];
    let inTag = false;
    for (const char of html) {
        if (char === '<') {
            inTag = true;
        }
        else if (char === '>' && inTag) {
            inTag = false;
        }
        else if (!inTag) {
            // Also reached for a literal '>' that does not close a tag
            // (e.g. "a > b"), which must be kept as text rather than dropped.
            kept.push(char);
        }
    }
    return kept.join('');
}
|
|
27
|
+
/**
 * Approximates how much text an HTML string contains: tags are stripped,
 * every run of whitespace collapses to one space, and the length of the
 * trimmed result is returned.
 *
 * @param {string} html - Markup to measure.
 * @returns {number} Character count of the normalized text.
 */
function estimateTextLength(html) {
    const withoutTags = stripHtmlTags(html);
    const collapsed = withoutTags.replace(/\s+/g, ' ');
    return collapsed.trim().length;
}
|
|
30
|
+
/**
 * Quality gate for article extraction.
 *
 * Returns false when there is no article. Otherwise compares the length of
 * `article.textContent` with the estimated text length of the original HTML:
 * documents whose estimated text is shorter than MIN_HTML_LENGTH_FOR_GATE
 * always pass, and longer ones pass when the ratio reaches MIN_CONTENT_RATIO.
 *
 * @param article - Extracted article (falsy when extraction produced nothing).
 * @param {string} originalHtml - The HTML the article was extracted from.
 * @returns {boolean} Whether the extracted article is considered sufficient.
 */
export function isExtractionSufficient(article, originalHtml) {
    if (!article) {
        return false;
    }
    const extractedChars = article.textContent.length;
    const sourceChars = estimateTextLength(originalHtml);
    const gateApplies = sourceChars >= MIN_HTML_LENGTH_FOR_GATE;
    return gateApplies ? extractedChars / sourceChars >= MIN_CONTENT_RATIO : true;
}
|
|
39
|
+
/**
 * Reports whether an extracted article is available to use as the content
 * source.
 *
 * @param article - Extracted article, or a falsy value when there is none.
 * @returns {boolean} True when `article` is truthy.
 */
export function determineContentExtractionSource(article) {
    return Boolean(article);
}
|
|
42
|
+
/**
 * Copies article-derived fields onto a metadata block, mutating it in place:
 * `article.title` becomes `metadata.title` and `article.byline` becomes
 * `metadata.author`. Fields that are `undefined` on the article are skipped.
 */
function applyArticleMetadata(metadata, article) {
    const { title, byline } = article;
    if (title !== undefined) {
        metadata.title = title;
    }
    if (byline !== undefined) {
        metadata.author = byline;
    }
}
|
|
48
|
+
/**
 * Copies `title`, `description` and `author` (in that order) from the
 * extracted page metadata onto a metadata block, mutating it in place.
 * Fields that are `undefined` on the source are skipped.
 */
function applyExtractedMetadata(metadata, extractedMeta) {
    for (const field of ['title', 'description', 'author']) {
        const value = extractedMeta[field];
        if (value !== undefined) {
            metadata[field] = value;
        }
    }
}
|
|
58
|
+
/**
 * Builds the metadata block for a transformed page.
 *
 * @param {string} url - Page URL stored on the block.
 * @param article - Extracted article, if any.
 * @param extractedMeta - Metadata extracted from the page.
 * @param {boolean} shouldExtractFromArticle - Prefer article fields when an
 *   article is present.
 * @param {boolean} includeMetadata - When falsy, no block is produced.
 * @returns A `{ type: 'metadata', url, fetchedAt, ... }` object whose
 *   `fetchedAt` is the current time as an ISO string, or `undefined` when
 *   `includeMetadata` is falsy.
 */
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
    if (!includeMetadata) {
        return undefined;
    }
    const block = {
        type: 'metadata',
        url,
        fetchedAt: new Date().toISOString(),
    };
    const useArticle = shouldExtractFromArticle && article;
    if (useArticle) {
        applyArticleMetadata(block, article);
    }
    else {
        applyExtractedMetadata(block, extractedMeta);
    }
    return block;
}
|
|
74
|
+
/**
 * Builds a content source backed by the extracted article: its `content`
 * becomes `sourceHtml`, its `title` becomes `title`, and the metadata block
 * is created with article fields preferred.
 */
function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
    const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
    const { content: sourceHtml, title } = article;
    return { sourceHtml, title, metadata };
}
|
|
82
|
+
/**
 * Builds a content source backed by the complete HTML document: `html` is
 * used as `sourceHtml`, the title comes from the extracted page metadata, and
 * the metadata block is created without preferring article fields.
 */
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
    const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
    const { title } = extractedMeta;
    return { sourceHtml: html, title, metadata };
}
|
|
90
|
+
/**
 * Emits a debug log entry recording that the extracted article failed the
 * quality gate and the full HTML is used instead. Only the first 80
 * characters of the URL are logged.
 */
function logQualityGateFallback({ url, articleLength, }) {
    const shortUrl = url.substring(0, 80);
    logDebug('Quality gate: Readability extraction below threshold, using full HTML', { url: shortUrl, articleLength });
}
|
|
96
|
+
/**
 * Attempts to build a content source from the extracted article.
 *
 * @returns The article-backed content source when an article exists and
 *   passes the quality gate; otherwise `null`. When an article exists but
 *   fails the gate, the fallback is logged before returning `null`.
 */
function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
    if (!article) {
        return null;
    }
    if (!determineContentExtractionSource(article)) {
        return null;
    }
    if (!isExtractionSufficient(article, html)) {
        logQualityGateFallback({ url, articleLength: article.textContent.length });
        return null;
    }
    return buildArticleContentSource({ url, article, extractedMeta, includeMetadata });
}
|
|
116
|
+
/**
 * Runs content extraction on the HTML and picks the source to convert:
 * the extracted article when one can be built from it, otherwise the full
 * HTML document. The abort signal is forwarded to extraction only when one
 * was supplied.
 */
function resolveContentSource({ html, url, includeMetadata, signal, }) {
    const extractOptions = signal
        ? { extractArticle: true, signal }
        : { extractArticle: true };
    const extraction = extractContent(html, url, extractOptions);
    const sourceArgs = {
        html,
        url,
        article: extraction.article,
        extractedMeta: extraction.metadata,
        includeMetadata,
    };
    return (tryBuildExtractedArticleContentSource(sourceArgs) ||
        buildFullHtmlContentSource(sourceArgs));
}
|
|
138
|
+
/**
 * Synchronous HTML-to-Markdown pipeline.
 *
 * Steps, each wrapped in its own telemetry stage:
 *   1. `transform:raw`      - try to pass the input through as raw content;
 *                             a truthy result is returned as-is.
 *   2. `transform:extract`  - resolve the content source (article or full HTML).
 *   3. `transform:markdown` - convert the chosen HTML to Markdown.
 *
 * @param {string} html - Input document.
 * @param {string} url - URL of the document; also used to label telemetry.
 * @param options - Reads `includeMetadata` and the optional `signal`.
 * @returns The raw-content result, or `{ markdown, title, truncated: false }`.
 * @throws Whatever `throwIfAborted` or any pipeline step throws.
 */
export function transformHtmlToMarkdownInProcess(html, url, options) {
    const totalStage = startTransformStage(url, 'transform:total');
    // Runs one step inside its own stage. The stage is ended in `finally` so
    // it is closed even when the step throws, matching how the worker stage
    // is handled in transformHtmlToMarkdown.
    const runStage = (name, step) => {
        const stage = startTransformStage(url, name);
        try {
            return step();
        }
        finally {
            endTransformStage(stage);
        }
    };
    let success = false;
    try {
        throwIfAborted(options.signal, url, 'transform:begin');
        const raw = runStage('transform:raw', () => tryTransformRawContent({
            html,
            url,
            includeMetadata: options.includeMetadata,
        }));
        if (raw) {
            success = true;
            return raw;
        }
        const context = runStage('transform:extract', () => resolveContentSource({
            html,
            url,
            includeMetadata: options.includeMetadata,
            ...(options.signal ? { signal: options.signal } : {}),
        }));
        const content = runStage('transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
            url,
            ...(options.signal ? { signal: options.signal } : {}),
        }));
        success = true;
        return {
            markdown: content,
            title: context.title,
            truncated: false,
        };
    }
    finally {
        // The total stage is reported only for successful runs (unchanged).
        if (success) {
            endTransformStage(totalStage, { truncated: false });
        }
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
export
|
|
3
|
-
export declare function
|
|
4
|
-
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
5
|
-
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|
|
1
|
+
import type { MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-transform-core.js';
|
|
3
|
+
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
@@ -1,154 +1,41 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
function
|
|
8
|
-
const
|
|
9
|
-
let
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
1
|
+
import { FetchError } from '../../errors/app-error.js';
|
|
2
|
+
import { endTransformStage, startTransformStage, } from '../../services/telemetry.js';
|
|
3
|
+
import { getOrCreateTransformWorkerPool } from '../../services/transform-worker-pool.js';
|
|
4
|
+
import { throwIfAborted } from '../../utils/cancellation.js';
|
|
5
|
+
import { transformHtmlToMarkdownInProcess } from './content-transform-core.js';
|
|
6
|
+
export { createContentMetadataBlock, determineContentExtractionSource, isExtractionSufficient, } from './content-transform-core.js';
|
|
7
|
+
export async function transformHtmlToMarkdown(html, url, options) {
|
|
8
|
+
const totalStage = startTransformStage(url, 'transform:total');
|
|
9
|
+
let success = false;
|
|
10
|
+
try {
|
|
11
|
+
throwIfAborted(options.signal, url, 'transform:begin');
|
|
12
|
+
const workerStage = startTransformStage(url, 'transform:worker');
|
|
13
|
+
try {
|
|
14
|
+
const pool = getOrCreateTransformWorkerPool();
|
|
15
|
+
const result = await pool.transform(html, url, {
|
|
16
|
+
includeMetadata: options.includeMetadata,
|
|
17
|
+
...(options.signal ? { signal: options.signal } : {}),
|
|
18
|
+
});
|
|
19
|
+
success = true;
|
|
20
|
+
return result;
|
|
14
21
|
}
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
22
|
+
catch (error) {
|
|
23
|
+
if (error instanceof FetchError) {
|
|
24
|
+
throw error;
|
|
25
|
+
}
|
|
26
|
+
// Stability-first: if worker infrastructure fails, fall back to in-process.
|
|
27
|
+
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
28
|
+
const fallback = transformHtmlToMarkdownInProcess(html, url, options);
|
|
29
|
+
success = true;
|
|
30
|
+
return fallback;
|
|
18
31
|
}
|
|
19
|
-
|
|
20
|
-
|
|
32
|
+
finally {
|
|
33
|
+
endTransformStage(workerStage);
|
|
21
34
|
}
|
|
22
35
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
}
|
|
28
|
-
export function isExtractionSufficient(article, originalHtml) {
|
|
29
|
-
if (!article)
|
|
30
|
-
return false;
|
|
31
|
-
const articleLength = article.textContent.length;
|
|
32
|
-
const originalLength = estimateTextLength(originalHtml);
|
|
33
|
-
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
34
|
-
return true;
|
|
35
|
-
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
36
|
-
}
|
|
37
|
-
export function determineContentExtractionSource(article) {
|
|
38
|
-
return !!article;
|
|
39
|
-
}
|
|
40
|
-
function applyArticleMetadata(metadata, article) {
|
|
41
|
-
if (article.title !== undefined)
|
|
42
|
-
metadata.title = article.title;
|
|
43
|
-
if (article.byline !== undefined)
|
|
44
|
-
metadata.author = article.byline;
|
|
45
|
-
}
|
|
46
|
-
function applyExtractedMetadata(metadata, extractedMeta) {
|
|
47
|
-
if (extractedMeta.title !== undefined)
|
|
48
|
-
metadata.title = extractedMeta.title;
|
|
49
|
-
if (extractedMeta.description !== undefined) {
|
|
50
|
-
metadata.description = extractedMeta.description;
|
|
51
|
-
}
|
|
52
|
-
if (extractedMeta.author !== undefined) {
|
|
53
|
-
metadata.author = extractedMeta.author;
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
|
|
57
|
-
if (!includeMetadata)
|
|
58
|
-
return undefined;
|
|
59
|
-
const now = new Date().toISOString();
|
|
60
|
-
const metadata = {
|
|
61
|
-
type: 'metadata',
|
|
62
|
-
url,
|
|
63
|
-
fetchedAt: now,
|
|
64
|
-
};
|
|
65
|
-
if (shouldExtractFromArticle && article) {
|
|
66
|
-
applyArticleMetadata(metadata, article);
|
|
67
|
-
return metadata;
|
|
68
|
-
}
|
|
69
|
-
applyExtractedMetadata(metadata, extractedMeta);
|
|
70
|
-
return metadata;
|
|
71
|
-
}
|
|
72
|
-
function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
|
|
73
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
|
|
74
|
-
return {
|
|
75
|
-
sourceHtml: article.content,
|
|
76
|
-
title: article.title,
|
|
77
|
-
metadata,
|
|
78
|
-
};
|
|
79
|
-
}
|
|
80
|
-
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
81
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
|
|
82
|
-
return {
|
|
83
|
-
sourceHtml: html,
|
|
84
|
-
title: extractedMeta.title,
|
|
85
|
-
metadata,
|
|
86
|
-
};
|
|
87
|
-
}
|
|
88
|
-
function logQualityGateFallback({ url, articleLength, }) {
|
|
89
|
-
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
90
|
-
url: url.substring(0, 80),
|
|
91
|
-
articleLength,
|
|
92
|
-
});
|
|
93
|
-
}
|
|
94
|
-
function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
95
|
-
if (!article)
|
|
96
|
-
return null;
|
|
97
|
-
const shouldExtractFromArticle = determineContentExtractionSource(article);
|
|
98
|
-
if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
|
|
99
|
-
return buildArticleContentSource({
|
|
100
|
-
url,
|
|
101
|
-
article,
|
|
102
|
-
extractedMeta,
|
|
103
|
-
includeMetadata,
|
|
104
|
-
});
|
|
105
|
-
}
|
|
106
|
-
if (shouldExtractFromArticle) {
|
|
107
|
-
logQualityGateFallback({
|
|
108
|
-
url,
|
|
109
|
-
articleLength: article.textContent.length,
|
|
110
|
-
});
|
|
36
|
+
finally {
|
|
37
|
+
if (success) {
|
|
38
|
+
endTransformStage(totalStage, { truncated: false });
|
|
39
|
+
}
|
|
111
40
|
}
|
|
112
|
-
return null;
|
|
113
|
-
}
|
|
114
|
-
function resolveContentSource({ html, url, includeMetadata, }) {
|
|
115
|
-
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
116
|
-
extractArticle: true,
|
|
117
|
-
});
|
|
118
|
-
const extracted = tryBuildExtractedArticleContentSource({
|
|
119
|
-
html,
|
|
120
|
-
url,
|
|
121
|
-
article,
|
|
122
|
-
extractedMeta,
|
|
123
|
-
includeMetadata,
|
|
124
|
-
});
|
|
125
|
-
if (extracted)
|
|
126
|
-
return extracted;
|
|
127
|
-
return buildFullHtmlContentSource({
|
|
128
|
-
html,
|
|
129
|
-
url,
|
|
130
|
-
article,
|
|
131
|
-
extractedMeta,
|
|
132
|
-
includeMetadata,
|
|
133
|
-
});
|
|
134
|
-
}
|
|
135
|
-
export function transformHtmlToMarkdown(html, url, options) {
|
|
136
|
-
const raw = tryTransformRawContent({
|
|
137
|
-
html,
|
|
138
|
-
url,
|
|
139
|
-
includeMetadata: options.includeMetadata,
|
|
140
|
-
});
|
|
141
|
-
if (raw)
|
|
142
|
-
return raw;
|
|
143
|
-
const context = resolveContentSource({
|
|
144
|
-
html,
|
|
145
|
-
url,
|
|
146
|
-
includeMetadata: options.includeMetadata,
|
|
147
|
-
});
|
|
148
|
-
const content = htmlToMarkdown(context.sourceHtml, context.metadata);
|
|
149
|
-
return {
|
|
150
|
-
markdown: content,
|
|
151
|
-
title: context.title,
|
|
152
|
-
truncated: false,
|
|
153
|
-
};
|
|
154
41
|
}
|
|
@@ -110,6 +110,20 @@ function isRawTextContent(content) {
|
|
|
110
110
|
return (!isHtmlDocument &&
|
|
111
111
|
(hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
|
|
112
112
|
}
|
|
113
|
+
/**
 * Heuristic check for HTML content. Blank input is never HTML; otherwise the
 * content counts as HTML when its trimmed form looks like an HTML document or
 * when it contains more than two common HTML tags.
 */
function isLikelyHtmlContent(content) {
    const trimmed = content.trim();
    if (trimmed.length === 0) {
        return false;
    }
    return looksLikeHtmlDocument(trimmed)
        ? true
        : countCommonHtmlTags(content) > 2;
}
|
|
121
|
+
/**
 * Decides whether fetched content should be kept as raw text instead of being
 * converted. For URLs recognized as raw-text URLs the content is preserved
 * unless it looks like HTML; for all other URLs the decision is based on the
 * content alone.
 */
function shouldPreserveRawContent(url, content) {
    return isRawTextContentUrl(url)
        ? !isLikelyHtmlContent(content)
        : isRawTextContent(content);
}
|
|
113
127
|
function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
|
|
114
128
|
const title = extractTitleFromRawMarkdown(rawContent);
|
|
115
129
|
const content = includeMetadata
|
|
@@ -118,7 +132,7 @@ function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
|
|
|
118
132
|
return { content, title };
|
|
119
133
|
}
|
|
120
134
|
export function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
121
|
-
if (!
|
|
135
|
+
if (!shouldPreserveRawContent(url, html)) {
|
|
122
136
|
return null;
|
|
123
137
|
}
|
|
124
138
|
logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
|
package/dist/tools.d.ts
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
|
+
import { type MarkdownTransformResult } from './transform.js';
|
|
3
|
+
/** Input object accepted by `fetchUrlToolHandler`. */
export interface FetchUrlInput {
    /** URL to fetch. */
    url: string;
}
|
|
6
|
+
/** Tool content block that carries plain text. */
export interface ToolContentBlock {
    /** Discriminant identifying a text block. */
    type: 'text';
    /** The text payload. */
    text: string;
}
|
|
10
|
+
/** Tool content block that points at a resource by URI without embedding it. */
export interface ToolContentResourceLinkBlock {
    /** Discriminant identifying a resource-link block. */
    type: 'resource_link';
    /** URI of the linked resource. */
    uri: string;
    /** Required name of the resource. */
    name: string;
    /** Optional title. */
    title?: string;
    /** Optional description. */
    description?: string;
    /** Optional MIME type of the linked resource. */
    mimeType?: string;
}
|
|
18
|
+
/** Tool content block that embeds a textual resource. */
export interface ToolContentResourceBlock {
    /** Discriminant identifying an embedded-resource block. */
    type: 'resource';
    /** The embedded resource: its URI, optional MIME type, and text body. */
    resource: {
        uri: string;
        mimeType?: string;
        text: string;
    };
}
|
|
26
|
+
/** Any content block a tool response may contain: text, resource link, or embedded resource. */
export type ToolContentBlockUnion = ToolContentBlock | ToolContentResourceLinkBlock | ToolContentResourceBlock;
|
|
27
|
+
/**
 * Tool response describing a failure. `isError` is always `true`, and
 * `structuredContent` carries the error message together with the URL it
 * relates to. Additional properties are permitted by the index signature.
 */
export interface ToolErrorResponse {
    [key: string]: unknown;
    content: Array<ToolContentBlockUnion>;
    structuredContent: {
        error: string;
        url: string;
    };
    isError: true;
}
|
|
36
|
+
/**
 * General shape of a tool response: a list of content blocks plus optional
 * structured content and an optional error flag. Additional properties are
 * permitted by the index signature.
 */
export interface ToolResponseBase {
    [key: string]: unknown;
    content: Array<ToolContentBlockUnion>;
    structuredContent?: Record<string, unknown>;
    isError?: boolean;
}
|
|
42
|
+
/**
 * Options for `executeFetchPipeline`.
 *
 * @typeParam T - Type produced by `transform` and stored in / read from the cache.
 */
export interface FetchPipelineOptions<T> {
    /** URL to fetch. */
    url: string;
    /** Cache namespace (e.g., 'markdown'). */
    cacheNamespace: string;
    /** Optional AbortSignal for request cancellation. */
    signal?: AbortSignal;
    /** Optional cache variation input for headers/flags. */
    cacheVary?: Record<string, unknown> | string;
    /** Transform function that processes HTML into the desired format; may be sync or async. */
    transform: (html: string, url: string) => Promise<T> | T;
    /** Optional serializer used when caching the result (defaults to JSON.stringify). */
    serialize?: (result: T) => string;
    /** Optional deserializer for cached content; may return `undefined`. */
    deserialize?: (cached: string) => T | undefined;
}
|
|
58
|
+
/**
 * Result resolved by `executeFetchPipeline`.
 *
 * @typeParam T - Type of the transformed data.
 */
export interface PipelineResult<T> {
    /** The transformed data. */
    data: T;
    /** Whether the data came from the cache. */
    fromCache: boolean;
    /** URL associated with the result. */
    url: string;
    /** Fetch timestamp as a string. */
    fetchedAt: string;
    /** Cache key, when one exists; may be `null` or omitted. */
    cacheKey?: string | null;
}
|
|
65
|
+
/** Name string of the fetch-url tool. */
export declare const FETCH_URL_TOOL_NAME = "fetch-url";
|
|
66
|
+
/** Human-readable description string of the fetch-url tool. */
export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format";
|
|
67
|
+
/**
 * Return shape of `applyInlineContentLimit`. Only `contentSize` is always
 * present; every other field is optional.
 * NOTE(review): which fields are populated in which situation is decided by
 * the implementation and is not visible from this declaration.
 */
interface InlineContentResult {
    content?: string;
    /** Size of the content as a number (unit not stated in this declaration). */
    contentSize: number;
    resourceUri?: string;
    resourceMimeType?: string;
    error?: string;
    truncated?: boolean;
}
|
|
75
|
+
/**
 * Non-exported helper, declared here so that `InlineResult` can be derived
 * from its return type. Takes the content string and a nullable cache key and
 * returns an `InlineContentResult`.
 */
declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
|
|
76
|
+
/** Exported alias for the return type of `applyInlineContentLimit`. */
export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
|
|
77
|
+
/**
 * Runs the fetch pipeline described by `options` and resolves to a
 * `PipelineResult<T>`.
 * NOTE(review): fetching and caching details live in the implementation and
 * are not visible from this declaration.
 */
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
78
|
+
/**
 * Options for `performSharedFetch`. All properties are read-only.
 *
 * @typeParam T - Transform result; must expose a string `content` property.
 */
interface SharedFetchOptions<T extends { content: string }> {
    /** URL to fetch. */
    readonly url: string;
    /** Transform applied to the fetched HTML; may be sync or async. */
    readonly transform: (html: string, normalizedUrl: string) => Promise<T> | T;
    /** Optional serializer for the result. */
    readonly serialize?: (result: T) => string;
    /** Optional deserializer for cached content; may return `undefined`. */
    readonly deserialize?: (cached: string) => T | undefined;
}
|
|
86
|
+
/**
 * Optional dependencies for `performSharedFetch`: lets a caller supply its
 * own function with the signature of `executeFetchPipeline`.
 */
interface SharedFetchDeps {
    readonly executeFetchPipeline?: typeof executeFetchPipeline;
}
|
|
89
|
+
/**
 * Performs a fetch using the given options and optional dependency overrides.
 *
 * @typeParam T - Transform result; must expose a string `content` property.
 * @returns A promise resolving to the pipeline result together with the
 *   inline-content result.
 */
export declare function performSharedFetch<T extends { content: string }>(
    options: SharedFetchOptions<T>,
    deps?: SharedFetchDeps,
): Promise<{
    pipeline: PipelineResult<T>;
    inlineResult: InlineResult;
}>;
|
|
95
|
+
/** Builds a `ToolErrorResponse` from an error message and the URL it relates to. */
export declare function createToolErrorResponse(message: string, url: string): ToolErrorResponse;
|
|
96
|
+
/**
 * Converts an arbitrary thrown value into a `ToolErrorResponse` for `url`.
 * NOTE(review): `fallbackMessage` is presumably used when the error carries
 * no usable message of its own; confirm in the implementation.
 */
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string): ToolErrorResponse;
|
|
97
|
+
/** A `MarkdownTransformResult` that additionally carries a read-only `content` string. */
type MarkdownPipelineResult = MarkdownTransformResult & {
    readonly content: string;
};
|
|
100
|
+
/**
 * Parses a cached string into a `MarkdownPipelineResult`.
 * NOTE(review): returns `undefined` in some cases, presumably when the cached
 * payload cannot be used; confirm in the implementation.
 */
export declare function parseCachedMarkdownResult(cached: string): MarkdownPipelineResult | undefined;
|
|
101
|
+
/** Handler for the fetch-url tool: takes a `FetchUrlInput` and resolves to a tool response. */
export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<ToolResponseBase>;
|
|
102
|
+
/**
 * Wraps an async handler and returns a function with the same parameter and
 * result types.
 * NOTE(review): the name suggests the wrapper establishes a request context
 * when none is active; confirm in the implementation.
 */
export declare function withRequestContextIfMissing<TParams, TResult>(handler: (params: TParams) => Promise<TResult>): (params: TParams) => Promise<TResult>;
|
|
103
|
+
/** Registers this module's tools on the given MCP server instance (per the function name; returns nothing). */
export declare function registerTools(server: McpServer): void;
|
|
104
|
+
export {};
|