@j0hanz/superfetch 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -45
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +11 -5
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +5 -3
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +2 -1
- package/dist/services/cache.js +23 -7
- package/dist/services/context.d.ts +4 -4
- package/dist/services/context.js +11 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/agents.js +55 -1
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +57 -26
- package/dist/services/fetcher/response.d.ts +1 -1
- package/dist/services/fetcher/response.js +37 -16
- package/dist/services/fetcher.d.ts +1 -1
- package/dist/services/fetcher.js +9 -8
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.d.ts +5 -1
- package/dist/services/parser.js +82 -11
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +19 -16
- package/dist/tools/schemas.js +25 -4
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +37 -3
- package/dist/tools/utils/fetch-pipeline.js +26 -15
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.d.ts +5 -0
- package/dist/utils/url-validator.js +45 -3
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +4 -6
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
3
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString
|
|
3
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
4
4
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
5
5
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
6
6
|
} & {
|
|
@@ -11,6 +11,7 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
11
11
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
12
12
|
} & {
|
|
13
13
|
format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
|
|
14
|
+
includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
|
|
14
15
|
}, "strict", z.ZodTypeAny, {
|
|
15
16
|
url: string;
|
|
16
17
|
extractMainContent: boolean;
|
|
@@ -18,20 +19,22 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
18
19
|
retries: number;
|
|
19
20
|
format: "jsonl" | "markdown";
|
|
20
21
|
timeout: number;
|
|
21
|
-
customHeaders?: Record<string, string> | undefined;
|
|
22
22
|
maxContentLength?: number | undefined;
|
|
23
|
+
includeContentBlocks?: boolean | undefined;
|
|
24
|
+
customHeaders?: Record<string, string> | undefined;
|
|
23
25
|
}, {
|
|
24
26
|
url: string;
|
|
25
|
-
customHeaders?: Record<string, string> | undefined;
|
|
26
27
|
extractMainContent?: boolean | undefined;
|
|
27
28
|
includeMetadata?: boolean | undefined;
|
|
29
|
+
maxContentLength?: number | undefined;
|
|
28
30
|
retries?: number | undefined;
|
|
29
31
|
format?: "jsonl" | "markdown" | undefined;
|
|
30
|
-
|
|
32
|
+
includeContentBlocks?: boolean | undefined;
|
|
31
33
|
timeout?: number | undefined;
|
|
34
|
+
customHeaders?: Record<string, string> | undefined;
|
|
32
35
|
}>;
|
|
33
36
|
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
34
|
-
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString
|
|
37
|
+
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
35
38
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
36
39
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
37
40
|
} & {
|
|
@@ -46,16 +49,16 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
|
46
49
|
includeMetadata: boolean;
|
|
47
50
|
retries: number;
|
|
48
51
|
timeout: number;
|
|
49
|
-
customHeaders?: Record<string, string> | undefined;
|
|
50
52
|
maxContentLength?: number | undefined;
|
|
53
|
+
customHeaders?: Record<string, string> | undefined;
|
|
51
54
|
}, {
|
|
52
55
|
url: string;
|
|
53
|
-
customHeaders?: Record<string, string> | undefined;
|
|
54
56
|
extractMainContent?: boolean | undefined;
|
|
55
57
|
includeMetadata?: boolean | undefined;
|
|
56
|
-
retries?: number | undefined;
|
|
57
58
|
maxContentLength?: number | undefined;
|
|
59
|
+
retries?: number | undefined;
|
|
58
60
|
timeout?: number | undefined;
|
|
61
|
+
customHeaders?: Record<string, string> | undefined;
|
|
59
62
|
}>;
|
|
60
63
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
61
64
|
url: z.ZodString;
|
|
@@ -74,30 +77,30 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
|
74
77
|
errorCode: z.ZodOptional<z.ZodString>;
|
|
75
78
|
}, "strict", z.ZodTypeAny, {
|
|
76
79
|
url: string;
|
|
77
|
-
contentBlocks: number;
|
|
78
80
|
fetchedAt: string;
|
|
79
81
|
format: "jsonl" | "markdown";
|
|
82
|
+
contentBlocks: number;
|
|
80
83
|
cached: boolean;
|
|
81
84
|
error?: string | undefined;
|
|
82
|
-
content?: string | undefined;
|
|
83
85
|
title?: string | undefined;
|
|
84
86
|
truncated?: boolean | undefined;
|
|
85
87
|
resourceUri?: string | undefined;
|
|
86
88
|
resourceMimeType?: string | undefined;
|
|
89
|
+
content?: string | undefined;
|
|
87
90
|
contentSize?: number | undefined;
|
|
88
91
|
errorCode?: string | undefined;
|
|
89
92
|
}, {
|
|
90
93
|
url: string;
|
|
91
|
-
contentBlocks: number;
|
|
92
94
|
fetchedAt: string;
|
|
93
95
|
format: "jsonl" | "markdown";
|
|
96
|
+
contentBlocks: number;
|
|
94
97
|
cached: boolean;
|
|
95
98
|
error?: string | undefined;
|
|
96
|
-
content?: string | undefined;
|
|
97
99
|
title?: string | undefined;
|
|
98
100
|
truncated?: boolean | undefined;
|
|
99
101
|
resourceUri?: string | undefined;
|
|
100
102
|
resourceMimeType?: string | undefined;
|
|
103
|
+
content?: string | undefined;
|
|
101
104
|
contentSize?: number | undefined;
|
|
102
105
|
errorCode?: string | undefined;
|
|
103
106
|
}>;
|
|
@@ -111,13 +114,13 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
111
114
|
fileName: z.ZodString;
|
|
112
115
|
expiresAt: z.ZodString;
|
|
113
116
|
}, "strip", z.ZodTypeAny, {
|
|
114
|
-
fileName: string;
|
|
115
117
|
expiresAt: string;
|
|
116
118
|
downloadUrl: string;
|
|
117
|
-
}, {
|
|
118
119
|
fileName: string;
|
|
120
|
+
}, {
|
|
119
121
|
expiresAt: string;
|
|
120
122
|
downloadUrl: string;
|
|
123
|
+
fileName: string;
|
|
121
124
|
}>>;
|
|
122
125
|
} & {
|
|
123
126
|
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
@@ -139,9 +142,9 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
139
142
|
resourceMimeType?: string | undefined;
|
|
140
143
|
contentSize?: number | undefined;
|
|
141
144
|
file?: {
|
|
142
|
-
fileName: string;
|
|
143
145
|
expiresAt: string;
|
|
144
146
|
downloadUrl: string;
|
|
147
|
+
fileName: string;
|
|
145
148
|
} | undefined;
|
|
146
149
|
errorCode?: string | undefined;
|
|
147
150
|
}, {
|
|
@@ -156,9 +159,9 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
156
159
|
resourceMimeType?: string | undefined;
|
|
157
160
|
contentSize?: number | undefined;
|
|
158
161
|
file?: {
|
|
159
|
-
fileName: string;
|
|
160
162
|
expiresAt: string;
|
|
161
163
|
downloadUrl: string;
|
|
164
|
+
fileName: string;
|
|
162
165
|
} | undefined;
|
|
163
166
|
errorCode?: string | undefined;
|
|
164
167
|
}>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
import { config } from '../config/index.js';
|
|
3
|
+
const MAX_HEADER_NAME_LENGTH = 128;
|
|
4
|
+
const MAX_HEADER_VALUE_LENGTH = 2048;
|
|
5
|
+
const MAX_HEADER_COUNT = 50;
|
|
6
|
+
const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
|
|
7
|
+
const customHeadersSchema = z
|
|
8
|
+
.record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
|
|
9
|
+
.refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
|
|
10
|
+
message: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
|
|
11
|
+
});
|
|
3
12
|
const requestOptionsSchema = z.object({
|
|
4
|
-
customHeaders:
|
|
5
|
-
.record(z.string())
|
|
13
|
+
customHeaders: customHeadersSchema
|
|
6
14
|
.optional()
|
|
7
15
|
.describe('Custom HTTP headers for the request'),
|
|
8
16
|
timeout: z
|
|
@@ -30,6 +38,7 @@ const extractionOptionsSchema = z.object({
|
|
|
30
38
|
maxContentLength: z
|
|
31
39
|
.number()
|
|
32
40
|
.positive()
|
|
41
|
+
.max(MAX_CONTENT_LENGTH)
|
|
33
42
|
.optional()
|
|
34
43
|
.describe('Maximum content length in characters'),
|
|
35
44
|
});
|
|
@@ -38,6 +47,10 @@ const formatOptionsSchema = z.object({
|
|
|
38
47
|
.enum(['jsonl', 'markdown'])
|
|
39
48
|
.default('jsonl')
|
|
40
49
|
.describe('Output format'),
|
|
50
|
+
includeContentBlocks: z
|
|
51
|
+
.boolean()
|
|
52
|
+
.optional()
|
|
53
|
+
.describe('Include content block counts when format=markdown'),
|
|
41
54
|
});
|
|
42
55
|
const resourceFieldsSchema = z.object({
|
|
43
56
|
contentSize: z.number().optional().describe('Content length in characters'),
|
|
@@ -64,14 +77,22 @@ const fileDownloadSchema = z.object({
|
|
|
64
77
|
});
|
|
65
78
|
export const fetchUrlInputSchema = requestOptionsSchema
|
|
66
79
|
.extend({
|
|
67
|
-
url: z
|
|
80
|
+
url: z
|
|
81
|
+
.string()
|
|
82
|
+
.min(1)
|
|
83
|
+
.max(config.constants.maxUrlLength)
|
|
84
|
+
.describe('The URL to fetch'),
|
|
68
85
|
})
|
|
69
86
|
.merge(extractionOptionsSchema)
|
|
70
87
|
.merge(formatOptionsSchema)
|
|
71
88
|
.strict();
|
|
72
89
|
export const fetchMarkdownInputSchema = requestOptionsSchema
|
|
73
90
|
.extend({
|
|
74
|
-
url: z
|
|
91
|
+
url: z
|
|
92
|
+
.string()
|
|
93
|
+
.min(1)
|
|
94
|
+
.max(config.constants.maxUrlLength)
|
|
95
|
+
.describe('The URL to fetch'),
|
|
75
96
|
})
|
|
76
97
|
.merge(extractionOptionsSchema)
|
|
77
98
|
.strict();
|
|
@@ -6,22 +6,26 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
6
6
|
if (!includeMetadata)
|
|
7
7
|
return undefined;
|
|
8
8
|
const now = new Date().toISOString();
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
const metadata = {
|
|
10
|
+
type: 'metadata',
|
|
11
|
+
url,
|
|
12
|
+
fetchedAt: now,
|
|
13
|
+
};
|
|
14
|
+
if (shouldExtractFromArticle && article) {
|
|
15
|
+
if (article.title !== undefined)
|
|
16
|
+
metadata.title = article.title;
|
|
17
|
+
if (article.byline !== undefined)
|
|
18
|
+
metadata.author = article.byline;
|
|
19
|
+
return metadata;
|
|
20
|
+
}
|
|
21
|
+
if (extractedMeta.title !== undefined)
|
|
22
|
+
metadata.title = extractedMeta.title;
|
|
23
|
+
if (extractedMeta.description !== undefined) {
|
|
24
|
+
metadata.description = extractedMeta.description;
|
|
25
|
+
}
|
|
26
|
+
if (extractedMeta.author !== undefined)
|
|
27
|
+
metadata.author = extractedMeta.author;
|
|
28
|
+
return metadata;
|
|
25
29
|
}
|
|
26
30
|
export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
|
|
27
31
|
if (maxLength === undefined ||
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function transformHtmlToJsonlAsync(html: string, url: string, options: TransformOptions): Promise<JsonlTransformResult>;
|
|
3
|
+
export declare function transformHtmlToMarkdownAsync(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
4
|
+
export declare function transformHtmlToMarkdownWithBlocksAsync(html: string, url: string, options: TransformOptions & {
|
|
5
|
+
includeContentBlocks?: boolean;
|
|
6
|
+
}): Promise<JsonlTransformResult>;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { logWarn } from '../../services/logger.js';
|
|
2
|
+
import { runTransformInWorker, } from '../../services/transform-worker-pool.js';
|
|
3
|
+
import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from './content-transform.js';
|
|
4
|
+
async function runOrFallback(job, fallback) {
|
|
5
|
+
try {
|
|
6
|
+
const result = await runTransformInWorker(job);
|
|
7
|
+
if (result)
|
|
8
|
+
return result;
|
|
9
|
+
}
|
|
10
|
+
catch (error) {
|
|
11
|
+
logWarn('Transform worker unavailable; using main thread', {
|
|
12
|
+
error: error instanceof Error ? error.message : String(error),
|
|
13
|
+
});
|
|
14
|
+
}
|
|
15
|
+
return fallback();
|
|
16
|
+
}
|
|
17
|
+
export async function transformHtmlToJsonlAsync(html, url, options) {
|
|
18
|
+
const result = await runOrFallback({ mode: 'jsonl', html, url, options }, () => transformHtmlToJsonl(html, url, options));
|
|
19
|
+
return result;
|
|
20
|
+
}
|
|
21
|
+
export async function transformHtmlToMarkdownAsync(html, url, options) {
|
|
22
|
+
const result = await runOrFallback({ mode: 'markdown', html, url, options }, () => transformHtmlToMarkdown(html, url, options));
|
|
23
|
+
return result;
|
|
24
|
+
}
|
|
25
|
+
export async function transformHtmlToMarkdownWithBlocksAsync(html, url, options) {
|
|
26
|
+
const result = await runOrFallback({
|
|
27
|
+
mode: 'markdown-blocks',
|
|
28
|
+
html,
|
|
29
|
+
url,
|
|
30
|
+
options,
|
|
31
|
+
}, () => transformHtmlToMarkdownWithBlocks(html, url, options));
|
|
32
|
+
return result;
|
|
33
|
+
}
|
|
@@ -8,7 +8,10 @@ interface ContentLengthOptions {
|
|
|
8
8
|
}
|
|
9
9
|
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
10
|
}
|
|
11
|
+
interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
|
|
12
|
+
readonly includeContentBlocks?: boolean;
|
|
13
|
+
}
|
|
11
14
|
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
12
15
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
13
|
-
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options:
|
|
16
|
+
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
|
|
14
17
|
export {};
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
2
2
|
import { extractContent } from '../../services/extractor.js';
|
|
3
|
-
import { parseHtml } from '../../services/parser.js';
|
|
3
|
+
import { parseHtml, parseHtmlWithMetadata } from '../../services/parser.js';
|
|
4
4
|
import { sanitizeText } from '../../utils/sanitizer.js';
|
|
5
5
|
import { toJsonl } from '../../transformers/jsonl.transformer.js';
|
|
6
6
|
import { htmlToMarkdown } from '../../transformers/markdown.transformer.js';
|
|
@@ -56,7 +56,10 @@ function decodeHtmlEntities(value) {
|
|
|
56
56
|
}
|
|
57
57
|
function buildJsonlPayload(context, maxContentLength) {
|
|
58
58
|
const contentBlocks = parseHtml(context.sourceHtml);
|
|
59
|
-
|
|
59
|
+
return buildJsonlPayloadFromBlocks(contentBlocks, context.metadata, maxContentLength);
|
|
60
|
+
}
|
|
61
|
+
function buildJsonlPayloadFromBlocks(contentBlocks, metadata, maxContentLength) {
|
|
62
|
+
const { content, truncated } = truncateContent(toJsonl(contentBlocks, metadata), maxContentLength);
|
|
60
63
|
return {
|
|
61
64
|
content,
|
|
62
65
|
contentBlocks: contentBlocks.length,
|
|
@@ -69,6 +72,17 @@ function buildMarkdownPayload(context, maxContentLength) {
|
|
|
69
72
|
return { content, truncated };
|
|
70
73
|
}
|
|
71
74
|
export function transformHtmlToJsonl(html, url, options) {
|
|
75
|
+
if (!options.extractMainContent && options.includeMetadata) {
|
|
76
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
77
|
+
const metadataBlock = createContentMetadataBlock(url, null, parsed.metadata, false, true);
|
|
78
|
+
const { content, contentBlocks, truncated } = buildJsonlPayloadFromBlocks(parsed.blocks, metadataBlock, options.maxContentLength);
|
|
79
|
+
return {
|
|
80
|
+
content,
|
|
81
|
+
contentBlocks,
|
|
82
|
+
title: parsed.metadata.title,
|
|
83
|
+
...(truncated && { truncated }),
|
|
84
|
+
};
|
|
85
|
+
}
|
|
72
86
|
const context = resolveContentSource(html, url, options);
|
|
73
87
|
const { content, contentBlocks, truncated } = buildJsonlPayload(context, options.maxContentLength);
|
|
74
88
|
return {
|
|
@@ -88,8 +102,28 @@ export function transformHtmlToMarkdown(html, url, options) {
|
|
|
88
102
|
};
|
|
89
103
|
}
|
|
90
104
|
export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
105
|
+
const includeContentBlocks = options.includeContentBlocks ?? true;
|
|
106
|
+
if (includeContentBlocks &&
|
|
107
|
+
!options.extractMainContent &&
|
|
108
|
+
options.includeMetadata) {
|
|
109
|
+
const parsed = parseHtmlWithMetadata(html);
|
|
110
|
+
const context = {
|
|
111
|
+
sourceHtml: html,
|
|
112
|
+
title: parsed.metadata.title,
|
|
113
|
+
metadata: createContentMetadataBlock(url, null, parsed.metadata, false, true),
|
|
114
|
+
};
|
|
115
|
+
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
116
|
+
return {
|
|
117
|
+
content,
|
|
118
|
+
contentBlocks: parsed.blocks.length,
|
|
119
|
+
title: context.title,
|
|
120
|
+
...(truncated && { truncated }),
|
|
121
|
+
};
|
|
122
|
+
}
|
|
91
123
|
const context = resolveContentSource(html, url, options);
|
|
92
|
-
const contentBlocks =
|
|
124
|
+
const contentBlocks = includeContentBlocks
|
|
125
|
+
? parseHtml(context.sourceHtml)
|
|
126
|
+
: [];
|
|
93
127
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
94
128
|
return {
|
|
95
129
|
content,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import * as cache from '../../services/cache.js';
|
|
2
|
-
import {
|
|
2
|
+
import { fetchNormalizedUrlWithRetry } from '../../services/fetcher.js';
|
|
3
3
|
import { logDebug } from '../../services/logger.js';
|
|
4
|
-
import {
|
|
4
|
+
import { assertResolvedAddressesAllowed, normalizeUrl, } from '../../utils/url-validator.js';
|
|
5
5
|
import { appendHeaderVary } from './cache-vary.js';
|
|
6
6
|
function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalizedUrl) {
|
|
7
7
|
if (!cacheKey)
|
|
@@ -42,16 +42,19 @@ function attemptCacheRetrieval(cacheKey, deserialize, cacheNamespace, normalized
|
|
|
42
42
|
* @returns Promise resolving to the pipeline result
|
|
43
43
|
*/
|
|
44
44
|
export async function executeFetchPipeline(options) {
|
|
45
|
-
const normalizedUrl =
|
|
45
|
+
const { normalizedUrl, hostname } = normalizeUrl(options.url);
|
|
46
46
|
const cacheKey = resolveCacheKey(options, normalizedUrl);
|
|
47
47
|
const cachedResult = attemptCacheRetrieval(cacheKey, options.deserialize, options.cacheNamespace, normalizedUrl);
|
|
48
48
|
if (cachedResult)
|
|
49
49
|
return cachedResult;
|
|
50
|
+
await assertResolvedAddressesAllowed(hostname);
|
|
50
51
|
const fetchOptions = buildFetchOptions(options);
|
|
51
52
|
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
52
|
-
const html = await
|
|
53
|
-
const data = options.transform(html, normalizedUrl);
|
|
54
|
-
|
|
53
|
+
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
54
|
+
const data = await options.transform(html, normalizedUrl);
|
|
55
|
+
if (cache.isEnabled()) {
|
|
56
|
+
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
57
|
+
}
|
|
55
58
|
return buildPipelineResult(normalizedUrl, data, cacheKey);
|
|
56
59
|
}
|
|
57
60
|
function resolveCacheKey(options, normalizedUrl) {
|
|
@@ -59,20 +62,28 @@ function resolveCacheKey(options, normalizedUrl) {
|
|
|
59
62
|
return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
|
|
60
63
|
}
|
|
61
64
|
function buildFetchOptions(options) {
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
65
|
+
const fetchOptions = {};
|
|
66
|
+
if (options.customHeaders !== undefined) {
|
|
67
|
+
fetchOptions.customHeaders = options.customHeaders;
|
|
68
|
+
}
|
|
69
|
+
if (options.signal !== undefined) {
|
|
70
|
+
fetchOptions.signal = options.signal;
|
|
71
|
+
}
|
|
72
|
+
if (options.timeout !== undefined) {
|
|
73
|
+
fetchOptions.timeout = options.timeout;
|
|
74
|
+
}
|
|
75
|
+
return fetchOptions;
|
|
67
76
|
}
|
|
68
77
|
function persistCache(cacheKey, data, serialize, normalizedUrl) {
|
|
69
78
|
if (!cacheKey)
|
|
70
79
|
return;
|
|
71
80
|
const serializer = serialize ?? JSON.stringify;
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
81
|
+
const metadata = { url: normalizedUrl };
|
|
82
|
+
const title = extractTitle(data);
|
|
83
|
+
if (title !== undefined) {
|
|
84
|
+
metadata.title = title;
|
|
85
|
+
}
|
|
86
|
+
cache.set(cacheKey, serializer(data), metadata);
|
|
76
87
|
}
|
|
77
88
|
function extractTitle(value) {
|
|
78
89
|
if (!value || typeof value !== 'object')
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare function cleanParagraph(text: string): string | null;
|
|
2
2
|
export declare function cleanHeading(text: string): string | null;
|
|
3
|
-
export declare function cleanListItems(items: string[]): string[];
|
|
3
|
+
export declare function cleanListItems(items: readonly string[]): readonly string[];
|
|
4
4
|
export declare function cleanCodeBlock(code: string): string | null;
|
|
5
5
|
export declare function removeInlineTimestamps(text: string): string;
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
+
import { config } from '../config/index.js';
|
|
1
2
|
import type { FileDownloadInfo } from '../config/types/tools.js';
|
|
3
|
+
import * as cache from '../services/cache.js';
|
|
4
|
+
import { generateSafeFilename } from './filename-generator.js';
|
|
2
5
|
interface DownloadInfoOptions {
|
|
3
6
|
cacheKey: string | null;
|
|
4
7
|
url: string;
|
|
5
8
|
title?: string;
|
|
6
9
|
}
|
|
7
|
-
|
|
10
|
+
interface DownloadInfoDeps {
|
|
11
|
+
readonly config?: typeof config;
|
|
12
|
+
readonly cache?: Pick<typeof cache, 'get' | 'parseCacheKey'>;
|
|
13
|
+
readonly generateSafeFilename?: typeof generateSafeFilename;
|
|
14
|
+
}
|
|
15
|
+
export declare function buildFileDownloadInfo(options: DownloadInfoOptions, deps?: DownloadInfoDeps): FileDownloadInfo | null;
|
|
8
16
|
export {};
|
|
@@ -1,22 +1,25 @@
|
|
|
1
1
|
import { config } from '../config/index.js';
|
|
2
2
|
import * as cache from '../services/cache.js';
|
|
3
3
|
import { generateSafeFilename } from './filename-generator.js';
|
|
4
|
-
export function buildFileDownloadInfo(options) {
|
|
5
|
-
|
|
4
|
+
export function buildFileDownloadInfo(options, deps = {}) {
|
|
5
|
+
const resolvedConfig = deps.config ?? config;
|
|
6
|
+
const resolvedCache = deps.cache ?? cache;
|
|
7
|
+
const resolveFilename = deps.generateSafeFilename ?? generateSafeFilename;
|
|
8
|
+
if (!resolvedConfig.runtime.httpMode) {
|
|
6
9
|
return null;
|
|
7
10
|
}
|
|
8
|
-
if (!
|
|
11
|
+
if (!resolvedConfig.cache.enabled || !options.cacheKey) {
|
|
9
12
|
return null;
|
|
10
13
|
}
|
|
11
|
-
const parts =
|
|
14
|
+
const parts = resolvedCache.parseCacheKey(options.cacheKey);
|
|
12
15
|
if (!parts)
|
|
13
16
|
return null;
|
|
14
|
-
const cacheEntry =
|
|
17
|
+
const cacheEntry = resolvedCache.get(options.cacheKey);
|
|
15
18
|
if (!cacheEntry)
|
|
16
19
|
return null;
|
|
17
20
|
const { expiresAt, title, url } = cacheEntry;
|
|
18
21
|
const downloadUrl = buildDownloadUrl(parts.namespace, parts.urlHash);
|
|
19
|
-
const fileName =
|
|
22
|
+
const fileName = resolveFilename(url, title ?? options.title, parts.urlHash, resolveExtension(parts.namespace));
|
|
20
23
|
return { downloadUrl, fileName, expiresAt };
|
|
21
24
|
}
|
|
22
25
|
function buildDownloadUrl(namespace, hash) {
|
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
import type { ToolErrorResponse } from '../config/types/tools.js';
|
|
2
|
-
export declare function createToolErrorResponse(message: string, url: string, code: string): ToolErrorResponse;
|
|
3
|
-
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string): ToolErrorResponse;
|
|
2
|
+
export declare function createToolErrorResponse(message: string, url: string, code: string, details?: Record<string, unknown>): ToolErrorResponse;
|
|
3
|
+
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string, details?: Record<string, unknown>): ToolErrorResponse;
|
|
@@ -22,12 +22,12 @@ function normalizeToolErrorCode(code) {
|
|
|
22
22
|
return String(ErrorCode.InternalError);
|
|
23
23
|
return MCP_ERROR_CODE_MAP[code] ?? code;
|
|
24
24
|
}
|
|
25
|
-
export function createToolErrorResponse(message, url, code) {
|
|
25
|
+
export function createToolErrorResponse(message, url, code, details = {}) {
|
|
26
26
|
const structuredContent = {
|
|
27
|
+
...details,
|
|
27
28
|
error: message,
|
|
28
29
|
url,
|
|
29
30
|
errorCode: normalizeToolErrorCode(code),
|
|
30
|
-
errorType: code,
|
|
31
31
|
};
|
|
32
32
|
return {
|
|
33
33
|
content: [{ type: 'text', text: JSON.stringify(structuredContent) }],
|
|
@@ -42,19 +42,19 @@ function formatErrorMessage(baseMessage, error, fallback) {
|
|
|
42
42
|
}
|
|
43
43
|
return message;
|
|
44
44
|
}
|
|
45
|
-
export function handleToolError(error, url, fallbackMessage = 'Operation failed') {
|
|
45
|
+
export function handleToolError(error, url, fallbackMessage = 'Operation failed', details = {}) {
|
|
46
46
|
if (isValidationError(error)) {
|
|
47
|
-
return createToolErrorResponse(error.message, url, 'VALIDATION_ERROR');
|
|
47
|
+
return createToolErrorResponse(error.message, url, 'VALIDATION_ERROR', details);
|
|
48
48
|
}
|
|
49
49
|
if (error instanceof FetchError) {
|
|
50
50
|
const message = formatErrorMessage(error.message, error);
|
|
51
|
-
return createToolErrorResponse(message, url, error.code);
|
|
51
|
+
return createToolErrorResponse(message, url, error.code, details);
|
|
52
52
|
}
|
|
53
53
|
if (error instanceof Error) {
|
|
54
54
|
const message = formatErrorMessage(error.message, error, fallbackMessage);
|
|
55
|
-
return createToolErrorResponse(message, url, 'UNKNOWN_ERROR');
|
|
55
|
+
return createToolErrorResponse(message, url, 'UNKNOWN_ERROR', details);
|
|
56
56
|
}
|
|
57
|
-
return createToolErrorResponse(`${fallbackMessage}: Unknown error`, url, 'UNKNOWN_ERROR');
|
|
57
|
+
return createToolErrorResponse(`${fallbackMessage}: Unknown error`, url, 'UNKNOWN_ERROR', details);
|
|
58
58
|
}
|
|
59
59
|
function isValidationError(error) {
|
|
60
60
|
return (error instanceof Error &&
|
|
@@ -1,2 +1,7 @@
|
|
|
1
1
|
export declare function isBlockedIp(ip: string): boolean;
|
|
2
|
+
export declare function assertResolvedAddressesAllowed(hostname: string): Promise<void>;
|
|
3
|
+
export declare function normalizeUrl(urlString: string): {
|
|
4
|
+
normalizedUrl: string;
|
|
5
|
+
hostname: string;
|
|
6
|
+
};
|
|
2
7
|
export declare function validateAndNormalizeUrl(urlString: string): Promise<string>;
|
|
@@ -32,6 +32,35 @@ for (const entry of BLOCKED_IPV6_SUBNETS) {
|
|
|
32
32
|
BLOCK_LIST.addSubnet(entry.subnet, entry.prefix, 'ipv6');
|
|
33
33
|
}
|
|
34
34
|
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
35
|
+
const DNS_DECISION_TTL_MS = 60000;
|
|
36
|
+
const DNS_DECISION_MAX = 1000;
|
|
37
|
+
const dnsDecisionCache = new Map();
|
|
38
|
+
function getCachedDnsDecision(hostname) {
|
|
39
|
+
const cached = dnsDecisionCache.get(hostname);
|
|
40
|
+
if (!cached)
|
|
41
|
+
return null;
|
|
42
|
+
if (cached.expiresAt <= Date.now()) {
|
|
43
|
+
dnsDecisionCache.delete(hostname);
|
|
44
|
+
return null;
|
|
45
|
+
}
|
|
46
|
+
return cached;
|
|
47
|
+
}
|
|
48
|
+
function setCachedDnsDecision(hostname, ok) {
|
|
49
|
+
dnsDecisionCache.set(hostname, {
|
|
50
|
+
ok,
|
|
51
|
+
expiresAt: Date.now() + DNS_DECISION_TTL_MS,
|
|
52
|
+
});
|
|
53
|
+
if (dnsDecisionCache.size <= DNS_DECISION_MAX)
|
|
54
|
+
return;
|
|
55
|
+
const evictCount = Math.ceil(DNS_DECISION_MAX * 0.05);
|
|
56
|
+
const iterator = dnsDecisionCache.keys();
|
|
57
|
+
for (let i = 0; i < evictCount; i++) {
|
|
58
|
+
const { value, done } = iterator.next();
|
|
59
|
+
if (done)
|
|
60
|
+
break;
|
|
61
|
+
dnsDecisionCache.delete(value);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
35
64
|
function matchesBlockedIpPatterns(resolvedIp) {
|
|
36
65
|
for (const pattern of config.security.blockedIpPatterns) {
|
|
37
66
|
if (pattern.test(resolvedIp)) {
|
|
@@ -78,7 +107,14 @@ function lookupWithTimeout(hostname) {
|
|
|
78
107
|
});
|
|
79
108
|
});
|
|
80
109
|
}
|
|
81
|
-
async function assertResolvedAddressesAllowed(hostname) {
|
|
110
|
+
export async function assertResolvedAddressesAllowed(hostname) {
|
|
111
|
+
const cached = getCachedDnsDecision(hostname);
|
|
112
|
+
if (cached) {
|
|
113
|
+
if (!cached.ok) {
|
|
114
|
+
throw createValidationError(`Blocked IP range resolved from hostname: ${hostname}`);
|
|
115
|
+
}
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
82
118
|
try {
|
|
83
119
|
const result = await lookupWithTimeout(hostname);
|
|
84
120
|
const addresses = Array.isArray(result) ? result : [result];
|
|
@@ -87,9 +123,11 @@ async function assertResolvedAddressesAllowed(hostname) {
|
|
|
87
123
|
}
|
|
88
124
|
for (const { address } of addresses) {
|
|
89
125
|
if (isBlockedIp(address.toLowerCase())) {
|
|
126
|
+
setCachedDnsDecision(hostname, false);
|
|
90
127
|
throw createValidationError(`Blocked IP range resolved from hostname: ${hostname}`);
|
|
91
128
|
}
|
|
92
129
|
}
|
|
130
|
+
setCachedDnsDecision(hostname, true);
|
|
93
131
|
}
|
|
94
132
|
catch (error) {
|
|
95
133
|
const code = error?.code;
|
|
@@ -102,7 +140,7 @@ async function assertResolvedAddressesAllowed(hostname) {
|
|
|
102
140
|
throw createValidationError(String(error));
|
|
103
141
|
}
|
|
104
142
|
}
|
|
105
|
-
export
|
|
143
|
+
export function normalizeUrl(urlString) {
|
|
106
144
|
const trimmedUrl = requireTrimmedUrl(urlString);
|
|
107
145
|
assertUrlLength(trimmedUrl);
|
|
108
146
|
const url = parseUrl(trimmedUrl);
|
|
@@ -110,8 +148,12 @@ export async function validateAndNormalizeUrl(urlString) {
|
|
|
110
148
|
assertNoCredentials(url);
|
|
111
149
|
const hostname = normalizeHostname(url);
|
|
112
150
|
assertHostnameAllowed(hostname);
|
|
151
|
+
return { normalizedUrl: url.href, hostname };
|
|
152
|
+
}
|
|
153
|
+
export async function validateAndNormalizeUrl(urlString) {
|
|
154
|
+
const { normalizedUrl, hostname } = normalizeUrl(urlString);
|
|
113
155
|
await assertResolvedAddressesAllowed(hostname);
|
|
114
|
-
return
|
|
156
|
+
return normalizedUrl;
|
|
115
157
|
}
|
|
116
158
|
const VALIDATION_ERROR_CODE = 'VALIDATION_ERROR';
|
|
117
159
|
function createValidationError(message) {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|