@j0hanz/superfetch 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -45
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +9 -4
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +2 -1
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/cache.js +20 -7
- package/dist/services/context.d.ts +2 -4
- package/dist/services/context.js +1 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +18 -8
- package/dist/services/fetcher/response.js +32 -24
- package/dist/services/fetcher.d.ts +0 -1
- package/dist/services/fetcher.js +5 -7
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.js +26 -25
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +33 -30
- package/dist/tools/schemas.js +4 -0
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +7 -2
- package/dist/tools/utils/fetch-pipeline.js +18 -10
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.js +38 -0
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +4 -6
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
2
2
|
import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
|
|
3
|
+
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
3
4
|
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
4
5
|
type SharedFetchFormat = 'jsonl' | 'markdown';
|
|
5
6
|
interface SharedFetchOptions<T extends {
|
|
@@ -10,17 +11,21 @@ interface SharedFetchOptions<T extends {
|
|
|
10
11
|
readonly extractMainContent: boolean;
|
|
11
12
|
readonly includeMetadata: boolean;
|
|
12
13
|
readonly maxContentLength?: number;
|
|
14
|
+
readonly includeContentBlocks?: boolean;
|
|
13
15
|
readonly cacheVariant?: string;
|
|
14
16
|
readonly customHeaders?: Record<string, string>;
|
|
15
17
|
readonly retries?: number;
|
|
16
18
|
readonly timeout?: number;
|
|
17
|
-
readonly transform: (html: string, normalizedUrl: string) => T;
|
|
19
|
+
readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
|
|
18
20
|
readonly serialize?: (result: T) => string;
|
|
19
21
|
readonly deserialize?: (cached: string) => T | undefined;
|
|
20
22
|
}
|
|
23
|
+
interface SharedFetchDeps {
|
|
24
|
+
readonly executeFetchPipeline?: typeof executeFetchPipeline;
|
|
25
|
+
}
|
|
21
26
|
export declare function performSharedFetch<T extends {
|
|
22
27
|
content: string;
|
|
23
|
-
}>(options: SharedFetchOptions<T>): Promise<{
|
|
28
|
+
}>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
|
|
24
29
|
pipeline: PipelineResult<T>;
|
|
25
30
|
inlineResult: ReturnType<typeof applyInlineContentLimit>;
|
|
26
31
|
}>;
|
|
@@ -31,7 +36,7 @@ interface DownloadContext {
|
|
|
31
36
|
title?: string;
|
|
32
37
|
}
|
|
33
38
|
export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
|
|
34
|
-
export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string): ToolResponseBase | null;
|
|
39
|
+
export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
|
|
35
40
|
export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
|
|
36
41
|
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
|
|
37
42
|
export {};
|
|
@@ -5,7 +5,8 @@ import { createToolErrorResponse } from '../../utils/tool-error-handler.js';
|
|
|
5
5
|
import { appendHeaderVary } from '../utils/cache-vary.js';
|
|
6
6
|
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
7
7
|
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
8
|
-
export async function performSharedFetch(options) {
|
|
8
|
+
export async function performSharedFetch(options, deps = {}) {
|
|
9
|
+
const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
|
|
9
10
|
const cacheNamespace = options.format === 'markdown' ? 'markdown' : 'url';
|
|
10
11
|
const cacheVary = appendHeaderVary({
|
|
11
12
|
format: options.format,
|
|
@@ -13,33 +14,54 @@ export async function performSharedFetch(options) {
|
|
|
13
14
|
includeMetadata: options.includeMetadata,
|
|
14
15
|
maxContentLength: options.maxContentLength,
|
|
15
16
|
...(options.cacheVariant ? { variant: options.cacheVariant } : {}),
|
|
16
|
-
...(options.format === 'markdown'
|
|
17
|
+
...(options.format === 'markdown'
|
|
18
|
+
? { includeContentBlocks: options.includeContentBlocks }
|
|
19
|
+
: { contentBlocks: true }),
|
|
17
20
|
}, options.customHeaders);
|
|
18
|
-
const pipeline = await executeFetchPipeline({
|
|
21
|
+
const pipelineOptions = {
|
|
19
22
|
url: options.url,
|
|
20
23
|
cacheNamespace,
|
|
21
|
-
customHeaders: options.customHeaders,
|
|
22
|
-
retries: options.retries,
|
|
23
|
-
timeout: options.timeout,
|
|
24
|
-
cacheVary,
|
|
25
24
|
transform: options.transform,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
};
|
|
26
|
+
if (options.customHeaders !== undefined) {
|
|
27
|
+
pipelineOptions.customHeaders = options.customHeaders;
|
|
28
|
+
}
|
|
29
|
+
if (options.retries !== undefined) {
|
|
30
|
+
pipelineOptions.retries = options.retries;
|
|
31
|
+
}
|
|
32
|
+
if (options.timeout !== undefined) {
|
|
33
|
+
pipelineOptions.timeout = options.timeout;
|
|
34
|
+
}
|
|
35
|
+
if (cacheVary !== undefined) {
|
|
36
|
+
pipelineOptions.cacheVary = cacheVary;
|
|
37
|
+
}
|
|
38
|
+
if (options.serialize !== undefined) {
|
|
39
|
+
pipelineOptions.serialize = options.serialize;
|
|
40
|
+
}
|
|
41
|
+
if (options.deserialize !== undefined) {
|
|
42
|
+
pipelineOptions.deserialize = options.deserialize;
|
|
43
|
+
}
|
|
44
|
+
const pipeline = await executePipeline(pipelineOptions);
|
|
29
45
|
const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null, options.format);
|
|
30
46
|
return { pipeline, inlineResult };
|
|
31
47
|
}
|
|
32
48
|
export function getFileDownloadInfo(context) {
|
|
33
|
-
|
|
49
|
+
const infoOptions = {
|
|
34
50
|
cacheKey: context.cacheKey,
|
|
35
51
|
url: context.url,
|
|
36
|
-
|
|
37
|
-
|
|
52
|
+
};
|
|
53
|
+
if (context.title !== undefined) {
|
|
54
|
+
return buildFileDownloadInfo({
|
|
55
|
+
...infoOptions,
|
|
56
|
+
title: context.title,
|
|
57
|
+
});
|
|
58
|
+
}
|
|
59
|
+
return buildFileDownloadInfo(infoOptions);
|
|
38
60
|
}
|
|
39
|
-
export function getInlineErrorResponse(inlineResult, url) {
|
|
61
|
+
export function getInlineErrorResponse(inlineResult, url, details) {
|
|
40
62
|
if (!inlineResult.error)
|
|
41
63
|
return null;
|
|
42
|
-
return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR');
|
|
64
|
+
return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR', details);
|
|
43
65
|
}
|
|
44
66
|
export function applyInlineResultToStructuredContent(structuredContent, inlineResult, contentKey) {
|
|
45
67
|
if (inlineResult.truncated) {
|
|
@@ -60,13 +82,16 @@ function buildResourceLink(inlineResult, name) {
|
|
|
60
82
|
if (!inlineResult.resourceUri) {
|
|
61
83
|
return null;
|
|
62
84
|
}
|
|
63
|
-
|
|
85
|
+
const block = {
|
|
64
86
|
type: 'resource_link',
|
|
65
87
|
uri: inlineResult.resourceUri,
|
|
66
88
|
name,
|
|
67
|
-
mimeType: inlineResult.resourceMimeType,
|
|
68
89
|
description: `Content exceeds inline limit (${config.constants.maxInlineContentChars} chars)`,
|
|
69
90
|
};
|
|
91
|
+
if (inlineResult.resourceMimeType !== undefined) {
|
|
92
|
+
block.mimeType = inlineResult.resourceMimeType;
|
|
93
|
+
}
|
|
94
|
+
return block;
|
|
70
95
|
}
|
|
71
96
|
function buildEmbeddedResource(content, mimeType, url, title) {
|
|
72
97
|
if (!content) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { config } from '../../config/index.js';
|
|
2
2
|
import { logDebug, logError } from '../../services/logger.js';
|
|
3
3
|
import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
|
|
4
|
-
import {
|
|
4
|
+
import { transformHtmlToJsonlAsync, transformHtmlToMarkdownWithBlocksAsync, } from '../utils/content-transform-async.js';
|
|
5
5
|
import { applyInlineResultToStructuredContent, buildToolContentBlocks, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
|
|
6
6
|
export const FETCH_URL_TOOL_NAME = 'fetch-url';
|
|
7
7
|
export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. Supports custom headers, retries, and content length limits.';
|
|
@@ -37,17 +37,36 @@ function deserializeJsonlTransformResult(cached) {
|
|
|
37
37
|
}
|
|
38
38
|
}
|
|
39
39
|
function resolveFetchUrlOptions(input) {
|
|
40
|
+
const format = input.format ?? 'jsonl';
|
|
40
41
|
return {
|
|
41
42
|
extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
|
|
42
43
|
includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
|
|
43
|
-
|
|
44
|
-
|
|
44
|
+
format,
|
|
45
|
+
includeContentBlocks: input.includeContentBlocks ?? (format === 'markdown' ? false : true),
|
|
46
|
+
...(input.maxContentLength !== undefined && {
|
|
47
|
+
maxContentLength: input.maxContentLength,
|
|
48
|
+
}),
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
function buildFetchUrlErrorDetails(format) {
|
|
52
|
+
return {
|
|
53
|
+
contentBlocks: 0,
|
|
54
|
+
fetchedAt: new Date().toISOString(),
|
|
55
|
+
format,
|
|
56
|
+
cached: false,
|
|
45
57
|
};
|
|
46
58
|
}
|
|
47
59
|
function buildFetchUrlTransform(options) {
|
|
48
|
-
return (html, url) => options.format === 'markdown'
|
|
49
|
-
?
|
|
50
|
-
|
|
60
|
+
return async (html, url) => options.format === 'markdown'
|
|
61
|
+
? transformHtmlToMarkdownWithBlocksAsync(html, url, {
|
|
62
|
+
extractMainContent: options.extractMainContent,
|
|
63
|
+
includeMetadata: options.includeMetadata,
|
|
64
|
+
...(options.maxContentLength !== undefined && {
|
|
65
|
+
maxContentLength: options.maxContentLength,
|
|
66
|
+
}),
|
|
67
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
68
|
+
})
|
|
69
|
+
: transformHtmlToJsonlAsync(html, url, options);
|
|
51
70
|
}
|
|
52
71
|
function buildFetchUrlStructuredContent(format, pipeline, inlineResult) {
|
|
53
72
|
const structuredContent = {
|
|
@@ -74,22 +93,31 @@ function logFetchUrlStart(url, options) {
|
|
|
74
93
|
extractMainContent: options.extractMainContent,
|
|
75
94
|
includeMetadata: options.includeMetadata,
|
|
76
95
|
format: options.format,
|
|
96
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
77
97
|
});
|
|
78
98
|
}
|
|
79
99
|
async function fetchUrlPipeline(url, input, options) {
|
|
80
|
-
|
|
100
|
+
const sharedOptions = {
|
|
81
101
|
url,
|
|
82
102
|
format: options.format,
|
|
83
103
|
extractMainContent: options.extractMainContent,
|
|
84
104
|
includeMetadata: options.includeMetadata,
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
105
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
106
|
+
...(options.maxContentLength !== undefined && {
|
|
107
|
+
maxContentLength: options.maxContentLength,
|
|
108
|
+
}),
|
|
109
|
+
...(input.customHeaders !== undefined && {
|
|
110
|
+
customHeaders: input.customHeaders,
|
|
111
|
+
}),
|
|
112
|
+
...(input.retries !== undefined && { retries: input.retries }),
|
|
113
|
+
...(input.timeout !== undefined && { timeout: input.timeout }),
|
|
114
|
+
...(options.format === 'markdown' && {
|
|
115
|
+
cacheVariant: 'markdown-with-blocks',
|
|
116
|
+
}),
|
|
90
117
|
transform: buildFetchUrlTransform(options),
|
|
91
118
|
deserialize: deserializeJsonlTransformResult,
|
|
92
|
-
}
|
|
119
|
+
};
|
|
120
|
+
return performSharedFetch(sharedOptions);
|
|
93
121
|
}
|
|
94
122
|
function buildFetchUrlResponse(pipeline, inlineResult, format) {
|
|
95
123
|
const structuredContent = buildFetchUrlStructuredContent(format, pipeline, inlineResult);
|
|
@@ -104,18 +132,20 @@ export async function fetchUrlToolHandler(input) {
|
|
|
104
132
|
}
|
|
105
133
|
catch (error) {
|
|
106
134
|
logError('fetch-url tool error', error instanceof Error ? error : undefined);
|
|
107
|
-
|
|
135
|
+
const errorDetails = buildFetchUrlErrorDetails(input.format ?? 'jsonl');
|
|
136
|
+
return handleToolError(error, input.url, 'Failed to fetch URL', errorDetails);
|
|
108
137
|
}
|
|
109
138
|
}
|
|
110
139
|
async function executeFetchUrl(input) {
|
|
111
140
|
const { url } = input;
|
|
141
|
+
const format = input.format ?? 'jsonl';
|
|
112
142
|
if (!url) {
|
|
113
|
-
return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
|
|
143
|
+
return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchUrlErrorDetails(format));
|
|
114
144
|
}
|
|
115
145
|
const options = resolveFetchUrlOptions(input);
|
|
116
146
|
logFetchUrlStart(url, options);
|
|
117
147
|
const { pipeline, inlineResult } = await fetchUrlPipeline(url, input, options);
|
|
118
|
-
const inlineError = getInlineErrorResponse(inlineResult, url);
|
|
148
|
+
const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchUrlErrorDetails(options.format));
|
|
119
149
|
if (inlineError)
|
|
120
150
|
return inlineError;
|
|
121
151
|
return buildFetchUrlResponse(pipeline, inlineResult, options.format);
|
package/dist/tools/index.js
CHANGED
|
@@ -9,6 +9,12 @@ const TOOL_DEFINITIONS = [
|
|
|
9
9
|
inputSchema: fetchUrlInputSchema,
|
|
10
10
|
outputSchema: fetchUrlOutputSchema,
|
|
11
11
|
handler: fetchUrlToolHandler,
|
|
12
|
+
annotations: {
|
|
13
|
+
readOnlyHint: true,
|
|
14
|
+
destructiveHint: false,
|
|
15
|
+
idempotentHint: true,
|
|
16
|
+
openWorldHint: true,
|
|
17
|
+
},
|
|
12
18
|
},
|
|
13
19
|
{
|
|
14
20
|
name: FETCH_MARKDOWN_TOOL_NAME,
|
|
@@ -17,6 +23,12 @@ const TOOL_DEFINITIONS = [
|
|
|
17
23
|
inputSchema: fetchMarkdownInputSchema,
|
|
18
24
|
outputSchema: fetchMarkdownOutputSchema,
|
|
19
25
|
handler: fetchMarkdownToolHandler,
|
|
26
|
+
annotations: {
|
|
27
|
+
readOnlyHint: true,
|
|
28
|
+
destructiveHint: false,
|
|
29
|
+
idempotentHint: true,
|
|
30
|
+
openWorldHint: true,
|
|
31
|
+
},
|
|
20
32
|
},
|
|
21
33
|
];
|
|
22
34
|
export function registerTools(server) {
|
|
@@ -26,6 +38,7 @@ export function registerTools(server) {
|
|
|
26
38
|
description: tool.description,
|
|
27
39
|
inputSchema: tool.inputSchema,
|
|
28
40
|
outputSchema: tool.outputSchema,
|
|
41
|
+
annotations: tool.annotations,
|
|
29
42
|
}, tool.handler);
|
|
30
43
|
}
|
|
31
44
|
}
|
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -11,24 +11,27 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
11
11
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
12
12
|
} & {
|
|
13
13
|
format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
|
|
14
|
+
includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
|
|
14
15
|
}, "strict", z.ZodTypeAny, {
|
|
15
16
|
url: string;
|
|
16
|
-
timeout: number;
|
|
17
|
-
retries: number;
|
|
18
17
|
extractMainContent: boolean;
|
|
19
18
|
includeMetadata: boolean;
|
|
19
|
+
retries: number;
|
|
20
20
|
format: "jsonl" | "markdown";
|
|
21
|
-
|
|
21
|
+
timeout: number;
|
|
22
22
|
maxContentLength?: number | undefined;
|
|
23
|
+
includeContentBlocks?: boolean | undefined;
|
|
24
|
+
customHeaders?: Record<string, string> | undefined;
|
|
23
25
|
}, {
|
|
24
26
|
url: string;
|
|
25
|
-
customHeaders?: Record<string, string> | undefined;
|
|
26
|
-
timeout?: number | undefined;
|
|
27
|
-
retries?: number | undefined;
|
|
28
27
|
extractMainContent?: boolean | undefined;
|
|
29
28
|
includeMetadata?: boolean | undefined;
|
|
30
29
|
maxContentLength?: number | undefined;
|
|
30
|
+
retries?: number | undefined;
|
|
31
31
|
format?: "jsonl" | "markdown" | undefined;
|
|
32
|
+
includeContentBlocks?: boolean | undefined;
|
|
33
|
+
timeout?: number | undefined;
|
|
34
|
+
customHeaders?: Record<string, string> | undefined;
|
|
32
35
|
}>;
|
|
33
36
|
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
34
37
|
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
@@ -42,20 +45,20 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
|
42
45
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
43
46
|
}, "strict", z.ZodTypeAny, {
|
|
44
47
|
url: string;
|
|
45
|
-
timeout: number;
|
|
46
|
-
retries: number;
|
|
47
48
|
extractMainContent: boolean;
|
|
48
49
|
includeMetadata: boolean;
|
|
49
|
-
|
|
50
|
+
retries: number;
|
|
51
|
+
timeout: number;
|
|
50
52
|
maxContentLength?: number | undefined;
|
|
53
|
+
customHeaders?: Record<string, string> | undefined;
|
|
51
54
|
}, {
|
|
52
55
|
url: string;
|
|
53
|
-
customHeaders?: Record<string, string> | undefined;
|
|
54
|
-
timeout?: number | undefined;
|
|
55
|
-
retries?: number | undefined;
|
|
56
56
|
extractMainContent?: boolean | undefined;
|
|
57
57
|
includeMetadata?: boolean | undefined;
|
|
58
58
|
maxContentLength?: number | undefined;
|
|
59
|
+
retries?: number | undefined;
|
|
60
|
+
timeout?: number | undefined;
|
|
61
|
+
customHeaders?: Record<string, string> | undefined;
|
|
59
62
|
}>;
|
|
60
63
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
61
64
|
url: z.ZodString;
|
|
@@ -74,31 +77,31 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
|
74
77
|
errorCode: z.ZodOptional<z.ZodString>;
|
|
75
78
|
}, "strict", z.ZodTypeAny, {
|
|
76
79
|
url: string;
|
|
80
|
+
fetchedAt: string;
|
|
77
81
|
format: "jsonl" | "markdown";
|
|
78
82
|
contentBlocks: number;
|
|
79
|
-
fetchedAt: string;
|
|
80
83
|
cached: boolean;
|
|
81
84
|
error?: string | undefined;
|
|
82
85
|
title?: string | undefined;
|
|
83
|
-
|
|
84
|
-
contentSize?: number | undefined;
|
|
86
|
+
truncated?: boolean | undefined;
|
|
85
87
|
resourceUri?: string | undefined;
|
|
86
88
|
resourceMimeType?: string | undefined;
|
|
87
|
-
|
|
89
|
+
content?: string | undefined;
|
|
90
|
+
contentSize?: number | undefined;
|
|
88
91
|
errorCode?: string | undefined;
|
|
89
92
|
}, {
|
|
90
93
|
url: string;
|
|
94
|
+
fetchedAt: string;
|
|
91
95
|
format: "jsonl" | "markdown";
|
|
92
96
|
contentBlocks: number;
|
|
93
|
-
fetchedAt: string;
|
|
94
97
|
cached: boolean;
|
|
95
98
|
error?: string | undefined;
|
|
96
99
|
title?: string | undefined;
|
|
97
|
-
|
|
98
|
-
contentSize?: number | undefined;
|
|
100
|
+
truncated?: boolean | undefined;
|
|
99
101
|
resourceUri?: string | undefined;
|
|
100
102
|
resourceMimeType?: string | undefined;
|
|
101
|
-
|
|
103
|
+
content?: string | undefined;
|
|
104
|
+
contentSize?: number | undefined;
|
|
102
105
|
errorCode?: string | undefined;
|
|
103
106
|
}>;
|
|
104
107
|
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
@@ -111,13 +114,13 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
111
114
|
fileName: z.ZodString;
|
|
112
115
|
expiresAt: z.ZodString;
|
|
113
116
|
}, "strip", z.ZodTypeAny, {
|
|
117
|
+
expiresAt: string;
|
|
114
118
|
downloadUrl: string;
|
|
115
119
|
fileName: string;
|
|
116
|
-
expiresAt: string;
|
|
117
120
|
}, {
|
|
121
|
+
expiresAt: string;
|
|
118
122
|
downloadUrl: string;
|
|
119
123
|
fileName: string;
|
|
120
|
-
expiresAt: string;
|
|
121
124
|
}>>;
|
|
122
125
|
} & {
|
|
123
126
|
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
@@ -134,16 +137,16 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
134
137
|
error?: string | undefined;
|
|
135
138
|
markdown?: string | undefined;
|
|
136
139
|
title?: string | undefined;
|
|
137
|
-
|
|
140
|
+
truncated?: boolean | undefined;
|
|
138
141
|
resourceUri?: string | undefined;
|
|
139
142
|
resourceMimeType?: string | undefined;
|
|
140
|
-
|
|
141
|
-
errorCode?: string | undefined;
|
|
143
|
+
contentSize?: number | undefined;
|
|
142
144
|
file?: {
|
|
145
|
+
expiresAt: string;
|
|
143
146
|
downloadUrl: string;
|
|
144
147
|
fileName: string;
|
|
145
|
-
expiresAt: string;
|
|
146
148
|
} | undefined;
|
|
149
|
+
errorCode?: string | undefined;
|
|
147
150
|
}, {
|
|
148
151
|
url: string;
|
|
149
152
|
fetchedAt: string;
|
|
@@ -151,14 +154,14 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
151
154
|
error?: string | undefined;
|
|
152
155
|
markdown?: string | undefined;
|
|
153
156
|
title?: string | undefined;
|
|
154
|
-
|
|
157
|
+
truncated?: boolean | undefined;
|
|
155
158
|
resourceUri?: string | undefined;
|
|
156
159
|
resourceMimeType?: string | undefined;
|
|
157
|
-
|
|
158
|
-
errorCode?: string | undefined;
|
|
160
|
+
contentSize?: number | undefined;
|
|
159
161
|
file?: {
|
|
162
|
+
expiresAt: string;
|
|
160
163
|
downloadUrl: string;
|
|
161
164
|
fileName: string;
|
|
162
|
-
expiresAt: string;
|
|
163
165
|
} | undefined;
|
|
166
|
+
errorCode?: string | undefined;
|
|
164
167
|
}>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -47,6 +47,10 @@ const formatOptionsSchema = z.object({
|
|
|
47
47
|
.enum(['jsonl', 'markdown'])
|
|
48
48
|
.default('jsonl')
|
|
49
49
|
.describe('Output format'),
|
|
50
|
+
includeContentBlocks: z
|
|
51
|
+
.boolean()
|
|
52
|
+
.optional()
|
|
53
|
+
.describe('Include content block counts when format=markdown'),
|
|
50
54
|
});
|
|
51
55
|
const resourceFieldsSchema = z.object({
|
|
52
56
|
contentSize: z.number().optional().describe('Content length in characters'),
|
|
@@ -6,22 +6,26 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
6
6
|
if (!includeMetadata)
|
|
7
7
|
return undefined;
|
|
8
8
|
const now = new Date().toISOString();
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
const metadata = {
|
|
10
|
+
type: 'metadata',
|
|
11
|
+
url,
|
|
12
|
+
fetchedAt: now,
|
|
13
|
+
};
|
|
14
|
+
if (shouldExtractFromArticle && article) {
|
|
15
|
+
if (article.title !== undefined)
|
|
16
|
+
metadata.title = article.title;
|
|
17
|
+
if (article.byline !== undefined)
|
|
18
|
+
metadata.author = article.byline;
|
|
19
|
+
return metadata;
|
|
20
|
+
}
|
|
21
|
+
if (extractedMeta.title !== undefined)
|
|
22
|
+
metadata.title = extractedMeta.title;
|
|
23
|
+
if (extractedMeta.description !== undefined) {
|
|
24
|
+
metadata.description = extractedMeta.description;
|
|
25
|
+
}
|
|
26
|
+
if (extractedMeta.author !== undefined)
|
|
27
|
+
metadata.author = extractedMeta.author;
|
|
28
|
+
return metadata;
|
|
25
29
|
}
|
|
26
30
|
export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
|
|
27
31
|
if (maxLength === undefined ||
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function transformHtmlToJsonlAsync(html: string, url: string, options: TransformOptions): Promise<JsonlTransformResult>;
|
|
3
|
+
export declare function transformHtmlToMarkdownAsync(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
4
|
+
export declare function transformHtmlToMarkdownWithBlocksAsync(html: string, url: string, options: TransformOptions & {
|
|
5
|
+
includeContentBlocks?: boolean;
|
|
6
|
+
}): Promise<JsonlTransformResult>;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { logWarn } from '../../services/logger.js';
|
|
2
|
+
import { runTransformInWorker, } from '../../services/transform-worker-pool.js';
|
|
3
|
+
import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from './content-transform.js';
|
|
4
|
+
async function runOrFallback(job, fallback) {
|
|
5
|
+
try {
|
|
6
|
+
const result = await runTransformInWorker(job);
|
|
7
|
+
if (result)
|
|
8
|
+
return result;
|
|
9
|
+
}
|
|
10
|
+
catch (error) {
|
|
11
|
+
logWarn('Transform worker unavailable; using main thread', {
|
|
12
|
+
error: error instanceof Error ? error.message : String(error),
|
|
13
|
+
});
|
|
14
|
+
}
|
|
15
|
+
return fallback();
|
|
16
|
+
}
|
|
17
|
+
export async function transformHtmlToJsonlAsync(html, url, options) {
|
|
18
|
+
const result = await runOrFallback({ mode: 'jsonl', html, url, options }, () => transformHtmlToJsonl(html, url, options));
|
|
19
|
+
return result;
|
|
20
|
+
}
|
|
21
|
+
export async function transformHtmlToMarkdownAsync(html, url, options) {
|
|
22
|
+
const result = await runOrFallback({ mode: 'markdown', html, url, options }, () => transformHtmlToMarkdown(html, url, options));
|
|
23
|
+
return result;
|
|
24
|
+
}
|
|
25
|
+
export async function transformHtmlToMarkdownWithBlocksAsync(html, url, options) {
|
|
26
|
+
const result = await runOrFallback({
|
|
27
|
+
mode: 'markdown-blocks',
|
|
28
|
+
html,
|
|
29
|
+
url,
|
|
30
|
+
options,
|
|
31
|
+
}, () => transformHtmlToMarkdownWithBlocks(html, url, options));
|
|
32
|
+
return result;
|
|
33
|
+
}
|
|
@@ -8,7 +8,10 @@ interface ContentLengthOptions {
|
|
|
8
8
|
}
|
|
9
9
|
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
10
|
}
|
|
11
|
+
interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
|
|
12
|
+
readonly includeContentBlocks?: boolean;
|
|
13
|
+
}
|
|
11
14
|
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
12
15
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
13
|
-
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options:
|
|
16
|
+
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
|
|
14
17
|
export {};
|
|
@@ -102,7 +102,10 @@ export function transformHtmlToMarkdown(html, url, options) {
|
|
|
102
102
|
};
|
|
103
103
|
}
|
|
104
104
|
export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
105
|
-
|
|
105
|
+
const includeContentBlocks = options.includeContentBlocks ?? true;
|
|
106
|
+
if (includeContentBlocks &&
|
|
107
|
+
!options.extractMainContent &&
|
|
108
|
+
options.includeMetadata) {
|
|
106
109
|
const parsed = parseHtmlWithMetadata(html);
|
|
107
110
|
const context = {
|
|
108
111
|
sourceHtml: html,
|
|
@@ -118,7 +121,9 @@ export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
|
118
121
|
};
|
|
119
122
|
}
|
|
120
123
|
const context = resolveContentSource(html, url, options);
|
|
121
|
-
const contentBlocks =
|
|
124
|
+
const contentBlocks = includeContentBlocks
|
|
125
|
+
? parseHtml(context.sourceHtml)
|
|
126
|
+
: [];
|
|
122
127
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
123
128
|
return {
|
|
124
129
|
content,
|
|
@@ -51,7 +51,7 @@ export async function executeFetchPipeline(options) {
|
|
|
51
51
|
const fetchOptions = buildFetchOptions(options);
|
|
52
52
|
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
53
53
|
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
54
|
-
const data = options.transform(html, normalizedUrl);
|
|
54
|
+
const data = await options.transform(html, normalizedUrl);
|
|
55
55
|
if (cache.isEnabled()) {
|
|
56
56
|
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
57
57
|
}
|
|
@@ -62,20 +62,28 @@ function resolveCacheKey(options, normalizedUrl) {
|
|
|
62
62
|
return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
|
|
63
63
|
}
|
|
64
64
|
function buildFetchOptions(options) {
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
65
|
+
const fetchOptions = {};
|
|
66
|
+
if (options.customHeaders !== undefined) {
|
|
67
|
+
fetchOptions.customHeaders = options.customHeaders;
|
|
68
|
+
}
|
|
69
|
+
if (options.signal !== undefined) {
|
|
70
|
+
fetchOptions.signal = options.signal;
|
|
71
|
+
}
|
|
72
|
+
if (options.timeout !== undefined) {
|
|
73
|
+
fetchOptions.timeout = options.timeout;
|
|
74
|
+
}
|
|
75
|
+
return fetchOptions;
|
|
70
76
|
}
|
|
71
77
|
function persistCache(cacheKey, data, serialize, normalizedUrl) {
|
|
72
78
|
if (!cacheKey)
|
|
73
79
|
return;
|
|
74
80
|
const serializer = serialize ?? JSON.stringify;
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
81
|
+
const metadata = { url: normalizedUrl };
|
|
82
|
+
const title = extractTitle(data);
|
|
83
|
+
if (title !== undefined) {
|
|
84
|
+
metadata.title = title;
|
|
85
|
+
}
|
|
86
|
+
cache.set(cacheKey, serializer(data), metadata);
|
|
79
87
|
}
|
|
80
88
|
function extractTitle(value) {
|
|
81
89
|
if (!value || typeof value !== 'object')
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare function cleanParagraph(text: string): string | null;
|
|
2
2
|
export declare function cleanHeading(text: string): string | null;
|
|
3
|
-
export declare function cleanListItems(items: string[]): string[];
|
|
3
|
+
export declare function cleanListItems(items: readonly string[]): readonly string[];
|
|
4
4
|
export declare function cleanCodeBlock(code: string): string | null;
|
|
5
5
|
export declare function removeInlineTimestamps(text: string): string;
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
+
import { config } from '../config/index.js';
|
|
1
2
|
import type { FileDownloadInfo } from '../config/types/tools.js';
|
|
3
|
+
import * as cache from '../services/cache.js';
|
|
4
|
+
import { generateSafeFilename } from './filename-generator.js';
|
|
2
5
|
interface DownloadInfoOptions {
|
|
3
6
|
cacheKey: string | null;
|
|
4
7
|
url: string;
|
|
5
8
|
title?: string;
|
|
6
9
|
}
|
|
7
|
-
|
|
10
|
+
interface DownloadInfoDeps {
|
|
11
|
+
readonly config?: typeof config;
|
|
12
|
+
readonly cache?: Pick<typeof cache, 'get' | 'parseCacheKey'>;
|
|
13
|
+
readonly generateSafeFilename?: typeof generateSafeFilename;
|
|
14
|
+
}
|
|
15
|
+
export declare function buildFileDownloadInfo(options: DownloadInfoOptions, deps?: DownloadInfoDeps): FileDownloadInfo | null;
|
|
8
16
|
export {};
|