@j0hanz/superfetch 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -46
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +9 -4
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +2 -1
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/cache.js +20 -7
- package/dist/services/context.d.ts +2 -4
- package/dist/services/context.js +1 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +18 -8
- package/dist/services/fetcher/response.js +32 -24
- package/dist/services/fetcher.d.ts +0 -1
- package/dist/services/fetcher.js +5 -7
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.js +26 -25
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +29 -133
- package/dist/tools/schemas.js +22 -32
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +7 -2
- package/dist/tools/utils/fetch-pipeline.js +18 -10
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.js +38 -0
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +5 -7
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
2
2
|
import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
|
|
3
|
+
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
3
4
|
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
4
5
|
type SharedFetchFormat = 'jsonl' | 'markdown';
|
|
5
6
|
interface SharedFetchOptions<T extends {
|
|
@@ -10,17 +11,21 @@ interface SharedFetchOptions<T extends {
|
|
|
10
11
|
readonly extractMainContent: boolean;
|
|
11
12
|
readonly includeMetadata: boolean;
|
|
12
13
|
readonly maxContentLength?: number;
|
|
14
|
+
readonly includeContentBlocks?: boolean;
|
|
13
15
|
readonly cacheVariant?: string;
|
|
14
16
|
readonly customHeaders?: Record<string, string>;
|
|
15
17
|
readonly retries?: number;
|
|
16
18
|
readonly timeout?: number;
|
|
17
|
-
readonly transform: (html: string, normalizedUrl: string) => T
|
|
19
|
+
readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
|
|
18
20
|
readonly serialize?: (result: T) => string;
|
|
19
21
|
readonly deserialize?: (cached: string) => T | undefined;
|
|
20
22
|
}
|
|
23
|
+
interface SharedFetchDeps {
|
|
24
|
+
readonly executeFetchPipeline?: typeof executeFetchPipeline;
|
|
25
|
+
}
|
|
21
26
|
export declare function performSharedFetch<T extends {
|
|
22
27
|
content: string;
|
|
23
|
-
}>(options: SharedFetchOptions<T
|
|
28
|
+
}>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
|
|
24
29
|
pipeline: PipelineResult<T>;
|
|
25
30
|
inlineResult: ReturnType<typeof applyInlineContentLimit>;
|
|
26
31
|
}>;
|
|
@@ -31,7 +36,7 @@ interface DownloadContext {
|
|
|
31
36
|
title?: string;
|
|
32
37
|
}
|
|
33
38
|
export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
|
|
34
|
-
export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string): ToolResponseBase | null;
|
|
39
|
+
export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
|
|
35
40
|
export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
|
|
36
41
|
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
|
|
37
42
|
export {};
|
|
@@ -5,7 +5,8 @@ import { createToolErrorResponse } from '../../utils/tool-error-handler.js';
|
|
|
5
5
|
import { appendHeaderVary } from '../utils/cache-vary.js';
|
|
6
6
|
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
7
7
|
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
8
|
-
export async function performSharedFetch(options) {
|
|
8
|
+
export async function performSharedFetch(options, deps = {}) {
|
|
9
|
+
const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
|
|
9
10
|
const cacheNamespace = options.format === 'markdown' ? 'markdown' : 'url';
|
|
10
11
|
const cacheVary = appendHeaderVary({
|
|
11
12
|
format: options.format,
|
|
@@ -13,33 +14,54 @@ export async function performSharedFetch(options) {
|
|
|
13
14
|
includeMetadata: options.includeMetadata,
|
|
14
15
|
maxContentLength: options.maxContentLength,
|
|
15
16
|
...(options.cacheVariant ? { variant: options.cacheVariant } : {}),
|
|
16
|
-
...(options.format === 'markdown'
|
|
17
|
+
...(options.format === 'markdown'
|
|
18
|
+
? { includeContentBlocks: options.includeContentBlocks }
|
|
19
|
+
: { contentBlocks: true }),
|
|
17
20
|
}, options.customHeaders);
|
|
18
|
-
const
|
|
21
|
+
const pipelineOptions = {
|
|
19
22
|
url: options.url,
|
|
20
23
|
cacheNamespace,
|
|
21
|
-
customHeaders: options.customHeaders,
|
|
22
|
-
retries: options.retries,
|
|
23
|
-
timeout: options.timeout,
|
|
24
|
-
cacheVary,
|
|
25
24
|
transform: options.transform,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
};
|
|
26
|
+
if (options.customHeaders !== undefined) {
|
|
27
|
+
pipelineOptions.customHeaders = options.customHeaders;
|
|
28
|
+
}
|
|
29
|
+
if (options.retries !== undefined) {
|
|
30
|
+
pipelineOptions.retries = options.retries;
|
|
31
|
+
}
|
|
32
|
+
if (options.timeout !== undefined) {
|
|
33
|
+
pipelineOptions.timeout = options.timeout;
|
|
34
|
+
}
|
|
35
|
+
if (cacheVary !== undefined) {
|
|
36
|
+
pipelineOptions.cacheVary = cacheVary;
|
|
37
|
+
}
|
|
38
|
+
if (options.serialize !== undefined) {
|
|
39
|
+
pipelineOptions.serialize = options.serialize;
|
|
40
|
+
}
|
|
41
|
+
if (options.deserialize !== undefined) {
|
|
42
|
+
pipelineOptions.deserialize = options.deserialize;
|
|
43
|
+
}
|
|
44
|
+
const pipeline = await executePipeline(pipelineOptions);
|
|
29
45
|
const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null, options.format);
|
|
30
46
|
return { pipeline, inlineResult };
|
|
31
47
|
}
|
|
32
48
|
export function getFileDownloadInfo(context) {
|
|
33
|
-
|
|
49
|
+
const infoOptions = {
|
|
34
50
|
cacheKey: context.cacheKey,
|
|
35
51
|
url: context.url,
|
|
36
|
-
|
|
37
|
-
|
|
52
|
+
};
|
|
53
|
+
if (context.title !== undefined) {
|
|
54
|
+
return buildFileDownloadInfo({
|
|
55
|
+
...infoOptions,
|
|
56
|
+
title: context.title,
|
|
57
|
+
});
|
|
58
|
+
}
|
|
59
|
+
return buildFileDownloadInfo(infoOptions);
|
|
38
60
|
}
|
|
39
|
-
export function getInlineErrorResponse(inlineResult, url) {
|
|
61
|
+
export function getInlineErrorResponse(inlineResult, url, details) {
|
|
40
62
|
if (!inlineResult.error)
|
|
41
63
|
return null;
|
|
42
|
-
return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR');
|
|
64
|
+
return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR', details);
|
|
43
65
|
}
|
|
44
66
|
export function applyInlineResultToStructuredContent(structuredContent, inlineResult, contentKey) {
|
|
45
67
|
if (inlineResult.truncated) {
|
|
@@ -60,13 +82,16 @@ function buildResourceLink(inlineResult, name) {
|
|
|
60
82
|
if (!inlineResult.resourceUri) {
|
|
61
83
|
return null;
|
|
62
84
|
}
|
|
63
|
-
|
|
85
|
+
const block = {
|
|
64
86
|
type: 'resource_link',
|
|
65
87
|
uri: inlineResult.resourceUri,
|
|
66
88
|
name,
|
|
67
|
-
mimeType: inlineResult.resourceMimeType,
|
|
68
89
|
description: `Content exceeds inline limit (${config.constants.maxInlineContentChars} chars)`,
|
|
69
90
|
};
|
|
91
|
+
if (inlineResult.resourceMimeType !== undefined) {
|
|
92
|
+
block.mimeType = inlineResult.resourceMimeType;
|
|
93
|
+
}
|
|
94
|
+
return block;
|
|
70
95
|
}
|
|
71
96
|
function buildEmbeddedResource(content, mimeType, url, title) {
|
|
72
97
|
if (!content) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { config } from '../../config/index.js';
|
|
2
2
|
import { logDebug, logError } from '../../services/logger.js';
|
|
3
3
|
import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
|
|
4
|
-
import {
|
|
4
|
+
import { transformHtmlToJsonlAsync, transformHtmlToMarkdownWithBlocksAsync, } from '../utils/content-transform-async.js';
|
|
5
5
|
import { applyInlineResultToStructuredContent, buildToolContentBlocks, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
|
|
6
6
|
export const FETCH_URL_TOOL_NAME = 'fetch-url';
|
|
7
7
|
export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. Supports custom headers, retries, and content length limits.';
|
|
@@ -37,17 +37,36 @@ function deserializeJsonlTransformResult(cached) {
|
|
|
37
37
|
}
|
|
38
38
|
}
|
|
39
39
|
function resolveFetchUrlOptions(input) {
|
|
40
|
+
const format = input.format ?? 'jsonl';
|
|
40
41
|
return {
|
|
41
42
|
extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
|
|
42
43
|
includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
|
|
43
|
-
|
|
44
|
-
|
|
44
|
+
format,
|
|
45
|
+
includeContentBlocks: input.includeContentBlocks ?? (format === 'markdown' ? false : true),
|
|
46
|
+
...(input.maxContentLength !== undefined && {
|
|
47
|
+
maxContentLength: input.maxContentLength,
|
|
48
|
+
}),
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
function buildFetchUrlErrorDetails(format) {
|
|
52
|
+
return {
|
|
53
|
+
contentBlocks: 0,
|
|
54
|
+
fetchedAt: new Date().toISOString(),
|
|
55
|
+
format,
|
|
56
|
+
cached: false,
|
|
45
57
|
};
|
|
46
58
|
}
|
|
47
59
|
function buildFetchUrlTransform(options) {
|
|
48
|
-
return (html, url) => options.format === 'markdown'
|
|
49
|
-
?
|
|
50
|
-
|
|
60
|
+
return async (html, url) => options.format === 'markdown'
|
|
61
|
+
? transformHtmlToMarkdownWithBlocksAsync(html, url, {
|
|
62
|
+
extractMainContent: options.extractMainContent,
|
|
63
|
+
includeMetadata: options.includeMetadata,
|
|
64
|
+
...(options.maxContentLength !== undefined && {
|
|
65
|
+
maxContentLength: options.maxContentLength,
|
|
66
|
+
}),
|
|
67
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
68
|
+
})
|
|
69
|
+
: transformHtmlToJsonlAsync(html, url, options);
|
|
51
70
|
}
|
|
52
71
|
function buildFetchUrlStructuredContent(format, pipeline, inlineResult) {
|
|
53
72
|
const structuredContent = {
|
|
@@ -74,22 +93,31 @@ function logFetchUrlStart(url, options) {
|
|
|
74
93
|
extractMainContent: options.extractMainContent,
|
|
75
94
|
includeMetadata: options.includeMetadata,
|
|
76
95
|
format: options.format,
|
|
96
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
77
97
|
});
|
|
78
98
|
}
|
|
79
99
|
async function fetchUrlPipeline(url, input, options) {
|
|
80
|
-
|
|
100
|
+
const sharedOptions = {
|
|
81
101
|
url,
|
|
82
102
|
format: options.format,
|
|
83
103
|
extractMainContent: options.extractMainContent,
|
|
84
104
|
includeMetadata: options.includeMetadata,
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
105
|
+
includeContentBlocks: options.includeContentBlocks,
|
|
106
|
+
...(options.maxContentLength !== undefined && {
|
|
107
|
+
maxContentLength: options.maxContentLength,
|
|
108
|
+
}),
|
|
109
|
+
...(input.customHeaders !== undefined && {
|
|
110
|
+
customHeaders: input.customHeaders,
|
|
111
|
+
}),
|
|
112
|
+
...(input.retries !== undefined && { retries: input.retries }),
|
|
113
|
+
...(input.timeout !== undefined && { timeout: input.timeout }),
|
|
114
|
+
...(options.format === 'markdown' && {
|
|
115
|
+
cacheVariant: 'markdown-with-blocks',
|
|
116
|
+
}),
|
|
90
117
|
transform: buildFetchUrlTransform(options),
|
|
91
118
|
deserialize: deserializeJsonlTransformResult,
|
|
92
|
-
}
|
|
119
|
+
};
|
|
120
|
+
return performSharedFetch(sharedOptions);
|
|
93
121
|
}
|
|
94
122
|
function buildFetchUrlResponse(pipeline, inlineResult, format) {
|
|
95
123
|
const structuredContent = buildFetchUrlStructuredContent(format, pipeline, inlineResult);
|
|
@@ -104,18 +132,20 @@ export async function fetchUrlToolHandler(input) {
|
|
|
104
132
|
}
|
|
105
133
|
catch (error) {
|
|
106
134
|
logError('fetch-url tool error', error instanceof Error ? error : undefined);
|
|
107
|
-
|
|
135
|
+
const errorDetails = buildFetchUrlErrorDetails(input.format ?? 'jsonl');
|
|
136
|
+
return handleToolError(error, input.url, 'Failed to fetch URL', errorDetails);
|
|
108
137
|
}
|
|
109
138
|
}
|
|
110
139
|
async function executeFetchUrl(input) {
|
|
111
140
|
const { url } = input;
|
|
141
|
+
const format = input.format ?? 'jsonl';
|
|
112
142
|
if (!url) {
|
|
113
|
-
return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
|
|
143
|
+
return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchUrlErrorDetails(format));
|
|
114
144
|
}
|
|
115
145
|
const options = resolveFetchUrlOptions(input);
|
|
116
146
|
logFetchUrlStart(url, options);
|
|
117
147
|
const { pipeline, inlineResult } = await fetchUrlPipeline(url, input, options);
|
|
118
|
-
const inlineError = getInlineErrorResponse(inlineResult, url);
|
|
148
|
+
const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchUrlErrorDetails(options.format));
|
|
119
149
|
if (inlineError)
|
|
120
150
|
return inlineError;
|
|
121
151
|
return buildFetchUrlResponse(pipeline, inlineResult, options.format);
|
package/dist/tools/index.js
CHANGED
|
@@ -9,6 +9,12 @@ const TOOL_DEFINITIONS = [
|
|
|
9
9
|
inputSchema: fetchUrlInputSchema,
|
|
10
10
|
outputSchema: fetchUrlOutputSchema,
|
|
11
11
|
handler: fetchUrlToolHandler,
|
|
12
|
+
annotations: {
|
|
13
|
+
readOnlyHint: true,
|
|
14
|
+
destructiveHint: false,
|
|
15
|
+
idempotentHint: true,
|
|
16
|
+
openWorldHint: true,
|
|
17
|
+
},
|
|
12
18
|
},
|
|
13
19
|
{
|
|
14
20
|
name: FETCH_MARKDOWN_TOOL_NAME,
|
|
@@ -17,6 +23,12 @@ const TOOL_DEFINITIONS = [
|
|
|
17
23
|
inputSchema: fetchMarkdownInputSchema,
|
|
18
24
|
outputSchema: fetchMarkdownOutputSchema,
|
|
19
25
|
handler: fetchMarkdownToolHandler,
|
|
26
|
+
annotations: {
|
|
27
|
+
readOnlyHint: true,
|
|
28
|
+
destructiveHint: false,
|
|
29
|
+
idempotentHint: true,
|
|
30
|
+
openWorldHint: true,
|
|
31
|
+
},
|
|
20
32
|
},
|
|
21
33
|
];
|
|
22
34
|
export function registerTools(server) {
|
|
@@ -26,6 +38,7 @@ export function registerTools(server) {
|
|
|
26
38
|
description: tool.description,
|
|
27
39
|
inputSchema: tool.inputSchema,
|
|
28
40
|
outputSchema: tool.outputSchema,
|
|
41
|
+
annotations: tool.annotations,
|
|
29
42
|
}, tool.handler);
|
|
30
43
|
}
|
|
31
44
|
}
|
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -1,70 +1,46 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
2
|
export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
} & {
|
|
3
|
+
format: z.ZodDefault<z.ZodEnum<{
|
|
4
|
+
jsonl: "jsonl";
|
|
5
|
+
markdown: "markdown";
|
|
6
|
+
}>>;
|
|
7
|
+
includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
|
|
9
8
|
extractMainContent: z.ZodDefault<z.ZodBoolean>;
|
|
10
9
|
includeMetadata: z.ZodDefault<z.ZodBoolean>;
|
|
11
10
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
}, "strict", z.ZodTypeAny, {
|
|
15
|
-
url: string;
|
|
16
|
-
timeout: number;
|
|
17
|
-
retries: number;
|
|
18
|
-
extractMainContent: boolean;
|
|
19
|
-
includeMetadata: boolean;
|
|
20
|
-
format: "jsonl" | "markdown";
|
|
21
|
-
customHeaders?: Record<string, string> | undefined;
|
|
22
|
-
maxContentLength?: number | undefined;
|
|
23
|
-
}, {
|
|
24
|
-
url: string;
|
|
25
|
-
customHeaders?: Record<string, string> | undefined;
|
|
26
|
-
timeout?: number | undefined;
|
|
27
|
-
retries?: number | undefined;
|
|
28
|
-
extractMainContent?: boolean | undefined;
|
|
29
|
-
includeMetadata?: boolean | undefined;
|
|
30
|
-
maxContentLength?: number | undefined;
|
|
31
|
-
format?: "jsonl" | "markdown" | undefined;
|
|
32
|
-
}>;
|
|
33
|
-
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
34
|
-
customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
|
|
11
|
+
url: z.ZodURL;
|
|
12
|
+
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
35
13
|
timeout: z.ZodDefault<z.ZodNumber>;
|
|
36
14
|
retries: z.ZodDefault<z.ZodNumber>;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
} & {
|
|
15
|
+
}, z.core.$strict>;
|
|
16
|
+
export declare const fetchMarkdownInputSchema: z.ZodObject<{
|
|
40
17
|
extractMainContent: z.ZodDefault<z.ZodBoolean>;
|
|
41
18
|
includeMetadata: z.ZodDefault<z.ZodBoolean>;
|
|
42
19
|
maxContentLength: z.ZodOptional<z.ZodNumber>;
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
timeout:
|
|
46
|
-
retries:
|
|
47
|
-
|
|
48
|
-
includeMetadata: boolean;
|
|
49
|
-
customHeaders?: Record<string, string> | undefined;
|
|
50
|
-
maxContentLength?: number | undefined;
|
|
51
|
-
}, {
|
|
52
|
-
url: string;
|
|
53
|
-
customHeaders?: Record<string, string> | undefined;
|
|
54
|
-
timeout?: number | undefined;
|
|
55
|
-
retries?: number | undefined;
|
|
56
|
-
extractMainContent?: boolean | undefined;
|
|
57
|
-
includeMetadata?: boolean | undefined;
|
|
58
|
-
maxContentLength?: number | undefined;
|
|
59
|
-
}>;
|
|
20
|
+
url: z.ZodURL;
|
|
21
|
+
customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
22
|
+
timeout: z.ZodDefault<z.ZodNumber>;
|
|
23
|
+
retries: z.ZodDefault<z.ZodNumber>;
|
|
24
|
+
}, z.core.$strict>;
|
|
60
25
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
26
|
+
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
27
|
+
resourceUri: z.ZodOptional<z.ZodString>;
|
|
28
|
+
resourceMimeType: z.ZodOptional<z.ZodString>;
|
|
29
|
+
cached: z.ZodBoolean;
|
|
30
|
+
truncated: z.ZodOptional<z.ZodBoolean>;
|
|
31
|
+
error: z.ZodOptional<z.ZodString>;
|
|
32
|
+
errorCode: z.ZodOptional<z.ZodString>;
|
|
61
33
|
url: z.ZodString;
|
|
62
34
|
title: z.ZodOptional<z.ZodString>;
|
|
63
35
|
contentBlocks: z.ZodNumber;
|
|
64
36
|
fetchedAt: z.ZodString;
|
|
65
|
-
format: z.ZodEnum<
|
|
37
|
+
format: z.ZodEnum<{
|
|
38
|
+
jsonl: "jsonl";
|
|
39
|
+
markdown: "markdown";
|
|
40
|
+
}>;
|
|
66
41
|
content: z.ZodOptional<z.ZodString>;
|
|
67
|
-
}
|
|
42
|
+
}, z.core.$strict>;
|
|
43
|
+
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
68
44
|
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
69
45
|
resourceUri: z.ZodOptional<z.ZodString>;
|
|
70
46
|
resourceMimeType: z.ZodOptional<z.ZodString>;
|
|
@@ -72,36 +48,6 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
|
72
48
|
truncated: z.ZodOptional<z.ZodBoolean>;
|
|
73
49
|
error: z.ZodOptional<z.ZodString>;
|
|
74
50
|
errorCode: z.ZodOptional<z.ZodString>;
|
|
75
|
-
}, "strict", z.ZodTypeAny, {
|
|
76
|
-
url: string;
|
|
77
|
-
format: "jsonl" | "markdown";
|
|
78
|
-
contentBlocks: number;
|
|
79
|
-
fetchedAt: string;
|
|
80
|
-
cached: boolean;
|
|
81
|
-
error?: string | undefined;
|
|
82
|
-
title?: string | undefined;
|
|
83
|
-
content?: string | undefined;
|
|
84
|
-
contentSize?: number | undefined;
|
|
85
|
-
resourceUri?: string | undefined;
|
|
86
|
-
resourceMimeType?: string | undefined;
|
|
87
|
-
truncated?: boolean | undefined;
|
|
88
|
-
errorCode?: string | undefined;
|
|
89
|
-
}, {
|
|
90
|
-
url: string;
|
|
91
|
-
format: "jsonl" | "markdown";
|
|
92
|
-
contentBlocks: number;
|
|
93
|
-
fetchedAt: string;
|
|
94
|
-
cached: boolean;
|
|
95
|
-
error?: string | undefined;
|
|
96
|
-
title?: string | undefined;
|
|
97
|
-
content?: string | undefined;
|
|
98
|
-
contentSize?: number | undefined;
|
|
99
|
-
resourceUri?: string | undefined;
|
|
100
|
-
resourceMimeType?: string | undefined;
|
|
101
|
-
truncated?: boolean | undefined;
|
|
102
|
-
errorCode?: string | undefined;
|
|
103
|
-
}>;
|
|
104
|
-
export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
105
51
|
url: z.ZodString;
|
|
106
52
|
title: z.ZodOptional<z.ZodString>;
|
|
107
53
|
fetchedAt: z.ZodString;
|
|
@@ -110,55 +56,5 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
|
|
|
110
56
|
downloadUrl: z.ZodString;
|
|
111
57
|
fileName: z.ZodString;
|
|
112
58
|
expiresAt: z.ZodString;
|
|
113
|
-
},
|
|
114
|
-
|
|
115
|
-
fileName: string;
|
|
116
|
-
expiresAt: string;
|
|
117
|
-
}, {
|
|
118
|
-
downloadUrl: string;
|
|
119
|
-
fileName: string;
|
|
120
|
-
expiresAt: string;
|
|
121
|
-
}>>;
|
|
122
|
-
} & {
|
|
123
|
-
contentSize: z.ZodOptional<z.ZodNumber>;
|
|
124
|
-
resourceUri: z.ZodOptional<z.ZodString>;
|
|
125
|
-
resourceMimeType: z.ZodOptional<z.ZodString>;
|
|
126
|
-
cached: z.ZodBoolean;
|
|
127
|
-
truncated: z.ZodOptional<z.ZodBoolean>;
|
|
128
|
-
error: z.ZodOptional<z.ZodString>;
|
|
129
|
-
errorCode: z.ZodOptional<z.ZodString>;
|
|
130
|
-
}, "strict", z.ZodTypeAny, {
|
|
131
|
-
url: string;
|
|
132
|
-
fetchedAt: string;
|
|
133
|
-
cached: boolean;
|
|
134
|
-
error?: string | undefined;
|
|
135
|
-
markdown?: string | undefined;
|
|
136
|
-
title?: string | undefined;
|
|
137
|
-
contentSize?: number | undefined;
|
|
138
|
-
resourceUri?: string | undefined;
|
|
139
|
-
resourceMimeType?: string | undefined;
|
|
140
|
-
truncated?: boolean | undefined;
|
|
141
|
-
errorCode?: string | undefined;
|
|
142
|
-
file?: {
|
|
143
|
-
downloadUrl: string;
|
|
144
|
-
fileName: string;
|
|
145
|
-
expiresAt: string;
|
|
146
|
-
} | undefined;
|
|
147
|
-
}, {
|
|
148
|
-
url: string;
|
|
149
|
-
fetchedAt: string;
|
|
150
|
-
cached: boolean;
|
|
151
|
-
error?: string | undefined;
|
|
152
|
-
markdown?: string | undefined;
|
|
153
|
-
title?: string | undefined;
|
|
154
|
-
contentSize?: number | undefined;
|
|
155
|
-
resourceUri?: string | undefined;
|
|
156
|
-
resourceMimeType?: string | undefined;
|
|
157
|
-
truncated?: boolean | undefined;
|
|
158
|
-
errorCode?: string | undefined;
|
|
159
|
-
file?: {
|
|
160
|
-
downloadUrl: string;
|
|
161
|
-
fileName: string;
|
|
162
|
-
expiresAt: string;
|
|
163
|
-
} | undefined;
|
|
164
|
-
}>;
|
|
59
|
+
}, z.core.$strip>>;
|
|
60
|
+
}, z.core.$strict>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -7,7 +7,7 @@ const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
|
|
|
7
7
|
const customHeadersSchema = z
|
|
8
8
|
.record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
|
|
9
9
|
.refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
|
|
10
|
-
|
|
10
|
+
error: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
|
|
11
11
|
});
|
|
12
12
|
const requestOptionsSchema = z.object({
|
|
13
13
|
customHeaders: customHeadersSchema
|
|
@@ -47,6 +47,10 @@ const formatOptionsSchema = z.object({
|
|
|
47
47
|
.enum(['jsonl', 'markdown'])
|
|
48
48
|
.default('jsonl')
|
|
49
49
|
.describe('Output format'),
|
|
50
|
+
includeContentBlocks: z
|
|
51
|
+
.boolean()
|
|
52
|
+
.optional()
|
|
53
|
+
.describe('Include content block counts when format=markdown'),
|
|
50
54
|
});
|
|
51
55
|
const resourceFieldsSchema = z.object({
|
|
52
56
|
contentSize: z.number().optional().describe('Content length in characters'),
|
|
@@ -71,29 +75,18 @@ const fileDownloadSchema = z.object({
|
|
|
71
75
|
fileName: z.string().describe('Suggested filename for download'),
|
|
72
76
|
expiresAt: z.string().describe('ISO timestamp when download expires'),
|
|
73
77
|
});
|
|
74
|
-
export const fetchUrlInputSchema =
|
|
75
|
-
.
|
|
76
|
-
url: z
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
.
|
|
83
|
-
.
|
|
84
|
-
|
|
85
|
-
export const
|
|
86
|
-
.extend({
|
|
87
|
-
url: z
|
|
88
|
-
.string()
|
|
89
|
-
.min(1)
|
|
90
|
-
.max(config.constants.maxUrlLength)
|
|
91
|
-
.describe('The URL to fetch'),
|
|
92
|
-
})
|
|
93
|
-
.merge(extractionOptionsSchema)
|
|
94
|
-
.strict();
|
|
95
|
-
export const fetchUrlOutputSchema = z
|
|
96
|
-
.object({
|
|
78
|
+
export const fetchUrlInputSchema = z.strictObject({
|
|
79
|
+
...requestOptionsSchema.shape,
|
|
80
|
+
url: z.url({ protocol: /^https?:$/i }).describe('The URL to fetch'),
|
|
81
|
+
...extractionOptionsSchema.shape,
|
|
82
|
+
...formatOptionsSchema.shape,
|
|
83
|
+
});
|
|
84
|
+
export const fetchMarkdownInputSchema = z.strictObject({
|
|
85
|
+
...requestOptionsSchema.shape,
|
|
86
|
+
url: z.url({ protocol: /^https?:$/i }).describe('The URL to fetch'),
|
|
87
|
+
...extractionOptionsSchema.shape,
|
|
88
|
+
});
|
|
89
|
+
export const fetchUrlOutputSchema = z.strictObject({
|
|
97
90
|
url: z.string().describe('The fetched URL'),
|
|
98
91
|
title: z.string().optional().describe('Page title'),
|
|
99
92
|
contentBlocks: z
|
|
@@ -107,11 +100,9 @@ export const fetchUrlOutputSchema = z
|
|
|
107
100
|
.string()
|
|
108
101
|
.optional()
|
|
109
102
|
.describe('The extracted content in JSONL or Markdown format'),
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
export const fetchMarkdownOutputSchema = z
|
|
114
|
-
.object({
|
|
103
|
+
...resourceFieldsSchema.shape,
|
|
104
|
+
});
|
|
105
|
+
export const fetchMarkdownOutputSchema = z.strictObject({
|
|
115
106
|
url: z.string().describe('The fetched URL'),
|
|
116
107
|
title: z.string().optional().describe('Page title'),
|
|
117
108
|
fetchedAt: z
|
|
@@ -124,6 +115,5 @@ export const fetchMarkdownOutputSchema = z
|
|
|
124
115
|
file: fileDownloadSchema
|
|
125
116
|
.optional()
|
|
126
117
|
.describe('Download information when content is cached'),
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
.strict();
|
|
118
|
+
...resourceFieldsSchema.shape,
|
|
119
|
+
});
|
|
@@ -6,22 +6,26 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
6
6
|
if (!includeMetadata)
|
|
7
7
|
return undefined;
|
|
8
8
|
const now = new Date().toISOString();
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
const metadata = {
|
|
10
|
+
type: 'metadata',
|
|
11
|
+
url,
|
|
12
|
+
fetchedAt: now,
|
|
13
|
+
};
|
|
14
|
+
if (shouldExtractFromArticle && article) {
|
|
15
|
+
if (article.title !== undefined)
|
|
16
|
+
metadata.title = article.title;
|
|
17
|
+
if (article.byline !== undefined)
|
|
18
|
+
metadata.author = article.byline;
|
|
19
|
+
return metadata;
|
|
20
|
+
}
|
|
21
|
+
if (extractedMeta.title !== undefined)
|
|
22
|
+
metadata.title = extractedMeta.title;
|
|
23
|
+
if (extractedMeta.description !== undefined) {
|
|
24
|
+
metadata.description = extractedMeta.description;
|
|
25
|
+
}
|
|
26
|
+
if (extractedMeta.author !== undefined)
|
|
27
|
+
metadata.author = extractedMeta.author;
|
|
28
|
+
return metadata;
|
|
25
29
|
}
|
|
26
30
|
export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
|
|
27
31
|
if (maxLength === undefined ||
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function transformHtmlToJsonlAsync(html: string, url: string, options: TransformOptions): Promise<JsonlTransformResult>;
|
|
3
|
+
export declare function transformHtmlToMarkdownAsync(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
4
|
+
export declare function transformHtmlToMarkdownWithBlocksAsync(html: string, url: string, options: TransformOptions & {
|
|
5
|
+
includeContentBlocks?: boolean;
|
|
6
|
+
}): Promise<JsonlTransformResult>;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { logWarn } from '../../services/logger.js';
|
|
2
|
+
import { runTransformInWorker, } from '../../services/transform-worker-pool.js';
|
|
3
|
+
import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from './content-transform.js';
|
|
4
|
+
async function runOrFallback(job, fallback) {
|
|
5
|
+
try {
|
|
6
|
+
const result = await runTransformInWorker(job);
|
|
7
|
+
if (result)
|
|
8
|
+
return result;
|
|
9
|
+
}
|
|
10
|
+
catch (error) {
|
|
11
|
+
logWarn('Transform worker unavailable; using main thread', {
|
|
12
|
+
error: error instanceof Error ? error.message : String(error),
|
|
13
|
+
});
|
|
14
|
+
}
|
|
15
|
+
return fallback();
|
|
16
|
+
}
|
|
17
|
+
export async function transformHtmlToJsonlAsync(html, url, options) {
|
|
18
|
+
const result = await runOrFallback({ mode: 'jsonl', html, url, options }, () => transformHtmlToJsonl(html, url, options));
|
|
19
|
+
return result;
|
|
20
|
+
}
|
|
21
|
+
export async function transformHtmlToMarkdownAsync(html, url, options) {
|
|
22
|
+
const result = await runOrFallback({ mode: 'markdown', html, url, options }, () => transformHtmlToMarkdown(html, url, options));
|
|
23
|
+
return result;
|
|
24
|
+
}
|
|
25
|
+
export async function transformHtmlToMarkdownWithBlocksAsync(html, url, options) {
|
|
26
|
+
const result = await runOrFallback({
|
|
27
|
+
mode: 'markdown-blocks',
|
|
28
|
+
html,
|
|
29
|
+
url,
|
|
30
|
+
options,
|
|
31
|
+
}, () => transformHtmlToMarkdownWithBlocks(html, url, options));
|
|
32
|
+
return result;
|
|
33
|
+
}
|
|
@@ -8,7 +8,10 @@ interface ContentLengthOptions {
|
|
|
8
8
|
}
|
|
9
9
|
interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
|
|
10
10
|
}
|
|
11
|
+
interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
|
|
12
|
+
readonly includeContentBlocks?: boolean;
|
|
13
|
+
}
|
|
11
14
|
export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
|
|
12
15
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
|
|
13
|
-
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options:
|
|
16
|
+
export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
|
|
14
17
|
export {};
|