@j0hanz/superfetch 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +61 -46
  2. package/dist/config/formatting.d.ts +1 -1
  3. package/dist/config/types/content.d.ts +3 -3
  4. package/dist/config/types/runtime.d.ts +1 -1
  5. package/dist/config/types/tools.d.ts +12 -12
  6. package/dist/http/cors.js +23 -23
  7. package/dist/http/download-routes.js +9 -4
  8. package/dist/http/mcp-routes.js +2 -13
  9. package/dist/http/mcp-validation.js +1 -1
  10. package/dist/http/server-middleware.js +2 -1
  11. package/dist/http/server.js +2 -0
  12. package/dist/index.js +5 -0
  13. package/dist/middleware/error-handler.js +1 -1
  14. package/dist/resources/cached-content.js +8 -4
  15. package/dist/server.js +2 -0
  16. package/dist/services/cache.d.ts +1 -1
  17. package/dist/services/cache.js +20 -7
  18. package/dist/services/context.d.ts +2 -4
  19. package/dist/services/context.js +1 -1
  20. package/dist/services/extractor.js +26 -21
  21. package/dist/services/fetcher/interceptors.d.ts +22 -0
  22. package/dist/services/fetcher/interceptors.js +18 -8
  23. package/dist/services/fetcher/response.js +32 -24
  24. package/dist/services/fetcher.d.ts +0 -1
  25. package/dist/services/fetcher.js +5 -7
  26. package/dist/services/metadata-collector.d.ts +10 -0
  27. package/dist/services/metadata-collector.js +11 -0
  28. package/dist/services/parser.js +26 -25
  29. package/dist/services/transform-worker-pool.d.ts +14 -0
  30. package/dist/services/transform-worker-pool.js +167 -0
  31. package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
  32. package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
  33. package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
  34. package/dist/tools/handlers/fetch-single.shared.js +42 -17
  35. package/dist/tools/handlers/fetch-url.tool.js +46 -16
  36. package/dist/tools/index.js +13 -0
  37. package/dist/tools/schemas.d.ts +29 -133
  38. package/dist/tools/schemas.js +22 -32
  39. package/dist/tools/utils/common.js +20 -16
  40. package/dist/tools/utils/content-transform-async.d.ts +6 -0
  41. package/dist/tools/utils/content-transform-async.js +33 -0
  42. package/dist/tools/utils/content-transform.d.ts +4 -1
  43. package/dist/tools/utils/content-transform.js +7 -2
  44. package/dist/tools/utils/fetch-pipeline.js +18 -10
  45. package/dist/utils/content-cleaner.d.ts +1 -1
  46. package/dist/utils/download-url.d.ts +9 -1
  47. package/dist/utils/download-url.js +9 -6
  48. package/dist/utils/tool-error-handler.d.ts +2 -2
  49. package/dist/utils/tool-error-handler.js +7 -7
  50. package/dist/utils/url-validator.js +38 -0
  51. package/dist/workers/transform-worker.d.ts +1 -0
  52. package/dist/workers/transform-worker.js +50 -0
  53. package/package.json +5 -7
@@ -1,5 +1,6 @@
1
1
  import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
2
2
  import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
3
+ import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
3
4
  import { applyInlineContentLimit } from '../utils/inline-content.js';
4
5
  type SharedFetchFormat = 'jsonl' | 'markdown';
5
6
  interface SharedFetchOptions<T extends {
@@ -10,17 +11,21 @@ interface SharedFetchOptions<T extends {
10
11
  readonly extractMainContent: boolean;
11
12
  readonly includeMetadata: boolean;
12
13
  readonly maxContentLength?: number;
14
+ readonly includeContentBlocks?: boolean;
13
15
  readonly cacheVariant?: string;
14
16
  readonly customHeaders?: Record<string, string>;
15
17
  readonly retries?: number;
16
18
  readonly timeout?: number;
17
- readonly transform: (html: string, normalizedUrl: string) => T;
19
+ readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
18
20
  readonly serialize?: (result: T) => string;
19
21
  readonly deserialize?: (cached: string) => T | undefined;
20
22
  }
23
+ interface SharedFetchDeps {
24
+ readonly executeFetchPipeline?: typeof executeFetchPipeline;
25
+ }
21
26
  export declare function performSharedFetch<T extends {
22
27
  content: string;
23
- }>(options: SharedFetchOptions<T>): Promise<{
28
+ }>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
24
29
  pipeline: PipelineResult<T>;
25
30
  inlineResult: ReturnType<typeof applyInlineContentLimit>;
26
31
  }>;
@@ -31,7 +36,7 @@ interface DownloadContext {
31
36
  title?: string;
32
37
  }
33
38
  export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
34
- export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string): ToolResponseBase | null;
39
+ export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
35
40
  export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
36
41
  export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
37
42
  export {};
@@ -5,7 +5,8 @@ import { createToolErrorResponse } from '../../utils/tool-error-handler.js';
5
5
  import { appendHeaderVary } from '../utils/cache-vary.js';
6
6
  import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
7
7
  import { applyInlineContentLimit } from '../utils/inline-content.js';
8
- export async function performSharedFetch(options) {
8
+ export async function performSharedFetch(options, deps = {}) {
9
+ const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
9
10
  const cacheNamespace = options.format === 'markdown' ? 'markdown' : 'url';
10
11
  const cacheVary = appendHeaderVary({
11
12
  format: options.format,
@@ -13,33 +14,54 @@ export async function performSharedFetch(options) {
13
14
  includeMetadata: options.includeMetadata,
14
15
  maxContentLength: options.maxContentLength,
15
16
  ...(options.cacheVariant ? { variant: options.cacheVariant } : {}),
16
- ...(options.format === 'markdown' ? {} : { contentBlocks: true }),
17
+ ...(options.format === 'markdown'
18
+ ? { includeContentBlocks: options.includeContentBlocks }
19
+ : { contentBlocks: true }),
17
20
  }, options.customHeaders);
18
- const pipeline = await executeFetchPipeline({
21
+ const pipelineOptions = {
19
22
  url: options.url,
20
23
  cacheNamespace,
21
- customHeaders: options.customHeaders,
22
- retries: options.retries,
23
- timeout: options.timeout,
24
- cacheVary,
25
24
  transform: options.transform,
26
- serialize: options.serialize,
27
- deserialize: options.deserialize,
28
- });
25
+ };
26
+ if (options.customHeaders !== undefined) {
27
+ pipelineOptions.customHeaders = options.customHeaders;
28
+ }
29
+ if (options.retries !== undefined) {
30
+ pipelineOptions.retries = options.retries;
31
+ }
32
+ if (options.timeout !== undefined) {
33
+ pipelineOptions.timeout = options.timeout;
34
+ }
35
+ if (cacheVary !== undefined) {
36
+ pipelineOptions.cacheVary = cacheVary;
37
+ }
38
+ if (options.serialize !== undefined) {
39
+ pipelineOptions.serialize = options.serialize;
40
+ }
41
+ if (options.deserialize !== undefined) {
42
+ pipelineOptions.deserialize = options.deserialize;
43
+ }
44
+ const pipeline = await executePipeline(pipelineOptions);
29
45
  const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null, options.format);
30
46
  return { pipeline, inlineResult };
31
47
  }
32
48
  export function getFileDownloadInfo(context) {
33
- return buildFileDownloadInfo({
49
+ const infoOptions = {
34
50
  cacheKey: context.cacheKey,
35
51
  url: context.url,
36
- title: context.title,
37
- });
52
+ };
53
+ if (context.title !== undefined) {
54
+ return buildFileDownloadInfo({
55
+ ...infoOptions,
56
+ title: context.title,
57
+ });
58
+ }
59
+ return buildFileDownloadInfo(infoOptions);
38
60
  }
39
- export function getInlineErrorResponse(inlineResult, url) {
61
+ export function getInlineErrorResponse(inlineResult, url, details) {
40
62
  if (!inlineResult.error)
41
63
  return null;
42
- return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR');
64
+ return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR', details);
43
65
  }
44
66
  export function applyInlineResultToStructuredContent(structuredContent, inlineResult, contentKey) {
45
67
  if (inlineResult.truncated) {
@@ -60,13 +82,16 @@ function buildResourceLink(inlineResult, name) {
60
82
  if (!inlineResult.resourceUri) {
61
83
  return null;
62
84
  }
63
- return {
85
+ const block = {
64
86
  type: 'resource_link',
65
87
  uri: inlineResult.resourceUri,
66
88
  name,
67
- mimeType: inlineResult.resourceMimeType,
68
89
  description: `Content exceeds inline limit (${config.constants.maxInlineContentChars} chars)`,
69
90
  };
91
+ if (inlineResult.resourceMimeType !== undefined) {
92
+ block.mimeType = inlineResult.resourceMimeType;
93
+ }
94
+ return block;
70
95
  }
71
96
  function buildEmbeddedResource(content, mimeType, url, title) {
72
97
  if (!content) {
@@ -1,7 +1,7 @@
1
1
  import { config } from '../../config/index.js';
2
2
  import { logDebug, logError } from '../../services/logger.js';
3
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
4
- import { transformHtmlToJsonl, transformHtmlToMarkdownWithBlocks, } from '../utils/content-transform.js';
4
+ import { transformHtmlToJsonlAsync, transformHtmlToMarkdownWithBlocksAsync, } from '../utils/content-transform-async.js';
5
5
  import { applyInlineResultToStructuredContent, buildToolContentBlocks, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
7
7
  export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. Supports custom headers, retries, and content length limits.';
@@ -37,17 +37,36 @@ function deserializeJsonlTransformResult(cached) {
37
37
  }
38
38
  }
39
39
  function resolveFetchUrlOptions(input) {
40
+ const format = input.format ?? 'jsonl';
40
41
  return {
41
42
  extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
42
43
  includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
43
- maxContentLength: input.maxContentLength,
44
- format: input.format ?? 'jsonl',
44
+ format,
45
+ includeContentBlocks: input.includeContentBlocks ?? (format === 'markdown' ? false : true),
46
+ ...(input.maxContentLength !== undefined && {
47
+ maxContentLength: input.maxContentLength,
48
+ }),
49
+ };
50
+ }
51
+ function buildFetchUrlErrorDetails(format) {
52
+ return {
53
+ contentBlocks: 0,
54
+ fetchedAt: new Date().toISOString(),
55
+ format,
56
+ cached: false,
45
57
  };
46
58
  }
47
59
  function buildFetchUrlTransform(options) {
48
- return (html, url) => options.format === 'markdown'
49
- ? transformHtmlToMarkdownWithBlocks(html, url, options)
50
- : transformHtmlToJsonl(html, url, options);
60
+ return async (html, url) => options.format === 'markdown'
61
+ ? transformHtmlToMarkdownWithBlocksAsync(html, url, {
62
+ extractMainContent: options.extractMainContent,
63
+ includeMetadata: options.includeMetadata,
64
+ ...(options.maxContentLength !== undefined && {
65
+ maxContentLength: options.maxContentLength,
66
+ }),
67
+ includeContentBlocks: options.includeContentBlocks,
68
+ })
69
+ : transformHtmlToJsonlAsync(html, url, options);
51
70
  }
52
71
  function buildFetchUrlStructuredContent(format, pipeline, inlineResult) {
53
72
  const structuredContent = {
@@ -74,22 +93,31 @@ function logFetchUrlStart(url, options) {
74
93
  extractMainContent: options.extractMainContent,
75
94
  includeMetadata: options.includeMetadata,
76
95
  format: options.format,
96
+ includeContentBlocks: options.includeContentBlocks,
77
97
  });
78
98
  }
79
99
  async function fetchUrlPipeline(url, input, options) {
80
- return performSharedFetch({
100
+ const sharedOptions = {
81
101
  url,
82
102
  format: options.format,
83
103
  extractMainContent: options.extractMainContent,
84
104
  includeMetadata: options.includeMetadata,
85
- maxContentLength: options.maxContentLength,
86
- customHeaders: input.customHeaders,
87
- retries: input.retries,
88
- timeout: input.timeout,
89
- cacheVariant: options.format === 'markdown' ? 'markdown-with-blocks' : undefined,
105
+ includeContentBlocks: options.includeContentBlocks,
106
+ ...(options.maxContentLength !== undefined && {
107
+ maxContentLength: options.maxContentLength,
108
+ }),
109
+ ...(input.customHeaders !== undefined && {
110
+ customHeaders: input.customHeaders,
111
+ }),
112
+ ...(input.retries !== undefined && { retries: input.retries }),
113
+ ...(input.timeout !== undefined && { timeout: input.timeout }),
114
+ ...(options.format === 'markdown' && {
115
+ cacheVariant: 'markdown-with-blocks',
116
+ }),
90
117
  transform: buildFetchUrlTransform(options),
91
118
  deserialize: deserializeJsonlTransformResult,
92
- });
119
+ };
120
+ return performSharedFetch(sharedOptions);
93
121
  }
94
122
  function buildFetchUrlResponse(pipeline, inlineResult, format) {
95
123
  const structuredContent = buildFetchUrlStructuredContent(format, pipeline, inlineResult);
@@ -104,18 +132,20 @@ export async function fetchUrlToolHandler(input) {
104
132
  }
105
133
  catch (error) {
106
134
  logError('fetch-url tool error', error instanceof Error ? error : undefined);
107
- return handleToolError(error, input.url, 'Failed to fetch URL');
135
+ const errorDetails = buildFetchUrlErrorDetails(input.format ?? 'jsonl');
136
+ return handleToolError(error, input.url, 'Failed to fetch URL', errorDetails);
108
137
  }
109
138
  }
110
139
  async function executeFetchUrl(input) {
111
140
  const { url } = input;
141
+ const format = input.format ?? 'jsonl';
112
142
  if (!url) {
113
- return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
143
+ return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchUrlErrorDetails(format));
114
144
  }
115
145
  const options = resolveFetchUrlOptions(input);
116
146
  logFetchUrlStart(url, options);
117
147
  const { pipeline, inlineResult } = await fetchUrlPipeline(url, input, options);
118
- const inlineError = getInlineErrorResponse(inlineResult, url);
148
+ const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchUrlErrorDetails(options.format));
119
149
  if (inlineError)
120
150
  return inlineError;
121
151
  return buildFetchUrlResponse(pipeline, inlineResult, options.format);
@@ -9,6 +9,12 @@ const TOOL_DEFINITIONS = [
9
9
  inputSchema: fetchUrlInputSchema,
10
10
  outputSchema: fetchUrlOutputSchema,
11
11
  handler: fetchUrlToolHandler,
12
+ annotations: {
13
+ readOnlyHint: true,
14
+ destructiveHint: false,
15
+ idempotentHint: true,
16
+ openWorldHint: true,
17
+ },
12
18
  },
13
19
  {
14
20
  name: FETCH_MARKDOWN_TOOL_NAME,
@@ -17,6 +23,12 @@ const TOOL_DEFINITIONS = [
17
23
  inputSchema: fetchMarkdownInputSchema,
18
24
  outputSchema: fetchMarkdownOutputSchema,
19
25
  handler: fetchMarkdownToolHandler,
26
+ annotations: {
27
+ readOnlyHint: true,
28
+ destructiveHint: false,
29
+ idempotentHint: true,
30
+ openWorldHint: true,
31
+ },
20
32
  },
21
33
  ];
22
34
  export function registerTools(server) {
@@ -26,6 +38,7 @@ export function registerTools(server) {
26
38
  description: tool.description,
27
39
  inputSchema: tool.inputSchema,
28
40
  outputSchema: tool.outputSchema,
41
+ annotations: tool.annotations,
29
42
  }, tool.handler);
30
43
  }
31
44
  }
@@ -1,70 +1,46 @@
1
1
  import { z } from 'zod';
2
2
  export declare const fetchUrlInputSchema: z.ZodObject<{
3
- customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
4
- timeout: z.ZodDefault<z.ZodNumber>;
5
- retries: z.ZodDefault<z.ZodNumber>;
6
- } & {
7
- url: z.ZodString;
8
- } & {
3
+ format: z.ZodDefault<z.ZodEnum<{
4
+ jsonl: "jsonl";
5
+ markdown: "markdown";
6
+ }>>;
7
+ includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
9
8
  extractMainContent: z.ZodDefault<z.ZodBoolean>;
10
9
  includeMetadata: z.ZodDefault<z.ZodBoolean>;
11
10
  maxContentLength: z.ZodOptional<z.ZodNumber>;
12
- } & {
13
- format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
14
- }, "strict", z.ZodTypeAny, {
15
- url: string;
16
- timeout: number;
17
- retries: number;
18
- extractMainContent: boolean;
19
- includeMetadata: boolean;
20
- format: "jsonl" | "markdown";
21
- customHeaders?: Record<string, string> | undefined;
22
- maxContentLength?: number | undefined;
23
- }, {
24
- url: string;
25
- customHeaders?: Record<string, string> | undefined;
26
- timeout?: number | undefined;
27
- retries?: number | undefined;
28
- extractMainContent?: boolean | undefined;
29
- includeMetadata?: boolean | undefined;
30
- maxContentLength?: number | undefined;
31
- format?: "jsonl" | "markdown" | undefined;
32
- }>;
33
- export declare const fetchMarkdownInputSchema: z.ZodObject<{
34
- customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
11
+ url: z.ZodURL;
12
+ customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
35
13
  timeout: z.ZodDefault<z.ZodNumber>;
36
14
  retries: z.ZodDefault<z.ZodNumber>;
37
- } & {
38
- url: z.ZodString;
39
- } & {
15
+ }, z.core.$strict>;
16
+ export declare const fetchMarkdownInputSchema: z.ZodObject<{
40
17
  extractMainContent: z.ZodDefault<z.ZodBoolean>;
41
18
  includeMetadata: z.ZodDefault<z.ZodBoolean>;
42
19
  maxContentLength: z.ZodOptional<z.ZodNumber>;
43
- }, "strict", z.ZodTypeAny, {
44
- url: string;
45
- timeout: number;
46
- retries: number;
47
- extractMainContent: boolean;
48
- includeMetadata: boolean;
49
- customHeaders?: Record<string, string> | undefined;
50
- maxContentLength?: number | undefined;
51
- }, {
52
- url: string;
53
- customHeaders?: Record<string, string> | undefined;
54
- timeout?: number | undefined;
55
- retries?: number | undefined;
56
- extractMainContent?: boolean | undefined;
57
- includeMetadata?: boolean | undefined;
58
- maxContentLength?: number | undefined;
59
- }>;
20
+ url: z.ZodURL;
21
+ customHeaders: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
22
+ timeout: z.ZodDefault<z.ZodNumber>;
23
+ retries: z.ZodDefault<z.ZodNumber>;
24
+ }, z.core.$strict>;
60
25
  export declare const fetchUrlOutputSchema: z.ZodObject<{
26
+ contentSize: z.ZodOptional<z.ZodNumber>;
27
+ resourceUri: z.ZodOptional<z.ZodString>;
28
+ resourceMimeType: z.ZodOptional<z.ZodString>;
29
+ cached: z.ZodBoolean;
30
+ truncated: z.ZodOptional<z.ZodBoolean>;
31
+ error: z.ZodOptional<z.ZodString>;
32
+ errorCode: z.ZodOptional<z.ZodString>;
61
33
  url: z.ZodString;
62
34
  title: z.ZodOptional<z.ZodString>;
63
35
  contentBlocks: z.ZodNumber;
64
36
  fetchedAt: z.ZodString;
65
- format: z.ZodEnum<["jsonl", "markdown"]>;
37
+ format: z.ZodEnum<{
38
+ jsonl: "jsonl";
39
+ markdown: "markdown";
40
+ }>;
66
41
  content: z.ZodOptional<z.ZodString>;
67
- } & {
42
+ }, z.core.$strict>;
43
+ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
68
44
  contentSize: z.ZodOptional<z.ZodNumber>;
69
45
  resourceUri: z.ZodOptional<z.ZodString>;
70
46
  resourceMimeType: z.ZodOptional<z.ZodString>;
@@ -72,36 +48,6 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
72
48
  truncated: z.ZodOptional<z.ZodBoolean>;
73
49
  error: z.ZodOptional<z.ZodString>;
74
50
  errorCode: z.ZodOptional<z.ZodString>;
75
- }, "strict", z.ZodTypeAny, {
76
- url: string;
77
- format: "jsonl" | "markdown";
78
- contentBlocks: number;
79
- fetchedAt: string;
80
- cached: boolean;
81
- error?: string | undefined;
82
- title?: string | undefined;
83
- content?: string | undefined;
84
- contentSize?: number | undefined;
85
- resourceUri?: string | undefined;
86
- resourceMimeType?: string | undefined;
87
- truncated?: boolean | undefined;
88
- errorCode?: string | undefined;
89
- }, {
90
- url: string;
91
- format: "jsonl" | "markdown";
92
- contentBlocks: number;
93
- fetchedAt: string;
94
- cached: boolean;
95
- error?: string | undefined;
96
- title?: string | undefined;
97
- content?: string | undefined;
98
- contentSize?: number | undefined;
99
- resourceUri?: string | undefined;
100
- resourceMimeType?: string | undefined;
101
- truncated?: boolean | undefined;
102
- errorCode?: string | undefined;
103
- }>;
104
- export declare const fetchMarkdownOutputSchema: z.ZodObject<{
105
51
  url: z.ZodString;
106
52
  title: z.ZodOptional<z.ZodString>;
107
53
  fetchedAt: z.ZodString;
@@ -110,55 +56,5 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
110
56
  downloadUrl: z.ZodString;
111
57
  fileName: z.ZodString;
112
58
  expiresAt: z.ZodString;
113
- }, "strip", z.ZodTypeAny, {
114
- downloadUrl: string;
115
- fileName: string;
116
- expiresAt: string;
117
- }, {
118
- downloadUrl: string;
119
- fileName: string;
120
- expiresAt: string;
121
- }>>;
122
- } & {
123
- contentSize: z.ZodOptional<z.ZodNumber>;
124
- resourceUri: z.ZodOptional<z.ZodString>;
125
- resourceMimeType: z.ZodOptional<z.ZodString>;
126
- cached: z.ZodBoolean;
127
- truncated: z.ZodOptional<z.ZodBoolean>;
128
- error: z.ZodOptional<z.ZodString>;
129
- errorCode: z.ZodOptional<z.ZodString>;
130
- }, "strict", z.ZodTypeAny, {
131
- url: string;
132
- fetchedAt: string;
133
- cached: boolean;
134
- error?: string | undefined;
135
- markdown?: string | undefined;
136
- title?: string | undefined;
137
- contentSize?: number | undefined;
138
- resourceUri?: string | undefined;
139
- resourceMimeType?: string | undefined;
140
- truncated?: boolean | undefined;
141
- errorCode?: string | undefined;
142
- file?: {
143
- downloadUrl: string;
144
- fileName: string;
145
- expiresAt: string;
146
- } | undefined;
147
- }, {
148
- url: string;
149
- fetchedAt: string;
150
- cached: boolean;
151
- error?: string | undefined;
152
- markdown?: string | undefined;
153
- title?: string | undefined;
154
- contentSize?: number | undefined;
155
- resourceUri?: string | undefined;
156
- resourceMimeType?: string | undefined;
157
- truncated?: boolean | undefined;
158
- errorCode?: string | undefined;
159
- file?: {
160
- downloadUrl: string;
161
- fileName: string;
162
- expiresAt: string;
163
- } | undefined;
164
- }>;
59
+ }, z.core.$strip>>;
60
+ }, z.core.$strict>;
@@ -7,7 +7,7 @@ const MAX_CONTENT_LENGTH = config.constants.maxContentSize;
7
7
  const customHeadersSchema = z
8
8
  .record(z.string().max(MAX_HEADER_NAME_LENGTH), z.string().max(MAX_HEADER_VALUE_LENGTH))
9
9
  .refine((headers) => Object.keys(headers).length <= MAX_HEADER_COUNT, {
10
- message: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
10
+ error: `customHeaders must have at most ${MAX_HEADER_COUNT} entries`,
11
11
  });
12
12
  const requestOptionsSchema = z.object({
13
13
  customHeaders: customHeadersSchema
@@ -47,6 +47,10 @@ const formatOptionsSchema = z.object({
47
47
  .enum(['jsonl', 'markdown'])
48
48
  .default('jsonl')
49
49
  .describe('Output format'),
50
+ includeContentBlocks: z
51
+ .boolean()
52
+ .optional()
53
+ .describe('Include content block counts when format=markdown'),
50
54
  });
51
55
  const resourceFieldsSchema = z.object({
52
56
  contentSize: z.number().optional().describe('Content length in characters'),
@@ -71,29 +75,18 @@ const fileDownloadSchema = z.object({
71
75
  fileName: z.string().describe('Suggested filename for download'),
72
76
  expiresAt: z.string().describe('ISO timestamp when download expires'),
73
77
  });
74
- export const fetchUrlInputSchema = requestOptionsSchema
75
- .extend({
76
- url: z
77
- .string()
78
- .min(1)
79
- .max(config.constants.maxUrlLength)
80
- .describe('The URL to fetch'),
81
- })
82
- .merge(extractionOptionsSchema)
83
- .merge(formatOptionsSchema)
84
- .strict();
85
- export const fetchMarkdownInputSchema = requestOptionsSchema
86
- .extend({
87
- url: z
88
- .string()
89
- .min(1)
90
- .max(config.constants.maxUrlLength)
91
- .describe('The URL to fetch'),
92
- })
93
- .merge(extractionOptionsSchema)
94
- .strict();
95
- export const fetchUrlOutputSchema = z
96
- .object({
78
+ export const fetchUrlInputSchema = z.strictObject({
79
+ ...requestOptionsSchema.shape,
80
+ url: z.url({ protocol: /^https?:$/i }).describe('The URL to fetch'),
81
+ ...extractionOptionsSchema.shape,
82
+ ...formatOptionsSchema.shape,
83
+ });
84
+ export const fetchMarkdownInputSchema = z.strictObject({
85
+ ...requestOptionsSchema.shape,
86
+ url: z.url({ protocol: /^https?:$/i }).describe('The URL to fetch'),
87
+ ...extractionOptionsSchema.shape,
88
+ });
89
+ export const fetchUrlOutputSchema = z.strictObject({
97
90
  url: z.string().describe('The fetched URL'),
98
91
  title: z.string().optional().describe('Page title'),
99
92
  contentBlocks: z
@@ -107,11 +100,9 @@ export const fetchUrlOutputSchema = z
107
100
  .string()
108
101
  .optional()
109
102
  .describe('The extracted content in JSONL or Markdown format'),
110
- })
111
- .merge(resourceFieldsSchema)
112
- .strict();
113
- export const fetchMarkdownOutputSchema = z
114
- .object({
103
+ ...resourceFieldsSchema.shape,
104
+ });
105
+ export const fetchMarkdownOutputSchema = z.strictObject({
115
106
  url: z.string().describe('The fetched URL'),
116
107
  title: z.string().optional().describe('Page title'),
117
108
  fetchedAt: z
@@ -124,6 +115,5 @@ export const fetchMarkdownOutputSchema = z
124
115
  file: fileDownloadSchema
125
116
  .optional()
126
117
  .describe('Download information when content is cached'),
127
- })
128
- .merge(resourceFieldsSchema)
129
- .strict();
118
+ ...resourceFieldsSchema.shape,
119
+ });
@@ -6,22 +6,26 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
6
6
  if (!includeMetadata)
7
7
  return undefined;
8
8
  const now = new Date().toISOString();
9
- return shouldExtractFromArticle && article
10
- ? {
11
- type: 'metadata',
12
- title: article.title,
13
- author: article.byline,
14
- url,
15
- fetchedAt: now,
16
- }
17
- : {
18
- type: 'metadata',
19
- title: extractedMeta.title,
20
- description: extractedMeta.description,
21
- author: extractedMeta.author,
22
- url,
23
- fetchedAt: now,
24
- };
9
+ const metadata = {
10
+ type: 'metadata',
11
+ url,
12
+ fetchedAt: now,
13
+ };
14
+ if (shouldExtractFromArticle && article) {
15
+ if (article.title !== undefined)
16
+ metadata.title = article.title;
17
+ if (article.byline !== undefined)
18
+ metadata.author = article.byline;
19
+ return metadata;
20
+ }
21
+ if (extractedMeta.title !== undefined)
22
+ metadata.title = extractedMeta.title;
23
+ if (extractedMeta.description !== undefined) {
24
+ metadata.description = extractedMeta.description;
25
+ }
26
+ if (extractedMeta.author !== undefined)
27
+ metadata.author = extractedMeta.author;
28
+ return metadata;
25
29
  }
26
30
  export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
27
31
  if (maxLength === undefined ||
@@ -0,0 +1,6 @@
1
+ import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
2
+ export declare function transformHtmlToJsonlAsync(html: string, url: string, options: TransformOptions): Promise<JsonlTransformResult>;
3
+ export declare function transformHtmlToMarkdownAsync(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
4
+ export declare function transformHtmlToMarkdownWithBlocksAsync(html: string, url: string, options: TransformOptions & {
5
+ includeContentBlocks?: boolean;
6
+ }): Promise<JsonlTransformResult>;
@@ -0,0 +1,33 @@
1
+ import { logWarn } from '../../services/logger.js';
2
+ import { runTransformInWorker, } from '../../services/transform-worker-pool.js';
3
+ import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from './content-transform.js';
4
+ async function runOrFallback(job, fallback) {
5
+ try {
6
+ const result = await runTransformInWorker(job);
7
+ if (result)
8
+ return result;
9
+ }
10
+ catch (error) {
11
+ logWarn('Transform worker unavailable; using main thread', {
12
+ error: error instanceof Error ? error.message : String(error),
13
+ });
14
+ }
15
+ return fallback();
16
+ }
17
+ export async function transformHtmlToJsonlAsync(html, url, options) {
18
+ const result = await runOrFallback({ mode: 'jsonl', html, url, options }, () => transformHtmlToJsonl(html, url, options));
19
+ return result;
20
+ }
21
+ export async function transformHtmlToMarkdownAsync(html, url, options) {
22
+ const result = await runOrFallback({ mode: 'markdown', html, url, options }, () => transformHtmlToMarkdown(html, url, options));
23
+ return result;
24
+ }
25
+ export async function transformHtmlToMarkdownWithBlocksAsync(html, url, options) {
26
+ const result = await runOrFallback({
27
+ mode: 'markdown-blocks',
28
+ html,
29
+ url,
30
+ options,
31
+ }, () => transformHtmlToMarkdownWithBlocks(html, url, options));
32
+ return result;
33
+ }
@@ -8,7 +8,10 @@ interface ContentLengthOptions {
8
8
  }
9
9
  interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
10
10
  }
11
+ interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
12
+ readonly includeContentBlocks?: boolean;
13
+ }
11
14
  export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
12
15
  export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
13
- export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
16
+ export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
14
17
  export {};