@j0hanz/superfetch 1.2.2 → 1.2.3

This diff shows the changes between publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +60 -45
  2. package/dist/config/formatting.d.ts +1 -1
  3. package/dist/config/types/content.d.ts +3 -3
  4. package/dist/config/types/runtime.d.ts +1 -1
  5. package/dist/config/types/tools.d.ts +12 -12
  6. package/dist/http/cors.js +23 -23
  7. package/dist/http/download-routes.js +9 -4
  8. package/dist/http/mcp-routes.js +2 -13
  9. package/dist/http/mcp-validation.js +1 -1
  10. package/dist/http/server-middleware.js +2 -1
  11. package/dist/http/server.js +2 -0
  12. package/dist/index.js +5 -0
  13. package/dist/middleware/error-handler.js +1 -1
  14. package/dist/resources/cached-content.js +8 -4
  15. package/dist/server.js +2 -0
  16. package/dist/services/cache.d.ts +1 -1
  17. package/dist/services/cache.js +20 -7
  18. package/dist/services/context.d.ts +2 -4
  19. package/dist/services/context.js +1 -1
  20. package/dist/services/extractor.js +26 -21
  21. package/dist/services/fetcher/interceptors.d.ts +22 -0
  22. package/dist/services/fetcher/interceptors.js +18 -8
  23. package/dist/services/fetcher/response.js +32 -24
  24. package/dist/services/fetcher.d.ts +0 -1
  25. package/dist/services/fetcher.js +5 -7
  26. package/dist/services/metadata-collector.d.ts +10 -0
  27. package/dist/services/metadata-collector.js +11 -0
  28. package/dist/services/parser.js +26 -25
  29. package/dist/services/transform-worker-pool.d.ts +14 -0
  30. package/dist/services/transform-worker-pool.js +167 -0
  31. package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
  32. package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
  33. package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
  34. package/dist/tools/handlers/fetch-single.shared.js +42 -17
  35. package/dist/tools/handlers/fetch-url.tool.js +46 -16
  36. package/dist/tools/index.js +13 -0
  37. package/dist/tools/schemas.d.ts +33 -30
  38. package/dist/tools/schemas.js +4 -0
  39. package/dist/tools/utils/common.js +20 -16
  40. package/dist/tools/utils/content-transform-async.d.ts +6 -0
  41. package/dist/tools/utils/content-transform-async.js +33 -0
  42. package/dist/tools/utils/content-transform.d.ts +4 -1
  43. package/dist/tools/utils/content-transform.js +7 -2
  44. package/dist/tools/utils/fetch-pipeline.js +18 -10
  45. package/dist/utils/content-cleaner.d.ts +1 -1
  46. package/dist/utils/download-url.d.ts +9 -1
  47. package/dist/utils/download-url.js +9 -6
  48. package/dist/utils/tool-error-handler.d.ts +2 -2
  49. package/dist/utils/tool-error-handler.js +7 -7
  50. package/dist/utils/url-validator.js +38 -0
  51. package/dist/workers/transform-worker.d.ts +1 -0
  52. package/dist/workers/transform-worker.js +50 -0
  53. package/package.json +4 -6
@@ -1,5 +1,6 @@
1
1
  import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
2
2
  import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
3
+ import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
3
4
  import { applyInlineContentLimit } from '../utils/inline-content.js';
4
5
  type SharedFetchFormat = 'jsonl' | 'markdown';
5
6
  interface SharedFetchOptions<T extends {
@@ -10,17 +11,21 @@ interface SharedFetchOptions<T extends {
10
11
  readonly extractMainContent: boolean;
11
12
  readonly includeMetadata: boolean;
12
13
  readonly maxContentLength?: number;
14
+ readonly includeContentBlocks?: boolean;
13
15
  readonly cacheVariant?: string;
14
16
  readonly customHeaders?: Record<string, string>;
15
17
  readonly retries?: number;
16
18
  readonly timeout?: number;
17
- readonly transform: (html: string, normalizedUrl: string) => T;
19
+ readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
18
20
  readonly serialize?: (result: T) => string;
19
21
  readonly deserialize?: (cached: string) => T | undefined;
20
22
  }
23
+ interface SharedFetchDeps {
24
+ readonly executeFetchPipeline?: typeof executeFetchPipeline;
25
+ }
21
26
  export declare function performSharedFetch<T extends {
22
27
  content: string;
23
- }>(options: SharedFetchOptions<T>): Promise<{
28
+ }>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
24
29
  pipeline: PipelineResult<T>;
25
30
  inlineResult: ReturnType<typeof applyInlineContentLimit>;
26
31
  }>;
@@ -31,7 +36,7 @@ interface DownloadContext {
31
36
  title?: string;
32
37
  }
33
38
  export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
34
- export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string): ToolResponseBase | null;
39
+ export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
35
40
  export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
36
41
  export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
37
42
  export {};
@@ -5,7 +5,8 @@ import { createToolErrorResponse } from '../../utils/tool-error-handler.js';
5
5
  import { appendHeaderVary } from '../utils/cache-vary.js';
6
6
  import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
7
7
  import { applyInlineContentLimit } from '../utils/inline-content.js';
8
- export async function performSharedFetch(options) {
8
+ export async function performSharedFetch(options, deps = {}) {
9
+ const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
9
10
  const cacheNamespace = options.format === 'markdown' ? 'markdown' : 'url';
10
11
  const cacheVary = appendHeaderVary({
11
12
  format: options.format,
@@ -13,33 +14,54 @@ export async function performSharedFetch(options) {
13
14
  includeMetadata: options.includeMetadata,
14
15
  maxContentLength: options.maxContentLength,
15
16
  ...(options.cacheVariant ? { variant: options.cacheVariant } : {}),
16
- ...(options.format === 'markdown' ? {} : { contentBlocks: true }),
17
+ ...(options.format === 'markdown'
18
+ ? { includeContentBlocks: options.includeContentBlocks }
19
+ : { contentBlocks: true }),
17
20
  }, options.customHeaders);
18
- const pipeline = await executeFetchPipeline({
21
+ const pipelineOptions = {
19
22
  url: options.url,
20
23
  cacheNamespace,
21
- customHeaders: options.customHeaders,
22
- retries: options.retries,
23
- timeout: options.timeout,
24
- cacheVary,
25
24
  transform: options.transform,
26
- serialize: options.serialize,
27
- deserialize: options.deserialize,
28
- });
25
+ };
26
+ if (options.customHeaders !== undefined) {
27
+ pipelineOptions.customHeaders = options.customHeaders;
28
+ }
29
+ if (options.retries !== undefined) {
30
+ pipelineOptions.retries = options.retries;
31
+ }
32
+ if (options.timeout !== undefined) {
33
+ pipelineOptions.timeout = options.timeout;
34
+ }
35
+ if (cacheVary !== undefined) {
36
+ pipelineOptions.cacheVary = cacheVary;
37
+ }
38
+ if (options.serialize !== undefined) {
39
+ pipelineOptions.serialize = options.serialize;
40
+ }
41
+ if (options.deserialize !== undefined) {
42
+ pipelineOptions.deserialize = options.deserialize;
43
+ }
44
+ const pipeline = await executePipeline(pipelineOptions);
29
45
  const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null, options.format);
30
46
  return { pipeline, inlineResult };
31
47
  }
32
48
  export function getFileDownloadInfo(context) {
33
- return buildFileDownloadInfo({
49
+ const infoOptions = {
34
50
  cacheKey: context.cacheKey,
35
51
  url: context.url,
36
- title: context.title,
37
- });
52
+ };
53
+ if (context.title !== undefined) {
54
+ return buildFileDownloadInfo({
55
+ ...infoOptions,
56
+ title: context.title,
57
+ });
58
+ }
59
+ return buildFileDownloadInfo(infoOptions);
38
60
  }
39
- export function getInlineErrorResponse(inlineResult, url) {
61
+ export function getInlineErrorResponse(inlineResult, url, details) {
40
62
  if (!inlineResult.error)
41
63
  return null;
42
- return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR');
64
+ return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR', details);
43
65
  }
44
66
  export function applyInlineResultToStructuredContent(structuredContent, inlineResult, contentKey) {
45
67
  if (inlineResult.truncated) {
@@ -60,13 +82,16 @@ function buildResourceLink(inlineResult, name) {
60
82
  if (!inlineResult.resourceUri) {
61
83
  return null;
62
84
  }
63
- return {
85
+ const block = {
64
86
  type: 'resource_link',
65
87
  uri: inlineResult.resourceUri,
66
88
  name,
67
- mimeType: inlineResult.resourceMimeType,
68
89
  description: `Content exceeds inline limit (${config.constants.maxInlineContentChars} chars)`,
69
90
  };
91
+ if (inlineResult.resourceMimeType !== undefined) {
92
+ block.mimeType = inlineResult.resourceMimeType;
93
+ }
94
+ return block;
70
95
  }
71
96
  function buildEmbeddedResource(content, mimeType, url, title) {
72
97
  if (!content) {
@@ -1,7 +1,7 @@
1
1
  import { config } from '../../config/index.js';
2
2
  import { logDebug, logError } from '../../services/logger.js';
3
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
4
- import { transformHtmlToJsonl, transformHtmlToMarkdownWithBlocks, } from '../utils/content-transform.js';
4
+ import { transformHtmlToJsonlAsync, transformHtmlToMarkdownWithBlocksAsync, } from '../utils/content-transform-async.js';
5
5
  import { applyInlineResultToStructuredContent, buildToolContentBlocks, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
7
7
  export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. Supports custom headers, retries, and content length limits.';
@@ -37,17 +37,36 @@ function deserializeJsonlTransformResult(cached) {
37
37
  }
38
38
  }
39
39
  function resolveFetchUrlOptions(input) {
40
+ const format = input.format ?? 'jsonl';
40
41
  return {
41
42
  extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
42
43
  includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
43
- maxContentLength: input.maxContentLength,
44
- format: input.format ?? 'jsonl',
44
+ format,
45
+ includeContentBlocks: input.includeContentBlocks ?? (format === 'markdown' ? false : true),
46
+ ...(input.maxContentLength !== undefined && {
47
+ maxContentLength: input.maxContentLength,
48
+ }),
49
+ };
50
+ }
51
+ function buildFetchUrlErrorDetails(format) {
52
+ return {
53
+ contentBlocks: 0,
54
+ fetchedAt: new Date().toISOString(),
55
+ format,
56
+ cached: false,
45
57
  };
46
58
  }
47
59
  function buildFetchUrlTransform(options) {
48
- return (html, url) => options.format === 'markdown'
49
- ? transformHtmlToMarkdownWithBlocks(html, url, options)
50
- : transformHtmlToJsonl(html, url, options);
60
+ return async (html, url) => options.format === 'markdown'
61
+ ? transformHtmlToMarkdownWithBlocksAsync(html, url, {
62
+ extractMainContent: options.extractMainContent,
63
+ includeMetadata: options.includeMetadata,
64
+ ...(options.maxContentLength !== undefined && {
65
+ maxContentLength: options.maxContentLength,
66
+ }),
67
+ includeContentBlocks: options.includeContentBlocks,
68
+ })
69
+ : transformHtmlToJsonlAsync(html, url, options);
51
70
  }
52
71
  function buildFetchUrlStructuredContent(format, pipeline, inlineResult) {
53
72
  const structuredContent = {
@@ -74,22 +93,31 @@ function logFetchUrlStart(url, options) {
74
93
  extractMainContent: options.extractMainContent,
75
94
  includeMetadata: options.includeMetadata,
76
95
  format: options.format,
96
+ includeContentBlocks: options.includeContentBlocks,
77
97
  });
78
98
  }
79
99
  async function fetchUrlPipeline(url, input, options) {
80
- return performSharedFetch({
100
+ const sharedOptions = {
81
101
  url,
82
102
  format: options.format,
83
103
  extractMainContent: options.extractMainContent,
84
104
  includeMetadata: options.includeMetadata,
85
- maxContentLength: options.maxContentLength,
86
- customHeaders: input.customHeaders,
87
- retries: input.retries,
88
- timeout: input.timeout,
89
- cacheVariant: options.format === 'markdown' ? 'markdown-with-blocks' : undefined,
105
+ includeContentBlocks: options.includeContentBlocks,
106
+ ...(options.maxContentLength !== undefined && {
107
+ maxContentLength: options.maxContentLength,
108
+ }),
109
+ ...(input.customHeaders !== undefined && {
110
+ customHeaders: input.customHeaders,
111
+ }),
112
+ ...(input.retries !== undefined && { retries: input.retries }),
113
+ ...(input.timeout !== undefined && { timeout: input.timeout }),
114
+ ...(options.format === 'markdown' && {
115
+ cacheVariant: 'markdown-with-blocks',
116
+ }),
90
117
  transform: buildFetchUrlTransform(options),
91
118
  deserialize: deserializeJsonlTransformResult,
92
- });
119
+ };
120
+ return performSharedFetch(sharedOptions);
93
121
  }
94
122
  function buildFetchUrlResponse(pipeline, inlineResult, format) {
95
123
  const structuredContent = buildFetchUrlStructuredContent(format, pipeline, inlineResult);
@@ -104,18 +132,20 @@ export async function fetchUrlToolHandler(input) {
104
132
  }
105
133
  catch (error) {
106
134
  logError('fetch-url tool error', error instanceof Error ? error : undefined);
107
- return handleToolError(error, input.url, 'Failed to fetch URL');
135
+ const errorDetails = buildFetchUrlErrorDetails(input.format ?? 'jsonl');
136
+ return handleToolError(error, input.url, 'Failed to fetch URL', errorDetails);
108
137
  }
109
138
  }
110
139
  async function executeFetchUrl(input) {
111
140
  const { url } = input;
141
+ const format = input.format ?? 'jsonl';
112
142
  if (!url) {
113
- return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
143
+ return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchUrlErrorDetails(format));
114
144
  }
115
145
  const options = resolveFetchUrlOptions(input);
116
146
  logFetchUrlStart(url, options);
117
147
  const { pipeline, inlineResult } = await fetchUrlPipeline(url, input, options);
118
- const inlineError = getInlineErrorResponse(inlineResult, url);
148
+ const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchUrlErrorDetails(options.format));
119
149
  if (inlineError)
120
150
  return inlineError;
121
151
  return buildFetchUrlResponse(pipeline, inlineResult, options.format);
@@ -9,6 +9,12 @@ const TOOL_DEFINITIONS = [
9
9
  inputSchema: fetchUrlInputSchema,
10
10
  outputSchema: fetchUrlOutputSchema,
11
11
  handler: fetchUrlToolHandler,
12
+ annotations: {
13
+ readOnlyHint: true,
14
+ destructiveHint: false,
15
+ idempotentHint: true,
16
+ openWorldHint: true,
17
+ },
12
18
  },
13
19
  {
14
20
  name: FETCH_MARKDOWN_TOOL_NAME,
@@ -17,6 +23,12 @@ const TOOL_DEFINITIONS = [
17
23
  inputSchema: fetchMarkdownInputSchema,
18
24
  outputSchema: fetchMarkdownOutputSchema,
19
25
  handler: fetchMarkdownToolHandler,
26
+ annotations: {
27
+ readOnlyHint: true,
28
+ destructiveHint: false,
29
+ idempotentHint: true,
30
+ openWorldHint: true,
31
+ },
20
32
  },
21
33
  ];
22
34
  export function registerTools(server) {
@@ -26,6 +38,7 @@ export function registerTools(server) {
26
38
  description: tool.description,
27
39
  inputSchema: tool.inputSchema,
28
40
  outputSchema: tool.outputSchema,
41
+ annotations: tool.annotations,
29
42
  }, tool.handler);
30
43
  }
31
44
  }
@@ -11,24 +11,27 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
11
11
  maxContentLength: z.ZodOptional<z.ZodNumber>;
12
12
  } & {
13
13
  format: z.ZodDefault<z.ZodEnum<["jsonl", "markdown"]>>;
14
+ includeContentBlocks: z.ZodOptional<z.ZodBoolean>;
14
15
  }, "strict", z.ZodTypeAny, {
15
16
  url: string;
16
- timeout: number;
17
- retries: number;
18
17
  extractMainContent: boolean;
19
18
  includeMetadata: boolean;
19
+ retries: number;
20
20
  format: "jsonl" | "markdown";
21
- customHeaders?: Record<string, string> | undefined;
21
+ timeout: number;
22
22
  maxContentLength?: number | undefined;
23
+ includeContentBlocks?: boolean | undefined;
24
+ customHeaders?: Record<string, string> | undefined;
23
25
  }, {
24
26
  url: string;
25
- customHeaders?: Record<string, string> | undefined;
26
- timeout?: number | undefined;
27
- retries?: number | undefined;
28
27
  extractMainContent?: boolean | undefined;
29
28
  includeMetadata?: boolean | undefined;
30
29
  maxContentLength?: number | undefined;
30
+ retries?: number | undefined;
31
31
  format?: "jsonl" | "markdown" | undefined;
32
+ includeContentBlocks?: boolean | undefined;
33
+ timeout?: number | undefined;
34
+ customHeaders?: Record<string, string> | undefined;
32
35
  }>;
33
36
  export declare const fetchMarkdownInputSchema: z.ZodObject<{
34
37
  customHeaders: z.ZodOptional<z.ZodEffects<z.ZodRecord<z.ZodString, z.ZodString>, Record<string, string>, Record<string, string>>>;
@@ -42,20 +45,20 @@ export declare const fetchMarkdownInputSchema: z.ZodObject<{
42
45
  maxContentLength: z.ZodOptional<z.ZodNumber>;
43
46
  }, "strict", z.ZodTypeAny, {
44
47
  url: string;
45
- timeout: number;
46
- retries: number;
47
48
  extractMainContent: boolean;
48
49
  includeMetadata: boolean;
49
- customHeaders?: Record<string, string> | undefined;
50
+ retries: number;
51
+ timeout: number;
50
52
  maxContentLength?: number | undefined;
53
+ customHeaders?: Record<string, string> | undefined;
51
54
  }, {
52
55
  url: string;
53
- customHeaders?: Record<string, string> | undefined;
54
- timeout?: number | undefined;
55
- retries?: number | undefined;
56
56
  extractMainContent?: boolean | undefined;
57
57
  includeMetadata?: boolean | undefined;
58
58
  maxContentLength?: number | undefined;
59
+ retries?: number | undefined;
60
+ timeout?: number | undefined;
61
+ customHeaders?: Record<string, string> | undefined;
59
62
  }>;
60
63
  export declare const fetchUrlOutputSchema: z.ZodObject<{
61
64
  url: z.ZodString;
@@ -74,31 +77,31 @@ export declare const fetchUrlOutputSchema: z.ZodObject<{
74
77
  errorCode: z.ZodOptional<z.ZodString>;
75
78
  }, "strict", z.ZodTypeAny, {
76
79
  url: string;
80
+ fetchedAt: string;
77
81
  format: "jsonl" | "markdown";
78
82
  contentBlocks: number;
79
- fetchedAt: string;
80
83
  cached: boolean;
81
84
  error?: string | undefined;
82
85
  title?: string | undefined;
83
- content?: string | undefined;
84
- contentSize?: number | undefined;
86
+ truncated?: boolean | undefined;
85
87
  resourceUri?: string | undefined;
86
88
  resourceMimeType?: string | undefined;
87
- truncated?: boolean | undefined;
89
+ content?: string | undefined;
90
+ contentSize?: number | undefined;
88
91
  errorCode?: string | undefined;
89
92
  }, {
90
93
  url: string;
94
+ fetchedAt: string;
91
95
  format: "jsonl" | "markdown";
92
96
  contentBlocks: number;
93
- fetchedAt: string;
94
97
  cached: boolean;
95
98
  error?: string | undefined;
96
99
  title?: string | undefined;
97
- content?: string | undefined;
98
- contentSize?: number | undefined;
100
+ truncated?: boolean | undefined;
99
101
  resourceUri?: string | undefined;
100
102
  resourceMimeType?: string | undefined;
101
- truncated?: boolean | undefined;
103
+ content?: string | undefined;
104
+ contentSize?: number | undefined;
102
105
  errorCode?: string | undefined;
103
106
  }>;
104
107
  export declare const fetchMarkdownOutputSchema: z.ZodObject<{
@@ -111,13 +114,13 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
111
114
  fileName: z.ZodString;
112
115
  expiresAt: z.ZodString;
113
116
  }, "strip", z.ZodTypeAny, {
117
+ expiresAt: string;
114
118
  downloadUrl: string;
115
119
  fileName: string;
116
- expiresAt: string;
117
120
  }, {
121
+ expiresAt: string;
118
122
  downloadUrl: string;
119
123
  fileName: string;
120
- expiresAt: string;
121
124
  }>>;
122
125
  } & {
123
126
  contentSize: z.ZodOptional<z.ZodNumber>;
@@ -134,16 +137,16 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
134
137
  error?: string | undefined;
135
138
  markdown?: string | undefined;
136
139
  title?: string | undefined;
137
- contentSize?: number | undefined;
140
+ truncated?: boolean | undefined;
138
141
  resourceUri?: string | undefined;
139
142
  resourceMimeType?: string | undefined;
140
- truncated?: boolean | undefined;
141
- errorCode?: string | undefined;
143
+ contentSize?: number | undefined;
142
144
  file?: {
145
+ expiresAt: string;
143
146
  downloadUrl: string;
144
147
  fileName: string;
145
- expiresAt: string;
146
148
  } | undefined;
149
+ errorCode?: string | undefined;
147
150
  }, {
148
151
  url: string;
149
152
  fetchedAt: string;
@@ -151,14 +154,14 @@ export declare const fetchMarkdownOutputSchema: z.ZodObject<{
151
154
  error?: string | undefined;
152
155
  markdown?: string | undefined;
153
156
  title?: string | undefined;
154
- contentSize?: number | undefined;
157
+ truncated?: boolean | undefined;
155
158
  resourceUri?: string | undefined;
156
159
  resourceMimeType?: string | undefined;
157
- truncated?: boolean | undefined;
158
- errorCode?: string | undefined;
160
+ contentSize?: number | undefined;
159
161
  file?: {
162
+ expiresAt: string;
160
163
  downloadUrl: string;
161
164
  fileName: string;
162
- expiresAt: string;
163
165
  } | undefined;
166
+ errorCode?: string | undefined;
164
167
  }>;
@@ -47,6 +47,10 @@ const formatOptionsSchema = z.object({
47
47
  .enum(['jsonl', 'markdown'])
48
48
  .default('jsonl')
49
49
  .describe('Output format'),
50
+ includeContentBlocks: z
51
+ .boolean()
52
+ .optional()
53
+ .describe('Include content block counts when format=markdown'),
50
54
  });
51
55
  const resourceFieldsSchema = z.object({
52
56
  contentSize: z.number().optional().describe('Content length in characters'),
@@ -6,22 +6,26 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
6
6
  if (!includeMetadata)
7
7
  return undefined;
8
8
  const now = new Date().toISOString();
9
- return shouldExtractFromArticle && article
10
- ? {
11
- type: 'metadata',
12
- title: article.title,
13
- author: article.byline,
14
- url,
15
- fetchedAt: now,
16
- }
17
- : {
18
- type: 'metadata',
19
- title: extractedMeta.title,
20
- description: extractedMeta.description,
21
- author: extractedMeta.author,
22
- url,
23
- fetchedAt: now,
24
- };
9
+ const metadata = {
10
+ type: 'metadata',
11
+ url,
12
+ fetchedAt: now,
13
+ };
14
+ if (shouldExtractFromArticle && article) {
15
+ if (article.title !== undefined)
16
+ metadata.title = article.title;
17
+ if (article.byline !== undefined)
18
+ metadata.author = article.byline;
19
+ return metadata;
20
+ }
21
+ if (extractedMeta.title !== undefined)
22
+ metadata.title = extractedMeta.title;
23
+ if (extractedMeta.description !== undefined) {
24
+ metadata.description = extractedMeta.description;
25
+ }
26
+ if (extractedMeta.author !== undefined)
27
+ metadata.author = extractedMeta.author;
28
+ return metadata;
25
29
  }
26
30
  export function truncateContent(content, maxLength, suffix = TRUNCATION_MARKER) {
27
31
  if (maxLength === undefined ||
@@ -0,0 +1,6 @@
1
+ import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../../config/types/content.js';
2
+ export declare function transformHtmlToJsonlAsync(html: string, url: string, options: TransformOptions): Promise<JsonlTransformResult>;
3
+ export declare function transformHtmlToMarkdownAsync(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
4
+ export declare function transformHtmlToMarkdownWithBlocksAsync(html: string, url: string, options: TransformOptions & {
5
+ includeContentBlocks?: boolean;
6
+ }): Promise<JsonlTransformResult>;
@@ -0,0 +1,33 @@
1
+ import { logWarn } from '../../services/logger.js';
2
+ import { runTransformInWorker, } from '../../services/transform-worker-pool.js';
3
+ import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from './content-transform.js';
4
+ async function runOrFallback(job, fallback) {
5
+ try {
6
+ const result = await runTransformInWorker(job);
7
+ if (result)
8
+ return result;
9
+ }
10
+ catch (error) {
11
+ logWarn('Transform worker unavailable; using main thread', {
12
+ error: error instanceof Error ? error.message : String(error),
13
+ });
14
+ }
15
+ return fallback();
16
+ }
17
+ export async function transformHtmlToJsonlAsync(html, url, options) {
18
+ const result = await runOrFallback({ mode: 'jsonl', html, url, options }, () => transformHtmlToJsonl(html, url, options));
19
+ return result;
20
+ }
21
+ export async function transformHtmlToMarkdownAsync(html, url, options) {
22
+ const result = await runOrFallback({ mode: 'markdown', html, url, options }, () => transformHtmlToMarkdown(html, url, options));
23
+ return result;
24
+ }
25
+ export async function transformHtmlToMarkdownWithBlocksAsync(html, url, options) {
26
+ const result = await runOrFallback({
27
+ mode: 'markdown-blocks',
28
+ html,
29
+ url,
30
+ options,
31
+ }, () => transformHtmlToMarkdownWithBlocks(html, url, options));
32
+ return result;
33
+ }
@@ -8,7 +8,10 @@ interface ContentLengthOptions {
8
8
  }
9
9
  interface MarkdownOptions extends ExtractionOptions, ContentLengthOptions {
10
10
  }
11
+ interface MarkdownWithBlocksOptions extends ExtractionOptions, ContentLengthOptions {
12
+ readonly includeContentBlocks?: boolean;
13
+ }
11
14
  export declare function transformHtmlToJsonl(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
12
15
  export declare function transformHtmlToMarkdown(html: string, url: string, options: MarkdownOptions): MarkdownTransformResult;
13
- export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: ExtractionOptions & ContentLengthOptions): JsonlTransformResult;
16
+ export declare function transformHtmlToMarkdownWithBlocks(html: string, url: string, options: MarkdownWithBlocksOptions): JsonlTransformResult;
14
17
  export {};
@@ -102,7 +102,10 @@ export function transformHtmlToMarkdown(html, url, options) {
102
102
  };
103
103
  }
104
104
  export function transformHtmlToMarkdownWithBlocks(html, url, options) {
105
- if (!options.extractMainContent && options.includeMetadata) {
105
+ const includeContentBlocks = options.includeContentBlocks ?? true;
106
+ if (includeContentBlocks &&
107
+ !options.extractMainContent &&
108
+ options.includeMetadata) {
106
109
  const parsed = parseHtmlWithMetadata(html);
107
110
  const context = {
108
111
  sourceHtml: html,
@@ -118,7 +121,9 @@ export function transformHtmlToMarkdownWithBlocks(html, url, options) {
118
121
  };
119
122
  }
120
123
  const context = resolveContentSource(html, url, options);
121
- const contentBlocks = parseHtml(context.sourceHtml);
124
+ const contentBlocks = includeContentBlocks
125
+ ? parseHtml(context.sourceHtml)
126
+ : [];
122
127
  const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
123
128
  return {
124
129
  content,
@@ -51,7 +51,7 @@ export async function executeFetchPipeline(options) {
51
51
  const fetchOptions = buildFetchOptions(options);
52
52
  logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
53
53
  const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
54
- const data = options.transform(html, normalizedUrl);
54
+ const data = await options.transform(html, normalizedUrl);
55
55
  if (cache.isEnabled()) {
56
56
  persistCache(cacheKey, data, options.serialize, normalizedUrl);
57
57
  }
@@ -62,20 +62,28 @@ function resolveCacheKey(options, normalizedUrl) {
62
62
  return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
63
63
  }
64
64
  function buildFetchOptions(options) {
65
- return {
66
- customHeaders: options.customHeaders,
67
- signal: options.signal,
68
- timeout: options.timeout,
69
- };
65
+ const fetchOptions = {};
66
+ if (options.customHeaders !== undefined) {
67
+ fetchOptions.customHeaders = options.customHeaders;
68
+ }
69
+ if (options.signal !== undefined) {
70
+ fetchOptions.signal = options.signal;
71
+ }
72
+ if (options.timeout !== undefined) {
73
+ fetchOptions.timeout = options.timeout;
74
+ }
75
+ return fetchOptions;
70
76
  }
71
77
  function persistCache(cacheKey, data, serialize, normalizedUrl) {
72
78
  if (!cacheKey)
73
79
  return;
74
80
  const serializer = serialize ?? JSON.stringify;
75
- cache.set(cacheKey, serializer(data), {
76
- url: normalizedUrl,
77
- title: extractTitle(data),
78
- });
81
+ const metadata = { url: normalizedUrl };
82
+ const title = extractTitle(data);
83
+ if (title !== undefined) {
84
+ metadata.title = title;
85
+ }
86
+ cache.set(cacheKey, serializer(data), metadata);
79
87
  }
80
88
  function extractTitle(value) {
81
89
  if (!value || typeof value !== 'object')
@@ -1,5 +1,5 @@
1
1
  export declare function cleanParagraph(text: string): string | null;
2
2
  export declare function cleanHeading(text: string): string | null;
3
- export declare function cleanListItems(items: string[]): string[];
3
+ export declare function cleanListItems(items: readonly string[]): readonly string[];
4
4
  export declare function cleanCodeBlock(code: string): string | null;
5
5
  export declare function removeInlineTimestamps(text: string): string;
@@ -1,8 +1,16 @@
1
+ import { config } from '../config/index.js';
1
2
  import type { FileDownloadInfo } from '../config/types/tools.js';
3
+ import * as cache from '../services/cache.js';
4
+ import { generateSafeFilename } from './filename-generator.js';
2
5
  interface DownloadInfoOptions {
3
6
  cacheKey: string | null;
4
7
  url: string;
5
8
  title?: string;
6
9
  }
7
- export declare function buildFileDownloadInfo(options: DownloadInfoOptions): FileDownloadInfo | null;
10
+ interface DownloadInfoDeps {
11
+ readonly config?: typeof config;
12
+ readonly cache?: Pick<typeof cache, 'get' | 'parseCacheKey'>;
13
+ readonly generateSafeFilename?: typeof generateSafeFilename;
14
+ }
15
+ export declare function buildFileDownloadInfo(options: DownloadInfoOptions, deps?: DownloadInfoDeps): FileDownloadInfo | null;
8
16
  export {};