@j0hanz/superfetch 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/README.md +60 -45
  2. package/dist/config/formatting.d.ts +1 -1
  3. package/dist/config/types/content.d.ts +3 -3
  4. package/dist/config/types/runtime.d.ts +1 -1
  5. package/dist/config/types/tools.d.ts +12 -12
  6. package/dist/http/cors.js +23 -23
  7. package/dist/http/download-routes.js +11 -5
  8. package/dist/http/mcp-routes.js +2 -13
  9. package/dist/http/mcp-validation.js +1 -1
  10. package/dist/http/server-middleware.js +5 -3
  11. package/dist/http/server.js +2 -0
  12. package/dist/index.js +5 -0
  13. package/dist/middleware/error-handler.js +1 -1
  14. package/dist/resources/cached-content.js +8 -4
  15. package/dist/server.js +2 -0
  16. package/dist/services/cache.d.ts +2 -1
  17. package/dist/services/cache.js +23 -7
  18. package/dist/services/context.d.ts +4 -4
  19. package/dist/services/context.js +11 -1
  20. package/dist/services/extractor.js +26 -21
  21. package/dist/services/fetcher/agents.js +55 -1
  22. package/dist/services/fetcher/interceptors.d.ts +22 -0
  23. package/dist/services/fetcher/interceptors.js +57 -26
  24. package/dist/services/fetcher/response.d.ts +1 -1
  25. package/dist/services/fetcher/response.js +37 -16
  26. package/dist/services/fetcher.d.ts +1 -1
  27. package/dist/services/fetcher.js +9 -8
  28. package/dist/services/metadata-collector.d.ts +10 -0
  29. package/dist/services/metadata-collector.js +11 -0
  30. package/dist/services/parser.d.ts +5 -1
  31. package/dist/services/parser.js +82 -11
  32. package/dist/services/transform-worker-pool.d.ts +14 -0
  33. package/dist/services/transform-worker-pool.js +167 -0
  34. package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
  35. package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
  36. package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
  37. package/dist/tools/handlers/fetch-single.shared.js +42 -17
  38. package/dist/tools/handlers/fetch-url.tool.js +46 -16
  39. package/dist/tools/index.js +13 -0
  40. package/dist/tools/schemas.d.ts +19 -16
  41. package/dist/tools/schemas.js +25 -4
  42. package/dist/tools/utils/common.js +20 -16
  43. package/dist/tools/utils/content-transform-async.d.ts +6 -0
  44. package/dist/tools/utils/content-transform-async.js +33 -0
  45. package/dist/tools/utils/content-transform.d.ts +4 -1
  46. package/dist/tools/utils/content-transform.js +37 -3
  47. package/dist/tools/utils/fetch-pipeline.js +26 -15
  48. package/dist/utils/content-cleaner.d.ts +1 -1
  49. package/dist/utils/download-url.d.ts +9 -1
  50. package/dist/utils/download-url.js +9 -6
  51. package/dist/utils/tool-error-handler.d.ts +2 -2
  52. package/dist/utils/tool-error-handler.js +7 -7
  53. package/dist/utils/url-validator.d.ts +5 -0
  54. package/dist/utils/url-validator.js +45 -3
  55. package/dist/workers/transform-worker.d.ts +1 -0
  56. package/dist/workers/transform-worker.js +50 -0
  57. package/package.json +4 -6
@@ -0,0 +1,167 @@
1
+ import os from 'node:os';
2
+ import { isMainThread, Worker } from 'node:worker_threads';
3
+ import { config } from '../config/index.js';
4
+ import { getErrorMessage } from '../utils/error-utils.js';
5
+ import { logWarn } from './logger.js';
6
/** Upper bound on the number of transform worker threads the pool may start. */
const MAX_POOL_SIZE = 4;

/**
 * Chooses how many worker threads the transform pool should start.
 *
 * Takes the parallelism reported by the OS, leaves one unit for the main
 * thread, caps the result at MAX_POOL_SIZE and never goes below one.
 *
 * @returns {number} Worker count between 1 and MAX_POOL_SIZE inclusive.
 */
function resolvePoolSize() {
    const spareParallelism = os.availableParallelism() - 1;
    const capped = spareParallelism > MAX_POOL_SIZE ? MAX_POOL_SIZE : spareParallelism;
    return capped < 1 ? 1 : capped;
}
11
/** Lazily created pool instance; null before first use and after teardown. */
let pool = null;
/** Set to true once pool creation or a job fails; workers are skipped from then on. */
let poolDisabled = false;

/**
 * Decides whether transform jobs may be handed to worker threads.
 *
 * Workers are used only on the main thread, only when the runtime config is
 * in HTTP mode, and only while the pool has not been disabled by a failure.
 *
 * @returns {boolean} Truthy when a job should be dispatched to the pool.
 */
function shouldUseWorkers() {
    if (!isMainThread) {
        // Mirrors a short-circuiting `&&` chain: config is not read off the main thread.
        return isMainThread;
    }
    const { httpMode } = config.runtime;
    if (!httpMode) {
        // Hand back the falsy config value itself, exactly as `&&` would.
        return httpMode;
    }
    return !poolDisabled;
}
16
/**
 * Locates the transform worker entry script relative to this module.
 *
 * @returns {URL} URL of `../workers/transform-worker.js` resolved against
 *   this module's own URL.
 */
function getWorkerUrl() {
    const workerModulePath = '../workers/transform-worker.js';
    return new URL(workerModulePath, import.meta.url);
}
19
/**
 * Runs a transform job on the worker pool, creating the pool on first use.
 *
 * Any failure (pool construction or the job itself) disables the pool for the
 * rest of the process, logs a warning and resolves to `null` so the caller can
 * fall back to running the transform on the main thread.
 *
 * @param {object} job - Job payload forwarded to `TransformWorkerPool#run`.
 * @returns {Promise<unknown|null>} The worker's result, or `null` when workers
 *   are not in use or the job could not be completed by a worker.
 */
export async function runTransformInWorker(job) {
    if (!shouldUseWorkers())
        return null;
    if (!pool) {
        try {
            pool = new TransformWorkerPool(getWorkerUrl(), resolvePoolSize());
        }
        catch (error) {
            poolDisabled = true;
            logWarn('Failed to initialize transform worker pool', {
                error: getErrorMessage(error),
            });
            return null;
        }
    }
    // Keep a local reference: while this call is suspended on `await`, another
    // failing call (or destroyTransformWorkers) may reset the shared `pool` to
    // null, and the catch block below must not dereference that null.
    const activePool = pool;
    try {
        return await activePool.run(job);
    }
    catch (error) {
        poolDisabled = true;
        // destroy() is idempotent, so concurrent failures can all call it safely.
        activePool.destroy();
        if (pool === activePool) {
            pool = null;
        }
        logWarn('Transform worker failed; falling back to main thread', {
            error: getErrorMessage(error),
        });
        return null;
    }
}
47
/**
 * Tears down the shared transform worker pool, if one exists, and clears the
 * module-level reference to it.
 *
 * @returns {void}
 */
export function destroyTransformWorkers() {
    if (pool !== null && pool !== undefined) {
        pool.destroy();
    }
    pool = null;
}
51
/**
 * Fixed-size pool of worker threads that execute transform jobs.
 *
 * Jobs are queued FIFO and handed to the first idle worker. Each job is tagged
 * with a numeric id; the worker is expected to answer with a message of the
 * form `{ id, ok: true, result }` or `{ id, ok: false, error }`. A worker that
 * errors or exits is replaced, and the job it was running (if any) is rejected.
 */
class TransformWorkerPool {
    workerUrl;
    size;
    /** Per-worker state records: `{ worker, busy, currentJobId }`. */
    workers = [];
    /** Jobs accepted but not yet posted to a worker. */
    queue = [];
    /** Job id -> `{ resolve, reject }` of the promise returned by run(). */
    pending = new Map();
    /** Ids start at 1, so an id is never falsy. */
    nextId = 1;
    destroyed = false;
    /**
     * @param {URL|string} workerUrl - Script each worker thread runs.
     * @param {number} size - Number of workers to start.
     * @throws Rethrows whatever `new Worker` throws; workers that were already
     *   started are terminated first.
     */
    constructor(workerUrl, size) {
        this.workerUrl = workerUrl;
        this.size = size;
        try {
            for (let i = 0; i < size; i += 1) {
                this.workers.push(this.createWorker());
            }
        }
        catch (error) {
            // Do not leak the workers that did start when a later one fails to spawn.
            this.destroy();
            throw error;
        }
    }
    /**
     * Queues a job and resolves with the worker's result.
     *
     * @param {object} job - Payload; it is copied and given an `id` property.
     * @returns {Promise<unknown>} Resolves with `message.result`; rejects when
     *   the worker reports failure, errors, exits, or the pool is destroyed.
     */
    run(job) {
        if (this.destroyed) {
            return Promise.reject(new Error('Transform worker pool is closed'));
        }
        const id = this.nextId++;
        const queuedJob = { ...job, id };
        return new Promise((resolve, reject) => {
            this.pending.set(id, { resolve, reject });
            this.queue.push(queuedJob);
            this.schedule();
        });
    }
    /**
     * Terminates every worker and rejects all unfinished jobs. Safe to call
     * more than once; later calls do nothing.
     */
    destroy() {
        if (this.destroyed)
            return;
        this.destroyed = true;
        for (const workerState of this.workers) {
            void workerState.worker.terminate();
        }
        for (const [id, pending] of this.pending.entries()) {
            pending.reject(new Error('Transform worker pool shut down'));
            this.pending.delete(id);
        }
        this.queue.length = 0;
    }
    /**
     * Starts one worker thread and wires its events to the pool.
     *
     * @returns {{worker: Worker, busy: boolean, currentJobId: (number|undefined)}}
     */
    createWorker() {
        const worker = new Worker(this.workerUrl);
        // Idle workers must not keep the process alive.
        worker.unref();
        const state = { worker, busy: false, currentJobId: undefined };
        worker.on('message', (message) => {
            this.handleMessage(state, message);
        });
        worker.on('error', (error) => {
            this.handleWorkerError(state, error);
        });
        worker.on('exit', (code) => {
            this.handleWorkerExit(state, code);
        });
        return state;
    }
    /** Settles the job named by `message.id`, frees the worker and schedules more work. */
    handleMessage(state, message) {
        const pending = this.pending.get(message.id);
        if (pending) {
            this.pending.delete(message.id);
            if (message.ok) {
                pending.resolve(message.result);
            }
            else {
                pending.reject(new Error(message.error));
            }
        }
        state.busy = false;
        state.currentJobId = undefined;
        this.schedule();
    }
    /** Rejects the worker's in-flight job with `error` and replaces the worker. */
    handleWorkerError(state, error) {
        this.failCurrentJob(state, error);
        this.replaceWorker(state);
    }
    /**
     * Handles a worker thread exiting.
     *
     * A worker that exits while a job is assigned can never answer it, whatever
     * the exit code, so the job is always rejected; otherwise the caller's
     * promise would stay pending forever after a clean (code 0) exit.
     */
    handleWorkerExit(state, code) {
        this.failCurrentJob(state, new Error(`Transform worker exited with code ${code}`));
        this.replaceWorker(state);
    }
    /** Rejects the job currently assigned to `state`, if any, and marks the worker idle. */
    failCurrentJob(state, error) {
        if (state.currentJobId === undefined)
            return;
        const pending = this.pending.get(state.currentJobId);
        if (pending) {
            pending.reject(error);
            this.pending.delete(state.currentJobId);
        }
        state.currentJobId = undefined;
        state.busy = false;
    }
    /**
     * Swaps a dead worker for a fresh one in the same slot. Does nothing when
     * the pool is destroyed or the worker was already replaced (an 'error'
     * event is followed by 'exit' for the same worker).
     */
    replaceWorker(state) {
        if (this.destroyed)
            return;
        const index = this.workers.indexOf(state);
        if (index === -1)
            return;
        this.workers[index] = this.createWorker();
        this.schedule();
    }
    /** Posts queued jobs to idle workers until the queue or the idle workers run out. */
    schedule() {
        if (this.destroyed)
            return;
        for (const workerState of this.workers) {
            if (this.queue.length === 0)
                return;
            if (workerState.busy)
                continue;
            const job = this.queue.shift();
            if (!job)
                return;
            workerState.busy = true;
            workerState.currentJobId = job.id;
            workerState.worker.postMessage(job);
        }
    }
}
@@ -1,4 +1,12 @@
1
1
  import type { FetchMarkdownInput, ToolResponseBase } from '../../config/types/tools.js';
2
+ import { transformHtmlToMarkdownAsync } from '../utils/content-transform-async.js';
3
+ import { performSharedFetch } from './fetch-single.shared.js';
2
4
  export declare const FETCH_MARKDOWN_TOOL_NAME = "fetch-markdown";
3
5
  export declare const FETCH_MARKDOWN_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format with optional frontmatter and content length limits";
4
- export declare function fetchMarkdownToolHandler(input: FetchMarkdownInput): Promise<ToolResponseBase>;
6
+ interface FetchMarkdownDeps {
7
+ readonly performSharedFetch?: typeof performSharedFetch;
8
+ readonly transformHtmlToMarkdown?: typeof transformHtmlToMarkdownAsync;
9
+ }
10
+ export declare function createFetchMarkdownToolHandler(deps?: FetchMarkdownDeps): (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
11
+ export declare const fetchMarkdownToolHandler: (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
12
+ export {};
@@ -1,7 +1,7 @@
1
1
  import { config } from '../../config/index.js';
2
2
  import { logDebug, logError } from '../../services/logger.js';
3
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
4
- import { transformHtmlToMarkdown } from '../utils/content-transform.js';
4
+ import { transformHtmlToMarkdownAsync } from '../utils/content-transform-async.js';
5
5
  import { applyInlineResultToStructuredContent, buildToolContentBlocks, getFileDownloadInfo, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_MARKDOWN_TOOL_NAME = 'fetch-markdown';
7
7
  export const FETCH_MARKDOWN_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format with optional frontmatter and content length limits';
@@ -39,7 +39,15 @@ function resolveMarkdownOptions(input) {
39
39
  return {
40
40
  extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
41
41
  includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
42
- maxContentLength: input.maxContentLength,
42
+ ...(input.maxContentLength !== undefined && {
43
+ maxContentLength: input.maxContentLength,
44
+ }),
45
+ };
46
+ }
47
+ function buildFetchMarkdownErrorDetails() {
48
+ return {
49
+ fetchedAt: new Date().toISOString(),
50
+ cached: false,
43
51
  };
44
52
  }
45
53
  function buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload) {
@@ -62,25 +70,30 @@ function buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload) {
62
70
  function logFetchMarkdownStart(url, options) {
63
71
  logDebug('Fetching markdown', { url, ...options });
64
72
  }
65
- function buildMarkdownTransform(options) {
66
- return (html, url) => {
67
- const markdownResult = transformHtmlToMarkdown(html, url, options);
73
+ function buildMarkdownTransform(options, transform) {
74
+ return async (html, url) => {
75
+ const markdownResult = await transform(html, url, options);
68
76
  return { ...markdownResult, content: markdownResult.markdown };
69
77
  };
70
78
  }
71
- async function fetchMarkdownPipeline(url, input, options, transformOptions) {
72
- return performSharedFetch({
79
+ async function fetchMarkdownPipeline(url, input, options, transformOptions, performSharedFetchImpl, transformImpl) {
80
+ const sharedOptions = {
73
81
  url,
74
82
  format: 'markdown',
75
83
  extractMainContent: options.extractMainContent,
76
84
  includeMetadata: options.includeMetadata,
77
- maxContentLength: options.maxContentLength,
78
- customHeaders: input.customHeaders,
79
- retries: input.retries,
80
- timeout: input.timeout,
81
- transform: buildMarkdownTransform(transformOptions),
85
+ ...(options.maxContentLength !== undefined && {
86
+ maxContentLength: options.maxContentLength,
87
+ }),
88
+ ...(input.customHeaders !== undefined && {
89
+ customHeaders: input.customHeaders,
90
+ }),
91
+ ...(input.retries !== undefined && { retries: input.retries }),
92
+ ...(input.timeout !== undefined && { timeout: input.timeout }),
93
+ transform: buildMarkdownTransform(transformOptions, transformImpl),
82
94
  deserialize: deserializeMarkdownPipelineResult,
83
- });
95
+ };
96
+ return performSharedFetchImpl(sharedOptions);
84
97
  }
85
98
  function buildMarkdownResponse(pipeline, inlineResult, fileDownload) {
86
99
  const structuredContent = buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload);
@@ -89,33 +102,48 @@ function buildMarkdownResponse(pipeline, inlineResult, fileDownload) {
89
102
  structuredContent,
90
103
  };
91
104
  }
92
- export async function fetchMarkdownToolHandler(input) {
93
- try {
94
- return await executeFetchMarkdown(input);
95
- }
96
- catch (error) {
97
- logError('fetch-markdown tool error', error instanceof Error ? error : undefined);
98
- return handleToolError(error, input.url, 'Failed to fetch markdown');
99
- }
105
+ export function createFetchMarkdownToolHandler(deps = {}) {
106
+ const performSharedFetchImpl = deps.performSharedFetch ?? performSharedFetch;
107
+ const transformImpl = deps.transformHtmlToMarkdown ?? transformHtmlToMarkdownAsync;
108
+ return async (input) => {
109
+ try {
110
+ return await executeFetchMarkdown(input, performSharedFetchImpl, transformImpl);
111
+ }
112
+ catch (error) {
113
+ logError('fetch-markdown tool error', error instanceof Error ? error : undefined);
114
+ const errorDetails = buildFetchMarkdownErrorDetails();
115
+ return handleToolError(error, input.url, 'Failed to fetch markdown', errorDetails);
116
+ }
117
+ };
100
118
  }
101
- async function executeFetchMarkdown(input) {
119
+ export const fetchMarkdownToolHandler = createFetchMarkdownToolHandler();
120
+ async function executeFetchMarkdown(input, performSharedFetchImpl, transformImpl) {
102
121
  const { url } = input;
103
122
  if (!url) {
104
- return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
123
+ return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchMarkdownErrorDetails());
105
124
  }
106
125
  const options = resolveMarkdownOptions(input);
107
126
  const transformOptions = { ...options };
108
127
  logFetchMarkdownStart(url, transformOptions);
109
- const { pipeline, inlineResult } = await fetchMarkdownPipeline(url, input, options, transformOptions);
110
- const inlineError = getInlineErrorResponse(inlineResult, url);
128
+ const { pipeline, inlineResult } = await fetchMarkdownPipeline(url, input, options, transformOptions, performSharedFetchImpl, transformImpl);
129
+ const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchMarkdownErrorDetails());
111
130
  if (inlineError)
112
131
  return inlineError;
113
- const fileDownload = inlineResult.resourceUri
114
- ? getFileDownloadInfo({
132
+ let fileDownload = null;
133
+ if (inlineResult.resourceUri) {
134
+ const downloadContext = {
115
135
  cacheKey: pipeline.cacheKey ?? null,
116
136
  url: pipeline.url,
117
- title: pipeline.data.title,
118
- })
119
- : null;
137
+ };
138
+ if (pipeline.data.title !== undefined) {
139
+ fileDownload = getFileDownloadInfo({
140
+ ...downloadContext,
141
+ title: pipeline.data.title,
142
+ });
143
+ }
144
+ else {
145
+ fileDownload = getFileDownloadInfo(downloadContext);
146
+ }
147
+ }
120
148
  return buildMarkdownResponse(pipeline, inlineResult, fileDownload);
121
149
  }
@@ -1,5 +1,6 @@
1
1
  import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
2
2
  import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
3
+ import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
3
4
  import { applyInlineContentLimit } from '../utils/inline-content.js';
4
5
  type SharedFetchFormat = 'jsonl' | 'markdown';
5
6
  interface SharedFetchOptions<T extends {
@@ -10,17 +11,21 @@ interface SharedFetchOptions<T extends {
10
11
  readonly extractMainContent: boolean;
11
12
  readonly includeMetadata: boolean;
12
13
  readonly maxContentLength?: number;
14
+ readonly includeContentBlocks?: boolean;
13
15
  readonly cacheVariant?: string;
14
16
  readonly customHeaders?: Record<string, string>;
15
17
  readonly retries?: number;
16
18
  readonly timeout?: number;
17
- readonly transform: (html: string, normalizedUrl: string) => T;
19
+ readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
18
20
  readonly serialize?: (result: T) => string;
19
21
  readonly deserialize?: (cached: string) => T | undefined;
20
22
  }
23
+ interface SharedFetchDeps {
24
+ readonly executeFetchPipeline?: typeof executeFetchPipeline;
25
+ }
21
26
  export declare function performSharedFetch<T extends {
22
27
  content: string;
23
- }>(options: SharedFetchOptions<T>): Promise<{
28
+ }>(options: SharedFetchOptions<T>, deps?: SharedFetchDeps): Promise<{
24
29
  pipeline: PipelineResult<T>;
25
30
  inlineResult: ReturnType<typeof applyInlineContentLimit>;
26
31
  }>;
@@ -31,7 +36,7 @@ interface DownloadContext {
31
36
  title?: string;
32
37
  }
33
38
  export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
34
- export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string): ToolResponseBase | null;
39
+ export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
35
40
  export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
36
41
  export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
37
42
  export {};
@@ -5,7 +5,8 @@ import { createToolErrorResponse } from '../../utils/tool-error-handler.js';
5
5
  import { appendHeaderVary } from '../utils/cache-vary.js';
6
6
  import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
7
7
  import { applyInlineContentLimit } from '../utils/inline-content.js';
8
- export async function performSharedFetch(options) {
8
+ export async function performSharedFetch(options, deps = {}) {
9
+ const executePipeline = deps.executeFetchPipeline ?? executeFetchPipeline;
9
10
  const cacheNamespace = options.format === 'markdown' ? 'markdown' : 'url';
10
11
  const cacheVary = appendHeaderVary({
11
12
  format: options.format,
@@ -13,33 +14,54 @@ export async function performSharedFetch(options) {
13
14
  includeMetadata: options.includeMetadata,
14
15
  maxContentLength: options.maxContentLength,
15
16
  ...(options.cacheVariant ? { variant: options.cacheVariant } : {}),
16
- ...(options.format === 'markdown' ? {} : { contentBlocks: true }),
17
+ ...(options.format === 'markdown'
18
+ ? { includeContentBlocks: options.includeContentBlocks }
19
+ : { contentBlocks: true }),
17
20
  }, options.customHeaders);
18
- const pipeline = await executeFetchPipeline({
21
+ const pipelineOptions = {
19
22
  url: options.url,
20
23
  cacheNamespace,
21
- customHeaders: options.customHeaders,
22
- retries: options.retries,
23
- timeout: options.timeout,
24
- cacheVary,
25
24
  transform: options.transform,
26
- serialize: options.serialize,
27
- deserialize: options.deserialize,
28
- });
25
+ };
26
+ if (options.customHeaders !== undefined) {
27
+ pipelineOptions.customHeaders = options.customHeaders;
28
+ }
29
+ if (options.retries !== undefined) {
30
+ pipelineOptions.retries = options.retries;
31
+ }
32
+ if (options.timeout !== undefined) {
33
+ pipelineOptions.timeout = options.timeout;
34
+ }
35
+ if (cacheVary !== undefined) {
36
+ pipelineOptions.cacheVary = cacheVary;
37
+ }
38
+ if (options.serialize !== undefined) {
39
+ pipelineOptions.serialize = options.serialize;
40
+ }
41
+ if (options.deserialize !== undefined) {
42
+ pipelineOptions.deserialize = options.deserialize;
43
+ }
44
+ const pipeline = await executePipeline(pipelineOptions);
29
45
  const inlineResult = applyInlineContentLimit(pipeline.data.content, pipeline.cacheKey ?? null, options.format);
30
46
  return { pipeline, inlineResult };
31
47
  }
32
48
  export function getFileDownloadInfo(context) {
33
- return buildFileDownloadInfo({
49
+ const infoOptions = {
34
50
  cacheKey: context.cacheKey,
35
51
  url: context.url,
36
- title: context.title,
37
- });
52
+ };
53
+ if (context.title !== undefined) {
54
+ return buildFileDownloadInfo({
55
+ ...infoOptions,
56
+ title: context.title,
57
+ });
58
+ }
59
+ return buildFileDownloadInfo(infoOptions);
38
60
  }
39
- export function getInlineErrorResponse(inlineResult, url) {
61
+ export function getInlineErrorResponse(inlineResult, url, details) {
40
62
  if (!inlineResult.error)
41
63
  return null;
42
- return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR');
64
+ return createToolErrorResponse(inlineResult.error, url, 'INTERNAL_ERROR', details);
43
65
  }
44
66
  export function applyInlineResultToStructuredContent(structuredContent, inlineResult, contentKey) {
45
67
  if (inlineResult.truncated) {
@@ -60,13 +82,16 @@ function buildResourceLink(inlineResult, name) {
60
82
  if (!inlineResult.resourceUri) {
61
83
  return null;
62
84
  }
63
- return {
85
+ const block = {
64
86
  type: 'resource_link',
65
87
  uri: inlineResult.resourceUri,
66
88
  name,
67
- mimeType: inlineResult.resourceMimeType,
68
89
  description: `Content exceeds inline limit (${config.constants.maxInlineContentChars} chars)`,
69
90
  };
91
+ if (inlineResult.resourceMimeType !== undefined) {
92
+ block.mimeType = inlineResult.resourceMimeType;
93
+ }
94
+ return block;
70
95
  }
71
96
  function buildEmbeddedResource(content, mimeType, url, title) {
72
97
  if (!content) {
@@ -1,7 +1,7 @@
1
1
  import { config } from '../../config/index.js';
2
2
  import { logDebug, logError } from '../../services/logger.js';
3
3
  import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
4
- import { transformHtmlToJsonl, transformHtmlToMarkdownWithBlocks, } from '../utils/content-transform.js';
4
+ import { transformHtmlToJsonlAsync, transformHtmlToMarkdownWithBlocksAsync, } from '../utils/content-transform-async.js';
5
5
  import { applyInlineResultToStructuredContent, buildToolContentBlocks, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
6
6
  export const FETCH_URL_TOOL_NAME = 'fetch-url';
7
7
  export const FETCH_URL_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to AI-readable JSONL format with semantic content blocks. Supports custom headers, retries, and content length limits.';
@@ -37,17 +37,36 @@ function deserializeJsonlTransformResult(cached) {
37
37
  }
38
38
  }
39
39
  function resolveFetchUrlOptions(input) {
40
+ const format = input.format ?? 'jsonl';
40
41
  return {
41
42
  extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
42
43
  includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
43
- maxContentLength: input.maxContentLength,
44
- format: input.format ?? 'jsonl',
44
+ format,
45
+ includeContentBlocks: input.includeContentBlocks ?? (format === 'markdown' ? false : true),
46
+ ...(input.maxContentLength !== undefined && {
47
+ maxContentLength: input.maxContentLength,
48
+ }),
49
+ };
50
+ }
51
+ function buildFetchUrlErrorDetails(format) {
52
+ return {
53
+ contentBlocks: 0,
54
+ fetchedAt: new Date().toISOString(),
55
+ format,
56
+ cached: false,
45
57
  };
46
58
  }
47
59
  function buildFetchUrlTransform(options) {
48
- return (html, url) => options.format === 'markdown'
49
- ? transformHtmlToMarkdownWithBlocks(html, url, options)
50
- : transformHtmlToJsonl(html, url, options);
60
+ return async (html, url) => options.format === 'markdown'
61
+ ? transformHtmlToMarkdownWithBlocksAsync(html, url, {
62
+ extractMainContent: options.extractMainContent,
63
+ includeMetadata: options.includeMetadata,
64
+ ...(options.maxContentLength !== undefined && {
65
+ maxContentLength: options.maxContentLength,
66
+ }),
67
+ includeContentBlocks: options.includeContentBlocks,
68
+ })
69
+ : transformHtmlToJsonlAsync(html, url, options);
51
70
  }
52
71
  function buildFetchUrlStructuredContent(format, pipeline, inlineResult) {
53
72
  const structuredContent = {
@@ -74,22 +93,31 @@ function logFetchUrlStart(url, options) {
74
93
  extractMainContent: options.extractMainContent,
75
94
  includeMetadata: options.includeMetadata,
76
95
  format: options.format,
96
+ includeContentBlocks: options.includeContentBlocks,
77
97
  });
78
98
  }
79
99
  async function fetchUrlPipeline(url, input, options) {
80
- return performSharedFetch({
100
+ const sharedOptions = {
81
101
  url,
82
102
  format: options.format,
83
103
  extractMainContent: options.extractMainContent,
84
104
  includeMetadata: options.includeMetadata,
85
- maxContentLength: options.maxContentLength,
86
- customHeaders: input.customHeaders,
87
- retries: input.retries,
88
- timeout: input.timeout,
89
- cacheVariant: options.format === 'markdown' ? 'markdown-with-blocks' : undefined,
105
+ includeContentBlocks: options.includeContentBlocks,
106
+ ...(options.maxContentLength !== undefined && {
107
+ maxContentLength: options.maxContentLength,
108
+ }),
109
+ ...(input.customHeaders !== undefined && {
110
+ customHeaders: input.customHeaders,
111
+ }),
112
+ ...(input.retries !== undefined && { retries: input.retries }),
113
+ ...(input.timeout !== undefined && { timeout: input.timeout }),
114
+ ...(options.format === 'markdown' && {
115
+ cacheVariant: 'markdown-with-blocks',
116
+ }),
90
117
  transform: buildFetchUrlTransform(options),
91
118
  deserialize: deserializeJsonlTransformResult,
92
- });
119
+ };
120
+ return performSharedFetch(sharedOptions);
93
121
  }
94
122
  function buildFetchUrlResponse(pipeline, inlineResult, format) {
95
123
  const structuredContent = buildFetchUrlStructuredContent(format, pipeline, inlineResult);
@@ -104,18 +132,20 @@ export async function fetchUrlToolHandler(input) {
104
132
  }
105
133
  catch (error) {
106
134
  logError('fetch-url tool error', error instanceof Error ? error : undefined);
107
- return handleToolError(error, input.url, 'Failed to fetch URL');
135
+ const errorDetails = buildFetchUrlErrorDetails(input.format ?? 'jsonl');
136
+ return handleToolError(error, input.url, 'Failed to fetch URL', errorDetails);
108
137
  }
109
138
  }
110
139
  async function executeFetchUrl(input) {
111
140
  const { url } = input;
141
+ const format = input.format ?? 'jsonl';
112
142
  if (!url) {
113
- return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
143
+ return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchUrlErrorDetails(format));
114
144
  }
115
145
  const options = resolveFetchUrlOptions(input);
116
146
  logFetchUrlStart(url, options);
117
147
  const { pipeline, inlineResult } = await fetchUrlPipeline(url, input, options);
118
- const inlineError = getInlineErrorResponse(inlineResult, url);
148
+ const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchUrlErrorDetails(options.format));
119
149
  if (inlineError)
120
150
  return inlineError;
121
151
  return buildFetchUrlResponse(pipeline, inlineResult, options.format);
@@ -9,6 +9,12 @@ const TOOL_DEFINITIONS = [
9
9
  inputSchema: fetchUrlInputSchema,
10
10
  outputSchema: fetchUrlOutputSchema,
11
11
  handler: fetchUrlToolHandler,
12
+ annotations: {
13
+ readOnlyHint: true,
14
+ destructiveHint: false,
15
+ idempotentHint: true,
16
+ openWorldHint: true,
17
+ },
12
18
  },
13
19
  {
14
20
  name: FETCH_MARKDOWN_TOOL_NAME,
@@ -17,6 +23,12 @@ const TOOL_DEFINITIONS = [
17
23
  inputSchema: fetchMarkdownInputSchema,
18
24
  outputSchema: fetchMarkdownOutputSchema,
19
25
  handler: fetchMarkdownToolHandler,
26
+ annotations: {
27
+ readOnlyHint: true,
28
+ destructiveHint: false,
29
+ idempotentHint: true,
30
+ openWorldHint: true,
31
+ },
20
32
  },
21
33
  ];
22
34
  export function registerTools(server) {
@@ -26,6 +38,7 @@ export function registerTools(server) {
26
38
  description: tool.description,
27
39
  inputSchema: tool.inputSchema,
28
40
  outputSchema: tool.outputSchema,
41
+ annotations: tool.annotations,
29
42
  }, tool.handler);
30
43
  }
31
44
  }