@j0hanz/superfetch 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -45
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +9 -4
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +2 -1
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/cache.js +20 -7
- package/dist/services/context.d.ts +2 -4
- package/dist/services/context.js +1 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +18 -8
- package/dist/services/fetcher/response.js +32 -24
- package/dist/services/fetcher.d.ts +0 -1
- package/dist/services/fetcher.js +5 -7
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.js +26 -25
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +33 -30
- package/dist/tools/schemas.js +4 -0
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +7 -2
- package/dist/tools/utils/fetch-pipeline.js +18 -10
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.js +38 -0
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +4 -6
package/dist/services/fetcher/interceptors.d.ts
CHANGED

@@ -1,3 +1,25 @@
+export type FetchChannelEvent = {
+    v: 1;
+    type: 'start';
+    requestId: string;
+    method: string;
+    url: string;
+} | {
+    v: 1;
+    type: 'end';
+    requestId: string;
+    status: number;
+    duration: number;
+} | {
+    v: 1;
+    type: 'error';
+    requestId: string;
+    url: string;
+    error: string;
+    code?: string;
+    status?: number;
+    duration: number;
+};
 interface FetchTelemetryContext {
     requestId: string;
     startTime: number;
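The new FetchChannelEvent union is versioned (v: 1) and discriminated on type, so consumers of these events can narrow each variant exhaustively. A minimal consumer sketch in TypeScript (the subpath import is an assumption based on the dist layout above; the diff only shows the type itself):

```ts
import type { FetchChannelEvent } from '@j0hanz/superfetch/dist/services/fetcher/interceptors.js';

// Narrow on the `type` discriminant; each arm sees only its variant's fields.
function onFetchEvent(event: FetchChannelEvent): void {
    switch (event.type) {
        case 'start':
            console.log(`start ${event.method} ${event.url} (${event.requestId})`);
            break;
        case 'end':
            console.log(`end ${event.status} after ${event.duration}ms`);
            break;
        case 'error':
            // `code` and `status` are optional on the error variant.
            console.error(`error ${event.url}: ${event.error}`, event.code ?? '-', event.status ?? '-');
            break;
    }
}
```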
package/dist/services/fetcher/interceptors.js
CHANGED

@@ -70,13 +70,18 @@ function publishFetchEnd(context, status, duration) {
     });
 }
 function buildResponseMeta(response, contentSize, duration) {
-    const contentType = response.headers.get('content-type') ?? undefined;
     const contentLength = response.headers.get('content-length') ?? contentSize?.toString();
-    return {
-        contentType,
+    const meta = {
         duration: `${Math.round(duration)}ms`,
-        size: contentLength,
     };
+    const contentType = response.headers.get('content-type');
+    if (contentType !== null) {
+        meta.contentType = contentType;
+    }
+    if (contentLength !== undefined) {
+        meta.size = contentLength;
+    }
+    return meta;
 }
 function logSlowRequestIfNeeded(context, duration) {
     if (duration <= 5000)

@@ -91,16 +96,21 @@ export function recordFetchError(context, error, status) {
     const duration = performance.now() - context.startTime;
     const err = error instanceof Error ? error : new Error(String(error));
     const code = isSystemError(err) ? err.code : undefined;
-    publishFetchEvent({
+    const event = {
         v: 1,
         type: 'error',
         requestId: context.requestId,
         url: context.url,
         error: err.message,
-        code,
-        status,
         duration,
-    });
+    };
+    if (code !== undefined) {
+        event.code = code;
+    }
+    if (status !== undefined) {
+        event.status = status;
+    }
+    publishFetchEvent(event);
     const log = status === 429 ? logWarn : logError;
     log('HTTP Request Error', {
         requestId: context.requestId,
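Both hunks above make the same change: optional fields (contentType, size, code, status) are now assigned only when a value exists, instead of being written as explicit undefined. That keeps undefined-valued keys out of serialized telemetry, and it is the shape required if the codebase compiles with TypeScript's exactOptionalPropertyTypes (an assumption; the diff shows only compiled output). A generic sketch of the difference:

```ts
interface ResponseMeta {
    duration: string;
    contentType?: string; // may be absent, but never explicitly `undefined`
}

function buildMeta(contentType: string | null): ResponseMeta {
    const meta: ResponseMeta = { duration: '12ms' };
    // meta.contentType = contentType ?? undefined; // would write an `undefined` key
    if (contentType !== null) {
        meta.contentType = contentType; // key exists only when there is a value
    }
    return meta;
}
```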
package/dist/services/fetcher/response.js
CHANGED

@@ -1,3 +1,5 @@
+import { Readable, Writable } from 'node:stream';
+import { pipeline } from 'node:stream/promises';
 import { FetchError } from '../../errors/app-error.js';
 function assertContentLengthWithinLimit(response, url, maxBytes) {
     const contentLengthHeader = response.headers.get('content-length');

@@ -9,37 +11,43 @@ function assertContentLengthWithinLimit(response, url, maxBytes) {
     }
     throw new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url);
 }
-function throwIfReadAborted(url, signal) {
-    if (!signal?.aborted)
-        return;
-    throw new FetchError('Request was aborted during response read', url, 499, {
-        reason: 'aborted',
-    });
-}
 async function readStreamWithLimit(stream, url, maxBytes, signal) {
-    const reader = stream.getReader();
     const decoder = new TextDecoder();
     let total = 0;
-
-
-
-
-
-
-
-
+    let text = '';
+    const toBuffer = (chunk) => {
+        if (typeof chunk === 'string') {
+            return Buffer.from(chunk);
+        }
+        return Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
+    };
+    const sink = new Writable({
+        write(chunk, _encoding, callback) {
+            const buffer = toBuffer(chunk);
+            total += buffer.length;
             if (total > maxBytes) {
-
-
+                callback(new FetchError(`Response exceeds maximum size of ${maxBytes} bytes`, url));
+                return;
             }
-
-
-
-
+            text += decoder.decode(buffer, { stream: true });
+            callback();
+        },
+        final(callback) {
+            text += decoder.decode();
+            callback();
+        },
+    });
+    try {
+        const readable = Readable.fromWeb(stream, { signal });
+        await pipeline(readable, sink, { signal });
     }
-
-
+    catch (error) {
+        if (signal?.aborted) {
+            throw new FetchError('Request was aborted during response read', url, 499, { reason: 'aborted' });
+        }
+        throw error;
     }
+    return { text, size: total };
 }
 export async function readResponseText(response, url, maxBytes, signal) {
     assertContentLengthWithinLimit(response, url, maxBytes);
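The rewrite drops the manual getReader() loop in favor of bridging the web stream into Node's stream machinery: Readable.fromWeb() adapts the response body, a counting Writable enforces the byte budget (erroring the sink fails the whole pipeline), and the shared signal handles aborts in one place. A standalone sketch of the same pattern, assuming nothing beyond Node's documented stream APIs:

```ts
import { Readable, Writable } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import type { ReadableStream } from 'node:stream/web';

// Decode a web stream into a string while enforcing a byte budget.
async function readWithLimit(stream: ReadableStream<Uint8Array>, maxBytes: number, signal?: AbortSignal) {
    const decoder = new TextDecoder();
    let total = 0;
    let text = '';
    const sink = new Writable({
        write(chunk: Buffer, _encoding, callback) {
            total += chunk.length;
            if (total > maxBytes) {
                callback(new Error(`response exceeds ${maxBytes} bytes`)); // erroring the sink aborts the pipeline
                return;
            }
            text += decoder.decode(chunk, { stream: true });
            callback();
        },
        final(callback) {
            text += decoder.decode(); // flush any buffered multi-byte sequence
            callback();
        },
    });
    await pipeline(Readable.fromWeb(stream, { signal }), sink, { signal });
    return { text, size: total };
}
```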
package/dist/services/fetcher.d.ts
CHANGED

@@ -1,5 +1,4 @@
 import type { FetchOptions } from '../config/types/runtime.js';
 import { destroyAgents } from './fetcher/agents.js';
 export { destroyAgents };
-export declare function fetchUrlWithRetry(url: string, options?: FetchOptions, maxRetries?: number): Promise<string>;
 export declare function fetchNormalizedUrlWithRetry(normalizedUrl: string, options?: FetchOptions, maxRetries?: number): Promise<string>;

package/dist/services/fetcher.js
CHANGED

@@ -1,6 +1,5 @@
 import { config } from '../config/index.js';
 import { normalizeHeaderRecord } from '../utils/header-normalizer.js';
-import { validateAndNormalizeUrl } from '../utils/url-validator.js';
 import { destroyAgents, dispatcher } from './fetcher/agents.js';
 import { createHttpError, createRateLimitError, mapFetchError, } from './fetcher/errors.js';
 import { recordFetchError, recordFetchResponse, startFetchTelemetry, } from './fetcher/interceptors.js';

@@ -66,20 +65,19 @@ async function fetchWithTelemetry(normalizedUrl, requestInit, timeoutMs) {
         throw mapped;
     }
 }
-export async function fetchUrlWithRetry(url, options, maxRetries = 3) {
-    const normalizedUrl = await validateAndNormalizeUrl(url);
-    return fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries);
-}
 export async function fetchNormalizedUrlWithRetry(normalizedUrl, options, maxRetries = 3) {
     const context = buildRequestContext(options);
     return executeWithRetry(normalizedUrl, maxRetries, async () => runFetch(normalizedUrl, context), context.signal);
 }
 function buildRequestContext(options) {
-    return {
+    const context = {
         timeoutMs: options?.timeout ?? config.fetcher.timeout,
         headers: buildHeaders(options?.customHeaders),
-        signal: options?.signal,
     };
+    if (options?.signal) {
+        context.signal = options.signal;
+    }
+    return context;
 }
 async function runFetch(normalizedUrl, context) {
     const signal = buildRequestSignal(context.timeoutMs, context.signal);
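With the fetchUrlWithRetry wrapper removed, URL validation is now the caller's job: callers normalize once and pass the normalized URL into the retrying fetch. A sketch of the replacement call sequence (the import subpaths are assumptions, and validateAndNormalizeUrl is presumed to keep the string-in, Promise-of-string-out shape implied by the deleted wrapper):

```ts
import { validateAndNormalizeUrl } from '@j0hanz/superfetch/dist/utils/url-validator.js';
import { fetchNormalizedUrlWithRetry } from '@j0hanz/superfetch/dist/services/fetcher.js';

async function fetchValidated(url: string): Promise<string> {
    // Throws on disallowed or malformed URLs before any network work happens.
    const normalizedUrl = await validateAndNormalizeUrl(url);
    return fetchNormalizedUrlWithRetry(normalizedUrl, { timeout: 10_000 }, 3);
}
```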
package/dist/services/metadata-collector.d.ts
ADDED

@@ -0,0 +1,10 @@
+import type { ExtractedMetadata } from '../config/types/content.js';
+export type MetaSource = 'og' | 'twitter' | 'standard';
+export type MetaField = keyof ExtractedMetadata;
+export interface MetaCollectorState {
+    title: Partial<Record<MetaSource, string>>;
+    description: Partial<Record<MetaSource, string>>;
+    author: Partial<Record<MetaSource, string>>;
+}
+export declare function createMetaCollectorState(): MetaCollectorState;
+export declare function resolveMetaField(state: MetaCollectorState, field: MetaField): string | undefined;
package/dist/services/metadata-collector.js
ADDED

@@ -0,0 +1,11 @@
+export function createMetaCollectorState() {
+    return {
+        title: {},
+        description: {},
+        author: {},
+    };
+}
+export function resolveMetaField(state, field) {
+    const sources = state[field];
+    return sources.og ?? sources.twitter ?? sources.standard;
+}

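The collector previously lived inline in parser.js (see the parser.js hunk below); extracting it gives the source-precedence rule (Open Graph, then Twitter card, then standard tags) a module of its own. Usage, with the import subpath assumed from the dist layout above:

```ts
import { createMetaCollectorState, resolveMetaField } from '@j0hanz/superfetch/dist/services/metadata-collector.js';

const state = createMetaCollectorState();
state.title.standard = 'Page <title> text';
state.title.og = 'og:title text';

// og wins over twitter, which wins over standard:
resolveMetaField(state, 'title');       // 'og:title text'
resolveMetaField(state, 'description'); // undefined (no source collected)
```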
package/dist/services/parser.js
CHANGED

@@ -6,18 +6,8 @@ import { getErrorMessage } from '../utils/error-utils.js';
 import { truncateHtml } from '../utils/html-truncator.js';
 import { sanitizeText } from '../utils/sanitizer.js';
 import { logWarn } from './logger.js';
+import { createMetaCollectorState, resolveMetaField, } from './metadata-collector.js';
 const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul, ol, pre, code:not(pre code), table, img, blockquote';
-function createMetaCollectorState() {
-    return {
-        title: {},
-        description: {},
-        author: {},
-    };
-}
-function resolveMetaField(state, field) {
-    const sources = state[field];
-    return sources.og ?? sources.twitter ?? sources.standard;
-}
 function extractMetadata($) {
     const state = createMetaCollectorState();
     $('meta').each((_, element) => {

@@ -55,11 +45,17 @@ function extractMetadata($) {
             state.title.standard = titleText;
         }
     }
-
-
-
-
-
+    const metadata = {};
+    const title = resolveMetaField(state, 'title');
+    const description = resolveMetaField(state, 'description');
+    const author = resolveMetaField(state, 'author');
+    if (title !== undefined)
+        metadata.title = title;
+    if (description !== undefined)
+        metadata.description = description;
+    if (author !== undefined)
+        metadata.author = author;
+    return metadata;
 }
 function parseHeading($, element) {
     const rawText = sanitizeText($(element).text());

@@ -109,11 +105,14 @@ function parseCode($, element) {
     const dataLang = $(element).attr('data-language') ?? '';
     const language = resolveLanguageFromAttributes(className, dataLang) ??
         detectLanguageFromCode(text);
-    return {
+    const block = {
         type: 'code',
-        language,
         text,
     };
+    if (language !== undefined) {
+        block.language = language;
+    }
+    return block;
 }
 function parseTable($, element) {
     const headers = [];

@@ -144,21 +143,23 @@ function parseTable($, element) {
     });
     if (rows.length === 0)
         return null;
-    return {
-        type: 'table',
-        headers,
-        rows,
-    };
+    return headers.length > 0
+        ? { type: 'table', headers, rows }
+        : { type: 'table', rows };
 }
 function parseImage($, element) {
     const src = $(element).attr('src');
     if (!src)
         return null;
-    return {
+    const alt = $(element).attr('alt');
+    const image = {
         type: 'image',
         src,
-        alt: $(element).attr('alt') ?? undefined,
     };
+    if (alt !== undefined) {
+        image.alt = alt;
+    }
+    return image;
 }
 function parseBlockquote($, element) {
     const rawText = sanitizeText($(element).text());
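Taken together, the parser.js hunks move content blocks and metadata to the same omit-when-absent shape used in interceptors.js above: code blocks drop an undefined language, image blocks drop an undefined alt, metadata keeps only the fields that resolved, and tables with no header row omit the headers key instead of emitting an empty array.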
package/dist/services/transform-worker-pool.d.ts
ADDED

@@ -0,0 +1,14 @@
+import type { JsonlTransformResult, MarkdownTransformResult, TransformOptions } from '../config/types/content.js';
+type TransformMode = 'jsonl' | 'markdown' | 'markdown-blocks';
+export interface TransformJob {
+    mode: TransformMode;
+    html: string;
+    url: string;
+    options: TransformOptions & {
+        includeContentBlocks?: boolean;
+    };
+}
+type TransformResult = JsonlTransformResult | MarkdownTransformResult;
+export declare function runTransformInWorker(job: TransformJob): Promise<TransformResult | null>;
+export declare function destroyTransformWorkers(): void;
+export {};

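A job literal matching the declaration; the concrete TransformOptions field names are not visible in this diff, so the options shown are inferred from resolveMarkdownOptions() in the markdown tool below and should be treated as illustrative:

```ts
import type { TransformJob } from '@j0hanz/superfetch/dist/services/transform-worker-pool.js';

const job: TransformJob = {
    mode: 'markdown',
    html: '<h1>Hello</h1><p>World</p>',
    url: 'https://example.com/',
    // Field names inferred from fetch-markdown.tool.js; treat as illustrative.
    options: { extractMainContent: true, includeMetadata: true, includeContentBlocks: false },
};
```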
package/dist/services/transform-worker-pool.js
ADDED

@@ -0,0 +1,167 @@
+import os from 'node:os';
+import { isMainThread, Worker } from 'node:worker_threads';
+import { config } from '../config/index.js';
+import { getErrorMessage } from '../utils/error-utils.js';
+import { logWarn } from './logger.js';
+const MAX_POOL_SIZE = 4;
+function resolvePoolSize() {
+    const available = os.availableParallelism();
+    return Math.max(1, Math.min(available - 1, MAX_POOL_SIZE));
+}
+let pool = null;
+let poolDisabled = false;
+function shouldUseWorkers() {
+    return isMainThread && config.runtime.httpMode && !poolDisabled;
+}
+function getWorkerUrl() {
+    return new URL('../workers/transform-worker.js', import.meta.url);
+}
+export async function runTransformInWorker(job) {
+    if (!shouldUseWorkers())
+        return null;
+    if (!pool) {
+        try {
+            pool = new TransformWorkerPool(getWorkerUrl(), resolvePoolSize());
+        }
+        catch (error) {
+            poolDisabled = true;
+            logWarn('Failed to initialize transform worker pool', {
+                error: getErrorMessage(error),
+            });
+            return null;
+        }
+    }
+    try {
+        return await pool.run(job);
+    }
+    catch (error) {
+        poolDisabled = true;
+        pool.destroy();
+        pool = null;
+        logWarn('Transform worker failed; falling back to main thread', {
+            error: getErrorMessage(error),
+        });
+        return null;
+    }
+}
+export function destroyTransformWorkers() {
+    pool?.destroy();
+    pool = null;
+}
+class TransformWorkerPool {
+    workerUrl;
+    size;
+    workers = [];
+    queue = [];
+    pending = new Map();
+    nextId = 1;
+    destroyed = false;
+    constructor(workerUrl, size) {
+        this.workerUrl = workerUrl;
+        this.size = size;
+        for (let i = 0; i < size; i += 1) {
+            this.workers.push(this.createWorker());
+        }
+    }
+    run(job) {
+        if (this.destroyed) {
+            return Promise.reject(new Error('Transform worker pool is closed'));
+        }
+        const id = this.nextId++;
+        const queuedJob = { ...job, id };
+        return new Promise((resolve, reject) => {
+            this.pending.set(id, { resolve, reject });
+            this.queue.push(queuedJob);
+            this.schedule();
+        });
+    }
+    destroy() {
+        if (this.destroyed)
+            return;
+        this.destroyed = true;
+        for (const workerState of this.workers) {
+            void workerState.worker.terminate();
+        }
+        for (const [id, pending] of this.pending.entries()) {
+            pending.reject(new Error('Transform worker pool shut down'));
+            this.pending.delete(id);
+        }
+        this.queue.length = 0;
+    }
+    createWorker() {
+        const worker = new Worker(this.workerUrl);
+        worker.unref();
+        const state = { worker, busy: false, currentJobId: undefined };
+        worker.on('message', (message) => {
+            this.handleMessage(state, message);
+        });
+        worker.on('error', (error) => {
+            this.handleWorkerError(state, error);
+        });
+        worker.on('exit', (code) => {
+            this.handleWorkerExit(state, code);
+        });
+        return state;
+    }
+    handleMessage(state, message) {
+        const pending = this.pending.get(message.id);
+        if (pending) {
+            this.pending.delete(message.id);
+            if (message.ok) {
+                pending.resolve(message.result);
+            }
+            else {
+                pending.reject(new Error(message.error));
+            }
+        }
+        state.busy = false;
+        state.currentJobId = undefined;
+        this.schedule();
+    }
+    handleWorkerError(state, error) {
+        this.failCurrentJob(state, error);
+        this.replaceWorker(state);
+    }
+    handleWorkerExit(state, code) {
+        if (code !== 0) {
+            this.failCurrentJob(state, new Error(`Transform worker exited with code ${code}`));
+        }
+        this.replaceWorker(state);
+    }
+    failCurrentJob(state, error) {
+        if (!state.currentJobId)
+            return;
+        const pending = this.pending.get(state.currentJobId);
+        if (pending) {
+            pending.reject(error);
+            this.pending.delete(state.currentJobId);
+        }
+        state.currentJobId = undefined;
+        state.busy = false;
+    }
+    replaceWorker(state) {
+        if (this.destroyed)
+            return;
+        const index = this.workers.indexOf(state);
+        if (index === -1)
+            return;
+        this.workers[index] = this.createWorker();
+        this.schedule();
+    }
+    schedule() {
+        if (this.destroyed)
+            return;
+        for (const workerState of this.workers) {
+            if (this.queue.length === 0)
+                return;
+            if (workerState.busy)
+                continue;
+            const job = this.queue.shift();
+            if (!job)
+                return;
+            workerState.busy = true;
+            workerState.currentJobId = job.id;
+            workerState.worker.postMessage(job);
+        }
+    }
+}

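The pool is deliberately best-effort: runTransformInWorker resolves to null whenever workers are unavailable (stdio rather than HTTP mode, a non-main thread, or a prior failure that set poolDisabled), and any worker failure disables the pool for the rest of the process. Callers therefore always keep a main-thread path, as the fallback log message above suggests:

```ts
import { destroyTransformWorkers, runTransformInWorker } from '@j0hanz/superfetch/dist/services/transform-worker-pool.js';
import type { TransformJob } from '@j0hanz/superfetch/dist/services/transform-worker-pool.js';

// Hypothetical main-thread fallback; the real one lives in the tool handlers.
declare function transformOnMainThread(job: TransformJob): Promise<unknown>;

async function transform(job: TransformJob): Promise<unknown> {
    // null means "no worker available": run the same transform in-process.
    return (await runTransformInWorker(job)) ?? transformOnMainThread(job);
}

// On shutdown, terminate workers and reject anything still queued.
destroyTransformWorkers();
```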
package/dist/tools/handlers/fetch-markdown.tool.d.ts
CHANGED

@@ -1,4 +1,12 @@
 import type { FetchMarkdownInput, ToolResponseBase } from '../../config/types/tools.js';
+import { transformHtmlToMarkdownAsync } from '../utils/content-transform-async.js';
+import { performSharedFetch } from './fetch-single.shared.js';
 export declare const FETCH_MARKDOWN_TOOL_NAME = "fetch-markdown";
 export declare const FETCH_MARKDOWN_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format with optional frontmatter and content length limits";
-
+interface FetchMarkdownDeps {
+    readonly performSharedFetch?: typeof performSharedFetch;
+    readonly transformHtmlToMarkdown?: typeof transformHtmlToMarkdownAsync;
+}
+export declare function createFetchMarkdownToolHandler(deps?: FetchMarkdownDeps): (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
+export declare const fetchMarkdownToolHandler: (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
+export {};

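The new factory makes the handler's two collaborators injectable while keeping the existing export intact (fetchMarkdownToolHandler is now the default-dependency instantiation, per the .js hunk below). A test sketch; declare stands in for a stub whose full shape lives in fetch-single.shared.ts, and the subpath imports are assumptions:

```ts
import type { FetchMarkdownInput } from '@j0hanz/superfetch/dist/config/types/tools.js';
import type { performSharedFetch } from '@j0hanz/superfetch/dist/tools/handlers/fetch-single.shared.js';
import { createFetchMarkdownToolHandler } from '@j0hanz/superfetch/dist/tools/handlers/fetch-markdown.tool.js';

// A stub that is assignable to the real dependency type.
declare const fakeSharedFetch: typeof performSharedFetch;

const handler = createFetchMarkdownToolHandler({ performSharedFetch: fakeSharedFetch });
const response = await handler({ url: 'https://example.com/' } as FetchMarkdownInput);
```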
package/dist/tools/handlers/fetch-markdown.tool.js
CHANGED

@@ -1,7 +1,7 @@
 import { config } from '../../config/index.js';
 import { logDebug, logError } from '../../services/logger.js';
 import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
-import {
+import { transformHtmlToMarkdownAsync } from '../utils/content-transform-async.js';
 import { applyInlineResultToStructuredContent, buildToolContentBlocks, getFileDownloadInfo, getInlineErrorResponse, performSharedFetch, } from './fetch-single.shared.js';
 export const FETCH_MARKDOWN_TOOL_NAME = 'fetch-markdown';
 export const FETCH_MARKDOWN_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format with optional frontmatter and content length limits';

@@ -39,7 +39,15 @@ function resolveMarkdownOptions(input) {
     return {
         extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
         includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
-
+        ...(input.maxContentLength !== undefined && {
+            maxContentLength: input.maxContentLength,
+        }),
+    };
+}
+function buildFetchMarkdownErrorDetails() {
+    return {
+        fetchedAt: new Date().toISOString(),
+        cached: false,
     };
 }
 function buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload) {

@@ -62,25 +70,30 @@ function buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload) {
 function logFetchMarkdownStart(url, options) {
     logDebug('Fetching markdown', { url, ...options });
 }
-function buildMarkdownTransform(options) {
-    return (html, url) => {
-        const markdownResult =
+function buildMarkdownTransform(options, transform) {
+    return async (html, url) => {
+        const markdownResult = await transform(html, url, options);
         return { ...markdownResult, content: markdownResult.markdown };
     };
 }
-async function fetchMarkdownPipeline(url, input, options, transformOptions) {
-
+async function fetchMarkdownPipeline(url, input, options, transformOptions, performSharedFetchImpl, transformImpl) {
+    const sharedOptions = {
         url,
         format: 'markdown',
         extractMainContent: options.extractMainContent,
         includeMetadata: options.includeMetadata,
-
-
-
-
-
+        ...(options.maxContentLength !== undefined && {
+            maxContentLength: options.maxContentLength,
+        }),
+        ...(input.customHeaders !== undefined && {
+            customHeaders: input.customHeaders,
+        }),
+        ...(input.retries !== undefined && { retries: input.retries }),
+        ...(input.timeout !== undefined && { timeout: input.timeout }),
+        transform: buildMarkdownTransform(transformOptions, transformImpl),
         deserialize: deserializeMarkdownPipelineResult,
-    }
+    };
+    return performSharedFetchImpl(sharedOptions);
 }
 function buildMarkdownResponse(pipeline, inlineResult, fileDownload) {
     const structuredContent = buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload);

@@ -89,33 +102,48 @@ function buildMarkdownResponse(pipeline, inlineResult, fileDownload) {
         structuredContent,
     };
 }
-export
-
-
-
-
-
-
+export function createFetchMarkdownToolHandler(deps = {}) {
+    const performSharedFetchImpl = deps.performSharedFetch ?? performSharedFetch;
+    const transformImpl = deps.transformHtmlToMarkdown ?? transformHtmlToMarkdownAsync;
+    return async (input) => {
+        try {
+            return await executeFetchMarkdown(input, performSharedFetchImpl, transformImpl);
+        }
+        catch (error) {
+            logError('fetch-markdown tool error', error instanceof Error ? error : undefined);
+            const errorDetails = buildFetchMarkdownErrorDetails();
+            return handleToolError(error, input.url, 'Failed to fetch markdown', errorDetails);
+        }
+    };
 }
-
+export const fetchMarkdownToolHandler = createFetchMarkdownToolHandler();
+async function executeFetchMarkdown(input, performSharedFetchImpl, transformImpl) {
     const { url } = input;
     if (!url) {
-        return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR');
+        return createToolErrorResponse('URL is required', '', 'VALIDATION_ERROR', buildFetchMarkdownErrorDetails());
    }
     const options = resolveMarkdownOptions(input);
     const transformOptions = { ...options };
     logFetchMarkdownStart(url, transformOptions);
-    const { pipeline, inlineResult } = await fetchMarkdownPipeline(url, input, options, transformOptions);
-    const inlineError = getInlineErrorResponse(inlineResult, url);
+    const { pipeline, inlineResult } = await fetchMarkdownPipeline(url, input, options, transformOptions, performSharedFetchImpl, transformImpl);
+    const inlineError = getInlineErrorResponse(inlineResult, url, buildFetchMarkdownErrorDetails());
     if (inlineError)
         return inlineError;
-
-
+    let fileDownload = null;
+    if (inlineResult.resourceUri) {
+        const downloadContext = {
            cacheKey: pipeline.cacheKey ?? null,
            url: pipeline.url,
-
-
-
+        };
+        if (pipeline.data.title !== undefined) {
+            fileDownload = getFileDownloadInfo({
+                ...downloadContext,
+                title: pipeline.data.title,
+            });
+        }
+        else {
+            fileDownload = getFileDownloadInfo(downloadContext);
+        }
+    }
     return buildMarkdownResponse(pipeline, inlineResult, fileDownload);
 }