@j0hanz/superfetch 1.2.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -152
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +25 -59
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +98 -26
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +0 -1
- package/dist/http/mcp-routes.js +43 -30
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session.js +60 -73
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +5 -2
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +2 -9
- package/dist/http/server-middleware.js +96 -43
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.js +52 -64
- package/dist/http/session-cleanup.js +1 -1
- package/dist/middleware/error-handler.js +1 -3
- package/dist/resources/cached-content.js +50 -108
- package/dist/resources/index.js +0 -82
- package/dist/server.js +51 -30
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +1 -7
- package/dist/services/cache.js +53 -119
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +10 -82
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +34 -95
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +30 -13
- package/dist/services/fetcher/redirects.js +4 -3
- package/dist/services/fetcher/response.js +66 -31
- package/dist/services/fetcher.d.ts +1 -3
- package/dist/services/fetcher.js +14 -33
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +1 -20
- package/dist/tools/handlers/fetch-single.shared.js +44 -87
- package/dist/tools/handlers/fetch-url.tool.d.ts +1 -1
- package/dist/tools/handlers/fetch-url.tool.js +46 -123
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +2 -108
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +52 -0
- package/dist/tools/utils/content-transform.d.ts +2 -17
- package/dist/tools/utils/content-transform.js +120 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +65 -62
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/transformers/markdown.transformer.js +109 -34
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +21 -10
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +11 -38
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +20 -93
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +13 -16
|
@@ -1,167 +1,215 @@
|
|
|
1
|
-
import
|
|
2
|
-
import { isMainThread, Worker } from 'node:worker_threads';
|
|
1
|
+
import { Worker } from 'node:worker_threads';
|
|
3
2
|
import { config } from '../config/index.js';
|
|
4
3
|
import { getErrorMessage } from '../utils/error-utils.js';
|
|
5
4
|
import { logWarn } from './logger.js';
|
|
6
|
-
|
|
7
|
-
function resolvePoolSize() {
|
|
8
|
-
const available = os.availableParallelism();
|
|
9
|
-
return Math.max(1, Math.min(available - 1, MAX_POOL_SIZE));
|
|
10
|
-
}
|
|
11
|
-
let pool = null;
|
|
12
|
-
let poolDisabled = false;
|
|
13
|
-
function shouldUseWorkers() {
|
|
14
|
-
return isMainThread && config.runtime.httpMode && !poolDisabled;
|
|
15
|
-
}
|
|
16
|
-
function getWorkerUrl() {
|
|
17
|
-
return new URL('../workers/transform-worker.js', import.meta.url);
|
|
18
|
-
}
|
|
19
|
-
export async function runTransformInWorker(job) {
|
|
20
|
-
if (!shouldUseWorkers())
|
|
21
|
-
return null;
|
|
22
|
-
if (!pool) {
|
|
23
|
-
try {
|
|
24
|
-
pool = new TransformWorkerPool(getWorkerUrl(), resolvePoolSize());
|
|
25
|
-
}
|
|
26
|
-
catch (error) {
|
|
27
|
-
poolDisabled = true;
|
|
28
|
-
logWarn('Failed to initialize transform worker pool', {
|
|
29
|
-
error: getErrorMessage(error),
|
|
30
|
-
});
|
|
31
|
-
return null;
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
try {
|
|
35
|
-
return await pool.run(job);
|
|
36
|
-
}
|
|
37
|
-
catch (error) {
|
|
38
|
-
poolDisabled = true;
|
|
39
|
-
pool.destroy();
|
|
40
|
-
pool = null;
|
|
41
|
-
logWarn('Transform worker failed; falling back to main thread', {
|
|
42
|
-
error: getErrorMessage(error),
|
|
43
|
-
});
|
|
44
|
-
return null;
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
export function destroyTransformWorkers() {
|
|
48
|
-
pool?.destroy();
|
|
49
|
-
pool = null;
|
|
50
|
-
}
|
|
5
|
+
import { isWorkerResponse } from './transform-worker-types.js';
|
|
51
6
|
class TransformWorkerPool {
|
|
52
7
|
workerUrl;
|
|
53
|
-
|
|
54
|
-
workers = [];
|
|
8
|
+
slots = [];
|
|
55
9
|
queue = [];
|
|
56
|
-
pending = new Map();
|
|
57
10
|
nextId = 1;
|
|
58
11
|
destroyed = false;
|
|
59
12
|
constructor(workerUrl, size) {
|
|
60
13
|
this.workerUrl = workerUrl;
|
|
61
|
-
this.size = size;
|
|
62
14
|
for (let i = 0; i < size; i += 1) {
|
|
63
|
-
this.
|
|
15
|
+
this.slots.push(this.spawnWorker());
|
|
64
16
|
}
|
|
65
17
|
}
|
|
66
|
-
run(
|
|
18
|
+
run(request, signal) {
|
|
67
19
|
if (this.destroyed) {
|
|
68
|
-
return Promise.reject(new Error('
|
|
20
|
+
return Promise.reject(new Error('Worker pool is shut down'));
|
|
69
21
|
}
|
|
70
|
-
const id = this.nextId++;
|
|
71
|
-
const queuedJob = { ...job, id };
|
|
72
22
|
return new Promise((resolve, reject) => {
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
23
|
+
if (signal?.aborted) {
|
|
24
|
+
reject(new Error('Aborted'));
|
|
25
|
+
return;
|
|
26
|
+
}
|
|
27
|
+
const task = this.createTask(request, resolve, reject, signal);
|
|
28
|
+
this.attachAbortHandler(task, signal);
|
|
29
|
+
this.enqueueTask(task);
|
|
76
30
|
});
|
|
77
31
|
}
|
|
78
|
-
destroy() {
|
|
32
|
+
async destroy() {
|
|
79
33
|
if (this.destroyed)
|
|
80
34
|
return;
|
|
81
35
|
this.destroyed = true;
|
|
82
|
-
|
|
83
|
-
|
|
36
|
+
const pending = this.queue.splice(0);
|
|
37
|
+
for (const task of pending) {
|
|
38
|
+
this.cleanupTask(task);
|
|
39
|
+
task.reject(new Error('Worker pool shutting down'));
|
|
84
40
|
}
|
|
85
|
-
for (const
|
|
86
|
-
|
|
87
|
-
|
|
41
|
+
for (const slot of this.slots) {
|
|
42
|
+
if (slot.current) {
|
|
43
|
+
const task = slot.current;
|
|
44
|
+
slot.current = undefined;
|
|
45
|
+
slot.busy = false;
|
|
46
|
+
this.cleanupTask(task);
|
|
47
|
+
task.reject(new Error('Worker pool shutting down'));
|
|
48
|
+
}
|
|
88
49
|
}
|
|
89
|
-
this.
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
50
|
+
await Promise.allSettled(this.slots.map((slot) => slot.worker.terminate()));
|
|
51
|
+
this.slots = [];
|
|
52
|
+
}
|
|
53
|
+
dispatch() {
|
|
54
|
+
if (this.destroyed)
|
|
55
|
+
return;
|
|
56
|
+
const idle = this.slots.find((slot) => !slot.busy);
|
|
57
|
+
if (!idle)
|
|
58
|
+
return;
|
|
59
|
+
const task = this.queue.shift();
|
|
60
|
+
if (!task)
|
|
61
|
+
return;
|
|
62
|
+
task.status = 'running';
|
|
63
|
+
idle.busy = true;
|
|
64
|
+
idle.current = task;
|
|
65
|
+
try {
|
|
66
|
+
idle.worker.postMessage(task.request);
|
|
67
|
+
}
|
|
68
|
+
catch (error) {
|
|
69
|
+
this.failTask(idle, error);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
createTask(request, resolve, reject, signal) {
|
|
73
|
+
const id = this.nextId;
|
|
74
|
+
this.nextId += 1;
|
|
75
|
+
return {
|
|
76
|
+
id,
|
|
77
|
+
request: { ...request, id },
|
|
78
|
+
resolve,
|
|
79
|
+
reject,
|
|
80
|
+
signal,
|
|
81
|
+
abortHandler: undefined,
|
|
82
|
+
status: 'queued',
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
attachAbortHandler(task, signal) {
|
|
86
|
+
if (!signal)
|
|
87
|
+
return;
|
|
88
|
+
const onAbort = () => {
|
|
89
|
+
if (task.status === 'queued') {
|
|
90
|
+
this.removeQueuedTask(task);
|
|
91
|
+
task.reject(new Error('Aborted'));
|
|
92
|
+
return;
|
|
93
|
+
}
|
|
94
|
+
this.abortRunningTask(task);
|
|
95
|
+
};
|
|
96
|
+
task.abortHandler = onAbort;
|
|
97
|
+
signal.addEventListener('abort', onAbort, { once: true });
|
|
98
|
+
}
|
|
99
|
+
enqueueTask(task) {
|
|
100
|
+
this.queue.push(task);
|
|
101
|
+
this.dispatch();
|
|
102
|
+
}
|
|
103
|
+
attachWorker(slot) {
|
|
104
|
+
slot.worker.on('message', (message) => {
|
|
105
|
+
this.handleMessage(slot, message);
|
|
100
106
|
});
|
|
101
|
-
worker.on('
|
|
102
|
-
this.
|
|
107
|
+
slot.worker.on('error', (error) => {
|
|
108
|
+
this.handleWorkerFailure(slot, error);
|
|
103
109
|
});
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
const pending = this.pending.get(message.id);
|
|
108
|
-
if (pending) {
|
|
109
|
-
this.pending.delete(message.id);
|
|
110
|
-
if (message.ok) {
|
|
111
|
-
pending.resolve(message.result);
|
|
110
|
+
slot.worker.on('exit', (code) => {
|
|
111
|
+
if (code !== 0) {
|
|
112
|
+
this.handleWorkerFailure(slot, new Error(`Worker exited with code ${code}`));
|
|
112
113
|
}
|
|
113
|
-
|
|
114
|
-
pending.reject(new Error(message.error));
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
state.busy = false;
|
|
118
|
-
state.currentJobId = undefined;
|
|
119
|
-
this.schedule();
|
|
114
|
+
});
|
|
120
115
|
}
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
116
|
+
spawnWorker() {
|
|
117
|
+
const slot = {
|
|
118
|
+
worker: new Worker(this.workerUrl),
|
|
119
|
+
busy: false,
|
|
120
|
+
current: undefined,
|
|
121
|
+
};
|
|
122
|
+
this.attachWorker(slot);
|
|
123
|
+
return slot;
|
|
124
124
|
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
125
|
+
handleMessage(slot, message) {
|
|
126
|
+
const task = slot.current;
|
|
127
|
+
if (!task)
|
|
128
|
+
return;
|
|
129
|
+
if (!isWorkerResponse(message) || message.id !== task.id) {
|
|
130
|
+
this.handleWorkerFailure(slot, new Error('Unexpected worker response'));
|
|
131
|
+
return;
|
|
132
|
+
}
|
|
133
|
+
slot.current = undefined;
|
|
134
|
+
slot.busy = false;
|
|
135
|
+
this.cleanupTask(task);
|
|
136
|
+
if (message.ok) {
|
|
137
|
+
task.resolve(message.result);
|
|
128
138
|
}
|
|
129
|
-
|
|
139
|
+
else {
|
|
140
|
+
task.reject(new Error(message.error));
|
|
141
|
+
}
|
|
142
|
+
this.dispatch();
|
|
130
143
|
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
144
|
+
handleWorkerFailure(slot, error) {
|
|
145
|
+
const task = slot.current;
|
|
146
|
+
if (task) {
|
|
147
|
+
slot.current = undefined;
|
|
148
|
+
slot.busy = false;
|
|
149
|
+
this.cleanupTask(task);
|
|
150
|
+
task.reject(error instanceof Error ? error : new Error(getErrorMessage(error)));
|
|
138
151
|
}
|
|
139
|
-
|
|
140
|
-
|
|
152
|
+
logWarn('Worker thread failure', {
|
|
153
|
+
error: getErrorMessage(error),
|
|
154
|
+
});
|
|
155
|
+
this.replaceWorker(slot);
|
|
156
|
+
this.dispatch();
|
|
141
157
|
}
|
|
142
|
-
replaceWorker(
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
158
|
+
replaceWorker(slot) {
|
|
159
|
+
try {
|
|
160
|
+
void slot.worker.terminate();
|
|
161
|
+
}
|
|
162
|
+
catch {
|
|
163
|
+
// Best-effort cleanup.
|
|
164
|
+
}
|
|
165
|
+
slot.worker = new Worker(this.workerUrl);
|
|
166
|
+
slot.busy = false;
|
|
167
|
+
slot.current = undefined;
|
|
168
|
+
this.attachWorker(slot);
|
|
169
|
+
}
|
|
170
|
+
failTask(slot, error) {
|
|
171
|
+
const task = slot.current;
|
|
172
|
+
if (!task)
|
|
147
173
|
return;
|
|
148
|
-
|
|
149
|
-
|
|
174
|
+
slot.current = undefined;
|
|
175
|
+
slot.busy = false;
|
|
176
|
+
this.cleanupTask(task);
|
|
177
|
+
task.reject(error instanceof Error ? error : new Error(String(error)));
|
|
178
|
+
this.dispatch();
|
|
150
179
|
}
|
|
151
|
-
|
|
152
|
-
|
|
180
|
+
abortRunningTask(task) {
|
|
181
|
+
const slot = this.slots.find((s) => s.current?.id === task.id);
|
|
182
|
+
if (!slot)
|
|
153
183
|
return;
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
if (!job)
|
|
161
|
-
return;
|
|
162
|
-
workerState.busy = true;
|
|
163
|
-
workerState.currentJobId = job.id;
|
|
164
|
-
workerState.worker.postMessage(job);
|
|
184
|
+
this.handleWorkerFailure(slot, new Error('Aborted'));
|
|
185
|
+
}
|
|
186
|
+
removeQueuedTask(task) {
|
|
187
|
+
const index = this.queue.findIndex((queued) => queued.id === task.id);
|
|
188
|
+
if (index >= 0) {
|
|
189
|
+
this.queue.splice(index, 1);
|
|
165
190
|
}
|
|
191
|
+
this.cleanupTask(task);
|
|
166
192
|
}
|
|
193
|
+
cleanupTask(task) {
|
|
194
|
+
if (task.signal && task.abortHandler) {
|
|
195
|
+
task.signal.removeEventListener('abort', task.abortHandler);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
let pool = null;
|
|
200
|
+
function getPool() {
|
|
201
|
+
if (pool)
|
|
202
|
+
return pool;
|
|
203
|
+
pool = new TransformWorkerPool(new URL('../workers/content-transform.worker.js', import.meta.url), config.workers.poolSize);
|
|
204
|
+
return pool;
|
|
205
|
+
}
|
|
206
|
+
export async function transformInWorker(request, signal) {
|
|
207
|
+
return getPool().run(request, signal);
|
|
208
|
+
}
|
|
209
|
+
export async function destroyTransformWorkers() {
|
|
210
|
+
if (!pool)
|
|
211
|
+
return;
|
|
212
|
+
const current = pool;
|
|
213
|
+
pool = null;
|
|
214
|
+
await current.destroy();
|
|
167
215
|
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { Worker } from 'node:worker_threads';
|
|
2
|
+
import type { MarkdownTransformResult, TransformOptions } from '../config/types/content.js';
|
|
3
|
+
export interface WorkerTransformRequest {
|
|
4
|
+
id: number;
|
|
5
|
+
html: string;
|
|
6
|
+
url: string;
|
|
7
|
+
options: TransformOptions;
|
|
8
|
+
}
|
|
9
|
+
export type WorkerTransformResponse = {
|
|
10
|
+
id: number;
|
|
11
|
+
ok: true;
|
|
12
|
+
result: MarkdownTransformResult;
|
|
13
|
+
} | {
|
|
14
|
+
id: number;
|
|
15
|
+
ok: false;
|
|
16
|
+
error: string;
|
|
17
|
+
};
|
|
18
|
+
export interface TransformTask {
|
|
19
|
+
id: number;
|
|
20
|
+
request: WorkerTransformRequest;
|
|
21
|
+
resolve: (result: MarkdownTransformResult) => void;
|
|
22
|
+
reject: (error: Error) => void;
|
|
23
|
+
signal: AbortSignal | undefined;
|
|
24
|
+
abortHandler: (() => void) | undefined;
|
|
25
|
+
status: 'queued' | 'running';
|
|
26
|
+
}
|
|
27
|
+
export interface WorkerSlot {
|
|
28
|
+
worker: Worker;
|
|
29
|
+
busy: boolean;
|
|
30
|
+
current: TransformTask | undefined;
|
|
31
|
+
}
|
|
32
|
+
export declare function isWorkerResponse(value: unknown): value is WorkerTransformResponse;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { isRecord } from '../utils/guards.js';
|
|
2
|
+
export function isWorkerResponse(value) {
|
|
3
|
+
if (!isRecord(value))
|
|
4
|
+
return false;
|
|
5
|
+
if (typeof value.id !== 'number')
|
|
6
|
+
return false;
|
|
7
|
+
if (value.ok === true) {
|
|
8
|
+
return 'result' in value;
|
|
9
|
+
}
|
|
10
|
+
if (value.ok === false) {
|
|
11
|
+
return typeof value.error === 'string';
|
|
12
|
+
}
|
|
13
|
+
return false;
|
|
14
|
+
}
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import type { FetchMarkdownInput, ToolResponseBase } from '../../config/types/tools.js';
|
|
2
|
-
import {
|
|
2
|
+
import { transformHtmlToMarkdown } from '../utils/content-transform.js';
|
|
3
3
|
import { performSharedFetch } from './fetch-single.shared.js';
|
|
4
4
|
export declare const FETCH_MARKDOWN_TOOL_NAME = "fetch-markdown";
|
|
5
|
-
export declare const FETCH_MARKDOWN_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format with optional frontmatter
|
|
5
|
+
export declare const FETCH_MARKDOWN_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format with optional frontmatter";
|
|
6
6
|
interface FetchMarkdownDeps {
|
|
7
7
|
readonly performSharedFetch?: typeof performSharedFetch;
|
|
8
|
-
readonly transformHtmlToMarkdown?: typeof
|
|
8
|
+
readonly transformHtmlToMarkdown?: typeof transformHtmlToMarkdown;
|
|
9
9
|
}
|
|
10
10
|
export declare function createFetchMarkdownToolHandler(deps?: FetchMarkdownDeps): (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
|
|
11
|
-
export declare const fetchMarkdownToolHandler: (input: FetchMarkdownInput) => Promise<ToolResponseBase>;
|
|
12
11
|
export {};
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { config } from '../../config/index.js';
|
|
2
2
|
import { logDebug, logError } from '../../services/logger.js';
|
|
3
3
|
import { createToolErrorResponse, handleToolError, } from '../../utils/tool-error-handler.js';
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
4
|
+
import { transformHtmlToMarkdown } from '../utils/content-transform.js';
|
|
5
|
+
import { buildToolContentBlocks, performSharedFetch, } from './fetch-single.shared.js';
|
|
6
6
|
export const FETCH_MARKDOWN_TOOL_NAME = 'fetch-markdown';
|
|
7
|
-
export const FETCH_MARKDOWN_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format with optional frontmatter
|
|
7
|
+
export const FETCH_MARKDOWN_TOOL_DESCRIPTION = 'Fetches a webpage and converts it to clean Markdown format with optional frontmatter';
|
|
8
8
|
function isRecord(value) {
|
|
9
9
|
return value !== null && typeof value === 'object';
|
|
10
10
|
}
|
|
@@ -13,22 +13,18 @@ function deserializeMarkdownPipelineResult(cached) {
|
|
|
13
13
|
const parsed = JSON.parse(cached);
|
|
14
14
|
if (!isRecord(parsed))
|
|
15
15
|
return undefined;
|
|
16
|
-
const { content, markdown, title
|
|
16
|
+
const { content, markdown, title } = parsed;
|
|
17
17
|
if (typeof content !== 'string')
|
|
18
18
|
return undefined;
|
|
19
19
|
if (typeof markdown !== 'string')
|
|
20
20
|
return undefined;
|
|
21
21
|
if (title !== undefined && typeof title !== 'string')
|
|
22
22
|
return undefined;
|
|
23
|
-
if (truncated !== undefined && typeof truncated !== 'boolean') {
|
|
24
|
-
return undefined;
|
|
25
|
-
}
|
|
26
|
-
const resolvedTitle = typeof title === 'string' ? title : undefined;
|
|
27
23
|
return {
|
|
28
24
|
content,
|
|
29
25
|
markdown,
|
|
30
|
-
title:
|
|
31
|
-
truncated:
|
|
26
|
+
title: typeof title === 'string' ? title : undefined,
|
|
27
|
+
truncated: false,
|
|
32
28
|
};
|
|
33
29
|
}
|
|
34
30
|
catch {
|
|
@@ -37,35 +33,15 @@ function deserializeMarkdownPipelineResult(cached) {
|
|
|
37
33
|
}
|
|
38
34
|
function resolveMarkdownOptions(input) {
|
|
39
35
|
return {
|
|
40
|
-
extractMainContent: input.extractMainContent ?? config.extraction.extractMainContent,
|
|
41
36
|
includeMetadata: input.includeMetadata ?? config.extraction.includeMetadata,
|
|
42
|
-
...(input.maxContentLength !== undefined && {
|
|
43
|
-
maxContentLength: input.maxContentLength,
|
|
44
|
-
}),
|
|
45
37
|
};
|
|
46
38
|
}
|
|
47
|
-
function
|
|
39
|
+
function buildMarkdownStructuredContent(pipeline, inlineResult) {
|
|
48
40
|
return {
|
|
49
|
-
fetchedAt: new Date().toISOString(),
|
|
50
|
-
cached: false,
|
|
51
|
-
};
|
|
52
|
-
}
|
|
53
|
-
function buildMarkdownStructuredContent(pipeline, inlineResult, fileDownload) {
|
|
54
|
-
const structuredContent = {
|
|
55
41
|
url: pipeline.url,
|
|
56
42
|
title: pipeline.data.title,
|
|
57
|
-
|
|
58
|
-
contentSize: inlineResult.contentSize,
|
|
59
|
-
cached: pipeline.fromCache,
|
|
43
|
+
markdown: inlineResult.content,
|
|
60
44
|
};
|
|
61
|
-
if (pipeline.data.truncated || inlineResult.truncated) {
|
|
62
|
-
structuredContent.truncated = true;
|
|
63
|
-
}
|
|
64
|
-
applyInlineResultToStructuredContent(structuredContent, inlineResult, 'markdown');
|
|
65
|
-
if (fileDownload) {
|
|
66
|
-
structuredContent.file = fileDownload;
|
|
67
|
-
}
|
|
68
|
-
return structuredContent;
|
|
69
45
|
}
|
|
70
46
|
function logFetchMarkdownStart(url, options) {
|
|
71
47
|
logDebug('Fetching markdown', { url, ...options });
|
|
@@ -76,27 +52,18 @@ function buildMarkdownTransform(options, transform) {
|
|
|
76
52
|
return { ...markdownResult, content: markdownResult.markdown };
|
|
77
53
|
};
|
|
78
54
|
}
|
|
79
|
-
async function fetchMarkdownPipeline(url,
|
|
55
|
+
async function fetchMarkdownPipeline(url, options, performSharedFetchImpl, transformImpl) {
|
|
80
56
|
const sharedOptions = {
|
|
81
57
|
url,
|
|
82
58
|
format: 'markdown',
|
|
83
|
-
extractMainContent: options.extractMainContent,
|
|
84
59
|
includeMetadata: options.includeMetadata,
|
|
85
|
-
|
|
86
|
-
maxContentLength: options.maxContentLength,
|
|
87
|
-
}),
|
|
88
|
-
...(input.customHeaders !== undefined && {
|
|
89
|
-
customHeaders: input.customHeaders,
|
|
90
|
-
}),
|
|
91
|
-
...(input.retries !== undefined && { retries: input.retries }),
|
|
92
|
-
...(input.timeout !== undefined && { timeout: input.timeout }),
|
|
93
|
-
transform: buildMarkdownTransform(transformOptions, transformImpl),
|
|
60
|
+
transform: buildMarkdownTransform(options, transformImpl),
|
|
94
61
|
deserialize: deserializeMarkdownPipelineResult,
|
|
95
62
|
};
|
|
96
63
|
return performSharedFetchImpl(sharedOptions);
|
|
97
64
|
}
|
|
98
|
-
function buildMarkdownResponse(pipeline, inlineResult
|
|
99
|
-
const structuredContent = buildMarkdownStructuredContent(pipeline, inlineResult
|
|
65
|
+
function buildMarkdownResponse(pipeline, inlineResult) {
|
|
66
|
+
const structuredContent = buildMarkdownStructuredContent(pipeline, inlineResult);
|
|
100
67
|
return {
|
|
101
68
|
content: buildToolContentBlocks(structuredContent, pipeline.fromCache, inlineResult, 'Fetched markdown', pipeline.cacheKey, pipeline.data.content, 'markdown', pipeline.url, pipeline.data.title),
|
|
102
69
|
structuredContent,
|
|
@@ -104,46 +71,27 @@ function buildMarkdownResponse(pipeline, inlineResult, fileDownload) {
|
|
|
104
71
|
}
|
|
105
72
|
export function createFetchMarkdownToolHandler(deps = {}) {
|
|
106
73
|
const performSharedFetchImpl = deps.performSharedFetch ?? performSharedFetch;
|
|
107
|
-
const transformImpl = deps.transformHtmlToMarkdown ??
|
|
74
|
+
const transformImpl = deps.transformHtmlToMarkdown ?? transformHtmlToMarkdown;
|
|
108
75
|
return async (input) => {
|
|
109
76
|
try {
|
|
110
77
|
return await executeFetchMarkdown(input, performSharedFetchImpl, transformImpl);
|
|
111
78
|
}
|
|
112
79
|
catch (error) {
|
|
113
80
|
logError('fetch-markdown tool error', error instanceof Error ? error : undefined);
|
|
114
|
-
|
|
115
|
-
return handleToolError(error, input.url, 'Failed to fetch markdown', errorDetails);
|
|
81
|
+
return handleToolError(error, input.url, 'Failed to fetch markdown');
|
|
116
82
|
}
|
|
117
83
|
};
|
|
118
84
|
}
|
|
119
|
-
export const fetchMarkdownToolHandler = createFetchMarkdownToolHandler();
|
|
120
85
|
async function executeFetchMarkdown(input, performSharedFetchImpl, transformImpl) {
|
|
121
86
|
const { url } = input;
|
|
122
87
|
if (!url) {
|
|
123
|
-
return createToolErrorResponse('URL is required', ''
|
|
88
|
+
return createToolErrorResponse('URL is required', '');
|
|
124
89
|
}
|
|
125
90
|
const options = resolveMarkdownOptions(input);
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
if (inlineError)
|
|
131
|
-
return inlineError;
|
|
132
|
-
let fileDownload = null;
|
|
133
|
-
if (inlineResult.resourceUri) {
|
|
134
|
-
const downloadContext = {
|
|
135
|
-
cacheKey: pipeline.cacheKey ?? null,
|
|
136
|
-
url: pipeline.url,
|
|
137
|
-
};
|
|
138
|
-
if (pipeline.data.title !== undefined) {
|
|
139
|
-
fileDownload = getFileDownloadInfo({
|
|
140
|
-
...downloadContext,
|
|
141
|
-
title: pipeline.data.title,
|
|
142
|
-
});
|
|
143
|
-
}
|
|
144
|
-
else {
|
|
145
|
-
fileDownload = getFileDownloadInfo(downloadContext);
|
|
146
|
-
}
|
|
91
|
+
logFetchMarkdownStart(url, options);
|
|
92
|
+
const { pipeline, inlineResult } = await fetchMarkdownPipeline(url, options, performSharedFetchImpl, transformImpl);
|
|
93
|
+
if (inlineResult.error) {
|
|
94
|
+
return createToolErrorResponse(inlineResult.error, url);
|
|
147
95
|
}
|
|
148
|
-
return buildMarkdownResponse(pipeline, inlineResult
|
|
96
|
+
return buildMarkdownResponse(pipeline, inlineResult);
|
|
149
97
|
}
|
|
@@ -1,21 +1,10 @@
|
|
|
1
1
|
import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
2
|
-
import type { FileDownloadInfo, ToolResponseBase } from '../../config/types/tools.js';
|
|
3
2
|
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
4
3
|
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
5
|
-
type SharedFetchFormat = 'jsonl' | 'markdown';
|
|
6
4
|
interface SharedFetchOptions<T extends {
|
|
7
5
|
content: string;
|
|
8
6
|
}> {
|
|
9
7
|
readonly url: string;
|
|
10
|
-
readonly format: SharedFetchFormat;
|
|
11
|
-
readonly extractMainContent: boolean;
|
|
12
|
-
readonly includeMetadata: boolean;
|
|
13
|
-
readonly maxContentLength?: number;
|
|
14
|
-
readonly includeContentBlocks?: boolean;
|
|
15
|
-
readonly cacheVariant?: string;
|
|
16
|
-
readonly customHeaders?: Record<string, string>;
|
|
17
|
-
readonly retries?: number;
|
|
18
|
-
readonly timeout?: number;
|
|
19
8
|
readonly transform: (html: string, normalizedUrl: string) => T | Promise<T>;
|
|
20
9
|
readonly serialize?: (result: T) => string;
|
|
21
10
|
readonly deserialize?: (cached: string) => T | undefined;
|
|
@@ -30,13 +19,5 @@ export declare function performSharedFetch<T extends {
|
|
|
30
19
|
inlineResult: ReturnType<typeof applyInlineContentLimit>;
|
|
31
20
|
}>;
|
|
32
21
|
export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
|
|
33
|
-
|
|
34
|
-
cacheKey: string | null;
|
|
35
|
-
url: string;
|
|
36
|
-
title?: string;
|
|
37
|
-
}
|
|
38
|
-
export declare function getFileDownloadInfo(context: DownloadContext): FileDownloadInfo | null;
|
|
39
|
-
export declare function getInlineErrorResponse(inlineResult: InlineResult, url: string, details?: Record<string, unknown>): ToolResponseBase | null;
|
|
40
|
-
export declare function applyInlineResultToStructuredContent(structuredContent: Record<string, unknown>, inlineResult: InlineResult, contentKey: string): void;
|
|
41
|
-
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, format?: SharedFetchFormat, url?: string, title?: string): ToolContentBlock[];
|
|
22
|
+
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, url?: string, title?: string): ToolContentBlock[];
|
|
42
23
|
export {};
|