@j0hanz/superfetch 2.0.1 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +121 -38
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +674 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +10 -3
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +82 -0
- package/dist/config.js +274 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +930 -0
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/mcp-routes.js +2 -2
- package/dist/http/mcp-sessions.d.ts +3 -5
- package/dist/http/mcp-sessions.js +8 -8
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.d.ts +0 -10
- package/dist/http/server.js +33 -333
- package/dist/http.d.ts +86 -0
- package/dist/http.js +1507 -0
- package/dist/index.js +3 -3
- package/dist/instructions.md +96 -0
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +104 -0
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/server.js +20 -5
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +28 -2
- package/dist/services/fetcher.d.ts +2 -0
- package/dist/services/fetcher.js +35 -14
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-url.tool.js +8 -6
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +3 -5
- package/dist/tools/utils/content-transform.js +35 -148
- package/dist/tools/utils/raw-markdown.js +15 -1
- package/dist/tools.d.ts +109 -0
- package/dist/tools.js +434 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1814 -0
- package/dist/transformers/markdown.d.ts +4 -1
- package/dist/transformers/markdown.js +182 -53
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language.d.ts +0 -9
- package/dist/utils/code-language.js +5 -5
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +8 -5
- package/dist/utils.d.ts +1 -0
- package/dist/utils.js +3 -0
- package/dist/workers/transform-worker.js +80 -38
- package/package.json +10 -9
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
2
|
+
import { performance } from 'node:perf_hooks';
|
|
3
|
+
import { redactUrl } from '../utils/url-redactor.js';
|
|
4
|
+
import { getOperationId, getRequestId } from './context.js';
|
|
5
|
+
const transformChannel = diagnosticsChannel.channel('superfetch.transform');
|
|
6
|
+
function publishTransformEvent(event) {
|
|
7
|
+
if (!transformChannel.hasSubscribers)
|
|
8
|
+
return;
|
|
9
|
+
try {
|
|
10
|
+
transformChannel.publish(event);
|
|
11
|
+
}
|
|
12
|
+
catch {
|
|
13
|
+
// Avoid crashing the publisher if a subscriber throws.
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
export function startTransformStage(url, stage) {
|
|
17
|
+
if (!transformChannel.hasSubscribers)
|
|
18
|
+
return null;
|
|
19
|
+
return {
|
|
20
|
+
stage,
|
|
21
|
+
startTime: performance.now(),
|
|
22
|
+
url: redactUrl(url),
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
export function endTransformStage(context, options) {
|
|
26
|
+
if (!context)
|
|
27
|
+
return;
|
|
28
|
+
const requestId = getRequestId();
|
|
29
|
+
const operationId = getOperationId();
|
|
30
|
+
const event = {
|
|
31
|
+
v: 1,
|
|
32
|
+
type: 'stage',
|
|
33
|
+
stage: context.stage,
|
|
34
|
+
durationMs: performance.now() - context.startTime,
|
|
35
|
+
url: context.url,
|
|
36
|
+
...(requestId ? { requestId } : {}),
|
|
37
|
+
...(operationId ? { operationId } : {}),
|
|
38
|
+
...(options?.truncated !== undefined
|
|
39
|
+
? { truncated: options.truncated }
|
|
40
|
+
: {}),
|
|
41
|
+
};
|
|
42
|
+
publishTransformEvent(event);
|
|
43
|
+
}
|
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
import type { MarkdownTransformResult } from '../config/types/content.js';
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
2
|
+
interface TransformWorkerPool {
|
|
3
|
+
transform(html: string, url: string, options: {
|
|
4
|
+
includeMetadata: boolean;
|
|
5
|
+
signal?: AbortSignal;
|
|
6
|
+
}): Promise<MarkdownTransformResult>;
|
|
7
|
+
close(): Promise<void>;
|
|
8
|
+
}
|
|
9
|
+
export declare function getOrCreateTransformWorkerPool(): TransformWorkerPool;
|
|
10
|
+
export declare function shutdownTransformWorkerPool(): Promise<void>;
|
|
11
|
+
export {};
|
|
@@ -1,215 +1,244 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import os from 'node:os';
|
|
1
3
|
import { Worker } from 'node:worker_threads';
|
|
2
|
-
import {
|
|
3
|
-
import { getErrorMessage } from '../utils/error-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
4
|
+
import { FetchError } from '../errors/app-error.js';
|
|
5
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
6
|
+
let pool = null;
|
|
7
|
+
function resolveDefaultWorkerCount() {
|
|
8
|
+
const parallelism = typeof os.availableParallelism === 'function'
|
|
9
|
+
? os.availableParallelism()
|
|
10
|
+
: os.cpus().length;
|
|
11
|
+
// Leave 1 core for the event loop; cap to avoid runaway memory.
|
|
12
|
+
return Math.min(16, Math.max(1, parallelism - 1));
|
|
13
|
+
}
|
|
14
|
+
const DEFAULT_TIMEOUT_MS = 30000;
|
|
15
|
+
export function getOrCreateTransformWorkerPool() {
|
|
16
|
+
pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
|
|
17
|
+
return pool;
|
|
18
|
+
}
|
|
19
|
+
export async function shutdownTransformWorkerPool() {
|
|
20
|
+
if (!pool)
|
|
21
|
+
return;
|
|
22
|
+
await pool.close();
|
|
23
|
+
pool = null;
|
|
24
|
+
}
|
|
25
|
+
class WorkerPool {
|
|
26
|
+
workers = [];
|
|
9
27
|
queue = [];
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
return Promise.reject(new Error('Worker pool is shut down'));
|
|
21
|
-
}
|
|
22
|
-
return new Promise((resolve, reject) => {
|
|
23
|
-
if (signal?.aborted) {
|
|
24
|
-
reject(new Error('Aborted'));
|
|
25
|
-
return;
|
|
26
|
-
}
|
|
27
|
-
const task = this.createTask(request, resolve, reject, signal);
|
|
28
|
-
this.attachAbortHandler(task, signal);
|
|
29
|
-
this.enqueueTask(task);
|
|
30
|
-
});
|
|
31
|
-
}
|
|
32
|
-
async destroy() {
|
|
33
|
-
if (this.destroyed)
|
|
34
|
-
return;
|
|
35
|
-
this.destroyed = true;
|
|
36
|
-
const pending = this.queue.splice(0);
|
|
37
|
-
for (const task of pending) {
|
|
38
|
-
this.cleanupTask(task);
|
|
39
|
-
task.reject(new Error('Worker pool shutting down'));
|
|
40
|
-
}
|
|
41
|
-
for (const slot of this.slots) {
|
|
42
|
-
if (slot.current) {
|
|
43
|
-
const task = slot.current;
|
|
44
|
-
slot.current = undefined;
|
|
45
|
-
slot.busy = false;
|
|
46
|
-
this.cleanupTask(task);
|
|
47
|
-
task.reject(new Error('Worker pool shutting down'));
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
await Promise.allSettled(this.slots.map((slot) => slot.worker.terminate()));
|
|
51
|
-
this.slots = [];
|
|
52
|
-
}
|
|
53
|
-
dispatch() {
|
|
54
|
-
if (this.destroyed)
|
|
55
|
-
return;
|
|
56
|
-
const idle = this.slots.find((slot) => !slot.busy);
|
|
57
|
-
if (!idle)
|
|
58
|
-
return;
|
|
59
|
-
const task = this.queue.shift();
|
|
60
|
-
if (!task)
|
|
61
|
-
return;
|
|
62
|
-
task.status = 'running';
|
|
63
|
-
idle.busy = true;
|
|
64
|
-
idle.current = task;
|
|
65
|
-
try {
|
|
66
|
-
idle.worker.postMessage(task.request);
|
|
67
|
-
}
|
|
68
|
-
catch (error) {
|
|
69
|
-
this.failTask(idle, error);
|
|
28
|
+
inflight = new Map();
|
|
29
|
+
timeoutMs;
|
|
30
|
+
queueMax;
|
|
31
|
+
closed = false;
|
|
32
|
+
constructor(size, timeoutMs) {
|
|
33
|
+
const safeSize = Math.max(1, size);
|
|
34
|
+
this.timeoutMs = timeoutMs;
|
|
35
|
+
this.queueMax = safeSize * 2;
|
|
36
|
+
for (let index = 0; index < safeSize; index += 1) {
|
|
37
|
+
this.workers.push(this.spawnWorker(index));
|
|
70
38
|
}
|
|
71
39
|
}
|
|
72
|
-
|
|
73
|
-
const
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
signal,
|
|
81
|
-
abortHandler: undefined,
|
|
82
|
-
status: 'queued',
|
|
83
|
-
};
|
|
84
|
-
}
|
|
85
|
-
attachAbortHandler(task, signal) {
|
|
86
|
-
if (!signal)
|
|
87
|
-
return;
|
|
88
|
-
const onAbort = () => {
|
|
89
|
-
if (task.status === 'queued') {
|
|
90
|
-
this.removeQueuedTask(task);
|
|
91
|
-
task.reject(new Error('Aborted'));
|
|
92
|
-
return;
|
|
93
|
-
}
|
|
94
|
-
this.abortRunningTask(task);
|
|
40
|
+
spawnWorker(workerIndex) {
|
|
41
|
+
const worker = new Worker(new URL('../workers/transform-worker.js', import.meta.url));
|
|
42
|
+
// Workers must not keep the process alive by themselves.
|
|
43
|
+
worker.unref();
|
|
44
|
+
const slot = {
|
|
45
|
+
worker,
|
|
46
|
+
busy: false,
|
|
47
|
+
currentTaskId: null,
|
|
95
48
|
};
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
}
|
|
99
|
-
enqueueTask(task) {
|
|
100
|
-
this.queue.push(task);
|
|
101
|
-
this.dispatch();
|
|
102
|
-
}
|
|
103
|
-
attachWorker(slot) {
|
|
104
|
-
slot.worker.on('message', (message) => {
|
|
105
|
-
this.handleMessage(slot, message);
|
|
49
|
+
worker.on('message', (raw) => {
|
|
50
|
+
this.onWorkerMessage(workerIndex, raw);
|
|
106
51
|
});
|
|
107
|
-
|
|
108
|
-
this.
|
|
52
|
+
worker.on('error', (error) => {
|
|
53
|
+
this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
|
|
109
54
|
});
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
this.handleWorkerFailure(slot, new Error(`Worker exited with code ${code}`));
|
|
113
|
-
}
|
|
55
|
+
worker.on('exit', (code) => {
|
|
56
|
+
this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
|
|
114
57
|
});
|
|
115
|
-
}
|
|
116
|
-
spawnWorker() {
|
|
117
|
-
const slot = {
|
|
118
|
-
worker: new Worker(this.workerUrl),
|
|
119
|
-
busy: false,
|
|
120
|
-
current: undefined,
|
|
121
|
-
};
|
|
122
|
-
this.attachWorker(slot);
|
|
123
58
|
return slot;
|
|
124
59
|
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if (!task)
|
|
60
|
+
onWorkerBroken(workerIndex, message) {
|
|
61
|
+
if (this.closed)
|
|
128
62
|
return;
|
|
129
|
-
|
|
130
|
-
|
|
63
|
+
const slot = this.workers[workerIndex];
|
|
64
|
+
if (!slot)
|
|
65
|
+
return;
|
|
66
|
+
if (slot.busy && slot.currentTaskId) {
|
|
67
|
+
this.failTask(slot.currentTaskId, new Error(message));
|
|
68
|
+
}
|
|
69
|
+
void slot.worker.terminate();
|
|
70
|
+
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
71
|
+
this.drainQueue();
|
|
72
|
+
}
|
|
73
|
+
onWorkerMessage(workerIndex, raw) {
|
|
74
|
+
if (!raw ||
|
|
75
|
+
typeof raw !== 'object' ||
|
|
76
|
+
!('type' in raw) ||
|
|
77
|
+
!('id' in raw) ||
|
|
78
|
+
typeof raw.id !== 'string' ||
|
|
79
|
+
typeof raw.type !== 'string') {
|
|
80
|
+
return;
|
|
81
|
+
}
|
|
82
|
+
const message = raw;
|
|
83
|
+
const inflight = this.inflight.get(message.id);
|
|
84
|
+
if (!inflight)
|
|
131
85
|
return;
|
|
86
|
+
clearTimeout(inflight.timer);
|
|
87
|
+
if (inflight.signal && inflight.abortListener) {
|
|
88
|
+
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
132
89
|
}
|
|
133
|
-
|
|
134
|
-
slot
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
90
|
+
this.inflight.delete(message.id);
|
|
91
|
+
const slot = this.workers[workerIndex];
|
|
92
|
+
if (slot) {
|
|
93
|
+
slot.busy = false;
|
|
94
|
+
slot.currentTaskId = null;
|
|
95
|
+
}
|
|
96
|
+
if (message.type === 'result') {
|
|
97
|
+
inflight.resolve(message.result);
|
|
138
98
|
}
|
|
139
99
|
else {
|
|
140
|
-
|
|
100
|
+
const { error } = message;
|
|
101
|
+
if (error.name === 'FetchError') {
|
|
102
|
+
inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
|
|
103
|
+
}
|
|
104
|
+
else {
|
|
105
|
+
inflight.reject(new Error(error.message));
|
|
106
|
+
}
|
|
141
107
|
}
|
|
142
|
-
this.
|
|
108
|
+
this.drainQueue();
|
|
143
109
|
}
|
|
144
|
-
|
|
145
|
-
const
|
|
146
|
-
if (
|
|
147
|
-
|
|
110
|
+
failTask(id, error) {
|
|
111
|
+
const inflight = this.inflight.get(id);
|
|
112
|
+
if (!inflight)
|
|
113
|
+
return;
|
|
114
|
+
clearTimeout(inflight.timer);
|
|
115
|
+
if (inflight.signal && inflight.abortListener) {
|
|
116
|
+
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
117
|
+
}
|
|
118
|
+
this.inflight.delete(id);
|
|
119
|
+
inflight.reject(error);
|
|
120
|
+
const slot = this.workers[inflight.workerIndex];
|
|
121
|
+
if (slot) {
|
|
148
122
|
slot.busy = false;
|
|
149
|
-
|
|
150
|
-
task.reject(error instanceof Error ? error : new Error(getErrorMessage(error)));
|
|
123
|
+
slot.currentTaskId = null;
|
|
151
124
|
}
|
|
152
|
-
logWarn('Worker thread failure', {
|
|
153
|
-
error: getErrorMessage(error),
|
|
154
|
-
});
|
|
155
|
-
this.replaceWorker(slot);
|
|
156
|
-
this.dispatch();
|
|
157
125
|
}
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
126
|
+
async transform(html, url, options) {
|
|
127
|
+
if (this.closed) {
|
|
128
|
+
throw new Error('Transform worker pool closed');
|
|
161
129
|
}
|
|
162
|
-
|
|
163
|
-
|
|
130
|
+
if (this.queue.length >= this.queueMax) {
|
|
131
|
+
throw new Error('Transform worker queue is full');
|
|
164
132
|
}
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
133
|
+
return new Promise((resolve, reject) => {
|
|
134
|
+
this.queue.push({
|
|
135
|
+
id: randomUUID(),
|
|
136
|
+
html,
|
|
137
|
+
url,
|
|
138
|
+
includeMetadata: options.includeMetadata,
|
|
139
|
+
signal: options.signal,
|
|
140
|
+
resolve,
|
|
141
|
+
reject,
|
|
142
|
+
});
|
|
143
|
+
this.drainQueue();
|
|
144
|
+
});
|
|
169
145
|
}
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
if (!task)
|
|
146
|
+
drainQueue() {
|
|
147
|
+
if (this.queue.length === 0)
|
|
173
148
|
return;
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
149
|
+
for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
|
|
150
|
+
const slot = this.workers[workerIndex];
|
|
151
|
+
if (!slot || slot.busy)
|
|
152
|
+
continue;
|
|
153
|
+
const task = this.queue.shift();
|
|
154
|
+
if (!task)
|
|
155
|
+
return;
|
|
156
|
+
this.dispatch(workerIndex, slot, task);
|
|
157
|
+
if (this.queue.length === 0)
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
179
160
|
}
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
161
|
+
dispatch(workerIndex, slot, task) {
|
|
162
|
+
if (task.signal?.aborted) {
|
|
163
|
+
task.reject(new FetchError('Request was canceled', task.url, 499, {
|
|
164
|
+
reason: 'aborted',
|
|
165
|
+
stage: 'transform:dispatch',
|
|
166
|
+
}));
|
|
183
167
|
return;
|
|
184
|
-
this.handleWorkerFailure(slot, new Error('Aborted'));
|
|
185
|
-
}
|
|
186
|
-
removeQueuedTask(task) {
|
|
187
|
-
const index = this.queue.findIndex((queued) => queued.id === task.id);
|
|
188
|
-
if (index >= 0) {
|
|
189
|
-
this.queue.splice(index, 1);
|
|
190
168
|
}
|
|
191
|
-
|
|
169
|
+
slot.busy = true;
|
|
170
|
+
slot.currentTaskId = task.id;
|
|
171
|
+
const timer = setTimeout(() => {
|
|
172
|
+
try {
|
|
173
|
+
slot.worker.postMessage({ type: 'cancel', id: task.id });
|
|
174
|
+
}
|
|
175
|
+
catch {
|
|
176
|
+
// ignore
|
|
177
|
+
}
|
|
178
|
+
const inflight = this.inflight.get(task.id);
|
|
179
|
+
if (!inflight)
|
|
180
|
+
return;
|
|
181
|
+
clearTimeout(inflight.timer);
|
|
182
|
+
if (inflight.signal && inflight.abortListener) {
|
|
183
|
+
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
184
|
+
}
|
|
185
|
+
this.inflight.delete(task.id);
|
|
186
|
+
inflight.reject(new FetchError('Request timeout', task.url, 504, {
|
|
187
|
+
reason: 'timeout',
|
|
188
|
+
stage: 'transform:worker-timeout',
|
|
189
|
+
}));
|
|
190
|
+
if (!this.closed) {
|
|
191
|
+
void slot.worker.terminate();
|
|
192
|
+
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
193
|
+
this.drainQueue();
|
|
194
|
+
}
|
|
195
|
+
}, this.timeoutMs).unref();
|
|
196
|
+
let abortListener;
|
|
197
|
+
if (task.signal) {
|
|
198
|
+
abortListener = () => {
|
|
199
|
+
try {
|
|
200
|
+
slot.worker.postMessage({ type: 'cancel', id: task.id });
|
|
201
|
+
}
|
|
202
|
+
catch {
|
|
203
|
+
// ignore
|
|
204
|
+
}
|
|
205
|
+
};
|
|
206
|
+
task.signal.addEventListener('abort', abortListener, { once: true });
|
|
207
|
+
}
|
|
208
|
+
this.inflight.set(task.id, {
|
|
209
|
+
resolve: task.resolve,
|
|
210
|
+
reject: task.reject,
|
|
211
|
+
timer,
|
|
212
|
+
signal: task.signal,
|
|
213
|
+
abortListener,
|
|
214
|
+
workerIndex,
|
|
215
|
+
});
|
|
216
|
+
slot.worker.postMessage({
|
|
217
|
+
type: 'transform',
|
|
218
|
+
id: task.id,
|
|
219
|
+
html: task.html,
|
|
220
|
+
url: task.url,
|
|
221
|
+
includeMetadata: task.includeMetadata,
|
|
222
|
+
});
|
|
192
223
|
}
|
|
193
|
-
|
|
194
|
-
if (
|
|
195
|
-
|
|
224
|
+
async close() {
|
|
225
|
+
if (this.closed)
|
|
226
|
+
return;
|
|
227
|
+
this.closed = true;
|
|
228
|
+
const terminations = this.workers.map((slot) => slot.worker.terminate());
|
|
229
|
+
this.workers.length = 0;
|
|
230
|
+
for (const [id, inflight] of this.inflight.entries()) {
|
|
231
|
+
clearTimeout(inflight.timer);
|
|
232
|
+
if (inflight.signal && inflight.abortListener) {
|
|
233
|
+
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
234
|
+
}
|
|
235
|
+
inflight.reject(new Error('Transform worker pool closed'));
|
|
236
|
+
this.inflight.delete(id);
|
|
196
237
|
}
|
|
238
|
+
for (const task of this.queue) {
|
|
239
|
+
task.reject(new Error('Transform worker pool closed'));
|
|
240
|
+
}
|
|
241
|
+
this.queue.length = 0;
|
|
242
|
+
await Promise.allSettled(terminations);
|
|
197
243
|
}
|
|
198
244
|
}
|
|
199
|
-
let pool = null;
|
|
200
|
-
function getPool() {
|
|
201
|
-
if (pool)
|
|
202
|
-
return pool;
|
|
203
|
-
pool = new TransformWorkerPool(new URL('../workers/content-transform.worker.js', import.meta.url), config.workers.poolSize);
|
|
204
|
-
return pool;
|
|
205
|
-
}
|
|
206
|
-
export async function transformInWorker(request, signal) {
|
|
207
|
-
return getPool().run(request, signal);
|
|
208
|
-
}
|
|
209
|
-
export async function destroyTransformWorkers() {
|
|
210
|
-
if (!pool)
|
|
211
|
-
return;
|
|
212
|
-
const current = pool;
|
|
213
|
-
pool = null;
|
|
214
|
-
await current.destroy();
|
|
215
|
-
}
|
|
@@ -54,8 +54,8 @@ function deserializeMarkdownResult(cached) {
|
|
|
54
54
|
return parseCachedMarkdownResult(cached);
|
|
55
55
|
}
|
|
56
56
|
function buildMarkdownTransform() {
|
|
57
|
-
return (html, url) => {
|
|
58
|
-
const result = transformHtmlToMarkdown(html, url, {
|
|
57
|
+
return async (html, url) => {
|
|
58
|
+
const result = await transformHtmlToMarkdown(html, url, {
|
|
59
59
|
includeMetadata: true,
|
|
60
60
|
});
|
|
61
61
|
return { ...result, content: result.markdown };
|
|
@@ -68,9 +68,11 @@ function serializeMarkdownResult(result) {
|
|
|
68
68
|
truncated: result.truncated,
|
|
69
69
|
});
|
|
70
70
|
}
|
|
71
|
-
function buildStructuredContent(pipeline, inlineResult) {
|
|
71
|
+
function buildStructuredContent(pipeline, inlineResult, inputUrl) {
|
|
72
72
|
return {
|
|
73
73
|
url: pipeline.url,
|
|
74
|
+
resolvedUrl: pipeline.url,
|
|
75
|
+
inputUrl,
|
|
74
76
|
title: pipeline.data.title,
|
|
75
77
|
markdown: inlineResult.content,
|
|
76
78
|
};
|
|
@@ -89,8 +91,8 @@ async function fetchPipeline(url) {
|
|
|
89
91
|
deserialize: deserializeMarkdownResult,
|
|
90
92
|
});
|
|
91
93
|
}
|
|
92
|
-
function buildResponse(pipeline, inlineResult) {
|
|
93
|
-
const structuredContent = buildStructuredContent(pipeline, inlineResult);
|
|
94
|
+
function buildResponse(pipeline, inlineResult, inputUrl) {
|
|
95
|
+
const structuredContent = buildStructuredContent(pipeline, inlineResult, inputUrl);
|
|
94
96
|
const content = buildFetchUrlContentBlocks(structuredContent, pipeline, inlineResult);
|
|
95
97
|
return {
|
|
96
98
|
content,
|
|
@@ -113,5 +115,5 @@ async function executeFetch(input) {
|
|
|
113
115
|
if (inlineResult.error) {
|
|
114
116
|
return createToolErrorResponse(inlineResult.error, url);
|
|
115
117
|
}
|
|
116
|
-
return buildResponse(pipeline, inlineResult);
|
|
118
|
+
return buildResponse(pipeline, inlineResult, url);
|
|
117
119
|
}
|
package/dist/tools/index.d.ts
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
1
|
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
|
+
export declare function withRequestContextIfMissing<TParams, TResult>(handler: (params: TParams) => Promise<TResult>): (params: TParams) => Promise<TResult>;
|
|
2
3
|
export declare function registerTools(server: McpServer): void;
|
package/dist/tools/index.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import { getRequestId, runWithRequestContext } from '../services/context.js';
|
|
1
3
|
import { FETCH_URL_TOOL_DESCRIPTION, FETCH_URL_TOOL_NAME, fetchUrlToolHandler, } from './handlers/fetch-url.tool.js';
|
|
2
4
|
import { fetchUrlInputSchema, fetchUrlOutputSchema } from './schemas.js';
|
|
3
5
|
const TOOL_DEFINITION = {
|
|
@@ -14,6 +16,16 @@ const TOOL_DEFINITION = {
|
|
|
14
16
|
openWorldHint: true,
|
|
15
17
|
},
|
|
16
18
|
};
|
|
19
|
+
export function withRequestContextIfMissing(handler) {
|
|
20
|
+
return async (params) => {
|
|
21
|
+
const existingRequestId = getRequestId();
|
|
22
|
+
if (existingRequestId) {
|
|
23
|
+
return handler(params);
|
|
24
|
+
}
|
|
25
|
+
const requestId = randomUUID();
|
|
26
|
+
return runWithRequestContext({ requestId, operationId: requestId }, () => handler(params));
|
|
27
|
+
};
|
|
28
|
+
}
|
|
17
29
|
export function registerTools(server) {
|
|
18
30
|
server.registerTool(TOOL_DEFINITION.name, {
|
|
19
31
|
title: TOOL_DEFINITION.title,
|
|
@@ -21,5 +33,5 @@ export function registerTools(server) {
|
|
|
21
33
|
inputSchema: TOOL_DEFINITION.inputSchema,
|
|
22
34
|
outputSchema: TOOL_DEFINITION.outputSchema,
|
|
23
35
|
annotations: TOOL_DEFINITION.annotations,
|
|
24
|
-
}, TOOL_DEFINITION.handler);
|
|
36
|
+
}, withRequestContextIfMissing(TOOL_DEFINITION.handler));
|
|
25
37
|
}
|
package/dist/tools/schemas.d.ts
CHANGED
|
@@ -4,6 +4,8 @@ export declare const fetchUrlInputSchema: z.ZodObject<{
|
|
|
4
4
|
}, z.core.$strict>;
|
|
5
5
|
export declare const fetchUrlOutputSchema: z.ZodObject<{
|
|
6
6
|
url: z.ZodString;
|
|
7
|
+
inputUrl: z.ZodOptional<z.ZodString>;
|
|
8
|
+
resolvedUrl: z.ZodOptional<z.ZodString>;
|
|
7
9
|
title: z.ZodOptional<z.ZodString>;
|
|
8
10
|
markdown: z.ZodOptional<z.ZodString>;
|
|
9
11
|
error: z.ZodOptional<z.ZodString>;
|
package/dist/tools/schemas.js
CHANGED
|
@@ -4,6 +4,14 @@ export const fetchUrlInputSchema = z.strictObject({
|
|
|
4
4
|
});
|
|
5
5
|
export const fetchUrlOutputSchema = z.strictObject({
|
|
6
6
|
url: z.string().describe('The fetched URL'),
|
|
7
|
+
inputUrl: z
|
|
8
|
+
.string()
|
|
9
|
+
.optional()
|
|
10
|
+
.describe('The original URL provided by the caller'),
|
|
11
|
+
resolvedUrl: z
|
|
12
|
+
.string()
|
|
13
|
+
.optional()
|
|
14
|
+
.describe('The normalized or transformed URL that was fetched'),
|
|
7
15
|
title: z.string().optional().describe('Page title'),
|
|
8
16
|
markdown: z
|
|
9
17
|
.string()
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { ExtractedArticle, ExtractedMetadata, MarkdownTransformResult, MetadataBlock, TransformOptions } from '../../config/types/content.js';
|
|
2
|
+
export declare function isExtractionSufficient(article: ExtractedArticle | null, originalHtml: string): boolean;
|
|
3
|
+
export declare function determineContentExtractionSource(article: ExtractedArticle | null): article is ExtractedArticle;
|
|
4
|
+
export declare function createContentMetadataBlock(url: string, article: ExtractedArticle | null, extractedMeta: ExtractedMetadata, shouldExtractFromArticle: boolean, includeMetadata: boolean): MetadataBlock | undefined;
|
|
5
|
+
export declare function transformHtmlToMarkdownInProcess(html: string, url: string, options: TransformOptions): MarkdownTransformResult;
|