@j0hanz/fetch-url-mcp 1.12.7 → 1.12.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +2 -2
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +4 -5
- package/dist/http/index.d.ts +6 -0
- package/dist/http/index.d.ts.map +1 -0
- package/dist/http/index.js +5 -0
- package/dist/http/native.d.ts +73 -0
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +554 -10
- package/dist/http/rate-limit.d.ts +1 -1
- package/dist/http/rate-limit.d.ts.map +1 -1
- package/dist/http/rate-limit.js +3 -4
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +67 -6
- package/dist/lib/config.js +2 -2
- package/dist/lib/core.d.ts +56 -4
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +155 -4
- package/dist/lib/error/classes.d.ts +19 -0
- package/dist/lib/error/classes.d.ts.map +1 -0
- package/dist/lib/error/classes.js +107 -0
- package/dist/lib/error/classify.d.ts +4 -0
- package/dist/lib/error/classify.d.ts.map +1 -0
- package/dist/lib/error/classify.js +154 -0
- package/dist/lib/error/codes.d.ts +23 -0
- package/dist/lib/error/codes.d.ts.map +1 -0
- package/dist/lib/error/codes.js +22 -0
- package/dist/lib/error/index.d.ts +6 -0
- package/dist/lib/error/index.d.ts.map +1 -0
- package/dist/lib/error/index.js +5 -0
- package/dist/lib/{error-messages.d.ts → error/messages.d.ts} +2 -2
- package/dist/lib/error/messages.d.ts.map +1 -0
- package/dist/lib/{error-messages.js → error/messages.js} +2 -2
- package/dist/lib/{tool-errors.d.ts → error/payload.d.ts} +7 -13
- package/dist/lib/error/payload.d.ts.map +1 -0
- package/dist/lib/error/payload.js +108 -0
- package/dist/lib/mcp-interop.d.ts.map +1 -1
- package/dist/lib/mcp-interop.js +4 -6
- package/dist/lib/net/http.d.ts.map +1 -0
- package/dist/lib/{http.js → net/http.js} +4 -7
- package/dist/lib/net/index.d.ts +4 -0
- package/dist/lib/net/index.d.ts.map +1 -0
- package/dist/lib/net/index.js +3 -0
- package/dist/lib/{fetch-pipeline.d.ts → net/pipeline.d.ts} +3 -3
- package/dist/lib/net/pipeline.d.ts.map +1 -0
- package/dist/lib/{fetch-pipeline.js → net/pipeline.js} +3 -5
- package/dist/lib/{url.d.ts → net/url.d.ts} +1 -1
- package/dist/lib/net/url.d.ts.map +1 -0
- package/dist/lib/{url.js → net/url.js} +3 -5
- package/dist/lib/utils.d.ts +2 -18
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +29 -104
- package/dist/resources/index.d.ts.map +1 -1
- package/dist/resources/index.js +8 -5
- package/dist/schemas.d.ts +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +7 -9
- package/dist/tasks/index.d.ts +2 -0
- package/dist/tasks/index.d.ts.map +1 -0
- package/dist/tasks/index.js +1 -0
- package/dist/tasks/manager.d.ts +123 -1
- package/dist/tasks/manager.d.ts.map +1 -1
- package/dist/tasks/manager.js +745 -10
- package/dist/tools/{fetch-url.d.ts → index.d.ts} +4 -5
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/{fetch-url.js → index.js} +6 -8
- package/dist/transform/index.d.ts +279 -0
- package/dist/transform/index.d.ts.map +1 -0
- package/dist/transform/index.js +5234 -0
- package/package.json +2 -2
- package/dist/cli.d.ts +0 -19
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js +0 -65
- package/dist/http/health.d.ts +0 -8
- package/dist/http/health.d.ts.map +0 -1
- package/dist/http/health.js +0 -152
- package/dist/http/helpers.d.ts +0 -68
- package/dist/http/helpers.d.ts.map +0 -1
- package/dist/http/helpers.js +0 -402
- package/dist/lib/error-codes.d.ts +0 -13
- package/dist/lib/error-codes.d.ts.map +0 -1
- package/dist/lib/error-codes.js +0 -12
- package/dist/lib/error-messages.d.ts.map +0 -1
- package/dist/lib/fetch-pipeline.d.ts.map +0 -1
- package/dist/lib/http.d.ts.map +0 -1
- package/dist/lib/logger-names.d.ts +0 -16
- package/dist/lib/logger-names.d.ts.map +0 -1
- package/dist/lib/logger-names.js +0 -15
- package/dist/lib/session.d.ts +0 -44
- package/dist/lib/session.d.ts.map +0 -1
- package/dist/lib/session.js +0 -137
- package/dist/lib/tool-errors.d.ts.map +0 -1
- package/dist/lib/tool-errors.js +0 -253
- package/dist/lib/url.d.ts.map +0 -1
- package/dist/lib/zod.d.ts +0 -3
- package/dist/lib/zod.d.ts.map +0 -1
- package/dist/lib/zod.js +0 -27
- package/dist/tasks/call-contract.d.ts +0 -25
- package/dist/tasks/call-contract.d.ts.map +0 -1
- package/dist/tasks/call-contract.js +0 -59
- package/dist/tasks/execution.d.ts +0 -16
- package/dist/tasks/execution.d.ts.map +0 -1
- package/dist/tasks/execution.js +0 -241
- package/dist/tasks/handlers.d.ts +0 -11
- package/dist/tasks/handlers.d.ts.map +0 -1
- package/dist/tasks/handlers.js +0 -157
- package/dist/tasks/owner.d.ts +0 -43
- package/dist/tasks/owner.d.ts.map +0 -1
- package/dist/tasks/owner.js +0 -144
- package/dist/tasks/registry.d.ts +0 -20
- package/dist/tasks/registry.d.ts.map +0 -1
- package/dist/tasks/registry.js +0 -40
- package/dist/tasks/waiters.d.ts +0 -27
- package/dist/tasks/waiters.d.ts.map +0 -1
- package/dist/tasks/waiters.js +0 -114
- package/dist/tools/fetch-url.d.ts.map +0 -1
- package/dist/transform/dom-prep.d.ts +0 -16
- package/dist/transform/dom-prep.d.ts.map +0 -1
- package/dist/transform/dom-prep.js +0 -1287
- package/dist/transform/html-translators.d.ts +0 -5
- package/dist/transform/html-translators.d.ts.map +0 -1
- package/dist/transform/html-translators.js +0 -697
- package/dist/transform/markdown-cleanup.d.ts +0 -10
- package/dist/transform/markdown-cleanup.d.ts.map +0 -1
- package/dist/transform/markdown-cleanup.js +0 -542
- package/dist/transform/metadata.d.ts +0 -18
- package/dist/transform/metadata.d.ts.map +0 -1
- package/dist/transform/metadata.js +0 -462
- package/dist/transform/next-flight.d.ts +0 -2
- package/dist/transform/next-flight.d.ts.map +0 -1
- package/dist/transform/next-flight.js +0 -374
- package/dist/transform/shared.d.ts +0 -8
- package/dist/transform/shared.d.ts.map +0 -1
- package/dist/transform/shared.js +0 -137
- package/dist/transform/transform.d.ts +0 -38
- package/dist/transform/transform.d.ts.map +0 -1
- package/dist/transform/transform.js +0 -1042
- package/dist/transform/types.d.ts +0 -124
- package/dist/transform/types.d.ts.map +0 -1
- package/dist/transform/types.js +0 -5
- package/dist/transform/worker-pool.d.ts +0 -76
- package/dist/transform/worker-pool.d.ts.map +0 -1
- package/dist/transform/worker-pool.js +0 -725
- /package/dist/lib/{http.d.ts → net/http.d.ts} +0 -0
|
@@ -1,725 +0,0 @@
|
|
|
1
|
-
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
2
|
-
import { availableParallelism } from 'node:os';
|
|
3
|
-
import process from 'node:process';
|
|
4
|
-
import { isSharedArrayBuffer } from 'node:util/types';
|
|
5
|
-
import { isMainThread, isMarkedAsUntransferable, parentPort, Worker, } from 'node:worker_threads';
|
|
6
|
-
import { z } from 'zod';
|
|
7
|
-
import { config, logDebug, logInfo, logWarn } from '../lib/core.js';
|
|
8
|
-
import { SystemErrors } from '../lib/error-codes.js';
|
|
9
|
-
import { Loggers } from '../lib/logger-names.js';
|
|
10
|
-
import { createAbortError, createUnrefTimeout, FetchError, getErrorMessage, } from '../lib/utils.js';
|
|
11
|
-
import { formatZodError } from '../lib/zod.js';
|
|
12
|
-
import { extractedMetadataSchema } from '../schemas.js';
|
|
13
|
-
import { createTransformMessageHandler } from './shared.js';
|
|
14
|
-
import { transformHtmlToMarkdownInProcess } from './transform.js';
|
|
15
|
-
// Worker message validation
// Schemas for the messages a transform worker posts back to the pool. All
// objects are built with z.strictObject.
const workerResultPayloadSchema = z.strictObject({
    markdown: z.string(),
    title: z.string().optional(),
    metadata: extractedMetadataSchema.optional(),
    truncated: z.boolean(),
});
const workerErrorPayloadSchema = z.strictObject({
    name: z.string(),
    message: z.string(),
    url: z.string(),
    statusCode: z.number().int().optional(),
    details: z.record(z.string(), z.unknown()).optional(),
});
// Successful transform: carries the result payload for task `id`.
const workerResultResponseSchema = z.strictObject({
    type: z.literal('result'),
    id: z.string(),
    result: workerResultPayloadSchema,
});
// Failed transform: carries a serialized error for task `id`.
const workerErrorResponseSchema = z.strictObject({
    type: z.literal('error'),
    id: z.string(),
    error: workerErrorPayloadSchema,
});
// Cancellation notice for task `id`.
const workerCancelledResponseSchema = z.strictObject({
    type: z.literal('cancelled'),
    id: z.string(),
});
// Any worker response, discriminated on its `type` field.
const workerResponseSchema = z.discriminatedUnion('type', [
    workerResultResponseSchema,
    workerErrorResponseSchema,
    workerCancelledResponseSchema,
]);
|
|
45
|
-
/**
 * Captures the AsyncLocalStorage state that is current at call time.
 *
 * @returns {{ run: (fn: Function) => void }} An object whose `run(fn)` invokes
 *   `fn` through the captured snapshot. The return value of `fn` is discarded.
 */
function createTaskContext() {
    const invokeInSnapshot = AsyncLocalStorage.snapshot();
    return {
        run(fn) {
            invokeInSnapshot(fn);
        },
    };
}
|
|
53
|
-
/**
 * Returns a byte view that spans its entire backing buffer.
 *
 * @param {Uint8Array} buffer - View to check.
 * @returns {Uint8Array} `buffer` itself when it starts at offset 0 and covers
 *   the whole backing buffer; otherwise a fresh Uint8Array copy of its bytes.
 */
function ensureTightBuffer(buffer) {
    const startsAtZero = buffer.byteOffset === 0;
    const spansWholeBacking = startsAtZero && buffer.byteLength === buffer.buffer.byteLength;
    return spansWholeBacking ? buffer : new Uint8Array(buffer);
}
|
|
61
|
-
function getTransferableBuffer(buffer) {
|
|
62
|
-
const backingBuffer = buffer.buffer;
|
|
63
|
-
if (isSharedArrayBuffer(backingBuffer))
|
|
64
|
-
return null;
|
|
65
|
-
if (!(backingBuffer instanceof ArrayBuffer))
|
|
66
|
-
return null;
|
|
67
|
-
return isMarkedAsUntransferable(backingBuffer) ? null : backingBuffer;
|
|
68
|
-
}
|
|
69
|
-
/**
 * Builds the `transform` message for a worker, plus an optional transfer list.
 *
 * @param {object} task - Pending task; reads `id`, `url`,
 *   `includeMetadataFooter`, `inputTruncated`, `html`, `htmlBuffer`, `encoding`.
 * @returns {{ message: object, transferList?: ArrayBuffer[] }} The message, and
 *   a transfer list only when the task carries a buffer whose backing store
 *   getTransferableBuffer() accepts.
 */
function buildWorkerDispatchPayload(task) {
    const { id, url, includeMetadataFooter, inputTruncated, htmlBuffer: rawBuffer, encoding } = task;
    const message = { type: 'transform', id, url, includeMetadataFooter };
    if (inputTruncated) {
        message.inputTruncated = true;
    }
    // String input: nothing to transfer.
    if (!rawBuffer) {
        message.html = task.html;
        return { message };
    }
    // Binary input: send a view that spans its whole backing buffer.
    const tightBuffer = ensureTightBuffer(rawBuffer);
    message.htmlBuffer = tightBuffer;
    if (encoding) {
        message.encoding = encoding;
    }
    const transferable = getTransferableBuffer(tightBuffer);
    if (!transferable) {
        return { message };
    }
    return { message, transferList: [transferable] };
}
|
|
90
|
-
// Pool sizing & constants
// Baseline worker count: half of the available parallelism, clamped to [2, 4].
const POOL_MIN_WORKERS = Math.min(4, Math.max(2, Math.floor(availableParallelism() / 2)));
// Upper bound on workers, taken from configuration.
const POOL_MAX_WORKERS = config.transform.maxWorkerScale;
// Queue depth, as a fraction of capacity, above which the pool grows by one.
const POOL_SCALE_THRESHOLD = 0.5;
const WORKER_NAME_PREFIX = 'fetch-url-mcp-transform';
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
// Workers are started from this same module file.
const TRANSFORM_WORKER_PATH = new URL(import.meta.url);
// TaskQueue only compacts a partially consumed array once head passes this.
const COMPACTION_HEAD_THRESHOLD = 1024;
// Maximum queued tasks = max workers * this multiplier.
const QUEUE_CAPACITY_MULTIPLIER = 4;
const HTTP_SERVICE_UNAVAILABLE = 503;
const HTTP_GATEWAY_TIMEOUT = 504;
|
|
102
|
-
// TaskQueue — array-deque with auto-compaction
/**
 * FIFO queue stored in a plain array with a moving `head` index.
 *
 * Removed entries are overwritten with `undefined` and skipped on dequeue.
 * `activeCount` tracks live entries, so `depth` does not scan the array.
 */
class TaskQueue {
    items = [];
    head = 0;
    activeCount = 0;
    /** Number of entries that are still queued. */
    get depth() {
        return this.activeCount;
    }
    /** Appends `item` to the tail of the queue. */
    enqueue(item) {
        this.activeCount += 1;
        this.items.push(item);
    }
    /**
     * Removes and returns the oldest live entry, skipping removed slots.
     * @returns {object | null} The entry, or null when nothing is queued.
     */
    dequeue() {
        let next = null;
        while (next === null && this.head < this.items.length) {
            const candidate = this.items[this.head];
            this.head += 1;
            if (candidate) {
                next = candidate;
            }
        }
        if (next !== null) {
            this.activeCount -= 1;
        }
        this.compact();
        return next;
    }
    /**
     * Removes the first queued entry whose `id` matches.
     * @returns {object | undefined} The removed entry, or undefined if absent.
     */
    removeById(id) {
        const position = this.items.findIndex((entry, index) => index >= this.head && entry?.id === id);
        if (position === -1) {
            return undefined;
        }
        const removed = this.items[position];
        this.items[position] = undefined;
        this.activeCount -= 1;
        this.compact();
        return removed;
    }
    /** Calls `callback` for every live entry in order, then empties the queue. */
    drain(callback) {
        let cursor = this.head;
        while (cursor < this.items.length) {
            const entry = this.items[cursor];
            if (entry) {
                callback(entry);
            }
            cursor += 1;
        }
        this.items.length = 0;
        this.head = 0;
        this.activeCount = 0;
    }
    /**
     * Drops the consumed prefix when the array is fully consumed, or when head
     * is past COMPACTION_HEAD_THRESHOLD and more than half of the array.
     */
    compact() {
        if (this.head === 0) {
            return;
        }
        const fullyConsumed = this.head >= this.items.length;
        if (!fullyConsumed &&
            !(this.head > COMPACTION_HEAD_THRESHOLD && this.head > this.items.length / 2)) {
            return;
        }
        this.items.splice(0, this.head);
        this.head = 0;
    }
}
|
|
161
|
-
// CancelAckTracker — isolates the cancel-acknowledgement protocol
|
|
162
|
-
class CancelAckTracker {
|
|
163
|
-
pending = new Map();
|
|
164
|
-
earlyResolutions = new Set();
|
|
165
|
-
resolve(id) {
|
|
166
|
-
const entry = this.pending.get(id);
|
|
167
|
-
if (!entry) {
|
|
168
|
-
this.earlyResolutions.add(id);
|
|
169
|
-
return;
|
|
170
|
-
}
|
|
171
|
-
entry.timeout.cancel();
|
|
172
|
-
entry.resolve();
|
|
173
|
-
}
|
|
174
|
-
wait(id, timeoutMs) {
|
|
175
|
-
if (this.earlyResolutions.has(id)) {
|
|
176
|
-
this.earlyResolutions.delete(id);
|
|
177
|
-
return Promise.resolve();
|
|
178
|
-
}
|
|
179
|
-
const existing = this.pending.get(id);
|
|
180
|
-
if (existing)
|
|
181
|
-
return existing.promise;
|
|
182
|
-
const timeout = createUnrefTimeout(timeoutMs, undefined);
|
|
183
|
-
const { promise: racePromise, resolve } = Promise.withResolvers();
|
|
184
|
-
const promise = Promise.race([racePromise, timeout.promise])
|
|
185
|
-
.finally(() => {
|
|
186
|
-
this.pending.delete(id);
|
|
187
|
-
timeout.cancel();
|
|
188
|
-
})
|
|
189
|
-
.then(() => {
|
|
190
|
-
return;
|
|
191
|
-
});
|
|
192
|
-
this.pending.set(id, {
|
|
193
|
-
promise,
|
|
194
|
-
resolve: resolve,
|
|
195
|
-
timeout,
|
|
196
|
-
});
|
|
197
|
-
return promise;
|
|
198
|
-
}
|
|
199
|
-
dispose() {
|
|
200
|
-
for (const entry of this.pending.values()) {
|
|
201
|
-
entry.timeout.cancel();
|
|
202
|
-
entry.resolve();
|
|
203
|
-
}
|
|
204
|
-
this.pending.clear();
|
|
205
|
-
this.earlyResolutions.clear();
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
// WorkerPool
/**
 * Pool of worker threads that run transform tasks.
 *
 * Tasks enter a TaskQueue, are dispatched to idle worker slots by drainQueue(),
 * and are tracked in `inflight` until the worker replies, the per-task timeout
 * fires, or the task's AbortSignal fires. Workers are spawned lazily, one per
 * drain pass, until `workers.length` reaches `capacity`.
 *
 * Slot ownership: `workers[i]` is the slot currently owning index `i`, or
 * undefined while a replacement is waiting out a restart backoff. Events from
 * a worker that no longer occupies its slot are ignored.
 */
class WorkerPool {
    static CLOSED_MESSAGE = 'Transform worker pool closed';
    workers = [];
    capacity;
    minCapacity = POOL_MIN_WORKERS;
    maxCapacity = POOL_MAX_WORKERS;
    queue = new TaskQueue();
    // Task id -> bookkeeping for a task that has been handed to a worker.
    inflight = new Map();
    cancelAcks = new CancelAckTracker();
    timeoutMs;
    queueMax;
    closed = false;
    taskIdSeq = 0;
    busyCount = 0;
    // Re-entrancy guard for drainQueue().
    draining = false;
    // Worker index -> restart count since that index last completed a task.
    restartBackoff = new Map();
    /**
     * @param {number} size - Requested capacity; 0 keeps capacity at 0,
     *   anything else is clamped to [minCapacity, maxCapacity].
     * @param {number} timeoutMs - Per-task timeout once dispatched.
     */
    constructor(size, timeoutMs) {
        this.capacity =
            size === 0
                ? 0
                : Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        this.timeoutMs = timeoutMs;
        this.queueMax = this.maxCapacity * QUEUE_CAPACITY_MULTIPLIER;
    }
    /**
     * Queues a transform and resolves with the worker's result.
     *
     * @param {string | Uint8Array} htmlOrBuffer - Input as text or bytes.
     * @param {string} url - Source URL, forwarded to the worker and to errors.
     * @param {object} options - Reads `signal`, `includeMetadataFooter`,
     *   `inputTruncated`, and `encoding`.
     * @returns {Promise<object>} `{ markdown, truncated, title, metadata? }`.
     * @throws Rejects if the pool is closed, the signal is already aborted,
     *   or the queue is full (FetchError with status 503).
     */
    async transform(htmlOrBuffer, url, options) {
        this.ensureOpen();
        if (options.signal?.aborted)
            throw createAbortError(url, 'transform:enqueue');
        if (this.queue.depth >= this.queueMax) {
            logWarn('Transform worker queue capacity reached', {
                queueDepth: this.queue.depth,
                queueMax: this.queueMax,
                activeWorkers: this.busyCount,
                capacity: this.capacity,
                url,
            }, Loggers.LOG_TRANSFORM);
            const error = new FetchError('Transform worker queue is full', url, HTTP_SERVICE_UNAVAILABLE, {
                reason: SystemErrors.QUEUE_FULL,
                stage: 'transform:enqueue',
            });
            throw error;
        }
        const { promise, resolve, reject } = Promise.withResolvers();
        const task = this.createPendingTask(htmlOrBuffer, url, options, resolve, reject);
        this.queue.enqueue(task);
        this.drainQueue();
        return promise;
    }
    /** Number of tasks waiting in the queue. */
    getQueueDepth() {
        return this.queue.depth;
    }
    /** Number of worker slots currently marked busy. */
    getActiveWorkers() {
        return this.busyCount;
    }
    /** Current worker capacity. */
    getCapacity() {
        return this.capacity;
    }
    /** Sets capacity to `size` clamped to [minCapacity, maxCapacity]. */
    resize(size) {
        const newCapacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        if (newCapacity === this.capacity)
            return;
        this.capacity = newCapacity;
        this.drainQueue();
    }
    /**
     * Terminates all workers and rejects every inflight and queued task with
     * CLOSED_MESSAGE. Calling it again is a no-op.
     */
    async close() {
        if (this.closed)
            return;
        this.closed = true;
        logInfo('Shutting down transform worker pool', {
            workers: this.workers.length,
            activeWorkers: this.busyCount,
            queueDepth: this.queue.depth,
            inflight: this.inflight.size,
        }, Loggers.LOG_TRANSFORM);
        const terminations = this.workers
            .map((slot) => slot?.worker.terminate().catch(() => undefined))
            .filter((p) => p !== undefined);
        this.workers.fill(undefined);
        this.workers.length = 0;
        this.busyCount = 0;
        this.cancelAcks.dispose();
        for (const id of Array.from(this.inflight.keys())) {
            const inflight = this.takeInflight(id);
            if (!inflight)
                continue;
            this.abortAndCleanTask(inflight, new Error(WorkerPool.CLOSED_MESSAGE));
        }
        this.queue.drain((task) => {
            this.abortAndCleanTask(task, new Error(WorkerPool.CLOSED_MESSAGE));
        });
        await Promise.allSettled(terminations);
    }
    /** Throws if the pool has been closed. */
    ensureOpen() {
        if (this.closed)
            throw new Error(WorkerPool.CLOSED_MESSAGE);
    }
    /**
     * Builds the queue entry for a transform request and, when a signal is
     * given, registers a one-shot abort listener for it.
     */
    createPendingTask(htmlOrBuffer, url, options, resolve, reject) {
        const id = (this.taskIdSeq++).toString(36);
        // Preserve request context for resolve/reject even when callbacks fire
        // from worker thread events.
        const context = createTaskContext();
        let abortListener;
        if (options.signal) {
            abortListener = () => {
                this.onAbortSignal(id, url, context, reject);
            };
            options.signal.addEventListener('abort', abortListener, { once: true });
        }
        const task = {
            id,
            url,
            includeMetadataFooter: options.includeMetadataFooter,
            ...(options.inputTruncated
                ? { inputTruncated: options.inputTruncated }
                : {}),
            signal: options.signal,
            abortListener,
            context,
            resolve,
            reject,
        };
        if (typeof htmlOrBuffer === 'string') {
            task.html = htmlOrBuffer;
        }
        else {
            task.htmlBuffer = htmlOrBuffer;
            if (options.encoding) {
                task.encoding = options.encoding;
            }
        }
        return task;
    }
    /**
     * Abort-signal handler. A closed pool rejects with CLOSED_MESSAGE, an
     * inflight task goes through abortInflight(), and a queued task is removed
     * from the queue and rejected with an abort error.
     */
    onAbortSignal(id, url, context, reject) {
        if (this.closed) {
            this.finalizeTask(context, () => {
                reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
            return;
        }
        const inflight = this.inflight.get(id);
        if (inflight) {
            void this.abortInflight(id, url, inflight.workerIndex);
            return;
        }
        const queuedTask = this.queue.removeById(id);
        if (queuedTask) {
            this.abortAndCleanTask(queuedTask, createAbortError(url, 'transform:queued-abort'));
        }
    }
    /**
     * Cancels a task that a worker is already running: posts `cancel`, waits
     * for the acknowledgement (bounded by config.transform.cancelAckTimeoutMs),
     * rejects the task, and restarts the worker.
     */
    async abortInflight(id, url, workerIndex) {
        const slot = this.workers[workerIndex];
        const inflight = this.inflight.get(id);
        if (inflight) {
            // Makes onWorkerMessage treat any reply for this id as the ack.
            inflight.cancelPending = true;
        }
        if (slot) {
            try {
                slot.worker.postMessage({ type: 'cancel', id });
            }
            catch {
                // Worker may be unavailable; failure is acceptable during abort
            }
        }
        await this.cancelAcks.wait(id, config.transform.cancelAckTimeoutMs);
        // failTask returns false when another path already settled the task
        // while we were waiting; in that case that path owns the restart.
        const taken = this.failTask(id, createAbortError(url, 'transform:signal-abort'));
        if (taken && slot)
            this.restartWorker(workerIndex, slot);
    }
    /** Removes `listener` from `signal` when both are present. */
    clearAbortListener(signal, listener) {
        if (!signal || !listener)
            return;
        try {
            signal.removeEventListener('abort', listener);
        }
        catch {
            // Defensive: removeEventListener should not throw, but handle edge cases
        }
    }
    /**
     * Creates a worker for `workerIndex` and wires its events to the pool.
     *
     * Every handler first checks that this worker still occupies its slot.
     * A terminated worker emits 'exit' after it has been replaced; without the
     * check that event would be attributed to the replacement and trigger a
     * second restart.
     *
     * @returns {{ worker: Worker, busy: boolean, currentTaskId: null, name: string }}
     */
    spawnWorker(workerIndex) {
        const name = `${WORKER_NAME_PREFIX}-${workerIndex + 1}`;
        const resourceLimits = config.transform.workerResourceLimits;
        const worker = new Worker(TRANSFORM_WORKER_PATH, {
            name,
            ...(resourceLimits ? { resourceLimits } : {}),
        });
        logDebug('Spawned transform worker', {
            workerIndex,
            workerName: name,
        }, Loggers.LOG_TRANSFORM);
        worker.unref();
        const ownsSlot = () => this.workers[workerIndex]?.worker === worker;
        worker.on('message', (raw) => {
            if (!ownsSlot())
                return;
            this.onWorkerMessage(workerIndex, raw);
        });
        worker.on('error', (error) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        worker.on('messageerror', (error) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        worker.on('exit', (code) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code ?? 'unknown'})`);
        });
        return { worker, busy: false, currentTaskId: null, name };
    }
    /**
     * Handles a failed worker: fails the task it was running (status 503,
     * reason `worker_exit`) and restarts the slot.
     */
    onWorkerBroken(workerIndex, message) {
        if (this.closed)
            return;
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        logWarn('Transform worker unavailable; restarting', {
            reason: message,
            workerIndex,
            workerName: slot.name,
            threadId: slot.worker.threadId,
        }, Loggers.LOG_TRANSFORM);
        if (slot.busy && slot.currentTaskId) {
            try {
                this.failTask(slot.currentTaskId, new FetchError(message, '', HTTP_SERVICE_UNAVAILABLE, {
                    reason: 'worker_exit',
                }));
            }
            catch {
                this.markIdle(workerIndex);
            }
        }
        this.restartWorker(workerIndex, slot);
    }
    /**
     * Terminates the worker in `slot` (or the current occupant) and replaces it.
     *
     * The first restart for an index is immediate. Later restarts are delayed
     * by min(1000 * 2^(attempts - 1), 30000) ms, where attempts is the number
     * of earlier restarts recorded in restartBackoff. While a delayed restart
     * is pending the slot is left undefined so drainQueue() cannot dispatch to
     * the terminated worker.
     *
     * @param {number} workerIndex - Slot index to restart.
     * @param {object} [slot] - Slot the caller observed failing.
     */
    restartWorker(workerIndex, slot) {
        if (this.closed)
            return;
        const current = this.workers[workerIndex];
        const target = slot ?? current;
        if (target) {
            target.worker.terminate().catch(() => undefined);
        }
        // A caller holding a slot that has already been replaced (or vacated for
        // a pending restart) must not clobber the current occupant.
        if (slot && current !== slot)
            return;
        const attempts = this.restartBackoff.get(workerIndex) ?? 0;
        this.restartBackoff.set(workerIndex, attempts + 1);
        if (attempts > 0) {
            const delayMs = Math.min(1000 * 2 ** (attempts - 1), 30_000);
            logWarn('Scheduling transform worker restart with backoff', {
                workerIndex,
                delayMs,
                attempt: attempts + 1,
            }, Loggers.LOG_TRANSFORM);
            this.workers[workerIndex] = undefined;
            setTimeout(() => {
                if (this.closed || this.workers[workerIndex])
                    return;
                this.workers[workerIndex] = this.spawnWorker(workerIndex);
                this.drainQueue();
            }, delayMs).unref();
            return;
        }
        this.workers[workerIndex] = this.spawnWorker(workerIndex);
        this.drainQueue();
    }
    /**
     * Validates and routes a message from a worker. Invalid messages count as a
     * broken worker, `cancelled` resolves the cancel ack, and a result or error
     * settles the task and frees the slot.
     */
    onWorkerMessage(workerIndex, raw) {
        const parsed = workerResponseSchema.safeParse(raw);
        if (!parsed.success) {
            this.onWorkerBroken(workerIndex, `Transform worker sent invalid message: ${formatZodError(parsed.error)}`);
            return;
        }
        const message = parsed.data;
        if (message.type === 'cancelled') {
            this.cancelAcks.resolve(message.id);
            return;
        }
        // A result that races with a pending cancel is treated as the ack; the
        // task itself is rejected by abortInflight().
        const inflightPeek = this.inflight.get(message.id);
        if (inflightPeek?.cancelPending) {
            this.cancelAcks.resolve(message.id);
            return;
        }
        const inflight = this.takeInflight(message.id);
        if (!inflight)
            return;
        // A completed task clears the restart count for this index.
        this.restartBackoff.delete(workerIndex);
        this.markIdle(workerIndex);
        this.resolveWorkerResult(inflight, message);
        this.drainQueue();
    }
    /** Settles the task's promise from a `result` or `error` message. */
    resolveWorkerResult(inflight, message) {
        this.finalizeTask(inflight.context, () => {
            if (message.type === 'result') {
                inflight.resolve({
                    markdown: message.result.markdown,
                    truncated: message.result.truncated,
                    title: message.result.title,
                    ...(message.result.metadata
                        ? { metadata: message.result.metadata }
                        : {}),
                });
            }
            else {
                const err = message.error;
                inflight.reject(err.name === 'FetchError'
                    ? new FetchError(err.message, err.url, err.statusCode, err.details ?? {})
                    : new Error(err.message));
            }
        });
    }
    /**
     * Removes task `id` from `inflight`, cancelling its timeout and abort
     * listener.
     * @returns {object | null} The entry, or null if the task is not inflight.
     */
    takeInflight(id) {
        const inflight = this.inflight.get(id);
        if (!inflight)
            return null;
        inflight.timeout.cancel();
        this.clearAbortListener(inflight.signal, inflight.abortListener);
        this.inflight.delete(id);
        return inflight;
    }
    /** Marks the slot busy with `taskId` and updates busyCount. */
    markBusy(workerIndex, taskId) {
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        if (!slot.busy) {
            slot.busy = true;
            this.busyCount += 1;
        }
        slot.currentTaskId = taskId;
    }
    /** Marks the slot idle and updates busyCount. */
    markIdle(workerIndex) {
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        if (slot.busy) {
            slot.busy = false;
            this.busyCount -= 1;
        }
        slot.currentTaskId = null;
    }
    /**
     * Rejects inflight task `id` with `error` and frees its slot.
     * @returns {boolean} false when the task was no longer inflight.
     */
    failTask(id, error) {
        const inflight = this.takeInflight(id);
        if (!inflight)
            return false;
        this.abortAndCleanTask(inflight, error);
        this.markIdle(inflight.workerIndex);
        return true;
    }
    /**
     * Raises capacity by one when the queue is deeper than
     * capacity * POOL_SCALE_THRESHOLD and capacity is below maxCapacity.
     */
    maybeScaleUp() {
        if (this.getQueueDepth() > this.capacity * POOL_SCALE_THRESHOLD &&
            this.capacity < this.maxCapacity) {
            const previousCapacity = this.capacity;
            this.capacity += 1;
            logInfo('Scaled transform worker pool', {
                fromCapacity: previousCapacity,
                toCapacity: this.capacity,
                queueDepth: this.getQueueDepth(),
            }, Loggers.LOG_TRANSFORM);
        }
    }
    /**
     * Dispatches queued tasks to idle workers, spawning at most one new worker
     * per pass. If more workers are still allowed and tasks remain, another
     * pass is scheduled with setImmediate.
     */
    drainQueue() {
        if (this.closed || this.queue.depth === 0 || this.draining)
            return;
        this.draining = true;
        try {
            this.maybeScaleUp();
            for (let i = 0; i < this.workers.length; i += 1) {
                const slot = this.workers[i];
                // Undefined slots are waiting for a delayed restart.
                if (slot && !slot.busy) {
                    this.dispatchFromQueue(i, slot);
                    if (this.queue.depth === 0)
                        return;
                }
            }
            if (this.workers.length < this.capacity && this.queue.depth > 0) {
                const workerIndex = this.workers.length;
                const slot = this.spawnWorker(workerIndex);
                this.workers.push(slot);
                this.dispatchFromQueue(workerIndex, slot);
                if (this.workers.length < this.capacity && this.queue.depth > 0) {
                    setImmediate(() => {
                        this.drainQueue();
                    });
                }
            }
        }
        finally {
            this.draining = false;
        }
    }
    /**
     * Takes the next task whose signal is not aborted and sends it to the given
     * worker; aborted tasks encountered on the way are rejected.
     */
    dispatchFromQueue(workerIndex, slot) {
        let task = this.queue.dequeue();
        while (task) {
            const currentTask = task;
            if (this.closed) {
                this.abortAndCleanTask(currentTask, new Error(WorkerPool.CLOSED_MESSAGE));
                return;
            }
            if (currentTask.signal?.aborted) {
                this.abortAndCleanTask(currentTask, createAbortError(currentTask.url, 'transform:dispatch'));
                task = this.queue.dequeue();
                continue;
            }
            break;
        }
        if (!task)
            return;
        this.markBusy(workerIndex, task.id);
        const timeout = this.registerInflight(task, workerIndex, slot);
        this.sendToWorker(task, slot, workerIndex, timeout);
    }
    /**
     * Records the task as inflight and arms its timeout. When the timeout
     * promise fulfils, the worker is asked to cancel, the task is rejected with
     * a 504 FetchError, and the worker is restarted.
     *
     * @returns {object} The timeout handle created by createUnrefTimeout.
     */
    registerInflight(task, workerIndex, slot) {
        const timeout = createUnrefTimeout(this.timeoutMs, null);
        void timeout.promise
            .then(() => {
            try {
                slot.worker.postMessage({ type: 'cancel', id: task.id });
            }
            catch {
                // Worker may be unavailable; proceed with timeout handling
            }
            const inflight = this.takeInflight(task.id);
            if (!inflight)
                return;
            logWarn('Transform worker task timed out', {
                taskId: task.id,
                url: task.url,
                workerIndex,
                timeoutMs: this.timeoutMs,
            }, Loggers.LOG_TRANSFORM);
            this.abortAndCleanTask(inflight, new FetchError('Request timeout', task.url, HTTP_GATEWAY_TIMEOUT, {
                reason: 'timeout',
                stage: 'transform:worker-timeout',
            }));
            this.markIdle(workerIndex);
            this.restartWorker(workerIndex, slot);
        })
            .catch((error) => {
            this.failTask(task.id, error);
        });
        this.inflight.set(task.id, {
            resolve: task.resolve,
            reject: task.reject,
            timeout,
            signal: task.signal,
            abortListener: task.abortListener,
            workerIndex,
            context: task.context,
            cancelPending: false,
        });
        return timeout;
    }
    /**
     * Posts the task to the worker. If posting throws, the task is rejected,
     * its bookkeeping is undone, and the worker is restarted.
     */
    sendToWorker(task, slot, workerIndex, timeout) {
        try {
            const { message, transferList } = buildWorkerDispatchPayload(task);
            slot.worker.postMessage(message, transferList);
        }
        catch (error) {
            timeout.cancel();
            this.inflight.delete(task.id);
            this.markIdle(workerIndex);
            this.abortAndCleanTask(task, error instanceof Error
                ? error
                : new Error('Failed to dispatch transform worker message'));
            this.restartWorker(workerIndex, slot);
        }
    }
    /** Runs `fn` inside the task's captured async context. */
    finalizeTask(context, fn) {
        context.run(fn);
    }
    /** Detaches the task's abort listener and rejects it with `error`. */
    abortAndCleanTask(task, error) {
        this.clearAbortListener(task.signal, task.abortListener);
        this.finalizeTask(task.context, () => {
            task.reject(error);
        });
    }
}
|
|
674
|
-
// Pool singleton management
// The one shared pool instance; null until first use and after shutdown.
let workerPool = null;
/**
 * Returns the shared worker pool, creating it on first call.
 *
 * A new pool is sized 0 when config.transform.maxWorkerScale is 0, otherwise
 * POOL_MIN_WORKERS, and uses DEFAULT_TIMEOUT_MS as its task timeout.
 *
 * @returns {WorkerPool} The shared pool instance.
 */
export function getOrCreateWorkerPool() {
    const initialSize = config.transform.maxWorkerScale === 0 ? 0 : POOL_MIN_WORKERS;
    if (workerPool) {
        return workerPool;
    }
    workerPool = new WorkerPool(initialSize, DEFAULT_TIMEOUT_MS);
    const initialCapacity = workerPool.getCapacity();
    logInfo('Initialized transform worker pool', {
        initialCapacity,
        timeoutMs: DEFAULT_TIMEOUT_MS,
    }, Loggers.LOG_TRANSFORM);
    return workerPool;
}
|
|
687
|
-
/**
 * Reports the shared pool's current load without creating a pool.
 *
 * @returns {{ queueDepth: number, activeWorkers: number, capacity: number } | null}
 *   The pool's counters, or null when no pool exists.
 */
export function getWorkerPoolStats() {
    const pool = workerPool;
    if (!pool) {
        return null;
    }
    const queueDepth = pool.getQueueDepth();
    const activeWorkers = pool.getActiveWorkers();
    const capacity = pool.getCapacity();
    return { queueDepth, activeWorkers, capacity };
}
|
|
696
|
-
/**
 * Closes the shared pool, if one exists, and clears the singleton once the
 * close has completed.
 *
 * @returns {Promise<void>}
 */
export async function shutdownWorkerPool() {
    if (workerPool) {
        await workerPool.close();
        workerPool = null;
    }
}
|
|
702
|
-
// Worker thread message handling
/**
 * Installs the transform message handler when this module is running as a
 * worker.
 *
 * Inside a worker thread (not the main thread, with a parentPort) messages
 * travel over parentPort. Otherwise, if process.send exists, they travel over
 * process messaging. In any other case nothing is installed.
 */
function bootstrapWorkerThread() {
    let sendMessage;
    let subscribe;
    if (!isMainThread && parentPort) {
        const port = parentPort;
        sendMessage = (message) => {
            port.postMessage(message);
        };
        subscribe = (handler) => {
            port.on('message', handler);
        };
    }
    else if (process.send) {
        const send = process.send.bind(process);
        sendMessage = (message) => {
            send(message);
        };
        subscribe = (handler) => {
            process.on('message', handler);
        };
    }
    else {
        return;
    }
    const onMessage = createTransformMessageHandler({
        sendMessage,
        runTransform: transformHtmlToMarkdownInProcess,
    });
    subscribe(onMessage);
}
bootstrapWorkerThread();
|