@j0hanz/superfetch 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -46
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +20 -8
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/auth.js +161 -2
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +41 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.js +206 -9
- package/dist/http/session-cleanup.js +8 -5
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +21 -6
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +77 -40
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +23 -0
- package/dist/services/fetcher.js +553 -13
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +56 -12
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +2 -1
- package/dist/tools/utils/content-transform.js +37 -136
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +149 -0
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +5 -0
- package/dist/transformers/markdown.js +314 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +35 -20
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +13 -10
|
@@ -1,215 +1,244 @@
|
|
|
1
|
+
import { randomUUID } from 'node:crypto';
|
|
2
|
+
import os from 'node:os';
|
|
1
3
|
import { Worker } from 'node:worker_threads';
|
|
2
|
-
import {
|
|
3
|
-
import { getErrorMessage } from '../utils/error-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
4
|
+
import { FetchError } from '../errors/app-error.js';
|
|
5
|
+
import { getErrorMessage } from '../utils/error-details.js';
|
|
6
|
+
// Module-level singleton: assigned on the first getOrCreateTransformWorkerPool() call and reset to null by shutdownTransformWorkerPool().
let pool = null;
|
|
7
|
+
function resolveDefaultWorkerCount() {
|
|
8
|
+
const parallelism = typeof os.availableParallelism === 'function'
|
|
9
|
+
? os.availableParallelism()
|
|
10
|
+
: os.cpus().length;
|
|
11
|
+
// Leave 1 core for the event loop; cap to avoid runaway memory.
|
|
12
|
+
return Math.min(16, Math.max(1, parallelism - 1));
|
|
13
|
+
}
|
|
14
|
+
// Timeout, in milliseconds, handed to the WorkerPool constructor when the shared pool is created.
const DEFAULT_TIMEOUT_MS = 30000;
|
|
15
|
+
/**
 * Returns the shared transform worker pool, creating it on first use.
 *
 * The pool is sized by `resolveDefaultWorkerCount()` and uses
 * `DEFAULT_TIMEOUT_MS` as its per-task timeout.
 *
 * @returns {WorkerPool} The module-level pool instance.
 */
export function getOrCreateTransformWorkerPool() {
    if (pool === null || pool === undefined) {
        pool = new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
    }
    return pool;
}
|
|
19
|
+
/**
 * Closes the shared transform worker pool, if one exists, and forgets it.
 *
 * The module-level reference is cleared only after `close()` has finished,
 * so a later `getOrCreateTransformWorkerPool()` call builds a fresh pool.
 *
 * @returns {Promise<void>} Resolves once the pool has been closed (or immediately when there is none).
 */
export async function shutdownTransformWorkerPool() {
    if (pool) {
        await pool.close();
        pool = null;
    }
}
|
|
25
|
+
/**
 * Fixed-size pool of transform worker threads fed from a bounded FIFO queue.
 *
 * Every task gets a UUID. The pool posts
 * `{ type: 'transform', id, html, url, includeMetadata }` to an idle worker and
 * settles the caller's promise when a message carrying the same id comes back.
 * A task that outlives `timeoutMs`, or whose worker errors or exits, is
 * rejected and its worker is replaced with a fresh one.
 */
class WorkerPool {
    // Worker slots `{ worker, busy, currentTaskId }`, addressed by worker index.
    workers = [];
    // Accepted tasks that have not been handed to a worker yet.
    queue = [];
    // Dispatched tasks awaiting a reply, keyed by task id.
    inflight = new Map();
    // Per-task deadline in milliseconds, measured from dispatch.
    timeoutMs;
    // Maximum number of queued (not yet dispatched) tasks: twice the pool size.
    queueMax;
    // Set by close(); a closed pool rejects new work and never respawns workers.
    closed = false;
    /**
     * @param size Requested number of workers; values below 1 are raised to 1.
     * @param timeoutMs Per-task timeout in milliseconds.
     */
    constructor(size, timeoutMs) {
        const safeSize = Math.max(1, size);
        this.timeoutMs = timeoutMs;
        this.queueMax = safeSize * 2;
        for (let index = 0; index < safeSize; index += 1) {
            this.workers.push(this.spawnWorker(index));
        }
    }
    /**
     * Starts a worker thread for the given slot index and wires its events.
     *
     * @param workerIndex Index in `this.workers` the returned slot is meant for.
     * @returns The new slot `{ worker, busy, currentTaskId }`.
     */
    spawnWorker(workerIndex) {
        const worker = new Worker(new URL('../workers/transform-worker.js', import.meta.url));
        // Workers must not keep the process alive by themselves.
        worker.unref();
        const slot = {
            worker,
            busy: false,
            currentTaskId: null,
        };
        // A terminated worker still emits 'exit' after its slot has been handed
        // to a replacement. Only events from the worker that currently owns the
        // slot may trigger a replacement; otherwise every replacement would be
        // torn down by its predecessor's exit event, over and over.
        const ownsSlot = () => this.workers[workerIndex]?.worker === worker;
        worker.on('message', (raw) => {
            this.onWorkerMessage(workerIndex, raw);
        });
        worker.on('error', (error) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        worker.on('exit', (code) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
        });
        return slot;
    }
    /**
     * Handles a worker that errored or exited: fails its running task (if any)
     * and replaces the worker. Does nothing once the pool is closed.
     *
     * @param workerIndex Slot index of the broken worker.
     * @param message Text for the Error used to reject the running task.
     */
    onWorkerBroken(workerIndex, message) {
        if (this.closed)
            return;
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        if (slot.busy && slot.currentTaskId) {
            this.failTask(slot.currentTaskId, new Error(message));
        }
        this.replaceWorker(workerIndex, slot);
    }
    /**
     * Terminates the worker in `slot`, installs a fresh worker at the same
     * index and resumes dispatching queued tasks.
     */
    replaceWorker(workerIndex, slot) {
        // The new slot is installed before the old worker's asynchronous 'exit'
        // event can fire, so that event is recognised as stale.
        void slot.worker.terminate();
        this.workers[workerIndex] = this.spawnWorker(workerIndex);
        this.drainQueue();
    }
    /**
     * Stops the timer and abort listener of an in-flight task and removes it
     * from the in-flight map. Does not settle the task's promise.
     */
    releaseInflight(id, inflight) {
        clearTimeout(inflight.timer);
        if (inflight.signal && inflight.abortListener) {
            inflight.signal.removeEventListener('abort', inflight.abortListener);
        }
        this.inflight.delete(id);
    }
    /**
     * Settles the in-flight task named by a worker reply and frees the slot.
     *
     * Messages that are not objects with string `id` and `type` fields, or
     * whose id is not in flight, are ignored. `type === 'result'` resolves the
     * task with `message.result`; any other type rejects it.
     *
     * @param workerIndex Slot index of the worker that sent the message.
     * @param raw The message as received from the worker.
     */
    onWorkerMessage(workerIndex, raw) {
        if (!raw ||
            typeof raw !== 'object' ||
            !('type' in raw) ||
            !('id' in raw) ||
            typeof raw.id !== 'string' ||
            typeof raw.type !== 'string') {
            return;
        }
        const message = raw;
        const inflight = this.inflight.get(message.id);
        if (!inflight)
            return;
        this.releaseInflight(message.id, inflight);
        const slot = this.workers[workerIndex];
        if (slot) {
            slot.busy = false;
            slot.currentTaskId = null;
        }
        if (message.type === 'result') {
            inflight.resolve(message.result);
        }
        else {
            inflight.reject(this.toTaskError(message.error));
        }
        this.drainQueue();
    }
    /**
     * Rebuilds an Error from the plain error payload a worker sent back.
     *
     * A payload named 'FetchError' becomes a FetchError again; anything else
     * becomes a plain Error. A missing or malformed payload yields a generic
     * Error instead of throwing, so the waiting promise is always settled.
     */
    toTaskError(error) {
        if (!error || typeof error !== 'object') {
            return new Error('Transform worker returned a malformed error');
        }
        if (error.name === 'FetchError') {
            return new FetchError(error.message, error.url, error.statusCode, error.details ?? {});
        }
        return new Error(error.message);
    }
    /**
     * Rejects an in-flight task and marks its worker slot idle.
     * Unknown ids are ignored.
     *
     * @param id Task id.
     * @param error Rejection reason handed to the caller.
     */
    failTask(id, error) {
        const inflight = this.inflight.get(id);
        if (!inflight)
            return;
        this.releaseInflight(id, inflight);
        inflight.reject(error);
        const slot = this.workers[inflight.workerIndex];
        if (slot) {
            slot.busy = false;
            slot.currentTaskId = null;
        }
    }
    /**
     * Queues an HTML transform and resolves with the worker's result.
     *
     * @param html Markup to transform.
     * @param url URL associated with the markup; also used in error reports.
     * @param options Object providing `includeMetadata` and an optional abort `signal`.
     * @returns Promise settled with the worker's `result` payload.
     * @throws {Error} When the pool is closed or the queue already holds `queueMax` tasks.
     */
    async transform(html, url, options) {
        if (this.closed) {
            throw new Error('Transform worker pool closed');
        }
        if (this.queue.length >= this.queueMax) {
            throw new Error('Transform worker queue is full');
        }
        return new Promise((resolve, reject) => {
            this.queue.push({
                id: randomUUID(),
                html,
                url,
                includeMetadata: options.includeMetadata,
                signal: options.signal,
                resolve,
                reject,
            });
            this.drainQueue();
        });
    }
    /**
     * Hands queued tasks to idle workers in FIFO order.
     *
     * A task that is rejected at dispatch (already aborted, or not postable)
     * leaves the slot idle, so the same slot immediately takes the next task
     * instead of leaving the rest of the queue waiting.
     */
    drainQueue() {
        for (let workerIndex = 0; workerIndex < this.workers.length && this.queue.length > 0; workerIndex += 1) {
            const slot = this.workers[workerIndex];
            if (!slot)
                continue;
            while (!slot.busy) {
                const task = this.queue.shift();
                if (!task)
                    return;
                this.dispatch(workerIndex, slot, task);
            }
        }
    }
    /**
     * Starts one task on one idle worker: arms the timeout, forwards aborts as
     * cancel messages, records the task as in flight and posts the request.
     * A task whose signal is already aborted is rejected with a 499 FetchError
     * and the slot stays idle.
     */
    dispatch(workerIndex, slot, task) {
        if (task.signal?.aborted) {
            task.reject(new FetchError('Request was canceled', task.url, 499, {
                reason: 'aborted',
                stage: 'transform:dispatch',
            }));
            return;
        }
        slot.busy = true;
        slot.currentTaskId = task.id;
        // unref(): a pending task timeout must not keep the process alive.
        const timer = setTimeout(() => {
            this.onTaskTimeout(workerIndex, slot, task);
        }, this.timeoutMs).unref();
        let abortListener;
        if (task.signal) {
            // An abort only asks the worker to cancel; the promise is settled
            // by whatever the worker replies (or by the timeout).
            abortListener = () => {
                this.requestCancel(slot, task.id);
            };
            task.signal.addEventListener('abort', abortListener, { once: true });
        }
        this.inflight.set(task.id, {
            resolve: task.resolve,
            reject: task.reject,
            timer,
            signal: task.signal,
            abortListener,
            workerIndex,
        });
        try {
            slot.worker.postMessage({
                type: 'transform',
                id: task.id,
                html: task.html,
                url: task.url,
                includeMetadata: task.includeMetadata,
            });
        }
        catch (error) {
            // Without this the slot would stay busy and the task in flight
            // until the timeout fired.
            this.failTask(task.id, error instanceof Error ? error : new Error(getErrorMessage(error)));
        }
    }
    /**
     * Timeout path for a dispatched task: asks the worker to cancel, rejects
     * the task with a 504 FetchError and, unless the pool is closed, replaces
     * the worker. Does nothing beyond the cancel request if the task has
     * already been settled.
     */
    onTaskTimeout(workerIndex, slot, task) {
        this.requestCancel(slot, task.id);
        if (!this.inflight.has(task.id))
            return;
        this.failTask(task.id, new FetchError('Request timeout', task.url, 504, {
            reason: 'timeout',
            stage: 'transform:worker-timeout',
        }));
        if (!this.closed) {
            this.replaceWorker(workerIndex, slot);
        }
    }
    /** Best-effort cancel request; a failure to post is deliberately ignored. */
    requestCancel(slot, id) {
        try {
            slot.worker.postMessage({ type: 'cancel', id });
        }
        catch {
            // ignore
        }
    }
    /**
     * Shuts the pool down: terminates every worker, rejects all in-flight and
     * queued tasks with 'Transform worker pool closed', and waits for the
     * terminations to settle. Calling it again is a no-op.
     */
    async close() {
        if (this.closed)
            return;
        this.closed = true;
        const terminations = this.workers.map((slot) => slot.worker.terminate());
        this.workers.length = 0;
        for (const [id, inflight] of [...this.inflight]) {
            this.releaseInflight(id, inflight);
            inflight.reject(new Error('Transform worker pool closed'));
        }
        for (const task of this.queue.splice(0)) {
            task.reject(new Error('Transform worker pool closed'));
        }
        await Promise.allSettled(terminations);
    }
}
|
|
199
|
-
let pool = null;
|
|
200
|
-
function getPool() {
|
|
201
|
-
if (pool)
|
|
202
|
-
return pool;
|
|
203
|
-
pool = new TransformWorkerPool(new URL('../workers/content-transform.worker.js', import.meta.url), config.workers.poolSize);
|
|
204
|
-
return pool;
|
|
205
|
-
}
|
|
206
|
-
export async function transformInWorker(request, signal) {
|
|
207
|
-
return getPool().run(request, signal);
|
|
208
|
-
}
|
|
209
|
-
export async function destroyTransformWorkers() {
|
|
210
|
-
if (!pool)
|
|
211
|
-
return;
|
|
212
|
-
const current = pool;
|
|
213
|
-
pool = null;
|
|
214
|
-
await current.destroy();
|
|
215
|
-
}
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
import type { PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
2
|
-
import { executeFetchPipeline } from '../utils/fetch-pipeline.js';
|
|
3
|
-
import { applyInlineContentLimit } from '../utils/inline-content.js';
|
|
1
|
+
import type { FetchPipelineOptions, PipelineResult, ToolContentBlock } from '../../config/types/runtime.js';
|
|
4
2
|
interface SharedFetchOptions<T extends {
|
|
5
3
|
content: string;
|
|
6
4
|
}> {
|
|
@@ -20,4 +18,14 @@ export declare function performSharedFetch<T extends {
|
|
|
20
18
|
}>;
|
|
21
19
|
export type InlineResult = ReturnType<typeof applyInlineContentLimit>;
|
|
22
20
|
export declare function buildToolContentBlocks(structuredContent: Record<string, unknown>, fromCache: boolean, inlineResult: InlineResult, resourceName: string, cacheKey?: string | null, fullContent?: string, url?: string, title?: string): ToolContentBlock[];
|
|
21
|
+
/** Outcome of deciding whether fetched content is returned inline or by reference. */
interface InlineContentResult {
    /** Inline text: the full content when it fits the inline limit, or a truncated copy in the fallback case. */
    content?: string;
    /** Length of the full, untruncated content in UTF-16 code units. */
    contentSize: number;
    /** URI of the cached resource, set when the content is too large to inline and a cache entry can be linked. */
    resourceUri?: string;
    /** MIME type accompanying `resourceUri`. */
    resourceMimeType?: string;
    /** Error description. NOTE(review): no visible producer sets this field; confirm whether it is still used. */
    error?: string;
    /** Set when `content` is a shortened copy of the original. */
    truncated?: boolean;
}
|
|
29
|
+
/** Returns `content` inline when it fits the configured inline limit; otherwise returns a resource link built from `cacheKey`, or a truncated copy when no link can be built. */
declare function applyInlineContentLimit(content: string, cacheKey: string | null): InlineContentResult;
|
|
30
|
+
/** Normalizes `options.url`, returns a cached result when one can be deserialized, and otherwise fetches, transforms and (when caching is enabled) stores the result. */
export declare function executeFetchPipeline<T>(options: FetchPipelineOptions<T>): Promise<PipelineResult<T>>;
|
|
23
31
|
export {};
|
|
@@ -1,7 +1,13 @@
|
|
|
1
|
+
import { TRUNCATION_MARKER } from '../../config/formatting.js';
|
|
1
2
|
import { config } from '../../config/index.js';
|
|
3
|
+
import * as cache from '../../services/cache.js';
|
|
4
|
+
import { createCacheKey, toResourceUri } from '../../services/cache-keys.js';
|
|
5
|
+
import { fetchNormalizedUrl } from '../../services/fetcher.js';
|
|
6
|
+
import { logDebug } from '../../services/logger.js';
|
|
2
7
|
import { generateSafeFilename } from '../../utils/filename-generator.js';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
8
|
+
import { isRecord } from '../../utils/guards.js';
|
|
9
|
+
import { transformToRawUrl } from '../../utils/url-transformer.js';
|
|
10
|
+
import { normalizeUrl } from '../../utils/url-validator.js';
|
|
5
11
|
function applyOptionalPipelineSerialization(pipelineOptions, options) {
|
|
6
12
|
if (options.serialize !== undefined) {
|
|
7
13
|
pipelineOptions.serialize = options.serialize;
|
|
@@ -92,3 +98,126 @@ export function buildToolContentBlocks(structuredContent, fromCache, inlineResul
|
|
|
92
98
|
maybeAppendResourceLink(blocks, inlineResult, resourceName);
|
|
93
99
|
return blocks;
|
|
94
100
|
}
|
|
101
|
+
/**
 * Decides how fetched content is handed back to the caller.
 *
 * Content no longer than `config.constants.maxInlineContentChars` is returned
 * inline. Longer content is replaced by a link to the cached resource, or by
 * a truncated copy when no resource URI can be resolved.
 *
 * @param content Full content.
 * @param cacheKey Cache key of the stored content, or null/empty when there is none.
 * @returns Object with `contentSize` plus either `content` or `resourceUri`/`resourceMimeType`.
 */
function applyInlineContentLimit(content, cacheKey) {
    const contentSize = content.length;
    const inlineLimit = config.constants.maxInlineContentChars;
    const fitsInline = contentSize <= inlineLimit;
    if (fitsInline) {
        return { content, contentSize };
    }
    const resourceUri = resolveResourceUri(cacheKey);
    return resourceUri
        ? { contentSize, resourceUri, resourceMimeType: 'text/markdown' }
        : buildTruncatedFallback(content, contentSize, inlineLimit);
}
|
|
117
|
+
/**
 * Maps a cache key to its resource URI.
 *
 * @param cacheKey Cache key, or null/empty when nothing was cached.
 * @returns The resource URI, or null when caching is disabled or there is no key.
 */
function resolveResourceUri(cacheKey) {
    const linkable = config.cache.enabled && cacheKey;
    return linkable ? toResourceUri(cacheKey) : null;
}
|
|
122
|
+
function buildTruncatedFallback(content, contentSize, inlineLimit) {
|
|
123
|
+
const maxContentLength = Math.max(0, inlineLimit - TRUNCATION_MARKER.length);
|
|
124
|
+
const truncatedContent = content.length > inlineLimit
|
|
125
|
+
? `${content.substring(0, maxContentLength)}${TRUNCATION_MARKER}`
|
|
126
|
+
: content;
|
|
127
|
+
return {
|
|
128
|
+
content: truncatedContent,
|
|
129
|
+
contentSize,
|
|
130
|
+
truncated: true,
|
|
131
|
+
};
|
|
132
|
+
}
|
|
133
|
+
/**
 * Tries to serve a pipeline result from the cache.
 *
 * @param params.cacheKey Key to look up; a falsy key skips the lookup.
 * @param params.deserialize Function turning cached text back into data; returns undefined on failure.
 * @param params.cacheNamespace Namespace used only for log context.
 * @param params.normalizedUrl URL reported in the result and in logs.
 * @returns A result flagged `fromCache: true`, or null on any kind of miss.
 */
function attemptCacheRetrieval({ cacheKey, deserialize, cacheNamespace, normalizedUrl, }) {
    const cached = cacheKey ? cache.get(cacheKey) : null;
    if (!cached) {
        return null;
    }
    if (!deserialize) {
        // An entry exists but cannot be interpreted without a deserializer.
        logCacheMiss('missing deserializer', cacheNamespace, normalizedUrl);
        return null;
    }
    const data = deserialize(cached.content);
    if (data === undefined) {
        logCacheMiss('deserialize failure', cacheNamespace, normalizedUrl);
        return null;
    }
    logDebug('Cache hit', { namespace: cacheNamespace, url: normalizedUrl });
    const { fetchedAt } = cached;
    return { data, fromCache: true, url: normalizedUrl, fetchedAt, cacheKey };
}
|
|
157
|
+
/**
 * Normalizes a URL and then passes it through `transformToRawUrl`.
 *
 * @param url URL as supplied by the caller.
 * @returns `normalizedUrl` (after the raw-URL transform), `originalUrl` (normalized only) and the `transformed` flag reported by the transform.
 */
function resolveNormalizedUrl(url) {
    const originalUrl = normalizeUrl(url).normalizedUrl;
    const rawTarget = transformToRawUrl(originalUrl);
    return {
        normalizedUrl: rawTarget.url,
        originalUrl,
        transformed: rawTarget.transformed,
    };
}
|
|
162
|
+
/**
 * Runs the fetch pipeline for one URL: normalize, try the cache, otherwise
 * fetch and transform, then store the result when caching is enabled.
 *
 * @param options Pipeline options: `url`, `cacheNamespace`, optional `cacheVary`,
 *   `signal`, `serialize`, `deserialize`, and the `transform` callback.
 * @returns The cached or freshly built result with `fromCache`, `url`, `fetchedAt` and `cacheKey`.
 */
export async function executeFetchPipeline(options) {
    const resolvedUrl = resolveNormalizedUrl(options.url);
    logRawUrlTransformation(resolvedUrl);
    const targetUrl = resolvedUrl.normalizedUrl;
    const cacheKey = createCacheKey(options.cacheNamespace, targetUrl, options.cacheVary);
    const cachedResult = attemptCacheRetrieval({
        cacheKey,
        deserialize: options.deserialize,
        cacheNamespace: options.cacheNamespace,
        normalizedUrl: targetUrl,
    });
    if (cachedResult) {
        return cachedResult;
    }
    logDebug('Fetching URL', { url: targetUrl });
    // Only pass `signal` through when the caller supplied one.
    const fetchOptions = {};
    if (options.signal !== undefined) {
        fetchOptions.signal = options.signal;
    }
    const html = await fetchNormalizedUrl(targetUrl, fetchOptions);
    const data = await options.transform(html, targetUrl);
    if (cache.isEnabled()) {
        persistCache({
            cacheKey,
            data,
            serialize: options.serialize,
            normalizedUrl: targetUrl,
        });
    }
    const fetchedAt = new Date().toISOString();
    return { data, fromCache: false, url: targetUrl, fetchedAt, cacheKey };
}
|
|
194
|
+
/**
 * Stores a pipeline result in the cache.
 *
 * @param params.cacheKey Key to store under; a falsy key makes this a no-op.
 * @param params.data Result to store.
 * @param params.serialize Optional serializer; `JSON.stringify` is used when absent.
 * @param params.normalizedUrl URL recorded in the entry's metadata.
 */
function persistCache({ cacheKey, data, serialize, normalizedUrl, }) {
    if (!cacheKey) {
        return;
    }
    const metadata = { url: normalizedUrl };
    const title = extractTitle(data);
    // The title key is present only when the data actually carries a string title.
    if (title !== undefined) {
        metadata.title = title;
    }
    const payload = (serialize ?? JSON.stringify)(data);
    cache.set(cacheKey, payload, metadata);
}
|
|
205
|
+
/**
 * Reads a string `title` property from an arbitrary value.
 *
 * @param value Any value.
 * @returns The title when `value` passes `isRecord` and its `title` is a string; otherwise undefined.
 */
function extractTitle(value) {
    const candidate = isRecord(value) ? value.title : undefined;
    if (typeof candidate === 'string') {
        return candidate;
    }
    return undefined;
}
|
|
211
|
+
/**
 * Writes a debug log entry explaining why a cache lookup was treated as a miss.
 *
 * @param reason Short reason appended to the log message.
 * @param cacheNamespace Cache namespace of the lookup.
 * @param normalizedUrl URL of the lookup.
 */
function logCacheMiss(reason, cacheNamespace, normalizedUrl) {
    const context = { namespace: cacheNamespace, url: normalizedUrl };
    logDebug(`Cache miss due to ${reason}`, context);
}
|
|
217
|
+
/**
 * Logs, at debug level, that a URL was swapped for its raw-content variant.
 * Silent when the URL was not transformed.
 *
 * @param resolvedUrl Object carrying the `transformed` flag and `originalUrl`.
 */
function logRawUrlTransformation(resolvedUrl) {
    if (resolvedUrl.transformed) {
        logDebug('Using transformed raw content URL', {
            original: resolvedUrl.originalUrl,
        });
    }
}
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
+
import type { MarkdownTransformResult } from '../../config/types/content.js';
|
|
1
2
|
import type { FetchUrlInput, ToolResponseBase } from '../../config/types/tools.js';
|
|
2
3
|
export declare const FETCH_URL_TOOL_NAME = "fetch-url";
|
|
3
4
|
export declare const FETCH_URL_TOOL_DESCRIPTION = "Fetches a webpage and converts it to clean Markdown format";
|
|
5
|
+
/** A markdown transform result that is guaranteed to carry a read-only string `content`. */
type MarkdownPipelineResult = MarkdownTransformResult & { readonly content: string };
|
|
8
|
+
/** Converts a cached string back into a markdown pipeline result. NOTE(review): presumably returns undefined when the cached payload cannot be parsed or validated; the implementation is not visible here, confirm. */
export declare function parseCachedMarkdownResult(cached: string): MarkdownPipelineResult | undefined;
|
|
4
9
|
export declare function fetchUrlToolHandler(input: FetchUrlInput): Promise<ToolResponseBase>;
|
|
10
|
+
export {};
|