@j0hanz/fetch-url-mcp 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.d.ts +9 -3
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +54 -119
- package/dist/cache.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +7 -4
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +2 -3
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +19 -27
- package/dist/config.js.map +1 -0
- package/dist/crypto.d.ts +1 -0
- package/dist/crypto.d.ts.map +1 -0
- package/dist/crypto.js +7 -3
- package/dist/crypto.js.map +1 -0
- package/dist/dom-noise-removal.d.ts +2 -1
- package/dist/dom-noise-removal.d.ts.map +1 -0
- package/dist/dom-noise-removal.js +9 -6
- package/dist/dom-noise-removal.js.map +1 -0
- package/dist/download.d.ts +4 -0
- package/dist/download.d.ts.map +1 -0
- package/dist/download.js +106 -0
- package/dist/download.js.map +1 -0
- package/dist/errors.d.ts +1 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +2 -1
- package/dist/errors.js.map +1 -0
- package/dist/examples/mcp-fetch-url-client.js +19 -3
- package/dist/examples/mcp-fetch-url-client.js.map +1 -1
- package/dist/fetch-content.d.ts +1 -0
- package/dist/fetch-content.d.ts.map +1 -0
- package/dist/fetch-content.js +15 -14
- package/dist/fetch-content.js.map +1 -0
- package/dist/fetch-stream.d.ts +1 -0
- package/dist/fetch-stream.d.ts.map +1 -0
- package/dist/fetch-stream.js +1 -0
- package/dist/fetch-stream.js.map +1 -0
- package/dist/fetch.d.ts +1 -0
- package/dist/fetch.d.ts.map +1 -0
- package/dist/fetch.js +123 -54
- package/dist/fetch.js.map +1 -0
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.d.ts.map +1 -0
- package/dist/host-normalization.js +22 -9
- package/dist/host-normalization.js.map +1 -0
- package/dist/http/auth.d.ts +51 -0
- package/dist/http/auth.d.ts.map +1 -0
- package/dist/http/auth.js +344 -0
- package/dist/http/auth.js.map +1 -0
- package/dist/http/health.d.ts +7 -0
- package/dist/http/health.d.ts.map +1 -0
- package/dist/http/health.js +156 -0
- package/dist/http/health.js.map +1 -0
- package/dist/http/helpers.d.ts +58 -0
- package/dist/http/helpers.d.ts.map +1 -0
- package/dist/http/helpers.js +370 -0
- package/dist/http/helpers.js.map +1 -0
- package/dist/{http-native.d.ts → http/native.d.ts} +1 -0
- package/dist/http/native.d.ts.map +1 -0
- package/dist/http/native.js +618 -0
- package/dist/http/native.js.map +1 -0
- package/dist/http/rate-limit.d.ts +13 -0
- package/dist/http/rate-limit.d.ts.map +1 -0
- package/dist/http/rate-limit.js +92 -0
- package/dist/http/rate-limit.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -14
- package/dist/index.js.map +1 -0
- package/dist/instructions.d.ts +2 -0
- package/dist/instructions.d.ts.map +1 -0
- package/dist/instructions.js +41 -0
- package/dist/instructions.js.map +1 -0
- package/dist/ip-blocklist.d.ts +1 -0
- package/dist/ip-blocklist.d.ts.map +1 -0
- package/dist/ip-blocklist.js +13 -8
- package/dist/ip-blocklist.js.map +1 -0
- package/dist/json.d.ts +2 -1
- package/dist/json.d.ts.map +1 -0
- package/dist/json.js +16 -6
- package/dist/json.js.map +1 -0
- package/dist/language-detection.d.ts +1 -0
- package/dist/language-detection.d.ts.map +1 -0
- package/dist/language-detection.js +2 -7
- package/dist/language-detection.js.map +1 -0
- package/dist/markdown-cleanup.d.ts +2 -1
- package/dist/markdown-cleanup.d.ts.map +1 -0
- package/dist/markdown-cleanup.js +52 -54
- package/dist/markdown-cleanup.js.map +1 -0
- package/dist/mcp-validator.d.ts +1 -0
- package/dist/mcp-validator.d.ts.map +1 -0
- package/dist/mcp-validator.js +20 -18
- package/dist/mcp-validator.js.map +1 -0
- package/dist/mcp.d.ts +2 -2
- package/dist/mcp.d.ts.map +1 -0
- package/dist/mcp.js +35 -344
- package/dist/mcp.js.map +1 -0
- package/dist/observability.d.ts +2 -0
- package/dist/observability.d.ts.map +1 -0
- package/dist/observability.js +32 -6
- package/dist/observability.js.map +1 -0
- package/dist/prompts.d.ts +1 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +15 -3
- package/dist/prompts.js.map +1 -0
- package/dist/resources.d.ts +1 -0
- package/dist/resources.d.ts.map +1 -0
- package/dist/resources.js +46 -25
- package/dist/resources.js.map +1 -0
- package/dist/server-tuning.d.ts +1 -0
- package/dist/server-tuning.d.ts.map +1 -0
- package/dist/server-tuning.js +14 -17
- package/dist/server-tuning.js.map +1 -0
- package/dist/server.d.ts +1 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +29 -35
- package/dist/server.js.map +1 -0
- package/dist/session.d.ts +2 -0
- package/dist/session.d.ts.map +1 -0
- package/dist/session.js +58 -29
- package/dist/session.js.map +1 -0
- package/dist/tasks/execution.d.ts +42 -0
- package/dist/tasks/execution.d.ts.map +1 -0
- package/dist/tasks/execution.js +241 -0
- package/dist/tasks/execution.js.map +1 -0
- package/dist/{tasks.d.ts → tasks/manager.d.ts} +12 -0
- package/dist/tasks/manager.d.ts.map +1 -0
- package/dist/{tasks.js → tasks/manager.js} +95 -43
- package/dist/tasks/manager.js.map +1 -0
- package/dist/tasks/owner.d.ts +32 -0
- package/dist/tasks/owner.d.ts.map +1 -0
- package/dist/tasks/owner.js +92 -0
- package/dist/tasks/owner.js.map +1 -0
- package/dist/timer-utils.d.ts +1 -0
- package/dist/timer-utils.d.ts.map +1 -0
- package/dist/timer-utils.js +8 -4
- package/dist/timer-utils.js.map +1 -0
- package/dist/tool-errors.d.ts +12 -0
- package/dist/tool-errors.d.ts.map +1 -0
- package/dist/tool-errors.js +55 -0
- package/dist/tool-errors.js.map +1 -0
- package/dist/tool-pipeline.d.ts +72 -0
- package/dist/tool-pipeline.d.ts.map +1 -0
- package/dist/tool-pipeline.js +408 -0
- package/dist/tool-pipeline.js.map +1 -0
- package/dist/tool-progress.d.ts +32 -0
- package/dist/tool-progress.d.ts.map +1 -0
- package/dist/tool-progress.js +129 -0
- package/dist/tool-progress.js.map +1 -0
- package/dist/tools.d.ts +35 -111
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +150 -610
- package/dist/tools.js.map +1 -0
- package/dist/{transform.d.ts → transform/transform.d.ts} +2 -1
- package/dist/transform/transform.d.ts.map +1 -0
- package/dist/{transform.js → transform/transform.js} +81 -771
- package/dist/transform/transform.js.map +1 -0
- package/dist/{transform-types.d.ts → transform/types.d.ts} +2 -0
- package/dist/transform/types.d.ts.map +1 -0
- package/dist/{transform-types.js → transform/types.js} +1 -0
- package/dist/transform/types.js.map +1 -0
- package/dist/transform/worker-pool.d.ts +93 -0
- package/dist/transform/worker-pool.d.ts.map +1 -0
- package/dist/transform/worker-pool.js +757 -0
- package/dist/transform/worker-pool.js.map +1 -0
- package/dist/transform/workers/transform-child.d.ts +2 -0
- package/dist/transform/workers/transform-child.d.ts.map +1 -0
- package/dist/{workers → transform/workers}/transform-child.js +17 -13
- package/dist/transform/workers/transform-child.js.map +1 -0
- package/dist/transform/workers/transform-worker.d.ts +2 -0
- package/dist/transform/workers/transform-worker.d.ts.map +1 -0
- package/dist/{workers → transform/workers}/transform-worker.js +16 -13
- package/dist/transform/workers/transform-worker.js.map +1 -0
- package/dist/type-guards.d.ts +1 -0
- package/dist/type-guards.d.ts.map +1 -0
- package/dist/type-guards.js +4 -4
- package/dist/type-guards.js.map +1 -0
- package/package.json +6 -7
- package/dist/AGENTS.md +0 -152
- package/dist/http-native.js +0 -1320
- package/dist/instructions.md +0 -113
- package/dist/workers/transform-child.d.ts +0 -1
- package/dist/workers/transform-worker.d.ts +0 -1
|
@@ -1,24 +1,18 @@
|
|
|
1
|
-
import { AsyncLocalStorage, AsyncResource } from 'node:async_hooks';
|
|
2
1
|
import { Buffer } from 'node:buffer';
|
|
3
|
-
import { fork } from 'node:child_process';
|
|
4
2
|
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
5
|
-
import { availableParallelism } from 'node:os';
|
|
6
3
|
import { performance } from 'node:perf_hooks';
|
|
7
|
-
import {
|
|
8
|
-
import { isSharedArrayBuffer } from 'node:util/types';
|
|
9
|
-
import { Worker, } from 'node:worker_threads';
|
|
4
|
+
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
5
|
import { parseHTML } from 'linkedom';
|
|
11
6
|
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import { isLikeNode, isObject } from './type-guards.js';
|
|
7
|
+
import { config } from '../config.js';
|
|
8
|
+
import { removeNoiseFromHtml } from '../dom-noise-removal.js';
|
|
9
|
+
import { FetchError, getErrorMessage } from '../errors.js';
|
|
10
|
+
import { isRawTextContentUrl } from '../fetch.js';
|
|
11
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../language-detection.js';
|
|
12
|
+
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, } from '../markdown-cleanup.js';
|
|
13
|
+
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../observability.js';
|
|
14
|
+
import { isLikeNode, isObject } from '../type-guards.js';
|
|
15
|
+
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
22
16
|
const utf8Decoder = new TextDecoder('utf-8');
|
|
23
17
|
function decodeInput(input, encoding) {
|
|
24
18
|
if (typeof input === 'string')
|
|
@@ -84,7 +78,7 @@ function buildTransformSignal(signal) {
|
|
|
84
78
|
class StageTracker {
|
|
85
79
|
channel = diagnosticsChannel.channel('fetch-url-mcp.transform');
|
|
86
80
|
start(url, stage, budget) {
|
|
87
|
-
if (
|
|
81
|
+
if (this.shouldSkipTracking(budget))
|
|
88
82
|
return null;
|
|
89
83
|
const remainingBudgetMs = budget
|
|
90
84
|
? budget.totalBudgetMs - budget.elapsedMs
|
|
@@ -136,7 +130,7 @@ class StageTracker {
|
|
|
136
130
|
return durationMs;
|
|
137
131
|
}
|
|
138
132
|
run(url, stage, fn, budget) {
|
|
139
|
-
if (
|
|
133
|
+
if (this.shouldSkipTracking(budget)) {
|
|
140
134
|
return fn();
|
|
141
135
|
}
|
|
142
136
|
if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
|
|
@@ -156,6 +150,9 @@ class StageTracker {
|
|
|
156
150
|
}
|
|
157
151
|
}
|
|
158
152
|
async runAsync(url, stage, fn) {
|
|
153
|
+
if (this.shouldSkipTracking()) {
|
|
154
|
+
return fn();
|
|
155
|
+
}
|
|
159
156
|
const ctx = this.start(url, stage);
|
|
160
157
|
try {
|
|
161
158
|
return await fn();
|
|
@@ -164,6 +161,9 @@ class StageTracker {
|
|
|
164
161
|
this.end(ctx);
|
|
165
162
|
}
|
|
166
163
|
}
|
|
164
|
+
shouldSkipTracking(budget) {
|
|
165
|
+
return !this.channel.hasSubscribers && !budget;
|
|
166
|
+
}
|
|
167
167
|
publish(event) {
|
|
168
168
|
if (!this.channel.hasSubscribers)
|
|
169
169
|
return;
|
|
@@ -219,26 +219,27 @@ function truncateHtml(html, inputTruncated = false) {
|
|
|
219
219
|
const maxSize = config.constants.maxHtmlSize;
|
|
220
220
|
if (maxSize <= 0)
|
|
221
221
|
return { html, truncated: false };
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
222
|
+
if (html.length <= maxSize) {
|
|
223
|
+
const byteLength = getUtf8ByteLength(html);
|
|
224
|
+
if (byteLength <= maxSize && !inputTruncated)
|
|
225
|
+
return { html, truncated: false };
|
|
226
|
+
}
|
|
226
227
|
const sliced = html.slice(0, maxSize);
|
|
227
|
-
if (
|
|
228
|
+
if (getUtf8ByteLength(sliced) <= maxSize) {
|
|
228
229
|
return { html: trimDanglingTagFragment(sliced), truncated: true };
|
|
229
230
|
}
|
|
230
231
|
const htmlBuffer = Buffer.from(sliced, 'utf8');
|
|
231
232
|
const content = trimDanglingTagFragment(trimUtf8Buffer(htmlBuffer, maxSize).toString('utf8'));
|
|
232
233
|
logWarn('HTML content exceeds maximum size, truncating', {
|
|
233
|
-
size:
|
|
234
|
+
size: getUtf8ByteLength(html),
|
|
234
235
|
maxSize,
|
|
235
|
-
truncatedSize:
|
|
236
|
+
truncatedSize: getUtf8ByteLength(content),
|
|
236
237
|
});
|
|
237
238
|
return { html: content, truncated: true };
|
|
238
239
|
}
|
|
239
240
|
function willTruncate(html) {
|
|
240
241
|
const maxSize = config.constants.maxHtmlSize;
|
|
241
|
-
return maxSize > 0 && getUtf8ByteLength(html) > maxSize;
|
|
242
|
+
return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
|
|
242
243
|
}
|
|
243
244
|
const HEAD_END_PATTERN = /<\/head\s*>|<body\b/i;
|
|
244
245
|
const MAX_HEAD_SCAN_LENGTH = 50_000;
|
|
@@ -694,11 +695,12 @@ function buildInlineCodeTranslator() {
|
|
|
694
695
|
};
|
|
695
696
|
}
|
|
696
697
|
function buildCodeTranslator(ctx) {
|
|
698
|
+
const inlineCodeTranslator = buildInlineCodeTranslator();
|
|
697
699
|
if (!isObject(ctx))
|
|
698
|
-
return
|
|
700
|
+
return inlineCodeTranslator;
|
|
699
701
|
const { parent } = ctx;
|
|
700
702
|
if (!isCodeBlock(parent))
|
|
701
|
-
return
|
|
703
|
+
return inlineCodeTranslator;
|
|
702
704
|
return { noEscape: true, preserveWhitespace: true };
|
|
703
705
|
}
|
|
704
706
|
function extractFirstSrcsetUrl(srcset) {
|
|
@@ -713,14 +715,17 @@ const LAZY_SRC_ATTRIBUTES = [
|
|
|
713
715
|
'data-original',
|
|
714
716
|
'data-srcset',
|
|
715
717
|
];
|
|
718
|
+
function isDataUri(value) {
|
|
719
|
+
return value.startsWith('data:');
|
|
720
|
+
}
|
|
716
721
|
function extractNonDataSrcsetUrl(value) {
|
|
717
722
|
const url = extractFirstSrcsetUrl(value);
|
|
718
|
-
return url && !url
|
|
723
|
+
return url && !isDataUri(url) ? url : undefined;
|
|
719
724
|
}
|
|
720
725
|
function resolveLazySrc(getAttribute) {
|
|
721
726
|
for (const attr of LAZY_SRC_ATTRIBUTES) {
|
|
722
727
|
const lazy = getAttribute(attr);
|
|
723
|
-
if (!lazy || lazy
|
|
728
|
+
if (!lazy || isDataUri(lazy))
|
|
724
729
|
continue;
|
|
725
730
|
if (attr === 'data-srcset') {
|
|
726
731
|
const url = extractNonDataSrcsetUrl(lazy);
|
|
@@ -736,7 +741,7 @@ function resolveImageSrc(getAttribute) {
|
|
|
736
741
|
if (!getAttribute)
|
|
737
742
|
return '';
|
|
738
743
|
const srcRaw = getAttribute('src') ?? '';
|
|
739
|
-
if (srcRaw && !srcRaw
|
|
744
|
+
if (srcRaw && !isDataUri(srcRaw))
|
|
740
745
|
return srcRaw;
|
|
741
746
|
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
742
747
|
const lazySrc = resolveLazySrc(getAttribute);
|
|
@@ -750,7 +755,7 @@ function resolveImageSrc(getAttribute) {
|
|
|
750
755
|
return url;
|
|
751
756
|
}
|
|
752
757
|
// If the only available src is a data URI, we choose to omit it rather than include the raw data in the alt text or URL, as data URIs can be very long and are not useful in Markdown output.
|
|
753
|
-
if (srcRaw
|
|
758
|
+
if (isDataUri(srcRaw))
|
|
754
759
|
return '[data URI removed]';
|
|
755
760
|
return '';
|
|
756
761
|
}
|
|
@@ -1099,7 +1104,7 @@ function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
|
|
|
1099
1104
|
}
|
|
1100
1105
|
return output;
|
|
1101
1106
|
}
|
|
1102
|
-
function resolveRelativeUrls(markdown, baseUrl) {
|
|
1107
|
+
function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
1103
1108
|
let origin;
|
|
1104
1109
|
try {
|
|
1105
1110
|
({ origin } = new URL(baseUrl));
|
|
@@ -1109,7 +1114,6 @@ function resolveRelativeUrls(markdown, baseUrl) {
|
|
|
1109
1114
|
}
|
|
1110
1115
|
if (!markdown)
|
|
1111
1116
|
return markdown;
|
|
1112
|
-
const lines = markdown.split('\n');
|
|
1113
1117
|
let output = '';
|
|
1114
1118
|
let buffer = '';
|
|
1115
1119
|
let fenceMarker = null;
|
|
@@ -1119,26 +1123,51 @@ function resolveRelativeUrls(markdown, baseUrl) {
|
|
|
1119
1123
|
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
1120
1124
|
buffer = '';
|
|
1121
1125
|
};
|
|
1122
|
-
|
|
1123
|
-
|
|
1126
|
+
const len = markdown.length;
|
|
1127
|
+
let lastIndex = 0;
|
|
1128
|
+
let lineCount = 0;
|
|
1129
|
+
while (lastIndex < len) {
|
|
1130
|
+
if (++lineCount % 500 === 0 && signal?.aborted) {
|
|
1131
|
+
throw new Error('Transform aborted during URL resolution');
|
|
1132
|
+
}
|
|
1133
|
+
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
1134
|
+
let line;
|
|
1135
|
+
let lineWithNewline;
|
|
1136
|
+
if (nextIndex === -1) {
|
|
1137
|
+
line = markdown.slice(lastIndex);
|
|
1138
|
+
lineWithNewline = line;
|
|
1139
|
+
nextIndex = len;
|
|
1140
|
+
}
|
|
1141
|
+
else {
|
|
1142
|
+
if (nextIndex > lastIndex && markdown.charCodeAt(nextIndex - 1) === 13) {
|
|
1143
|
+
line = markdown.slice(lastIndex, nextIndex - 1);
|
|
1144
|
+
}
|
|
1145
|
+
else {
|
|
1146
|
+
line = markdown.slice(lastIndex, nextIndex);
|
|
1147
|
+
}
|
|
1148
|
+
lineWithNewline = markdown.slice(lastIndex, nextIndex + 1);
|
|
1149
|
+
nextIndex++; // Skip \n
|
|
1150
|
+
}
|
|
1124
1151
|
const trimmed = line.trimStart();
|
|
1125
|
-
const lineWithNewline = i < lines.length - 1 ? `${line}\n` : line;
|
|
1126
1152
|
if (fenceMarker) {
|
|
1127
1153
|
output += lineWithNewline;
|
|
1128
1154
|
if (trimmed.startsWith(fenceMarker) &&
|
|
1129
1155
|
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
1130
1156
|
fenceMarker = null;
|
|
1131
1157
|
}
|
|
1132
|
-
continue;
|
|
1133
1158
|
}
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1159
|
+
else {
|
|
1160
|
+
const fenceMatch = FENCE_LINE_PATTERN.exec(line);
|
|
1161
|
+
if (fenceMatch?.[1]) {
|
|
1162
|
+
flushBuffer();
|
|
1163
|
+
output += lineWithNewline;
|
|
1164
|
+
fenceMarker = fenceMatch[1];
|
|
1165
|
+
}
|
|
1166
|
+
else {
|
|
1167
|
+
buffer += lineWithNewline;
|
|
1168
|
+
}
|
|
1140
1169
|
}
|
|
1141
|
-
|
|
1170
|
+
lastIndex = nextIndex;
|
|
1142
1171
|
}
|
|
1143
1172
|
flushBuffer();
|
|
1144
1173
|
return output;
|
|
@@ -1148,12 +1177,12 @@ function translateHtmlToMarkdown(params) {
|
|
|
1148
1177
|
abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
|
|
1149
1178
|
const cleanedHtml = skipNoiseRemoval
|
|
1150
1179
|
? html
|
|
1151
|
-
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
1180
|
+
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
|
|
1152
1181
|
abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
|
|
1153
1182
|
const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
|
|
1154
1183
|
abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
|
|
1155
1184
|
const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
1156
|
-
return url ? resolveRelativeUrls(cleaned, url) : cleaned;
|
|
1185
|
+
return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
|
|
1157
1186
|
}
|
|
1158
1187
|
function appendMetadataFooter(content, metadata, url) {
|
|
1159
1188
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -1448,13 +1477,13 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
|
1448
1477
|
return !hasTruncatedSentences(article.textContent);
|
|
1449
1478
|
}
|
|
1450
1479
|
function buildContentSource(params) {
|
|
1451
|
-
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, } = params;
|
|
1480
|
+
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, signal, } = params;
|
|
1452
1481
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1453
1482
|
if (useArticleContent && article) {
|
|
1454
1483
|
// Readability output can still be noisy (unless user requested skip).
|
|
1455
1484
|
const cleanedArticleHtml = skipNoiseRemoval
|
|
1456
1485
|
? article.content
|
|
1457
|
-
: removeNoiseFromHtml(article.content, undefined, url);
|
|
1486
|
+
: removeNoiseFromHtml(article.content, undefined, url, signal);
|
|
1458
1487
|
return {
|
|
1459
1488
|
sourceHtml: cleanedArticleHtml,
|
|
1460
1489
|
title: article.title,
|
|
@@ -1468,7 +1497,7 @@ function buildContentSource(params) {
|
|
|
1468
1497
|
if (document) {
|
|
1469
1498
|
const cleanedHtml = skipNoiseRemoval
|
|
1470
1499
|
? html
|
|
1471
|
-
: removeNoiseFromHtml(html, document, url);
|
|
1500
|
+
: removeNoiseFromHtml(html, document, url, signal);
|
|
1472
1501
|
const contentRoot = findContentRoot(document);
|
|
1473
1502
|
if (contentRoot) {
|
|
1474
1503
|
return {
|
|
@@ -1521,6 +1550,7 @@ function resolveContentSource(params) {
|
|
|
1521
1550
|
document,
|
|
1522
1551
|
truncated: truncated ?? false,
|
|
1523
1552
|
...(params.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1553
|
+
...(params.signal ? { signal: params.signal } : {}),
|
|
1524
1554
|
});
|
|
1525
1555
|
}
|
|
1526
1556
|
function buildMarkdownFromContext(context, url, signal) {
|
|
@@ -1603,727 +1633,6 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1603
1633
|
endTotalTransformStage(totalStage, completed);
|
|
1604
1634
|
}
|
|
1605
1635
|
}
|
|
1606
|
-
function isWorkerResultPayload(value) {
|
|
1607
|
-
if (!isObject(value))
|
|
1608
|
-
return false;
|
|
1609
|
-
const { markdown, metadata, title, truncated } = value;
|
|
1610
|
-
const isMetadataObject = metadata === undefined || isObject(metadata);
|
|
1611
|
-
if (!isMetadataObject)
|
|
1612
|
-
return false;
|
|
1613
|
-
if (metadata && !isExtractedMetadataPayload(metadata)) {
|
|
1614
|
-
return false;
|
|
1615
|
-
}
|
|
1616
|
-
return (typeof markdown === 'string' &&
|
|
1617
|
-
typeof truncated === 'boolean' &&
|
|
1618
|
-
(title === undefined || typeof title === 'string'));
|
|
1619
|
-
}
|
|
1620
|
-
function isExtractedMetadataPayload(value) {
|
|
1621
|
-
if (!isObject(value))
|
|
1622
|
-
return false;
|
|
1623
|
-
const { author, description, favicon, image, modifiedAt, publishedAt, title, } = value;
|
|
1624
|
-
return ((title === undefined || typeof title === 'string') &&
|
|
1625
|
-
(description === undefined || typeof description === 'string') &&
|
|
1626
|
-
(author === undefined || typeof author === 'string') &&
|
|
1627
|
-
(image === undefined || typeof image === 'string') &&
|
|
1628
|
-
(favicon === undefined || typeof favicon === 'string') &&
|
|
1629
|
-
(publishedAt === undefined || typeof publishedAt === 'string') &&
|
|
1630
|
-
(modifiedAt === undefined || typeof modifiedAt === 'string'));
|
|
1631
|
-
}
|
|
1632
|
-
function isWorkerErrorPayload(value) {
|
|
1633
|
-
if (!isObject(value))
|
|
1634
|
-
return false;
|
|
1635
|
-
const { details, message, name, statusCode, url } = value;
|
|
1636
|
-
return (typeof name === 'string' &&
|
|
1637
|
-
typeof message === 'string' &&
|
|
1638
|
-
typeof url === 'string' &&
|
|
1639
|
-
(statusCode === undefined || typeof statusCode === 'number') &&
|
|
1640
|
-
(details === undefined || isObject(details)));
|
|
1641
|
-
}
|
|
1642
|
-
function isWorkerResponse(raw) {
|
|
1643
|
-
if (!isObject(raw))
|
|
1644
|
-
return false;
|
|
1645
|
-
if (typeof raw['id'] !== 'string')
|
|
1646
|
-
return false;
|
|
1647
|
-
if (raw['type'] === 'result') {
|
|
1648
|
-
return isWorkerResultPayload(raw['result']);
|
|
1649
|
-
}
|
|
1650
|
-
if (raw['type'] === 'error') {
|
|
1651
|
-
return isWorkerErrorPayload(raw['error']);
|
|
1652
|
-
}
|
|
1653
|
-
if (raw['type'] === 'cancelled') {
|
|
1654
|
-
return true;
|
|
1655
|
-
}
|
|
1656
|
-
return false;
|
|
1657
|
-
}
|
|
1658
|
-
function createTaskContext() {
|
|
1659
|
-
const runWithStore = AsyncLocalStorage.snapshot();
|
|
1660
|
-
const asyncResource = new AsyncResource('fetch-url-mcp.transform.task');
|
|
1661
|
-
let disposed = false;
|
|
1662
|
-
return {
|
|
1663
|
-
run: (fn) => {
|
|
1664
|
-
runWithStore(() => {
|
|
1665
|
-
asyncResource.runInAsyncScope(fn);
|
|
1666
|
-
});
|
|
1667
|
-
},
|
|
1668
|
-
dispose: () => {
|
|
1669
|
-
if (disposed)
|
|
1670
|
-
return;
|
|
1671
|
-
disposed = true;
|
|
1672
|
-
asyncResource.emitDestroy();
|
|
1673
|
-
},
|
|
1674
|
-
};
|
|
1675
|
-
}
|
|
1676
|
-
function buildWorkerDispatchPayload(task, supportsTransferList) {
|
|
1677
|
-
const message = {
|
|
1678
|
-
type: 'transform',
|
|
1679
|
-
id: task.id,
|
|
1680
|
-
url: task.url,
|
|
1681
|
-
includeMetadata: task.includeMetadata,
|
|
1682
|
-
...(task.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1683
|
-
...(task.inputTruncated ? { inputTruncated: true } : {}),
|
|
1684
|
-
};
|
|
1685
|
-
if (!task.htmlBuffer) {
|
|
1686
|
-
message.html = task.html;
|
|
1687
|
-
return { message };
|
|
1688
|
-
}
|
|
1689
|
-
const htmlBuffer = ensureTightBuffer(task.htmlBuffer);
|
|
1690
|
-
if (!supportsTransferList) {
|
|
1691
|
-
message.htmlBuffer = htmlBuffer;
|
|
1692
|
-
if (task.encoding)
|
|
1693
|
-
message.encoding = task.encoding;
|
|
1694
|
-
return { message };
|
|
1695
|
-
}
|
|
1696
|
-
const transferableHtmlBuffer = Uint8Array.from(htmlBuffer);
|
|
1697
|
-
message.htmlBuffer = transferableHtmlBuffer;
|
|
1698
|
-
if (task.encoding)
|
|
1699
|
-
message.encoding = task.encoding;
|
|
1700
|
-
const backingBuffer = transferableHtmlBuffer.buffer;
|
|
1701
|
-
if (isSharedArrayBuffer(backingBuffer))
|
|
1702
|
-
return { message };
|
|
1703
|
-
return { message, transferList: [backingBuffer] };
|
|
1704
|
-
}
|
|
1705
|
-
/**
|
|
1706
|
-
* Worker Pool Sizing Configuration
|
|
1707
|
-
*
|
|
1708
|
-
* Default: min(4, floor(availableParallelism() / 2)), constrained to [2, N]
|
|
1709
|
-
*
|
|
1710
|
-
* Tuning Guidance:
|
|
1711
|
-
* - **Default behavior**: Appropriate for most deployments. Uses half of available
|
|
1712
|
-
* CPU threads (capped at 4) to balance throughput with system resource availability.
|
|
1713
|
-
*
|
|
1714
|
-
* - **CPU-limited containers**: If running in a container with strict CPU limits
|
|
1715
|
-
* (e.g., Docker with --cpus=2), the default may over-subscribe. Consider setting
|
|
1716
|
-
* maxWorkerScale to match the container's CPU limit.
|
|
1717
|
-
*
|
|
1718
|
-
* - **High-concurrency workloads**: For dedicated servers handling many concurrent
|
|
1719
|
-
* fetch requests, increasing maxWorkerScale to (availableParallelism() + 2) may
|
|
1720
|
-
* improve throughput by overlapping I/O wait with computation.
|
|
1721
|
-
*
|
|
1722
|
-
* - **Memory-constrained environments**: Each worker allocates ~50-100MB for DOM
|
|
1723
|
-
* parsing. If memory is limited, reduce maxWorkerScale to (availableParallelism() / 2)
|
|
1724
|
-
* or lower to prevent OOM errors.
|
|
1725
|
-
*
|
|
1726
|
-
* - **Shared hosting**: On shared systems where CPU is contested, reducing the pool
|
|
1727
|
-
* size prevents starving other processes. Consider maxWorkerScale = 2 or using
|
|
1728
|
-
* process-based workers (TRANSFORM_WORKER_MODE=process) for better isolation.
|
|
1729
|
-
*
|
|
1730
|
-
* Configuration:
|
|
1731
|
-
* - TRANSFORM_MAX_WORKER_SCALE env var (default: availableParallelism())
|
|
1732
|
-
* - TRANSFORM_WORKER_MODE env var: 'threads' (default) or 'process'
|
|
1733
|
-
*
|
|
1734
|
-
* See config.ts for full worker configuration options.
|
|
1735
|
-
*/
|
|
1736
|
-
const POOL_MIN_WORKERS = Math.max(2, Math.min(4, Math.floor(availableParallelism() / 2)));
|
|
1737
|
-
const POOL_MAX_WORKERS = config.transform.maxWorkerScale;
|
|
1738
|
-
const POOL_SCALE_THRESHOLD = 0.5;
|
|
1739
|
-
const WORKER_NAME_PREFIX = 'fetch-url-mcp-transform';
|
|
1740
|
-
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1741
|
-
const TRANSFORM_CHILD_PATH = fileURLToPath(new URL('./workers/transform-child.js', import.meta.url));
|
|
1742
|
-
function ensureTightBuffer(buffer) {
|
|
1743
|
-
if (buffer.byteOffset === 0 &&
|
|
1744
|
-
buffer.byteLength === buffer.buffer.byteLength) {
|
|
1745
|
-
return buffer;
|
|
1746
|
-
}
|
|
1747
|
-
return Buffer.from(buffer);
|
|
1748
|
-
}
|
|
1749
|
-
function createThreadWorkerHost(_workerIndex, name) {
|
|
1750
|
-
const resourceLimits = config.transform.workerResourceLimits;
|
|
1751
|
-
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url), {
|
|
1752
|
-
name,
|
|
1753
|
-
...(resourceLimits ? { resourceLimits } : {}),
|
|
1754
|
-
});
|
|
1755
|
-
return {
|
|
1756
|
-
kind: 'thread',
|
|
1757
|
-
supportsTransferList: true,
|
|
1758
|
-
threadId: worker.threadId,
|
|
1759
|
-
postMessage: (message, transferList) => {
|
|
1760
|
-
worker.postMessage(message, transferList);
|
|
1761
|
-
},
|
|
1762
|
-
terminate: async () => {
|
|
1763
|
-
await worker.terminate();
|
|
1764
|
-
},
|
|
1765
|
-
unref: () => {
|
|
1766
|
-
worker.unref();
|
|
1767
|
-
},
|
|
1768
|
-
onMessage: (handler) => {
|
|
1769
|
-
worker.on('message', handler);
|
|
1770
|
-
},
|
|
1771
|
-
onError: (handler) => {
|
|
1772
|
-
worker.on('error', handler);
|
|
1773
|
-
worker.on('messageerror', handler);
|
|
1774
|
-
},
|
|
1775
|
-
onExit: (handler) => {
|
|
1776
|
-
worker.on('exit', (code) => {
|
|
1777
|
-
handler(code, null);
|
|
1778
|
-
});
|
|
1779
|
-
},
|
|
1780
|
-
};
|
|
1781
|
-
}
|
|
1782
|
-
function createProcessWorkerHost(workerIndex, name) {
|
|
1783
|
-
const child = fork(TRANSFORM_CHILD_PATH, [], {
|
|
1784
|
-
stdio: ['ignore', 'ignore', 'ignore', 'ipc'],
|
|
1785
|
-
serialization: 'advanced',
|
|
1786
|
-
env: {
|
|
1787
|
-
...process.env,
|
|
1788
|
-
FETCH_URL_MCP_WORKER_INDEX: String(workerIndex),
|
|
1789
|
-
FETCH_URL_MCP_WORKER_NAME: name,
|
|
1790
|
-
},
|
|
1791
|
-
});
|
|
1792
|
-
if (child.pid === undefined) {
|
|
1793
|
-
throw new Error('Failed to fork process');
|
|
1794
|
-
}
|
|
1795
|
-
return {
|
|
1796
|
-
kind: 'process',
|
|
1797
|
-
supportsTransferList: false,
|
|
1798
|
-
pid: child.pid,
|
|
1799
|
-
postMessage: (message) => {
|
|
1800
|
-
if (!child.connected) {
|
|
1801
|
-
throw new Error('Transform worker IPC channel is closed');
|
|
1802
|
-
}
|
|
1803
|
-
child.send(message);
|
|
1804
|
-
},
|
|
1805
|
-
terminate: () => new Promise((resolve) => {
|
|
1806
|
-
if (child.exitCode !== null || child.killed) {
|
|
1807
|
-
resolve();
|
|
1808
|
-
return;
|
|
1809
|
-
}
|
|
1810
|
-
child.once('exit', () => {
|
|
1811
|
-
resolve();
|
|
1812
|
-
});
|
|
1813
|
-
try {
|
|
1814
|
-
child.kill();
|
|
1815
|
-
}
|
|
1816
|
-
catch {
|
|
1817
|
-
resolve();
|
|
1818
|
-
}
|
|
1819
|
-
}),
|
|
1820
|
-
unref: () => {
|
|
1821
|
-
child.unref();
|
|
1822
|
-
},
|
|
1823
|
-
onMessage: (handler) => {
|
|
1824
|
-
child.on('message', handler);
|
|
1825
|
-
},
|
|
1826
|
-
onError: (handler) => {
|
|
1827
|
-
child.on('error', handler);
|
|
1828
|
-
},
|
|
1829
|
-
onExit: (handler) => {
|
|
1830
|
-
child.on('exit', (code, signal) => {
|
|
1831
|
-
handler(code, signal);
|
|
1832
|
-
});
|
|
1833
|
-
},
|
|
1834
|
-
};
|
|
1835
|
-
}
|
|
1836
|
-
/**
 * Queueing pool of transform workers.
 *
 * Tasks submitted through `transform()` wait in a FIFO queue and are handed to
 * idle worker slots by `drainQueue()`. Workers are created lazily through the
 * injected spawner, up to `capacity`. Each in-flight task carries a timeout and
 * (optionally) an abort listener; a worker that times out, errors, exits or is
 * aborted is terminated and replaced in the same slot.
 *
 * A slot is `{ host, busy, currentTaskId, name }`, where `host` is whatever the
 * spawner returns (it must expose `postMessage`, `terminate`, `unref`,
 * `onMessage`, `onError`, `onExit`, plus `kind` and `pid`/`threadId` for logs).
 */
class WorkerPool {
    static CLOSED_MESSAGE = 'Transform worker pool closed';
    // Worker slots, indexed by worker index.
    workers = [];
    // Current number of workers the pool may run; raised by maybeScaleUp().
    capacity;
    minCapacity = POOL_MIN_WORKERS;
    maxCapacity = POOL_MAX_WORKERS;
    // Pending tasks. Entries before `queueHead` are already consumed and are
    // dropped in bulk by maybeCompactQueue().
    queue = [];
    queueHead = 0;
    // Task id -> dispatched task record (resolve/reject, timeout, workerIndex...).
    inflight = new Map();
    // Task id -> { promise, resolve, timeout } for pending cancel acknowledgements.
    cancelAcks = new Map();
    timeoutMs;
    queueMax;
    spawnWorkerImpl;
    closed = false;
    taskIdSeq = 0;
    /**
     * @param size Initial capacity. `0` starts at zero capacity; any other
     *   value is clamped to `[minCapacity, maxCapacity]`.
     * @param timeoutMs Per-task timeout passed to `createUnrefTimeout`.
     * @param spawnWorker Factory `(workerIndex, name) => host`.
     */
    constructor(size, timeoutMs, spawnWorker) {
        if (size === 0) {
            this.capacity = 0;
        }
        else {
            this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        }
        this.timeoutMs = timeoutMs;
        this.queueMax = this.maxCapacity * 32;
        this.spawnWorkerImpl = spawnWorker;
    }
    /**
     * Enqueue a transform and resolve with `{ markdown, truncated, title,
     * metadata? }` once a worker answers.
     *
     * Rejects when the pool is closed, when `options.signal` is already
     * aborted, or with a 503 `FetchError` when the queue is full.
     */
    async transform(htmlOrBuffer, url, options) {
        this.ensureOpen();
        if (options.signal?.aborted)
            throw abortPolicy.createAbortError(url, 'transform:enqueue');
        if (this.getQueueDepth() >= this.queueMax) {
            throw new FetchError('Transform worker queue is full', url, 503, {
                reason: 'queue_full',
                stage: 'transform:enqueue',
            });
        }
        return new Promise((resolve, reject) => {
            const task = this.createPendingTask(htmlOrBuffer, url, options, resolve, reject);
            this.queue.push(task);
            this.drainQueue();
        });
    }
    /** Number of tasks still waiting in the queue (never negative). */
    getQueueDepth() {
        const depth = this.queue.length - this.queueHead;
        return depth > 0 ? depth : 0;
    }
    /** Number of slots currently marked busy. */
    getActiveWorkers() {
        return this.workers.filter((s) => s?.busy).length;
    }
    getCapacity() {
        return this.capacity;
    }
    /** Change capacity (clamped to the min/max bounds) and try to dispatch. */
    resize(size) {
        const newCapacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        if (newCapacity === this.capacity)
            return;
        this.capacity = newCapacity;
        this.drainQueue();
    }
    /**
     * Close the pool: terminate every worker, reject all in-flight and queued
     * tasks with the closed message, and wait for terminations to settle.
     * Calling it again is a no-op.
     */
    async close() {
        if (this.closed)
            return;
        this.closed = true;
        const terminations = this.workers
            .map((slot) => slot?.host.terminate())
            .filter((p) => p !== undefined);
        this.workers.fill(undefined);
        this.workers.length = 0;
        // Snapshot the keys: takeInflight() deletes from the map while we iterate.
        for (const id of Array.from(this.inflight.keys())) {
            const inflight = this.takeInflight(id);
            if (!inflight)
                continue;
            this.finalizeTask(inflight.context, () => {
                inflight.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
        }
        for (let i = this.queueHead; i < this.queue.length; i += 1) {
            const task = this.queue[i];
            if (!task)
                continue;
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
        }
        this.queue.length = 0;
        this.queueHead = 0;
        await Promise.allSettled(terminations);
    }
    /** Throw the closed error if `close()` has been called. */
    ensureOpen() {
        if (this.closed)
            throw new Error(WorkerPool.CLOSED_MESSAGE);
    }
    /**
     * Build the queue record for a task, wiring the abort listener (if a signal
     * was supplied) and attaching either `html` (string input) or `htmlBuffer`
     * plus optional `encoding` (non-string input).
     */
    createPendingTask(htmlOrBuffer, url, options, resolve, reject) {
        const id = (this.taskIdSeq++).toString(36);
        // Preserve request context for resolve/reject even when callbacks fire
        // from worker thread events.
        const context = createTaskContext();
        let abortListener;
        if (options.signal) {
            abortListener = () => {
                this.onAbortSignal(id, url, context, reject);
            };
            options.signal.addEventListener('abort', abortListener, { once: true });
        }
        const task = {
            id,
            url,
            includeMetadata: options.includeMetadata,
            ...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
            ...(options.inputTruncated ? { inputTruncated: true } : {}),
            signal: options.signal,
            abortListener,
            context,
            resolve,
            reject,
        };
        if (typeof htmlOrBuffer === 'string') {
            task.html = htmlOrBuffer;
        }
        else {
            task.htmlBuffer = htmlOrBuffer;
            if (options.encoding) {
                task.encoding = options.encoding;
            }
        }
        return task;
    }
    /**
     * Abort-signal handler for a task. Depending on where the task is:
     * pool closed -> reject with the closed message; in flight -> cancel on the
     * worker (see abortInflight); still queued -> remove it and reject with an
     * abort error.
     */
    onAbortSignal(id, url, context, reject) {
        if (this.closed) {
            this.finalizeTask(context, () => {
                reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
            return;
        }
        const inflight = this.inflight.get(id);
        if (inflight) {
            void this.abortInflight(id, url, inflight.workerIndex);
            return;
        }
        const queuedIndex = this.findQueuedIndex(id);
        if (queuedIndex !== null) {
            const task = this.queue[queuedIndex];
            if (task)
                this.clearAbortListener(task.signal, task.abortListener);
            // findQueuedIndex() only returns indexes >= queueHead, so removing
            // this entry does not disturb the consumed prefix.
            this.queue.splice(queuedIndex, 1);
            if (task) {
                this.finalizeTask(task.context, () => {
                    task.reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
                });
            }
            else {
                this.finalizeTask(context, () => {
                    reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
                });
            }
            this.maybeCompactQueue();
        }
    }
    /** Settle a pending cancel acknowledgement for `id`, if one is registered. */
    resolveCancelAck(id) {
        const pending = this.cancelAcks.get(id);
        if (!pending)
            return;
        pending.timeout.cancel();
        pending.resolve();
    }
    /**
     * Return a promise that settles when the worker acknowledges cancellation
     * of `id` or when the 200 ms `createUnrefTimeout` timer elapses, whichever
     * comes first. Repeated calls for the same id share one promise.
     */
    waitForCancelAck(id) {
        const existing = this.cancelAcks.get(id);
        if (existing) {
            return existing.promise;
        }
        let resolve = () => { };
        const timeout = createUnrefTimeout(200, undefined);
        const racePromise = new Promise((finish) => {
            resolve = finish;
        });
        const promise = Promise.race([racePromise, timeout.promise]).finally(() => {
            this.cancelAcks.delete(id);
            timeout.cancel();
        });
        this.cancelAcks.set(id, { promise, resolve, timeout });
        return promise;
    }
    /**
     * Cancel a dispatched task: flag it, ask the worker to cancel, wait for the
     * acknowledgement (or its timeout), reject the task with an abort error and
     * recycle the worker.
     */
    async abortInflight(id, url, workerIndex) {
        const slot = this.workers[workerIndex];
        const inflight = this.inflight.get(id);
        if (inflight) {
            // While set, onWorkerMessage() treats any reply for this id as the ack.
            inflight.cancelPending = true;
        }
        if (slot) {
            try {
                slot.host.postMessage({ type: 'cancel', id });
            }
            catch {
                // Worker may be unavailable; failure is acceptable during abort
            }
        }
        await this.waitForCancelAck(id);
        this.failTask(id, abortPolicy.createAbortError(url, 'transform:signal-abort'));
        // `slot` was captured before the await; restartWorker() will not replace
        // the slot if another worker has taken it over in the meantime.
        if (slot)
            this.restartWorker(workerIndex, slot);
    }
    /** Best-effort removal of a task's abort listener. */
    clearAbortListener(signal, listener) {
        if (!signal || !listener)
            return;
        try {
            signal.removeEventListener('abort', listener);
        }
        catch {
            // Defensive: removeEventListener should not throw, but handle edge cases
        }
    }
    /**
     * Create a worker for `workerIndex` and return its (idle) slot.
     *
     * The caller is responsible for storing the returned slot in
     * `this.workers[workerIndex]`.
     */
    spawnWorker(workerIndex) {
        const name = `${WORKER_NAME_PREFIX}-${workerIndex + 1}`;
        const host = this.spawnWorkerImpl(workerIndex, name);
        host.unref();
        const slot = { host, busy: false, currentTaskId: null, name };
        // Error/exit events are only meaningful while this worker still owns
        // the slot. Without this check, the exit of a worker we terminated in
        // restartWorker() would be attributed to its replacement (same index),
        // failing the replacement's task and restarting it again.
        const ownsSlot = () => this.workers[workerIndex] === slot;
        host.onMessage((raw) => {
            this.onWorkerMessage(workerIndex, raw);
        });
        host.onError((error) => {
            if (!ownsSlot())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        host.onExit((code, signal) => {
            if (!ownsSlot())
                return;
            const suffix = signal ? `signal ${signal}` : `code ${code ?? 'unknown'}`;
            this.onWorkerBroken(workerIndex, `Transform worker exited (${suffix})`);
        });
        return slot;
    }
    /**
     * Handle a worker error/exit: log it, fail the task the worker was running
     * (if any) with `message`, and replace the worker.
     */
    onWorkerBroken(workerIndex, message) {
        if (this.closed)
            return;
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        logWarn('Transform worker unavailable; restarting', {
            reason: message,
            workerIndex,
            workerKind: slot.host.kind,
            workerName: slot.name,
            ...(slot.host.kind === 'process'
                ? { pid: slot.host.pid }
                : { threadId: slot.host.threadId }),
        });
        if (slot.busy && slot.currentTaskId) {
            this.failTask(slot.currentTaskId, new Error(message));
        }
        this.restartWorker(workerIndex, slot);
    }
    /**
     * Terminate `slot` (or the current occupant of `workerIndex` when no slot
     * is given), spawn a replacement in the same index and resume dispatching.
     *
     * If `slot` is given but the index is already occupied by a different
     * worker, `slot` is stale: it is terminated, and the live worker is left
     * alone rather than being overwritten and leaked.
     */
    restartWorker(workerIndex, slot) {
        if (this.closed)
            return;
        const current = this.workers[workerIndex];
        const target = slot ?? current;
        if (target) {
            target.host.terminate().catch(() => undefined);
        }
        if (slot && current && current !== slot)
            return;
        this.workers[workerIndex] = this.spawnWorker(workerIndex);
        this.drainQueue();
    }
    /**
     * Handle a message from a worker. `cancelled` messages (and any reply for a
     * task flagged `cancelPending`) only settle the cancel acknowledgement;
     * `result` resolves the task; anything else is treated as an error reply
     * and rejects it (rebuilding a `FetchError` when the worker reported one).
     */
    onWorkerMessage(workerIndex, raw) {
        if (!isWorkerResponse(raw))
            return;
        const message = raw;
        if (message.type === 'cancelled') {
            this.resolveCancelAck(message.id);
            return;
        }
        const inflightPeek = this.inflight.get(message.id);
        if (inflightPeek?.cancelPending) {
            // abortInflight() owns the rejection; just unblock it.
            this.resolveCancelAck(message.id);
            return;
        }
        const inflight = this.takeInflight(message.id);
        if (!inflight)
            return;
        this.markIdle(workerIndex);
        if (message.type === 'result') {
            this.finalizeTask(inflight.context, () => {
                inflight.resolve({
                    markdown: message.result.markdown,
                    truncated: message.result.truncated,
                    title: message.result.title,
                    ...(message.result.metadata
                        ? { metadata: message.result.metadata }
                        : {}),
                });
            });
        }
        else {
            const err = message.error;
            if (err.name === 'FetchError') {
                this.finalizeTask(inflight.context, () => {
                    inflight.reject(new FetchError(err.message, err.url, err.statusCode, err.details ?? {}));
                });
            }
            else {
                this.finalizeTask(inflight.context, () => {
                    inflight.reject(new Error(err.message));
                });
            }
        }
        this.drainQueue();
    }
    /**
     * Remove and return the in-flight record for `id`, cancelling its timeout
     * and abort listener. Returns `null` when the task is not in flight.
     */
    takeInflight(id) {
        const inflight = this.inflight.get(id);
        if (!inflight)
            return null;
        inflight.timeout.cancel();
        this.clearAbortListener(inflight.signal, inflight.abortListener);
        this.inflight.delete(id);
        return inflight;
    }
    /** Mark the slot at `workerIndex` as free. */
    markIdle(workerIndex) {
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        slot.busy = false;
        slot.currentTaskId = null;
    }
    /** Reject the in-flight task `id` with `error` and free its worker slot. */
    failTask(id, error) {
        const inflight = this.takeInflight(id);
        if (!inflight)
            return;
        this.finalizeTask(inflight.context, () => {
            inflight.reject(error);
        });
        this.markIdle(inflight.workerIndex);
    }
    /** Raise capacity by one when the backlog exceeds the scale threshold. */
    maybeScaleUp() {
        if (this.getQueueDepth() > this.capacity * POOL_SCALE_THRESHOLD &&
            this.capacity < this.maxCapacity) {
            this.capacity += 1;
        }
    }
    /**
     * Hand queued tasks to idle workers, spawning at most one new worker per
     * call; further spawns are deferred with `setImmediate`.
     */
    drainQueue() {
        if (this.closed || this.getQueueDepth() === 0)
            return;
        this.maybeScaleUp();
        for (let i = 0; i < this.workers.length; i += 1) {
            const slot = this.workers[i];
            if (slot && !slot.busy) {
                this.dispatchFromQueue(i, slot);
                if (this.getQueueDepth() === 0)
                    return;
            }
        }
        if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
            const workerIndex = this.workers.length;
            const slot = this.spawnWorker(workerIndex);
            this.workers.push(slot);
            this.dispatchFromQueue(workerIndex, slot);
            if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
                setImmediate(() => {
                    this.drainQueue();
                });
            }
        }
    }
    /** Pop the next queued task, or `null` when the queue is empty. */
    takeNextQueuedTask() {
        while (this.queueHead < this.queue.length) {
            const task = this.queue[this.queueHead];
            this.queueHead += 1;
            if (task) {
                this.maybeCompactQueue();
                return task;
            }
        }
        this.maybeCompactQueue();
        return null;
    }
    /**
     * Take the next queued task and send it to `slot`.
     *
     * Registers the in-flight record and timeout before posting. On timeout the
     * worker is asked to cancel, the task is rejected with a 504 `FetchError`
     * and the worker is replaced. If posting throws, the task is rejected and
     * the worker is replaced as well.
     */
    dispatchFromQueue(workerIndex, slot) {
        const task = this.takeNextQueuedTask();
        if (!task)
            return;
        if (this.closed) {
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
            return;
        }
        if (task.signal?.aborted) {
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(abortPolicy.createAbortError(task.url, 'transform:dispatch'));
            });
            return;
        }
        slot.busy = true;
        slot.currentTaskId = task.id;
        const timeout = createUnrefTimeout(this.timeoutMs, null);
        void timeout.promise
            .then(() => {
            try {
                slot.host.postMessage({ type: 'cancel', id: task.id });
            }
            catch {
                // Worker may be unavailable; proceed with timeout handling
            }
            const inflight = this.takeInflight(task.id);
            if (!inflight)
                return;
            this.finalizeTask(inflight.context, () => {
                inflight.reject(new FetchError('Request timeout', task.url, 504, {
                    reason: 'timeout',
                    stage: 'transform:worker-timeout',
                }));
            });
            this.restartWorker(workerIndex, slot);
        })
            .catch((error) => {
            this.failTask(task.id, error);
        });
        this.inflight.set(task.id, {
            resolve: task.resolve,
            reject: task.reject,
            timeout,
            signal: task.signal,
            abortListener: task.abortListener,
            workerIndex,
            context: task.context,
            cancelPending: false,
        });
        try {
            const { message, transferList } = buildWorkerDispatchPayload(task, slot.host.supportsTransferList);
            slot.host.postMessage(message, transferList);
        }
        catch (error) {
            timeout.cancel();
            this.clearAbortListener(task.signal, task.abortListener);
            this.inflight.delete(task.id);
            this.markIdle(workerIndex);
            this.finalizeTask(task.context, () => {
                task.reject(error instanceof Error
                    ? error
                    : new Error('Failed to dispatch transform worker message'));
            });
            this.restartWorker(workerIndex, slot);
        }
    }
    /** Run `fn` inside the task's context, then always dispose the context. */
    finalizeTask(context, fn) {
        try {
            context.run(fn);
        }
        finally {
            context.dispose();
        }
    }
    /** Index of the still-queued task with `id`, or `null` if not queued. */
    findQueuedIndex(id) {
        for (let i = this.queueHead; i < this.queue.length; i += 1) {
            const task = this.queue[i];
            if (task?.id === id)
                return i;
        }
        return null;
    }
    /**
     * Drop the consumed prefix of the queue once it is fully consumed, or once
     * it is both larger than 1024 entries and more than half the array.
     */
    maybeCompactQueue() {
        if (this.queueHead === 0)
            return;
        if (this.queueHead >= this.queue.length ||
            (this.queueHead > 1024 && this.queueHead > this.queue.length / 2)) {
            this.queue.splice(0, this.queueHead);
            this.queueHead = 0;
        }
    }
}
|
|
2301
|
-
// Module-level pool instance: null until getOrCreateWorkerPool() creates it,
// and reset to null by shutdownWorkerPool().
let workerPool = null;
|
|
2302
|
-
/**
 * Choose the worker host factory from configuration: the process-based host
 * when `config.transform.workerMode` is 'process', the thread-based host
 * otherwise.
 */
function resolveWorkerSpawner() {
    if (config.transform.workerMode === 'process') {
        return createProcessWorkerHost;
    }
    return createThreadWorkerHost;
}
|
|
2307
|
-
/**
 * Return the shared worker pool, creating it on first use.
 *
 * A new pool starts with size 0 when `config.transform.maxWorkerScale` is 0,
 * and with `POOL_MIN_WORKERS` otherwise.
 */
function getOrCreateWorkerPool() {
    const initialSize = config.transform.maxWorkerScale === 0 ? 0 : POOL_MIN_WORKERS;
    if (workerPool == null) {
        workerPool = new WorkerPool(initialSize, DEFAULT_TIMEOUT_MS, resolveWorkerSpawner());
    }
    return workerPool;
}
|
|
2312
|
-
/**
 * Snapshot of the shared pool's queue depth, busy worker count and capacity,
 * or `null` when no pool has been created.
 */
function getWorkerPoolStats() {
    const pool = workerPool;
    if (!pool) {
        return null;
    }
    const queueDepth = pool.getQueueDepth();
    const activeWorkers = pool.getActiveWorkers();
    const capacity = pool.getCapacity();
    return { queueDepth, activeWorkers, capacity };
}
|
|
2321
|
-
/**
 * Close the shared worker pool, if one exists, and clear the module-level
 * reference once closing has finished.
 */
async function shutdownWorkerPool() {
    if (workerPool) {
        await workerPool.close();
        workerPool = null;
    }
}
|
|
2327
1636
|
/** Public accessor for the transform worker pool statistics. */
export function getTransformPoolStats() {
    const stats = getWorkerPoolStats();
    return stats;
}
|
|
@@ -2410,3 +1719,4 @@ export async function transformHtmlToMarkdown(html, url, options) {
|
|
|
2410
1719
|
/**
 * Convert an HTML buffer to Markdown by delegating to
 * `transformInputToMarkdown` with the same arguments.
 */
export async function transformBufferToMarkdown(htmlBuffer, url, options) {
    const pending = transformInputToMarkdown(htmlBuffer, url, options);
    return pending;
}
|
|
1722
|
+
//# sourceMappingURL=transform.js.map
|