@j0hanz/fetch-url-mcp 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.d.ts +9 -3
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +44 -110
- package/dist/cache.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +9 -4
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +2 -3
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +18 -25
- package/dist/config.js.map +1 -0
- package/dist/crypto.d.ts +1 -0
- package/dist/crypto.d.ts.map +1 -0
- package/dist/crypto.js +1 -0
- package/dist/crypto.js.map +1 -0
- package/dist/dom-noise-removal.d.ts +2 -1
- package/dist/dom-noise-removal.d.ts.map +1 -0
- package/dist/dom-noise-removal.js +8 -4
- package/dist/dom-noise-removal.js.map +1 -0
- package/dist/download.d.ts +4 -0
- package/dist/download.d.ts.map +1 -0
- package/dist/download.js +106 -0
- package/dist/download.js.map +1 -0
- package/dist/errors.d.ts +1 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/errors.js +1 -0
- package/dist/errors.js.map +1 -0
- package/dist/examples/mcp-fetch-url-client.js +19 -3
- package/dist/examples/mcp-fetch-url-client.js.map +1 -1
- package/dist/fetch-content.d.ts +1 -0
- package/dist/fetch-content.d.ts.map +1 -0
- package/dist/fetch-content.js +14 -14
- package/dist/fetch-content.js.map +1 -0
- package/dist/fetch-stream.d.ts +1 -0
- package/dist/fetch-stream.d.ts.map +1 -0
- package/dist/fetch-stream.js +6 -3
- package/dist/fetch-stream.js.map +1 -0
- package/dist/fetch.d.ts +1 -0
- package/dist/fetch.d.ts.map +1 -0
- package/dist/fetch.js +120 -51
- package/dist/fetch.js.map +1 -0
- package/dist/host-normalization.d.ts +1 -0
- package/dist/host-normalization.d.ts.map +1 -0
- package/dist/host-normalization.js +19 -6
- package/dist/host-normalization.js.map +1 -0
- package/dist/http/auth.d.ts +35 -0
- package/dist/http/auth.d.ts.map +1 -0
- package/dist/http/auth.js +283 -0
- package/dist/http/auth.js.map +1 -0
- package/dist/http/health.d.ts +7 -0
- package/dist/http/health.d.ts.map +1 -0
- package/dist/http/health.js +166 -0
- package/dist/http/health.js.map +1 -0
- package/dist/http/helpers.d.ts +58 -0
- package/dist/http/helpers.d.ts.map +1 -0
- package/dist/http/helpers.js +372 -0
- package/dist/http/helpers.js.map +1 -0
- package/dist/{http-native.d.ts → http/native.d.ts} +1 -0
- package/dist/http/native.d.ts.map +1 -0
- package/dist/http/native.js +529 -0
- package/dist/http/native.js.map +1 -0
- package/dist/http/rate-limit.d.ts +13 -0
- package/dist/http/rate-limit.d.ts.map +1 -0
- package/dist/http/rate-limit.js +81 -0
- package/dist/http/rate-limit.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -0
- package/dist/instructions.d.ts +2 -0
- package/dist/instructions.d.ts.map +1 -0
- package/dist/instructions.js +108 -0
- package/dist/instructions.js.map +1 -0
- package/dist/ip-blocklist.d.ts +1 -0
- package/dist/ip-blocklist.d.ts.map +1 -0
- package/dist/ip-blocklist.js +2 -0
- package/dist/ip-blocklist.js.map +1 -0
- package/dist/json.d.ts +2 -1
- package/dist/json.d.ts.map +1 -0
- package/dist/json.js +19 -6
- package/dist/json.js.map +1 -0
- package/dist/language-detection.d.ts +1 -0
- package/dist/language-detection.d.ts.map +1 -0
- package/dist/language-detection.js +1 -0
- package/dist/language-detection.js.map +1 -0
- package/dist/markdown-cleanup.d.ts +2 -1
- package/dist/markdown-cleanup.d.ts.map +1 -0
- package/dist/markdown-cleanup.js +51 -52
- package/dist/markdown-cleanup.js.map +1 -0
- package/dist/mcp-validator.d.ts +1 -0
- package/dist/mcp-validator.d.ts.map +1 -0
- package/dist/mcp-validator.js +16 -8
- package/dist/mcp-validator.js.map +1 -0
- package/dist/mcp.d.ts +2 -2
- package/dist/mcp.d.ts.map +1 -0
- package/dist/mcp.js +17 -333
- package/dist/mcp.js.map +1 -0
- package/dist/observability.d.ts +2 -0
- package/dist/observability.d.ts.map +1 -0
- package/dist/observability.js +30 -5
- package/dist/observability.js.map +1 -0
- package/dist/prompts.d.ts +1 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +15 -3
- package/dist/prompts.js.map +1 -0
- package/dist/resources.d.ts +1 -0
- package/dist/resources.d.ts.map +1 -0
- package/dist/resources.js +30 -23
- package/dist/resources.js.map +1 -0
- package/dist/server-tuning.d.ts +1 -0
- package/dist/server-tuning.d.ts.map +1 -0
- package/dist/server-tuning.js +11 -15
- package/dist/server-tuning.js.map +1 -0
- package/dist/server.d.ts +1 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +23 -23
- package/dist/server.js.map +1 -0
- package/dist/session.d.ts +1 -0
- package/dist/session.d.ts.map +1 -0
- package/dist/session.js +55 -28
- package/dist/session.js.map +1 -0
- package/dist/tasks/execution.d.ts +42 -0
- package/dist/tasks/execution.d.ts.map +1 -0
- package/dist/tasks/execution.js +232 -0
- package/dist/tasks/execution.js.map +1 -0
- package/dist/{tasks.d.ts → tasks/manager.d.ts} +6 -0
- package/dist/tasks/manager.d.ts.map +1 -0
- package/dist/{tasks.js → tasks/manager.js} +86 -37
- package/dist/tasks/manager.js.map +1 -0
- package/dist/tasks/owner.d.ts +33 -0
- package/dist/tasks/owner.d.ts.map +1 -0
- package/dist/tasks/owner.js +99 -0
- package/dist/tasks/owner.js.map +1 -0
- package/dist/timer-utils.d.ts +1 -0
- package/dist/timer-utils.d.ts.map +1 -0
- package/dist/timer-utils.js +12 -5
- package/dist/timer-utils.js.map +1 -0
- package/dist/tool-errors.d.ts +12 -0
- package/dist/tool-errors.d.ts.map +1 -0
- package/dist/tool-errors.js +52 -0
- package/dist/tool-errors.js.map +1 -0
- package/dist/tool-pipeline.d.ts +72 -0
- package/dist/tool-pipeline.d.ts.map +1 -0
- package/dist/tool-pipeline.js +407 -0
- package/dist/tool-pipeline.js.map +1 -0
- package/dist/tool-progress.d.ts +32 -0
- package/dist/tool-progress.d.ts.map +1 -0
- package/dist/tool-progress.js +123 -0
- package/dist/tool-progress.js.map +1 -0
- package/dist/tools.d.ts +35 -111
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +93 -566
- package/dist/tools.js.map +1 -0
- package/dist/{transform.d.ts → transform/transform.d.ts} +2 -1
- package/dist/transform/transform.d.ts.map +1 -0
- package/dist/{transform.js → transform/transform.js} +73 -769
- package/dist/transform/transform.js.map +1 -0
- package/dist/{transform-types.d.ts → transform/types.d.ts} +1 -0
- package/dist/transform/types.d.ts.map +1 -0
- package/dist/{transform-types.js → transform/types.js} +1 -0
- package/dist/transform/types.js.map +1 -0
- package/dist/transform/worker-pool.d.ts +93 -0
- package/dist/transform/worker-pool.d.ts.map +1 -0
- package/dist/transform/worker-pool.js +759 -0
- package/dist/transform/worker-pool.js.map +1 -0
- package/dist/transform/workers/transform-child.d.ts +2 -0
- package/dist/transform/workers/transform-child.d.ts.map +1 -0
- package/dist/{workers → transform/workers}/transform-child.js +3 -1
- package/dist/transform/workers/transform-child.js.map +1 -0
- package/dist/transform/workers/transform-worker.d.ts +2 -0
- package/dist/transform/workers/transform-worker.d.ts.map +1 -0
- package/dist/{workers → transform/workers}/transform-worker.js +2 -1
- package/dist/transform/workers/transform-worker.js.map +1 -0
- package/dist/type-guards.d.ts +1 -0
- package/dist/type-guards.d.ts.map +1 -0
- package/dist/type-guards.js +1 -0
- package/dist/type-guards.js.map +1 -0
- package/package.json +6 -7
- package/dist/AGENTS.md +0 -152
- package/dist/http-native.js +0 -1320
- package/dist/instructions.md +0 -113
- package/dist/workers/transform-child.d.ts +0 -1
- package/dist/workers/transform-worker.d.ts +0 -1
|
@@ -1,24 +1,18 @@
|
|
|
1
|
-
import { AsyncLocalStorage, AsyncResource } from 'node:async_hooks';
|
|
2
1
|
import { Buffer } from 'node:buffer';
|
|
3
|
-
import { fork } from 'node:child_process';
|
|
4
2
|
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
5
|
-
import { availableParallelism } from 'node:os';
|
|
6
3
|
import { performance } from 'node:perf_hooks';
|
|
7
|
-
import {
|
|
8
|
-
import { isSharedArrayBuffer } from 'node:util/types';
|
|
9
|
-
import { Worker, } from 'node:worker_threads';
|
|
4
|
+
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
10
5
|
import { parseHTML } from 'linkedom';
|
|
11
6
|
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import { isLikeNode, isObject } from './type-guards.js';
|
|
7
|
+
import { config } from '../config.js';
|
|
8
|
+
import { removeNoiseFromHtml } from '../dom-noise-removal.js';
|
|
9
|
+
import { FetchError, getErrorMessage } from '../errors.js';
|
|
10
|
+
import { isRawTextContentUrl } from '../fetch.js';
|
|
11
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../language-detection.js';
|
|
12
|
+
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, } from '../markdown-cleanup.js';
|
|
13
|
+
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../observability.js';
|
|
14
|
+
import { isLikeNode, isObject } from '../type-guards.js';
|
|
15
|
+
import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
|
|
22
16
|
const utf8Decoder = new TextDecoder('utf-8');
|
|
23
17
|
function decodeInput(input, encoding) {
|
|
24
18
|
if (typeof input === 'string')
|
|
@@ -219,26 +213,27 @@ function truncateHtml(html, inputTruncated = false) {
|
|
|
219
213
|
const maxSize = config.constants.maxHtmlSize;
|
|
220
214
|
if (maxSize <= 0)
|
|
221
215
|
return { html, truncated: false };
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
216
|
+
if (html.length <= maxSize) {
|
|
217
|
+
const byteLength = getUtf8ByteLength(html);
|
|
218
|
+
if (byteLength <= maxSize && !inputTruncated)
|
|
219
|
+
return { html, truncated: false };
|
|
220
|
+
}
|
|
226
221
|
const sliced = html.slice(0, maxSize);
|
|
227
|
-
if (
|
|
222
|
+
if (getUtf8ByteLength(sliced) <= maxSize) {
|
|
228
223
|
return { html: trimDanglingTagFragment(sliced), truncated: true };
|
|
229
224
|
}
|
|
230
225
|
const htmlBuffer = Buffer.from(sliced, 'utf8');
|
|
231
226
|
const content = trimDanglingTagFragment(trimUtf8Buffer(htmlBuffer, maxSize).toString('utf8'));
|
|
232
227
|
logWarn('HTML content exceeds maximum size, truncating', {
|
|
233
|
-
size:
|
|
228
|
+
size: getUtf8ByteLength(html),
|
|
234
229
|
maxSize,
|
|
235
|
-
truncatedSize:
|
|
230
|
+
truncatedSize: getUtf8ByteLength(content),
|
|
236
231
|
});
|
|
237
232
|
return { html: content, truncated: true };
|
|
238
233
|
}
|
|
239
234
|
function willTruncate(html) {
|
|
240
235
|
const maxSize = config.constants.maxHtmlSize;
|
|
241
|
-
return maxSize > 0 && getUtf8ByteLength(html) > maxSize;
|
|
236
|
+
return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
|
|
242
237
|
}
|
|
243
238
|
const HEAD_END_PATTERN = /<\/head\s*>|<body\b/i;
|
|
244
239
|
const MAX_HEAD_SCAN_LENGTH = 50_000;
|
|
@@ -694,11 +689,12 @@ function buildInlineCodeTranslator() {
|
|
|
694
689
|
};
|
|
695
690
|
}
|
|
696
691
|
function buildCodeTranslator(ctx) {
|
|
692
|
+
const inlineCodeTranslator = buildInlineCodeTranslator();
|
|
697
693
|
if (!isObject(ctx))
|
|
698
|
-
return
|
|
694
|
+
return inlineCodeTranslator;
|
|
699
695
|
const { parent } = ctx;
|
|
700
696
|
if (!isCodeBlock(parent))
|
|
701
|
-
return
|
|
697
|
+
return inlineCodeTranslator;
|
|
702
698
|
return { noEscape: true, preserveWhitespace: true };
|
|
703
699
|
}
|
|
704
700
|
function extractFirstSrcsetUrl(srcset) {
|
|
@@ -713,14 +709,17 @@ const LAZY_SRC_ATTRIBUTES = [
|
|
|
713
709
|
'data-original',
|
|
714
710
|
'data-srcset',
|
|
715
711
|
];
|
|
712
|
+
function isDataUri(value) {
|
|
713
|
+
return value.startsWith('data:');
|
|
714
|
+
}
|
|
716
715
|
function extractNonDataSrcsetUrl(value) {
|
|
717
716
|
const url = extractFirstSrcsetUrl(value);
|
|
718
|
-
return url && !url
|
|
717
|
+
return url && !isDataUri(url) ? url : undefined;
|
|
719
718
|
}
|
|
720
719
|
function resolveLazySrc(getAttribute) {
|
|
721
720
|
for (const attr of LAZY_SRC_ATTRIBUTES) {
|
|
722
721
|
const lazy = getAttribute(attr);
|
|
723
|
-
if (!lazy || lazy
|
|
722
|
+
if (!lazy || isDataUri(lazy))
|
|
724
723
|
continue;
|
|
725
724
|
if (attr === 'data-srcset') {
|
|
726
725
|
const url = extractNonDataSrcsetUrl(lazy);
|
|
@@ -736,7 +735,7 @@ function resolveImageSrc(getAttribute) {
|
|
|
736
735
|
if (!getAttribute)
|
|
737
736
|
return '';
|
|
738
737
|
const srcRaw = getAttribute('src') ?? '';
|
|
739
|
-
if (srcRaw && !srcRaw
|
|
738
|
+
if (srcRaw && !isDataUri(srcRaw))
|
|
740
739
|
return srcRaw;
|
|
741
740
|
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
742
741
|
const lazySrc = resolveLazySrc(getAttribute);
|
|
@@ -750,7 +749,7 @@ function resolveImageSrc(getAttribute) {
|
|
|
750
749
|
return url;
|
|
751
750
|
}
|
|
752
751
|
// If the only available src is a data URI, we choose to omit it rather than include the raw data in the alt text or URL, as data URIs can be very long and are not useful in Markdown output.
|
|
753
|
-
if (srcRaw
|
|
752
|
+
if (isDataUri(srcRaw))
|
|
754
753
|
return '[data URI removed]';
|
|
755
754
|
return '';
|
|
756
755
|
}
|
|
@@ -1099,7 +1098,7 @@ function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
|
|
|
1099
1098
|
}
|
|
1100
1099
|
return output;
|
|
1101
1100
|
}
|
|
1102
|
-
function resolveRelativeUrls(markdown, baseUrl) {
|
|
1101
|
+
function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
1103
1102
|
let origin;
|
|
1104
1103
|
try {
|
|
1105
1104
|
({ origin } = new URL(baseUrl));
|
|
@@ -1109,7 +1108,6 @@ function resolveRelativeUrls(markdown, baseUrl) {
|
|
|
1109
1108
|
}
|
|
1110
1109
|
if (!markdown)
|
|
1111
1110
|
return markdown;
|
|
1112
|
-
const lines = markdown.split('\n');
|
|
1113
1111
|
let output = '';
|
|
1114
1112
|
let buffer = '';
|
|
1115
1113
|
let fenceMarker = null;
|
|
@@ -1119,26 +1117,51 @@ function resolveRelativeUrls(markdown, baseUrl) {
|
|
|
1119
1117
|
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
1120
1118
|
buffer = '';
|
|
1121
1119
|
};
|
|
1122
|
-
|
|
1123
|
-
|
|
1120
|
+
const len = markdown.length;
|
|
1121
|
+
let lastIndex = 0;
|
|
1122
|
+
let lineCount = 0;
|
|
1123
|
+
while (lastIndex < len) {
|
|
1124
|
+
if (++lineCount % 500 === 0 && signal?.aborted) {
|
|
1125
|
+
throw new Error('Transform aborted during URL resolution');
|
|
1126
|
+
}
|
|
1127
|
+
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
1128
|
+
let line;
|
|
1129
|
+
let lineWithNewline;
|
|
1130
|
+
if (nextIndex === -1) {
|
|
1131
|
+
line = markdown.slice(lastIndex);
|
|
1132
|
+
lineWithNewline = line;
|
|
1133
|
+
nextIndex = len;
|
|
1134
|
+
}
|
|
1135
|
+
else {
|
|
1136
|
+
if (nextIndex > lastIndex && markdown.charCodeAt(nextIndex - 1) === 13) {
|
|
1137
|
+
line = markdown.slice(lastIndex, nextIndex - 1);
|
|
1138
|
+
}
|
|
1139
|
+
else {
|
|
1140
|
+
line = markdown.slice(lastIndex, nextIndex);
|
|
1141
|
+
}
|
|
1142
|
+
lineWithNewline = markdown.slice(lastIndex, nextIndex + 1);
|
|
1143
|
+
nextIndex++; // Skip \n
|
|
1144
|
+
}
|
|
1124
1145
|
const trimmed = line.trimStart();
|
|
1125
|
-
const lineWithNewline = i < lines.length - 1 ? `${line}\n` : line;
|
|
1126
1146
|
if (fenceMarker) {
|
|
1127
1147
|
output += lineWithNewline;
|
|
1128
1148
|
if (trimmed.startsWith(fenceMarker) &&
|
|
1129
1149
|
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
1130
1150
|
fenceMarker = null;
|
|
1131
1151
|
}
|
|
1132
|
-
continue;
|
|
1133
1152
|
}
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1153
|
+
else {
|
|
1154
|
+
const fenceMatch = FENCE_LINE_PATTERN.exec(line);
|
|
1155
|
+
if (fenceMatch?.[1]) {
|
|
1156
|
+
flushBuffer();
|
|
1157
|
+
output += lineWithNewline;
|
|
1158
|
+
fenceMarker = fenceMatch[1];
|
|
1159
|
+
}
|
|
1160
|
+
else {
|
|
1161
|
+
buffer += lineWithNewline;
|
|
1162
|
+
}
|
|
1140
1163
|
}
|
|
1141
|
-
|
|
1164
|
+
lastIndex = nextIndex;
|
|
1142
1165
|
}
|
|
1143
1166
|
flushBuffer();
|
|
1144
1167
|
return output;
|
|
@@ -1148,12 +1171,12 @@ function translateHtmlToMarkdown(params) {
|
|
|
1148
1171
|
abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
|
|
1149
1172
|
const cleanedHtml = skipNoiseRemoval
|
|
1150
1173
|
? html
|
|
1151
|
-
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
1174
|
+
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
|
|
1152
1175
|
abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
|
|
1153
1176
|
const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
|
|
1154
1177
|
abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
|
|
1155
1178
|
const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
1156
|
-
return url ? resolveRelativeUrls(cleaned, url) : cleaned;
|
|
1179
|
+
return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
|
|
1157
1180
|
}
|
|
1158
1181
|
function appendMetadataFooter(content, metadata, url) {
|
|
1159
1182
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -1448,13 +1471,13 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
|
1448
1471
|
return !hasTruncatedSentences(article.textContent);
|
|
1449
1472
|
}
|
|
1450
1473
|
function buildContentSource(params) {
|
|
1451
|
-
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, } = params;
|
|
1474
|
+
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, signal, } = params;
|
|
1452
1475
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1453
1476
|
if (useArticleContent && article) {
|
|
1454
1477
|
// Readability output can still be noisy (unless user requested skip).
|
|
1455
1478
|
const cleanedArticleHtml = skipNoiseRemoval
|
|
1456
1479
|
? article.content
|
|
1457
|
-
: removeNoiseFromHtml(article.content, undefined, url);
|
|
1480
|
+
: removeNoiseFromHtml(article.content, undefined, url, signal);
|
|
1458
1481
|
return {
|
|
1459
1482
|
sourceHtml: cleanedArticleHtml,
|
|
1460
1483
|
title: article.title,
|
|
@@ -1468,7 +1491,7 @@ function buildContentSource(params) {
|
|
|
1468
1491
|
if (document) {
|
|
1469
1492
|
const cleanedHtml = skipNoiseRemoval
|
|
1470
1493
|
? html
|
|
1471
|
-
: removeNoiseFromHtml(html, document, url);
|
|
1494
|
+
: removeNoiseFromHtml(html, document, url, signal);
|
|
1472
1495
|
const contentRoot = findContentRoot(document);
|
|
1473
1496
|
if (contentRoot) {
|
|
1474
1497
|
return {
|
|
@@ -1521,6 +1544,7 @@ function resolveContentSource(params) {
|
|
|
1521
1544
|
document,
|
|
1522
1545
|
truncated: truncated ?? false,
|
|
1523
1546
|
...(params.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1547
|
+
...(params.signal ? { signal: params.signal } : {}),
|
|
1524
1548
|
});
|
|
1525
1549
|
}
|
|
1526
1550
|
function buildMarkdownFromContext(context, url, signal) {
|
|
@@ -1603,727 +1627,6 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1603
1627
|
endTotalTransformStage(totalStage, completed);
|
|
1604
1628
|
}
|
|
1605
1629
|
}
|
|
1606
|
-
function isWorkerResultPayload(value) {
|
|
1607
|
-
if (!isObject(value))
|
|
1608
|
-
return false;
|
|
1609
|
-
const { markdown, metadata, title, truncated } = value;
|
|
1610
|
-
const isMetadataObject = metadata === undefined || isObject(metadata);
|
|
1611
|
-
if (!isMetadataObject)
|
|
1612
|
-
return false;
|
|
1613
|
-
if (metadata && !isExtractedMetadataPayload(metadata)) {
|
|
1614
|
-
return false;
|
|
1615
|
-
}
|
|
1616
|
-
return (typeof markdown === 'string' &&
|
|
1617
|
-
typeof truncated === 'boolean' &&
|
|
1618
|
-
(title === undefined || typeof title === 'string'));
|
|
1619
|
-
}
|
|
1620
|
-
function isExtractedMetadataPayload(value) {
|
|
1621
|
-
if (!isObject(value))
|
|
1622
|
-
return false;
|
|
1623
|
-
const { author, description, favicon, image, modifiedAt, publishedAt, title, } = value;
|
|
1624
|
-
return ((title === undefined || typeof title === 'string') &&
|
|
1625
|
-
(description === undefined || typeof description === 'string') &&
|
|
1626
|
-
(author === undefined || typeof author === 'string') &&
|
|
1627
|
-
(image === undefined || typeof image === 'string') &&
|
|
1628
|
-
(favicon === undefined || typeof favicon === 'string') &&
|
|
1629
|
-
(publishedAt === undefined || typeof publishedAt === 'string') &&
|
|
1630
|
-
(modifiedAt === undefined || typeof modifiedAt === 'string'));
|
|
1631
|
-
}
|
|
1632
|
-
function isWorkerErrorPayload(value) {
|
|
1633
|
-
if (!isObject(value))
|
|
1634
|
-
return false;
|
|
1635
|
-
const { details, message, name, statusCode, url } = value;
|
|
1636
|
-
return (typeof name === 'string' &&
|
|
1637
|
-
typeof message === 'string' &&
|
|
1638
|
-
typeof url === 'string' &&
|
|
1639
|
-
(statusCode === undefined || typeof statusCode === 'number') &&
|
|
1640
|
-
(details === undefined || isObject(details)));
|
|
1641
|
-
}
|
|
1642
|
-
function isWorkerResponse(raw) {
|
|
1643
|
-
if (!isObject(raw))
|
|
1644
|
-
return false;
|
|
1645
|
-
if (typeof raw['id'] !== 'string')
|
|
1646
|
-
return false;
|
|
1647
|
-
if (raw['type'] === 'result') {
|
|
1648
|
-
return isWorkerResultPayload(raw['result']);
|
|
1649
|
-
}
|
|
1650
|
-
if (raw['type'] === 'error') {
|
|
1651
|
-
return isWorkerErrorPayload(raw['error']);
|
|
1652
|
-
}
|
|
1653
|
-
if (raw['type'] === 'cancelled') {
|
|
1654
|
-
return true;
|
|
1655
|
-
}
|
|
1656
|
-
return false;
|
|
1657
|
-
}
|
|
1658
|
-
function createTaskContext() {
|
|
1659
|
-
const runWithStore = AsyncLocalStorage.snapshot();
|
|
1660
|
-
const asyncResource = new AsyncResource('fetch-url-mcp.transform.task');
|
|
1661
|
-
let disposed = false;
|
|
1662
|
-
return {
|
|
1663
|
-
run: (fn) => {
|
|
1664
|
-
runWithStore(() => {
|
|
1665
|
-
asyncResource.runInAsyncScope(fn);
|
|
1666
|
-
});
|
|
1667
|
-
},
|
|
1668
|
-
dispose: () => {
|
|
1669
|
-
if (disposed)
|
|
1670
|
-
return;
|
|
1671
|
-
disposed = true;
|
|
1672
|
-
asyncResource.emitDestroy();
|
|
1673
|
-
},
|
|
1674
|
-
};
|
|
1675
|
-
}
|
|
1676
|
-
function buildWorkerDispatchPayload(task, supportsTransferList) {
|
|
1677
|
-
const message = {
|
|
1678
|
-
type: 'transform',
|
|
1679
|
-
id: task.id,
|
|
1680
|
-
url: task.url,
|
|
1681
|
-
includeMetadata: task.includeMetadata,
|
|
1682
|
-
...(task.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1683
|
-
...(task.inputTruncated ? { inputTruncated: true } : {}),
|
|
1684
|
-
};
|
|
1685
|
-
if (!task.htmlBuffer) {
|
|
1686
|
-
message.html = task.html;
|
|
1687
|
-
return { message };
|
|
1688
|
-
}
|
|
1689
|
-
const htmlBuffer = ensureTightBuffer(task.htmlBuffer);
|
|
1690
|
-
if (!supportsTransferList) {
|
|
1691
|
-
message.htmlBuffer = htmlBuffer;
|
|
1692
|
-
if (task.encoding)
|
|
1693
|
-
message.encoding = task.encoding;
|
|
1694
|
-
return { message };
|
|
1695
|
-
}
|
|
1696
|
-
const transferableHtmlBuffer = Uint8Array.from(htmlBuffer);
|
|
1697
|
-
message.htmlBuffer = transferableHtmlBuffer;
|
|
1698
|
-
if (task.encoding)
|
|
1699
|
-
message.encoding = task.encoding;
|
|
1700
|
-
const backingBuffer = transferableHtmlBuffer.buffer;
|
|
1701
|
-
if (isSharedArrayBuffer(backingBuffer))
|
|
1702
|
-
return { message };
|
|
1703
|
-
return { message, transferList: [backingBuffer] };
|
|
1704
|
-
}
|
|
1705
|
-
/**
|
|
1706
|
-
* Worker Pool Sizing Configuration
|
|
1707
|
-
*
|
|
1708
|
-
* Default: min(4, floor(availableParallelism() / 2)), constrained to [2, N]
|
|
1709
|
-
*
|
|
1710
|
-
* Tuning Guidance:
|
|
1711
|
-
* - **Default behavior**: Appropriate for most deployments. Uses half of available
|
|
1712
|
-
* CPU threads (capped at 4) to balance throughput with system resource availability.
|
|
1713
|
-
*
|
|
1714
|
-
* - **CPU-limited containers**: If running in a container with strict CPU limits
|
|
1715
|
-
* (e.g., Docker with --cpus=2), the default may over-subscribe. Consider setting
|
|
1716
|
-
* maxWorkerScale to match the container's CPU limit.
|
|
1717
|
-
*
|
|
1718
|
-
* - **High-concurrency workloads**: For dedicated servers handling many concurrent
|
|
1719
|
-
* fetch requests, increasing maxWorkerScale to (availableParallelism() + 2) may
|
|
1720
|
-
* improve throughput by overlapping I/O wait with computation.
|
|
1721
|
-
*
|
|
1722
|
-
* - **Memory-constrained environments**: Each worker allocates ~50-100MB for DOM
|
|
1723
|
-
* parsing. If memory is limited, reduce maxWorkerScale to (availableParallelism() / 2)
|
|
1724
|
-
* or lower to prevent OOM errors.
|
|
1725
|
-
*
|
|
1726
|
-
* - **Shared hosting**: On shared systems where CPU is contested, reducing the pool
|
|
1727
|
-
* size prevents starving other processes. Consider maxWorkerScale = 2 or using
|
|
1728
|
-
* process-based workers (TRANSFORM_WORKER_MODE=process) for better isolation.
|
|
1729
|
-
*
|
|
1730
|
-
* Configuration:
|
|
1731
|
-
* - TRANSFORM_MAX_WORKER_SCALE env var (default: availableParallelism())
|
|
1732
|
-
* - TRANSFORM_WORKER_MODE env var: 'threads' (default) or 'process'
|
|
1733
|
-
*
|
|
1734
|
-
* See config.ts for full worker configuration options.
|
|
1735
|
-
*/
|
|
1736
|
-
const POOL_MIN_WORKERS = Math.max(2, Math.min(4, Math.floor(availableParallelism() / 2)));
|
|
1737
|
-
const POOL_MAX_WORKERS = config.transform.maxWorkerScale;
|
|
1738
|
-
const POOL_SCALE_THRESHOLD = 0.5;
|
|
1739
|
-
const WORKER_NAME_PREFIX = 'fetch-url-mcp-transform';
|
|
1740
|
-
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1741
|
-
const TRANSFORM_CHILD_PATH = fileURLToPath(new URL('./workers/transform-child.js', import.meta.url));
|
|
1742
|
-
function ensureTightBuffer(buffer) {
|
|
1743
|
-
if (buffer.byteOffset === 0 &&
|
|
1744
|
-
buffer.byteLength === buffer.buffer.byteLength) {
|
|
1745
|
-
return buffer;
|
|
1746
|
-
}
|
|
1747
|
-
return Buffer.from(buffer);
|
|
1748
|
-
}
|
|
1749
|
-
function createThreadWorkerHost(_workerIndex, name) {
|
|
1750
|
-
const resourceLimits = config.transform.workerResourceLimits;
|
|
1751
|
-
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url), {
|
|
1752
|
-
name,
|
|
1753
|
-
...(resourceLimits ? { resourceLimits } : {}),
|
|
1754
|
-
});
|
|
1755
|
-
return {
|
|
1756
|
-
kind: 'thread',
|
|
1757
|
-
supportsTransferList: true,
|
|
1758
|
-
threadId: worker.threadId,
|
|
1759
|
-
postMessage: (message, transferList) => {
|
|
1760
|
-
worker.postMessage(message, transferList);
|
|
1761
|
-
},
|
|
1762
|
-
terminate: async () => {
|
|
1763
|
-
await worker.terminate();
|
|
1764
|
-
},
|
|
1765
|
-
unref: () => {
|
|
1766
|
-
worker.unref();
|
|
1767
|
-
},
|
|
1768
|
-
onMessage: (handler) => {
|
|
1769
|
-
worker.on('message', handler);
|
|
1770
|
-
},
|
|
1771
|
-
onError: (handler) => {
|
|
1772
|
-
worker.on('error', handler);
|
|
1773
|
-
worker.on('messageerror', handler);
|
|
1774
|
-
},
|
|
1775
|
-
onExit: (handler) => {
|
|
1776
|
-
worker.on('exit', (code) => {
|
|
1777
|
-
handler(code, null);
|
|
1778
|
-
});
|
|
1779
|
-
},
|
|
1780
|
-
};
|
|
1781
|
-
}
|
|
1782
|
-
function createProcessWorkerHost(workerIndex, name) {
|
|
1783
|
-
const child = fork(TRANSFORM_CHILD_PATH, [], {
|
|
1784
|
-
stdio: ['ignore', 'ignore', 'ignore', 'ipc'],
|
|
1785
|
-
serialization: 'advanced',
|
|
1786
|
-
env: {
|
|
1787
|
-
...process.env,
|
|
1788
|
-
FETCH_URL_MCP_WORKER_INDEX: String(workerIndex),
|
|
1789
|
-
FETCH_URL_MCP_WORKER_NAME: name,
|
|
1790
|
-
},
|
|
1791
|
-
});
|
|
1792
|
-
if (child.pid === undefined) {
|
|
1793
|
-
throw new Error('Failed to fork process');
|
|
1794
|
-
}
|
|
1795
|
-
return {
|
|
1796
|
-
kind: 'process',
|
|
1797
|
-
supportsTransferList: false,
|
|
1798
|
-
pid: child.pid,
|
|
1799
|
-
postMessage: (message) => {
|
|
1800
|
-
if (!child.connected) {
|
|
1801
|
-
throw new Error('Transform worker IPC channel is closed');
|
|
1802
|
-
}
|
|
1803
|
-
child.send(message);
|
|
1804
|
-
},
|
|
1805
|
-
terminate: () => new Promise((resolve) => {
|
|
1806
|
-
if (child.exitCode !== null || child.killed) {
|
|
1807
|
-
resolve();
|
|
1808
|
-
return;
|
|
1809
|
-
}
|
|
1810
|
-
child.once('exit', () => {
|
|
1811
|
-
resolve();
|
|
1812
|
-
});
|
|
1813
|
-
try {
|
|
1814
|
-
child.kill();
|
|
1815
|
-
}
|
|
1816
|
-
catch {
|
|
1817
|
-
resolve();
|
|
1818
|
-
}
|
|
1819
|
-
}),
|
|
1820
|
-
unref: () => {
|
|
1821
|
-
child.unref();
|
|
1822
|
-
},
|
|
1823
|
-
onMessage: (handler) => {
|
|
1824
|
-
child.on('message', handler);
|
|
1825
|
-
},
|
|
1826
|
-
onError: (handler) => {
|
|
1827
|
-
child.on('error', handler);
|
|
1828
|
-
},
|
|
1829
|
-
onExit: (handler) => {
|
|
1830
|
-
child.on('exit', (code, signal) => {
|
|
1831
|
-
handler(code, signal);
|
|
1832
|
-
});
|
|
1833
|
-
},
|
|
1834
|
-
};
|
|
1835
|
-
}
|
|
1836
|
-
/**
 * Pool of transform workers with a bounded FIFO queue.
 *
 * Tasks submitted through `transform()` are queued and dispatched to idle
 * worker slots. Workers are spawned lazily (up to `capacity`) through the
 * injected `spawnWorker` factory, and a worker is replaced whenever its task
 * times out, is aborted, or the worker reports an error/exit.
 *
 * Worker events are bound to the slot that registered them: events that
 * arrive from a worker that has already been replaced are ignored so they can
 * never fail or restart the replacement worker.
 */
class WorkerPool {
    static CLOSED_MESSAGE = 'Transform worker pool closed';
    /** Milliseconds `abortInflight` waits for a worker to acknowledge a cancel. */
    static CANCEL_ACK_TIMEOUT_MS = 200;
    workers = [];
    capacity;
    minCapacity = POOL_MIN_WORKERS;
    maxCapacity = POOL_MAX_WORKERS;
    queue = [];
    // Index of the next queued task; entries before it were already consumed.
    queueHead = 0;
    // Task id -> bookkeeping for tasks currently handed to a worker.
    inflight = new Map();
    // Task id -> pending cancel acknowledgement ({ promise, resolve, timeout }).
    cancelAcks = new Map();
    timeoutMs;
    queueMax;
    spawnWorkerImpl;
    closed = false;
    taskIdSeq = 0;
    /**
     * @param size Requested capacity; `0` is stored as-is, any other value is
     *   clamped to `[minCapacity, maxCapacity]`.
     * @param timeoutMs Per-task timeout passed to `createUnrefTimeout`.
     * @param spawnWorker Factory `(workerIndex, name) => host`.
     */
    constructor(size, timeoutMs, spawnWorker) {
        if (size === 0) {
            this.capacity = 0;
        }
        else {
            this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        }
        this.timeoutMs = timeoutMs;
        this.queueMax = this.maxCapacity * 32;
        this.spawnWorkerImpl = spawnWorker;
    }
    /**
     * Queues a transform task and resolves with the worker's result.
     *
     * @throws Error when the pool is closed.
     * @throws the abort error from `abortPolicy` when `options.signal` is already aborted.
     * @throws FetchError (503, reason `queue_full`) when the queue is at `queueMax`.
     */
    async transform(htmlOrBuffer, url, options) {
        this.ensureOpen();
        if (options.signal?.aborted)
            throw abortPolicy.createAbortError(url, 'transform:enqueue');
        if (this.getQueueDepth() >= this.queueMax) {
            throw new FetchError('Transform worker queue is full', url, 503, {
                reason: 'queue_full',
                stage: 'transform:enqueue',
            });
        }
        return new Promise((resolve, reject) => {
            const task = this.createPendingTask(htmlOrBuffer, url, options, resolve, reject);
            this.queue.push(task);
            this.drainQueue();
        });
    }
    /** Number of queued (not yet dispatched) tasks. */
    getQueueDepth() {
        const depth = this.queue.length - this.queueHead;
        return depth > 0 ? depth : 0;
    }
    /** Number of worker slots currently marked busy. */
    getActiveWorkers() {
        return this.workers.filter((s) => s?.busy).length;
    }
    /** Current capacity (maximum number of worker slots the pool will spawn). */
    getCapacity() {
        return this.capacity;
    }
    /** Sets a new capacity, clamped to `[minCapacity, maxCapacity]`, and drains the queue. */
    resize(size) {
        const newCapacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
        if (newCapacity === this.capacity)
            return;
        this.capacity = newCapacity;
        this.drainQueue();
    }
    /**
     * Closes the pool: terminates every worker and rejects all in-flight and
     * queued tasks with `CLOSED_MESSAGE`. Subsequent calls are no-ops.
     */
    async close() {
        if (this.closed)
            return;
        this.closed = true;
        const terminations = this.workers
            .map((slot) => slot?.host.terminate())
            .filter((p) => p !== undefined);
        this.workers.fill(undefined);
        this.workers.length = 0;
        // Snapshot the keys: takeInflight mutates the map while we iterate.
        for (const id of Array.from(this.inflight.keys())) {
            const inflight = this.takeInflight(id);
            if (!inflight)
                continue;
            this.finalizeTask(inflight.context, () => {
                inflight.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
        }
        for (let i = this.queueHead; i < this.queue.length; i += 1) {
            const task = this.queue[i];
            if (!task)
                continue;
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
        }
        this.queue.length = 0;
        this.queueHead = 0;
        await Promise.allSettled(terminations);
    }
    /** @throws Error with `CLOSED_MESSAGE` when the pool has been closed. */
    ensureOpen() {
        if (this.closed)
            throw new Error(WorkerPool.CLOSED_MESSAGE);
    }
    /**
     * Builds the queue entry for a task and, when a signal is supplied,
     * registers a one-shot abort listener that routes to `onAbortSignal`.
     */
    createPendingTask(htmlOrBuffer, url, options, resolve, reject) {
        const id = (this.taskIdSeq++).toString(36);
        // Preserve request context for resolve/reject even when callbacks fire
        // from worker thread events.
        const context = createTaskContext();
        let abortListener;
        if (options.signal) {
            abortListener = () => {
                this.onAbortSignal(id, url, context, reject);
            };
            options.signal.addEventListener('abort', abortListener, { once: true });
        }
        const task = {
            id,
            url,
            includeMetadata: options.includeMetadata,
            ...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
            ...(options.inputTruncated ? { inputTruncated: true } : {}),
            signal: options.signal,
            abortListener,
            context,
            resolve,
            reject,
        };
        if (typeof htmlOrBuffer === 'string') {
            task.html = htmlOrBuffer;
        }
        else {
            task.htmlBuffer = htmlOrBuffer;
            if (options.encoding) {
                task.encoding = options.encoding;
            }
        }
        return task;
    }
    /**
     * Abort-signal handler for a task: cancels it on its worker when it is
     * in flight, or removes it from the queue and rejects it when still queued.
     */
    onAbortSignal(id, url, context, reject) {
        if (this.closed) {
            this.finalizeTask(context, () => {
                reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
            return;
        }
        const inflight = this.inflight.get(id);
        if (inflight) {
            void this.abortInflight(id, url, inflight.workerIndex);
            return;
        }
        const queuedIndex = this.findQueuedIndex(id);
        if (queuedIndex !== null) {
            const task = this.queue[queuedIndex];
            if (task)
                this.clearAbortListener(task.signal, task.abortListener);
            this.queue.splice(queuedIndex, 1);
            if (task) {
                this.finalizeTask(task.context, () => {
                    task.reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
                });
            }
            else {
                this.finalizeTask(context, () => {
                    reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
                });
            }
            this.maybeCompactQueue();
        }
    }
    /** Settles the pending cancel acknowledgement for `id`, if one exists. */
    resolveCancelAck(id) {
        const pending = this.cancelAcks.get(id);
        if (!pending)
            return;
        pending.timeout.cancel();
        pending.resolve();
    }
    /**
     * Returns a promise that settles when the worker acknowledges the cancel
     * for `id` or when `CANCEL_ACK_TIMEOUT_MS` elapses, whichever comes first.
     * Repeated calls for the same id share one promise.
     */
    waitForCancelAck(id) {
        const existing = this.cancelAcks.get(id);
        if (existing) {
            return existing.promise;
        }
        let resolve = () => { };
        const timeout = createUnrefTimeout(WorkerPool.CANCEL_ACK_TIMEOUT_MS, undefined);
        const racePromise = new Promise((finish) => {
            resolve = finish;
        });
        const promise = Promise.race([racePromise, timeout.promise]).finally(() => {
            this.cancelAcks.delete(id);
            timeout.cancel();
        });
        this.cancelAcks.set(id, { promise, resolve, timeout });
        return promise;
    }
    /**
     * Cancels an in-flight task: asks the worker to cancel, waits for the
     * acknowledgement (bounded), rejects the task with an abort error and
     * replaces the worker.
     */
    async abortInflight(id, url, workerIndex) {
        const slot = this.workers[workerIndex];
        const inflight = this.inflight.get(id);
        if (inflight) {
            inflight.cancelPending = true;
        }
        if (slot) {
            try {
                slot.host.postMessage({ type: 'cancel', id });
            }
            catch {
                // Worker may be unavailable; failure is acceptable during abort
            }
        }
        await this.waitForCancelAck(id);
        // While waiting, the task may already have been finalized (timeout,
        // worker failure, pool close). Whoever finalized it also handled the
        // worker, so restarting again here would replace a healthy worker.
        const stillInflight = inflight !== undefined && this.inflight.get(id) === inflight;
        this.failTask(id, abortPolicy.createAbortError(url, 'transform:signal-abort'));
        if (slot && stillInflight)
            this.restartWorker(workerIndex, slot);
    }
    /** Removes an abort listener; a missing signal or listener is a no-op. */
    clearAbortListener(signal, listener) {
        if (!signal || !listener)
            return;
        try {
            signal.removeEventListener('abort', listener);
        }
        catch {
            // Defensive: removeEventListener should not throw, but handle edge cases
        }
    }
    /**
     * Spawns a worker host for `workerIndex` and returns its (idle) slot.
     * The caller is expected to install the returned slot at
     * `workers[workerIndex]`; events are only honoured while it stays there.
     */
    spawnWorker(workerIndex) {
        const name = `${WORKER_NAME_PREFIX}-${workerIndex + 1}`;
        const host = this.spawnWorkerImpl(workerIndex, name);
        host.unref();
        const slot = { host, busy: false, currentTaskId: null, name };
        // Guard against late events from a worker that was already replaced:
        // without this, the exit of a terminated worker would be attributed to
        // whichever worker now occupies the same index.
        const isCurrent = () => this.workers[workerIndex] === slot;
        host.onMessage((raw) => {
            if (!isCurrent())
                return;
            this.onWorkerMessage(workerIndex, raw);
        });
        host.onError((error) => {
            if (!isCurrent())
                return;
            this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
        });
        host.onExit((code, signal) => {
            if (!isCurrent())
                return;
            const suffix = signal ? `signal ${signal}` : `code ${code ?? 'unknown'}`;
            this.onWorkerBroken(workerIndex, `Transform worker exited (${suffix})`);
        });
        return slot;
    }
    /** Logs a worker failure, fails the task it was running and replaces the worker. */
    onWorkerBroken(workerIndex, message) {
        if (this.closed)
            return;
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        logWarn('Transform worker unavailable; restarting', {
            reason: message,
            workerIndex,
            workerKind: slot.host.kind,
            workerName: slot.name,
            ...(slot.host.kind === 'process'
                ? { pid: slot.host.pid }
                : { threadId: slot.host.threadId }),
        });
        if (slot.busy && slot.currentTaskId) {
            this.failTask(slot.currentTaskId, new Error(message));
        }
        this.restartWorker(workerIndex, slot);
    }
    /**
     * Terminates `slot` (or the worker currently at `workerIndex` when no slot
     * is given) and spawns a replacement at that index, then drains the queue.
     * When `slot` has already been replaced, only the stale worker is
     * terminated and the live worker at that index is left untouched.
     */
    restartWorker(workerIndex, slot) {
        if (this.closed)
            return;
        const current = this.workers[workerIndex];
        const target = slot ?? current;
        if (target) {
            target.host.terminate().catch(() => undefined);
        }
        if (slot && current && current !== slot) {
            // Someone else already replaced this worker; overwriting the index
            // would orphan the live worker without terminating it.
            this.drainQueue();
            return;
        }
        this.workers[workerIndex] = this.spawnWorker(workerIndex);
        this.drainQueue();
    }
    /**
     * Handles a message from the worker at `workerIndex`: cancel
     * acknowledgements settle the pending ack; results/errors settle the
     * matching in-flight task and free the worker.
     */
    onWorkerMessage(workerIndex, raw) {
        if (!isWorkerResponse(raw))
            return;
        const message = raw;
        if (message.type === 'cancelled') {
            this.resolveCancelAck(message.id);
            return;
        }
        // A result/error for a task that is being cancelled counts as the ack;
        // abortInflight is responsible for rejecting the task.
        const inflightPeek = this.inflight.get(message.id);
        if (inflightPeek?.cancelPending) {
            this.resolveCancelAck(message.id);
            return;
        }
        const inflight = this.takeInflight(message.id);
        if (!inflight)
            return;
        this.markIdle(workerIndex);
        if (message.type === 'result') {
            this.finalizeTask(inflight.context, () => {
                inflight.resolve({
                    markdown: message.result.markdown,
                    truncated: message.result.truncated,
                    title: message.result.title,
                    ...(message.result.metadata
                        ? { metadata: message.result.metadata }
                        : {}),
                });
            });
        }
        else {
            const err = message.error;
            if (err.name === 'FetchError') {
                this.finalizeTask(inflight.context, () => {
                    inflight.reject(new FetchError(err.message, err.url, err.statusCode, err.details ?? {}));
                });
            }
            else {
                this.finalizeTask(inflight.context, () => {
                    inflight.reject(new Error(err.message));
                });
            }
        }
        this.drainQueue();
    }
    /**
     * Removes and returns the in-flight entry for `id` (cancelling its timeout
     * and abort listener), or `null` when there is none.
     */
    takeInflight(id) {
        const inflight = this.inflight.get(id);
        if (!inflight)
            return null;
        inflight.timeout.cancel();
        this.clearAbortListener(inflight.signal, inflight.abortListener);
        this.inflight.delete(id);
        return inflight;
    }
    /** Marks the slot at `workerIndex` as idle, if it exists. */
    markIdle(workerIndex) {
        const slot = this.workers[workerIndex];
        if (!slot)
            return;
        slot.busy = false;
        slot.currentTaskId = null;
    }
    /** Rejects the in-flight task `id` with `error` and frees its worker slot. */
    failTask(id, error) {
        const inflight = this.takeInflight(id);
        if (!inflight)
            return;
        this.finalizeTask(inflight.context, () => {
            inflight.reject(error);
        });
        this.markIdle(inflight.workerIndex);
    }
    /** Grows capacity by one when the queue exceeds `capacity * POOL_SCALE_THRESHOLD`. */
    maybeScaleUp() {
        if (this.getQueueDepth() > this.capacity * POOL_SCALE_THRESHOLD &&
            this.capacity < this.maxCapacity) {
            this.capacity += 1;
        }
    }
    /**
     * Dispatches queued tasks to idle workers, spawning at most one new worker
     * per call; further spawns are deferred with `setImmediate`.
     */
    drainQueue() {
        if (this.closed || this.getQueueDepth() === 0)
            return;
        this.maybeScaleUp();
        for (let i = 0; i < this.workers.length; i += 1) {
            const slot = this.workers[i];
            if (slot && !slot.busy) {
                this.dispatchFromQueue(i, slot);
                if (this.getQueueDepth() === 0)
                    return;
            }
        }
        if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
            const workerIndex = this.workers.length;
            const slot = this.spawnWorker(workerIndex);
            this.workers.push(slot);
            this.dispatchFromQueue(workerIndex, slot);
            if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
                setImmediate(() => {
                    this.drainQueue();
                });
            }
        }
    }
    /** Pops the next queued task, or returns `null` when the queue is empty. */
    takeNextQueuedTask() {
        while (this.queueHead < this.queue.length) {
            const task = this.queue[this.queueHead];
            this.queueHead += 1;
            if (task) {
                this.maybeCompactQueue();
                return task;
            }
        }
        this.maybeCompactQueue();
        return null;
    }
    /**
     * Takes the next queued task and hands it to `slot`, arming the per-task
     * timeout. Tasks whose signal is already aborted (or a closed pool) are
     * rejected without being dispatched.
     */
    dispatchFromQueue(workerIndex, slot) {
        const task = this.takeNextQueuedTask();
        if (!task)
            return;
        if (this.closed) {
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
            });
            return;
        }
        if (task.signal?.aborted) {
            this.clearAbortListener(task.signal, task.abortListener);
            this.finalizeTask(task.context, () => {
                task.reject(abortPolicy.createAbortError(task.url, 'transform:dispatch'));
            });
            return;
        }
        slot.busy = true;
        slot.currentTaskId = task.id;
        const timeout = createUnrefTimeout(this.timeoutMs, null);
        void timeout.promise
            .then(() => {
            try {
                slot.host.postMessage({ type: 'cancel', id: task.id });
            }
            catch {
                // Worker may be unavailable; proceed with timeout handling
            }
            const inflight = this.takeInflight(task.id);
            if (!inflight)
                return;
            this.finalizeTask(inflight.context, () => {
                inflight.reject(new FetchError('Request timeout', task.url, 504, {
                    reason: 'timeout',
                    stage: 'transform:worker-timeout',
                }));
            });
            this.restartWorker(workerIndex, slot);
        })
            .catch((error) => {
            this.failTask(task.id, error);
        });
        this.inflight.set(task.id, {
            resolve: task.resolve,
            reject: task.reject,
            timeout,
            signal: task.signal,
            abortListener: task.abortListener,
            workerIndex,
            context: task.context,
            cancelPending: false,
        });
        try {
            const { message, transferList } = buildWorkerDispatchPayload(task, slot.host.supportsTransferList);
            slot.host.postMessage(message, transferList);
        }
        catch (error) {
            // Dispatch failed: undo the in-flight bookkeeping, reject the task
            // and replace the worker.
            timeout.cancel();
            this.clearAbortListener(task.signal, task.abortListener);
            this.inflight.delete(task.id);
            this.markIdle(workerIndex);
            this.finalizeTask(task.context, () => {
                task.reject(error instanceof Error
                    ? error
                    : new Error('Failed to dispatch transform worker message'));
            });
            this.restartWorker(workerIndex, slot);
        }
    }
    /** Runs `fn` inside the task context and always disposes the context afterwards. */
    finalizeTask(context, fn) {
        try {
            context.run(fn);
        }
        finally {
            context.dispose();
        }
    }
    /** Index of the queued task with `id`, or `null` when it is not queued. */
    findQueuedIndex(id) {
        for (let i = this.queueHead; i < this.queue.length; i += 1) {
            const task = this.queue[i];
            if (task?.id === id)
                return i;
        }
        return null;
    }
    /**
     * Drops already-consumed entries from the front of the queue array once it
     * is fully consumed, or once more than 1024 entries and more than half of
     * the array have been consumed.
     */
    maybeCompactQueue() {
        if (this.queueHead === 0)
            return;
        if (this.queueHead >= this.queue.length ||
            (this.queueHead > 1024 && this.queueHead > this.queue.length / 2)) {
            this.queue.splice(0, this.queueHead);
            this.queueHead = 0;
        }
    }
}
|
|
2301
|
-
// Module-level WorkerPool singleton: created on demand by
// getOrCreateWorkerPool() and reset to null by shutdownWorkerPool().
let workerPool = null;
|
|
2302
|
-
/**
 * Picks the worker host factory matching `config.transform.workerMode`:
 * the process-based factory for `'process'`, the thread-based one otherwise.
 */
function resolveWorkerSpawner() {
    if (config.transform.workerMode === 'process') {
        return createProcessWorkerHost;
    }
    return createThreadWorkerHost;
}
|
|
2307
|
-
/**
 * Returns the shared worker pool, creating it on first use.
 * The initial size is 0 when `config.transform.maxWorkerScale` is 0 and
 * `POOL_MIN_WORKERS` otherwise.
 */
function getOrCreateWorkerPool() {
    const initialSize = config.transform.maxWorkerScale === 0 ? 0 : POOL_MIN_WORKERS;
    if (workerPool === null || workerPool === undefined) {
        workerPool = new WorkerPool(initialSize, DEFAULT_TIMEOUT_MS, resolveWorkerSpawner());
    }
    return workerPool;
}
|
|
2312
|
-
/**
 * Snapshot of the shared pool's queue depth, busy worker count and capacity,
 * or `null` when no pool has been created.
 */
function getWorkerPoolStats() {
    const pool = workerPool;
    if (!pool) {
        return null;
    }
    const queueDepth = pool.getQueueDepth();
    const activeWorkers = pool.getActiveWorkers();
    const capacity = pool.getCapacity();
    return { queueDepth, activeWorkers, capacity };
}
|
|
2321
|
-
/**
 * Closes the shared worker pool (if one exists) and clears the singleton
 * once the close has completed.
 */
async function shutdownWorkerPool() {
    if (workerPool) {
        await workerPool.close();
        workerPool = null;
    }
}
|
|
2327
1630
|
/**
 * Public accessor for the transform pool statistics; forwards to
 * `getWorkerPoolStats()` and returns its result unchanged.
 */
export function getTransformPoolStats() {
    const stats = getWorkerPoolStats();
    return stats;
}
|
|
@@ -2410,3 +1713,4 @@ export async function transformHtmlToMarkdown(html, url, options) {
|
|
|
2410
1713
|
/**
 * Buffer entry point for the HTML-to-Markdown transform; forwards its
 * arguments unchanged to `transformInputToMarkdown`.
 */
export async function transformBufferToMarkdown(htmlBuffer, url, options) {
    const pending = transformInputToMarkdown(htmlBuffer, url, options);
    return pending;
}
|
|
1716
|
+
//# sourceMappingURL=transform.js.map
|