@j0hanz/fetch-url-mcp 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. package/dist/cache.d.ts +9 -3
  2. package/dist/cache.d.ts.map +1 -0
  3. package/dist/cache.js +54 -119
  4. package/dist/cache.js.map +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.d.ts.map +1 -0
  7. package/dist/cli.js +7 -4
  8. package/dist/cli.js.map +1 -0
  9. package/dist/config.d.ts +2 -3
  10. package/dist/config.d.ts.map +1 -0
  11. package/dist/config.js +19 -27
  12. package/dist/config.js.map +1 -0
  13. package/dist/crypto.d.ts +1 -0
  14. package/dist/crypto.d.ts.map +1 -0
  15. package/dist/crypto.js +7 -3
  16. package/dist/crypto.js.map +1 -0
  17. package/dist/dom-noise-removal.d.ts +2 -1
  18. package/dist/dom-noise-removal.d.ts.map +1 -0
  19. package/dist/dom-noise-removal.js +9 -6
  20. package/dist/dom-noise-removal.js.map +1 -0
  21. package/dist/download.d.ts +4 -0
  22. package/dist/download.d.ts.map +1 -0
  23. package/dist/download.js +106 -0
  24. package/dist/download.js.map +1 -0
  25. package/dist/errors.d.ts +1 -0
  26. package/dist/errors.d.ts.map +1 -0
  27. package/dist/errors.js +2 -1
  28. package/dist/errors.js.map +1 -0
  29. package/dist/examples/mcp-fetch-url-client.js +19 -3
  30. package/dist/examples/mcp-fetch-url-client.js.map +1 -1
  31. package/dist/fetch-content.d.ts +1 -0
  32. package/dist/fetch-content.d.ts.map +1 -0
  33. package/dist/fetch-content.js +15 -14
  34. package/dist/fetch-content.js.map +1 -0
  35. package/dist/fetch-stream.d.ts +1 -0
  36. package/dist/fetch-stream.d.ts.map +1 -0
  37. package/dist/fetch-stream.js +1 -0
  38. package/dist/fetch-stream.js.map +1 -0
  39. package/dist/fetch.d.ts +1 -0
  40. package/dist/fetch.d.ts.map +1 -0
  41. package/dist/fetch.js +123 -54
  42. package/dist/fetch.js.map +1 -0
  43. package/dist/host-normalization.d.ts +1 -0
  44. package/dist/host-normalization.d.ts.map +1 -0
  45. package/dist/host-normalization.js +22 -9
  46. package/dist/host-normalization.js.map +1 -0
  47. package/dist/http/auth.d.ts +51 -0
  48. package/dist/http/auth.d.ts.map +1 -0
  49. package/dist/http/auth.js +344 -0
  50. package/dist/http/auth.js.map +1 -0
  51. package/dist/http/health.d.ts +7 -0
  52. package/dist/http/health.d.ts.map +1 -0
  53. package/dist/http/health.js +156 -0
  54. package/dist/http/health.js.map +1 -0
  55. package/dist/http/helpers.d.ts +58 -0
  56. package/dist/http/helpers.d.ts.map +1 -0
  57. package/dist/http/helpers.js +370 -0
  58. package/dist/http/helpers.js.map +1 -0
  59. package/dist/{http-native.d.ts → http/native.d.ts} +1 -0
  60. package/dist/http/native.d.ts.map +1 -0
  61. package/dist/http/native.js +618 -0
  62. package/dist/http/native.js.map +1 -0
  63. package/dist/http/rate-limit.d.ts +13 -0
  64. package/dist/http/rate-limit.d.ts.map +1 -0
  65. package/dist/http/rate-limit.js +92 -0
  66. package/dist/http/rate-limit.js.map +1 -0
  67. package/dist/index.d.ts +1 -0
  68. package/dist/index.d.ts.map +1 -0
  69. package/dist/index.js +20 -14
  70. package/dist/index.js.map +1 -0
  71. package/dist/instructions.d.ts +2 -0
  72. package/dist/instructions.d.ts.map +1 -0
  73. package/dist/instructions.js +41 -0
  74. package/dist/instructions.js.map +1 -0
  75. package/dist/ip-blocklist.d.ts +1 -0
  76. package/dist/ip-blocklist.d.ts.map +1 -0
  77. package/dist/ip-blocklist.js +13 -8
  78. package/dist/ip-blocklist.js.map +1 -0
  79. package/dist/json.d.ts +2 -1
  80. package/dist/json.d.ts.map +1 -0
  81. package/dist/json.js +16 -6
  82. package/dist/json.js.map +1 -0
  83. package/dist/language-detection.d.ts +1 -0
  84. package/dist/language-detection.d.ts.map +1 -0
  85. package/dist/language-detection.js +2 -7
  86. package/dist/language-detection.js.map +1 -0
  87. package/dist/markdown-cleanup.d.ts +2 -1
  88. package/dist/markdown-cleanup.d.ts.map +1 -0
  89. package/dist/markdown-cleanup.js +52 -54
  90. package/dist/markdown-cleanup.js.map +1 -0
  91. package/dist/mcp-validator.d.ts +1 -0
  92. package/dist/mcp-validator.d.ts.map +1 -0
  93. package/dist/mcp-validator.js +20 -18
  94. package/dist/mcp-validator.js.map +1 -0
  95. package/dist/mcp.d.ts +2 -2
  96. package/dist/mcp.d.ts.map +1 -0
  97. package/dist/mcp.js +35 -344
  98. package/dist/mcp.js.map +1 -0
  99. package/dist/observability.d.ts +2 -0
  100. package/dist/observability.d.ts.map +1 -0
  101. package/dist/observability.js +32 -6
  102. package/dist/observability.js.map +1 -0
  103. package/dist/prompts.d.ts +1 -0
  104. package/dist/prompts.d.ts.map +1 -0
  105. package/dist/prompts.js +15 -3
  106. package/dist/prompts.js.map +1 -0
  107. package/dist/resources.d.ts +1 -0
  108. package/dist/resources.d.ts.map +1 -0
  109. package/dist/resources.js +46 -25
  110. package/dist/resources.js.map +1 -0
  111. package/dist/server-tuning.d.ts +1 -0
  112. package/dist/server-tuning.d.ts.map +1 -0
  113. package/dist/server-tuning.js +14 -17
  114. package/dist/server-tuning.js.map +1 -0
  115. package/dist/server.d.ts +1 -0
  116. package/dist/server.d.ts.map +1 -0
  117. package/dist/server.js +29 -35
  118. package/dist/server.js.map +1 -0
  119. package/dist/session.d.ts +2 -0
  120. package/dist/session.d.ts.map +1 -0
  121. package/dist/session.js +58 -29
  122. package/dist/session.js.map +1 -0
  123. package/dist/tasks/execution.d.ts +42 -0
  124. package/dist/tasks/execution.d.ts.map +1 -0
  125. package/dist/tasks/execution.js +241 -0
  126. package/dist/tasks/execution.js.map +1 -0
  127. package/dist/{tasks.d.ts → tasks/manager.d.ts} +12 -0
  128. package/dist/tasks/manager.d.ts.map +1 -0
  129. package/dist/{tasks.js → tasks/manager.js} +95 -43
  130. package/dist/tasks/manager.js.map +1 -0
  131. package/dist/tasks/owner.d.ts +32 -0
  132. package/dist/tasks/owner.d.ts.map +1 -0
  133. package/dist/tasks/owner.js +92 -0
  134. package/dist/tasks/owner.js.map +1 -0
  135. package/dist/timer-utils.d.ts +1 -0
  136. package/dist/timer-utils.d.ts.map +1 -0
  137. package/dist/timer-utils.js +8 -4
  138. package/dist/timer-utils.js.map +1 -0
  139. package/dist/tool-errors.d.ts +12 -0
  140. package/dist/tool-errors.d.ts.map +1 -0
  141. package/dist/tool-errors.js +55 -0
  142. package/dist/tool-errors.js.map +1 -0
  143. package/dist/tool-pipeline.d.ts +72 -0
  144. package/dist/tool-pipeline.d.ts.map +1 -0
  145. package/dist/tool-pipeline.js +408 -0
  146. package/dist/tool-pipeline.js.map +1 -0
  147. package/dist/tool-progress.d.ts +32 -0
  148. package/dist/tool-progress.d.ts.map +1 -0
  149. package/dist/tool-progress.js +129 -0
  150. package/dist/tool-progress.js.map +1 -0
  151. package/dist/tools.d.ts +35 -111
  152. package/dist/tools.d.ts.map +1 -0
  153. package/dist/tools.js +150 -610
  154. package/dist/tools.js.map +1 -0
  155. package/dist/{transform.d.ts → transform/transform.d.ts} +2 -1
  156. package/dist/transform/transform.d.ts.map +1 -0
  157. package/dist/{transform.js → transform/transform.js} +81 -771
  158. package/dist/transform/transform.js.map +1 -0
  159. package/dist/{transform-types.d.ts → transform/types.d.ts} +2 -0
  160. package/dist/transform/types.d.ts.map +1 -0
  161. package/dist/{transform-types.js → transform/types.js} +1 -0
  162. package/dist/transform/types.js.map +1 -0
  163. package/dist/transform/worker-pool.d.ts +93 -0
  164. package/dist/transform/worker-pool.d.ts.map +1 -0
  165. package/dist/transform/worker-pool.js +757 -0
  166. package/dist/transform/worker-pool.js.map +1 -0
  167. package/dist/transform/workers/transform-child.d.ts +2 -0
  168. package/dist/transform/workers/transform-child.d.ts.map +1 -0
  169. package/dist/{workers → transform/workers}/transform-child.js +17 -13
  170. package/dist/transform/workers/transform-child.js.map +1 -0
  171. package/dist/transform/workers/transform-worker.d.ts +2 -0
  172. package/dist/transform/workers/transform-worker.d.ts.map +1 -0
  173. package/dist/{workers → transform/workers}/transform-worker.js +16 -13
  174. package/dist/transform/workers/transform-worker.js.map +1 -0
  175. package/dist/type-guards.d.ts +1 -0
  176. package/dist/type-guards.d.ts.map +1 -0
  177. package/dist/type-guards.js +4 -4
  178. package/dist/type-guards.js.map +1 -0
  179. package/package.json +6 -7
  180. package/dist/AGENTS.md +0 -152
  181. package/dist/http-native.js +0 -1320
  182. package/dist/instructions.md +0 -113
  183. package/dist/workers/transform-child.d.ts +0 -1
  184. package/dist/workers/transform-worker.d.ts +0 -1
@@ -1,24 +1,18 @@
1
- import { AsyncLocalStorage, AsyncResource } from 'node:async_hooks';
2
1
  import { Buffer } from 'node:buffer';
3
- import { fork } from 'node:child_process';
4
2
  import diagnosticsChannel from 'node:diagnostics_channel';
5
- import { availableParallelism } from 'node:os';
6
3
  import { performance } from 'node:perf_hooks';
7
- import { fileURLToPath } from 'node:url';
8
- import { isSharedArrayBuffer } from 'node:util/types';
9
- import { Worker, } from 'node:worker_threads';
4
+ import { isProbablyReaderable, Readability } from '@mozilla/readability';
10
5
  import { parseHTML } from 'linkedom';
11
6
  import { NodeHtmlMarkdown, } from 'node-html-markdown';
12
- import { isProbablyReaderable, Readability } from '@mozilla/readability';
13
- import { config } from './config.js';
14
- import { removeNoiseFromHtml } from './dom-noise-removal.js';
15
- import { FetchError, getErrorMessage } from './errors.js';
16
- import { isRawTextContentUrl } from './fetch.js';
17
- import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
18
- import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, } from './markdown-cleanup.js';
19
- import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
20
- import { createUnrefTimeout } from './timer-utils.js';
21
- import { isLikeNode, isObject } from './type-guards.js';
7
+ import { config } from '../config.js';
8
+ import { removeNoiseFromHtml } from '../dom-noise-removal.js';
9
+ import { FetchError, getErrorMessage } from '../errors.js';
10
+ import { isRawTextContentUrl } from '../fetch.js';
11
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../language-detection.js';
12
+ import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, } from '../markdown-cleanup.js';
13
+ import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../observability.js';
14
+ import { isLikeNode, isObject } from '../type-guards.js';
15
+ import { getOrCreateWorkerPool, getWorkerPoolStats, shutdownWorkerPool, } from './worker-pool.js';
22
16
  const utf8Decoder = new TextDecoder('utf-8');
23
17
  function decodeInput(input, encoding) {
24
18
  if (typeof input === 'string')
@@ -84,7 +78,7 @@ function buildTransformSignal(signal) {
84
78
  class StageTracker {
85
79
  channel = diagnosticsChannel.channel('fetch-url-mcp.transform');
86
80
  start(url, stage, budget) {
87
- if (!this.channel.hasSubscribers && !budget)
81
+ if (this.shouldSkipTracking(budget))
88
82
  return null;
89
83
  const remainingBudgetMs = budget
90
84
  ? budget.totalBudgetMs - budget.elapsedMs
@@ -136,7 +130,7 @@ class StageTracker {
136
130
  return durationMs;
137
131
  }
138
132
  run(url, stage, fn, budget) {
139
- if (!this.channel.hasSubscribers && !budget) {
133
+ if (this.shouldSkipTracking(budget)) {
140
134
  return fn();
141
135
  }
142
136
  if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
@@ -156,6 +150,9 @@ class StageTracker {
156
150
  }
157
151
  }
158
152
  async runAsync(url, stage, fn) {
153
+ if (this.shouldSkipTracking()) {
154
+ return fn();
155
+ }
159
156
  const ctx = this.start(url, stage);
160
157
  try {
161
158
  return await fn();
@@ -164,6 +161,9 @@ class StageTracker {
164
161
  this.end(ctx);
165
162
  }
166
163
  }
164
+ shouldSkipTracking(budget) {
165
+ return !this.channel.hasSubscribers && !budget;
166
+ }
167
167
  publish(event) {
168
168
  if (!this.channel.hasSubscribers)
169
169
  return;
@@ -219,26 +219,27 @@ function truncateHtml(html, inputTruncated = false) {
219
219
  const maxSize = config.constants.maxHtmlSize;
220
220
  if (maxSize <= 0)
221
221
  return { html, truncated: false };
222
- // Fast path: V8 optimized byte length check (no allocation)
223
- const byteLength = Buffer.byteLength(html, 'utf8');
224
- if (byteLength <= maxSize && !inputTruncated)
225
- return { html, truncated: false };
222
+ if (html.length <= maxSize) {
223
+ const byteLength = getUtf8ByteLength(html);
224
+ if (byteLength <= maxSize && !inputTruncated)
225
+ return { html, truncated: false };
226
+ }
226
227
  const sliced = html.slice(0, maxSize);
227
- if (Buffer.byteLength(sliced, 'utf8') <= maxSize) {
228
+ if (getUtf8ByteLength(sliced) <= maxSize) {
228
229
  return { html: trimDanglingTagFragment(sliced), truncated: true };
229
230
  }
230
231
  const htmlBuffer = Buffer.from(sliced, 'utf8');
231
232
  const content = trimDanglingTagFragment(trimUtf8Buffer(htmlBuffer, maxSize).toString('utf8'));
232
233
  logWarn('HTML content exceeds maximum size, truncating', {
233
- size: byteLength,
234
+ size: getUtf8ByteLength(html),
234
235
  maxSize,
235
- truncatedSize: Buffer.byteLength(content, 'utf8'),
236
+ truncatedSize: getUtf8ByteLength(content),
236
237
  });
237
238
  return { html: content, truncated: true };
238
239
  }
239
240
  function willTruncate(html) {
240
241
  const maxSize = config.constants.maxHtmlSize;
241
- return maxSize > 0 && getUtf8ByteLength(html) > maxSize;
242
+ return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
242
243
  }
243
244
  const HEAD_END_PATTERN = /<\/head\s*>|<body\b/i;
244
245
  const MAX_HEAD_SCAN_LENGTH = 50_000;
@@ -694,11 +695,12 @@ function buildInlineCodeTranslator() {
694
695
  };
695
696
  }
696
697
  function buildCodeTranslator(ctx) {
698
+ const inlineCodeTranslator = buildInlineCodeTranslator();
697
699
  if (!isObject(ctx))
698
- return buildInlineCodeTranslator();
700
+ return inlineCodeTranslator;
699
701
  const { parent } = ctx;
700
702
  if (!isCodeBlock(parent))
701
- return buildInlineCodeTranslator();
703
+ return inlineCodeTranslator;
702
704
  return { noEscape: true, preserveWhitespace: true };
703
705
  }
704
706
  function extractFirstSrcsetUrl(srcset) {
@@ -713,14 +715,17 @@ const LAZY_SRC_ATTRIBUTES = [
713
715
  'data-original',
714
716
  'data-srcset',
715
717
  ];
718
+ function isDataUri(value) {
719
+ return value.startsWith('data:');
720
+ }
716
721
  function extractNonDataSrcsetUrl(value) {
717
722
  const url = extractFirstSrcsetUrl(value);
718
- return url && !url.startsWith('data:') ? url : undefined;
723
+ return url && !isDataUri(url) ? url : undefined;
719
724
  }
720
725
  function resolveLazySrc(getAttribute) {
721
726
  for (const attr of LAZY_SRC_ATTRIBUTES) {
722
727
  const lazy = getAttribute(attr);
723
- if (!lazy || lazy.startsWith('data:'))
728
+ if (!lazy || isDataUri(lazy))
724
729
  continue;
725
730
  if (attr === 'data-srcset') {
726
731
  const url = extractNonDataSrcsetUrl(lazy);
@@ -736,7 +741,7 @@ function resolveImageSrc(getAttribute) {
736
741
  if (!getAttribute)
737
742
  return '';
738
743
  const srcRaw = getAttribute('src') ?? '';
739
- if (srcRaw && !srcRaw.startsWith('data:'))
744
+ if (srcRaw && !isDataUri(srcRaw))
740
745
  return srcRaw;
741
746
  // First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
742
747
  const lazySrc = resolveLazySrc(getAttribute);
@@ -750,7 +755,7 @@ function resolveImageSrc(getAttribute) {
750
755
  return url;
751
756
  }
752
757
  // If the only available src is a data URI, we choose to omit it rather than include the raw data in the alt text or URL, as data URIs can be very long and are not useful in Markdown output.
753
- if (srcRaw.startsWith('data:'))
758
+ if (isDataUri(srcRaw))
754
759
  return '[data URI removed]';
755
760
  return '';
756
761
  }
@@ -1099,7 +1104,7 @@ function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
1099
1104
  }
1100
1105
  return output;
1101
1106
  }
1102
- function resolveRelativeUrls(markdown, baseUrl) {
1107
+ function resolveRelativeUrls(markdown, baseUrl, signal) {
1103
1108
  let origin;
1104
1109
  try {
1105
1110
  ({ origin } = new URL(baseUrl));
@@ -1109,7 +1114,6 @@ function resolveRelativeUrls(markdown, baseUrl) {
1109
1114
  }
1110
1115
  if (!markdown)
1111
1116
  return markdown;
1112
- const lines = markdown.split('\n');
1113
1117
  let output = '';
1114
1118
  let buffer = '';
1115
1119
  let fenceMarker = null;
@@ -1119,26 +1123,51 @@ function resolveRelativeUrls(markdown, baseUrl) {
1119
1123
  output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
1120
1124
  buffer = '';
1121
1125
  };
1122
- for (let i = 0; i < lines.length; i += 1) {
1123
- const line = lines[i] ?? '';
1126
+ const len = markdown.length;
1127
+ let lastIndex = 0;
1128
+ let lineCount = 0;
1129
+ while (lastIndex < len) {
1130
+ if (++lineCount % 500 === 0 && signal?.aborted) {
1131
+ throw new Error('Transform aborted during URL resolution');
1132
+ }
1133
+ let nextIndex = markdown.indexOf('\n', lastIndex);
1134
+ let line;
1135
+ let lineWithNewline;
1136
+ if (nextIndex === -1) {
1137
+ line = markdown.slice(lastIndex);
1138
+ lineWithNewline = line;
1139
+ nextIndex = len;
1140
+ }
1141
+ else {
1142
+ if (nextIndex > lastIndex && markdown.charCodeAt(nextIndex - 1) === 13) {
1143
+ line = markdown.slice(lastIndex, nextIndex - 1);
1144
+ }
1145
+ else {
1146
+ line = markdown.slice(lastIndex, nextIndex);
1147
+ }
1148
+ lineWithNewline = markdown.slice(lastIndex, nextIndex + 1);
1149
+ nextIndex++; // Skip \n
1150
+ }
1124
1151
  const trimmed = line.trimStart();
1125
- const lineWithNewline = i < lines.length - 1 ? `${line}\n` : line;
1126
1152
  if (fenceMarker) {
1127
1153
  output += lineWithNewline;
1128
1154
  if (trimmed.startsWith(fenceMarker) &&
1129
1155
  trimmed.slice(fenceMarker.length).trim() === '') {
1130
1156
  fenceMarker = null;
1131
1157
  }
1132
- continue;
1133
1158
  }
1134
- const fenceMatch = FENCE_LINE_PATTERN.exec(line);
1135
- if (fenceMatch?.[1]) {
1136
- flushBuffer();
1137
- output += lineWithNewline;
1138
- fenceMarker = fenceMatch[1];
1139
- continue;
1159
+ else {
1160
+ const fenceMatch = FENCE_LINE_PATTERN.exec(line);
1161
+ if (fenceMatch?.[1]) {
1162
+ flushBuffer();
1163
+ output += lineWithNewline;
1164
+ fenceMarker = fenceMatch[1];
1165
+ }
1166
+ else {
1167
+ buffer += lineWithNewline;
1168
+ }
1140
1169
  }
1141
- buffer += lineWithNewline;
1170
+ lastIndex = nextIndex;
1142
1171
  }
1143
1172
  flushBuffer();
1144
1173
  return output;
@@ -1148,12 +1177,12 @@ function translateHtmlToMarkdown(params) {
1148
1177
  abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
1149
1178
  const cleanedHtml = skipNoiseRemoval
1150
1179
  ? html
1151
- : stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
1180
+ : stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
1152
1181
  abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
1153
1182
  const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
1154
1183
  abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
1155
1184
  const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
1156
- return url ? resolveRelativeUrls(cleaned, url) : cleaned;
1185
+ return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
1157
1186
  }
1158
1187
  function appendMetadataFooter(content, metadata, url) {
1159
1188
  const footer = buildMetadataFooter(metadata, url);
@@ -1448,13 +1477,13 @@ function shouldUseArticleContent(article, originalHtmlOrDocument) {
1448
1477
  return !hasTruncatedSentences(article.textContent);
1449
1478
  }
1450
1479
  function buildContentSource(params) {
1451
- const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, } = params;
1480
+ const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, truncated, skipNoiseRemoval, signal, } = params;
1452
1481
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
1453
1482
  if (useArticleContent && article) {
1454
1483
  // Readability output can still be noisy (unless user requested skip).
1455
1484
  const cleanedArticleHtml = skipNoiseRemoval
1456
1485
  ? article.content
1457
- : removeNoiseFromHtml(article.content, undefined, url);
1486
+ : removeNoiseFromHtml(article.content, undefined, url, signal);
1458
1487
  return {
1459
1488
  sourceHtml: cleanedArticleHtml,
1460
1489
  title: article.title,
@@ -1468,7 +1497,7 @@ function buildContentSource(params) {
1468
1497
  if (document) {
1469
1498
  const cleanedHtml = skipNoiseRemoval
1470
1499
  ? html
1471
- : removeNoiseFromHtml(html, document, url);
1500
+ : removeNoiseFromHtml(html, document, url, signal);
1472
1501
  const contentRoot = findContentRoot(document);
1473
1502
  if (contentRoot) {
1474
1503
  return {
@@ -1521,6 +1550,7 @@ function resolveContentSource(params) {
1521
1550
  document,
1522
1551
  truncated: truncated ?? false,
1523
1552
  ...(params.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1553
+ ...(params.signal ? { signal: params.signal } : {}),
1524
1554
  });
1525
1555
  }
1526
1556
  function buildMarkdownFromContext(context, url, signal) {
@@ -1603,727 +1633,6 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
1603
1633
  endTotalTransformStage(totalStage, completed);
1604
1634
  }
1605
1635
  }
1606
- function isWorkerResultPayload(value) {
1607
- if (!isObject(value))
1608
- return false;
1609
- const { markdown, metadata, title, truncated } = value;
1610
- const isMetadataObject = metadata === undefined || isObject(metadata);
1611
- if (!isMetadataObject)
1612
- return false;
1613
- if (metadata && !isExtractedMetadataPayload(metadata)) {
1614
- return false;
1615
- }
1616
- return (typeof markdown === 'string' &&
1617
- typeof truncated === 'boolean' &&
1618
- (title === undefined || typeof title === 'string'));
1619
- }
1620
- function isExtractedMetadataPayload(value) {
1621
- if (!isObject(value))
1622
- return false;
1623
- const { author, description, favicon, image, modifiedAt, publishedAt, title, } = value;
1624
- return ((title === undefined || typeof title === 'string') &&
1625
- (description === undefined || typeof description === 'string') &&
1626
- (author === undefined || typeof author === 'string') &&
1627
- (image === undefined || typeof image === 'string') &&
1628
- (favicon === undefined || typeof favicon === 'string') &&
1629
- (publishedAt === undefined || typeof publishedAt === 'string') &&
1630
- (modifiedAt === undefined || typeof modifiedAt === 'string'));
1631
- }
1632
- function isWorkerErrorPayload(value) {
1633
- if (!isObject(value))
1634
- return false;
1635
- const { details, message, name, statusCode, url } = value;
1636
- return (typeof name === 'string' &&
1637
- typeof message === 'string' &&
1638
- typeof url === 'string' &&
1639
- (statusCode === undefined || typeof statusCode === 'number') &&
1640
- (details === undefined || isObject(details)));
1641
- }
1642
- function isWorkerResponse(raw) {
1643
- if (!isObject(raw))
1644
- return false;
1645
- if (typeof raw['id'] !== 'string')
1646
- return false;
1647
- if (raw['type'] === 'result') {
1648
- return isWorkerResultPayload(raw['result']);
1649
- }
1650
- if (raw['type'] === 'error') {
1651
- return isWorkerErrorPayload(raw['error']);
1652
- }
1653
- if (raw['type'] === 'cancelled') {
1654
- return true;
1655
- }
1656
- return false;
1657
- }
1658
- function createTaskContext() {
1659
- const runWithStore = AsyncLocalStorage.snapshot();
1660
- const asyncResource = new AsyncResource('fetch-url-mcp.transform.task');
1661
- let disposed = false;
1662
- return {
1663
- run: (fn) => {
1664
- runWithStore(() => {
1665
- asyncResource.runInAsyncScope(fn);
1666
- });
1667
- },
1668
- dispose: () => {
1669
- if (disposed)
1670
- return;
1671
- disposed = true;
1672
- asyncResource.emitDestroy();
1673
- },
1674
- };
1675
- }
1676
- function buildWorkerDispatchPayload(task, supportsTransferList) {
1677
- const message = {
1678
- type: 'transform',
1679
- id: task.id,
1680
- url: task.url,
1681
- includeMetadata: task.includeMetadata,
1682
- ...(task.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1683
- ...(task.inputTruncated ? { inputTruncated: true } : {}),
1684
- };
1685
- if (!task.htmlBuffer) {
1686
- message.html = task.html;
1687
- return { message };
1688
- }
1689
- const htmlBuffer = ensureTightBuffer(task.htmlBuffer);
1690
- if (!supportsTransferList) {
1691
- message.htmlBuffer = htmlBuffer;
1692
- if (task.encoding)
1693
- message.encoding = task.encoding;
1694
- return { message };
1695
- }
1696
- const transferableHtmlBuffer = Uint8Array.from(htmlBuffer);
1697
- message.htmlBuffer = transferableHtmlBuffer;
1698
- if (task.encoding)
1699
- message.encoding = task.encoding;
1700
- const backingBuffer = transferableHtmlBuffer.buffer;
1701
- if (isSharedArrayBuffer(backingBuffer))
1702
- return { message };
1703
- return { message, transferList: [backingBuffer] };
1704
- }
1705
- /**
1706
- * Worker Pool Sizing Configuration
1707
- *
1708
- * Default: min(4, floor(availableParallelism() / 2)), constrained to [2, N]
1709
- *
1710
- * Tuning Guidance:
1711
- * - **Default behavior**: Appropriate for most deployments. Uses half of available
1712
- * CPU threads (capped at 4) to balance throughput with system resource availability.
1713
- *
1714
- * - **CPU-limited containers**: If running in a container with strict CPU limits
1715
- * (e.g., Docker with --cpus=2), the default may over-subscribe. Consider setting
1716
- * maxWorkerScale to match the container's CPU limit.
1717
- *
1718
- * - **High-concurrency workloads**: For dedicated servers handling many concurrent
1719
- * fetch requests, increasing maxWorkerScale to (availableParallelism() + 2) may
1720
- * improve throughput by overlapping I/O wait with computation.
1721
- *
1722
- * - **Memory-constrained environments**: Each worker allocates ~50-100MB for DOM
1723
- * parsing. If memory is limited, reduce maxWorkerScale to (availableParallelism() / 2)
1724
- * or lower to prevent OOM errors.
1725
- *
1726
- * - **Shared hosting**: On shared systems where CPU is contested, reducing the pool
1727
- * size prevents starving other processes. Consider maxWorkerScale = 2 or using
1728
- * process-based workers (TRANSFORM_WORKER_MODE=process) for better isolation.
1729
- *
1730
- * Configuration:
1731
- * - TRANSFORM_MAX_WORKER_SCALE env var (default: availableParallelism())
1732
- * - TRANSFORM_WORKER_MODE env var: 'threads' (default) or 'process'
1733
- *
1734
- * See config.ts for full worker configuration options.
1735
- */
1736
- const POOL_MIN_WORKERS = Math.max(2, Math.min(4, Math.floor(availableParallelism() / 2)));
1737
- const POOL_MAX_WORKERS = config.transform.maxWorkerScale;
1738
- const POOL_SCALE_THRESHOLD = 0.5;
1739
- const WORKER_NAME_PREFIX = 'fetch-url-mcp-transform';
1740
- const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
1741
- const TRANSFORM_CHILD_PATH = fileURLToPath(new URL('./workers/transform-child.js', import.meta.url));
1742
- function ensureTightBuffer(buffer) {
1743
- if (buffer.byteOffset === 0 &&
1744
- buffer.byteLength === buffer.buffer.byteLength) {
1745
- return buffer;
1746
- }
1747
- return Buffer.from(buffer);
1748
- }
1749
- function createThreadWorkerHost(_workerIndex, name) {
1750
- const resourceLimits = config.transform.workerResourceLimits;
1751
- const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url), {
1752
- name,
1753
- ...(resourceLimits ? { resourceLimits } : {}),
1754
- });
1755
- return {
1756
- kind: 'thread',
1757
- supportsTransferList: true,
1758
- threadId: worker.threadId,
1759
- postMessage: (message, transferList) => {
1760
- worker.postMessage(message, transferList);
1761
- },
1762
- terminate: async () => {
1763
- await worker.terminate();
1764
- },
1765
- unref: () => {
1766
- worker.unref();
1767
- },
1768
- onMessage: (handler) => {
1769
- worker.on('message', handler);
1770
- },
1771
- onError: (handler) => {
1772
- worker.on('error', handler);
1773
- worker.on('messageerror', handler);
1774
- },
1775
- onExit: (handler) => {
1776
- worker.on('exit', (code) => {
1777
- handler(code, null);
1778
- });
1779
- },
1780
- };
1781
- }
1782
- function createProcessWorkerHost(workerIndex, name) {
1783
- const child = fork(TRANSFORM_CHILD_PATH, [], {
1784
- stdio: ['ignore', 'ignore', 'ignore', 'ipc'],
1785
- serialization: 'advanced',
1786
- env: {
1787
- ...process.env,
1788
- FETCH_URL_MCP_WORKER_INDEX: String(workerIndex),
1789
- FETCH_URL_MCP_WORKER_NAME: name,
1790
- },
1791
- });
1792
- if (child.pid === undefined) {
1793
- throw new Error('Failed to fork process');
1794
- }
1795
- return {
1796
- kind: 'process',
1797
- supportsTransferList: false,
1798
- pid: child.pid,
1799
- postMessage: (message) => {
1800
- if (!child.connected) {
1801
- throw new Error('Transform worker IPC channel is closed');
1802
- }
1803
- child.send(message);
1804
- },
1805
- terminate: () => new Promise((resolve) => {
1806
- if (child.exitCode !== null || child.killed) {
1807
- resolve();
1808
- return;
1809
- }
1810
- child.once('exit', () => {
1811
- resolve();
1812
- });
1813
- try {
1814
- child.kill();
1815
- }
1816
- catch {
1817
- resolve();
1818
- }
1819
- }),
1820
- unref: () => {
1821
- child.unref();
1822
- },
1823
- onMessage: (handler) => {
1824
- child.on('message', handler);
1825
- },
1826
- onError: (handler) => {
1827
- child.on('error', handler);
1828
- },
1829
- onExit: (handler) => {
1830
- child.on('exit', (code, signal) => {
1831
- handler(code, signal);
1832
- });
1833
- },
1834
- };
1835
- }
1836
- class WorkerPool {
1837
- static CLOSED_MESSAGE = 'Transform worker pool closed';
1838
- workers = [];
1839
- capacity;
1840
- minCapacity = POOL_MIN_WORKERS;
1841
- maxCapacity = POOL_MAX_WORKERS;
1842
- queue = [];
1843
- queueHead = 0;
1844
- inflight = new Map();
1845
- cancelAcks = new Map();
1846
- timeoutMs;
1847
- queueMax;
1848
- spawnWorkerImpl;
1849
- closed = false;
1850
- taskIdSeq = 0;
1851
- constructor(size, timeoutMs, spawnWorker) {
1852
- if (size === 0) {
1853
- this.capacity = 0;
1854
- }
1855
- else {
1856
- this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1857
- }
1858
- this.timeoutMs = timeoutMs;
1859
- this.queueMax = this.maxCapacity * 32;
1860
- this.spawnWorkerImpl = spawnWorker;
1861
- }
1862
- async transform(htmlOrBuffer, url, options) {
1863
- this.ensureOpen();
1864
- if (options.signal?.aborted)
1865
- throw abortPolicy.createAbortError(url, 'transform:enqueue');
1866
- if (this.getQueueDepth() >= this.queueMax) {
1867
- throw new FetchError('Transform worker queue is full', url, 503, {
1868
- reason: 'queue_full',
1869
- stage: 'transform:enqueue',
1870
- });
1871
- }
1872
- return new Promise((resolve, reject) => {
1873
- const task = this.createPendingTask(htmlOrBuffer, url, options, resolve, reject);
1874
- this.queue.push(task);
1875
- this.drainQueue();
1876
- });
1877
- }
1878
- getQueueDepth() {
1879
- const depth = this.queue.length - this.queueHead;
1880
- return depth > 0 ? depth : 0;
1881
- }
1882
- getActiveWorkers() {
1883
- return this.workers.filter((s) => s?.busy).length;
1884
- }
1885
- getCapacity() {
1886
- return this.capacity;
1887
- }
1888
- resize(size) {
1889
- const newCapacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1890
- if (newCapacity === this.capacity)
1891
- return;
1892
- this.capacity = newCapacity;
1893
- this.drainQueue();
1894
- }
1895
- async close() {
1896
- if (this.closed)
1897
- return;
1898
- this.closed = true;
1899
- const terminations = this.workers
1900
- .map((slot) => slot?.host.terminate())
1901
- .filter((p) => p !== undefined);
1902
- this.workers.fill(undefined);
1903
- this.workers.length = 0;
1904
- for (const id of Array.from(this.inflight.keys())) {
1905
- const inflight = this.takeInflight(id);
1906
- if (!inflight)
1907
- continue;
1908
- this.finalizeTask(inflight.context, () => {
1909
- inflight.reject(new Error(WorkerPool.CLOSED_MESSAGE));
1910
- });
1911
- }
1912
- for (let i = this.queueHead; i < this.queue.length; i += 1) {
1913
- const task = this.queue[i];
1914
- if (!task)
1915
- continue;
1916
- this.clearAbortListener(task.signal, task.abortListener);
1917
- this.finalizeTask(task.context, () => {
1918
- task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
1919
- });
1920
- }
1921
- this.queue.length = 0;
1922
- this.queueHead = 0;
1923
- await Promise.allSettled(terminations);
1924
- }
1925
- ensureOpen() {
1926
- if (this.closed)
1927
- throw new Error(WorkerPool.CLOSED_MESSAGE);
1928
- }
1929
- createPendingTask(htmlOrBuffer, url, options, resolve, reject) {
1930
- const id = (this.taskIdSeq++).toString(36);
1931
- // Preserve request context for resolve/reject even when callbacks fire
1932
- // from worker thread events.
1933
- const context = createTaskContext();
1934
- let abortListener;
1935
- if (options.signal) {
1936
- abortListener = () => {
1937
- this.onAbortSignal(id, url, context, reject);
1938
- };
1939
- options.signal.addEventListener('abort', abortListener, { once: true });
1940
- }
1941
- const task = {
1942
- id,
1943
- url,
1944
- includeMetadata: options.includeMetadata,
1945
- ...(options.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1946
- ...(options.inputTruncated ? { inputTruncated: true } : {}),
1947
- signal: options.signal,
1948
- abortListener,
1949
- context,
1950
- resolve,
1951
- reject,
1952
- };
1953
- if (typeof htmlOrBuffer === 'string') {
1954
- task.html = htmlOrBuffer;
1955
- }
1956
- else {
1957
- task.htmlBuffer = htmlOrBuffer;
1958
- if (options.encoding) {
1959
- task.encoding = options.encoding;
1960
- }
1961
- }
1962
- return task;
1963
- }
1964
- onAbortSignal(id, url, context, reject) {
1965
- if (this.closed) {
1966
- this.finalizeTask(context, () => {
1967
- reject(new Error(WorkerPool.CLOSED_MESSAGE));
1968
- });
1969
- return;
1970
- }
1971
- const inflight = this.inflight.get(id);
1972
- if (inflight) {
1973
- void this.abortInflight(id, url, inflight.workerIndex);
1974
- return;
1975
- }
1976
- const queuedIndex = this.findQueuedIndex(id);
1977
- if (queuedIndex !== null) {
1978
- const task = this.queue[queuedIndex];
1979
- if (task)
1980
- this.clearAbortListener(task.signal, task.abortListener);
1981
- this.queue.splice(queuedIndex, 1);
1982
- if (task) {
1983
- this.finalizeTask(task.context, () => {
1984
- task.reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
1985
- });
1986
- }
1987
- else {
1988
- this.finalizeTask(context, () => {
1989
- reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
1990
- });
1991
- }
1992
- this.maybeCompactQueue();
1993
- }
1994
- }
1995
- resolveCancelAck(id) {
1996
- const pending = this.cancelAcks.get(id);
1997
- if (!pending)
1998
- return;
1999
- pending.timeout.cancel();
2000
- pending.resolve();
2001
- }
2002
- waitForCancelAck(id) {
2003
- const existing = this.cancelAcks.get(id);
2004
- if (existing) {
2005
- return existing.promise;
2006
- }
2007
- let resolve = () => { };
2008
- const timeout = createUnrefTimeout(200, undefined);
2009
- const racePromise = new Promise((finish) => {
2010
- resolve = finish;
2011
- });
2012
- const promise = Promise.race([racePromise, timeout.promise]).finally(() => {
2013
- this.cancelAcks.delete(id);
2014
- timeout.cancel();
2015
- });
2016
- this.cancelAcks.set(id, { promise, resolve, timeout });
2017
- return promise;
2018
- }
2019
- async abortInflight(id, url, workerIndex) {
2020
- const slot = this.workers[workerIndex];
2021
- const inflight = this.inflight.get(id);
2022
- if (inflight) {
2023
- inflight.cancelPending = true;
2024
- }
2025
- if (slot) {
2026
- try {
2027
- slot.host.postMessage({ type: 'cancel', id });
2028
- }
2029
- catch {
2030
- // Worker may be unavailable; failure is acceptable during abort
2031
- }
2032
- }
2033
- await this.waitForCancelAck(id);
2034
- this.failTask(id, abortPolicy.createAbortError(url, 'transform:signal-abort'));
2035
- if (slot)
2036
- this.restartWorker(workerIndex, slot);
2037
- }
2038
- clearAbortListener(signal, listener) {
2039
- if (!signal || !listener)
2040
- return;
2041
- try {
2042
- signal.removeEventListener('abort', listener);
2043
- }
2044
- catch {
2045
- // Defensive: removeEventListener should not throw, but handle edge cases
2046
- }
2047
- }
2048
- spawnWorker(workerIndex) {
2049
- const name = `${WORKER_NAME_PREFIX}-${workerIndex + 1}`;
2050
- const host = this.spawnWorkerImpl(workerIndex, name);
2051
- host.unref();
2052
- host.onMessage((raw) => {
2053
- this.onWorkerMessage(workerIndex, raw);
2054
- });
2055
- host.onError((error) => {
2056
- this.onWorkerBroken(workerIndex, `Transform worker error: ${getErrorMessage(error)}`);
2057
- });
2058
- host.onExit((code, signal) => {
2059
- const suffix = signal ? `signal ${signal}` : `code ${code ?? 'unknown'}`;
2060
- this.onWorkerBroken(workerIndex, `Transform worker exited (${suffix})`);
2061
- });
2062
- return { host, busy: false, currentTaskId: null, name };
2063
- }
2064
- onWorkerBroken(workerIndex, message) {
2065
- if (this.closed)
2066
- return;
2067
- const slot = this.workers[workerIndex];
2068
- if (!slot)
2069
- return;
2070
- logWarn('Transform worker unavailable; restarting', {
2071
- reason: message,
2072
- workerIndex,
2073
- workerKind: slot.host.kind,
2074
- workerName: slot.name,
2075
- ...(slot.host.kind === 'process'
2076
- ? { pid: slot.host.pid }
2077
- : { threadId: slot.host.threadId }),
2078
- });
2079
- if (slot.busy && slot.currentTaskId) {
2080
- this.failTask(slot.currentTaskId, new Error(message));
2081
- }
2082
- this.restartWorker(workerIndex, slot);
2083
- }
2084
- restartWorker(workerIndex, slot) {
2085
- if (this.closed)
2086
- return;
2087
- const target = slot ?? this.workers[workerIndex];
2088
- if (target) {
2089
- target.host.terminate().catch(() => undefined);
2090
- }
2091
- this.workers[workerIndex] = this.spawnWorker(workerIndex);
2092
- this.drainQueue();
2093
- }
2094
- onWorkerMessage(workerIndex, raw) {
2095
- if (!isWorkerResponse(raw))
2096
- return;
2097
- const message = raw;
2098
- if (message.type === 'cancelled') {
2099
- this.resolveCancelAck(message.id);
2100
- return;
2101
- }
2102
- const inflightPeek = this.inflight.get(message.id);
2103
- if (inflightPeek?.cancelPending) {
2104
- this.resolveCancelAck(message.id);
2105
- return;
2106
- }
2107
- const inflight = this.takeInflight(message.id);
2108
- if (!inflight)
2109
- return;
2110
- this.markIdle(workerIndex);
2111
- if (message.type === 'result') {
2112
- this.finalizeTask(inflight.context, () => {
2113
- inflight.resolve({
2114
- markdown: message.result.markdown,
2115
- truncated: message.result.truncated,
2116
- title: message.result.title,
2117
- ...(message.result.metadata
2118
- ? { metadata: message.result.metadata }
2119
- : {}),
2120
- });
2121
- });
2122
- }
2123
- else {
2124
- const err = message.error;
2125
- if (err.name === 'FetchError') {
2126
- this.finalizeTask(inflight.context, () => {
2127
- inflight.reject(new FetchError(err.message, err.url, err.statusCode, err.details ?? {}));
2128
- });
2129
- }
2130
- else {
2131
- this.finalizeTask(inflight.context, () => {
2132
- inflight.reject(new Error(err.message));
2133
- });
2134
- }
2135
- }
2136
- this.drainQueue();
2137
- }
2138
- takeInflight(id) {
2139
- const inflight = this.inflight.get(id);
2140
- if (!inflight)
2141
- return null;
2142
- inflight.timeout.cancel();
2143
- this.clearAbortListener(inflight.signal, inflight.abortListener);
2144
- this.inflight.delete(id);
2145
- return inflight;
2146
- }
2147
- markIdle(workerIndex) {
2148
- const slot = this.workers[workerIndex];
2149
- if (!slot)
2150
- return;
2151
- slot.busy = false;
2152
- slot.currentTaskId = null;
2153
- }
2154
- failTask(id, error) {
2155
- const inflight = this.takeInflight(id);
2156
- if (!inflight)
2157
- return;
2158
- this.finalizeTask(inflight.context, () => {
2159
- inflight.reject(error);
2160
- });
2161
- this.markIdle(inflight.workerIndex);
2162
- }
2163
- maybeScaleUp() {
2164
- if (this.getQueueDepth() > this.capacity * POOL_SCALE_THRESHOLD &&
2165
- this.capacity < this.maxCapacity) {
2166
- this.capacity += 1;
2167
- }
2168
- }
2169
- drainQueue() {
2170
- if (this.closed || this.getQueueDepth() === 0)
2171
- return;
2172
- this.maybeScaleUp();
2173
- for (let i = 0; i < this.workers.length; i += 1) {
2174
- const slot = this.workers[i];
2175
- if (slot && !slot.busy) {
2176
- this.dispatchFromQueue(i, slot);
2177
- if (this.getQueueDepth() === 0)
2178
- return;
2179
- }
2180
- }
2181
- if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
2182
- const workerIndex = this.workers.length;
2183
- const slot = this.spawnWorker(workerIndex);
2184
- this.workers.push(slot);
2185
- this.dispatchFromQueue(workerIndex, slot);
2186
- if (this.workers.length < this.capacity && this.getQueueDepth() > 0) {
2187
- setImmediate(() => {
2188
- this.drainQueue();
2189
- });
2190
- }
2191
- }
2192
- }
2193
- takeNextQueuedTask() {
2194
- while (this.queueHead < this.queue.length) {
2195
- const task = this.queue[this.queueHead];
2196
- this.queueHead += 1;
2197
- if (task) {
2198
- this.maybeCompactQueue();
2199
- return task;
2200
- }
2201
- }
2202
- this.maybeCompactQueue();
2203
- return null;
2204
- }
2205
- dispatchFromQueue(workerIndex, slot) {
2206
- const task = this.takeNextQueuedTask();
2207
- if (!task)
2208
- return;
2209
- if (this.closed) {
2210
- this.clearAbortListener(task.signal, task.abortListener);
2211
- this.finalizeTask(task.context, () => {
2212
- task.reject(new Error(WorkerPool.CLOSED_MESSAGE));
2213
- });
2214
- return;
2215
- }
2216
- if (task.signal?.aborted) {
2217
- this.clearAbortListener(task.signal, task.abortListener);
2218
- this.finalizeTask(task.context, () => {
2219
- task.reject(abortPolicy.createAbortError(task.url, 'transform:dispatch'));
2220
- });
2221
- return;
2222
- }
2223
- slot.busy = true;
2224
- slot.currentTaskId = task.id;
2225
- const timeout = createUnrefTimeout(this.timeoutMs, null);
2226
- void timeout.promise
2227
- .then(() => {
2228
- try {
2229
- slot.host.postMessage({ type: 'cancel', id: task.id });
2230
- }
2231
- catch {
2232
- // Worker may be unavailable; proceed with timeout handling
2233
- }
2234
- const inflight = this.takeInflight(task.id);
2235
- if (!inflight)
2236
- return;
2237
- this.finalizeTask(inflight.context, () => {
2238
- inflight.reject(new FetchError('Request timeout', task.url, 504, {
2239
- reason: 'timeout',
2240
- stage: 'transform:worker-timeout',
2241
- }));
2242
- });
2243
- this.restartWorker(workerIndex, slot);
2244
- })
2245
- .catch((error) => {
2246
- this.failTask(task.id, error);
2247
- });
2248
- this.inflight.set(task.id, {
2249
- resolve: task.resolve,
2250
- reject: task.reject,
2251
- timeout,
2252
- signal: task.signal,
2253
- abortListener: task.abortListener,
2254
- workerIndex,
2255
- context: task.context,
2256
- cancelPending: false,
2257
- });
2258
- try {
2259
- const { message, transferList } = buildWorkerDispatchPayload(task, slot.host.supportsTransferList);
2260
- slot.host.postMessage(message, transferList);
2261
- }
2262
- catch (error) {
2263
- timeout.cancel();
2264
- this.clearAbortListener(task.signal, task.abortListener);
2265
- this.inflight.delete(task.id);
2266
- this.markIdle(workerIndex);
2267
- this.finalizeTask(task.context, () => {
2268
- task.reject(error instanceof Error
2269
- ? error
2270
- : new Error('Failed to dispatch transform worker message'));
2271
- });
2272
- this.restartWorker(workerIndex, slot);
2273
- }
2274
- }
2275
- finalizeTask(context, fn) {
2276
- try {
2277
- context.run(fn);
2278
- }
2279
- finally {
2280
- context.dispose();
2281
- }
2282
- }
2283
- findQueuedIndex(id) {
2284
- for (let i = this.queueHead; i < this.queue.length; i += 1) {
2285
- const task = this.queue[i];
2286
- if (task?.id === id)
2287
- return i;
2288
- }
2289
- return null;
2290
- }
2291
- maybeCompactQueue() {
2292
- if (this.queueHead === 0)
2293
- return;
2294
- if (this.queueHead >= this.queue.length ||
2295
- (this.queueHead > 1024 && this.queueHead > this.queue.length / 2)) {
2296
- this.queue.splice(0, this.queueHead);
2297
- this.queueHead = 0;
2298
- }
2299
- }
2300
- }
2301
- let workerPool = null;
2302
- function resolveWorkerSpawner() {
2303
- return config.transform.workerMode === 'process'
2304
- ? createProcessWorkerHost
2305
- : createThreadWorkerHost;
2306
- }
2307
- function getOrCreateWorkerPool() {
2308
- const size = config.transform.maxWorkerScale === 0 ? 0 : POOL_MIN_WORKERS;
2309
- workerPool ??= new WorkerPool(size, DEFAULT_TIMEOUT_MS, resolveWorkerSpawner());
2310
- return workerPool;
2311
- }
2312
- function getWorkerPoolStats() {
2313
- if (!workerPool)
2314
- return null;
2315
- return {
2316
- queueDepth: workerPool.getQueueDepth(),
2317
- activeWorkers: workerPool.getActiveWorkers(),
2318
- capacity: workerPool.getCapacity(),
2319
- };
2320
- }
2321
- async function shutdownWorkerPool() {
2322
- if (!workerPool)
2323
- return;
2324
- await workerPool.close();
2325
- workerPool = null;
2326
- }
2327
1636
  export function getTransformPoolStats() {
2328
1637
  return getWorkerPoolStats();
2329
1638
  }
@@ -2410,3 +1719,4 @@ export async function transformHtmlToMarkdown(html, url, options) {
2410
1719
  export async function transformBufferToMarkdown(htmlBuffer, url, options) {
2411
1720
  return transformInputToMarkdown(htmlBuffer, url, options);
2412
1721
  }
1722
+ //# sourceMappingURL=transform.js.map