@j0hanz/fetch-url-mcp 1.9.1 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +0 -1
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +1 -13
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +2 -5
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +301 -350
- package/dist/lib/core.d.ts +78 -71
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +308 -372
- package/dist/lib/fetch-pipeline.d.ts +2 -6
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +51 -137
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js +188 -130
- package/dist/lib/mcp-tools.d.ts +3 -5
- package/dist/lib/mcp-tools.d.ts.map +1 -1
- package/dist/lib/mcp-tools.js +22 -58
- package/dist/lib/task-handlers.js +4 -4
- package/dist/lib/utils.d.ts +6 -0
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +23 -0
- package/dist/resources/index.js +1 -1
- package/dist/schemas.d.ts +0 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +4 -6
- package/dist/server.js +1 -1
- package/dist/tasks/owner.d.ts +1 -1
- package/dist/tasks/owner.d.ts.map +1 -1
- package/dist/tasks/tool-registry.d.ts +1 -1
- package/dist/tasks/tool-registry.d.ts.map +1 -1
- package/dist/tools/fetch-url.d.ts +2 -3
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +89 -152
- package/dist/transform/transform.d.ts +8 -0
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +109 -108
- package/dist/transform/worker-pool.d.ts +3 -6
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +148 -118
- package/package.json +2 -1
|
@@ -35,6 +35,16 @@ function asError(value) {
|
|
|
35
35
|
return value instanceof Error ? value : undefined;
|
|
36
36
|
}
|
|
37
37
|
const abortPolicy = { throwIfAborted, createAbortError };
|
|
38
|
+
function isWhitespaceChar(code) {
|
|
39
|
+
return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
|
|
40
|
+
}
|
|
41
|
+
function containsWhitespace(value) {
|
|
42
|
+
for (let i = 0; i < value.length; i += 1) {
|
|
43
|
+
if (isWhitespaceChar(value.charCodeAt(i)))
|
|
44
|
+
return true;
|
|
45
|
+
}
|
|
46
|
+
return false;
|
|
47
|
+
}
|
|
38
48
|
function buildTransformSignal(signal) {
|
|
39
49
|
const { timeoutMs } = config.transform;
|
|
40
50
|
if (timeoutMs <= 0)
|
|
@@ -232,6 +242,9 @@ function willTruncate(html) {
|
|
|
232
242
|
const maxSize = config.constants.maxHtmlSize;
|
|
233
243
|
return (maxSize > 0 && (html.length > maxSize || getUtf8ByteLength(html) > maxSize));
|
|
234
244
|
}
|
|
245
|
+
const MIN_SPA_CONTENT_LENGTH = 100;
|
|
246
|
+
const MIN_READERABLE_TEXT_LENGTH = 400;
|
|
247
|
+
const MAX_READABILITY_ELEMENTS = 20_000;
|
|
235
248
|
function isReadabilityCompatible(doc) {
|
|
236
249
|
if (!isObject(doc))
|
|
237
250
|
return false;
|
|
@@ -283,34 +296,34 @@ function extractArticle(document, url, signal) {
|
|
|
283
296
|
logWarn('Document not compatible with Readability');
|
|
284
297
|
return null;
|
|
285
298
|
}
|
|
299
|
+
const checkAbort = (stage) => {
|
|
300
|
+
abortPolicy.throwIfAborted(signal, url, stage);
|
|
301
|
+
};
|
|
286
302
|
try {
|
|
287
303
|
const doc = document;
|
|
288
|
-
|
|
289
|
-
abortPolicy.throwIfAborted(signal, url, 'extract:article:textCheck');
|
|
304
|
+
checkAbort('extract:article:textCheck');
|
|
290
305
|
const rawText = doc.querySelector('body')?.textContent ??
|
|
291
306
|
doc.documentElement.textContent ??
|
|
292
307
|
'';
|
|
293
|
-
const textLength = resolveCollapsedTextLengthUpTo(rawText,
|
|
294
|
-
if (textLength <
|
|
308
|
+
const textLength = resolveCollapsedTextLengthUpTo(rawText, MIN_READERABLE_TEXT_LENGTH + 1);
|
|
309
|
+
if (textLength < MIN_SPA_CONTENT_LENGTH) {
|
|
295
310
|
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
296
311
|
'This might be a client-side rendered (SPA) application. ' +
|
|
297
312
|
'Content extraction may be incomplete.', { textLength });
|
|
298
313
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
314
|
+
checkAbort('extract:article:readabilityCheck');
|
|
315
|
+
if (textLength >= MIN_READERABLE_TEXT_LENGTH &&
|
|
316
|
+
!isProbablyReaderable(doc)) {
|
|
302
317
|
return null;
|
|
303
318
|
}
|
|
304
|
-
|
|
305
|
-
abortPolicy.throwIfAborted(signal, url, 'extract:article:clone');
|
|
319
|
+
checkAbort('extract:article:clone');
|
|
306
320
|
const readabilityDoc = typeof doc.cloneNode === 'function'
|
|
307
321
|
? doc.cloneNode(true)
|
|
308
322
|
: doc;
|
|
309
323
|
preserveAlertElements(readabilityDoc);
|
|
310
|
-
|
|
311
|
-
abortPolicy.throwIfAborted(signal, url, 'extract:article:parse');
|
|
324
|
+
checkAbort('extract:article:parse');
|
|
312
325
|
const reader = new Readability(readabilityDoc, {
|
|
313
|
-
maxElemsToParse:
|
|
326
|
+
maxElemsToParse: MAX_READABILITY_ELEMENTS,
|
|
314
327
|
classesToPreserve: [
|
|
315
328
|
'admonition',
|
|
316
329
|
'callout',
|
|
@@ -410,16 +423,8 @@ export function extractContent(html, url, options = {
|
|
|
410
423
|
const result = extractContentContext(html, url, options);
|
|
411
424
|
return { article: result.article, metadata: result.metadata };
|
|
412
425
|
}
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
}
|
|
416
|
-
function containsWhitespace(value) {
|
|
417
|
-
for (let i = 0; i < value.length; i += 1) {
|
|
418
|
-
if (isWhitespaceChar(value.charCodeAt(i)))
|
|
419
|
-
return true;
|
|
420
|
-
}
|
|
421
|
-
return false;
|
|
422
|
-
}
|
|
426
|
+
const ABORT_CHECK_LINE_INTERVAL = 500;
|
|
427
|
+
const CR_CHAR_CODE = 13;
|
|
423
428
|
function resolveRelativeHref(href, baseUrl, origin) {
|
|
424
429
|
const trimmedHref = href.trim();
|
|
425
430
|
if (!trimmedHref || containsWhitespace(trimmedHref))
|
|
@@ -512,39 +517,31 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
512
517
|
let output = '';
|
|
513
518
|
let buffer = '';
|
|
514
519
|
let fenceMarker = null;
|
|
515
|
-
const flushBuffer = () => {
|
|
516
|
-
if (!buffer)
|
|
517
|
-
return;
|
|
518
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
519
|
-
buffer = '';
|
|
520
|
-
};
|
|
521
520
|
const len = markdown.length;
|
|
522
521
|
let lastIndex = 0;
|
|
523
522
|
let lineCount = 0;
|
|
524
523
|
while (lastIndex < len) {
|
|
525
|
-
if (++lineCount %
|
|
524
|
+
if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
|
|
526
525
|
abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
527
526
|
}
|
|
527
|
+
// Extract next line (handling CR+LF)
|
|
528
528
|
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
if (nextIndex === -1) {
|
|
532
|
-
line = markdown.slice(lastIndex);
|
|
533
|
-
lineWithNewline = line;
|
|
529
|
+
const isLastLine = nextIndex === -1;
|
|
530
|
+
if (isLastLine)
|
|
534
531
|
nextIndex = len;
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
const trimmed = line.trimStart();
|
|
532
|
+
const lineWithNewline = isLastLine
|
|
533
|
+
? markdown.slice(lastIndex)
|
|
534
|
+
: markdown.slice(lastIndex, nextIndex + 1);
|
|
535
|
+
const lineEnd = !isLastLine &&
|
|
536
|
+
nextIndex > lastIndex &&
|
|
537
|
+
markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
|
|
538
|
+
? nextIndex - 1
|
|
539
|
+
: isLastLine
|
|
540
|
+
? len
|
|
541
|
+
: nextIndex;
|
|
542
|
+
const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
|
|
547
543
|
if (fenceMarker) {
|
|
544
|
+
// Inside a code fence — pass through without URL resolution
|
|
548
545
|
output += lineWithNewline;
|
|
549
546
|
if (trimmed.startsWith(fenceMarker) &&
|
|
550
547
|
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
@@ -552,9 +549,13 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
552
549
|
}
|
|
553
550
|
}
|
|
554
551
|
else {
|
|
555
|
-
const fenceMatch = FENCE_LINE_PATTERN.exec(
|
|
552
|
+
const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
|
|
556
553
|
if (fenceMatch?.[1]) {
|
|
557
|
-
|
|
554
|
+
// Entering a code fence — flush buffered content first
|
|
555
|
+
if (buffer) {
|
|
556
|
+
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
557
|
+
buffer = '';
|
|
558
|
+
}
|
|
558
559
|
output += lineWithNewline;
|
|
559
560
|
fenceMarker = fenceMatch[1];
|
|
560
561
|
}
|
|
@@ -562,9 +563,11 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
562
563
|
buffer += lineWithNewline;
|
|
563
564
|
}
|
|
564
565
|
}
|
|
565
|
-
lastIndex = nextIndex;
|
|
566
|
+
lastIndex = isLastLine ? len : nextIndex + 1;
|
|
567
|
+
}
|
|
568
|
+
if (buffer) {
|
|
569
|
+
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
566
570
|
}
|
|
567
|
-
flushBuffer();
|
|
568
571
|
return output;
|
|
569
572
|
}
|
|
570
573
|
function translateHtmlToMarkdown(params) {
|
|
@@ -676,9 +679,6 @@ function resolveHtmlDocument(htmlOrDocument) {
|
|
|
676
679
|
return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
|
|
677
680
|
}
|
|
678
681
|
}
|
|
679
|
-
function countDomSelector(htmlOrDocument, selector) {
|
|
680
|
-
return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
|
|
681
|
-
}
|
|
682
682
|
function countTagsInString(html, regex) {
|
|
683
683
|
let count = 0;
|
|
684
684
|
while (regex.exec(html) !== null) {
|
|
@@ -686,12 +686,6 @@ function countTagsInString(html, regex) {
|
|
|
686
686
|
}
|
|
687
687
|
return count;
|
|
688
688
|
}
|
|
689
|
-
function countHeadingsDom(htmlOrDocument) {
|
|
690
|
-
return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
|
|
691
|
-
}
|
|
692
|
-
function countCodeBlocksDom(htmlOrDocument) {
|
|
693
|
-
return countDomSelector(htmlOrDocument, 'pre');
|
|
694
|
-
}
|
|
695
689
|
function stripNonVisibleNodes(root) {
|
|
696
690
|
for (const el of root.querySelectorAll('script,style,noscript')) {
|
|
697
691
|
el.remove();
|
|
@@ -794,6 +788,9 @@ function hasTruncatedSentences(text) {
|
|
|
794
788
|
return false;
|
|
795
789
|
return incompleteFound / linesFound > MAX_TRUNCATED_LINE_RATIO;
|
|
796
790
|
}
|
|
791
|
+
const MIN_CONTENT_ROOT_LENGTH = 100;
|
|
792
|
+
const HEADING_SCAN_LIMIT = 12;
|
|
793
|
+
const BINARY_SAMPLE_SIZE = 2000;
|
|
797
794
|
export function determineContentExtractionSource(article) {
|
|
798
795
|
return article !== null;
|
|
799
796
|
}
|
|
@@ -852,7 +849,7 @@ function findContentRoot(document) {
|
|
|
852
849
|
const innerHTML = typeof element.innerHTML === 'string'
|
|
853
850
|
? element.innerHTML
|
|
854
851
|
: undefined;
|
|
855
|
-
if (innerHTML && innerHTML.trim().length >
|
|
852
|
+
if (innerHTML && innerHTML.trim().length > MIN_CONTENT_ROOT_LENGTH)
|
|
856
853
|
return innerHTML;
|
|
857
854
|
}
|
|
858
855
|
return undefined;
|
|
@@ -885,25 +882,28 @@ function isGithubRepositoryRootUrl(url) {
|
|
|
885
882
|
}
|
|
886
883
|
return parsed.pathname.split('/').filter(Boolean).length === 2;
|
|
887
884
|
}
|
|
888
|
-
|
|
885
|
+
export const TransformHeuristics = {
|
|
886
|
+
findContentRoot,
|
|
887
|
+
findPrimaryHeading,
|
|
888
|
+
isGithubRepositoryRootUrl,
|
|
889
|
+
};
|
|
890
|
+
function shouldUseArticleContent(article, document) {
|
|
889
891
|
const articleLength = article.textContent.length;
|
|
890
|
-
const originalLength = getVisibleTextLength(
|
|
892
|
+
const originalLength = getVisibleTextLength(document);
|
|
891
893
|
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
892
894
|
const ratio = articleLength / originalLength;
|
|
893
895
|
if (ratio < MIN_CONTENT_RATIO)
|
|
894
896
|
return false;
|
|
895
897
|
}
|
|
896
|
-
const originalHeadings =
|
|
898
|
+
const originalHeadings = document.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
|
|
897
899
|
if (originalHeadings > 0) {
|
|
898
|
-
// Optimization: Use regex on article content string instead of parsing it to DOM
|
|
899
900
|
const articleHeadings = countTagsInString(article.content, /<h[1-6]\b/gi);
|
|
900
901
|
const retentionRatio = articleHeadings / originalHeadings;
|
|
901
902
|
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
902
903
|
return false;
|
|
903
904
|
}
|
|
904
|
-
const originalCodeBlocks =
|
|
905
|
+
const originalCodeBlocks = document.querySelectorAll('pre').length;
|
|
905
906
|
if (originalCodeBlocks > 0) {
|
|
906
|
-
// Optimization: Use regex on article content string
|
|
907
907
|
const articleCodeBlocks = countTagsInString(article.content, /<pre\b/gi);
|
|
908
908
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
909
909
|
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
|
|
@@ -919,12 +919,14 @@ function buildContentSource(params) {
|
|
|
919
919
|
metadata,
|
|
920
920
|
extractedMetadata: extractedMeta,
|
|
921
921
|
truncated,
|
|
922
|
-
primaryHeading: document
|
|
922
|
+
primaryHeading: document
|
|
923
|
+
? TransformHeuristics.findPrimaryHeading(document)
|
|
924
|
+
: undefined,
|
|
923
925
|
};
|
|
924
926
|
if (useArticleContent && article) {
|
|
925
927
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
926
928
|
prepareDocumentForMarkdown(articleDoc, url, signal);
|
|
927
|
-
const preferPrimaryHeading = isGithubRepositoryRootUrl(url);
|
|
929
|
+
const preferPrimaryHeading = TransformHeuristics.isGithubRepositoryRootUrl(url);
|
|
928
930
|
return {
|
|
929
931
|
...base,
|
|
930
932
|
sourceHtml: articleDoc.body.innerHTML,
|
|
@@ -937,7 +939,7 @@ function buildContentSource(params) {
|
|
|
937
939
|
}
|
|
938
940
|
if (document) {
|
|
939
941
|
prepareDocumentForMarkdown(document, url, signal);
|
|
940
|
-
const contentRoot = findContentRoot(document);
|
|
942
|
+
const contentRoot = TransformHeuristics.findContentRoot(document);
|
|
941
943
|
return {
|
|
942
944
|
...base,
|
|
943
945
|
sourceHtml: contentRoot ?? serializeDocumentForMarkdown(document, html),
|
|
@@ -980,7 +982,8 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
980
982
|
...(context.document ? { document: context.document } : {}),
|
|
981
983
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
982
984
|
}));
|
|
983
|
-
if (context.primaryHeading &&
|
|
985
|
+
if (context.primaryHeading &&
|
|
986
|
+
TransformHeuristics.isGithubRepositoryRootUrl(url)) {
|
|
984
987
|
content = stripLeadingHeading(content, context.primaryHeading);
|
|
985
988
|
}
|
|
986
989
|
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
@@ -1014,7 +1017,7 @@ function stripLeadingHeading(markdown, headingText) {
|
|
|
1014
1017
|
const lines = markdown.split('\n');
|
|
1015
1018
|
const target = normalizeHeadingText(headingText);
|
|
1016
1019
|
let nonEmptySeen = 0;
|
|
1017
|
-
for (let i = 0; i < lines.length && nonEmptySeen <
|
|
1020
|
+
for (let i = 0; i < lines.length && nonEmptySeen < HEADING_SCAN_LIMIT; i += 1) {
|
|
1018
1021
|
const trimmed = lines[i]?.trim() ?? '';
|
|
1019
1022
|
if (!trimmed)
|
|
1020
1023
|
continue;
|
|
@@ -1040,7 +1043,7 @@ function hasBinaryIndicators(content) {
|
|
|
1040
1043
|
return false;
|
|
1041
1044
|
if (content.includes('\x00'))
|
|
1042
1045
|
return true;
|
|
1043
|
-
const sampleSize = Math.min(content.length,
|
|
1046
|
+
const sampleSize = Math.min(content.length, BINARY_SAMPLE_SIZE);
|
|
1044
1047
|
let replacementCount = 0;
|
|
1045
1048
|
let i = -1;
|
|
1046
1049
|
while ((i = content.indexOf(REPLACEMENT_CHAR, i + 1)) !== -1 &&
|
|
@@ -1052,37 +1055,42 @@ function hasBinaryIndicators(content) {
|
|
|
1052
1055
|
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
1053
1056
|
const signal = buildTransformSignal(options.signal);
|
|
1054
1057
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1055
|
-
let completed = null;
|
|
1056
1058
|
try {
|
|
1057
1059
|
abortPolicy.throwIfAborted(signal, url, 'transform:begin');
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
html,
|
|
1063
|
-
url,
|
|
1064
|
-
includeMetadata: options.includeMetadata,
|
|
1065
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1066
|
-
}));
|
|
1067
|
-
if (raw) {
|
|
1068
|
-
completed = raw;
|
|
1069
|
-
return raw;
|
|
1070
|
-
}
|
|
1071
|
-
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1072
|
-
html,
|
|
1073
|
-
url,
|
|
1074
|
-
includeMetadata: options.includeMetadata,
|
|
1075
|
-
...(signal ? { signal } : {}),
|
|
1076
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1077
|
-
}));
|
|
1078
|
-
const result = buildMarkdownFromContext(context, url, signal);
|
|
1079
|
-
completed = result;
|
|
1060
|
+
validateBinaryContent(html, url);
|
|
1061
|
+
const result = tryRawContentPipeline(html, url, options) ??
|
|
1062
|
+
tryHtmlContentPipeline(html, url, options, signal);
|
|
1063
|
+
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1080
1064
|
return result;
|
|
1081
1065
|
}
|
|
1082
|
-
|
|
1083
|
-
|
|
1066
|
+
catch (error) {
|
|
1067
|
+
stageTracker.end(totalStage);
|
|
1068
|
+
throw error;
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
function validateBinaryContent(html, url) {
|
|
1072
|
+
if (hasBinaryIndicators(html)) {
|
|
1073
|
+
throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
|
|
1084
1074
|
}
|
|
1085
1075
|
}
|
|
1076
|
+
function tryRawContentPipeline(html, url, options) {
|
|
1077
|
+
return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1078
|
+
html,
|
|
1079
|
+
url,
|
|
1080
|
+
includeMetadata: options.includeMetadata,
|
|
1081
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1082
|
+
}));
|
|
1083
|
+
}
|
|
1084
|
+
function tryHtmlContentPipeline(html, url, options, signal) {
|
|
1085
|
+
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1086
|
+
html,
|
|
1087
|
+
url,
|
|
1088
|
+
includeMetadata: options.includeMetadata,
|
|
1089
|
+
...(signal ? { signal } : {}),
|
|
1090
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1091
|
+
}));
|
|
1092
|
+
return buildMarkdownFromContext(context, url, signal);
|
|
1093
|
+
}
|
|
1086
1094
|
export function getTransformPoolStats() {
|
|
1087
1095
|
return getWorkerPoolStats();
|
|
1088
1096
|
}
|
|
@@ -1092,13 +1100,6 @@ export async function shutdownTransformWorkerPool() {
|
|
|
1092
1100
|
function transformInputInProcess(htmlOrBuffer, url, options) {
|
|
1093
1101
|
return transformHtmlToMarkdownInProcess(decodeInput(htmlOrBuffer, options.encoding), url, options);
|
|
1094
1102
|
}
|
|
1095
|
-
function endTotalTransformStage(context, result) {
|
|
1096
|
-
if (!result) {
|
|
1097
|
-
stageTracker.end(context);
|
|
1098
|
-
return;
|
|
1099
|
-
}
|
|
1100
|
-
stageTracker.end(context, { truncated: result.truncated });
|
|
1101
|
-
}
|
|
1102
1103
|
function buildWorkerTransformOptions(options) {
|
|
1103
1104
|
return {
|
|
1104
1105
|
includeMetadata: options.includeMetadata,
|
|
@@ -1153,15 +1154,15 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
|
|
|
1153
1154
|
}
|
|
1154
1155
|
async function transformInputToMarkdown(htmlOrBuffer, url, options) {
|
|
1155
1156
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1156
|
-
let completed = null;
|
|
1157
1157
|
try {
|
|
1158
1158
|
abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
|
|
1159
1159
|
const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
|
|
1160
|
-
|
|
1160
|
+
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1161
1161
|
return result;
|
|
1162
1162
|
}
|
|
1163
|
-
|
|
1164
|
-
|
|
1163
|
+
catch (error) {
|
|
1164
|
+
stageTracker.end(totalStage);
|
|
1165
|
+
throw error;
|
|
1165
1166
|
}
|
|
1166
1167
|
}
|
|
1167
1168
|
export async function transformHtmlToMarkdown(html, url, options) {
|
|
@@ -17,13 +17,13 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
17
17
|
private readonly minCapacity;
|
|
18
18
|
private readonly maxCapacity;
|
|
19
19
|
private readonly queue;
|
|
20
|
-
private queueHead;
|
|
21
20
|
private readonly inflight;
|
|
22
21
|
private readonly cancelAcks;
|
|
23
22
|
private readonly timeoutMs;
|
|
24
23
|
private readonly queueMax;
|
|
25
24
|
private closed;
|
|
26
25
|
private taskIdSeq;
|
|
26
|
+
private busyCount;
|
|
27
27
|
constructor(size: number, timeoutMs: number);
|
|
28
28
|
transform(html: string, url: string, options: {
|
|
29
29
|
includeMetadata: boolean;
|
|
@@ -44,8 +44,6 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
44
44
|
private ensureOpen;
|
|
45
45
|
private createPendingTask;
|
|
46
46
|
private onAbortSignal;
|
|
47
|
-
private resolveCancelAck;
|
|
48
|
-
private waitForCancelAck;
|
|
49
47
|
private abortInflight;
|
|
50
48
|
private clearAbortListener;
|
|
51
49
|
private spawnWorker;
|
|
@@ -57,11 +55,10 @@ declare class WorkerPool implements TransformWorkerPool {
|
|
|
57
55
|
private failTask;
|
|
58
56
|
private maybeScaleUp;
|
|
59
57
|
private drainQueue;
|
|
60
|
-
private takeNextQueuedTask;
|
|
61
58
|
private dispatchFromQueue;
|
|
59
|
+
private registerInflight;
|
|
60
|
+
private sendToWorker;
|
|
62
61
|
private finalizeTask;
|
|
63
|
-
private findQueuedIndex;
|
|
64
|
-
private maybeCompactQueue;
|
|
65
62
|
}
|
|
66
63
|
export declare function getOrCreateWorkerPool(): WorkerPool;
|
|
67
64
|
export declare function getWorkerPoolStats(): {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"worker-pool.d.ts","sourceRoot":"","sources":["../../src/transform/worker-pool.ts"],"names":[],"mappings":"AA0BA,OAAO,KAAK,EACV,uBAAuB,EAGxB,MAAM,YAAY,CAAC;AAqJpB,UAAU,mBAAmB;IAC3B,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IACvB,aAAa,IAAI,MAAM,CAAC;IACxB,gBAAgB,IAAI,MAAM,CAAC;IAC3B,WAAW,IAAI,MAAM,CAAC;CACvB;AAiID,cAAM,UAAW,YAAW,mBAAmB;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAkC;IAExE,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAkC;IAC1D,OAAO,CAAC,QAAQ,CAAS;IACzB,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAChD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAoB;IAEhD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAgC;IACtD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAmC;IAC5D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAA0B;IAErD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;IAClC,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAK;IACtB,OAAO,CAAC,SAAS,CAAK;gBAEV,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;IASrC,SAAS,CACb,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GACA,OAAO,CAAC,uBAAuB,CAAC;IAC7B,SAAS,CACb,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE;QACP,eAAe,EAAE,OAAO,CAAC;QACzB,MAAM,CAAC,EAAE,WAAW,CAAC;QACrB,cAAc,CAAC,EAAE,OAAO,CAAC;QACzB,QAAQ,CAAC,EAAE,MAAM,CAAC;KACnB,GACA,OAAO,CAAC,uBAAuB,CAAC;IAmCnC,aAAa,IAAI,MAAM;IAIvB,gBAAgB,IAAI,MAAM;IAI1B,WAAW,IAAI,MAAM;IAIrB,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAWpB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IA+B5B,OAAO,CAAC,UAAU;IAIlB,OAAO,CAAC,iBAAiB;IAkDzB,OAAO,CAAC,aAAa;YA4BP,aAAa;IA2B3B,OAAO,CAAC,kBAAkB;IAY1B,OAAO,CAAC,WAAW;IAmCnB,OAAO,CAAC,cAAc;IAuBtB,OAAO,CAAC,aAAa;IAYrB,OAAO,CAAC,eAAe;IAsDvB,OAAO,CAAC,YAAY;IAWpB,OAAO,CAAC,QAAQ;IAUhB,OAAO,CAAC,QAAQ;IAWhB,OAAO,CAAC,YAAY;IASpB,OAAO,CAAC,UAAU;IA2BlB,OAAO,CAAC,iBAAiB;IA4BzB,OAAO,CAAC,gBAAgB;IA8CxB,OAAO,CAAC,YAAY;IA0BpB,OAAO,CAAC,YAAY;CAOrB;AAMD,wBAAgB,qBAAqB,IAAI,UAAU,CAIlD;AAED,wBAAgB,kBAAkB,IAAI;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB,GAAG,IAAI,CAOP;AAED,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAIxD"}
|