@j0hanz/superfetch 2.3.0 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -9
- package/dist/assets/logo.svg +24835 -0
- package/dist/cache.js +58 -4
- package/dist/config.d.ts +2 -0
- package/dist/config.js +2 -0
- package/dist/dom-noise-removal.js +15 -13
- package/dist/fetch.js +16 -25
- package/dist/http-native.js +19 -3
- package/dist/markdown-cleanup.d.ts +6 -12
- package/dist/markdown-cleanup.js +259 -25
- package/dist/mcp.js +27 -10
- package/dist/observability.d.ts +2 -0
- package/dist/observability.js +25 -0
- package/dist/tools.d.ts +6 -4
- package/dist/tools.js +39 -13
- package/dist/transform-types.d.ts +38 -0
- package/dist/transform.d.ts +12 -6
- package/dist/transform.js +158 -267
- package/package.json +1 -2
package/dist/transform.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { randomUUID } from 'node:crypto';
|
|
2
2
|
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
3
|
-
import os from 'node:os';
|
|
4
3
|
import { performance } from 'node:perf_hooks';
|
|
5
4
|
import { Worker } from 'node:worker_threads';
|
|
6
5
|
import { parseHTML } from 'linkedom';
|
|
@@ -12,15 +11,9 @@ import { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
|
12
11
|
import { FetchError, getErrorMessage } from './errors.js';
|
|
13
12
|
import { isRawTextContentUrl } from './fetch.js';
|
|
14
13
|
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
15
|
-
import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
14
|
+
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
|
|
16
15
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
17
16
|
import { isObject } from './type-guards.js';
|
|
18
|
-
// Re-export language detection for backward compatibility
|
|
19
|
-
export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
|
|
20
|
-
// Re-export markdown cleanup for backward compatibility
|
|
21
|
-
export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
|
|
22
|
-
// Re-export DOM noise removal for backward compatibility
|
|
23
|
-
export { removeNoiseFromHtml } from './dom-noise-removal.js';
|
|
24
17
|
function getAbortReason(signal) {
|
|
25
18
|
if (!isObject(signal))
|
|
26
19
|
return undefined;
|
|
@@ -48,25 +41,48 @@ function publishTransformEvent(event) {
|
|
|
48
41
|
/* empty */
|
|
49
42
|
}
|
|
50
43
|
}
|
|
51
|
-
export function startTransformStage(url, stage) {
|
|
52
|
-
if (!transformChannel.hasSubscribers)
|
|
44
|
+
export function startTransformStage(url, stage, budget) {
|
|
45
|
+
if (!transformChannel.hasSubscribers && !budget)
|
|
53
46
|
return null;
|
|
54
|
-
|
|
47
|
+
const remainingBudgetMs = budget
|
|
48
|
+
? budget.totalBudgetMs - budget.elapsedMs
|
|
49
|
+
: undefined;
|
|
50
|
+
const base = {
|
|
55
51
|
stage,
|
|
56
52
|
startTime: performance.now(),
|
|
57
53
|
url: redactUrl(url),
|
|
58
54
|
};
|
|
55
|
+
if (remainingBudgetMs !== undefined && budget) {
|
|
56
|
+
return {
|
|
57
|
+
...base,
|
|
58
|
+
budgetMs: remainingBudgetMs,
|
|
59
|
+
totalBudgetMs: budget.totalBudgetMs,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
return base;
|
|
59
63
|
}
|
|
60
64
|
export function endTransformStage(context, options) {
|
|
61
65
|
if (!context)
|
|
62
|
-
return;
|
|
66
|
+
return 0;
|
|
67
|
+
const durationMs = performance.now() - context.startTime;
|
|
63
68
|
const requestId = getRequestId();
|
|
64
69
|
const operationId = getOperationId();
|
|
70
|
+
if (context.totalBudgetMs !== undefined) {
|
|
71
|
+
const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
|
|
72
|
+
if (durationMs > warnThresholdMs) {
|
|
73
|
+
logWarn('Transform stage exceeded warning threshold', {
|
|
74
|
+
stage: context.stage,
|
|
75
|
+
durationMs: Math.round(durationMs),
|
|
76
|
+
thresholdMs: Math.round(warnThresholdMs),
|
|
77
|
+
url: context.url,
|
|
78
|
+
});
|
|
79
|
+
}
|
|
80
|
+
}
|
|
65
81
|
const event = {
|
|
66
82
|
v: 1,
|
|
67
83
|
type: 'stage',
|
|
68
84
|
stage: context.stage,
|
|
69
|
-
durationMs
|
|
85
|
+
durationMs,
|
|
70
86
|
url: context.url,
|
|
71
87
|
...(requestId ? { requestId } : {}),
|
|
72
88
|
...(operationId ? { operationId } : {}),
|
|
@@ -75,14 +91,22 @@ export function endTransformStage(context, options) {
|
|
|
75
91
|
: {}),
|
|
76
92
|
};
|
|
77
93
|
publishTransformEvent(event);
|
|
94
|
+
return durationMs;
|
|
78
95
|
}
|
|
79
|
-
function runTransformStage(url, stage, fn) {
|
|
80
|
-
|
|
96
|
+
function runTransformStage(url, stage, fn, budget) {
|
|
97
|
+
if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
|
|
98
|
+
throw new FetchError('Transform budget exhausted', url, 504, {
|
|
99
|
+
reason: 'timeout',
|
|
100
|
+
stage: `${stage}:budget_exhausted`,
|
|
101
|
+
elapsedMs: budget.elapsedMs,
|
|
102
|
+
totalBudgetMs: budget.totalBudgetMs,
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
const context = startTransformStage(url, stage, budget);
|
|
81
106
|
try {
|
|
82
107
|
return fn();
|
|
83
108
|
}
|
|
84
109
|
finally {
|
|
85
|
-
// Emit duration even if the stage throws; callers decide how to handle the error.
|
|
86
110
|
endTransformStage(context);
|
|
87
111
|
}
|
|
88
112
|
}
|
|
@@ -340,21 +364,22 @@ function applyBaseUri(document, url) {
|
|
|
340
364
|
});
|
|
341
365
|
}
|
|
342
366
|
}
|
|
343
|
-
// DOM noise removal functions moved to ./dom-noise-removal.ts
|
|
344
367
|
function buildInlineCode(content) {
|
|
345
|
-
|
|
346
|
-
let
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
368
|
+
let maxBackticks = 0;
|
|
369
|
+
let currentRun = 0;
|
|
370
|
+
for (const char of content) {
|
|
371
|
+
if (char === '`') {
|
|
372
|
+
currentRun++;
|
|
373
|
+
}
|
|
374
|
+
else {
|
|
375
|
+
if (currentRun > maxBackticks)
|
|
376
|
+
maxBackticks = currentRun;
|
|
377
|
+
currentRun = 0;
|
|
352
378
|
}
|
|
353
379
|
}
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
// CommonMark recommends padding when the code starts/ends with a backtick.
|
|
380
|
+
if (currentRun > maxBackticks)
|
|
381
|
+
maxBackticks = currentRun;
|
|
382
|
+
const delimiter = '`'.repeat(maxBackticks + 1);
|
|
358
383
|
const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
|
|
359
384
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
360
385
|
}
|
|
@@ -494,6 +519,36 @@ function createCustomTranslators() {
|
|
|
494
519
|
.join('\n');
|
|
495
520
|
return { content: items ? `\n${items}\n\n` : '' };
|
|
496
521
|
},
|
|
522
|
+
div: (ctx) => {
|
|
523
|
+
if (!isObject(ctx) || !isObject(ctx.node)) {
|
|
524
|
+
return {};
|
|
525
|
+
}
|
|
526
|
+
const node = ctx.node;
|
|
527
|
+
const className = typeof node.attribs?.class === 'string' ? node.attribs.class : '';
|
|
528
|
+
if (!className.includes('type')) {
|
|
529
|
+
return {};
|
|
530
|
+
}
|
|
531
|
+
return {
|
|
532
|
+
postprocess: ({ content }) => {
|
|
533
|
+
const lines = content.split('\n');
|
|
534
|
+
const separated = [];
|
|
535
|
+
for (let i = 0; i < lines.length; i++) {
|
|
536
|
+
const line = lines[i] ?? '';
|
|
537
|
+
const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
|
|
538
|
+
separated.push(line);
|
|
539
|
+
if (line.trim() &&
|
|
540
|
+
nextLine.trim() &&
|
|
541
|
+
line.includes(':') &&
|
|
542
|
+
nextLine.includes(':') &&
|
|
543
|
+
!line.startsWith(' ') &&
|
|
544
|
+
!nextLine.startsWith(' ')) {
|
|
545
|
+
separated.push('');
|
|
546
|
+
}
|
|
547
|
+
}
|
|
548
|
+
return separated.join('\n');
|
|
549
|
+
},
|
|
550
|
+
};
|
|
551
|
+
},
|
|
497
552
|
kbd: () => ({
|
|
498
553
|
postprocess: ({ content }) => `\`${content}\``,
|
|
499
554
|
}),
|
|
@@ -506,7 +561,8 @@ function createCustomTranslators() {
|
|
|
506
561
|
sup: () => ({
|
|
507
562
|
postprocess: ({ content }) => `^${content}^`,
|
|
508
563
|
}),
|
|
509
|
-
//
|
|
564
|
+
// Note: section translator removed in favor of HTML preprocessing
|
|
565
|
+
// See preprocessPropertySections() for the fix to TypeDoc section spacing
|
|
510
566
|
pre: (ctx) => buildPreTranslator(ctx),
|
|
511
567
|
};
|
|
512
568
|
}
|
|
@@ -523,16 +579,20 @@ function getMarkdownConverter() {
|
|
|
523
579
|
markdownInstance ??= createMarkdownInstance();
|
|
524
580
|
return markdownInstance;
|
|
525
581
|
}
|
|
582
|
+
function preprocessPropertySections(html) {
|
|
583
|
+
const result = html.replace(/<\/section>\s*(<section[^>]*class="[^"]*tsd-panel[^"]*tsd-member[^"]*"[^>]*>)/g, '</section><p> </p>$1');
|
|
584
|
+
return result;
|
|
585
|
+
}
|
|
526
586
|
function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
|
|
527
587
|
throwIfAborted(signal, url, 'markdown:begin');
|
|
528
588
|
const cleanedHtml = skipNoiseRemoval
|
|
529
589
|
? html
|
|
530
590
|
: runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
531
591
|
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
532
|
-
const
|
|
592
|
+
const preprocessedHtml = runTransformStage(url, 'markdown:preprocess', () => preprocessPropertySections(cleanedHtml));
|
|
593
|
+
const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(preprocessedHtml).trim());
|
|
533
594
|
throwIfAborted(signal, url, 'markdown:translated');
|
|
534
|
-
|
|
535
|
-
return promoteOrphanHeadings(cleaned);
|
|
595
|
+
return cleanupMarkdownArtifacts(content);
|
|
536
596
|
}
|
|
537
597
|
function appendMetadataFooter(content, metadata, url) {
|
|
538
598
|
const footer = buildMetadataFooter(metadata, url);
|
|
@@ -554,223 +614,6 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
554
614
|
return buildMetadataFooter(metadata, url);
|
|
555
615
|
}
|
|
556
616
|
}
|
|
557
|
-
// Markdown cleanup functions moved to ./markdown-cleanup.ts
|
|
558
|
-
function formatFetchedDate(isoString) {
|
|
559
|
-
try {
|
|
560
|
-
const date = new Date(isoString);
|
|
561
|
-
const day = String(date.getDate()).padStart(2, '0');
|
|
562
|
-
const month = String(date.getMonth() + 1).padStart(2, '0');
|
|
563
|
-
const year = date.getFullYear();
|
|
564
|
-
return `${day}-${month}-${year}`;
|
|
565
|
-
}
|
|
566
|
-
catch {
|
|
567
|
-
return isoString;
|
|
568
|
-
}
|
|
569
|
-
}
|
|
570
|
-
function buildMetadataFooter(metadata, fallbackUrl) {
|
|
571
|
-
if (!metadata)
|
|
572
|
-
return '';
|
|
573
|
-
const lines = ['---', ''];
|
|
574
|
-
const url = metadata.url || fallbackUrl;
|
|
575
|
-
const parts = [];
|
|
576
|
-
if (metadata.title)
|
|
577
|
-
parts.push(`_${metadata.title}_`);
|
|
578
|
-
if (metadata.author)
|
|
579
|
-
parts.push(`_${metadata.author}_`);
|
|
580
|
-
if (url)
|
|
581
|
-
parts.push(`[_Original Source_](${url})`);
|
|
582
|
-
if (metadata.fetchedAt) {
|
|
583
|
-
const formattedDate = formatFetchedDate(metadata.fetchedAt);
|
|
584
|
-
parts.push(`_${formattedDate}_`);
|
|
585
|
-
}
|
|
586
|
-
if (parts.length > 0) {
|
|
587
|
-
lines.push(` ${parts.join(' | ')}`);
|
|
588
|
-
}
|
|
589
|
-
if (metadata.description) {
|
|
590
|
-
lines.push(` <sub>${metadata.description}</sub>`);
|
|
591
|
-
}
|
|
592
|
-
return lines.join('\n');
|
|
593
|
-
}
|
|
594
|
-
const HEADING_PATTERN = /^#{1,6}\s/m;
|
|
595
|
-
const LIST_PATTERN = /^(?:[-*+])\s/m;
|
|
596
|
-
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
|
|
597
|
-
function containsMarkdownHeading(content) {
|
|
598
|
-
return HEADING_PATTERN.test(content);
|
|
599
|
-
}
|
|
600
|
-
function containsMarkdownList(content) {
|
|
601
|
-
return LIST_PATTERN.test(content);
|
|
602
|
-
}
|
|
603
|
-
function containsFencedCodeBlock(content) {
|
|
604
|
-
const first = content.indexOf('```');
|
|
605
|
-
if (first === -1)
|
|
606
|
-
return false;
|
|
607
|
-
return content.includes('```', first + 3);
|
|
608
|
-
}
|
|
609
|
-
function looksLikeMarkdown(content) {
|
|
610
|
-
return (containsMarkdownHeading(content) ||
|
|
611
|
-
containsMarkdownList(content) ||
|
|
612
|
-
containsFencedCodeBlock(content));
|
|
613
|
-
}
|
|
614
|
-
function detectLineEnding(content) {
|
|
615
|
-
return content.includes('\r\n') ? '\r\n' : '\n';
|
|
616
|
-
}
|
|
617
|
-
const FRONTMATTER_DELIMITER = '---';
|
|
618
|
-
function findFrontmatterLines(content) {
|
|
619
|
-
const lineEnding = detectLineEnding(content);
|
|
620
|
-
const lines = content.split(lineEnding);
|
|
621
|
-
if (lines[0] !== FRONTMATTER_DELIMITER)
|
|
622
|
-
return null;
|
|
623
|
-
const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
|
|
624
|
-
if (endIndex === -1)
|
|
625
|
-
return null;
|
|
626
|
-
return { lineEnding, lines, endIndex };
|
|
627
|
-
}
|
|
628
|
-
function stripOptionalQuotes(value) {
|
|
629
|
-
const trimmed = value.trim();
|
|
630
|
-
if (trimmed.length < 2)
|
|
631
|
-
return trimmed;
|
|
632
|
-
const first = trimmed[0];
|
|
633
|
-
const last = trimmed[trimmed.length - 1];
|
|
634
|
-
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
635
|
-
return trimmed.slice(1, -1).trim();
|
|
636
|
-
}
|
|
637
|
-
return trimmed;
|
|
638
|
-
}
|
|
639
|
-
function parseFrontmatterEntry(line) {
|
|
640
|
-
const trimmed = line.trim();
|
|
641
|
-
if (!trimmed)
|
|
642
|
-
return null;
|
|
643
|
-
const separatorIndex = trimmed.indexOf(':');
|
|
644
|
-
if (separatorIndex <= 0)
|
|
645
|
-
return null;
|
|
646
|
-
const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
|
|
647
|
-
const value = trimmed.slice(separatorIndex + 1);
|
|
648
|
-
return { key, value };
|
|
649
|
-
}
|
|
650
|
-
function isTitleKey(key) {
|
|
651
|
-
return key === 'title' || key === 'name';
|
|
652
|
-
}
|
|
653
|
-
function extractTitleFromHeading(content) {
|
|
654
|
-
const lineEnding = detectLineEnding(content);
|
|
655
|
-
const lines = content.split(lineEnding);
|
|
656
|
-
for (const line of lines) {
|
|
657
|
-
const trimmed = line.trim();
|
|
658
|
-
if (!trimmed)
|
|
659
|
-
continue;
|
|
660
|
-
let index = 0;
|
|
661
|
-
while (index < trimmed.length && trimmed[index] === '#') {
|
|
662
|
-
index += 1;
|
|
663
|
-
}
|
|
664
|
-
if (index === 0 || index > 6)
|
|
665
|
-
return undefined;
|
|
666
|
-
const nextChar = trimmed[index];
|
|
667
|
-
if (nextChar !== ' ' && nextChar !== '\t')
|
|
668
|
-
return undefined;
|
|
669
|
-
const heading = trimmed.slice(index).trim();
|
|
670
|
-
return heading.length > 0 ? heading : undefined;
|
|
671
|
-
}
|
|
672
|
-
return undefined;
|
|
673
|
-
}
|
|
674
|
-
function extractTitleFromRawMarkdown(content) {
|
|
675
|
-
const frontmatter = findFrontmatterLines(content);
|
|
676
|
-
if (!frontmatter) {
|
|
677
|
-
return extractTitleFromHeading(content);
|
|
678
|
-
}
|
|
679
|
-
const { lines, endIndex } = frontmatter;
|
|
680
|
-
const entry = lines
|
|
681
|
-
.slice(1, endIndex)
|
|
682
|
-
.map((line) => parseFrontmatterEntry(line))
|
|
683
|
-
.find((parsed) => parsed !== null && isTitleKey(parsed.key));
|
|
684
|
-
if (!entry)
|
|
685
|
-
return undefined;
|
|
686
|
-
const value = stripOptionalQuotes(entry.value);
|
|
687
|
-
return value || undefined;
|
|
688
|
-
}
|
|
689
|
-
function hasMarkdownSourceLine(content) {
|
|
690
|
-
const lineEnding = detectLineEnding(content);
|
|
691
|
-
const lines = content.split(lineEnding);
|
|
692
|
-
const limit = Math.min(lines.length, 50);
|
|
693
|
-
for (let index = 0; index < limit; index += 1) {
|
|
694
|
-
const line = lines[index];
|
|
695
|
-
if (!line)
|
|
696
|
-
continue;
|
|
697
|
-
if (line.trimStart().toLowerCase().startsWith('source:')) {
|
|
698
|
-
return true;
|
|
699
|
-
}
|
|
700
|
-
}
|
|
701
|
-
return false;
|
|
702
|
-
}
|
|
703
|
-
function addSourceToMarkdownMarkdownFormat(content, url) {
|
|
704
|
-
if (hasMarkdownSourceLine(content))
|
|
705
|
-
return content;
|
|
706
|
-
const lineEnding = detectLineEnding(content);
|
|
707
|
-
const lines = content.split(lineEnding);
|
|
708
|
-
const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
|
|
709
|
-
if (firstNonEmptyIndex !== -1) {
|
|
710
|
-
const firstLine = lines[firstNonEmptyIndex];
|
|
711
|
-
if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
|
|
712
|
-
const insertAt = firstNonEmptyIndex + 1;
|
|
713
|
-
const updated = [
|
|
714
|
-
...lines.slice(0, insertAt),
|
|
715
|
-
'',
|
|
716
|
-
`Source: ${url}`,
|
|
717
|
-
'',
|
|
718
|
-
...lines.slice(insertAt),
|
|
719
|
-
];
|
|
720
|
-
return updated.join(lineEnding);
|
|
721
|
-
}
|
|
722
|
-
}
|
|
723
|
-
return [`Source: ${url}`, '', content].join(lineEnding);
|
|
724
|
-
}
|
|
725
|
-
function addSourceToMarkdown(content, url) {
|
|
726
|
-
const frontmatter = findFrontmatterLines(content);
|
|
727
|
-
if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
|
|
728
|
-
return addSourceToMarkdownMarkdownFormat(content, url);
|
|
729
|
-
}
|
|
730
|
-
if (!frontmatter) {
|
|
731
|
-
return `---\nsource: "${url}"\n---\n\n${content}`;
|
|
732
|
-
}
|
|
733
|
-
const { lineEnding, lines, endIndex } = frontmatter;
|
|
734
|
-
const bodyLines = lines.slice(1, endIndex);
|
|
735
|
-
const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
|
|
736
|
-
if (hasSource)
|
|
737
|
-
return content;
|
|
738
|
-
const updatedLines = [
|
|
739
|
-
lines[0],
|
|
740
|
-
...bodyLines,
|
|
741
|
-
`source: "${url}"`,
|
|
742
|
-
...lines.slice(endIndex),
|
|
743
|
-
];
|
|
744
|
-
return updatedLines.join(lineEnding);
|
|
745
|
-
}
|
|
746
|
-
function hasFrontmatter(trimmed) {
|
|
747
|
-
return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
|
|
748
|
-
}
|
|
749
|
-
function looksLikeHtmlDocument(trimmed) {
|
|
750
|
-
return HTML_DOCUMENT_PATTERN.test(trimmed);
|
|
751
|
-
}
|
|
752
|
-
function countCommonHtmlTags(content) {
|
|
753
|
-
const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
|
|
754
|
-
[];
|
|
755
|
-
return matches.length;
|
|
756
|
-
}
|
|
757
|
-
function isRawTextContent(content) {
|
|
758
|
-
const trimmed = content.trim();
|
|
759
|
-
const isHtmlDocument = looksLikeHtmlDocument(trimmed);
|
|
760
|
-
const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
|
|
761
|
-
const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
|
|
762
|
-
const isMarkdown = looksLikeMarkdown(content);
|
|
763
|
-
return (!isHtmlDocument &&
|
|
764
|
-
(hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
|
|
765
|
-
}
|
|
766
|
-
function isLikelyHtmlContent(content) {
|
|
767
|
-
const trimmed = content.trim();
|
|
768
|
-
if (!trimmed)
|
|
769
|
-
return false;
|
|
770
|
-
if (looksLikeHtmlDocument(trimmed))
|
|
771
|
-
return true;
|
|
772
|
-
return countCommonHtmlTags(content) > 2;
|
|
773
|
-
}
|
|
774
617
|
function shouldPreserveRawContent(url, content) {
|
|
775
618
|
if (isRawTextContentUrl(url)) {
|
|
776
619
|
return !isLikelyHtmlContent(content);
|
|
@@ -1189,11 +1032,11 @@ const workerMessageSchema = z.discriminatedUnion('type', [
|
|
|
1189
1032
|
}),
|
|
1190
1033
|
]);
|
|
1191
1034
|
let pool = null;
|
|
1035
|
+
const POOL_MIN_WORKERS = 2;
|
|
1036
|
+
const POOL_MAX_WORKERS = 4;
|
|
1037
|
+
const POOL_SCALE_THRESHOLD = 0.5;
|
|
1192
1038
|
function resolveDefaultWorkerCount() {
|
|
1193
|
-
|
|
1194
|
-
? os.availableParallelism()
|
|
1195
|
-
: os.cpus().length;
|
|
1196
|
-
return Math.min(16, Math.max(1, parallelism - 1));
|
|
1039
|
+
return POOL_MIN_WORKERS;
|
|
1197
1040
|
}
|
|
1198
1041
|
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1199
1042
|
function getOrCreateTransformWorkerPool() {
|
|
@@ -1206,8 +1049,20 @@ export async function shutdownTransformWorkerPool() {
|
|
|
1206
1049
|
await pool.close();
|
|
1207
1050
|
pool = null;
|
|
1208
1051
|
}
|
|
1052
|
+
export function getTransformPoolStats() {
|
|
1053
|
+
if (!pool)
|
|
1054
|
+
return null;
|
|
1055
|
+
return {
|
|
1056
|
+
queueDepth: pool.getQueueDepth(),
|
|
1057
|
+
activeWorkers: pool.getActiveWorkers(),
|
|
1058
|
+
capacity: pool.getCapacity(),
|
|
1059
|
+
};
|
|
1060
|
+
}
|
|
1209
1061
|
class WorkerPool {
|
|
1210
1062
|
workers = [];
|
|
1063
|
+
capacity;
|
|
1064
|
+
minCapacity;
|
|
1065
|
+
maxCapacity;
|
|
1211
1066
|
queue = [];
|
|
1212
1067
|
inflight = new Map();
|
|
1213
1068
|
timeoutMs;
|
|
@@ -1323,12 +1178,11 @@ class WorkerPool {
|
|
|
1323
1178
|
});
|
|
1324
1179
|
}
|
|
1325
1180
|
constructor(size, timeoutMs) {
|
|
1326
|
-
|
|
1181
|
+
this.minCapacity = POOL_MIN_WORKERS;
|
|
1182
|
+
this.maxCapacity = POOL_MAX_WORKERS;
|
|
1183
|
+
this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
|
|
1327
1184
|
this.timeoutMs = timeoutMs;
|
|
1328
|
-
this.queueMax =
|
|
1329
|
-
for (let index = 0; index < safeSize; index += 1) {
|
|
1330
|
-
this.workers.push(this.spawnWorker(index));
|
|
1331
|
-
}
|
|
1185
|
+
this.queueMax = this.maxCapacity * 32;
|
|
1332
1186
|
}
|
|
1333
1187
|
spawnWorker(workerIndex) {
|
|
1334
1188
|
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
|
|
@@ -1426,21 +1280,46 @@ class WorkerPool {
|
|
|
1426
1280
|
this.drainQueue();
|
|
1427
1281
|
});
|
|
1428
1282
|
}
|
|
1283
|
+
/** Scale capacity up if queue pressure exceeds threshold. */
|
|
1284
|
+
maybeScaleUp() {
|
|
1285
|
+
if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
|
|
1286
|
+
this.capacity < this.maxCapacity) {
|
|
1287
|
+
this.capacity += 1;
|
|
1288
|
+
}
|
|
1289
|
+
}
|
|
1429
1290
|
drainQueue() {
|
|
1291
|
+
if (this.closed)
|
|
1292
|
+
return;
|
|
1430
1293
|
if (this.queue.length === 0)
|
|
1431
1294
|
return;
|
|
1295
|
+
this.maybeScaleUp();
|
|
1296
|
+
// First pass: try to find an idle existing worker
|
|
1432
1297
|
for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
|
|
1433
1298
|
const slot = this.workers[workerIndex];
|
|
1434
|
-
if (
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1299
|
+
if (slot && !slot.busy) {
|
|
1300
|
+
this.dispatchQueueTask(workerIndex, slot);
|
|
1301
|
+
if (this.queue.length === 0)
|
|
1302
|
+
return;
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1305
|
+
if (this.workers.length < this.capacity && this.queue.length > 0) {
|
|
1306
|
+
const workerIndex = this.workers.length;
|
|
1307
|
+
const slot = this.spawnWorker(workerIndex);
|
|
1308
|
+
this.workers.push(slot);
|
|
1309
|
+
this.dispatchQueueTask(workerIndex, slot);
|
|
1310
|
+
if (this.workers.length < this.capacity && this.queue.length > 0) {
|
|
1311
|
+
setImmediate(() => {
|
|
1312
|
+
this.drainQueue();
|
|
1313
|
+
});
|
|
1314
|
+
}
|
|
1442
1315
|
}
|
|
1443
1316
|
}
|
|
1317
|
+
dispatchQueueTask(workerIndex, slot) {
|
|
1318
|
+
const task = this.queue.shift();
|
|
1319
|
+
if (!task)
|
|
1320
|
+
return;
|
|
1321
|
+
this.dispatch(workerIndex, slot, task);
|
|
1322
|
+
}
|
|
1444
1323
|
dispatch(workerIndex, slot, task) {
|
|
1445
1324
|
if (this.rejectIfAborted(task))
|
|
1446
1325
|
return;
|
|
@@ -1510,11 +1389,23 @@ class WorkerPool {
|
|
|
1510
1389
|
task.reject(message);
|
|
1511
1390
|
this.restartWorker(workerIndex, slot);
|
|
1512
1391
|
}
|
|
1392
|
+
getQueueDepth() {
|
|
1393
|
+
return this.queue.length;
|
|
1394
|
+
}
|
|
1395
|
+
getActiveWorkers() {
|
|
1396
|
+
return this.workers.filter((s) => s?.busy).length;
|
|
1397
|
+
}
|
|
1398
|
+
getCapacity() {
|
|
1399
|
+
return this.capacity;
|
|
1400
|
+
}
|
|
1513
1401
|
async close() {
|
|
1514
1402
|
if (this.closed)
|
|
1515
1403
|
return;
|
|
1516
1404
|
this.closed = true;
|
|
1517
|
-
const terminations = this.workers
|
|
1405
|
+
const terminations = this.workers
|
|
1406
|
+
.map((slot) => slot?.worker.terminate())
|
|
1407
|
+
.filter((p) => p !== undefined);
|
|
1408
|
+
this.workers.fill(undefined);
|
|
1518
1409
|
this.workers.length = 0;
|
|
1519
1410
|
for (const [id, inflight] of this.inflight.entries()) {
|
|
1520
1411
|
clearTimeout(inflight.timer);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/superfetch",
|
|
3
|
-
"version": "2.3.0",
|
|
3
|
+
"version": "2.4.1",
|
|
4
4
|
"mcpName": "io.github.j0hanz/superfetch",
|
|
5
5
|
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
|
|
6
6
|
"type": "module",
|
|
@@ -59,7 +59,6 @@
|
|
|
59
59
|
"@modelcontextprotocol/sdk": "^1.25.3",
|
|
60
60
|
"@mozilla/readability": "^0.6.0",
|
|
61
61
|
"linkedom": "^0.18.12",
|
|
62
|
-
"lru-cache": "^11.2.5",
|
|
63
62
|
"node-html-markdown": "^2.0.0",
|
|
64
63
|
"undici": "^7.19.2",
|
|
65
64
|
"zod": "^4.3.6"
|