@j0hanz/superfetch 2.2.2 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86):
  1. package/README.md +358 -363
  2. package/dist/assets/logo.svg +24835 -0
  3. package/dist/cache.d.ts +0 -1
  4. package/dist/cache.js +71 -29
  5. package/dist/config.d.ts +2 -1
  6. package/dist/config.js +11 -7
  7. package/dist/crypto.d.ts +0 -1
  8. package/dist/crypto.js +0 -1
  9. package/dist/dom-noise-removal.d.ts +0 -1
  10. package/dist/dom-noise-removal.js +50 -45
  11. package/dist/errors.d.ts +0 -1
  12. package/dist/errors.js +0 -1
  13. package/dist/fetch.d.ts +0 -1
  14. package/dist/fetch.js +61 -54
  15. package/dist/host-normalization.d.ts +1 -0
  16. package/dist/host-normalization.js +47 -0
  17. package/dist/http-native.d.ts +0 -1
  18. package/dist/http-native.js +92 -28
  19. package/dist/index.d.ts +0 -1
  20. package/dist/index.js +0 -1
  21. package/dist/instructions.md +41 -41
  22. package/dist/json.d.ts +0 -1
  23. package/dist/json.js +0 -1
  24. package/dist/language-detection.d.ts +0 -1
  25. package/dist/language-detection.js +10 -2
  26. package/dist/markdown-cleanup.d.ts +6 -13
  27. package/dist/markdown-cleanup.js +252 -34
  28. package/dist/mcp-validator.d.ts +14 -0
  29. package/dist/mcp-validator.js +22 -0
  30. package/dist/mcp.d.ts +0 -1
  31. package/dist/mcp.js +20 -10
  32. package/dist/observability.d.ts +2 -1
  33. package/dist/observability.js +30 -3
  34. package/dist/server-tuning.d.ts +9 -0
  35. package/dist/server-tuning.js +30 -0
  36. package/dist/{http-utils.d.ts → session.d.ts} +0 -25
  37. package/dist/{http-utils.js → session.js} +11 -104
  38. package/dist/tools.d.ts +5 -4
  39. package/dist/tools.js +46 -41
  40. package/dist/transform-types.d.ts +38 -1
  41. package/dist/transform-types.js +0 -1
  42. package/dist/transform.d.ts +12 -7
  43. package/dist/transform.js +205 -344
  44. package/dist/type-guards.d.ts +0 -1
  45. package/dist/type-guards.js +0 -1
  46. package/dist/workers/transform-worker.d.ts +0 -1
  47. package/dist/workers/transform-worker.js +29 -19
  48. package/package.json +84 -85
  49. package/dist/cache.d.ts.map +0 -1
  50. package/dist/cache.js.map +0 -1
  51. package/dist/config.d.ts.map +0 -1
  52. package/dist/config.js.map +0 -1
  53. package/dist/crypto.d.ts.map +0 -1
  54. package/dist/crypto.js.map +0 -1
  55. package/dist/dom-noise-removal.d.ts.map +0 -1
  56. package/dist/dom-noise-removal.js.map +0 -1
  57. package/dist/errors.d.ts.map +0 -1
  58. package/dist/errors.js.map +0 -1
  59. package/dist/fetch.d.ts.map +0 -1
  60. package/dist/fetch.js.map +0 -1
  61. package/dist/http-native.d.ts.map +0 -1
  62. package/dist/http-native.js.map +0 -1
  63. package/dist/http-utils.d.ts.map +0 -1
  64. package/dist/http-utils.js.map +0 -1
  65. package/dist/index.d.ts.map +0 -1
  66. package/dist/index.js.map +0 -1
  67. package/dist/json.d.ts.map +0 -1
  68. package/dist/json.js.map +0 -1
  69. package/dist/language-detection.d.ts.map +0 -1
  70. package/dist/language-detection.js.map +0 -1
  71. package/dist/markdown-cleanup.d.ts.map +0 -1
  72. package/dist/markdown-cleanup.js.map +0 -1
  73. package/dist/mcp.d.ts.map +0 -1
  74. package/dist/mcp.js.map +0 -1
  75. package/dist/observability.d.ts.map +0 -1
  76. package/dist/observability.js.map +0 -1
  77. package/dist/tools.d.ts.map +0 -1
  78. package/dist/tools.js.map +0 -1
  79. package/dist/transform-types.d.ts.map +0 -1
  80. package/dist/transform-types.js.map +0 -1
  81. package/dist/transform.d.ts.map +0 -1
  82. package/dist/transform.js.map +0 -1
  83. package/dist/type-guards.d.ts.map +0 -1
  84. package/dist/type-guards.js.map +0 -1
  85. package/dist/workers/transform-worker.d.ts.map +0 -1
  86. package/dist/workers/transform-worker.js.map +0 -1
package/dist/transform.js CHANGED
@@ -1,6 +1,5 @@
1
1
  import { randomUUID } from 'node:crypto';
2
2
  import diagnosticsChannel from 'node:diagnostics_channel';
3
- import os from 'node:os';
4
3
  import { performance } from 'node:perf_hooks';
5
4
  import { Worker } from 'node:worker_threads';
6
5
  import { parseHTML } from 'linkedom';
@@ -12,15 +11,9 @@ import { removeNoiseFromHtml } from './dom-noise-removal.js';
12
11
  import { FetchError, getErrorMessage } from './errors.js';
13
12
  import { isRawTextContentUrl } from './fetch.js';
14
13
  import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
15
- import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
14
+ import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
16
15
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
17
16
  import { isObject } from './type-guards.js';
18
- // Re-export language detection for backward compatibility
19
- export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
20
- // Re-export markdown cleanup for backward compatibility
21
- export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
22
- // Re-export DOM noise removal for backward compatibility
23
- export { removeNoiseFromHtml } from './dom-noise-removal.js';
24
17
  function getAbortReason(signal) {
25
18
  if (!isObject(signal))
26
19
  return undefined;
@@ -34,6 +27,10 @@ const CODE_BLOCK = {
34
27
  },
35
28
  };
36
29
  const transformChannel = diagnosticsChannel.channel('superfetch.transform');
30
+ const LOG_URL_MAX = 80;
31
+ function truncateUrlForLog(url) {
32
+ return url.substring(0, LOG_URL_MAX);
33
+ }
37
34
  function publishTransformEvent(event) {
38
35
  if (!transformChannel.hasSubscribers)
39
36
  return;
@@ -44,25 +41,48 @@ function publishTransformEvent(event) {
44
41
  /* empty */
45
42
  }
46
43
  }
47
- export function startTransformStage(url, stage) {
48
- if (!transformChannel.hasSubscribers)
44
+ export function startTransformStage(url, stage, budget) {
45
+ if (!transformChannel.hasSubscribers && !budget)
49
46
  return null;
50
- return {
47
+ const remainingBudgetMs = budget
48
+ ? budget.totalBudgetMs - budget.elapsedMs
49
+ : undefined;
50
+ const base = {
51
51
  stage,
52
52
  startTime: performance.now(),
53
53
  url: redactUrl(url),
54
54
  };
55
+ if (remainingBudgetMs !== undefined && budget) {
56
+ return {
57
+ ...base,
58
+ budgetMs: remainingBudgetMs,
59
+ totalBudgetMs: budget.totalBudgetMs,
60
+ };
61
+ }
62
+ return base;
55
63
  }
56
64
  export function endTransformStage(context, options) {
57
65
  if (!context)
58
- return;
66
+ return 0;
67
+ const durationMs = performance.now() - context.startTime;
59
68
  const requestId = getRequestId();
60
69
  const operationId = getOperationId();
70
+ if (context.totalBudgetMs !== undefined) {
71
+ const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
72
+ if (durationMs > warnThresholdMs) {
73
+ logWarn('Transform stage exceeded warning threshold', {
74
+ stage: context.stage,
75
+ durationMs: Math.round(durationMs),
76
+ thresholdMs: Math.round(warnThresholdMs),
77
+ url: context.url,
78
+ });
79
+ }
80
+ }
61
81
  const event = {
62
82
  v: 1,
63
83
  type: 'stage',
64
84
  stage: context.stage,
65
- durationMs: performance.now() - context.startTime,
85
+ durationMs,
66
86
  url: context.url,
67
87
  ...(requestId ? { requestId } : {}),
68
88
  ...(operationId ? { operationId } : {}),
@@ -71,14 +91,22 @@ export function endTransformStage(context, options) {
71
91
  : {}),
72
92
  };
73
93
  publishTransformEvent(event);
94
+ return durationMs;
74
95
  }
75
- function runTransformStage(url, stage, fn) {
76
- const context = startTransformStage(url, stage);
96
+ function runTransformStage(url, stage, fn, budget) {
97
+ if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
98
+ throw new FetchError('Transform budget exhausted', url, 504, {
99
+ reason: 'timeout',
100
+ stage: `${stage}:budget_exhausted`,
101
+ elapsedMs: budget.elapsedMs,
102
+ totalBudgetMs: budget.totalBudgetMs,
103
+ });
104
+ }
105
+ const context = startTransformStage(url, stage, budget);
77
106
  try {
78
107
  return fn();
79
108
  }
80
109
  finally {
81
- // Emit duration even if the stage throws; callers decide how to handle the error.
82
110
  endTransformStage(context);
83
111
  }
84
112
  }
@@ -336,21 +364,22 @@ function applyBaseUri(document, url) {
336
364
  });
337
365
  }
338
366
  }
339
- // DOM noise removal functions moved to ./dom-noise-removal.ts
340
367
  function buildInlineCode(content) {
341
- const runs = content.match(/`+/g);
342
- let longest = '';
343
- if (runs) {
344
- for (const run of runs) {
345
- if (run.length > longest.length) {
346
- longest = run;
347
- }
368
+ let maxBackticks = 0;
369
+ let currentRun = 0;
370
+ for (const char of content) {
371
+ if (char === '`') {
372
+ currentRun++;
373
+ }
374
+ else {
375
+ if (currentRun > maxBackticks)
376
+ maxBackticks = currentRun;
377
+ currentRun = 0;
348
378
  }
349
379
  }
350
- // Use a fence longer than any run of backticks in the content.
351
- const delimiter = `\`${longest}`;
352
- // Only pad when needed to avoid altering code spans unnecessarily.
353
- // CommonMark recommends padding when the code starts/ends with a backtick.
380
+ if (currentRun > maxBackticks)
381
+ maxBackticks = currentRun;
382
+ const delimiter = '`'.repeat(maxBackticks + 1);
354
383
  const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
355
384
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
356
385
  }
@@ -527,8 +556,7 @@ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval)
527
556
  throwIfAborted(signal, url, 'markdown:cleaned');
528
557
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
529
558
  throwIfAborted(signal, url, 'markdown:translated');
530
- const cleaned = cleanupMarkdownArtifacts(content);
531
- return promoteOrphanHeadings(cleaned);
559
+ return cleanupMarkdownArtifacts(content);
532
560
  }
533
561
  function appendMetadataFooter(content, metadata, url) {
534
562
  const footer = buildMetadataFooter(metadata, url);
@@ -550,223 +578,6 @@ export function htmlToMarkdown(html, metadata, options) {
550
578
  return buildMetadataFooter(metadata, url);
551
579
  }
552
580
  }
553
- // Markdown cleanup functions moved to ./markdown-cleanup.ts
554
- function formatFetchedDate(isoString) {
555
- try {
556
- const date = new Date(isoString);
557
- const day = String(date.getDate()).padStart(2, '0');
558
- const month = String(date.getMonth() + 1).padStart(2, '0');
559
- const year = date.getFullYear();
560
- return `${day}-${month}-${year}`;
561
- }
562
- catch {
563
- return isoString;
564
- }
565
- }
566
- function buildMetadataFooter(metadata, fallbackUrl) {
567
- if (!metadata)
568
- return '';
569
- const lines = ['---', ''];
570
- const url = metadata.url || fallbackUrl;
571
- const parts = [];
572
- if (metadata.title)
573
- parts.push(`_${metadata.title}_`);
574
- if (metadata.author)
575
- parts.push(`_${metadata.author}_`);
576
- if (url)
577
- parts.push(`[_Original Source_](${url})`);
578
- if (metadata.fetchedAt) {
579
- const formattedDate = formatFetchedDate(metadata.fetchedAt);
580
- parts.push(`_${formattedDate}_`);
581
- }
582
- if (parts.length > 0) {
583
- lines.push(` ${parts.join(' | ')}`);
584
- }
585
- if (metadata.description) {
586
- lines.push(` <sub>${metadata.description}</sub>`);
587
- }
588
- return lines.join('\n');
589
- }
590
- const HEADING_PATTERN = /^#{1,6}\s/m;
591
- const LIST_PATTERN = /^(?:[-*+])\s/m;
592
- const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
593
- function containsMarkdownHeading(content) {
594
- return HEADING_PATTERN.test(content);
595
- }
596
- function containsMarkdownList(content) {
597
- return LIST_PATTERN.test(content);
598
- }
599
- function containsFencedCodeBlock(content) {
600
- const first = content.indexOf('```');
601
- if (first === -1)
602
- return false;
603
- return content.includes('```', first + 3);
604
- }
605
- function looksLikeMarkdown(content) {
606
- return (containsMarkdownHeading(content) ||
607
- containsMarkdownList(content) ||
608
- containsFencedCodeBlock(content));
609
- }
610
- function detectLineEnding(content) {
611
- return content.includes('\r\n') ? '\r\n' : '\n';
612
- }
613
- const FRONTMATTER_DELIMITER = '---';
614
- function findFrontmatterLines(content) {
615
- const lineEnding = detectLineEnding(content);
616
- const lines = content.split(lineEnding);
617
- if (lines[0] !== FRONTMATTER_DELIMITER)
618
- return null;
619
- const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
620
- if (endIndex === -1)
621
- return null;
622
- return { lineEnding, lines, endIndex };
623
- }
624
- function stripOptionalQuotes(value) {
625
- const trimmed = value.trim();
626
- if (trimmed.length < 2)
627
- return trimmed;
628
- const first = trimmed[0];
629
- const last = trimmed[trimmed.length - 1];
630
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
631
- return trimmed.slice(1, -1).trim();
632
- }
633
- return trimmed;
634
- }
635
- function parseFrontmatterEntry(line) {
636
- const trimmed = line.trim();
637
- if (!trimmed)
638
- return null;
639
- const separatorIndex = trimmed.indexOf(':');
640
- if (separatorIndex <= 0)
641
- return null;
642
- const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
643
- const value = trimmed.slice(separatorIndex + 1);
644
- return { key, value };
645
- }
646
- function isTitleKey(key) {
647
- return key === 'title' || key === 'name';
648
- }
649
- function extractTitleFromHeading(content) {
650
- const lineEnding = detectLineEnding(content);
651
- const lines = content.split(lineEnding);
652
- for (const line of lines) {
653
- const trimmed = line.trim();
654
- if (!trimmed)
655
- continue;
656
- let index = 0;
657
- while (index < trimmed.length && trimmed[index] === '#') {
658
- index += 1;
659
- }
660
- if (index === 0 || index > 6)
661
- return undefined;
662
- const nextChar = trimmed[index];
663
- if (nextChar !== ' ' && nextChar !== '\t')
664
- return undefined;
665
- const heading = trimmed.slice(index).trim();
666
- return heading.length > 0 ? heading : undefined;
667
- }
668
- return undefined;
669
- }
670
- function extractTitleFromRawMarkdown(content) {
671
- const frontmatter = findFrontmatterLines(content);
672
- if (!frontmatter) {
673
- return extractTitleFromHeading(content);
674
- }
675
- const { lines, endIndex } = frontmatter;
676
- const entry = lines
677
- .slice(1, endIndex)
678
- .map((line) => parseFrontmatterEntry(line))
679
- .find((parsed) => parsed !== null && isTitleKey(parsed.key));
680
- if (!entry)
681
- return undefined;
682
- const value = stripOptionalQuotes(entry.value);
683
- return value || undefined;
684
- }
685
- function hasMarkdownSourceLine(content) {
686
- const lineEnding = detectLineEnding(content);
687
- const lines = content.split(lineEnding);
688
- const limit = Math.min(lines.length, 50);
689
- for (let index = 0; index < limit; index += 1) {
690
- const line = lines[index];
691
- if (!line)
692
- continue;
693
- if (line.trimStart().toLowerCase().startsWith('source:')) {
694
- return true;
695
- }
696
- }
697
- return false;
698
- }
699
- function addSourceToMarkdownMarkdownFormat(content, url) {
700
- if (hasMarkdownSourceLine(content))
701
- return content;
702
- const lineEnding = detectLineEnding(content);
703
- const lines = content.split(lineEnding);
704
- const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
705
- if (firstNonEmptyIndex !== -1) {
706
- const firstLine = lines[firstNonEmptyIndex];
707
- if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
708
- const insertAt = firstNonEmptyIndex + 1;
709
- const updated = [
710
- ...lines.slice(0, insertAt),
711
- '',
712
- `Source: ${url}`,
713
- '',
714
- ...lines.slice(insertAt),
715
- ];
716
- return updated.join(lineEnding);
717
- }
718
- }
719
- return [`Source: ${url}`, '', content].join(lineEnding);
720
- }
721
- function addSourceToMarkdown(content, url) {
722
- const frontmatter = findFrontmatterLines(content);
723
- if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
724
- return addSourceToMarkdownMarkdownFormat(content, url);
725
- }
726
- if (!frontmatter) {
727
- return `---\nsource: "${url}"\n---\n\n${content}`;
728
- }
729
- const { lineEnding, lines, endIndex } = frontmatter;
730
- const bodyLines = lines.slice(1, endIndex);
731
- const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
732
- if (hasSource)
733
- return content;
734
- const updatedLines = [
735
- lines[0],
736
- ...bodyLines,
737
- `source: "${url}"`,
738
- ...lines.slice(endIndex),
739
- ];
740
- return updatedLines.join(lineEnding);
741
- }
742
- function hasFrontmatter(trimmed) {
743
- return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
744
- }
745
- function looksLikeHtmlDocument(trimmed) {
746
- return HTML_DOCUMENT_PATTERN.test(trimmed);
747
- }
748
- function countCommonHtmlTags(content) {
749
- const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
750
- [];
751
- return matches.length;
752
- }
753
- function isRawTextContent(content) {
754
- const trimmed = content.trim();
755
- const isHtmlDocument = looksLikeHtmlDocument(trimmed);
756
- const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
757
- const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
758
- const isMarkdown = looksLikeMarkdown(content);
759
- return (!isHtmlDocument &&
760
- (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
761
- }
762
- function isLikelyHtmlContent(content) {
763
- const trimmed = content.trim();
764
- if (!trimmed)
765
- return false;
766
- if (looksLikeHtmlDocument(trimmed))
767
- return true;
768
- return countCommonHtmlTags(content) > 2;
769
- }
770
581
  function shouldPreserveRawContent(url, content) {
771
582
  if (isRawTextContentUrl(url)) {
772
583
  return !isLikelyHtmlContent(content);
@@ -780,13 +591,9 @@ function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
780
591
  : rawContent;
781
592
  return { content, title };
782
593
  }
783
- function tryTransformRawContent({ html, url, includeMetadata, }) {
784
- if (!shouldPreserveRawContent(url, html)) {
785
- return null;
786
- }
787
- logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
594
+ function buildRawMarkdownResult({ rawContent, url, includeMetadata, }) {
788
595
  const { content, title } = buildRawMarkdownPayload({
789
- rawContent: html,
596
+ rawContent,
790
597
  url,
791
598
  includeMetadata,
792
599
  });
@@ -796,36 +603,21 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
796
603
  truncated: false,
797
604
  };
798
605
  }
606
+ function tryTransformRawContent({ html, url, includeMetadata, }) {
607
+ if (!shouldPreserveRawContent(url, html)) {
608
+ return null;
609
+ }
610
+ logDebug('Preserving raw markdown content', { url: truncateUrlForLog(url) });
611
+ return buildRawMarkdownResult({
612
+ rawContent: html,
613
+ url,
614
+ includeMetadata,
615
+ });
616
+ }
799
617
  const MIN_CONTENT_RATIO = 0.3;
800
618
  const MIN_HTML_LENGTH_FOR_GATE = 100;
801
619
  const MIN_HEADING_RETENTION_RATIO = 0.7;
802
620
  const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
803
- /**
804
- * Count headings using DOM querySelectorAll.
805
- * Handles nested content like <h2><span>Text</span></h2> correctly.
806
- */
807
- function countHeadingsDom(htmlOrDocument) {
808
- if (typeof htmlOrDocument === 'string') {
809
- // Wrap fragments in document structure for proper parsing
810
- const htmlToParse = needsDocumentWrapper(htmlOrDocument)
811
- ? wrapHtmlFragment(htmlOrDocument)
812
- : htmlOrDocument;
813
- const { document: doc } = parseHTML(htmlToParse);
814
- return doc.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
815
- }
816
- return htmlOrDocument.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
817
- }
818
- function countCodeBlocksDom(htmlOrDocument) {
819
- if (typeof htmlOrDocument === 'string') {
820
- // Wrap fragments in document structure for proper parsing
821
- const htmlToParse = needsDocumentWrapper(htmlOrDocument)
822
- ? wrapHtmlFragment(htmlOrDocument)
823
- : htmlOrDocument;
824
- const { document: doc } = parseHTML(htmlToParse);
825
- return doc.querySelectorAll('pre').length;
826
- }
827
- return htmlOrDocument.querySelectorAll('pre').length;
828
- }
829
621
  /**
830
622
  * Check if HTML string needs document wrapper for proper parsing.
831
623
  * Fragments without doctype/html/body tags need wrapping.
@@ -842,40 +634,53 @@ function needsDocumentWrapper(html) {
842
634
  function wrapHtmlFragment(html) {
843
635
  return `<!DOCTYPE html><html><body>${html}</body></html>`;
844
636
  }
637
+ function resolveHtmlDocument(htmlOrDocument) {
638
+ if (typeof htmlOrDocument !== 'string') {
639
+ return htmlOrDocument;
640
+ }
641
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
642
+ ? wrapHtmlFragment(htmlOrDocument)
643
+ : htmlOrDocument;
644
+ return parseHTML(htmlToParse).document;
645
+ }
646
+ function countDomSelector(htmlOrDocument, selector) {
647
+ return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
648
+ }
845
649
  /**
846
- * Get visible text length from HTML, excluding script/style/noscript content.
847
- * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
650
+ * Count headings using DOM querySelectorAll.
651
+ * Handles nested content like <h2><span>Text</span></h2> correctly.
848
652
  */
849
- function getVisibleTextLength(htmlOrDocument) {
850
- // For string input, parse the HTML
851
- if (typeof htmlOrDocument === 'string') {
852
- // Wrap fragments in document structure for proper parsing
853
- const htmlToParse = needsDocumentWrapper(htmlOrDocument)
854
- ? wrapHtmlFragment(htmlOrDocument)
855
- : htmlOrDocument;
856
- const { document: doc } = parseHTML(htmlToParse);
857
- // Remove non-visible content that inflates text length
858
- for (const el of doc.querySelectorAll('script,style,noscript')) {
859
- el.remove();
860
- }
861
- // Get text content from body or documentElement
862
- // Note: linkedom may return null for body on HTML fragments despite types
863
- const body = doc.body;
864
- const docElement = doc.documentElement;
865
- const text = body?.textContent ?? docElement?.textContent ?? '';
866
- return text.replace(/\s+/g, ' ').trim().length;
867
- }
868
- // For Document input, clone to avoid mutation
869
- const workDoc = htmlOrDocument.cloneNode(true);
870
- // Remove non-visible content that inflates text length
871
- for (const el of workDoc.querySelectorAll('script,style,noscript')) {
653
+ function countHeadingsDom(htmlOrDocument) {
654
+ return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
655
+ }
656
+ function countCodeBlocksDom(htmlOrDocument) {
657
+ return countDomSelector(htmlOrDocument, 'pre');
658
+ }
659
+ function cloneDocumentIfNeeded(htmlOrDocument, doc) {
660
+ return typeof htmlOrDocument === 'string'
661
+ ? doc
662
+ : doc.cloneNode(true);
663
+ }
664
+ function stripNonVisibleNodes(doc) {
665
+ for (const el of doc.querySelectorAll('script,style,noscript')) {
872
666
  el.remove();
873
667
  }
874
- // Get text content from body or documentElement
668
+ }
669
+ function resolveDocumentText(doc) {
875
670
  // Note: linkedom may return null for body on HTML fragments despite types
876
- const body = workDoc.body;
877
- const docElement = workDoc.documentElement;
878
- const text = body?.textContent ?? docElement?.textContent ?? '';
671
+ const body = doc.body;
672
+ const docElement = doc.documentElement;
673
+ return body?.textContent ?? docElement?.textContent ?? '';
674
+ }
675
+ /**
676
+ * Get visible text length from HTML, excluding script/style/noscript content.
677
+ * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
678
+ */
679
+ function getVisibleTextLength(htmlOrDocument) {
680
+ const doc = resolveHtmlDocument(htmlOrDocument);
681
+ const workDoc = cloneDocumentIfNeeded(htmlOrDocument, doc);
682
+ stripNonVisibleNodes(workDoc);
683
+ const text = resolveDocumentText(workDoc);
879
684
  return text.replace(/\s+/g, ' ').trim().length;
880
685
  }
881
686
  export function isExtractionSufficient(article, originalHtmlOrDocument) {
@@ -995,7 +800,7 @@ function buildContentSource({ html, url, article, extractedMeta, includeMetadata
995
800
  const contentRoot = findContentRoot(cleanedDoc);
996
801
  if (contentRoot) {
997
802
  logDebug('Using content root fallback instead of full HTML', {
998
- url: url.substring(0, 80),
803
+ url: truncateUrlForLog(url),
999
804
  contentLength: contentRoot.length,
1000
805
  });
1001
806
  return {
@@ -1015,31 +820,39 @@ function buildContentSource({ html, url, article, extractedMeta, includeMetadata
1015
820
  ...(document ? { document } : {}),
1016
821
  };
1017
822
  }
1018
- function logQualityGateFallback({ url, articleLength, }) {
823
+ function logQualityGateFallback({ safeUrl, articleLength, }) {
1019
824
  logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
1020
- url: url.substring(0, 80),
825
+ url: safeUrl,
1021
826
  articleLength,
1022
827
  });
1023
828
  }
1024
829
  function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1025
830
  const articleLength = article.textContent.length;
1026
831
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
832
+ const safeUrl = truncateUrlForLog(url);
833
+ let articleDocument = null;
834
+ const getArticleDocument = () => {
835
+ if (articleDocument)
836
+ return articleDocument;
837
+ articleDocument = resolveHtmlDocument(article.content);
838
+ return articleDocument;
839
+ };
1027
840
  // If the document is tiny, don't gate too aggressively.
1028
841
  if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
1029
842
  const ratio = articleLength / originalLength;
1030
843
  if (ratio < MIN_CONTENT_RATIO) {
1031
- logQualityGateFallback({ url, articleLength });
844
+ logQualityGateFallback({ safeUrl, articleLength });
1032
845
  return false;
1033
846
  }
1034
847
  }
1035
848
  // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
1036
849
  const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
1037
850
  if (originalHeadings > 0) {
1038
- const articleHeadings = countHeadingsDom(article.content);
851
+ const articleHeadings = countHeadingsDom(getArticleDocument());
1039
852
  const retentionRatio = articleHeadings / originalHeadings;
1040
853
  if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
1041
854
  logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1042
- url: url.substring(0, 80),
855
+ url: safeUrl,
1043
856
  originalHeadings,
1044
857
  articleHeadings,
1045
858
  });
@@ -1048,18 +861,18 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1048
861
  }
1049
862
  const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
1050
863
  if (originalCodeBlocks > 0) {
1051
- const articleCodeBlocks = countCodeBlocksDom(article.content);
864
+ const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
1052
865
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
1053
866
  // Always log code block counts for debugging
1054
867
  logDebug('Code block retention check', {
1055
- url: url.substring(0, 80),
868
+ url: safeUrl,
1056
869
  originalCodeBlocks,
1057
870
  articleCodeBlocks,
1058
871
  codeRetentionRatio,
1059
872
  });
1060
873
  if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
1061
874
  logDebug('Quality gate: Readability removed code blocks, using full HTML', {
1062
- url: url.substring(0, 80),
875
+ url: safeUrl,
1063
876
  originalCodeBlocks,
1064
877
  articleCodeBlocks,
1065
878
  });
@@ -1068,7 +881,7 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1068
881
  }
1069
882
  // Layout extraction issue: truncated/fragmented lines.
1070
883
  if (hasTruncatedSentences(article.textContent)) {
1071
- logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
884
+ logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: safeUrl });
1072
885
  return false;
1073
886
  }
1074
887
  return true;
@@ -1078,7 +891,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1078
891
  extractArticle: true,
1079
892
  ...(signal ? { signal } : {}),
1080
893
  });
1081
- const originalDocument = parseHTML(html).document;
894
+ const originalDocument = document;
1082
895
  const useArticleContent = article
1083
896
  ? shouldUseArticleContent(article, originalDocument, url)
1084
897
  : false;
@@ -1129,11 +942,14 @@ function runTotalTransformStage(url, fn) {
1129
942
  return result;
1130
943
  }
1131
944
  finally {
1132
- if (success) {
1133
- endTransformStage(totalStage, { truncated: false });
1134
- }
945
+ finalizeTotalTransformStage(totalStage, success);
1135
946
  }
1136
947
  }
948
+ function finalizeTotalTransformStage(stage, success) {
949
+ if (!success)
950
+ return;
951
+ endTransformStage(stage, { truncated: false });
952
+ }
1137
953
  async function runTotalTransformStageAsync(url, fn) {
1138
954
  const totalStage = startTransformStage(url, 'transform:total');
1139
955
  let success = false;
@@ -1143,9 +959,7 @@ async function runTotalTransformStageAsync(url, fn) {
1143
959
  return result;
1144
960
  }
1145
961
  finally {
1146
- if (success) {
1147
- endTransformStage(totalStage, { truncated: false });
1148
- }
962
+ finalizeTotalTransformStage(totalStage, success);
1149
963
  }
1150
964
  }
1151
965
  export function transformHtmlToMarkdownInProcess(html, url, options) {
@@ -1182,11 +996,11 @@ const workerMessageSchema = z.discriminatedUnion('type', [
1182
996
  }),
1183
997
  ]);
1184
998
  let pool = null;
999
+ const POOL_MIN_WORKERS = 2;
1000
+ const POOL_MAX_WORKERS = 4;
1001
+ const POOL_SCALE_THRESHOLD = 0.5;
1185
1002
  function resolveDefaultWorkerCount() {
1186
- const parallelism = typeof os.availableParallelism === 'function'
1187
- ? os.availableParallelism()
1188
- : os.cpus().length;
1189
- return Math.min(16, Math.max(1, parallelism - 1));
1003
+ return POOL_MIN_WORKERS;
1190
1004
  }
1191
1005
  const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
1192
1006
  function getOrCreateTransformWorkerPool() {
@@ -1199,8 +1013,20 @@ export async function shutdownTransformWorkerPool() {
1199
1013
  await pool.close();
1200
1014
  pool = null;
1201
1015
  }
1016
+ export function getTransformPoolStats() {
1017
+ if (!pool)
1018
+ return null;
1019
+ return {
1020
+ queueDepth: pool.getQueueDepth(),
1021
+ activeWorkers: pool.getActiveWorkers(),
1022
+ capacity: pool.getCapacity(),
1023
+ };
1024
+ }
1202
1025
  class WorkerPool {
1203
1026
  workers = [];
1027
+ capacity;
1028
+ minCapacity;
1029
+ maxCapacity;
1204
1030
  queue = [];
1205
1031
  inflight = new Map();
1206
1032
  timeoutMs;
@@ -1316,12 +1142,11 @@ class WorkerPool {
1316
1142
  });
1317
1143
  }
1318
1144
  constructor(size, timeoutMs) {
1319
- const safeSize = Math.max(1, size);
1145
+ this.minCapacity = POOL_MIN_WORKERS;
1146
+ this.maxCapacity = POOL_MAX_WORKERS;
1147
+ this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1320
1148
  this.timeoutMs = timeoutMs;
1321
- this.queueMax = safeSize * 2;
1322
- for (let index = 0; index < safeSize; index += 1) {
1323
- this.workers.push(this.spawnWorker(index));
1324
- }
1149
+ this.queueMax = this.maxCapacity * 32;
1325
1150
  }
1326
1151
  spawnWorker(workerIndex) {
1327
1152
  const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
@@ -1419,20 +1244,45 @@ class WorkerPool {
1419
1244
  this.drainQueue();
1420
1245
  });
1421
1246
  }
1247
+ /** Scale capacity up if queue pressure exceeds threshold. */
1248
+ maybeScaleUp() {
1249
+ if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
1250
+ this.capacity < this.maxCapacity) {
1251
+ this.capacity += 1;
1252
+ }
1253
+ }
1422
1254
  drainQueue() {
1255
+ if (this.closed)
1256
+ return;
1423
1257
  if (this.queue.length === 0)
1424
1258
  return;
1259
+ this.maybeScaleUp();
1260
+ // First pass: try to find an idle existing worker
1425
1261
  for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
1426
1262
  const slot = this.workers[workerIndex];
1427
- if (!slot || slot.busy)
1428
- continue;
1429
- const task = this.queue.shift();
1430
- if (!task)
1431
- return;
1432
- this.dispatch(workerIndex, slot, task);
1433
- if (this.queue.length === 0)
1434
- return;
1263
+ if (slot && !slot.busy) {
1264
+ this.dispatchQueueTask(workerIndex, slot);
1265
+ if (this.queue.length === 0)
1266
+ return;
1267
+ }
1435
1268
  }
1269
+ if (this.workers.length < this.capacity && this.queue.length > 0) {
1270
+ const workerIndex = this.workers.length;
1271
+ const slot = this.spawnWorker(workerIndex);
1272
+ this.workers.push(slot);
1273
+ this.dispatchQueueTask(workerIndex, slot);
1274
+ if (this.workers.length < this.capacity && this.queue.length > 0) {
1275
+ setImmediate(() => {
1276
+ this.drainQueue();
1277
+ });
1278
+ }
1279
+ }
1280
+ }
1281
+ dispatchQueueTask(workerIndex, slot) {
1282
+ const task = this.queue.shift();
1283
+ if (!task)
1284
+ return;
1285
+ this.dispatch(workerIndex, slot, task);
1436
1286
  }
1437
1287
  dispatch(workerIndex, slot, task) {
1438
1288
  if (this.rejectIfAborted(task))
@@ -1503,11 +1353,23 @@ class WorkerPool {
1503
1353
  task.reject(message);
1504
1354
  this.restartWorker(workerIndex, slot);
1505
1355
  }
1356
+ getQueueDepth() {
1357
+ return this.queue.length;
1358
+ }
1359
+ getActiveWorkers() {
1360
+ return this.workers.filter((s) => s?.busy).length;
1361
+ }
1362
+ getCapacity() {
1363
+ return this.capacity;
1364
+ }
1506
1365
  async close() {
1507
1366
  if (this.closed)
1508
1367
  return;
1509
1368
  this.closed = true;
1510
- const terminations = this.workers.map((slot) => slot.worker.terminate());
1369
+ const terminations = this.workers
1370
+ .map((slot) => slot?.worker.terminate())
1371
+ .filter((p) => p !== undefined);
1372
+ this.workers.fill(undefined);
1511
1373
  this.workers.length = 0;
1512
1374
  for (const [id, inflight] of this.inflight.entries()) {
1513
1375
  clearTimeout(inflight.timer);
@@ -1556,4 +1418,3 @@ export async function transformHtmlToMarkdown(html, url, options) {
1556
1418
  }
1557
1419
  });
1558
1420
  }
1559
- //# sourceMappingURL=transform.js.map