@j0hanz/superfetch 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -5,15 +5,47 @@ import { performance } from 'node:perf_hooks';
5
5
  import { Worker } from 'node:worker_threads';
6
6
  import { parseHTML } from 'linkedom';
7
7
  import { NodeHtmlMarkdown, } from 'node-html-markdown';
8
- import { Readability } from '@mozilla/readability';
8
+ import { z } from 'zod';
9
+ import { isProbablyReaderable, Readability } from '@mozilla/readability';
9
10
  import { config } from './config.js';
10
11
  import { FetchError, getErrorMessage } from './errors.js';
11
12
  import { isRawTextContentUrl } from './fetch.js';
12
13
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
13
- function isRecord(value) {
14
- return typeof value === 'object' && value !== null;
14
+ import { isRecord } from './utils.js';
15
/**
 * Read the `reason` property from an abort-signal-like value.
 *
 * @param {unknown} signal - Candidate signal; values rejected by `isRecord` are ignored.
 * @returns {unknown} `signal.reason` when the property exists, otherwise `undefined`.
 */
function getAbortReason(signal) {
  const exposesReason = isRecord(signal) && 'reason' in signal;
  return exposesReason ? signal.reason : undefined;
}
20
/**
 * Return `document.body.innerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Parsed document (or anything document-like).
 * @returns {string | undefined} The body markup, or `undefined` when the
 *   document/body is not a record or the markup is missing or empty.
 */
function getBodyInnerHtml(document) {
  const body = isRecord(document) ? document.body : undefined;
  if (!isRecord(body)) {
    return undefined;
  }
  const markup = body.innerHTML;
  if (typeof markup !== 'string' || markup.length === 0) {
    return undefined;
  }
  return markup;
}
31
/**
 * Return the document's `toString` method bound to the document.
 *
 * @param {unknown} document - Parsed document (or anything document-like).
 * @returns {(() => string) | undefined} Bound `toString`, or `undefined` when
 *   the value is not a record or has no callable `toString`.
 */
function getDocumentToString(document) {
  const canSerialize =
    isRecord(document) && typeof document.toString === 'function';
  return canSerialize ? document.toString.bind(document) : undefined;
}
38
/**
 * Return `document.documentElement.outerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Parsed document (or anything document-like).
 * @returns {string | undefined} The root element markup, or `undefined` when
 *   it cannot be read or is empty.
 */
function getDocumentElementOuterHtml(document) {
  const root = isRecord(document) ? document.documentElement : undefined;
  if (!isRecord(root)) {
    return undefined;
  }
  const markup = root.outerHTML;
  if (typeof markup !== 'string' || markup.length === 0) {
    return undefined;
  }
  return markup;
}
16
- const FRONTMATTER_DELIMITER = '---';
17
49
  const CODE_BLOCK = {
18
50
  fence: '```',
19
51
  format: (code, language = '') => {
@@ -59,6 +91,12 @@ export function endTransformStage(context, options) {
59
91
  };
60
92
  publishTransformEvent(event);
61
93
  }
94
/**
 * Run `fn` between `startTransformStage` and `endTransformStage` for `stage`.
 *
 * If `fn` throws, the exception propagates and `endTransformStage` is not
 * called for this stage.
 *
 * @param {string} url - URL passed to `startTransformStage`.
 * @param {string} stage - Stage name passed to `startTransformStage`.
 * @param {() => T} fn - Synchronous work to run.
 * @returns {T} Whatever `fn` returned.
 * @template T
 */
function runTransformStage(url, stage, fn) {
  const stageContext = startTransformStage(url, stage);
  const outcome = fn();
  endTransformStage(stageContext);
  return outcome;
}
62
100
/**
 * Tell whether an abort reason is an `Error` named `TimeoutError`.
 *
 * @param {unknown} reason - Abort reason to inspect.
 * @returns {boolean} `true` only for `Error` instances whose `name` is `'TimeoutError'`.
 */
function isTimeoutReason(reason) {
  if (!(reason instanceof Error)) {
    return false;
  }
  return reason.name === 'TimeoutError';
}
@@ -68,7 +106,7 @@ function throwIfAborted(signal, url, stage) {
68
106
  const { aborted } = signal;
69
107
  if (!aborted)
70
108
  return;
71
- const { reason } = signal;
109
+ const reason = getAbortReason(signal);
72
110
  if (isTimeoutReason(reason)) {
73
111
  throw new FetchError('Request timeout', url, 504, {
74
112
  reason: 'timeout',
@@ -192,8 +230,18 @@ function extractArticle(document) {
192
230
  }
193
231
  function parseReadabilityArticle(document) {
194
232
  try {
195
- // Type assertion is safe here due to isReadabilityCompatible check
196
- const reader = new Readability(document);
233
+ // Readability mutates the document; operate on a clone.
234
+ const documentClone = document.cloneNode(true);
235
+ // Avoid the more expensive parse() when the page is unlikely to be readable,
236
+ // but don't penalize small documents where the heuristic is often too strict.
237
+ const rawText = documentClone.body.textContent ||
238
+ documentClone.documentElement.textContent;
239
+ const textLength = rawText.replace(/\s+/g, ' ').trim().length;
240
+ if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
241
+ return null;
242
+ }
243
+ // Guard against pathological DOM sizes.
244
+ const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
197
245
  return reader.parse();
198
246
  }
199
247
  catch (error) {
@@ -233,45 +281,48 @@ function addOptionalField(target, key, value) {
233
281
/**
 * Extract metadata and (optionally) the readable article from an HTML string.
 *
 * @param {string} html - Raw HTML to process.
 * @param {string} url - URL the HTML was fetched from.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal }} [options] -
 *   Defaults to `{ extractArticle: true }`.
 * @returns {{ article: unknown, metadata: object }} The extraction result, or
 *   an empty result (`article: null`, `metadata: {}`) when `isValidInput`
 *   rejects the input.
 */
export function extractContent(html, url, options = {
  extractArticle: true,
}) {
  return isValidInput(html, url)
    ? tryExtractContent(html, url, options)
    : createEmptyExtractionResult();
}
290
/**
 * Build a fresh "nothing extracted" result.
 *
 * @returns {{ article: null, metadata: {} }} A new object on every call.
 */
function createEmptyExtractionResult() {
  const emptyResult = { article: null, metadata: {} };
  return emptyResult;
}
293
/**
 * Run article extraction inside the `extract:article` transform stage.
 *
 * @param {unknown} document - Parsed document handed to `resolveArticleExtraction`.
 * @param {string} url - URL used for stage bookkeeping.
 * @param {boolean | undefined} shouldExtract - When falsy, no stage is started.
 * @returns {unknown} The extraction result, or `null` when extraction is disabled.
 */
function extractArticleWithStage(document, url, shouldExtract) {
  return shouldExtract
    ? runTransformStage(url, 'extract:article', () => resolveArticleExtraction(document, shouldExtract))
    : null;
}
298
/**
 * Convert an extraction failure into either a rethrow or an empty result.
 *
 * `FetchError`s are rethrown unchanged. Otherwise `throwIfAborted` is
 * consulted (it may throw), the failure is logged, and an empty result is
 * returned.
 *
 * @param {unknown} error - The value caught by the caller.
 * @param {string} url - URL being processed.
 * @param {AbortSignal | undefined} signal - Optional abort signal.
 * @returns {{ article: null, metadata: {} }} Empty extraction result.
 * @throws {FetchError} When `error` is a `FetchError`, or when `throwIfAborted` throws.
 */
function handleExtractionFailure(error, url, signal) {
  if (error instanceof FetchError) {
    throw error;
  }
  throwIfAborted(signal, url, 'extract:error');
  // Only genuine Error instances are forwarded to the logger.
  const loggable = error instanceof Error ? error : undefined;
  logError('Failed to extract content', loggable);
  return createEmptyExtractionResult();
}
306
/**
 * Run the extraction pipeline: parse, apply base URI, read metadata, extract
 * the article. `throwIfAborted` is checked before the first stage and after
 * each stage.
 *
 * @param {string} html - Raw HTML (passed through `truncateHtml` before parsing).
 * @param {string} url - URL the HTML was fetched from.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal }} options - Extraction options.
 * @returns {{ article: unknown, metadata: unknown }} Extracted article and metadata.
 */
function extractContentStages(html, url, options) {
  const { signal } = options;
  throwIfAborted(signal, url, 'extract:begin');
  const parsed = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
  const { document } = parsed;
  throwIfAborted(signal, url, 'extract:parsed');
  applyBaseUri(document, url);
  const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
  throwIfAborted(signal, url, 'extract:metadata');
  const article = extractArticleWithStage(document, url, options.extractArticle);
  throwIfAborted(signal, url, 'extract:article');
  return { article, metadata };
}
241
320
/**
 * Run the extraction stages and route any thrown value through
 * `handleExtractionFailure`.
 *
 * @param {string} html - Raw HTML to process.
 * @param {string} url - URL the HTML was fetched from.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal }} options - Extraction options.
 * @returns {{ article: unknown, metadata: unknown }} Extraction result, or
 *   whatever `handleExtractionFailure` returns.
 */
function tryExtractContent(html, url, options) {
  try {
    return extractContentStages(html, url, options);
  }
  catch (failure) {
    return handleExtractionFailure(failure, url, options.signal);
  }
}
277
328
  function isValidInput(html, url) {
@@ -563,56 +614,6 @@ export function resolveLanguageFromAttributes(className, dataLang) {
563
614
  const classMatch = extractLanguageFromClassName(className);
564
615
  return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
565
616
  }
566
- const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
567
- const YAML_NUMERIC = /^[\d.]+$/;
568
- const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
569
- const ESCAPE_PATTERNS = {
570
- backslash: /\\/g,
571
- quote: /"/g,
572
- newline: /\n/g,
573
- tab: /\t/g,
574
- };
575
- const YAML_QUOTE_CHECKS = [
576
- (input) => YAML_SPECIAL_CHARS.test(input),
577
- (input) => input.startsWith(' ') || input.endsWith(' '),
578
- (input) => input === '',
579
- (input) => YAML_NUMERIC.test(input),
580
- (input) => YAML_RESERVED_WORDS.test(input),
581
- ];
582
- function needsYamlQuotes(value) {
583
- return YAML_QUOTE_CHECKS.some((check) => check(value));
584
- }
585
- function escapeYamlValue(value) {
586
- if (!needsYamlQuotes(value)) {
587
- return value;
588
- }
589
- const escaped = value
590
- .replace(ESCAPE_PATTERNS.backslash, '\\\\')
591
- .replace(ESCAPE_PATTERNS.quote, '\\"')
592
- .replace(ESCAPE_PATTERNS.newline, '\\n')
593
- .replace(ESCAPE_PATTERNS.tab, '\\t');
594
- return `"${escaped}"`;
595
- }
596
- function appendFrontmatterField(lines, key, value) {
597
- if (!value)
598
- return;
599
- lines.push(`${key}: ${escapeYamlValue(value)}`);
600
- }
601
- function joinLines(lines) {
602
- return lines.join('\n');
603
- }
604
- function buildFrontmatter(metadata) {
605
- if (!metadata)
606
- return '';
607
- const lines = [FRONTMATTER_DELIMITER];
608
- appendFrontmatterField(lines, 'title', metadata.title);
609
- appendFrontmatterField(lines, 'source', metadata.url);
610
- appendFrontmatterField(lines, 'author', metadata.author);
611
- appendFrontmatterField(lines, 'description', metadata.description);
612
- appendFrontmatterField(lines, 'fetchedAt', metadata.fetchedAt);
613
- lines.push(FRONTMATTER_DELIMITER);
614
- return joinLines(lines);
615
- }
616
617
  function isElement(node) {
617
618
  return (isRecord(node) &&
618
619
  'getAttribute' in node &&
@@ -623,16 +624,13 @@ const STRUCTURAL_TAGS = new Set([
623
624
  'style',
624
625
  'noscript',
625
626
  'iframe',
626
- 'nav',
627
- 'footer',
628
- 'aside',
629
- 'header',
630
627
  'form',
631
628
  'button',
632
629
  'input',
633
630
  'select',
634
631
  'textarea',
635
632
  ]);
633
+ const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
636
634
  const NAVIGATION_ROLES = new Set([
637
635
  'navigation',
638
636
  'banner',
@@ -641,8 +639,37 @@ const NAVIGATION_ROLES = new Set([
641
639
  'tree',
642
640
  'menubar',
643
641
  'menu',
642
+ 'dialog',
643
+ 'alertdialog',
644
644
  ]);
645
- const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
645
// Lower-case tokens that `matchesPromoIdOrClass` looks for in an element's
// class and id after tokenization.
const PROMO_TOKENS = new Set([
  'banner',
  'promo',
  'announcement',
  'cta',
  'callout',
  'advert',
  'ad',
  'ads',
  'sponsor',
  'newsletter',
  'subscribe',
  'cookie',
  'consent',
  'popup',
  'modal',
  'overlay',
  'toast',
  'share',
  'social',
  'related',
  'recommend',
  'comment',
  'breadcrumb',
  'pagination',
  'pager',
]);
// Class/id fragments that `isBoilerplateHeader` tests a <header> against.
// `nav(?:bar)?` already covers `navbar`, so the separate alternative is gone.
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|nav(?:bar)?|menu|header-nav)\b/i;
646
673
  const FIXED_PATTERN = /\b(fixed|sticky)\b/;
647
674
  const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
648
675
  const ISOLATE_PATTERN = /\bisolate\b/;
@@ -711,15 +738,26 @@ function isStructuralNoiseTag(tagName) {
711
738
  return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
712
739
  }
713
740
/**
 * Tell whether an element is marked hidden by attribute or inline style.
 *
 * Checks the `hidden` attribute, `aria-hidden="true"`, and an inline
 * `display: none` or `visibility: hidden` declaration.
 *
 * @param {{ getAttribute(name: string): string | null }} element - Element to inspect.
 * @returns {boolean} `true` when any of the checks matches.
 */
function isElementHidden(element) {
  const inlineStyle = element.getAttribute('style') ?? '';
  if (element.getAttribute('hidden') !== null) {
    return true;
  }
  if (element.getAttribute('aria-hidden') === 'true') {
    return true;
  }
  const displayNone = /\bdisplay\s*:\s*none\b/i;
  const visibilityHidden = /\bvisibility\s*:\s*hidden\b/i;
  return displayNone.test(inlineStyle) || visibilityHidden.test(inlineStyle);
}
717
747
/**
 * Tell whether a `role` attribute value is listed in `NAVIGATION_ROLES`.
 *
 * @param {string | null} role - Role attribute value, or `null` when absent.
 * @returns {boolean} `false` for `null`; otherwise set membership.
 */
function hasNoiseRole(role) {
  if (role === null) {
    return false;
  }
  return NAVIGATION_ROLES.has(role);
}
750
/**
 * Split class/id-like text into lower-case alphanumeric tokens.
 *
 * Every run of characters outside `a-z0-9` (after lower-casing) is a separator.
 *
 * @param {string} value - Text such as `"cookie-banner main_nav"`.
 * @returns {string[]} Non-empty tokens in order of appearance.
 */
function tokenizeIdentifierLikeText(value) {
  return value
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((token) => token.length > 0);
}
720
758
/**
 * Tell whether any token of an element's class or id is in `PROMO_TOKENS`.
 *
 * @param {string} className - Element `class` value.
 * @param {string} id - Element `id` value.
 * @returns {boolean} `true` when at least one token matches.
 */
function matchesPromoIdOrClass(className, id) {
  for (const token of tokenizeIdentifierLikeText(`${className} ${id}`)) {
    if (PROMO_TOKENS.has(token)) {
      return true;
    }
  }
  return false;
}
724
762
  function matchesHighZIsolate(className) {
725
763
  return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
@@ -736,42 +774,49 @@ function readElementMetadata(element) {
736
774
  isHidden: isElementHidden(element),
737
775
  };
738
776
  }
777
/**
 * Decide whether a `<header>` is site chrome rather than content.
 *
 * A header counts as boilerplate when its role is a noise role or when its
 * lower-cased class/id text matches `HEADER_NOISE_PATTERN`.
 *
 * @param {{ className: string, id: string, role: string | null }} metadata -
 *   Element metadata.
 * @returns {boolean} `true` for boilerplate headers.
 */
function isBoilerplateHeader({ className, id, role, }) {
  return (hasNoiseRole(role) ||
    HEADER_NOISE_PATTERN.test(`${className} ${id}`.toLowerCase()));
}
739
783
/**
 * Decide whether an element should be stripped before Markdown conversion.
 *
 * Tag-based checks run first (structural tags, always-noise tags, boilerplate
 * `<header>`), then hidden state, role, fixed/high-z classes and promo
 * class/id tokens.
 *
 * @param {unknown} node - Element accepted by `readElementMetadata`.
 * @returns {boolean} `true` when any noise check matches.
 */
function isNoiseElement(node) {
  const metadata = readElementMetadata(node);
  const { tagName } = metadata;
  const isNoisyTag = isStructuralNoiseTag(tagName) ||
    ALWAYS_NOISE_TAGS.has(tagName) ||
    (tagName === 'header' && isBoilerplateHeader(metadata));
  return (isNoisyTag ||
    metadata.isHidden ||
    hasNoiseRole(metadata.role) ||
    matchesFixedOrHighZIsolate(metadata.className) ||
    matchesPromoIdOrClass(metadata.className, metadata.id));
}
793
/**
 * Remove every noise element from a parsed document, in place.
 *
 * Nodes are visited from the end of the `querySelectorAll('*')` result toward
 * the start. Entries are read through `item()` when the collection has it,
 * otherwise by index.
 *
 * @param {{ querySelectorAll(selector: string): ArrayLike<unknown> }} document -
 *   Document to clean.
 * @returns {void}
 */
function stripNoiseNodes(document) {
  const candidates = document.querySelectorAll('*');
  const readAt = (position) => (typeof candidates.item === 'function'
    ? candidates.item(position)
    : candidates[position]);
  let cursor = candidates.length;
  while (cursor > 0) {
    cursor -= 1;
    const candidate = readAt(cursor);
    if (candidate && isElement(candidate) && isNoiseElement(candidate)) {
      candidate.remove();
    }
  }
}
747
804
  function removeNoiseFromHtml(html) {
748
805
  const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
749
806
  if (!shouldParse)
750
807
  return html;
751
- const shouldRemove = mayContainNoise(html);
752
808
  try {
753
809
  const { document } = parseHTML(html);
754
- if (shouldRemove) {
755
- const nodes = Array.from(document.querySelectorAll('*'));
756
- for (let index = nodes.length - 1; index >= 0; index -= 1) {
757
- const node = nodes[index];
758
- if (!node)
759
- continue;
760
- if (isElement(node) && isNoiseElement(node)) {
761
- node.remove();
762
- }
763
- }
764
- }
765
- const { body } = document;
766
- if (body?.innerHTML)
767
- return body.innerHTML;
768
- if (typeof document.toString ===
769
- 'function') {
770
- return document.toString();
771
- }
772
- const { documentElement } = document;
773
- if (documentElement?.outerHTML)
774
- return documentElement.outerHTML;
810
+ stripNoiseNodes(document);
811
+ const bodyInnerHtml = getBodyInnerHtml(document);
812
+ if (bodyInnerHtml)
813
+ return bodyInnerHtml;
814
+ const docToString = getDocumentToString(document);
815
+ if (docToString)
816
+ return docToString();
817
+ const documentElementOuterHtml = getDocumentElementOuterHtml(document);
818
+ if (documentElementOuterHtml)
819
+ return documentElementOuterHtml;
775
820
  return html;
776
821
  }
777
822
  catch {
@@ -785,56 +830,110 @@ function buildInlineCode(content) {
785
830
  const padding = delimiter.length > 1 ? ' ' : '';
786
831
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
787
832
  }
833
/**
 * Derive alt text from an image URL by extracting and humanizing the filename.
 * Used as a fallback when the image has no alt attribute.
 *
 * - `data:` and `blob:` sources have no meaningful filename and yield `''`.
 * - Query strings and fragments are ignored for absolute and relative URLs.
 * - Percent-escapes are decoded (`my%20cat.png` -> `my cat`); a malformed
 *   escape keeps the raw text instead of failing.
 *
 * @param {string} src - Value of the image `src` attribute.
 * @returns {string} Humanized filename without extension, or `''` when none
 *   can be derived.
 */
function deriveAltFromImageUrl(src) {
  if (!src)
    return '';
  // Inline payloads would otherwise produce a chunk of base64 as "filename".
  if (/^(?:data|blob):/i.test(src))
    return '';
  try {
    // Only real http(s) URLs go through the URL parser; a relative path that
    // merely starts with "http" must not be rejected.
    const pathname = /^https?:\/\//i.test(src)
      ? new URL(src).pathname
      : (src.split(/[?#]/)[0] ?? '');
    const filename = pathname.split('/').pop() ?? '';
    if (!filename)
      return '';
    // Remove the file extension (a leading dot is not an extension).
    const dotIndex = filename.lastIndexOf('.');
    const stem = dotIndex > 0 ? filename.slice(0, dotIndex) : filename;
    let readable = stem;
    try {
      readable = decodeURIComponent(stem);
    }
    catch {
      // Malformed percent-escape: keep the raw stem rather than losing the alt.
    }
    // Humanize: separators and whitespace runs become single spaces.
    return readable.replace(/[\s_-]+/g, ' ').trim();
  }
  catch {
    // Unparseable absolute URL: no alt text can be derived.
    return '';
  }
}
788
860
/**
 * Tell whether a parent node is a block-level code container.
 *
 * @param {unknown} parent - Parent node of a `<code>` element.
 * @returns {boolean} `true` when the parent's tag name, upper-cased, is
 *   `PRE` or `WRAPPED-PRE`.
 */
function isCodeBlock(parent) {
  if (!isRecord(parent)) {
    return false;
  }
  const { tagName } = parent;
  const normalizedTag = typeof tagName === 'string' ? tagName.toUpperCase() : '';
  return normalizedTag === 'PRE' || normalizedTag === 'WRAPPED-PRE';
}
794
- function createCodeTranslator() {
866
/**
 * Tell whether a value is a record with a callable `getAttribute`.
 *
 * @param {unknown} value - Candidate node.
 * @returns {boolean} `true` when `value.getAttribute` is a function.
 */
function hasGetAttribute(value) {
  if (!isRecord(value)) {
    return false;
  }
  return typeof value.getAttribute === 'function';
}
869
/**
 * Tell whether a value is a record whose `codeBlockTranslators` is a record.
 *
 * @param {unknown} value - Candidate converter instance.
 * @returns {boolean} `true` when both checks pass.
 */
function hasCodeBlockTranslators(value) {
  if (!isRecord(value)) {
    return false;
  }
  return isRecord(value.codeBlockTranslators);
}
872
/**
 * Build the translator config used for inline `<code>` elements.
 *
 * @returns {{ spaceIfRepeatingChar: true, noEscape: true, postprocess: Function }}
 *   Config whose `postprocess` wraps the content with `buildInlineCode`.
 */
function buildInlineCodeTranslator() {
  const wrapInline = ({ content }) => buildInlineCode(content);
  return {
    spaceIfRepeatingChar: true,
    noEscape: true,
    postprocess: wrapInline,
  };
}
879
/**
 * Resolve a code language from a node's `class` and `data-language` attributes.
 *
 * Missing attributes, and nodes without `getAttribute`, are treated as `''`.
 *
 * @param {unknown} node - The `<code>` node being translated.
 * @returns {unknown} Whatever `resolveLanguageFromAttributes` returns.
 */
function resolveAttributeLanguage(node) {
  const readAttribute = (name) => (hasGetAttribute(node) ? node.getAttribute(name) ?? '' : '');
  const className = readAttribute('class');
  const dataLanguage = readAttribute('data-language');
  return resolveLanguageFromAttributes(className, dataLanguage);
}
887
/**
 * Pull `codeBlockTranslators` off the visitor's converter instance.
 *
 * @param {unknown} visitor - Visitor object from the translator context.
 * @returns {object | null} `visitor.instance.codeBlockTranslators`, or `null`
 *   when unavailable.
 */
function resolveCodeBlockTranslators(visitor) {
  const instance = isRecord(visitor) ? visitor.instance : null;
  if (!hasCodeBlockTranslators(instance)) {
    return null;
  }
  return instance.codeBlockTranslators;
}
893
/**
 * Build the translator config used for `<code>` inside a code-block parent.
 *
 * @param {string | null | undefined} attributeLanguage - Language from
 *   attributes; when nullish, `detectLanguageFromCode` is tried, then `''`.
 * @param {object | null} codeBlockTranslators - Child translators; the
 *   `childTranslators` key is only added when this is truthy.
 * @returns {object} Config with `noEscape`, `preserveWhitespace`, optional
 *   `childTranslators`, and a `postprocess` that formats via `CODE_BLOCK.format`.
 */
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
  const translator = {
    noEscape: true,
    preserveWhitespace: true,
  };
  if (codeBlockTranslators) {
    translator.childTranslators = codeBlockTranslators;
  }
  translator.postprocess = ({ content }) => {
    const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
    return CODE_BLOCK.format(content, language);
  };
  return translator;
}
906
/**
 * Choose the translator config for a `<code>` element.
 *
 * The inline config is used when the context is not a record or the parent
 * is not a code block; otherwise the block config is built.
 *
 * @param {unknown} ctx - Translator context; `node`, `parent` and `visitor` are read from it.
 * @returns {object} Inline or block translator config.
 */
function buildCodeTranslator(ctx) {
  const isBlock = isRecord(ctx) && isCodeBlock(ctx.parent);
  if (!isBlock) {
    return buildInlineCodeTranslator();
  }
  const language = resolveAttributeLanguage(ctx.node);
  const childTranslators = resolveCodeBlockTranslators(ctx.visitor);
  return buildCodeBlockTranslator(language, childTranslators);
}
916
/**
 * Build the translator config for `<img>` elements.
 *
 * Emits `![alt](src)`, using the existing alt text when present and a
 * filename-derived fallback otherwise. Compared with naive interpolation:
 * - an image without a `src` produces no output instead of `![...]()`;
 * - whitespace runs in the alt text collapse to one space, and backslashes
 *   and square brackets are escaped so the alt cannot end the label early;
 * - spaces, tabs, newlines and parentheses in the destination are
 *   percent-encoded so they cannot end the destination early.
 *
 * @param {unknown} ctx - Translator context; only `ctx.node` is read.
 * @returns {{ content: string }} Markdown image, or empty content when the
 *   context is unusable or the image has no source.
 */
function buildImageTranslator(ctx) {
  if (!isRecord(ctx))
    return { content: '' };
  const { node } = ctx;
  const readAttribute = hasGetAttribute(node)
    ? node.getAttribute.bind(node)
    : undefined;
  const src = (readAttribute?.('src') ?? '').trim();
  if (!src)
    return { content: '' };
  const existingAlt = readAttribute?.('alt') ?? '';
  // Use existing alt text if present, otherwise derive from filename.
  const rawAlt = existingAlt.trim() || deriveAltFromImageUrl(src);
  const alt = rawAlt.replace(/\s+/g, ' ').replace(/[\\[\]]/g, '\\$&');
  const destination = src.replace(/[ \t\r\n()]/g, (char) => `%${char.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0')}`);
  return {
    content: `![${alt}](${destination})`,
  };
}
931
/**
 * Build the custom translator map passed to the Markdown converter.
 *
 * @returns {{ code: Function, img: Function }} Translators for `<code>` and `<img>`.
 */
function createCustomTranslators() {
  return {
    code(ctx) {
      return buildCodeTranslator(ctx);
    },
    img(ctx) {
      return buildImageTranslator(ctx);
    },
  };
}
838
937
  let markdownInstance = null;
839
938
  function createMarkdownInstance() {
840
939
  return new NodeHtmlMarkdown({
@@ -842,36 +941,86 @@ function createMarkdownInstance() {
842
941
  codeBlockStyle: 'fenced',
843
942
  emDelimiter: '_',
844
943
  bulletMarker: '-',
845
- }, createCodeTranslator());
944
+ }, createCustomTranslators());
846
945
  }
847
946
/**
 * Return the shared Markdown converter, creating it on first use.
 *
 * @returns {object} The instance held in the module-level `markdownInstance`.
 */
function getMarkdownConverter() {
  if (markdownInstance === null || markdownInstance === undefined) {
    markdownInstance = createMarkdownInstance();
  }
  return markdownInstance;
}
950
/**
 * Convert HTML to cleaned Markdown: strip noise, translate, then tidy artifacts.
 *
 * `throwIfAborted` is checked before the first stage and after each stage.
 *
 * @param {string} html - HTML to convert.
 * @param {string} url - URL used for stage bookkeeping and abort errors.
 * @param {AbortSignal | undefined} signal - Optional abort signal.
 * @returns {string} Markdown after `cleanupMarkdownArtifacts`.
 */
function translateHtmlToMarkdown(html, url, signal) {
  throwIfAborted(signal, url, 'markdown:begin');
  const denoisedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
  throwIfAborted(signal, url, 'markdown:cleaned');
  const rawMarkdown = runTransformStage(url, 'markdown:translate', () => {
    const converter = getMarkdownConverter();
    return converter.translate(denoisedHtml).trim();
  });
  throwIfAborted(signal, url, 'markdown:translated');
  // Post-process the markdown to clean up common conversion artifacts.
  return cleanupMarkdownArtifacts(rawMarkdown);
}
959
/**
 * Append the metadata footer to converted content.
 *
 * Metadata goes at the end so a title is not duplicated when the content
 * already starts with an H1 heading.
 *
 * @param {string} content - Converted Markdown.
 * @param {object | undefined} metadata - Metadata passed to `buildMetadataFooter`.
 * @param {string} url - Fallback source URL.
 * @returns {string} Content plus a blank line and the footer, or the content
 *   unchanged when the footer is empty.
 */
function appendMetadataFooter(content, metadata, url) {
  const footer = buildMetadataFooter(metadata, url);
  if (!footer) {
    return content;
  }
  return `${content}\n\n${footer}`;
}
851
965
/**
 * Convert HTML to Markdown and append a metadata footer.
 *
 * The URL comes from `options.url`, then `metadata.url`, then `''`. Empty
 * HTML yields the footer alone. `FetchError`s (such as abort/timeout errors
 * raised by `throwIfAborted`) are rethrown. Any other failure is logged and
 * degrades to the footer alone, so it no longer passes silently.
 *
 * @param {string} html - HTML to convert.
 * @param {object} [metadata] - Metadata rendered into the footer.
 * @param {{ url?: string, signal?: AbortSignal }} [options] - Conversion options.
 * @returns {string} Markdown with footer, or the footer alone.
 * @throws {FetchError} When conversion raises a `FetchError`.
 */
export function htmlToMarkdown(html, metadata, options) {
  const url = options?.url ?? metadata?.url ?? '';
  if (!html)
    return buildMetadataFooter(metadata, url);
  try {
    const content = translateHtmlToMarkdown(html, url, options?.signal);
    return appendMetadataFooter(content, metadata, url);
  }
  catch (error) {
    if (error instanceof FetchError) {
      throw error;
    }
    // Best-effort fallback, but record why the content was dropped.
    logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
    return buildMetadataFooter(metadata, url);
  }
}
980
/**
 * Clean up common markdown conversion artifacts:
 * - Anchor-only links like [ ](#section-id) used for navigation
 * - Empty headings (e.g., "## " with no text)
 * - Concatenated links without spacing
 * - Boilerplate phrases like "Was this page helpful?"
 *
 * Fenced code blocks (``` or ~~~) are set aside first and restored verbatim,
 * so code such as `table[0](arg)[1]`, a lone `#` comment, or blank lines
 * inside a block is never rewritten. An unterminated fence is treated as
 * ordinary text.
 *
 * @param {string} content - Markdown produced by the converter.
 * @returns {string} Cleaned, trimmed Markdown.
 */
function cleanupMarkdownArtifacts(content) {
  // Park fenced code blocks behind NUL-delimited placeholders.
  const fencedBlocks = [];
  const fencePattern = /^ {0,3}(`{3,}|~{3,})[^\n]*\n(?:[\s\S]*?\n)? {0,3}\1[`~]*[ \t]*$/gm;
  let result = content.replace(fencePattern, (block) => {
    fencedBlocks.push(block);
    return `\u0000FENCE${fencedBlocks.length - 1}\u0000`;
  });
  // Remove anchor-only links like [\u200B](#section-id) or [ ](#anchor).
  // Only horizontal whitespace after the link is consumed: eating newlines
  // would glue a heading onto the paragraph that follows it.
  result = result.replace(/\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g, '');
  // Remove empty Markdown headings like "## " produced by placeholder nodes.
  // Runs after anchor removal so a heading holding only an anchor link goes too.
  result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
  // Add line breaks between concatenated links: ](url)[text] -> ](url)\n\n[text]
  result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
  // Remove common boilerplate phrases.
  result = result.replace(/^Was this page helpful\??\s*$/gim, '');
  // Collapse multiple blank lines into at most two.
  result = result.replace(/\n{3,}/g, '\n\n');
  // Restore the parked code blocks; unknown placeholders are left as-is.
  result = result.replace(/\u0000FENCE(\d+)\u0000/g, (token, index) => fencedBlocks[Number(index)] ?? token);
  return result.trim();
}
1004
/**
 * Render metadata as a Markdown footer: a horizontal rule followed by one
 * `**Label:** value` line per available field.
 *
 * The source line uses `metadata.url`, falling back to `fallbackUrl`. When no
 * field has a value the result is `''`, so callers never append a dangling
 * `---`.
 *
 * @param {{ title?: string, description?: string, author?: string,
 *   url?: string, fetchedAt?: string } | null | undefined} metadata - Fields to render.
 * @param {string} [fallbackUrl] - Source URL used when `metadata.url` is missing.
 * @returns {string} Footer text, or `''` when there is nothing to show.
 */
function buildMetadataFooter(metadata, fallbackUrl) {
  if (!metadata)
    return '';
  const fields = [
    ['Title', metadata.title],
    ['Description', metadata.description],
    ['Author', metadata.author],
    ['Source', metadata.url || fallbackUrl],
    ['Fetched', metadata.fetchedAt],
  ];
  const lines = fields
    .filter(([, value]) => Boolean(value))
    .map(([label, value]) => `**${label}:** ${value}`);
  if (lines.length === 0)
    return '';
  // Horizontal rule as a clear footer separator.
  return ['---', ...lines].join('\n');
}
875
1024
  const HEADING_PATTERN = /^#{1,6}\s/m;
876
1025
  const LIST_PATTERN = /^(?:[-*+])\s/m;
877
1026
  const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
@@ -895,6 +1044,7 @@ function looksLikeMarkdown(content) {
895
1044
/**
 * Pick the line ending to use for a piece of text.
 *
 * @param {string} content - Text to inspect.
 * @returns {'\r\n' | '\n'} CRLF when the text contains any CRLF, otherwise LF.
 */
function detectLineEnding(content) {
  if (content.includes('\r\n')) {
    return '\r\n';
  }
  return '\n';
}
1047
+ const FRONTMATTER_DELIMITER = '---';
898
1048
  function findFrontmatterLines(content) {
899
1049
  const lineEnding = detectLineEnding(content);
900
1050
  const lines = content.split(lineEnding);
@@ -930,10 +1080,32 @@ function parseFrontmatterEntry(line) {
930
1080
/**
 * Tell whether a frontmatter key names the document title.
 *
 * @param {string} key - Frontmatter key.
 * @returns {boolean} `true` for `'title'` or `'name'` (exact match).
 */
function isTitleKey(key) {
  switch (key) {
    case 'title':
    case 'name':
      return true;
    default:
      return false;
  }
}
1083
/**
 * Read a title from the first non-blank line when that line is an ATX heading.
 *
 * The line must start with one to six `#` characters followed by a space or
 * tab. Only the first non-blank line is considered.
 *
 * @param {string} content - Raw Markdown.
 * @returns {string | undefined} The trimmed heading text, or `undefined` when
 *   the first non-blank line is not a heading (or there is none).
 */
function extractTitleFromHeading(content) {
  const firstLine = content
    .split(detectLineEnding(content))
    .map((line) => line.trim())
    .find((line) => line.length > 0);
  if (firstLine === undefined) {
    return undefined;
  }
  // `#{1,6}` followed by a space/tab rejects both "#Title" and 7+ hashes.
  const match = /^#{1,6}[ \t]([\s\S]*)$/.exec(firstLine);
  if (!match) {
    return undefined;
  }
  const heading = match[1].trim();
  return heading.length > 0 ? heading : undefined;
}
933
1104
  function extractTitleFromRawMarkdown(content) {
934
1105
  const frontmatter = findFrontmatterLines(content);
935
- if (!frontmatter)
936
- return undefined;
1106
+ if (!frontmatter) {
1107
+ return extractTitleFromHeading(content);
1108
+ }
937
1109
  const { lines, endIndex } = frontmatter;
938
1110
  const entry = lines
939
1111
  .slice(1, endIndex)
@@ -944,8 +1116,48 @@ function extractTitleFromRawMarkdown(content) {
944
1116
  const value = stripOptionalQuotes(entry.value);
945
1117
  return value || undefined;
946
1118
  }
1119
/**
 * Tell whether the first 50 lines contain a line starting with `source:`
 * (case-insensitive, leading whitespace ignored).
 *
 * Only a small prefix is scanned to avoid wasting time on huge documents.
 *
 * @param {string} content - Raw Markdown.
 * @returns {boolean} `true` when such a line exists in the scanned prefix.
 */
function hasMarkdownSourceLine(content) {
  const scanLimit = 50;
  return content
    .split(detectLineEnding(content))
    .slice(0, scanLimit)
    .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
}
1134
/**
 * Insert a plain `Source: <url>` line into Markdown that has no frontmatter.
 *
 * Content that already has a source line in its scanned prefix is returned
 * unchanged. When the first non-blank line is a heading, the source line goes
 * right after it, surrounded by blank lines; otherwise it is prepended,
 * followed by a blank line.
 *
 * @param {string} content - Raw Markdown.
 * @param {string} url - Source URL to record.
 * @returns {string} Markdown including the source line, using the content's
 *   detected line ending.
 */
function addSourceToMarkdownMarkdownFormat(content, url) {
  if (hasMarkdownSourceLine(content)) {
    return content;
  }
  const eol = detectLineEnding(content);
  const sourceLine = `Source: ${url}`;
  const lines = content.split(eol);
  const firstTextIndex = lines.findIndex((line) => line.trim().length > 0);
  const firstText = firstTextIndex === -1 ? undefined : lines[firstTextIndex];
  const startsWithHeading = Boolean(firstText) && /^#{1,6}\s+/.test(firstText.trim());
  if (!startsWithHeading) {
    return [sourceLine, '', content].join(eol);
  }
  const head = lines.slice(0, firstTextIndex + 1);
  const tail = lines.slice(firstTextIndex + 1);
  return [...head, '', sourceLine, '', ...tail].join(eol);
}
947
1156
  function addSourceToMarkdown(content, url) {
948
1157
  const frontmatter = findFrontmatterLines(content);
1158
+ if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
1159
+ return addSourceToMarkdownMarkdownFormat(content, url);
1160
+ }
949
1161
  if (!frontmatter) {
950
1162
  return `---\nsource: "${url}"\n---\n\n${content}`;
951
1163
  }
@@ -1086,19 +1298,11 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
1086
1298
  applyExtractedMetadata(metadata, extractedMeta);
1087
1299
  return metadata;
1088
1300
  }
1089
- function buildArticleContentSource({ url, article, extractedMeta, includeMetadata, }) {
1090
- const metadata = createContentMetadataBlock(url, article, extractedMeta, true, includeMetadata);
1301
/**
 * Assemble the HTML, title and metadata block used for conversion.
 *
 * Article content and title are used only when `useArticleContent` is set and
 * an article exists; otherwise the full HTML and the extracted metadata title
 * are used.
 *
 * @param {object} params
 * @param {string} params.html - Full page HTML.
 * @param {string} params.url - Page URL.
 * @param {{ content: string, title?: string } | null} params.article - Extracted article, if any.
 * @param {{ title?: string }} params.extractedMeta - Metadata extracted from the page.
 * @param {boolean} params.includeMetadata - Forwarded to `createContentMetadataBlock`.
 * @param {boolean} params.useArticleContent - Whether the article passed the quality gate.
 * @returns {{ sourceHtml: string, title: string | undefined, metadata: unknown }}
 */
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
  if (useArticleContent && article) {
    return {
      sourceHtml: article.content,
      title: article.title,
      metadata,
    };
  }
  return {
    sourceHtml: html,
    title: extractedMeta.title,
    metadata,
  };
}
@@ -1108,84 +1312,83 @@ function logQualityGateFallback({ url, articleLength, }) {
1108
1312
  articleLength,
1109
1313
  });
1110
1314
  }
1111
- function tryBuildExtractedArticleContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
1112
- if (!article)
1113
- return null;
1315
+ function shouldUseArticleContent(article, html, url) {
1114
1316
  const shouldExtractFromArticle = determineContentExtractionSource(article);
1115
- if (shouldExtractFromArticle && isExtractionSufficient(article, html)) {
1116
- return buildArticleContentSource({
1117
- url,
1118
- article,
1119
- extractedMeta,
1120
- includeMetadata,
1121
- });
1122
- }
1123
- if (shouldExtractFromArticle) {
1124
- logQualityGateFallback({
1125
- url,
1126
- articleLength: article.textContent.length,
1127
- });
1317
+ if (!shouldExtractFromArticle)
1318
+ return false;
1319
+ if (isExtractionSufficient(article, html)) {
1320
+ return true;
1128
1321
  }
1129
- return null;
1322
+ logQualityGateFallback({
1323
+ url,
1324
+ articleLength: article.textContent.length,
1325
+ });
1326
+ return false;
1130
1327
  }
1131
1328
  function resolveContentSource({ html, url, includeMetadata, signal, }) {
1132
1329
  const { article, metadata: extractedMeta } = extractContent(html, url, {
1133
1330
  extractArticle: true,
1134
1331
  ...(signal ? { signal } : {}),
1135
1332
  });
1136
- const extracted = tryBuildExtractedArticleContentSource({
1333
+ const useArticleContent = article
1334
+ ? shouldUseArticleContent(article, html, url)
1335
+ : false;
1336
+ return buildContentSource({
1137
1337
  html,
1138
1338
  url,
1139
1339
  article,
1140
1340
  extractedMeta,
1141
1341
  includeMetadata,
1342
+ useArticleContent,
1142
1343
  });
1143
- if (extracted)
1144
- return extracted;
1145
- return buildFullHtmlContentSource({
1344
+ }
1345
+ function tryTransformRawStage(html, url, includeMetadata) {
1346
+ return runTransformStage(url, 'transform:raw', () => tryTransformRawContent({
1146
1347
  html,
1147
1348
  url,
1148
- article,
1149
- extractedMeta,
1150
1349
  includeMetadata,
1151
- });
1350
+ }));
1152
1351
  }
1153
- export function transformHtmlToMarkdownInProcess(html, url, options) {
1352
+ function resolveContentSourceStage(html, url, includeMetadata, signal) {
1353
+ return runTransformStage(url, 'transform:extract', () => resolveContentSource({
1354
+ html,
1355
+ url,
1356
+ includeMetadata,
1357
+ ...(signal ? { signal } : {}),
1358
+ }));
1359
+ }
1360
+ function buildMarkdownFromContext(context, url, signal) {
1361
+ const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
1362
+ url,
1363
+ ...(signal ? { signal } : {}),
1364
+ }));
1365
+ return {
1366
+ markdown: content,
1367
+ title: context.title,
1368
+ truncated: false,
1369
+ };
1370
+ }
1371
+ function runTotalTransformStage(url, fn) {
1154
1372
  const totalStage = startTransformStage(url, 'transform:total');
1155
1373
  let success = false;
1156
1374
  try {
1157
- throwIfAborted(options.signal, url, 'transform:begin');
1158
- const rawStage = startTransformStage(url, 'transform:raw');
1159
- const raw = tryTransformRawContent({
1160
- html,
1161
- url,
1162
- includeMetadata: options.includeMetadata,
1163
- });
1164
- endTransformStage(rawStage);
1165
- if (raw) {
1166
- success = true;
1167
- return raw;
1375
+ const result = fn();
1376
+ success = true;
1377
+ return result;
1378
+ }
1379
+ finally {
1380
+ if (success) {
1381
+ endTransformStage(totalStage, { truncated: false });
1168
1382
  }
1169
- const extractStage = startTransformStage(url, 'transform:extract');
1170
- const context = resolveContentSource({
1171
- html,
1172
- url,
1173
- includeMetadata: options.includeMetadata,
1174
- ...(options.signal ? { signal: options.signal } : {}),
1175
- });
1176
- endTransformStage(extractStage);
1177
- const markdownStage = startTransformStage(url, 'transform:markdown');
1178
- const content = htmlToMarkdown(context.sourceHtml, context.metadata, {
1179
- url,
1180
- ...(options.signal ? { signal: options.signal } : {}),
1181
- });
1182
- endTransformStage(markdownStage);
1383
+ }
1384
+ }
1385
+ async function runTotalTransformStageAsync(url, fn) {
1386
+ const totalStage = startTransformStage(url, 'transform:total');
1387
+ let success = false;
1388
+ try {
1389
+ const result = await fn();
1183
1390
  success = true;
1184
- return {
1185
- markdown: content,
1186
- title: context.title,
1187
- truncated: false,
1188
- };
1391
+ return result;
1189
1392
  }
1190
1393
  finally {
1191
1394
  if (success) {
@@ -1193,15 +1396,47 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
1193
1396
  }
1194
1397
  }
1195
1398
  }
1399
+ export function transformHtmlToMarkdownInProcess(html, url, options) {
1400
+ return runTotalTransformStage(url, () => {
1401
+ throwIfAborted(options.signal, url, 'transform:begin');
1402
+ const raw = tryTransformRawStage(html, url, options.includeMetadata);
1403
+ if (raw) {
1404
+ return raw;
1405
+ }
1406
+ const context = resolveContentSourceStage(html, url, options.includeMetadata, options.signal);
1407
+ return buildMarkdownFromContext(context, url, options.signal);
1408
+ });
1409
+ }
1410
+ const workerMessageSchema = z.discriminatedUnion('type', [
1411
+ z.object({
1412
+ type: z.literal('result'),
1413
+ id: z.string(),
1414
+ result: z.object({
1415
+ markdown: z.string(),
1416
+ title: z.string().optional(),
1417
+ truncated: z.boolean(),
1418
+ }),
1419
+ }),
1420
+ z.object({
1421
+ type: z.literal('error'),
1422
+ id: z.string(),
1423
+ error: z.object({
1424
+ name: z.string(),
1425
+ message: z.string(),
1426
+ url: z.string(),
1427
+ statusCode: z.number().optional(),
1428
+ details: z.record(z.string(), z.unknown()).optional(),
1429
+ }),
1430
+ }),
1431
+ ]);
1196
1432
  let pool = null;
1197
1433
  function resolveDefaultWorkerCount() {
1198
1434
  const parallelism = typeof os.availableParallelism === 'function'
1199
1435
  ? os.availableParallelism()
1200
1436
  : os.cpus().length;
1201
- // Leave 1 core for the event loop; cap to avoid runaway memory.
1202
1437
  return Math.min(16, Math.max(1, parallelism - 1));
1203
1438
  }
1204
- const DEFAULT_TIMEOUT_MS = 30000;
1439
+ const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
1205
1440
  function getOrCreateTransformWorkerPool() {
1206
1441
  pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
1207
1442
  return pool;
@@ -1219,23 +1454,108 @@ class WorkerPool {
1219
1454
  timeoutMs;
1220
1455
  queueMax;
1221
1456
  closed = false;
1222
- constructor(size, timeoutMs) {
1223
- const safeSize = Math.max(1, size);
1224
- this.timeoutMs = timeoutMs;
1225
- this.queueMax = safeSize * 2;
1226
- for (let index = 0; index < safeSize; index += 1) {
1227
- this.workers.push(this.spawnWorker(index));
1457
+ ensureOpen() {
1458
+ if (this.closed) {
1459
+ throw new Error('Transform worker pool closed');
1228
1460
  }
1229
1461
  }
1230
- spawnWorker(workerIndex) {
1231
- const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
1232
- // Workers must not keep the process alive by themselves.
1233
- worker.unref();
1234
- const slot = {
1462
+ ensureNotAborted(signal, url, stage) {
1463
+ if (!signal?.aborted)
1464
+ return;
1465
+ throw new FetchError('Request was canceled', url, 499, {
1466
+ reason: 'aborted',
1467
+ stage,
1468
+ });
1469
+ }
1470
+ ensureQueueCapacity(url) {
1471
+ if (this.queue.length < this.queueMax)
1472
+ return;
1473
+ throw new FetchError('Transform worker queue is full', url, 503, {
1474
+ reason: 'queue_full',
1475
+ stage: 'transform:enqueue',
1476
+ });
1477
+ }
1478
+ clearAbortListener(signal, listener) {
1479
+ if (!signal || !listener)
1480
+ return;
1481
+ try {
1482
+ signal.removeEventListener('abort', listener);
1483
+ }
1484
+ catch {
1485
+ // ignore
1486
+ }
1487
+ }
1488
+ markSlotIdle(workerIndex) {
1489
+ const slot = this.workers[workerIndex];
1490
+ if (!slot)
1491
+ return;
1492
+ slot.busy = false;
1493
+ slot.currentTaskId = null;
1494
+ }
1495
+ takeInflight(id) {
1496
+ const inflight = this.inflight.get(id);
1497
+ if (!inflight)
1498
+ return null;
1499
+ clearTimeout(inflight.timer);
1500
+ this.clearAbortListener(inflight.signal, inflight.abortListener);
1501
+ this.inflight.delete(id);
1502
+ return inflight;
1503
+ }
1504
+ cancelWorkerTask(slot, id) {
1505
+ if (!slot)
1506
+ return;
1507
+ try {
1508
+ slot.worker.postMessage({ type: 'cancel', id });
1509
+ }
1510
+ catch {
1511
+ // ignore
1512
+ }
1513
+ }
1514
+ restartWorker(workerIndex, slot) {
1515
+ if (this.closed)
1516
+ return;
1517
+ const target = slot ?? this.workers[workerIndex];
1518
+ if (target) {
1519
+ void target.worker.terminate();
1520
+ }
1521
+ this.workers[workerIndex] = this.spawnWorker(workerIndex);
1522
+ this.drainQueue();
1523
+ }
1524
+ rejectIfClosed(reject) {
1525
+ if (!this.closed)
1526
+ return false;
1527
+ reject(new Error('Transform worker pool closed'));
1528
+ return true;
1529
+ }
1530
+ abortInflightTask(id, url, workerIndex) {
1531
+ const slot = this.workers[workerIndex];
1532
+ this.cancelWorkerTask(slot, id);
1533
+ this.failTask(id, new FetchError('Request was canceled', url, 499, {
1534
+ reason: 'aborted',
1535
+ stage: 'transform:signal-abort',
1536
+ }));
1537
+ if (slot) {
1538
+ this.restartWorker(workerIndex, slot);
1539
+ }
1540
+ }
1541
+ abortQueuedTask(id, url, reject) {
1542
+ const queuedIndex = this.queue.findIndex((task) => task.id === id);
1543
+ if (queuedIndex === -1)
1544
+ return;
1545
+ this.queue.splice(queuedIndex, 1);
1546
+ reject(new FetchError('Request was canceled', url, 499, {
1547
+ reason: 'aborted',
1548
+ stage: 'transform:queued-abort',
1549
+ }));
1550
+ }
1551
+ createWorkerSlot(worker) {
1552
+ return {
1235
1553
  worker,
1236
1554
  busy: false,
1237
1555
  currentTaskId: null,
1238
1556
  };
1557
+ }
1558
+ registerWorkerHandlers(workerIndex, worker) {
1239
1559
  worker.on('message', (raw) => {
1240
1560
  this.onWorkerMessage(workerIndex, raw);
1241
1561
  });
@@ -1245,6 +1565,21 @@ class WorkerPool {
1245
1565
  worker.on('exit', (code) => {
1246
1566
  this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
1247
1567
  });
1568
+ }
1569
+ constructor(size, timeoutMs) {
1570
+ const safeSize = Math.max(1, size);
1571
+ this.timeoutMs = timeoutMs;
1572
+ this.queueMax = safeSize * 2;
1573
+ for (let index = 0; index < safeSize; index += 1) {
1574
+ this.workers.push(this.spawnWorker(index));
1575
+ }
1576
+ }
1577
+ spawnWorker(workerIndex) {
1578
+ const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
1579
+ // Workers must not keep the process alive by themselves.
1580
+ worker.unref();
1581
+ const slot = this.createWorkerSlot(worker);
1582
+ this.registerWorkerHandlers(workerIndex, worker);
1248
1583
  return slot;
1249
1584
  }
1250
1585
  onWorkerBroken(workerIndex, message) {
@@ -1256,129 +1591,83 @@ class WorkerPool {
1256
1591
  if (slot.busy && slot.currentTaskId) {
1257
1592
  this.failTask(slot.currentTaskId, new Error(message));
1258
1593
  }
1259
- void slot.worker.terminate();
1260
- this.workers[workerIndex] = this.spawnWorker(workerIndex);
1261
- this.drainQueue();
1594
+ this.restartWorker(workerIndex, slot);
1262
1595
  }
1263
- onWorkerMessage(workerIndex, raw) {
1264
- if (!raw ||
1265
- typeof raw !== 'object' ||
1266
- !('type' in raw) ||
1267
- !('id' in raw) ||
1268
- typeof raw.id !== 'string' ||
1269
- typeof raw.type !== 'string') {
1596
+ resolveWorkerResult(inflight, result) {
1597
+ inflight.resolve({
1598
+ markdown: result.markdown,
1599
+ truncated: result.truncated,
1600
+ title: result.title,
1601
+ });
1602
+ }
1603
+ rejectWorkerError(inflight, error) {
1604
+ if (error.name === 'FetchError') {
1605
+ inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
1270
1606
  return;
1271
1607
  }
1272
- const message = raw;
1273
- const inflight = this.inflight.get(message.id);
1608
+ inflight.reject(new Error(error.message));
1609
+ }
1610
+ onWorkerMessage(workerIndex, raw) {
1611
+ const parsed = workerMessageSchema.safeParse(raw);
1612
+ if (!parsed.success)
1613
+ return;
1614
+ const message = parsed.data;
1615
+ const inflight = this.takeInflight(message.id);
1274
1616
  if (!inflight)
1275
1617
  return;
1276
- clearTimeout(inflight.timer);
1277
- if (inflight.signal && inflight.abortListener) {
1278
- inflight.signal.removeEventListener('abort', inflight.abortListener);
1279
- }
1280
- this.inflight.delete(message.id);
1281
- const slot = this.workers[workerIndex];
1282
- if (slot) {
1283
- slot.busy = false;
1284
- slot.currentTaskId = null;
1285
- }
1618
+ this.markSlotIdle(workerIndex);
1286
1619
  if (message.type === 'result') {
1287
- inflight.resolve(message.result);
1620
+ this.resolveWorkerResult(inflight, message.result);
1288
1621
  }
1289
1622
  else {
1290
- const { error } = message;
1291
- if (error.name === 'FetchError') {
1292
- inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
1293
- }
1294
- else {
1295
- inflight.reject(new Error(error.message));
1296
- }
1623
+ this.rejectWorkerError(inflight, message.error);
1297
1624
  }
1298
1625
  this.drainQueue();
1299
1626
  }
1300
1627
  failTask(id, error) {
1301
- const inflight = this.inflight.get(id);
1628
+ const inflight = this.takeInflight(id);
1302
1629
  if (!inflight)
1303
1630
  return;
1304
- clearTimeout(inflight.timer);
1305
- if (inflight.signal && inflight.abortListener) {
1306
- inflight.signal.removeEventListener('abort', inflight.abortListener);
1307
- }
1308
- this.inflight.delete(id);
1309
1631
  inflight.reject(error);
1310
- const slot = this.workers[inflight.workerIndex];
1311
- if (slot) {
1312
- slot.busy = false;
1313
- slot.currentTaskId = null;
1314
- }
1632
+ this.markSlotIdle(inflight.workerIndex);
1315
1633
  }
1316
- async transform(html, url, options) {
1317
- if (this.closed) {
1318
- throw new Error('Transform worker pool closed');
1319
- }
1320
- if (options.signal?.aborted) {
1321
- throw new FetchError('Request was canceled', url, 499, {
1322
- reason: 'aborted',
1323
- stage: 'transform:enqueue',
1324
- });
1634
+ handleAbortSignal(id, url, reject) {
1635
+ if (this.rejectIfClosed(reject))
1636
+ return;
1637
+ const inflight = this.inflight.get(id);
1638
+ if (inflight) {
1639
+ this.abortInflightTask(id, url, inflight.workerIndex);
1640
+ return;
1325
1641
  }
1326
- if (this.queue.length >= this.queueMax) {
1327
- throw new Error('Transform worker queue is full');
1642
+ this.abortQueuedTask(id, url, reject);
1643
+ }
1644
+ createPendingTask(html, url, options, resolve, reject) {
1645
+ const id = randomUUID();
1646
+ let abortListener;
1647
+ if (options.signal) {
1648
+ abortListener = () => {
1649
+ this.handleAbortSignal(id, url, reject);
1650
+ };
1651
+ options.signal.addEventListener('abort', abortListener, { once: true });
1328
1652
  }
1653
+ return {
1654
+ id,
1655
+ html,
1656
+ url,
1657
+ includeMetadata: options.includeMetadata,
1658
+ signal: options.signal,
1659
+ abortListener,
1660
+ resolve,
1661
+ reject,
1662
+ };
1663
+ }
1664
+ async transform(html, url, options) {
1665
+ this.ensureOpen();
1666
+ this.ensureNotAborted(options.signal, url, 'transform:enqueue');
1667
+ this.ensureQueueCapacity(url);
1329
1668
  return new Promise((resolve, reject) => {
1330
- const id = randomUUID();
1331
- let abortListener;
1332
- if (options.signal) {
1333
- abortListener = () => {
1334
- if (this.closed) {
1335
- reject(new Error('Transform worker pool closed'));
1336
- return;
1337
- }
1338
- const inflight = this.inflight.get(id);
1339
- if (inflight) {
1340
- const { workerIndex } = inflight;
1341
- const slot = this.workers[workerIndex];
1342
- if (slot) {
1343
- try {
1344
- slot.worker.postMessage({ type: 'cancel', id });
1345
- }
1346
- catch {
1347
- // ignore
1348
- }
1349
- }
1350
- this.failTask(id, new FetchError('Request was canceled', url, 499, {
1351
- reason: 'aborted',
1352
- stage: 'transform:signal-abort',
1353
- }));
1354
- if (slot) {
1355
- void slot.worker.terminate();
1356
- this.workers[workerIndex] = this.spawnWorker(workerIndex);
1357
- this.drainQueue();
1358
- }
1359
- return;
1360
- }
1361
- const queuedIndex = this.queue.findIndex((task) => task.id === id);
1362
- if (queuedIndex !== -1) {
1363
- this.queue.splice(queuedIndex, 1);
1364
- reject(new FetchError('Request was canceled', url, 499, {
1365
- reason: 'aborted',
1366
- stage: 'transform:queued-abort',
1367
- }));
1368
- }
1369
- };
1370
- options.signal.addEventListener('abort', abortListener, { once: true });
1371
- }
1372
- this.queue.push({
1373
- id,
1374
- html,
1375
- url,
1376
- includeMetadata: options.includeMetadata,
1377
- signal: options.signal,
1378
- abortListener,
1379
- resolve,
1380
- reject,
1381
- });
1669
+ const task = this.createPendingTask(html, url, options, resolve, reject);
1670
+ this.queue.push(task);
1382
1671
  this.drainQueue();
1383
1672
  });
1384
1673
  }
@@ -1398,43 +1687,48 @@ class WorkerPool {
1398
1687
  }
1399
1688
  }
1400
1689
  dispatch(workerIndex, slot, task) {
1401
- if (task.signal?.aborted) {
1402
- if (task.abortListener) {
1403
- task.signal.removeEventListener('abort', task.abortListener);
1404
- }
1405
- task.reject(new FetchError('Request was canceled', task.url, 499, {
1406
- reason: 'aborted',
1407
- stage: 'transform:dispatch',
1408
- }));
1690
+ if (this.rejectIfAborted(task))
1409
1691
  return;
1692
+ this.markSlotBusy(slot, task);
1693
+ const timer = this.startTaskTimer(workerIndex, slot, task);
1694
+ this.registerInflightTask(task, timer, workerIndex);
1695
+ try {
1696
+ this.sendTransformMessage(slot, task);
1697
+ }
1698
+ catch (error) {
1699
+ this.handleDispatchFailure(workerIndex, slot, task, timer, error);
1410
1700
  }
1701
+ }
1702
+ rejectIfAborted(task) {
1703
+ if (!task.signal?.aborted)
1704
+ return false;
1705
+ this.clearAbortListener(task.signal, task.abortListener);
1706
+ task.reject(new FetchError('Request was canceled', task.url, 499, {
1707
+ reason: 'aborted',
1708
+ stage: 'transform:dispatch',
1709
+ }));
1710
+ return true;
1711
+ }
1712
+ markSlotBusy(slot, task) {
1411
1713
  slot.busy = true;
1412
1714
  slot.currentTaskId = task.id;
1715
+ }
1716
+ startTaskTimer(workerIndex, slot, task) {
1413
1717
  const timer = setTimeout(() => {
1414
- try {
1415
- slot.worker.postMessage({ type: 'cancel', id: task.id });
1416
- }
1417
- catch {
1418
- // ignore
1419
- }
1420
- const inflight = this.inflight.get(task.id);
1718
+ this.cancelWorkerTask(slot, task.id);
1719
+ const inflight = this.takeInflight(task.id);
1421
1720
  if (!inflight)
1422
1721
  return;
1423
- clearTimeout(inflight.timer);
1424
- if (inflight.signal && inflight.abortListener) {
1425
- inflight.signal.removeEventListener('abort', inflight.abortListener);
1426
- }
1427
- this.inflight.delete(task.id);
1428
1722
  inflight.reject(new FetchError('Request timeout', task.url, 504, {
1429
1723
  reason: 'timeout',
1430
1724
  stage: 'transform:worker-timeout',
1431
1725
  }));
1432
- if (!this.closed) {
1433
- void slot.worker.terminate();
1434
- this.workers[workerIndex] = this.spawnWorker(workerIndex);
1435
- this.drainQueue();
1436
- }
1437
- }, this.timeoutMs).unref();
1726
+ this.restartWorker(workerIndex, slot);
1727
+ }, this.timeoutMs);
1728
+ timer.unref();
1729
+ return timer;
1730
+ }
1731
+ registerInflightTask(task, timer, workerIndex) {
1438
1732
  this.inflight.set(task.id, {
1439
1733
  resolve: task.resolve,
1440
1734
  reject: task.reject,
@@ -1443,6 +1737,8 @@ class WorkerPool {
1443
1737
  abortListener: task.abortListener,
1444
1738
  workerIndex,
1445
1739
  });
1740
+ }
1741
+ sendTransformMessage(slot, task) {
1446
1742
  slot.worker.postMessage({
1447
1743
  type: 'transform',
1448
1744
  id: task.id,
@@ -1451,6 +1747,17 @@ class WorkerPool {
1451
1747
  includeMetadata: task.includeMetadata,
1452
1748
  });
1453
1749
  }
1750
+ handleDispatchFailure(workerIndex, slot, task, timer, error) {
1751
+ clearTimeout(timer);
1752
+ this.clearAbortListener(task.signal, task.abortListener);
1753
+ this.inflight.delete(task.id);
1754
+ this.markSlotIdle(workerIndex);
1755
+ const message = error instanceof Error
1756
+ ? error
1757
+ : new Error('Failed to dispatch transform worker message');
1758
+ task.reject(message);
1759
+ this.restartWorker(workerIndex, slot);
1760
+ }
1454
1761
  async close() {
1455
1762
  if (this.closed)
1456
1763
  return;
@@ -1459,9 +1766,7 @@ class WorkerPool {
1459
1766
  this.workers.length = 0;
1460
1767
  for (const [id, inflight] of this.inflight.entries()) {
1461
1768
  clearTimeout(inflight.timer);
1462
- if (inflight.signal && inflight.abortListener) {
1463
- inflight.signal.removeEventListener('abort', inflight.abortListener);
1464
- }
1769
+ this.clearAbortListener(inflight.signal, inflight.abortListener);
1465
1770
  inflight.reject(new Error('Transform worker pool closed'));
1466
1771
  this.inflight.delete(id);
1467
1772
  }
@@ -1472,38 +1777,38 @@ class WorkerPool {
1472
1777
  await Promise.allSettled(terminations);
1473
1778
  }
1474
1779
  }
1780
+ function buildWorkerTransformOptions(options) {
1781
+ return {
1782
+ includeMetadata: options.includeMetadata,
1783
+ ...(options.signal ? { signal: options.signal } : {}),
1784
+ };
1785
+ }
1786
+ async function transformWithWorkerPool(html, url, options) {
1787
+ const poolRef = getOrCreateTransformWorkerPool();
1788
+ return poolRef.transform(html, url, buildWorkerTransformOptions(options));
1789
+ }
1790
+ function resolveWorkerFallback(error, html, url, options) {
1791
+ if (error instanceof FetchError) {
1792
+ throw error;
1793
+ }
1794
+ // Stability-first: if worker infrastructure fails, fall back to in-process.
1795
+ throwIfAborted(options.signal, url, 'transform:worker-fallback');
1796
+ return transformHtmlToMarkdownInProcess(html, url, options);
1797
+ }
1475
1798
  export async function transformHtmlToMarkdown(html, url, options) {
1476
- const totalStage = startTransformStage(url, 'transform:total');
1477
- let success = false;
1478
- try {
1799
+ return runTotalTransformStageAsync(url, async () => {
1479
1800
  throwIfAborted(options.signal, url, 'transform:begin');
1480
1801
  const workerStage = startTransformStage(url, 'transform:worker');
1481
1802
  try {
1482
- const poolRef = getOrCreateTransformWorkerPool();
1483
- const result = await poolRef.transform(html, url, {
1484
- includeMetadata: options.includeMetadata,
1485
- ...(options.signal ? { signal: options.signal } : {}),
1486
- });
1487
- success = true;
1803
+ const result = await transformWithWorkerPool(html, url, options);
1488
1804
  return result;
1489
1805
  }
1490
1806
  catch (error) {
1491
- if (error instanceof FetchError) {
1492
- throw error;
1493
- }
1494
- // Stability-first: if worker infrastructure fails, fall back to in-process.
1495
- throwIfAborted(options.signal, url, 'transform:worker-fallback');
1496
- const fallback = transformHtmlToMarkdownInProcess(html, url, options);
1497
- success = true;
1807
+ const fallback = resolveWorkerFallback(error, html, url, options);
1498
1808
  return fallback;
1499
1809
  }
1500
1810
  finally {
1501
1811
  endTransformStage(workerStage);
1502
1812
  }
1503
- }
1504
- finally {
1505
- if (success) {
1506
- endTransformStage(totalStage, { truncated: false });
1507
- }
1508
- }
1813
+ });
1509
1814
  }