@j0hanz/superfetch 2.1.0 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -10
- package/dist/cache.js +125 -16
- package/dist/config.d.ts +6 -1
- package/dist/config.js +14 -1
- package/dist/fetch.js +91 -71
- package/dist/http.d.ts +9 -1
- package/dist/http.js +126 -56
- package/dist/instructions.md +66 -0
- package/dist/mcp.js +11 -1
- package/dist/observability.js +1 -1
- package/dist/tools.d.ts +7 -2
- package/dist/tools.js +29 -16
- package/dist/transform.js +714 -409
- package/dist/utils.d.ts +1 -0
- package/dist/utils.js +3 -0
- package/dist/workers/transform-worker.js +1 -3
- package/package.json +3 -3
package/dist/transform.js
CHANGED
|
@@ -5,15 +5,47 @@ import { performance } from 'node:perf_hooks';
|
|
|
5
5
|
import { Worker } from 'node:worker_threads';
|
|
6
6
|
import { parseHTML } from 'linkedom';
|
|
7
7
|
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
8
|
-
import {
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
9
10
|
import { config } from './config.js';
|
|
10
11
|
import { FetchError, getErrorMessage } from './errors.js';
|
|
11
12
|
import { isRawTextContentUrl } from './fetch.js';
|
|
12
13
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
13
|
-
|
|
14
|
-
|
|
14
|
+
import { isRecord } from './utils.js';
|
|
15
|
+
/**
 * Read the `reason` property of an abort-signal-like value.
 *
 * @param {unknown} signal - Value that may carry a `reason` property.
 * @returns {unknown} The `reason` value, or `undefined` when `signal` is not
 *   a record or has no `reason` property.
 */
function getAbortReason(signal) {
    const carriesReason = isRecord(signal) && 'reason' in signal;
    return carriesReason ? signal.reason : undefined;
}
|
|
20
|
+
/**
 * Return `document.body.innerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Parsed document-like value.
 * @returns {string | undefined} The body markup, or `undefined` when the
 *   document or body is not a record or the markup is missing/empty.
 */
function getBodyInnerHtml(document) {
    if (!isRecord(document)) {
        return undefined;
    }
    const bodyNode = document.body;
    if (!isRecord(bodyNode)) {
        return undefined;
    }
    const markup = bodyNode.innerHTML;
    const isNonEmptyString = typeof markup === 'string' && markup.length > 0;
    return isNonEmptyString ? markup : undefined;
}
|
|
31
|
+
/**
 * Return the document's `toString` method bound to the document itself.
 *
 * @param {unknown} document - Parsed document-like value.
 * @returns {(() => string) | undefined} Bound serializer, or `undefined`
 *   when `document` is not a record or has no callable `toString`.
 */
function getDocumentToString(document) {
    const isSerializable = isRecord(document) && typeof document.toString === 'function';
    if (!isSerializable) {
        return undefined;
    }
    // Bind so the caller can invoke the serializer without a receiver.
    return document.toString.bind(document);
}
|
|
38
|
+
/**
 * Return `document.documentElement.outerHTML` when it is a non-empty string.
 *
 * @param {unknown} document - Parsed document-like value.
 * @returns {string | undefined} Serialized root element, or `undefined` when
 *   the document or root element is not a record or the markup is missing/empty.
 */
function getDocumentElementOuterHtml(document) {
    if (!isRecord(document)) {
        return undefined;
    }
    const rootElement = document.documentElement;
    if (!isRecord(rootElement)) {
        return undefined;
    }
    const markup = rootElement.outerHTML;
    const isNonEmptyString = typeof markup === 'string' && markup.length > 0;
    return isNonEmptyString ? markup : undefined;
}
|
|
16
|
-
const FRONTMATTER_DELIMITER = '---';
|
|
17
49
|
const CODE_BLOCK = {
|
|
18
50
|
fence: '```',
|
|
19
51
|
format: (code, language = '') => {
|
|
@@ -59,6 +91,12 @@ export function endTransformStage(context, options) {
|
|
|
59
91
|
};
|
|
60
92
|
publishTransformEvent(event);
|
|
61
93
|
}
|
|
94
|
+
/**
 * Run `fn` between `startTransformStage` and `endTransformStage`.
 *
 * The stage is closed in a `finally` block so that a stage opened for a
 * callback that throws is still ended; the error itself propagates unchanged.
 *
 * @param {string} url - URL passed to `startTransformStage`.
 * @param {string} stage - Stage name passed to `startTransformStage`.
 * @param {() => T} fn - Synchronous work to run inside the stage.
 * @returns {T} Whatever `fn` returns.
 * @template T
 */
function runTransformStage(url, stage, fn) {
    const context = startTransformStage(url, stage);
    try {
        return fn();
    }
    finally {
        endTransformStage(context);
    }
}
|
|
62
100
|
function isTimeoutReason(reason) {
|
|
63
101
|
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
64
102
|
}
|
|
@@ -68,7 +106,7 @@ function throwIfAborted(signal, url, stage) {
|
|
|
68
106
|
const { aborted } = signal;
|
|
69
107
|
if (!aborted)
|
|
70
108
|
return;
|
|
71
|
-
const
|
|
109
|
+
const reason = getAbortReason(signal);
|
|
72
110
|
if (isTimeoutReason(reason)) {
|
|
73
111
|
throw new FetchError('Request timeout', url, 504, {
|
|
74
112
|
reason: 'timeout',
|
|
@@ -192,8 +230,18 @@ function extractArticle(document) {
|
|
|
192
230
|
}
|
|
193
231
|
function parseReadabilityArticle(document) {
|
|
194
232
|
try {
|
|
195
|
-
//
|
|
196
|
-
const
|
|
233
|
+
// Readability mutates the document; operate on a clone.
|
|
234
|
+
const documentClone = document.cloneNode(true);
|
|
235
|
+
// Avoid the more expensive parse() when the page is unlikely to be readable,
|
|
236
|
+
// but don't penalize small documents where the heuristic is often too strict.
|
|
237
|
+
const rawText = documentClone.body.textContent ||
|
|
238
|
+
documentClone.documentElement.textContent;
|
|
239
|
+
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
240
|
+
if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
|
|
241
|
+
return null;
|
|
242
|
+
}
|
|
243
|
+
// Guard against pathological DOM sizes.
|
|
244
|
+
const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
|
|
197
245
|
return reader.parse();
|
|
198
246
|
}
|
|
199
247
|
catch (error) {
|
|
@@ -233,45 +281,48 @@ function addOptionalField(target, key, value) {
|
|
|
233
281
|
export function extractContent(html, url, options = {
|
|
234
282
|
extractArticle: true,
|
|
235
283
|
}) {
|
|
284
|
+
const emptyResult = createEmptyExtractionResult();
|
|
236
285
|
if (!isValidInput(html, url)) {
|
|
237
|
-
return
|
|
286
|
+
return emptyResult;
|
|
238
287
|
}
|
|
239
288
|
return tryExtractContent(html, url, options);
|
|
240
289
|
}
|
|
290
|
+
/**
 * Build a fresh "nothing extracted" result.
 *
 * @returns {{ article: null, metadata: {} }} New object on every call.
 */
function createEmptyExtractionResult() {
    const metadata = {};
    return { article: null, metadata };
}
|
|
293
|
+
/**
 * Run article extraction inside the 'extract:article' transform stage.
 *
 * @param {unknown} document - Parsed document handed to `resolveArticleExtraction`.
 * @param {string} url - URL used to label the stage.
 * @param {boolean} shouldExtract - When falsy, no stage is started and `null` is returned.
 * @returns {unknown} Result of `resolveArticleExtraction`, or `null` when extraction is disabled.
 */
function extractArticleWithStage(document, url, shouldExtract) {
    const extract = () => resolveArticleExtraction(document, shouldExtract);
    return shouldExtract
        ? runTransformStage(url, 'extract:article', extract)
        : null;
}
|
|
298
|
+
/**
 * Convert an extraction error into either a rethrow or an empty result.
 *
 * `FetchError` instances are rethrown as-is. Otherwise `throwIfAborted` is
 * consulted first, then the failure is logged and an empty result is returned.
 *
 * @param {unknown} error - Value caught by the caller.
 * @param {string} url - URL forwarded to `throwIfAborted`.
 * @param {unknown} signal - Abort signal forwarded to `throwIfAborted`.
 * @returns {{ article: null, metadata: {} }} Empty extraction result.
 */
function handleExtractionFailure(error, url, signal) {
    const isFetchFailure = error instanceof FetchError;
    if (isFetchFailure) {
        throw error;
    }
    throwIfAborted(signal, url, 'extract:error');
    // Only real Error instances are passed on to the logger.
    const loggable = error instanceof Error ? error : undefined;
    logError('Failed to extract content', loggable);
    return createEmptyExtractionResult();
}
|
|
306
|
+
/**
 * Parse the HTML and extract metadata and (optionally) the article, checking
 * the abort signal before the first stage and after each one.
 *
 * Stages, in order: 'extract:parse', 'extract:metadata', 'extract:article'.
 *
 * @param {string} html - Raw HTML; truncated via `truncateHtml` before parsing.
 * @param {string} url - Source URL, used for stage labels and as the base URI.
 * @param {{ signal?: unknown, extractArticle?: boolean }} options - Extraction options.
 * @returns {{ article: unknown, metadata: unknown }} Extracted article and metadata.
 */
function extractContentStages(html, url, options) {
    const checkpoint = (stage) => throwIfAborted(options.signal, url, stage);
    checkpoint('extract:begin');
    const parsed = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
    const parsedDocument = parsed.document;
    checkpoint('extract:parsed');
    applyBaseUri(parsedDocument, url);
    const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(parsedDocument));
    checkpoint('extract:metadata');
    const article = extractArticleWithStage(parsedDocument, url, options.extractArticle);
    checkpoint('extract:article');
    return { article, metadata };
}
|
|
241
320
|
function tryExtractContent(html, url, options) {
|
|
242
321
|
try {
|
|
243
|
-
|
|
244
|
-
const parseStage = startTransformStage(url, 'extract:parse');
|
|
245
|
-
const { document } = parseHTML(truncateHtml(html));
|
|
246
|
-
endTransformStage(parseStage);
|
|
247
|
-
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
248
|
-
applyBaseUri(document, url);
|
|
249
|
-
const metadataStage = startTransformStage(url, 'extract:metadata');
|
|
250
|
-
const metadata = extractMetadata(document);
|
|
251
|
-
endTransformStage(metadataStage);
|
|
252
|
-
throwIfAborted(options.signal, url, 'extract:metadata');
|
|
253
|
-
let article;
|
|
254
|
-
if (options.extractArticle) {
|
|
255
|
-
const articleStage = startTransformStage(url, 'extract:article');
|
|
256
|
-
article = resolveArticleExtraction(document, options.extractArticle);
|
|
257
|
-
endTransformStage(articleStage);
|
|
258
|
-
}
|
|
259
|
-
else {
|
|
260
|
-
article = null;
|
|
261
|
-
}
|
|
262
|
-
throwIfAborted(options.signal, url, 'extract:article');
|
|
263
|
-
return {
|
|
264
|
-
article,
|
|
265
|
-
metadata,
|
|
266
|
-
};
|
|
322
|
+
return extractContentStages(html, url, options);
|
|
267
323
|
}
|
|
268
324
|
catch (error) {
|
|
269
|
-
|
|
270
|
-
throw error;
|
|
271
|
-
}
|
|
272
|
-
throwIfAborted(options.signal, url, 'extract:error');
|
|
273
|
-
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
274
|
-
return { article: null, metadata: {} };
|
|
325
|
+
return handleExtractionFailure(error, url, options.signal);
|
|
275
326
|
}
|
|
276
327
|
}
|
|
277
328
|
function isValidInput(html, url) {
|
|
@@ -563,56 +614,6 @@ export function resolveLanguageFromAttributes(className, dataLang) {
|
|
|
563
614
|
const classMatch = extractLanguageFromClassName(className);
|
|
564
615
|
return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
|
|
565
616
|
}
|
|
566
|
-
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
|
|
567
|
-
const YAML_NUMERIC = /^[\d.]+$/;
|
|
568
|
-
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
|
|
569
|
-
const ESCAPE_PATTERNS = {
|
|
570
|
-
backslash: /\\/g,
|
|
571
|
-
quote: /"/g,
|
|
572
|
-
newline: /\n/g,
|
|
573
|
-
tab: /\t/g,
|
|
574
|
-
};
|
|
575
|
-
const YAML_QUOTE_CHECKS = [
|
|
576
|
-
(input) => YAML_SPECIAL_CHARS.test(input),
|
|
577
|
-
(input) => input.startsWith(' ') || input.endsWith(' '),
|
|
578
|
-
(input) => input === '',
|
|
579
|
-
(input) => YAML_NUMERIC.test(input),
|
|
580
|
-
(input) => YAML_RESERVED_WORDS.test(input),
|
|
581
|
-
];
|
|
582
|
-
function needsYamlQuotes(value) {
|
|
583
|
-
return YAML_QUOTE_CHECKS.some((check) => check(value));
|
|
584
|
-
}
|
|
585
|
-
function escapeYamlValue(value) {
|
|
586
|
-
if (!needsYamlQuotes(value)) {
|
|
587
|
-
return value;
|
|
588
|
-
}
|
|
589
|
-
const escaped = value
|
|
590
|
-
.replace(ESCAPE_PATTERNS.backslash, '\\\\')
|
|
591
|
-
.replace(ESCAPE_PATTERNS.quote, '\\"')
|
|
592
|
-
.replace(ESCAPE_PATTERNS.newline, '\\n')
|
|
593
|
-
.replace(ESCAPE_PATTERNS.tab, '\\t');
|
|
594
|
-
return `"${escaped}"`;
|
|
595
|
-
}
|
|
596
|
-
function appendFrontmatterField(lines, key, value) {
|
|
597
|
-
if (!value)
|
|
598
|
-
return;
|
|
599
|
-
lines.push(`${key}: ${escapeYamlValue(value)}`);
|
|
600
|
-
}
|
|
601
|
-
function joinLines(lines) {
|
|
602
|
-
return lines.join('\n');
|
|
603
|
-
}
|
|
604
|
-
function buildFrontmatter(metadata) {
|
|
605
|
-
if (!metadata)
|
|
606
|
-
return '';
|
|
607
|
-
const lines = [FRONTMATTER_DELIMITER];
|
|
608
|
-
appendFrontmatterField(lines, 'title', metadata.title);
|
|
609
|
-
appendFrontmatterField(lines, 'source', metadata.url);
|
|
610
|
-
appendFrontmatterField(lines, 'author', metadata.author);
|
|
611
|
-
appendFrontmatterField(lines, 'description', metadata.description);
|
|
612
|
-
appendFrontmatterField(lines, 'fetchedAt', metadata.fetchedAt);
|
|
613
|
-
lines.push(FRONTMATTER_DELIMITER);
|
|
614
|
-
return joinLines(lines);
|
|
615
|
-
}
|
|
616
617
|
function isElement(node) {
|
|
617
618
|
return (isRecord(node) &&
|
|
618
619
|
'getAttribute' in node &&
|
|
@@ -623,16 +624,13 @@ const STRUCTURAL_TAGS = new Set([
|
|
|
623
624
|
'style',
|
|
624
625
|
'noscript',
|
|
625
626
|
'iframe',
|
|
626
|
-
'nav',
|
|
627
|
-
'footer',
|
|
628
|
-
'aside',
|
|
629
|
-
'header',
|
|
630
627
|
'form',
|
|
631
628
|
'button',
|
|
632
629
|
'input',
|
|
633
630
|
'select',
|
|
634
631
|
'textarea',
|
|
635
632
|
]);
|
|
633
|
+
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
|
|
636
634
|
const NAVIGATION_ROLES = new Set([
|
|
637
635
|
'navigation',
|
|
638
636
|
'banner',
|
|
@@ -641,8 +639,37 @@ const NAVIGATION_ROLES = new Set([
|
|
|
641
639
|
'tree',
|
|
642
640
|
'menubar',
|
|
643
641
|
'menu',
|
|
642
|
+
'dialog',
|
|
643
|
+
'alertdialog',
|
|
644
644
|
]);
|
|
645
|
-
const
|
|
645
|
+
const PROMO_TOKENS = new Set([
|
|
646
|
+
'banner',
|
|
647
|
+
'promo',
|
|
648
|
+
'announcement',
|
|
649
|
+
'cta',
|
|
650
|
+
'callout',
|
|
651
|
+
'advert',
|
|
652
|
+
'ad',
|
|
653
|
+
'ads',
|
|
654
|
+
'sponsor',
|
|
655
|
+
'newsletter',
|
|
656
|
+
'subscribe',
|
|
657
|
+
'cookie',
|
|
658
|
+
'consent',
|
|
659
|
+
'popup',
|
|
660
|
+
'modal',
|
|
661
|
+
'overlay',
|
|
662
|
+
'toast',
|
|
663
|
+
'share',
|
|
664
|
+
'social',
|
|
665
|
+
'related',
|
|
666
|
+
'recommend',
|
|
667
|
+
'comment',
|
|
668
|
+
'breadcrumb',
|
|
669
|
+
'pagination',
|
|
670
|
+
'pager',
|
|
671
|
+
]);
|
|
672
|
+
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
646
673
|
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
647
674
|
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
648
675
|
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
@@ -711,15 +738,26 @@ function isStructuralNoiseTag(tagName) {
|
|
|
711
738
|
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
712
739
|
}
|
|
713
740
|
function isElementHidden(element) {
|
|
741
|
+
const style = element.getAttribute('style') ?? '';
|
|
714
742
|
return (element.getAttribute('hidden') !== null ||
|
|
715
|
-
element.getAttribute('aria-hidden') === 'true'
|
|
743
|
+
element.getAttribute('aria-hidden') === 'true' ||
|
|
744
|
+
/\bdisplay\s*:\s*none\b/i.test(style) ||
|
|
745
|
+
/\bvisibility\s*:\s*hidden\b/i.test(style));
|
|
716
746
|
}
|
|
717
747
|
function hasNoiseRole(role) {
|
|
718
748
|
return role !== null && NAVIGATION_ROLES.has(role);
|
|
719
749
|
}
|
|
750
|
+
/**
 * Split class/id-like text into lowercase alphanumeric tokens.
 *
 * Every run of characters outside `a-z0-9` (after lowercasing) acts as a
 * separator, so "Site-Header promo_Box" yields ["site", "header", "promo", "box"].
 *
 * @param {string} value - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance; empty when none are found.
 */
function tokenizeIdentifierLikeText(value) {
    const alphanumericRuns = value.toLowerCase().match(/[a-z0-9]+/g);
    return alphanumericRuns === null ? [] : [...alphanumericRuns];
}
|
|
720
758
|
function matchesPromoIdOrClass(className, id) {
|
|
721
|
-
const
|
|
722
|
-
return
|
|
759
|
+
const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
|
|
760
|
+
return tokens.some((token) => PROMO_TOKENS.has(token));
|
|
723
761
|
}
|
|
724
762
|
function matchesHighZIsolate(className) {
|
|
725
763
|
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
@@ -736,42 +774,49 @@ function readElementMetadata(element) {
|
|
|
736
774
|
isHidden: isElementHidden(element),
|
|
737
775
|
};
|
|
738
776
|
}
|
|
777
|
+
/**
 * Decide whether a `<header>` element looks like site chrome rather than content.
 *
 * @param {{ className: string, id: string, role: string | null }} metadata - Element metadata.
 * @returns {boolean} `true` when the role is a noise role or the combined
 *   class/id text matches `HEADER_NOISE_PATTERN`.
 */
function isBoilerplateHeader({ className, id, role, }) {
    const classAndId = `${className} ${id}`.toLowerCase();
    return hasNoiseRole(role) || HEADER_NOISE_PATTERN.test(classAndId);
}
|
|
739
783
|
function isNoiseElement(node) {
|
|
740
784
|
const metadata = readElementMetadata(node);
|
|
741
785
|
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
786
|
+
ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
|
|
787
|
+
(metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
|
|
742
788
|
metadata.isHidden ||
|
|
743
789
|
hasNoiseRole(metadata.role) ||
|
|
744
790
|
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
745
791
|
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
746
792
|
}
|
|
793
|
+
/**
 * Remove every element classified as noise from the document, in place.
 *
 * Nodes are visited from the end of the `querySelectorAll('*')` result back
 * to the start.
 *
 * @param {{ querySelectorAll: (selector: string) => any }} document - Document to mutate.
 * @returns {void}
 */
function stripNoiseNodes(document) {
    const candidates = document.querySelectorAll('*');
    // Support both NodeList-style `.item()` access and plain indexed access.
    const readAt = (position) => typeof candidates.item === 'function'
        ? candidates.item(position)
        : candidates[position];
    let position = candidates.length;
    while (position > 0) {
        position -= 1;
        const candidate = readAt(position);
        if (candidate && isElement(candidate) && isNoiseElement(candidate)) {
            candidate.remove();
        }
    }
}
|
|
747
804
|
function removeNoiseFromHtml(html) {
|
|
748
805
|
const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
|
|
749
806
|
if (!shouldParse)
|
|
750
807
|
return html;
|
|
751
|
-
const shouldRemove = mayContainNoise(html);
|
|
752
808
|
try {
|
|
753
809
|
const { document } = parseHTML(html);
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
}
|
|
765
|
-
const { body } = document;
|
|
766
|
-
if (body?.innerHTML)
|
|
767
|
-
return body.innerHTML;
|
|
768
|
-
if (typeof document.toString ===
|
|
769
|
-
'function') {
|
|
770
|
-
return document.toString();
|
|
771
|
-
}
|
|
772
|
-
const { documentElement } = document;
|
|
773
|
-
if (documentElement?.outerHTML)
|
|
774
|
-
return documentElement.outerHTML;
|
|
810
|
+
stripNoiseNodes(document);
|
|
811
|
+
const bodyInnerHtml = getBodyInnerHtml(document);
|
|
812
|
+
if (bodyInnerHtml)
|
|
813
|
+
return bodyInnerHtml;
|
|
814
|
+
const docToString = getDocumentToString(document);
|
|
815
|
+
if (docToString)
|
|
816
|
+
return docToString();
|
|
817
|
+
const documentElementOuterHtml = getDocumentElementOuterHtml(document);
|
|
818
|
+
if (documentElementOuterHtml)
|
|
819
|
+
return documentElementOuterHtml;
|
|
775
820
|
return html;
|
|
776
821
|
}
|
|
777
822
|
catch {
|
|
@@ -785,56 +830,110 @@ function buildInlineCode(content) {
|
|
|
785
830
|
const padding = delimiter.length > 1 ? ' ' : '';
|
|
786
831
|
return `${delimiter}${padding}${content}${padding}${delimiter}`;
|
|
787
832
|
}
|
|
833
|
+
/**
 * Derive alt text from an image URL by extracting and humanizing the filename.
 * Used as a fallback when the image has no alt attribute.
 *
 * - `data:` and `blob:` URIs carry no meaningful filename and yield `''`.
 * - Only values that start with `http://` or `https://` are parsed as
 *   absolute URLs; anything else (including relative names that merely begin
 *   with "http", such as `httpd-logo.png`) is treated as a path.
 * - Query strings and fragments are ignored.
 *
 * @param {string} src - Image `src` attribute value.
 * @returns {string} Humanized filename without extension, or `''` when none can be derived.
 */
function deriveAltFromImageUrl(src) {
    if (!src)
        return '';
    // Inline payloads and object URLs would otherwise produce garbage alt text.
    if (/^(?:data|blob):/i.test(src))
        return '';
    try {
        // Handle both absolute and relative URLs.
        const pathname = /^https?:\/\//i.test(src)
            ? new URL(src).pathname
            : (src.split(/[?#]/)[0] ?? '');
        // Extract filename from path.
        const filename = pathname.split('/').pop() ?? '';
        if (!filename)
            return '';
        // Remove file extension (a leading dot is part of the name, not an extension).
        const dotIndex = filename.lastIndexOf('.');
        const name = dotIndex > 0 ? filename.slice(0, dotIndex) : filename;
        // Humanize: replace separators with spaces.
        return name.replace(/[_-]+/g, ' ').trim();
    }
    catch {
        // Malformed absolute URL: fall back to no alt text.
        return '';
    }
}
|
|
788
860
|
/**
 * Tell whether a parent node is a preformatted container (`PRE` or `WRAPPED-PRE`).
 *
 * @param {unknown} parent - Candidate parent node.
 * @returns {boolean} `true` when `parent` is a record whose `tagName`,
 *   compared case-insensitively, is `PRE` or `WRAPPED-PRE`.
 */
function isCodeBlock(parent) {
    if (!isRecord(parent)) {
        return false;
    }
    const { tagName } = parent;
    if (typeof tagName !== 'string') {
        return false;
    }
    const upperTag = tagName.toUpperCase();
    return upperTag === 'PRE' || upperTag === 'WRAPPED-PRE';
}
|
|
794
|
-
function
|
|
866
|
+
/**
 * Check that a value is a record exposing a callable `getAttribute`.
 *
 * @param {unknown} value - Candidate node.
 * @returns {boolean} `true` when `value.getAttribute` is a function.
 */
function hasGetAttribute(value) {
    if (!isRecord(value)) {
        return false;
    }
    return typeof value.getAttribute === 'function';
}
|
|
869
|
+
/**
 * Check that a value is a record whose `codeBlockTranslators` is also a record.
 *
 * @param {unknown} value - Candidate converter instance.
 * @returns {boolean} `true` when both `value` and `value.codeBlockTranslators` are records.
 */
function hasCodeBlockTranslators(value) {
    if (!isRecord(value)) {
        return false;
    }
    return isRecord(value.codeBlockTranslators);
}
|
|
872
|
+
function buildInlineCodeTranslator() {
|
|
795
873
|
return {
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
return
|
|
825
|
-
noEscape: true,
|
|
826
|
-
preserveWhitespace: true,
|
|
827
|
-
...(codeBlockTranslators
|
|
828
|
-
? { childTranslators: codeBlockTranslators }
|
|
829
|
-
: null),
|
|
830
|
-
postprocess: ({ content }) => {
|
|
831
|
-
const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
|
|
832
|
-
return CODE_BLOCK.format(content, language);
|
|
833
|
-
},
|
|
834
|
-
};
|
|
874
|
+
spaceIfRepeatingChar: true,
|
|
875
|
+
noEscape: true,
|
|
876
|
+
postprocess: ({ content }) => buildInlineCode(content),
|
|
877
|
+
};
|
|
878
|
+
}
|
|
879
|
+
/**
 * Resolve a code language from a node's `class` and `data-language` attributes.
 *
 * @param {unknown} node - Code element; attributes read as `''` when the node
 *   has no `getAttribute` method or the attribute is absent.
 * @returns {unknown} Whatever `resolveLanguageFromAttributes` returns for the two values.
 */
function resolveAttributeLanguage(node) {
    const readAttribute = (name) => (hasGetAttribute(node)
        ? (node.getAttribute(name) ?? '')
        : '');
    const className = readAttribute('class');
    const dataLanguage = readAttribute('data-language');
    return resolveLanguageFromAttributes(className, dataLanguage);
}
|
|
887
|
+
/**
 * Pull `codeBlockTranslators` off the visitor's converter instance.
 *
 * @param {unknown} visitor - Visitor object; its `instance` property is inspected.
 * @returns {object | null} The translators record, or `null` when unavailable.
 */
function resolveCodeBlockTranslators(visitor) {
    const instance = isRecord(visitor) ? visitor.instance : null;
    if (!hasCodeBlockTranslators(instance)) {
        return null;
    }
    return instance.codeBlockTranslators;
}
|
|
893
|
+
/**
 * Build the translator config for a `<code>` element inside a code block.
 *
 * @param {string | null | undefined} attributeLanguage - Language resolved from
 *   attributes; when nullish, `detectLanguageFromCode` is tried, then `''`.
 * @param {object | null} codeBlockTranslators - Optional child translators;
 *   included as `childTranslators` only when truthy.
 * @returns {object} Translator config with `noEscape`, `preserveWhitespace`,
 *   optional `childTranslators`, and a `postprocess` that formats via `CODE_BLOCK.format`.
 */
function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
    const translator = {
        noEscape: true,
        preserveWhitespace: true,
    };
    if (codeBlockTranslators) {
        translator.childTranslators = codeBlockTranslators;
    }
    translator.postprocess = ({ content }) => {
        const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
        return CODE_BLOCK.format(content, language);
    };
    return translator;
}
|
|
906
|
+
/**
 * Choose the translator config for a `<code>` element.
 *
 * Inline code is used unless `ctx` is a record whose `parent` is a code block.
 *
 * @param {unknown} ctx - Translator context; `node`, `parent` and `visitor` are read from it.
 * @returns {object} Inline-code or code-block translator config.
 */
function buildCodeTranslator(ctx) {
    const isBlockContext = isRecord(ctx) && isCodeBlock(ctx.parent);
    if (!isBlockContext) {
        return buildInlineCodeTranslator();
    }
    const language = resolveAttributeLanguage(ctx.node);
    const childTranslators = resolveCodeBlockTranslators(ctx.visitor);
    return buildCodeBlockTranslator(language, childTranslators);
}
|
|
916
|
+
/**
 * Build the translator result for an `<img>` element.
 *
 * Emits Markdown image syntax `![alt](src)`. The alt text is the element's
 * own `alt` attribute when it is non-blank, otherwise a name derived from the
 * `src` filename via `deriveAltFromImageUrl`.
 *
 * @param {unknown} ctx - Translator context; only `ctx.node` is read.
 * @returns {{ content: string }} Markdown for the image, or empty content
 *   when `ctx` is not a record.
 */
function buildImageTranslator(ctx) {
    if (!isRecord(ctx))
        return { content: '' };
    const { node } = ctx;
    const getAttribute = hasGetAttribute(node)
        ? node.getAttribute.bind(node)
        : undefined;
    const src = getAttribute?.('src') ?? '';
    const existingAlt = getAttribute?.('alt') ?? '';
    // Use existing alt text if present, otherwise derive from filename.
    const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
    return {
        content: `![${alt}](${src})`,
    };
}
|
|
931
|
+
/**
 * Create the custom translator table passed to the Markdown converter.
 *
 * @returns {{ code: (ctx: unknown) => object, img: (ctx: unknown) => object }}
 *   Translator factories keyed by tag name.
 */
function createCustomTranslators() {
    const code = (ctx) => buildCodeTranslator(ctx);
    const img = (ctx) => buildImageTranslator(ctx);
    return { code, img };
}
|
|
838
937
|
let markdownInstance = null;
|
|
839
938
|
function createMarkdownInstance() {
|
|
840
939
|
return new NodeHtmlMarkdown({
|
|
@@ -842,36 +941,86 @@ function createMarkdownInstance() {
|
|
|
842
941
|
codeBlockStyle: 'fenced',
|
|
843
942
|
emDelimiter: '_',
|
|
844
943
|
bulletMarker: '-',
|
|
845
|
-
},
|
|
944
|
+
}, createCustomTranslators());
|
|
846
945
|
}
|
|
847
946
|
function getMarkdownConverter() {
|
|
848
947
|
markdownInstance ??= createMarkdownInstance();
|
|
849
948
|
return markdownInstance;
|
|
850
949
|
}
|
|
950
|
+
/**
 * Convert HTML to cleaned-up Markdown, checking the abort signal around each stage.
 *
 * Stages, in order: 'markdown:noise' (noise removal) and 'markdown:translate'
 * (HTML → Markdown, trimmed). The result is then passed through
 * `cleanupMarkdownArtifacts`.
 *
 * @param {string} html - HTML to convert.
 * @param {string} url - URL used for stage labels and abort errors.
 * @param {unknown} signal - Optional abort signal.
 * @returns {string} Cleaned Markdown.
 */
function translateHtmlToMarkdown(html, url, signal) {
    const checkpoint = (stage) => throwIfAborted(signal, url, stage);
    checkpoint('markdown:begin');
    const denoisedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
    checkpoint('markdown:cleaned');
    const markdown = runTransformStage(url, 'markdown:translate', () => {
        const converter = getMarkdownConverter();
        return converter.translate(denoisedHtml).trim();
    });
    checkpoint('markdown:translated');
    // Post-process the markdown to clean up common conversion artifacts.
    return cleanupMarkdownArtifacts(markdown);
}
|
|
959
|
+
/**
 * Append the metadata footer to the Markdown content.
 *
 * The metadata goes at the bottom rather than the top so a title is not
 * duplicated when the content already opens with an H1 heading.
 *
 * @param {string} content - Markdown body.
 * @param {object | undefined} metadata - Metadata passed to `buildMetadataFooter`.
 * @param {string} url - Fallback source URL passed to `buildMetadataFooter`.
 * @returns {string} Content followed by a blank line and the footer, or the
 *   content unchanged when the footer is empty.
 */
function appendMetadataFooter(content, metadata, url) {
    const footer = buildMetadataFooter(metadata, url);
    if (!footer) {
        return content;
    }
    return `${content}\n\n${footer}`;
}
|
|
851
965
|
export function htmlToMarkdown(html, metadata, options) {
|
|
852
966
|
const url = options?.url ?? metadata?.url ?? '';
|
|
853
|
-
const frontmatter = buildFrontmatter(metadata);
|
|
854
967
|
if (!html)
|
|
855
|
-
return
|
|
968
|
+
return buildMetadataFooter(metadata, url);
|
|
856
969
|
try {
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
const cleanedHtml = removeNoiseFromHtml(html);
|
|
860
|
-
endTransformStage(noiseStage);
|
|
861
|
-
throwIfAborted(options?.signal, url, 'markdown:cleaned');
|
|
862
|
-
const translateStage = startTransformStage(url, 'markdown:translate');
|
|
863
|
-
const content = getMarkdownConverter().translate(cleanedHtml).trim();
|
|
864
|
-
endTransformStage(translateStage);
|
|
865
|
-
throwIfAborted(options?.signal, url, 'markdown:translated');
|
|
866
|
-
return frontmatter ? `${frontmatter}\n${content}` : content;
|
|
970
|
+
const content = translateHtmlToMarkdown(html, url, options?.signal);
|
|
971
|
+
return appendMetadataFooter(content, metadata, url);
|
|
867
972
|
}
|
|
868
973
|
catch (error) {
|
|
869
974
|
if (error instanceof FetchError) {
|
|
870
975
|
throw error;
|
|
871
976
|
}
|
|
872
|
-
return
|
|
977
|
+
return buildMetadataFooter(metadata, url);
|
|
873
978
|
}
|
|
874
979
|
}
|
|
980
|
+
/**
 * Clean up common markdown conversion artifacts:
 * - Empty headings (e.g., "## " with no text)
 * - Anchor-only links like [ ](#section-id) used for navigation
 * - Concatenated links without spacing
 * - Boilerplate phrases like "Was this page helpful?"
 * - Runs of three or more newlines (collapsed to two)
 *
 * Fenced code blocks (``` or ~~~) are passed through untouched, so code such
 * as a bare `#` comment line, `a[0](x)[1]`, or intentional blank lines is
 * never rewritten. An unclosed fence protects everything up to the end of
 * the content.
 *
 * @param {string} content - Markdown produced by the converter.
 * @returns {string} Cleaned, trimmed Markdown.
 */
function cleanupMarkdownArtifacts(content) {
    const cleanProse = (text) => text
        // Remove empty Markdown headings like "## " produced by placeholder nodes.
        .replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '')
        // Remove anchor-only links like [\u200B](#section-id) or [ ](#anchor).
        // Only trailing spaces/tabs are consumed: eating newlines would merge
        // a heading with the paragraph that follows it.
        .replace(/\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g, '')
        // Add line breaks between concatenated links: ](url)[text] -> ](url)\n\n[text]
        .replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[')
        // Remove common boilerplate phrases.
        .replace(/^Was this page helpful\??\s*$/gim, '')
        // Collapse multiple blank lines into at most two newlines.
        .replace(/\n{3,}/g, '\n\n');
    const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
    const pieces = [];
    const lines = content.split('\n');
    let prose = '';
    let code = '';
    let openFence = null;
    lines.forEach((line, index) => {
        const newline = index < lines.length - 1 ? '\n' : '';
        const marker = fencePattern.exec(line)?.[1];
        if (openFence === null) {
            if (marker === undefined) {
                prose += line + newline;
                return;
            }
            // Opening fence: flush the prose collected so far.
            pieces.push(cleanProse(prose));
            prose = '';
            openFence = marker;
            code = line + newline;
            return;
        }
        // A closing fence uses the same character, is at least as long as the
        // opening fence, and has nothing else on the line.
        const closesFence = marker !== undefined &&
            marker[0] === openFence[0] &&
            marker.length >= openFence.length &&
            line.trim() === marker;
        if (!closesFence) {
            code += line + newline;
            return;
        }
        pieces.push(code + line);
        code = '';
        openFence = null;
        // The newline after the fence belongs to the following prose so that
        // blank-line collapsing sees the whole run.
        prose = newline;
    });
    pieces.push(openFence === null ? cleanProse(prose) : code);
    return pieces.join('').trim();
}
|
|
1004
|
+
/**
 * Render metadata as a Markdown footer introduced by a horizontal rule.
 *
 * Fields are emitted in the order Title, Description, Author, Source,
 * Fetched; falsy fields are skipped. `fallbackUrl` is used as the source
 * when `metadata.url` is falsy.
 *
 * @param {{ title?: string, description?: string, author?: string, url?: string, fetchedAt?: string } | null | undefined} metadata
 *   Metadata to render.
 * @param {string} [fallbackUrl] - Source URL used when `metadata.url` is absent.
 * @returns {string} The footer, or `''` when there is no metadata or no
 *   field to show (a lone `---` separator is never emitted).
 */
function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata)
        return '';
    const fields = [
        ['Title', metadata.title],
        ['Description', metadata.description],
        ['Author', metadata.author],
        ['Source', metadata.url || fallbackUrl],
        ['Fetched', metadata.fetchedAt],
    ];
    const lines = fields
        .filter(([, value]) => Boolean(value))
        .map(([label, value]) => `**${label}:** ${value}`);
    if (lines.length === 0)
        return '';
    // Horizontal rule as a clear footer separator.
    return ['---', ...lines].join('\n');
}
|
|
875
1024
|
const HEADING_PATTERN = /^#{1,6}\s/m;
|
|
876
1025
|
const LIST_PATTERN = /^(?:[-*+])\s/m;
|
|
877
1026
|
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
|
|
@@ -895,6 +1044,7 @@ function looksLikeMarkdown(content) {
|
|
|
895
1044
|
function detectLineEnding(content) {
|
|
896
1045
|
return content.includes('\r\n') ? '\r\n' : '\n';
|
|
897
1046
|
}
|
|
1047
|
+
const FRONTMATTER_DELIMITER = '---';
|
|
898
1048
|
function findFrontmatterLines(content) {
|
|
899
1049
|
const lineEnding = detectLineEnding(content);
|
|
900
1050
|
const lines = content.split(lineEnding);
|
|
@@ -930,10 +1080,32 @@ function parseFrontmatterEntry(line) {
|
|
|
930
1080
|
function isTitleKey(key) {
|
|
931
1081
|
return key === 'title' || key === 'name';
|
|
932
1082
|
}
|
|
1083
|
+
/**
 * Use the first non-blank line as a title when it is an ATX heading.
 *
 * The line qualifies when it starts with one to six `#` characters followed
 * by a space or tab. Only the first non-blank line is considered.
 *
 * @param {string} content - Raw Markdown.
 * @returns {string | undefined} Trimmed heading text, or `undefined` when the
 *   first non-blank line is not a heading or the heading has no text.
 */
function extractTitleFromHeading(content) {
    const firstLine = content
        .split(detectLineEnding(content))
        .find((line) => line.trim().length > 0);
    if (firstLine === undefined) {
        return undefined;
    }
    const headingMatch = /^#{1,6}[ \t]([\s\S]*)$/.exec(firstLine.trim());
    if (headingMatch === null) {
        return undefined;
    }
    const title = headingMatch[1].trim();
    return title.length > 0 ? title : undefined;
}
|
|
933
1104
|
function extractTitleFromRawMarkdown(content) {
|
|
934
1105
|
const frontmatter = findFrontmatterLines(content);
|
|
935
|
-
if (!frontmatter)
|
|
936
|
-
return
|
|
1106
|
+
if (!frontmatter) {
|
|
1107
|
+
return extractTitleFromHeading(content);
|
|
1108
|
+
}
|
|
937
1109
|
const { lines, endIndex } = frontmatter;
|
|
938
1110
|
const entry = lines
|
|
939
1111
|
.slice(1, endIndex)
|
|
@@ -944,8 +1116,48 @@ function extractTitleFromRawMarkdown(content) {
|
|
|
944
1116
|
const value = stripOptionalQuotes(entry.value);
|
|
945
1117
|
return value || undefined;
|
|
946
1118
|
}
|
|
1119
|
+
/**
 * Check whether one of the first 50 lines starts with "source:"
 * (case-insensitive, leading whitespace ignored).
 *
 * @param {string} content - Raw Markdown.
 * @returns {boolean} `true` when such a line exists within the scanned prefix.
 */
function hasMarkdownSourceLine(content) {
    // Only scan a small prefix to avoid wasting time on huge docs.
    const prefixLines = content.split(detectLineEnding(content)).slice(0, 50);
    return prefixLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
}
|
|
1134
|
+
/**
 * Insert a plain `Source: <url>` line into Markdown that lacks one.
 *
 * When the first non-blank line is a heading, the source line goes right
 * after it, surrounded by blank lines. Otherwise it is prepended to the
 * content, followed by a blank line. The content's line ending is preserved.
 *
 * @param {string} content - Raw Markdown.
 * @param {string} url - Source URL to record.
 * @returns {string} Content with the source line, or the content unchanged
 *   when `hasMarkdownSourceLine` already finds one.
 */
function addSourceToMarkdownMarkdownFormat(content, url) {
    if (hasMarkdownSourceLine(content)) {
        return content;
    }
    const eol = detectLineEnding(content);
    const sourceLine = `Source: ${url}`;
    const rows = content.split(eol);
    const firstTextRow = rows.findIndex((row) => row.trim().length > 0);
    const opensWithHeading = firstTextRow !== -1 && /^#{1,6}\s+/.test(rows[firstTextRow].trim());
    if (!opensWithHeading) {
        return [sourceLine, '', content].join(eol);
    }
    rows.splice(firstTextRow + 1, 0, '', sourceLine, '');
    return rows.join(eol);
}
|
|
947
1156
|
function addSourceToMarkdown(content, url) {
|
|
948
1157
|
const frontmatter = findFrontmatterLines(content);
|
|
1158
|
+
if (config.transform.metadataFormat === 'markdown' && !frontmatter) {
|
|
1159
|
+
return addSourceToMarkdownMarkdownFormat(content, url);
|
|
1160
|
+
}
|
|
949
1161
|
if (!frontmatter) {
|
|
950
1162
|
return `---\nsource: "${url}"\n---\n\n${content}`;
|
|
951
1163
|
}
|
|
@@ -1086,19 +1298,11 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
1086
1298
|
applyExtractedMetadata(metadata, extractedMeta);
|
|
1087
1299
|
return metadata;
|
|
1088
1300
|
}
|
|
1089
|
-
function
|
|
1090
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta,
|
|
1301
|
+
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
|
|
1302
|
+
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
1091
1303
|
return {
|
|
1092
|
-
sourceHtml: article.content,
|
|
1093
|
-
title: article.title,
|
|
1094
|
-
metadata,
|
|
1095
|
-
};
|
|
1096
|
-
}
|
|
1097
|
-
function buildFullHtmlContentSource({ html, url, article, extractedMeta, includeMetadata, }) {
|
|
1098
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta, false, includeMetadata);
|
|
1099
|
-
return {
|
|
1100
|
-
sourceHtml: html,
|
|
1101
|
-
title: extractedMeta.title,
|
|
1304
|
+
sourceHtml: useArticleContent && article ? article.content : html,
|
|
1305
|
+
title: useArticleContent && article ? article.title : extractedMeta.title,
|
|
1102
1306
|
metadata,
|
|
1103
1307
|
};
|
|
1104
1308
|
}
|
|
@@ -1108,84 +1312,83 @@ function logQualityGateFallback({ url, articleLength, }) {
|
|
|
1108
1312
|
articleLength,
|
|
1109
1313
|
});
|
|
1110
1314
|
}
|
|
1111
|
-
function
|
|
1112
|
-
if (!article)
|
|
1113
|
-
return null;
|
|
1315
|
+
function shouldUseArticleContent(article, html, url) {
|
|
1114
1316
|
const shouldExtractFromArticle = determineContentExtractionSource(article);
|
|
1115
|
-
if (shouldExtractFromArticle
|
|
1116
|
-
return
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
extractedMeta,
|
|
1120
|
-
includeMetadata,
|
|
1121
|
-
});
|
|
1122
|
-
}
|
|
1123
|
-
if (shouldExtractFromArticle) {
|
|
1124
|
-
logQualityGateFallback({
|
|
1125
|
-
url,
|
|
1126
|
-
articleLength: article.textContent.length,
|
|
1127
|
-
});
|
|
1317
|
+
if (!shouldExtractFromArticle)
|
|
1318
|
+
return false;
|
|
1319
|
+
if (isExtractionSufficient(article, html)) {
|
|
1320
|
+
return true;
|
|
1128
1321
|
}
|
|
1129
|
-
|
|
1322
|
+
logQualityGateFallback({
|
|
1323
|
+
url,
|
|
1324
|
+
articleLength: article.textContent.length,
|
|
1325
|
+
});
|
|
1326
|
+
return false;
|
|
1130
1327
|
}
|
|
1131
1328
|
function resolveContentSource({ html, url, includeMetadata, signal, }) {
|
|
1132
1329
|
const { article, metadata: extractedMeta } = extractContent(html, url, {
|
|
1133
1330
|
extractArticle: true,
|
|
1134
1331
|
...(signal ? { signal } : {}),
|
|
1135
1332
|
});
|
|
1136
|
-
const
|
|
1333
|
+
const useArticleContent = article
|
|
1334
|
+
? shouldUseArticleContent(article, html, url)
|
|
1335
|
+
: false;
|
|
1336
|
+
return buildContentSource({
|
|
1137
1337
|
html,
|
|
1138
1338
|
url,
|
|
1139
1339
|
article,
|
|
1140
1340
|
extractedMeta,
|
|
1141
1341
|
includeMetadata,
|
|
1342
|
+
useArticleContent,
|
|
1142
1343
|
});
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
return
|
|
1344
|
+
}
|
|
1345
|
+
function tryTransformRawStage(html, url, includeMetadata) {
|
|
1346
|
+
return runTransformStage(url, 'transform:raw', () => tryTransformRawContent({
|
|
1146
1347
|
html,
|
|
1147
1348
|
url,
|
|
1148
|
-
article,
|
|
1149
|
-
extractedMeta,
|
|
1150
1349
|
includeMetadata,
|
|
1151
|
-
});
|
|
1350
|
+
}));
|
|
1152
1351
|
}
|
|
1153
|
-
|
|
1352
|
+
function resolveContentSourceStage(html, url, includeMetadata, signal) {
|
|
1353
|
+
return runTransformStage(url, 'transform:extract', () => resolveContentSource({
|
|
1354
|
+
html,
|
|
1355
|
+
url,
|
|
1356
|
+
includeMetadata,
|
|
1357
|
+
...(signal ? { signal } : {}),
|
|
1358
|
+
}));
|
|
1359
|
+
}
|
|
1360
|
+
function buildMarkdownFromContext(context, url, signal) {
|
|
1361
|
+
const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1362
|
+
url,
|
|
1363
|
+
...(signal ? { signal } : {}),
|
|
1364
|
+
}));
|
|
1365
|
+
return {
|
|
1366
|
+
markdown: content,
|
|
1367
|
+
title: context.title,
|
|
1368
|
+
truncated: false,
|
|
1369
|
+
};
|
|
1370
|
+
}
|
|
1371
|
+
function runTotalTransformStage(url, fn) {
|
|
1154
1372
|
const totalStage = startTransformStage(url, 'transform:total');
|
|
1155
1373
|
let success = false;
|
|
1156
1374
|
try {
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
endTransformStage(rawStage);
|
|
1165
|
-
if (raw) {
|
|
1166
|
-
success = true;
|
|
1167
|
-
return raw;
|
|
1375
|
+
const result = fn();
|
|
1376
|
+
success = true;
|
|
1377
|
+
return result;
|
|
1378
|
+
}
|
|
1379
|
+
finally {
|
|
1380
|
+
if (success) {
|
|
1381
|
+
endTransformStage(totalStage, { truncated: false });
|
|
1168
1382
|
}
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
endTransformStage(extractStage);
|
|
1177
|
-
const markdownStage = startTransformStage(url, 'transform:markdown');
|
|
1178
|
-
const content = htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1179
|
-
url,
|
|
1180
|
-
...(options.signal ? { signal: options.signal } : {}),
|
|
1181
|
-
});
|
|
1182
|
-
endTransformStage(markdownStage);
|
|
1383
|
+
}
|
|
1384
|
+
}
|
|
1385
|
+
async function runTotalTransformStageAsync(url, fn) {
|
|
1386
|
+
const totalStage = startTransformStage(url, 'transform:total');
|
|
1387
|
+
let success = false;
|
|
1388
|
+
try {
|
|
1389
|
+
const result = await fn();
|
|
1183
1390
|
success = true;
|
|
1184
|
-
return
|
|
1185
|
-
markdown: content,
|
|
1186
|
-
title: context.title,
|
|
1187
|
-
truncated: false,
|
|
1188
|
-
};
|
|
1391
|
+
return result;
|
|
1189
1392
|
}
|
|
1190
1393
|
finally {
|
|
1191
1394
|
if (success) {
|
|
@@ -1193,15 +1396,47 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1193
1396
|
}
|
|
1194
1397
|
}
|
|
1195
1398
|
}
|
|
1399
|
+
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
1400
|
+
return runTotalTransformStage(url, () => {
|
|
1401
|
+
throwIfAborted(options.signal, url, 'transform:begin');
|
|
1402
|
+
const raw = tryTransformRawStage(html, url, options.includeMetadata);
|
|
1403
|
+
if (raw) {
|
|
1404
|
+
return raw;
|
|
1405
|
+
}
|
|
1406
|
+
const context = resolveContentSourceStage(html, url, options.includeMetadata, options.signal);
|
|
1407
|
+
return buildMarkdownFromContext(context, url, options.signal);
|
|
1408
|
+
});
|
|
1409
|
+
}
|
|
1410
|
+
const workerMessageSchema = z.discriminatedUnion('type', [
|
|
1411
|
+
z.object({
|
|
1412
|
+
type: z.literal('result'),
|
|
1413
|
+
id: z.string(),
|
|
1414
|
+
result: z.object({
|
|
1415
|
+
markdown: z.string(),
|
|
1416
|
+
title: z.string().optional(),
|
|
1417
|
+
truncated: z.boolean(),
|
|
1418
|
+
}),
|
|
1419
|
+
}),
|
|
1420
|
+
z.object({
|
|
1421
|
+
type: z.literal('error'),
|
|
1422
|
+
id: z.string(),
|
|
1423
|
+
error: z.object({
|
|
1424
|
+
name: z.string(),
|
|
1425
|
+
message: z.string(),
|
|
1426
|
+
url: z.string(),
|
|
1427
|
+
statusCode: z.number().optional(),
|
|
1428
|
+
details: z.record(z.string(), z.unknown()).optional(),
|
|
1429
|
+
}),
|
|
1430
|
+
}),
|
|
1431
|
+
]);
|
|
1196
1432
|
let pool = null;
|
|
1197
1433
|
function resolveDefaultWorkerCount() {
|
|
1198
1434
|
const parallelism = typeof os.availableParallelism === 'function'
|
|
1199
1435
|
? os.availableParallelism()
|
|
1200
1436
|
: os.cpus().length;
|
|
1201
|
-
// Leave 1 core for the event loop; cap to avoid runaway memory.
|
|
1202
1437
|
return Math.min(16, Math.max(1, parallelism - 1));
|
|
1203
1438
|
}
|
|
1204
|
-
const DEFAULT_TIMEOUT_MS =
|
|
1439
|
+
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1205
1440
|
function getOrCreateTransformWorkerPool() {
|
|
1206
1441
|
pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
|
|
1207
1442
|
return pool;
|
|
@@ -1219,23 +1454,108 @@ class WorkerPool {
|
|
|
1219
1454
|
timeoutMs;
|
|
1220
1455
|
queueMax;
|
|
1221
1456
|
closed = false;
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
this.queueMax = safeSize * 2;
|
|
1226
|
-
for (let index = 0; index < safeSize; index += 1) {
|
|
1227
|
-
this.workers.push(this.spawnWorker(index));
|
|
1457
|
+
ensureOpen() {
|
|
1458
|
+
if (this.closed) {
|
|
1459
|
+
throw new Error('Transform worker pool closed');
|
|
1228
1460
|
}
|
|
1229
1461
|
}
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1462
|
+
ensureNotAborted(signal, url, stage) {
|
|
1463
|
+
if (!signal?.aborted)
|
|
1464
|
+
return;
|
|
1465
|
+
throw new FetchError('Request was canceled', url, 499, {
|
|
1466
|
+
reason: 'aborted',
|
|
1467
|
+
stage,
|
|
1468
|
+
});
|
|
1469
|
+
}
|
|
1470
|
+
ensureQueueCapacity(url) {
|
|
1471
|
+
if (this.queue.length < this.queueMax)
|
|
1472
|
+
return;
|
|
1473
|
+
throw new FetchError('Transform worker queue is full', url, 503, {
|
|
1474
|
+
reason: 'queue_full',
|
|
1475
|
+
stage: 'transform:enqueue',
|
|
1476
|
+
});
|
|
1477
|
+
}
|
|
1478
|
+
clearAbortListener(signal, listener) {
|
|
1479
|
+
if (!signal || !listener)
|
|
1480
|
+
return;
|
|
1481
|
+
try {
|
|
1482
|
+
signal.removeEventListener('abort', listener);
|
|
1483
|
+
}
|
|
1484
|
+
catch {
|
|
1485
|
+
// ignore
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1488
|
+
markSlotIdle(workerIndex) {
|
|
1489
|
+
const slot = this.workers[workerIndex];
|
|
1490
|
+
if (!slot)
|
|
1491
|
+
return;
|
|
1492
|
+
slot.busy = false;
|
|
1493
|
+
slot.currentTaskId = null;
|
|
1494
|
+
}
|
|
1495
|
+
takeInflight(id) {
|
|
1496
|
+
const inflight = this.inflight.get(id);
|
|
1497
|
+
if (!inflight)
|
|
1498
|
+
return null;
|
|
1499
|
+
clearTimeout(inflight.timer);
|
|
1500
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1501
|
+
this.inflight.delete(id);
|
|
1502
|
+
return inflight;
|
|
1503
|
+
}
|
|
1504
|
+
cancelWorkerTask(slot, id) {
|
|
1505
|
+
if (!slot)
|
|
1506
|
+
return;
|
|
1507
|
+
try {
|
|
1508
|
+
slot.worker.postMessage({ type: 'cancel', id });
|
|
1509
|
+
}
|
|
1510
|
+
catch {
|
|
1511
|
+
// ignore
|
|
1512
|
+
}
|
|
1513
|
+
}
|
|
1514
|
+
restartWorker(workerIndex, slot) {
|
|
1515
|
+
if (this.closed)
|
|
1516
|
+
return;
|
|
1517
|
+
const target = slot ?? this.workers[workerIndex];
|
|
1518
|
+
if (target) {
|
|
1519
|
+
void target.worker.terminate();
|
|
1520
|
+
}
|
|
1521
|
+
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
1522
|
+
this.drainQueue();
|
|
1523
|
+
}
|
|
1524
|
+
rejectIfClosed(reject) {
|
|
1525
|
+
if (!this.closed)
|
|
1526
|
+
return false;
|
|
1527
|
+
reject(new Error('Transform worker pool closed'));
|
|
1528
|
+
return true;
|
|
1529
|
+
}
|
|
1530
|
+
abortInflightTask(id, url, workerIndex) {
|
|
1531
|
+
const slot = this.workers[workerIndex];
|
|
1532
|
+
this.cancelWorkerTask(slot, id);
|
|
1533
|
+
this.failTask(id, new FetchError('Request was canceled', url, 499, {
|
|
1534
|
+
reason: 'aborted',
|
|
1535
|
+
stage: 'transform:signal-abort',
|
|
1536
|
+
}));
|
|
1537
|
+
if (slot) {
|
|
1538
|
+
this.restartWorker(workerIndex, slot);
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
abortQueuedTask(id, url, reject) {
|
|
1542
|
+
const queuedIndex = this.queue.findIndex((task) => task.id === id);
|
|
1543
|
+
if (queuedIndex === -1)
|
|
1544
|
+
return;
|
|
1545
|
+
this.queue.splice(queuedIndex, 1);
|
|
1546
|
+
reject(new FetchError('Request was canceled', url, 499, {
|
|
1547
|
+
reason: 'aborted',
|
|
1548
|
+
stage: 'transform:queued-abort',
|
|
1549
|
+
}));
|
|
1550
|
+
}
|
|
1551
|
+
createWorkerSlot(worker) {
|
|
1552
|
+
return {
|
|
1235
1553
|
worker,
|
|
1236
1554
|
busy: false,
|
|
1237
1555
|
currentTaskId: null,
|
|
1238
1556
|
};
|
|
1557
|
+
}
|
|
1558
|
+
registerWorkerHandlers(workerIndex, worker) {
|
|
1239
1559
|
worker.on('message', (raw) => {
|
|
1240
1560
|
this.onWorkerMessage(workerIndex, raw);
|
|
1241
1561
|
});
|
|
@@ -1245,6 +1565,21 @@ class WorkerPool {
|
|
|
1245
1565
|
worker.on('exit', (code) => {
|
|
1246
1566
|
this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
|
|
1247
1567
|
});
|
|
1568
|
+
}
|
|
1569
|
+
constructor(size, timeoutMs) {
|
|
1570
|
+
const safeSize = Math.max(1, size);
|
|
1571
|
+
this.timeoutMs = timeoutMs;
|
|
1572
|
+
this.queueMax = safeSize * 2;
|
|
1573
|
+
for (let index = 0; index < safeSize; index += 1) {
|
|
1574
|
+
this.workers.push(this.spawnWorker(index));
|
|
1575
|
+
}
|
|
1576
|
+
}
|
|
1577
|
+
spawnWorker(workerIndex) {
|
|
1578
|
+
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
|
|
1579
|
+
// Workers must not keep the process alive by themselves.
|
|
1580
|
+
worker.unref();
|
|
1581
|
+
const slot = this.createWorkerSlot(worker);
|
|
1582
|
+
this.registerWorkerHandlers(workerIndex, worker);
|
|
1248
1583
|
return slot;
|
|
1249
1584
|
}
|
|
1250
1585
|
onWorkerBroken(workerIndex, message) {
|
|
@@ -1256,129 +1591,83 @@ class WorkerPool {
|
|
|
1256
1591
|
if (slot.busy && slot.currentTaskId) {
|
|
1257
1592
|
this.failTask(slot.currentTaskId, new Error(message));
|
|
1258
1593
|
}
|
|
1259
|
-
|
|
1260
|
-
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
1261
|
-
this.drainQueue();
|
|
1594
|
+
this.restartWorker(workerIndex, slot);
|
|
1262
1595
|
}
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1596
|
+
resolveWorkerResult(inflight, result) {
|
|
1597
|
+
inflight.resolve({
|
|
1598
|
+
markdown: result.markdown,
|
|
1599
|
+
truncated: result.truncated,
|
|
1600
|
+
title: result.title,
|
|
1601
|
+
});
|
|
1602
|
+
}
|
|
1603
|
+
rejectWorkerError(inflight, error) {
|
|
1604
|
+
if (error.name === 'FetchError') {
|
|
1605
|
+
inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
|
|
1270
1606
|
return;
|
|
1271
1607
|
}
|
|
1272
|
-
|
|
1273
|
-
|
|
1608
|
+
inflight.reject(new Error(error.message));
|
|
1609
|
+
}
|
|
1610
|
+
onWorkerMessage(workerIndex, raw) {
|
|
1611
|
+
const parsed = workerMessageSchema.safeParse(raw);
|
|
1612
|
+
if (!parsed.success)
|
|
1613
|
+
return;
|
|
1614
|
+
const message = parsed.data;
|
|
1615
|
+
const inflight = this.takeInflight(message.id);
|
|
1274
1616
|
if (!inflight)
|
|
1275
1617
|
return;
|
|
1276
|
-
|
|
1277
|
-
if (inflight.signal && inflight.abortListener) {
|
|
1278
|
-
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
1279
|
-
}
|
|
1280
|
-
this.inflight.delete(message.id);
|
|
1281
|
-
const slot = this.workers[workerIndex];
|
|
1282
|
-
if (slot) {
|
|
1283
|
-
slot.busy = false;
|
|
1284
|
-
slot.currentTaskId = null;
|
|
1285
|
-
}
|
|
1618
|
+
this.markSlotIdle(workerIndex);
|
|
1286
1619
|
if (message.type === 'result') {
|
|
1287
|
-
|
|
1620
|
+
this.resolveWorkerResult(inflight, message.result);
|
|
1288
1621
|
}
|
|
1289
1622
|
else {
|
|
1290
|
-
|
|
1291
|
-
if (error.name === 'FetchError') {
|
|
1292
|
-
inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
|
|
1293
|
-
}
|
|
1294
|
-
else {
|
|
1295
|
-
inflight.reject(new Error(error.message));
|
|
1296
|
-
}
|
|
1623
|
+
this.rejectWorkerError(inflight, message.error);
|
|
1297
1624
|
}
|
|
1298
1625
|
this.drainQueue();
|
|
1299
1626
|
}
|
|
1300
1627
|
failTask(id, error) {
|
|
1301
|
-
const inflight = this.
|
|
1628
|
+
const inflight = this.takeInflight(id);
|
|
1302
1629
|
if (!inflight)
|
|
1303
1630
|
return;
|
|
1304
|
-
clearTimeout(inflight.timer);
|
|
1305
|
-
if (inflight.signal && inflight.abortListener) {
|
|
1306
|
-
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
1307
|
-
}
|
|
1308
|
-
this.inflight.delete(id);
|
|
1309
1631
|
inflight.reject(error);
|
|
1310
|
-
|
|
1311
|
-
if (slot) {
|
|
1312
|
-
slot.busy = false;
|
|
1313
|
-
slot.currentTaskId = null;
|
|
1314
|
-
}
|
|
1632
|
+
this.markSlotIdle(inflight.workerIndex);
|
|
1315
1633
|
}
|
|
1316
|
-
|
|
1317
|
-
if (this.
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
if (
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
stage: 'transform:enqueue',
|
|
1324
|
-
});
|
|
1634
|
+
handleAbortSignal(id, url, reject) {
|
|
1635
|
+
if (this.rejectIfClosed(reject))
|
|
1636
|
+
return;
|
|
1637
|
+
const inflight = this.inflight.get(id);
|
|
1638
|
+
if (inflight) {
|
|
1639
|
+
this.abortInflightTask(id, url, inflight.workerIndex);
|
|
1640
|
+
return;
|
|
1325
1641
|
}
|
|
1326
|
-
|
|
1327
|
-
|
|
1642
|
+
this.abortQueuedTask(id, url, reject);
|
|
1643
|
+
}
|
|
1644
|
+
createPendingTask(html, url, options, resolve, reject) {
|
|
1645
|
+
const id = randomUUID();
|
|
1646
|
+
let abortListener;
|
|
1647
|
+
if (options.signal) {
|
|
1648
|
+
abortListener = () => {
|
|
1649
|
+
this.handleAbortSignal(id, url, reject);
|
|
1650
|
+
};
|
|
1651
|
+
options.signal.addEventListener('abort', abortListener, { once: true });
|
|
1328
1652
|
}
|
|
1653
|
+
return {
|
|
1654
|
+
id,
|
|
1655
|
+
html,
|
|
1656
|
+
url,
|
|
1657
|
+
includeMetadata: options.includeMetadata,
|
|
1658
|
+
signal: options.signal,
|
|
1659
|
+
abortListener,
|
|
1660
|
+
resolve,
|
|
1661
|
+
reject,
|
|
1662
|
+
};
|
|
1663
|
+
}
|
|
1664
|
+
async transform(html, url, options) {
|
|
1665
|
+
this.ensureOpen();
|
|
1666
|
+
this.ensureNotAborted(options.signal, url, 'transform:enqueue');
|
|
1667
|
+
this.ensureQueueCapacity(url);
|
|
1329
1668
|
return new Promise((resolve, reject) => {
|
|
1330
|
-
const
|
|
1331
|
-
|
|
1332
|
-
if (options.signal) {
|
|
1333
|
-
abortListener = () => {
|
|
1334
|
-
if (this.closed) {
|
|
1335
|
-
reject(new Error('Transform worker pool closed'));
|
|
1336
|
-
return;
|
|
1337
|
-
}
|
|
1338
|
-
const inflight = this.inflight.get(id);
|
|
1339
|
-
if (inflight) {
|
|
1340
|
-
const { workerIndex } = inflight;
|
|
1341
|
-
const slot = this.workers[workerIndex];
|
|
1342
|
-
if (slot) {
|
|
1343
|
-
try {
|
|
1344
|
-
slot.worker.postMessage({ type: 'cancel', id });
|
|
1345
|
-
}
|
|
1346
|
-
catch {
|
|
1347
|
-
// ignore
|
|
1348
|
-
}
|
|
1349
|
-
}
|
|
1350
|
-
this.failTask(id, new FetchError('Request was canceled', url, 499, {
|
|
1351
|
-
reason: 'aborted',
|
|
1352
|
-
stage: 'transform:signal-abort',
|
|
1353
|
-
}));
|
|
1354
|
-
if (slot) {
|
|
1355
|
-
void slot.worker.terminate();
|
|
1356
|
-
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
1357
|
-
this.drainQueue();
|
|
1358
|
-
}
|
|
1359
|
-
return;
|
|
1360
|
-
}
|
|
1361
|
-
const queuedIndex = this.queue.findIndex((task) => task.id === id);
|
|
1362
|
-
if (queuedIndex !== -1) {
|
|
1363
|
-
this.queue.splice(queuedIndex, 1);
|
|
1364
|
-
reject(new FetchError('Request was canceled', url, 499, {
|
|
1365
|
-
reason: 'aborted',
|
|
1366
|
-
stage: 'transform:queued-abort',
|
|
1367
|
-
}));
|
|
1368
|
-
}
|
|
1369
|
-
};
|
|
1370
|
-
options.signal.addEventListener('abort', abortListener, { once: true });
|
|
1371
|
-
}
|
|
1372
|
-
this.queue.push({
|
|
1373
|
-
id,
|
|
1374
|
-
html,
|
|
1375
|
-
url,
|
|
1376
|
-
includeMetadata: options.includeMetadata,
|
|
1377
|
-
signal: options.signal,
|
|
1378
|
-
abortListener,
|
|
1379
|
-
resolve,
|
|
1380
|
-
reject,
|
|
1381
|
-
});
|
|
1669
|
+
const task = this.createPendingTask(html, url, options, resolve, reject);
|
|
1670
|
+
this.queue.push(task);
|
|
1382
1671
|
this.drainQueue();
|
|
1383
1672
|
});
|
|
1384
1673
|
}
|
|
@@ -1398,43 +1687,48 @@ class WorkerPool {
|
|
|
1398
1687
|
}
|
|
1399
1688
|
}
|
|
1400
1689
|
dispatch(workerIndex, slot, task) {
|
|
1401
|
-
if (task
|
|
1402
|
-
if (task.abortListener) {
|
|
1403
|
-
task.signal.removeEventListener('abort', task.abortListener);
|
|
1404
|
-
}
|
|
1405
|
-
task.reject(new FetchError('Request was canceled', task.url, 499, {
|
|
1406
|
-
reason: 'aborted',
|
|
1407
|
-
stage: 'transform:dispatch',
|
|
1408
|
-
}));
|
|
1690
|
+
if (this.rejectIfAborted(task))
|
|
1409
1691
|
return;
|
|
1692
|
+
this.markSlotBusy(slot, task);
|
|
1693
|
+
const timer = this.startTaskTimer(workerIndex, slot, task);
|
|
1694
|
+
this.registerInflightTask(task, timer, workerIndex);
|
|
1695
|
+
try {
|
|
1696
|
+
this.sendTransformMessage(slot, task);
|
|
1697
|
+
}
|
|
1698
|
+
catch (error) {
|
|
1699
|
+
this.handleDispatchFailure(workerIndex, slot, task, timer, error);
|
|
1410
1700
|
}
|
|
1701
|
+
}
|
|
1702
|
+
rejectIfAborted(task) {
|
|
1703
|
+
if (!task.signal?.aborted)
|
|
1704
|
+
return false;
|
|
1705
|
+
this.clearAbortListener(task.signal, task.abortListener);
|
|
1706
|
+
task.reject(new FetchError('Request was canceled', task.url, 499, {
|
|
1707
|
+
reason: 'aborted',
|
|
1708
|
+
stage: 'transform:dispatch',
|
|
1709
|
+
}));
|
|
1710
|
+
return true;
|
|
1711
|
+
}
|
|
1712
|
+
markSlotBusy(slot, task) {
|
|
1411
1713
|
slot.busy = true;
|
|
1412
1714
|
slot.currentTaskId = task.id;
|
|
1715
|
+
}
|
|
1716
|
+
startTaskTimer(workerIndex, slot, task) {
|
|
1413
1717
|
const timer = setTimeout(() => {
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
}
|
|
1417
|
-
catch {
|
|
1418
|
-
// ignore
|
|
1419
|
-
}
|
|
1420
|
-
const inflight = this.inflight.get(task.id);
|
|
1718
|
+
this.cancelWorkerTask(slot, task.id);
|
|
1719
|
+
const inflight = this.takeInflight(task.id);
|
|
1421
1720
|
if (!inflight)
|
|
1422
1721
|
return;
|
|
1423
|
-
clearTimeout(inflight.timer);
|
|
1424
|
-
if (inflight.signal && inflight.abortListener) {
|
|
1425
|
-
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
1426
|
-
}
|
|
1427
|
-
this.inflight.delete(task.id);
|
|
1428
1722
|
inflight.reject(new FetchError('Request timeout', task.url, 504, {
|
|
1429
1723
|
reason: 'timeout',
|
|
1430
1724
|
stage: 'transform:worker-timeout',
|
|
1431
1725
|
}));
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1726
|
+
this.restartWorker(workerIndex, slot);
|
|
1727
|
+
}, this.timeoutMs);
|
|
1728
|
+
timer.unref();
|
|
1729
|
+
return timer;
|
|
1730
|
+
}
|
|
1731
|
+
registerInflightTask(task, timer, workerIndex) {
|
|
1438
1732
|
this.inflight.set(task.id, {
|
|
1439
1733
|
resolve: task.resolve,
|
|
1440
1734
|
reject: task.reject,
|
|
@@ -1443,6 +1737,8 @@ class WorkerPool {
|
|
|
1443
1737
|
abortListener: task.abortListener,
|
|
1444
1738
|
workerIndex,
|
|
1445
1739
|
});
|
|
1740
|
+
}
|
|
1741
|
+
sendTransformMessage(slot, task) {
|
|
1446
1742
|
slot.worker.postMessage({
|
|
1447
1743
|
type: 'transform',
|
|
1448
1744
|
id: task.id,
|
|
@@ -1451,6 +1747,17 @@ class WorkerPool {
|
|
|
1451
1747
|
includeMetadata: task.includeMetadata,
|
|
1452
1748
|
});
|
|
1453
1749
|
}
|
|
1750
|
+
handleDispatchFailure(workerIndex, slot, task, timer, error) {
|
|
1751
|
+
clearTimeout(timer);
|
|
1752
|
+
this.clearAbortListener(task.signal, task.abortListener);
|
|
1753
|
+
this.inflight.delete(task.id);
|
|
1754
|
+
this.markSlotIdle(workerIndex);
|
|
1755
|
+
const message = error instanceof Error
|
|
1756
|
+
? error
|
|
1757
|
+
: new Error('Failed to dispatch transform worker message');
|
|
1758
|
+
task.reject(message);
|
|
1759
|
+
this.restartWorker(workerIndex, slot);
|
|
1760
|
+
}
|
|
1454
1761
|
async close() {
|
|
1455
1762
|
if (this.closed)
|
|
1456
1763
|
return;
|
|
@@ -1459,9 +1766,7 @@ class WorkerPool {
|
|
|
1459
1766
|
this.workers.length = 0;
|
|
1460
1767
|
for (const [id, inflight] of this.inflight.entries()) {
|
|
1461
1768
|
clearTimeout(inflight.timer);
|
|
1462
|
-
|
|
1463
|
-
inflight.signal.removeEventListener('abort', inflight.abortListener);
|
|
1464
|
-
}
|
|
1769
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1465
1770
|
inflight.reject(new Error('Transform worker pool closed'));
|
|
1466
1771
|
this.inflight.delete(id);
|
|
1467
1772
|
}
|
|
@@ -1472,38 +1777,38 @@ class WorkerPool {
|
|
|
1472
1777
|
await Promise.allSettled(terminations);
|
|
1473
1778
|
}
|
|
1474
1779
|
}
|
|
1780
|
+
function buildWorkerTransformOptions(options) {
|
|
1781
|
+
return {
|
|
1782
|
+
includeMetadata: options.includeMetadata,
|
|
1783
|
+
...(options.signal ? { signal: options.signal } : {}),
|
|
1784
|
+
};
|
|
1785
|
+
}
|
|
1786
|
+
async function transformWithWorkerPool(html, url, options) {
|
|
1787
|
+
const poolRef = getOrCreateTransformWorkerPool();
|
|
1788
|
+
return poolRef.transform(html, url, buildWorkerTransformOptions(options));
|
|
1789
|
+
}
|
|
1790
|
+
function resolveWorkerFallback(error, html, url, options) {
|
|
1791
|
+
if (error instanceof FetchError) {
|
|
1792
|
+
throw error;
|
|
1793
|
+
}
|
|
1794
|
+
// Stability-first: if worker infrastructure fails, fall back to in-process.
|
|
1795
|
+
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1796
|
+
return transformHtmlToMarkdownInProcess(html, url, options);
|
|
1797
|
+
}
|
|
1475
1798
|
export async function transformHtmlToMarkdown(html, url, options) {
|
|
1476
|
-
|
|
1477
|
-
let success = false;
|
|
1478
|
-
try {
|
|
1799
|
+
return runTotalTransformStageAsync(url, async () => {
|
|
1479
1800
|
throwIfAborted(options.signal, url, 'transform:begin');
|
|
1480
1801
|
const workerStage = startTransformStage(url, 'transform:worker');
|
|
1481
1802
|
try {
|
|
1482
|
-
const
|
|
1483
|
-
const result = await poolRef.transform(html, url, {
|
|
1484
|
-
includeMetadata: options.includeMetadata,
|
|
1485
|
-
...(options.signal ? { signal: options.signal } : {}),
|
|
1486
|
-
});
|
|
1487
|
-
success = true;
|
|
1803
|
+
const result = await transformWithWorkerPool(html, url, options);
|
|
1488
1804
|
return result;
|
|
1489
1805
|
}
|
|
1490
1806
|
catch (error) {
|
|
1491
|
-
|
|
1492
|
-
throw error;
|
|
1493
|
-
}
|
|
1494
|
-
// Stability-first: if worker infrastructure fails, fall back to in-process.
|
|
1495
|
-
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1496
|
-
const fallback = transformHtmlToMarkdownInProcess(html, url, options);
|
|
1497
|
-
success = true;
|
|
1807
|
+
const fallback = resolveWorkerFallback(error, html, url, options);
|
|
1498
1808
|
return fallback;
|
|
1499
1809
|
}
|
|
1500
1810
|
finally {
|
|
1501
1811
|
endTransformStage(workerStage);
|
|
1502
1812
|
}
|
|
1503
|
-
}
|
|
1504
|
-
finally {
|
|
1505
|
-
if (success) {
|
|
1506
|
-
endTransformStage(totalStage, { truncated: false });
|
|
1507
|
-
}
|
|
1508
|
-
}
|
|
1813
|
+
});
|
|
1509
1814
|
}
|