@j0hanz/superfetch 2.4.3 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.d.ts +8 -8
- package/dist/cache.js +277 -264
- package/dist/config.d.ts +1 -0
- package/dist/config.js +1 -0
- package/dist/crypto.js +4 -3
- package/dist/dom-noise-removal.js +355 -297
- package/dist/fetch.d.ts +13 -7
- package/dist/fetch.js +636 -690
- package/dist/http-native.js +535 -474
- package/dist/instructions.md +38 -27
- package/dist/language-detection.js +190 -153
- package/dist/markdown-cleanup.js +171 -158
- package/dist/mcp.js +183 -2
- package/dist/resources.d.ts +2 -0
- package/dist/resources.js +44 -0
- package/dist/session.js +144 -105
- package/dist/tasks.d.ts +37 -0
- package/dist/tasks.js +66 -0
- package/dist/tools.d.ts +8 -12
- package/dist/tools.js +196 -147
- package/dist/transform.d.ts +3 -1
- package/dist/transform.js +680 -778
- package/package.json +6 -6
package/dist/transform.js
CHANGED
|
@@ -14,133 +14,158 @@ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './langua
|
|
|
14
14
|
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
|
|
15
15
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
|
|
16
16
|
import { isObject } from './type-guards.js';
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
},
|
|
28
|
-
};
|
|
29
|
-
const transformChannel = diagnosticsChannel.channel('superfetch.transform');
|
|
30
|
-
const LOG_URL_MAX = 80;
|
|
31
|
-
function truncateUrlForLog(url) {
|
|
32
|
-
return url.substring(0, LOG_URL_MAX);
|
|
33
|
-
}
|
|
34
|
-
function publishTransformEvent(event) {
|
|
35
|
-
if (!transformChannel.hasSubscribers)
|
|
36
|
-
return;
|
|
37
|
-
try {
|
|
38
|
-
transformChannel.publish(event);
|
|
17
|
+
/* -------------------------------------------------------------------------------------------------
|
|
18
|
+
* Abort policy (single source of truth)
|
|
19
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
20
|
+
class AbortPolicy {
|
|
21
|
+
getAbortReason(signal) {
|
|
22
|
+
if (!isObject(signal))
|
|
23
|
+
return undefined;
|
|
24
|
+
return 'reason' in signal
|
|
25
|
+
? signal.reason
|
|
26
|
+
: undefined;
|
|
39
27
|
}
|
|
40
|
-
|
|
41
|
-
|
|
28
|
+
isTimeoutReason(reason) {
|
|
29
|
+
return reason instanceof Error && reason.name === 'TimeoutError';
|
|
30
|
+
}
|
|
31
|
+
throwIfAborted(signal, url, stage) {
|
|
32
|
+
if (!signal?.aborted)
|
|
33
|
+
return;
|
|
34
|
+
const reason = this.getAbortReason(signal);
|
|
35
|
+
if (this.isTimeoutReason(reason)) {
|
|
36
|
+
throw new FetchError('Request timeout', url, 504, {
|
|
37
|
+
reason: 'timeout',
|
|
38
|
+
stage,
|
|
39
|
+
});
|
|
40
|
+
}
|
|
41
|
+
throw new FetchError('Request was canceled', url, 499, {
|
|
42
|
+
reason: 'aborted',
|
|
43
|
+
stage,
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
createAbortError(url, stage) {
|
|
47
|
+
return new FetchError('Request was canceled', url, 499, {
|
|
48
|
+
reason: 'aborted',
|
|
49
|
+
stage,
|
|
50
|
+
});
|
|
42
51
|
}
|
|
43
52
|
}
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
53
|
+
const abortPolicy = new AbortPolicy();
|
|
54
|
+
/* -------------------------------------------------------------------------------------------------
|
|
55
|
+
* Stage tracking & diagnostics
|
|
56
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
57
|
+
class StageTracker {
|
|
58
|
+
channel = diagnosticsChannel.channel('superfetch.transform');
|
|
59
|
+
start(url, stage, budget) {
|
|
60
|
+
if (!this.channel.hasSubscribers && !budget)
|
|
61
|
+
return null;
|
|
62
|
+
const remainingBudgetMs = budget
|
|
63
|
+
? budget.totalBudgetMs - budget.elapsedMs
|
|
64
|
+
: undefined;
|
|
65
|
+
const base = {
|
|
66
|
+
stage,
|
|
67
|
+
startTime: performance.now(),
|
|
68
|
+
url: redactUrl(url),
|
|
60
69
|
};
|
|
70
|
+
if (remainingBudgetMs !== undefined && budget) {
|
|
71
|
+
return {
|
|
72
|
+
...base,
|
|
73
|
+
budgetMs: remainingBudgetMs,
|
|
74
|
+
totalBudgetMs: budget.totalBudgetMs,
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
return base;
|
|
78
|
+
}
|
|
79
|
+
end(context, options) {
|
|
80
|
+
if (!context)
|
|
81
|
+
return 0;
|
|
82
|
+
const durationMs = performance.now() - context.startTime;
|
|
83
|
+
const requestId = getRequestId();
|
|
84
|
+
const operationId = getOperationId();
|
|
85
|
+
if (context.totalBudgetMs !== undefined) {
|
|
86
|
+
const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
|
|
87
|
+
if (durationMs > warnThresholdMs) {
|
|
88
|
+
logWarn('Transform stage exceeded warning threshold', {
|
|
89
|
+
stage: context.stage,
|
|
90
|
+
durationMs: Math.round(durationMs),
|
|
91
|
+
thresholdMs: Math.round(warnThresholdMs),
|
|
92
|
+
url: context.url,
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
const event = {
|
|
97
|
+
v: 1,
|
|
98
|
+
type: 'stage',
|
|
99
|
+
stage: context.stage,
|
|
100
|
+
durationMs,
|
|
101
|
+
url: context.url,
|
|
102
|
+
...(requestId ? { requestId } : {}),
|
|
103
|
+
...(operationId ? { operationId } : {}),
|
|
104
|
+
...(options?.truncated !== undefined
|
|
105
|
+
? { truncated: options.truncated }
|
|
106
|
+
: {}),
|
|
107
|
+
};
|
|
108
|
+
this.publish(event);
|
|
109
|
+
return durationMs;
|
|
61
110
|
}
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
const operationId = getOperationId();
|
|
70
|
-
if (context.totalBudgetMs !== undefined) {
|
|
71
|
-
const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
|
|
72
|
-
if (durationMs > warnThresholdMs) {
|
|
73
|
-
logWarn('Transform stage exceeded warning threshold', {
|
|
74
|
-
stage: context.stage,
|
|
75
|
-
durationMs: Math.round(durationMs),
|
|
76
|
-
thresholdMs: Math.round(warnThresholdMs),
|
|
77
|
-
url: context.url,
|
|
111
|
+
run(url, stage, fn, budget) {
|
|
112
|
+
if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
|
|
113
|
+
throw new FetchError('Transform budget exhausted', url, 504, {
|
|
114
|
+
reason: 'timeout',
|
|
115
|
+
stage: `${stage}:budget_exhausted`,
|
|
116
|
+
elapsedMs: budget.elapsedMs,
|
|
117
|
+
totalBudgetMs: budget.totalBudgetMs,
|
|
78
118
|
});
|
|
79
119
|
}
|
|
120
|
+
const ctx = this.start(url, stage, budget);
|
|
121
|
+
try {
|
|
122
|
+
return fn();
|
|
123
|
+
}
|
|
124
|
+
finally {
|
|
125
|
+
this.end(ctx);
|
|
126
|
+
}
|
|
80
127
|
}
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
...(options?.truncated !== undefined
|
|
90
|
-
? { truncated: options.truncated }
|
|
91
|
-
: {}),
|
|
92
|
-
};
|
|
93
|
-
publishTransformEvent(event);
|
|
94
|
-
return durationMs;
|
|
95
|
-
}
|
|
96
|
-
function runTransformStage(url, stage, fn, budget) {
|
|
97
|
-
if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
|
|
98
|
-
throw new FetchError('Transform budget exhausted', url, 504, {
|
|
99
|
-
reason: 'timeout',
|
|
100
|
-
stage: `${stage}:budget_exhausted`,
|
|
101
|
-
elapsedMs: budget.elapsedMs,
|
|
102
|
-
totalBudgetMs: budget.totalBudgetMs,
|
|
103
|
-
});
|
|
104
|
-
}
|
|
105
|
-
const context = startTransformStage(url, stage, budget);
|
|
106
|
-
try {
|
|
107
|
-
return fn();
|
|
128
|
+
async runAsync(url, stage, fn) {
|
|
129
|
+
const ctx = this.start(url, stage);
|
|
130
|
+
try {
|
|
131
|
+
return await fn();
|
|
132
|
+
}
|
|
133
|
+
finally {
|
|
134
|
+
this.end(ctx);
|
|
135
|
+
}
|
|
108
136
|
}
|
|
109
|
-
|
|
110
|
-
|
|
137
|
+
publish(event) {
|
|
138
|
+
if (!this.channel.hasSubscribers)
|
|
139
|
+
return;
|
|
140
|
+
try {
|
|
141
|
+
this.channel.publish(event);
|
|
142
|
+
}
|
|
143
|
+
catch {
|
|
144
|
+
// Intentionally ignore diagnostics failures
|
|
145
|
+
}
|
|
111
146
|
}
|
|
112
147
|
}
|
|
113
|
-
|
|
114
|
-
|
|
148
|
+
const stageTracker = new StageTracker();
|
|
149
|
+
/** Backwards-compatible exports */
|
|
150
|
+
export function startTransformStage(url, stage, budget) {
|
|
151
|
+
return stageTracker.start(url, stage, budget);
|
|
115
152
|
}
|
|
116
|
-
function
|
|
117
|
-
|
|
118
|
-
return;
|
|
119
|
-
const { aborted } = signal;
|
|
120
|
-
if (!aborted)
|
|
121
|
-
return;
|
|
122
|
-
const reason = getAbortReason(signal);
|
|
123
|
-
if (isTimeoutReason(reason)) {
|
|
124
|
-
throw new FetchError('Request timeout', url, 504, {
|
|
125
|
-
reason: 'timeout',
|
|
126
|
-
stage,
|
|
127
|
-
});
|
|
128
|
-
}
|
|
129
|
-
throw new FetchError('Request was canceled', url, 499, {
|
|
130
|
-
reason: 'aborted',
|
|
131
|
-
stage,
|
|
132
|
-
});
|
|
153
|
+
export function endTransformStage(context, options) {
|
|
154
|
+
return stageTracker.end(context, options);
|
|
133
155
|
}
|
|
156
|
+
/* -------------------------------------------------------------------------------------------------
|
|
157
|
+
* HTML size guard
|
|
158
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
134
159
|
function truncateHtml(html) {
|
|
135
160
|
const maxSize = config.constants.maxHtmlSize;
|
|
136
161
|
if (html.length <= maxSize) {
|
|
137
|
-
return html;
|
|
162
|
+
return { html, truncated: false };
|
|
138
163
|
}
|
|
139
164
|
logWarn('HTML content exceeds maximum size, truncating', {
|
|
140
165
|
size: html.length,
|
|
141
166
|
maxSize,
|
|
142
167
|
});
|
|
143
|
-
return html.substring(0, maxSize);
|
|
168
|
+
return { html: html.substring(0, maxSize), truncated: true };
|
|
144
169
|
}
|
|
145
170
|
const META_PROPERTY_HANDLERS = new Map([
|
|
146
171
|
[
|
|
@@ -200,162 +225,109 @@ const META_NAME_HANDLERS = new Map([
|
|
|
200
225
|
},
|
|
201
226
|
],
|
|
202
227
|
]);
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
title: {},
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
228
|
+
class MetadataExtractor {
|
|
229
|
+
extract(document) {
|
|
230
|
+
const ctx = { title: {}, description: {} };
|
|
231
|
+
for (const tag of document.querySelectorAll('meta')) {
|
|
232
|
+
const content = tag.getAttribute('content')?.trim();
|
|
233
|
+
if (!content)
|
|
234
|
+
continue;
|
|
235
|
+
const property = tag.getAttribute('property');
|
|
236
|
+
if (property)
|
|
237
|
+
META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
|
|
238
|
+
const name = tag.getAttribute('name');
|
|
239
|
+
if (name)
|
|
240
|
+
META_NAME_HANDLERS.get(name)?.(ctx, content);
|
|
215
241
|
}
|
|
216
|
-
const
|
|
217
|
-
if (
|
|
218
|
-
|
|
242
|
+
const titleEl = document.querySelector('title');
|
|
243
|
+
if (!ctx.title.standard && titleEl?.textContent) {
|
|
244
|
+
ctx.title.standard = titleEl.textContent.trim();
|
|
219
245
|
}
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
metadata
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
246
|
+
const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
|
|
247
|
+
const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
|
|
248
|
+
const metadata = {};
|
|
249
|
+
if (resolvedTitle)
|
|
250
|
+
metadata.title = resolvedTitle;
|
|
251
|
+
if (resolvedDesc)
|
|
252
|
+
metadata.description = resolvedDesc;
|
|
253
|
+
if (ctx.author)
|
|
254
|
+
metadata.author = ctx.author;
|
|
255
|
+
if (ctx.image)
|
|
256
|
+
metadata.image = ctx.image;
|
|
257
|
+
if (ctx.publishedAt)
|
|
258
|
+
metadata.publishedAt = ctx.publishedAt;
|
|
259
|
+
if (ctx.modifiedAt)
|
|
260
|
+
metadata.modifiedAt = ctx.modifiedAt;
|
|
261
|
+
return metadata;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
const metadataExtractor = new MetadataExtractor();
|
|
265
|
+
/* -------------------------------------------------------------------------------------------------
|
|
266
|
+
* Article extraction (Readability)
|
|
267
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
242
268
|
function isReadabilityCompatible(doc) {
|
|
243
269
|
if (!isObject(doc))
|
|
244
270
|
return false;
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
}
|
|
250
|
-
function hasQuerySelectors(record) {
|
|
251
|
-
return (typeof record.querySelectorAll === 'function' &&
|
|
271
|
+
const record = doc;
|
|
272
|
+
return ('documentElement' in record &&
|
|
273
|
+
typeof record.querySelectorAll ===
|
|
274
|
+
'function' &&
|
|
252
275
|
typeof record.querySelector === 'function');
|
|
253
276
|
}
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
}
|
|
259
|
-
try {
|
|
260
|
-
const doc = document;
|
|
261
|
-
const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
|
|
262
|
-
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
263
|
-
if (textLength < 100) {
|
|
264
|
-
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
265
|
-
'This might be a client-side rendered (SPA) application. ' +
|
|
266
|
-
'Content extraction may be incomplete.', { textLength });
|
|
267
|
-
}
|
|
268
|
-
if (textLength >= 400 && !isProbablyReaderable(doc)) {
|
|
277
|
+
class ArticleExtractor {
|
|
278
|
+
extract(document) {
|
|
279
|
+
if (!isReadabilityCompatible(document)) {
|
|
280
|
+
logWarn('Document not compatible with Readability');
|
|
269
281
|
return null;
|
|
270
282
|
}
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
283
|
+
try {
|
|
284
|
+
const doc = document;
|
|
285
|
+
const rawText = doc.querySelector('body')?.textContent ??
|
|
286
|
+
doc.documentElement.textContent;
|
|
287
|
+
const textLength = rawText.replace(/\s+/g, ' ').trim().length;
|
|
288
|
+
if (textLength < 100) {
|
|
289
|
+
logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
|
|
290
|
+
'This might be a client-side rendered (SPA) application. ' +
|
|
291
|
+
'Content extraction may be incomplete.', { textLength });
|
|
292
|
+
}
|
|
293
|
+
if (textLength >= 400 && !isProbablyReaderable(doc)) {
|
|
294
|
+
return null;
|
|
295
|
+
}
|
|
296
|
+
const reader = new Readability(doc, { maxElemsToParse: 20_000 });
|
|
297
|
+
const parsed = reader.parse();
|
|
298
|
+
if (!parsed)
|
|
299
|
+
return null;
|
|
300
|
+
return {
|
|
301
|
+
content: parsed.content ?? '',
|
|
302
|
+
textContent: parsed.textContent ?? '',
|
|
303
|
+
...(parsed.title != null && { title: parsed.title }),
|
|
304
|
+
...(parsed.byline != null && { byline: parsed.byline }),
|
|
305
|
+
...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
|
|
306
|
+
...(parsed.siteName != null && { siteName: parsed.siteName }),
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
catch (error) {
|
|
310
|
+
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
|
|
274
311
|
return null;
|
|
275
|
-
|
|
276
|
-
content: parsed.content ?? '',
|
|
277
|
-
textContent: parsed.textContent ?? '',
|
|
278
|
-
...(parsed.title != null && { title: parsed.title }),
|
|
279
|
-
...(parsed.byline != null && { byline: parsed.byline }),
|
|
280
|
-
...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
|
|
281
|
-
...(parsed.siteName != null && { siteName: parsed.siteName }),
|
|
282
|
-
};
|
|
283
|
-
}
|
|
284
|
-
catch (error) {
|
|
285
|
-
logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
|
|
286
|
-
return null;
|
|
287
|
-
}
|
|
288
|
-
}
|
|
289
|
-
export function extractContent(html, url, options = {
|
|
290
|
-
extractArticle: true,
|
|
291
|
-
}) {
|
|
292
|
-
const result = extractContentWithDocument(html, url, options);
|
|
293
|
-
return { article: result.article, metadata: result.metadata };
|
|
294
|
-
}
|
|
295
|
-
function extractContentWithDocument(html, url, options) {
|
|
296
|
-
if (!isValidInput(html, url)) {
|
|
297
|
-
const { document } = parseHTML('<html></html>');
|
|
298
|
-
return { article: null, metadata: {}, document };
|
|
299
|
-
}
|
|
300
|
-
return tryExtractContent(html, url, options);
|
|
301
|
-
}
|
|
302
|
-
function extractArticleWithStage(document, url, shouldExtract) {
|
|
303
|
-
if (!shouldExtract)
|
|
304
|
-
return null;
|
|
305
|
-
return runTransformStage(url, 'extract:article', () => resolveArticleExtraction(document, shouldExtract));
|
|
306
|
-
}
|
|
307
|
-
function handleExtractionFailure(error, url, signal) {
|
|
308
|
-
if (error instanceof FetchError) {
|
|
309
|
-
throw error;
|
|
310
|
-
}
|
|
311
|
-
throwIfAborted(signal, url, 'extract:error');
|
|
312
|
-
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
313
|
-
const { document } = parseHTML('<html></html>');
|
|
314
|
-
return { article: null, metadata: {}, document };
|
|
315
|
-
}
|
|
316
|
-
function extractContentStages(html, url, options) {
|
|
317
|
-
throwIfAborted(options.signal, url, 'extract:begin');
|
|
318
|
-
const truncatedHtml = truncateHtml(html);
|
|
319
|
-
const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
|
|
320
|
-
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
321
|
-
applyBaseUri(document, url);
|
|
322
|
-
const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
|
|
323
|
-
throwIfAborted(options.signal, url, 'extract:metadata');
|
|
324
|
-
const article = extractArticleWithStage(document, url, options.extractArticle);
|
|
325
|
-
throwIfAborted(options.signal, url, 'extract:article');
|
|
326
|
-
return {
|
|
327
|
-
article,
|
|
328
|
-
metadata,
|
|
329
|
-
document,
|
|
330
|
-
...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
|
|
331
|
-
};
|
|
332
|
-
}
|
|
333
|
-
function tryExtractContent(html, url, options) {
|
|
334
|
-
try {
|
|
335
|
-
return extractContentStages(html, url, options);
|
|
336
|
-
}
|
|
337
|
-
catch (error) {
|
|
338
|
-
return handleExtractionFailure(error, url, options.signal);
|
|
312
|
+
}
|
|
339
313
|
}
|
|
340
314
|
}
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
315
|
+
const articleExtractor = new ArticleExtractor();
|
|
316
|
+
/* -------------------------------------------------------------------------------------------------
|
|
317
|
+
* Content extraction orchestration
|
|
318
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
344
319
|
function validateRequiredString(value, message) {
|
|
345
320
|
if (typeof value === 'string' && value.length > 0)
|
|
346
321
|
return true;
|
|
347
322
|
logWarn(message);
|
|
348
323
|
return false;
|
|
349
324
|
}
|
|
350
|
-
function
|
|
351
|
-
return
|
|
325
|
+
function isValidInput(html, url) {
|
|
326
|
+
return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
|
|
352
327
|
}
|
|
353
328
|
function applyBaseUri(document, url) {
|
|
354
329
|
try {
|
|
355
|
-
Object.defineProperty(document, 'baseURI', {
|
|
356
|
-
value: url,
|
|
357
|
-
writable: true,
|
|
358
|
-
});
|
|
330
|
+
Object.defineProperty(document, 'baseURI', { value: url, writable: true });
|
|
359
331
|
}
|
|
360
332
|
catch (error) {
|
|
361
333
|
logInfo('Failed to set baseURI (non-critical)', {
|
|
@@ -364,13 +336,62 @@ function applyBaseUri(document, url) {
|
|
|
364
336
|
});
|
|
365
337
|
}
|
|
366
338
|
}
|
|
339
|
+
class ContentExtractor {
|
|
340
|
+
extract(html, url, options) {
|
|
341
|
+
if (!isValidInput(html, url)) {
|
|
342
|
+
const { document } = parseHTML('<html></html>');
|
|
343
|
+
return { article: null, metadata: {}, document };
|
|
344
|
+
}
|
|
345
|
+
try {
|
|
346
|
+
abortPolicy.throwIfAborted(options.signal, url, 'extract:begin');
|
|
347
|
+
const { html: limitedHtml, truncated } = truncateHtml(html);
|
|
348
|
+
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
349
|
+
abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
|
|
350
|
+
applyBaseUri(document, url);
|
|
351
|
+
const metadata = stageTracker.run(url, 'extract:metadata', () => metadataExtractor.extract(document));
|
|
352
|
+
abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
|
|
353
|
+
const article = options.extractArticle
|
|
354
|
+
? stageTracker.run(url, 'extract:article', () => articleExtractor.extract(document))
|
|
355
|
+
: null;
|
|
356
|
+
abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
|
|
357
|
+
return {
|
|
358
|
+
article,
|
|
359
|
+
metadata,
|
|
360
|
+
document,
|
|
361
|
+
...(truncated ? { truncated: true } : {}),
|
|
362
|
+
};
|
|
363
|
+
}
|
|
364
|
+
catch (error) {
|
|
365
|
+
if (error instanceof FetchError)
|
|
366
|
+
throw error;
|
|
367
|
+
abortPolicy.throwIfAborted(options.signal, url, 'extract:error');
|
|
368
|
+
logError('Failed to extract content', error instanceof Error ? error : undefined);
|
|
369
|
+
const { document } = parseHTML('<html></html>');
|
|
370
|
+
return { article: null, metadata: {}, document };
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
const contentExtractor = new ContentExtractor();
|
|
375
|
+
/** Backwards-compatible export */
|
|
376
|
+
export function extractContent(html, url, options = {
|
|
377
|
+
extractArticle: true,
|
|
378
|
+
}) {
|
|
379
|
+
const result = contentExtractor.extract(html, url, options);
|
|
380
|
+
return { article: result.article, metadata: result.metadata };
|
|
381
|
+
}
|
|
382
|
+
/* -------------------------------------------------------------------------------------------------
|
|
383
|
+
* Markdown conversion
|
|
384
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
385
|
+
const CODE_BLOCK = {
|
|
386
|
+
fence: '```',
|
|
387
|
+
format: (code, language = '') => `\`\`\`${language}\n${code}\n\`\`\``,
|
|
388
|
+
};
|
|
367
389
|
function buildInlineCode(content) {
|
|
368
390
|
let maxBackticks = 0;
|
|
369
391
|
let currentRun = 0;
|
|
370
392
|
for (const char of content) {
|
|
371
|
-
if (char === '`')
|
|
372
|
-
currentRun
|
|
373
|
-
}
|
|
393
|
+
if (char === '`')
|
|
394
|
+
currentRun += 1;
|
|
374
395
|
else {
|
|
375
396
|
if (currentRun > maxBackticks)
|
|
376
397
|
maxBackticks = currentRun;
|
|
@@ -402,21 +423,25 @@ function deriveAltFromImageUrl(src) {
|
|
|
402
423
|
return '';
|
|
403
424
|
}
|
|
404
425
|
}
|
|
426
|
+
function hasGetAttribute(value) {
|
|
427
|
+
return (isObject(value) &&
|
|
428
|
+
typeof value.getAttribute === 'function');
|
|
429
|
+
}
|
|
405
430
|
function isCodeBlock(parent) {
|
|
406
431
|
if (!isObject(parent))
|
|
407
432
|
return false;
|
|
408
|
-
const tagName = typeof parent.tagName === 'string'
|
|
433
|
+
const tagName = typeof parent.tagName === 'string'
|
|
434
|
+
? parent.tagName.toUpperCase()
|
|
435
|
+
: '';
|
|
409
436
|
return ['PRE', 'WRAPPED-PRE'].includes(tagName);
|
|
410
437
|
}
|
|
411
|
-
function
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
postprocess: ({ content }) => buildInlineCode(content),
|
|
419
|
-
};
|
|
438
|
+
function isAnchor(node) {
|
|
439
|
+
if (!isObject(node))
|
|
440
|
+
return false;
|
|
441
|
+
const tagName = typeof node.tagName === 'string'
|
|
442
|
+
? node.tagName.toUpperCase()
|
|
443
|
+
: '';
|
|
444
|
+
return tagName === 'A';
|
|
420
445
|
}
|
|
421
446
|
function resolveAttributeLanguage(node) {
|
|
422
447
|
const getAttribute = hasGetAttribute(node)
|
|
@@ -426,46 +451,20 @@ function resolveAttributeLanguage(node) {
|
|
|
426
451
|
const dataLanguage = getAttribute?.('data-language') ?? '';
|
|
427
452
|
return resolveLanguageFromAttributes(className, dataLanguage);
|
|
428
453
|
}
|
|
429
|
-
function buildCodeTranslator(ctx) {
|
|
430
|
-
if (!isObject(ctx))
|
|
431
|
-
return buildInlineCodeTranslator();
|
|
432
|
-
const { parent } = ctx;
|
|
433
|
-
if (!isCodeBlock(parent))
|
|
434
|
-
return buildInlineCodeTranslator();
|
|
435
|
-
return {
|
|
436
|
-
noEscape: true,
|
|
437
|
-
preserveWhitespace: true,
|
|
438
|
-
};
|
|
439
|
-
}
|
|
440
|
-
function buildImageTranslator(ctx) {
|
|
441
|
-
if (!isObject(ctx))
|
|
442
|
-
return { content: '' };
|
|
443
|
-
const { node } = ctx;
|
|
444
|
-
const getAttribute = hasGetAttribute(node)
|
|
445
|
-
? node.getAttribute.bind(node)
|
|
446
|
-
: undefined;
|
|
447
|
-
const src = getAttribute?.('src') ?? '';
|
|
448
|
-
const existingAlt = getAttribute?.('alt') ?? '';
|
|
449
|
-
const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
|
|
450
|
-
return {
|
|
451
|
-
content: `![${alt}](${src})`,
|
|
452
|
-
};
|
|
453
|
-
}
|
|
454
454
|
function findLanguageFromCodeChild(node) {
|
|
455
455
|
if (!isObject(node))
|
|
456
456
|
return undefined;
|
|
457
|
-
const
|
|
458
|
-
|
|
459
|
-
|
|
457
|
+
const childNodes = Array.isArray(node.childNodes)
|
|
458
|
+
? node.childNodes
|
|
459
|
+
: [];
|
|
460
460
|
for (const child of childNodes) {
|
|
461
461
|
if (!isObject(child))
|
|
462
462
|
continue;
|
|
463
463
|
const tagName = typeof child.rawTagName === 'string'
|
|
464
464
|
? child.rawTagName.toUpperCase()
|
|
465
465
|
: '';
|
|
466
|
-
if (tagName === 'CODE')
|
|
466
|
+
if (tagName === 'CODE')
|
|
467
467
|
return resolveAttributeLanguage(child);
|
|
468
|
-
}
|
|
469
468
|
}
|
|
470
469
|
return undefined;
|
|
471
470
|
}
|
|
@@ -478,6 +477,37 @@ function createCodeBlockPostprocessor(language) {
|
|
|
478
477
|
return CODE_BLOCK.format(trimmed, resolvedLanguage);
|
|
479
478
|
};
|
|
480
479
|
}
|
|
480
|
+
function buildInlineCodeTranslator() {
|
|
481
|
+
return {
|
|
482
|
+
spaceIfRepeatingChar: true,
|
|
483
|
+
noEscape: true,
|
|
484
|
+
postprocess: ({ content }) => buildInlineCode(content),
|
|
485
|
+
};
|
|
486
|
+
}
|
|
487
|
+
function buildCodeTranslator(ctx) {
|
|
488
|
+
if (!isObject(ctx))
|
|
489
|
+
return buildInlineCodeTranslator();
|
|
490
|
+
const { parent } = ctx;
|
|
491
|
+
if (!isCodeBlock(parent))
|
|
492
|
+
return buildInlineCodeTranslator();
|
|
493
|
+
return { noEscape: true, preserveWhitespace: true };
|
|
494
|
+
}
|
|
495
|
+
function buildImageTranslator(ctx) {
|
|
496
|
+
if (!isObject(ctx))
|
|
497
|
+
return { content: '' };
|
|
498
|
+
const { node, parent } = ctx;
|
|
499
|
+
const getAttribute = hasGetAttribute(node)
|
|
500
|
+
? node.getAttribute.bind(node)
|
|
501
|
+
: undefined;
|
|
502
|
+
const src = getAttribute?.('src') ?? '';
|
|
503
|
+
const existingAlt = getAttribute?.('alt') ?? '';
|
|
504
|
+
const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
|
|
505
|
+
const markdown = `![${alt}](${src})`;
|
|
506
|
+
if (isAnchor(parent)) {
|
|
507
|
+
return { content: markdown };
|
|
508
|
+
}
|
|
509
|
+
return { content: `\n\n${markdown}\n\n` };
|
|
510
|
+
}
|
|
481
511
|
function buildPreTranslator(ctx) {
|
|
482
512
|
if (!isObject(ctx))
|
|
483
513
|
return {};
|
|
@@ -494,10 +524,9 @@ function createCustomTranslators() {
|
|
|
494
524
|
code: (ctx) => buildCodeTranslator(ctx),
|
|
495
525
|
img: (ctx) => buildImageTranslator(ctx),
|
|
496
526
|
dl: (ctx) => {
|
|
497
|
-
if (!isObject(ctx) || !isObject(ctx.node))
|
|
527
|
+
if (!isObject(ctx) || !isObject(ctx.node))
|
|
498
528
|
return { content: '' };
|
|
499
|
-
}
|
|
500
|
-
const node = ctx.node;
|
|
529
|
+
const { node } = ctx;
|
|
501
530
|
const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
|
|
502
531
|
const items = childNodes
|
|
503
532
|
.map((child) => {
|
|
@@ -520,14 +549,15 @@ function createCustomTranslators() {
|
|
|
520
549
|
return { content: items ? `\n${items}\n\n` : '' };
|
|
521
550
|
},
|
|
522
551
|
div: (ctx) => {
|
|
523
|
-
if (!isObject(ctx) || !isObject(ctx.node))
|
|
552
|
+
if (!isObject(ctx) || !isObject(ctx.node))
|
|
524
553
|
return {};
|
|
525
|
-
}
|
|
526
|
-
const
|
|
527
|
-
|
|
528
|
-
|
|
554
|
+
const { node } = ctx;
|
|
555
|
+
const getAttribute = hasGetAttribute(node)
|
|
556
|
+
? node.getAttribute.bind(node)
|
|
557
|
+
: undefined;
|
|
558
|
+
const className = getAttribute?.('class') ?? '';
|
|
559
|
+
if (!className.includes('type'))
|
|
529
560
|
return {};
|
|
530
|
-
}
|
|
531
561
|
return {
|
|
532
562
|
postprocess: ({ content }) => {
|
|
533
563
|
const lines = content.split('\n');
|
|
@@ -561,37 +591,41 @@ function createCustomTranslators() {
|
|
|
561
591
|
sup: () => ({
|
|
562
592
|
postprocess: ({ content }) => `^${content}^`,
|
|
563
593
|
}),
|
|
564
|
-
|
|
565
|
-
|
|
594
|
+
section: () => ({
|
|
595
|
+
postprocess: ({ content }) => `\n\n${content}\n\n`,
|
|
596
|
+
}),
|
|
566
597
|
pre: (ctx) => buildPreTranslator(ctx),
|
|
567
598
|
};
|
|
568
599
|
}
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
600
|
+
class MarkdownConverter {
|
|
601
|
+
instance = null;
|
|
602
|
+
translate(html) {
|
|
603
|
+
return this.get().translate(html).trim();
|
|
604
|
+
}
|
|
605
|
+
get() {
|
|
606
|
+
this.instance ??= new NodeHtmlMarkdown({
|
|
607
|
+
codeFence: CODE_BLOCK.fence,
|
|
608
|
+
codeBlockStyle: 'fenced',
|
|
609
|
+
emDelimiter: '_',
|
|
610
|
+
bulletMarker: '-',
|
|
611
|
+
}, createCustomTranslators());
|
|
612
|
+
return this.instance;
|
|
613
|
+
}
|
|
581
614
|
}
|
|
615
|
+
const markdownConverter = new MarkdownConverter();
|
|
582
616
|
function preprocessPropertySections(html) {
|
|
583
|
-
|
|
584
|
-
return result;
|
|
617
|
+
return html.replace(/<\/section>\s*(<section[^>]*class="[^"]*tsd-member[^"]*"[^>]*>)/g, '</section><p> </p>$1');
|
|
585
618
|
}
|
|
586
|
-
function translateHtmlToMarkdown(
|
|
587
|
-
|
|
619
|
+
function translateHtmlToMarkdown(params) {
|
|
620
|
+
const { html, url, signal, document, skipNoiseRemoval } = params;
|
|
621
|
+
abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
|
|
588
622
|
const cleanedHtml = skipNoiseRemoval
|
|
589
623
|
? html
|
|
590
|
-
:
|
|
591
|
-
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
592
|
-
const preprocessedHtml =
|
|
593
|
-
const content =
|
|
594
|
-
throwIfAborted(signal, url, 'markdown:translated');
|
|
624
|
+
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
|
|
625
|
+
abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
|
|
626
|
+
const preprocessedHtml = stageTracker.run(url, 'markdown:preprocess', () => preprocessPropertySections(cleanedHtml));
|
|
627
|
+
const content = stageTracker.run(url, 'markdown:translate', () => markdownConverter.translate(preprocessedHtml));
|
|
628
|
+
abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
|
|
595
629
|
return cleanupMarkdownArtifacts(content);
|
|
596
630
|
}
|
|
597
631
|
function appendMetadataFooter(content, metadata, url) {
|
|
@@ -603,77 +637,71 @@ export function htmlToMarkdown(html, metadata, options) {
|
|
|
603
637
|
if (!html)
|
|
604
638
|
return buildMetadataFooter(metadata, url);
|
|
605
639
|
try {
|
|
606
|
-
const content = translateHtmlToMarkdown(
|
|
640
|
+
const content = translateHtmlToMarkdown({
|
|
641
|
+
html,
|
|
642
|
+
url,
|
|
643
|
+
signal: options?.signal,
|
|
644
|
+
document: options?.document,
|
|
645
|
+
skipNoiseRemoval: options?.skipNoiseRemoval,
|
|
646
|
+
});
|
|
607
647
|
return appendMetadataFooter(content, metadata, url);
|
|
608
648
|
}
|
|
609
649
|
catch (error) {
|
|
610
|
-
if (error instanceof FetchError)
|
|
650
|
+
if (error instanceof FetchError)
|
|
611
651
|
throw error;
|
|
612
|
-
}
|
|
613
652
|
logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
|
|
614
653
|
return buildMetadataFooter(metadata, url);
|
|
615
654
|
}
|
|
616
655
|
}
|
|
656
|
+
/* -------------------------------------------------------------------------------------------------
|
|
657
|
+
* Raw content shortcut
|
|
658
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
617
659
|
function shouldPreserveRawContent(url, content) {
|
|
618
|
-
if (isRawTextContentUrl(url))
|
|
660
|
+
if (isRawTextContentUrl(url))
|
|
619
661
|
return !isLikelyHtmlContent(content);
|
|
620
|
-
}
|
|
621
662
|
return isRawTextContent(content);
|
|
622
663
|
}
|
|
623
|
-
function buildRawMarkdownPayload(
|
|
624
|
-
const title = extractTitleFromRawMarkdown(rawContent);
|
|
625
|
-
const content = includeMetadata
|
|
626
|
-
? addSourceToMarkdown(rawContent, url)
|
|
627
|
-
: rawContent;
|
|
664
|
+
function buildRawMarkdownPayload(params) {
|
|
665
|
+
const title = extractTitleFromRawMarkdown(params.rawContent);
|
|
666
|
+
const content = params.includeMetadata
|
|
667
|
+
? addSourceToMarkdown(params.rawContent, params.url)
|
|
668
|
+
: params.rawContent;
|
|
628
669
|
return { content, title };
|
|
629
670
|
}
|
|
630
|
-
function
|
|
631
|
-
|
|
632
|
-
rawContent,
|
|
633
|
-
url,
|
|
634
|
-
includeMetadata,
|
|
635
|
-
});
|
|
636
|
-
return {
|
|
637
|
-
markdown: content,
|
|
638
|
-
title,
|
|
639
|
-
truncated: false,
|
|
640
|
-
};
|
|
641
|
-
}
|
|
642
|
-
function tryTransformRawContent({ html, url, includeMetadata, }) {
|
|
643
|
-
if (!shouldPreserveRawContent(url, html)) {
|
|
671
|
+
function tryTransformRawContent(params) {
|
|
672
|
+
if (!shouldPreserveRawContent(params.url, params.html))
|
|
644
673
|
return null;
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
674
|
+
logDebug('Preserving raw markdown content', {
|
|
675
|
+
url: params.url.substring(0, 80),
|
|
676
|
+
});
|
|
677
|
+
const { content, title } = buildRawMarkdownPayload({
|
|
678
|
+
rawContent: params.html,
|
|
679
|
+
url: params.url,
|
|
680
|
+
includeMetadata: params.includeMetadata,
|
|
651
681
|
});
|
|
682
|
+
return { markdown: content, title, truncated: false };
|
|
652
683
|
}
|
|
684
|
+
/* -------------------------------------------------------------------------------------------------
|
|
685
|
+
* Quality gates + content source resolution
|
|
686
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
653
687
|
const MIN_CONTENT_RATIO = 0.3;
|
|
654
688
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
655
689
|
const MIN_HEADING_RETENTION_RATIO = 0.7;
|
|
656
690
|
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
* Fragments without doctype/html/body tags need wrapping.
|
|
660
|
-
*/
|
|
691
|
+
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
692
|
+
const MAX_TRUNCATED_LINE_RATIO = 0.5;
|
|
661
693
|
function needsDocumentWrapper(html) {
|
|
662
694
|
const trimmed = html.trim().toLowerCase();
|
|
663
695
|
return (!trimmed.startsWith('<!doctype') &&
|
|
664
696
|
!trimmed.startsWith('<html') &&
|
|
665
697
|
!trimmed.startsWith('<body'));
|
|
666
698
|
}
|
|
667
|
-
/**
|
|
668
|
-
* Wrap HTML fragment in minimal document structure for proper parsing.
|
|
669
|
-
*/
|
|
670
699
|
function wrapHtmlFragment(html) {
|
|
671
700
|
return `<!DOCTYPE html><html><body>${html}</body></html>`;
|
|
672
701
|
}
|
|
673
702
|
function resolveHtmlDocument(htmlOrDocument) {
|
|
674
|
-
if (typeof htmlOrDocument !== 'string')
|
|
703
|
+
if (typeof htmlOrDocument !== 'string')
|
|
675
704
|
return htmlOrDocument;
|
|
676
|
-
}
|
|
677
705
|
const htmlToParse = needsDocumentWrapper(htmlOrDocument)
|
|
678
706
|
? wrapHtmlFragment(htmlOrDocument)
|
|
679
707
|
: htmlOrDocument;
|
|
@@ -682,39 +710,26 @@ function resolveHtmlDocument(htmlOrDocument) {
|
|
|
682
710
|
function countDomSelector(htmlOrDocument, selector) {
|
|
683
711
|
return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
|
|
684
712
|
}
|
|
685
|
-
/**
|
|
686
|
-
* Count headings using DOM querySelectorAll.
|
|
687
|
-
* Handles nested content like <h2><span>Text</span></h2> correctly.
|
|
688
|
-
*/
|
|
689
713
|
function countHeadingsDom(htmlOrDocument) {
|
|
690
714
|
return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
|
|
691
715
|
}
|
|
692
716
|
function countCodeBlocksDom(htmlOrDocument) {
|
|
693
717
|
return countDomSelector(htmlOrDocument, 'pre');
|
|
694
718
|
}
|
|
695
|
-
function cloneDocumentIfNeeded(htmlOrDocument, doc) {
|
|
696
|
-
return typeof htmlOrDocument === 'string'
|
|
697
|
-
? doc
|
|
698
|
-
: doc.cloneNode(true);
|
|
699
|
-
}
|
|
700
719
|
function stripNonVisibleNodes(doc) {
|
|
701
|
-
for (const el of doc.querySelectorAll('script,style,noscript'))
|
|
720
|
+
for (const el of doc.querySelectorAll('script,style,noscript'))
|
|
702
721
|
el.remove();
|
|
703
|
-
}
|
|
704
722
|
}
|
|
705
723
|
function resolveDocumentText(doc) {
|
|
706
|
-
// Note: linkedom may return null for body on HTML fragments despite types
|
|
707
724
|
const body = doc.body;
|
|
708
725
|
const docElement = doc.documentElement;
|
|
709
726
|
return body?.textContent ?? docElement?.textContent ?? '';
|
|
710
727
|
}
|
|
711
|
-
/**
|
|
712
|
-
* Get visible text length from HTML, excluding script/style/noscript content.
|
|
713
|
-
* Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
|
|
714
|
-
*/
|
|
715
728
|
function getVisibleTextLength(htmlOrDocument) {
|
|
716
729
|
const doc = resolveHtmlDocument(htmlOrDocument);
|
|
717
|
-
const workDoc =
|
|
730
|
+
const workDoc = typeof htmlOrDocument === 'string'
|
|
731
|
+
? doc
|
|
732
|
+
: doc.cloneNode(true);
|
|
718
733
|
stripNonVisibleNodes(workDoc);
|
|
719
734
|
const text = resolveDocumentText(workDoc);
|
|
720
735
|
return text.replace(/\s+/g, ' ').trim().length;
|
|
@@ -723,29 +738,18 @@ export function isExtractionSufficient(article, originalHtmlOrDocument) {
|
|
|
723
738
|
if (!article)
|
|
724
739
|
return false;
|
|
725
740
|
const articleLength = article.textContent.length;
|
|
726
|
-
// Use DOM-based visible text length to exclude script/style content
|
|
727
741
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
728
742
|
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
729
743
|
return true;
|
|
730
744
|
return articleLength / originalLength >= MIN_CONTENT_RATIO;
|
|
731
745
|
}
|
|
732
|
-
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
733
|
-
const MAX_TRUNCATED_LINE_RATIO = 0.5;
|
|
734
|
-
/**
|
|
735
|
-
* Detect if extracted text has many truncated/incomplete sentences.
|
|
736
|
-
* Lines longer than 20 chars that don't end with sentence punctuation
|
|
737
|
-
* are considered potentially truncated.
|
|
738
|
-
*/
|
|
739
746
|
function hasTruncatedSentences(text) {
|
|
740
747
|
const lines = text
|
|
741
748
|
.split('\n')
|
|
742
749
|
.filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
|
|
743
750
|
if (lines.length < 3)
|
|
744
751
|
return false;
|
|
745
|
-
const incompleteLines = lines.filter((line) =>
|
|
746
|
-
const trimmed = line.trim();
|
|
747
|
-
return !/[.!?:;]$/.test(trimmed);
|
|
748
|
-
});
|
|
752
|
+
const incompleteLines = lines.filter((line) => !/[.!?:;]$/.test(line.trim()));
|
|
749
753
|
return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
|
|
750
754
|
}
|
|
751
755
|
export function determineContentExtractionSource(article) {
|
|
@@ -768,19 +772,13 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
768
772
|
else {
|
|
769
773
|
if (extractedMeta.title !== undefined)
|
|
770
774
|
metadata.title = extractedMeta.title;
|
|
771
|
-
if (extractedMeta.description !== undefined)
|
|
775
|
+
if (extractedMeta.description !== undefined)
|
|
772
776
|
metadata.description = extractedMeta.description;
|
|
773
|
-
|
|
774
|
-
if (extractedMeta.author !== undefined) {
|
|
777
|
+
if (extractedMeta.author !== undefined)
|
|
775
778
|
metadata.author = extractedMeta.author;
|
|
776
|
-
}
|
|
777
779
|
}
|
|
778
780
|
return metadata;
|
|
779
781
|
}
|
|
780
|
-
/**
|
|
781
|
-
* Content root selectors in priority order.
|
|
782
|
-
* These identify the main content area on a page.
|
|
783
|
-
*/
|
|
784
782
|
const CONTENT_ROOT_SELECTORS = [
|
|
785
783
|
'main',
|
|
786
784
|
'article',
|
|
@@ -797,75 +795,23 @@ const CONTENT_ROOT_SELECTORS = [
|
|
|
797
795
|
'.post-body',
|
|
798
796
|
'.article-body',
|
|
799
797
|
];
|
|
800
|
-
/**
|
|
801
|
-
* Find the main content root element in a document.
|
|
802
|
-
* Returns the innerHTML if found, undefined otherwise.
|
|
803
|
-
*/
|
|
804
798
|
function findContentRoot(document) {
|
|
805
799
|
for (const selector of CONTENT_ROOT_SELECTORS) {
|
|
806
800
|
const element = document.querySelector(selector);
|
|
807
801
|
if (!element)
|
|
808
802
|
continue;
|
|
809
|
-
// Check if element has meaningful content
|
|
810
803
|
const innerHTML = typeof element.innerHTML === 'string'
|
|
811
804
|
? element.innerHTML
|
|
812
805
|
: undefined;
|
|
813
|
-
if (innerHTML && innerHTML.trim().length > 100)
|
|
806
|
+
if (innerHTML && innerHTML.trim().length > 100)
|
|
814
807
|
return innerHTML;
|
|
815
|
-
}
|
|
816
808
|
}
|
|
817
809
|
return undefined;
|
|
818
810
|
}
|
|
819
|
-
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
|
|
820
|
-
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
821
|
-
// If using article content, return it directly
|
|
822
|
-
if (useArticleContent && article) {
|
|
823
|
-
return {
|
|
824
|
-
sourceHtml: article.content,
|
|
825
|
-
title: article.title,
|
|
826
|
-
metadata,
|
|
827
|
-
};
|
|
828
|
-
}
|
|
829
|
-
// Try content root fallback before using full HTML
|
|
830
|
-
if (document) {
|
|
831
|
-
// Apply noise removal to HTML first (without passing document) to get cleaned HTML,
|
|
832
|
-
// then parse and find content root. This prevents the aggressive DOM stripping that
|
|
833
|
-
// happens when noise removal is given the original parsed document.
|
|
834
|
-
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
835
|
-
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
836
|
-
const contentRoot = findContentRoot(cleanedDoc);
|
|
837
|
-
if (contentRoot) {
|
|
838
|
-
logDebug('Using content root fallback instead of full HTML', {
|
|
839
|
-
url: truncateUrlForLog(url),
|
|
840
|
-
contentLength: contentRoot.length,
|
|
841
|
-
});
|
|
842
|
-
return {
|
|
843
|
-
sourceHtml: contentRoot,
|
|
844
|
-
title: extractedMeta.title,
|
|
845
|
-
metadata,
|
|
846
|
-
// Skip noise removal - this HTML is already from a cleaned document
|
|
847
|
-
skipNoiseRemoval: true,
|
|
848
|
-
};
|
|
849
|
-
}
|
|
850
|
-
}
|
|
851
|
-
// Fall back to full HTML
|
|
852
|
-
return {
|
|
853
|
-
sourceHtml: html,
|
|
854
|
-
title: extractedMeta.title,
|
|
855
|
-
metadata,
|
|
856
|
-
...(document ? { document } : {}),
|
|
857
|
-
};
|
|
858
|
-
}
|
|
859
|
-
function logQualityGateFallback({ safeUrl, articleLength, }) {
|
|
860
|
-
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
861
|
-
url: safeUrl,
|
|
862
|
-
articleLength,
|
|
863
|
-
});
|
|
864
|
-
}
|
|
865
811
|
function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
866
812
|
const articleLength = article.textContent.length;
|
|
867
813
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
868
|
-
const safeUrl =
|
|
814
|
+
const safeUrl = url.substring(0, 80);
|
|
869
815
|
let articleDocument = null;
|
|
870
816
|
const getArticleDocument = () => {
|
|
871
817
|
if (articleDocument)
|
|
@@ -873,15 +819,16 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
|
873
819
|
articleDocument = resolveHtmlDocument(article.content);
|
|
874
820
|
return articleDocument;
|
|
875
821
|
};
|
|
876
|
-
// If the document is tiny, don't gate too aggressively.
|
|
877
822
|
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
878
823
|
const ratio = articleLength / originalLength;
|
|
879
824
|
if (ratio < MIN_CONTENT_RATIO) {
|
|
880
|
-
|
|
825
|
+
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
826
|
+
url: safeUrl,
|
|
827
|
+
articleLength,
|
|
828
|
+
});
|
|
881
829
|
return false;
|
|
882
830
|
}
|
|
883
831
|
}
|
|
884
|
-
// Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
|
|
885
832
|
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
886
833
|
if (originalHeadings > 0) {
|
|
887
834
|
const articleHeadings = countHeadingsDom(getArticleDocument());
|
|
@@ -899,7 +846,6 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
|
899
846
|
if (originalCodeBlocks > 0) {
|
|
900
847
|
const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
|
|
901
848
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
902
|
-
// Always log code block counts for debugging
|
|
903
849
|
logDebug('Code block retention check', {
|
|
904
850
|
url: safeUrl,
|
|
905
851
|
originalCodeBlocks,
|
|
@@ -915,100 +861,106 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
|
915
861
|
return false;
|
|
916
862
|
}
|
|
917
863
|
}
|
|
918
|
-
// Layout extraction issue: truncated/fragmented lines.
|
|
919
864
|
if (hasTruncatedSentences(article.textContent)) {
|
|
920
|
-
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
|
|
865
|
+
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
|
|
866
|
+
url: safeUrl,
|
|
867
|
+
});
|
|
921
868
|
return false;
|
|
922
869
|
}
|
|
923
870
|
return true;
|
|
924
871
|
}
|
|
925
|
-
function
|
|
926
|
-
const { article,
|
|
872
|
+
function buildContentSource(params) {
|
|
873
|
+
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
|
|
874
|
+
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
875
|
+
if (useArticleContent && article) {
|
|
876
|
+
return { sourceHtml: article.content, title: article.title, metadata };
|
|
877
|
+
}
|
|
878
|
+
if (document) {
|
|
879
|
+
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
880
|
+
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
881
|
+
const contentRoot = findContentRoot(cleanedDoc);
|
|
882
|
+
if (contentRoot) {
|
|
883
|
+
logDebug('Using content root fallback instead of full HTML', {
|
|
884
|
+
url: url.substring(0, 80),
|
|
885
|
+
contentLength: contentRoot.length,
|
|
886
|
+
});
|
|
887
|
+
return {
|
|
888
|
+
sourceHtml: contentRoot,
|
|
889
|
+
title: extractedMeta.title,
|
|
890
|
+
metadata,
|
|
891
|
+
skipNoiseRemoval: true,
|
|
892
|
+
};
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
return {
|
|
896
|
+
sourceHtml: html,
|
|
897
|
+
title: extractedMeta.title,
|
|
898
|
+
metadata,
|
|
899
|
+
...(document ? { document } : {}),
|
|
900
|
+
};
|
|
901
|
+
}
|
|
902
|
+
function resolveContentSource(params) {
|
|
903
|
+
const { article, metadata: extractedMeta, document, } = contentExtractor.extract(params.html, params.url, {
|
|
927
904
|
extractArticle: true,
|
|
928
|
-
...(signal ? { signal } : {}),
|
|
905
|
+
...(params.signal ? { signal: params.signal } : {}),
|
|
929
906
|
});
|
|
930
|
-
const originalDocument = document;
|
|
931
907
|
const useArticleContent = article
|
|
932
|
-
? shouldUseArticleContent(article,
|
|
908
|
+
? shouldUseArticleContent(article, document, params.url)
|
|
933
909
|
: false;
|
|
934
910
|
return buildContentSource({
|
|
935
|
-
html,
|
|
936
|
-
url,
|
|
911
|
+
html: params.html,
|
|
912
|
+
url: params.url,
|
|
937
913
|
article,
|
|
938
914
|
extractedMeta,
|
|
939
|
-
includeMetadata,
|
|
915
|
+
includeMetadata: params.includeMetadata,
|
|
940
916
|
useArticleContent,
|
|
941
917
|
document,
|
|
942
918
|
});
|
|
943
919
|
}
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
url,
|
|
948
|
-
includeMetadata,
|
|
949
|
-
}));
|
|
950
|
-
}
|
|
951
|
-
function resolveContentSourceStage(html, url, includeMetadata, signal) {
|
|
952
|
-
return runTransformStage(url, 'transform:extract', () => resolveContentSource({
|
|
953
|
-
html,
|
|
954
|
-
url,
|
|
955
|
-
includeMetadata,
|
|
956
|
-
...(signal ? { signal } : {}),
|
|
957
|
-
}));
|
|
958
|
-
}
|
|
920
|
+
/* -------------------------------------------------------------------------------------------------
|
|
921
|
+
* In-process transform pipeline (public)
|
|
922
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
959
923
|
function buildMarkdownFromContext(context, url, signal) {
|
|
960
|
-
const content =
|
|
924
|
+
const content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
961
925
|
url,
|
|
962
926
|
...(signal ? { signal } : {}),
|
|
963
927
|
...(context.document ? { document: context.document } : {}),
|
|
964
928
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
965
929
|
}));
|
|
966
|
-
return {
|
|
967
|
-
markdown: content,
|
|
968
|
-
title: context.title,
|
|
969
|
-
truncated: false,
|
|
970
|
-
};
|
|
930
|
+
return { markdown: content, title: context.title, truncated: false };
|
|
971
931
|
}
|
|
972
|
-
function
|
|
973
|
-
const totalStage =
|
|
974
|
-
let success = false;
|
|
975
|
-
try {
|
|
976
|
-
const result = fn();
|
|
977
|
-
success = true;
|
|
978
|
-
return result;
|
|
979
|
-
}
|
|
980
|
-
finally {
|
|
981
|
-
finalizeTotalTransformStage(totalStage, success);
|
|
982
|
-
}
|
|
983
|
-
}
|
|
984
|
-
function finalizeTotalTransformStage(stage, success) {
|
|
985
|
-
if (!success)
|
|
986
|
-
return;
|
|
987
|
-
endTransformStage(stage, { truncated: false });
|
|
988
|
-
}
|
|
989
|
-
async function runTotalTransformStageAsync(url, fn) {
|
|
990
|
-
const totalStage = startTransformStage(url, 'transform:total');
|
|
932
|
+
export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
933
|
+
const totalStage = stageTracker.start(url, 'transform:total');
|
|
991
934
|
let success = false;
|
|
992
935
|
try {
|
|
993
|
-
|
|
936
|
+
abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
|
|
937
|
+
const raw = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
938
|
+
html,
|
|
939
|
+
url,
|
|
940
|
+
includeMetadata: options.includeMetadata,
|
|
941
|
+
}));
|
|
942
|
+
if (raw) {
|
|
943
|
+
success = true;
|
|
944
|
+
return raw;
|
|
945
|
+
}
|
|
946
|
+
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
947
|
+
html,
|
|
948
|
+
url,
|
|
949
|
+
includeMetadata: options.includeMetadata,
|
|
950
|
+
...(options.signal ? { signal: options.signal } : {}),
|
|
951
|
+
}));
|
|
952
|
+
const result = buildMarkdownFromContext(context, url, options.signal);
|
|
994
953
|
success = true;
|
|
995
954
|
return result;
|
|
996
955
|
}
|
|
997
956
|
finally {
|
|
998
|
-
|
|
957
|
+
if (success)
|
|
958
|
+
stageTracker.end(totalStage, { truncated: false });
|
|
999
959
|
}
|
|
1000
960
|
}
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
const raw = tryTransformRawStage(html, url, options.includeMetadata);
|
|
1005
|
-
if (raw) {
|
|
1006
|
-
return raw;
|
|
1007
|
-
}
|
|
1008
|
-
const context = resolveContentSourceStage(html, url, options.includeMetadata, options.signal);
|
|
1009
|
-
return buildMarkdownFromContext(context, url, options.signal);
|
|
1010
|
-
});
|
|
1011
|
-
}
|
|
961
|
+
/* -------------------------------------------------------------------------------------------------
|
|
962
|
+
* Worker pool
|
|
963
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
1012
964
|
const workerMessageSchema = z.discriminatedUnion('type', [
|
|
1013
965
|
z.object({
|
|
1014
966
|
type: z.literal('result'),
|
|
@@ -1031,142 +983,137 @@ const workerMessageSchema = z.discriminatedUnion('type', [
|
|
|
1031
983
|
}),
|
|
1032
984
|
}),
|
|
1033
985
|
]);
|
|
1034
|
-
let pool = null;
|
|
1035
986
|
const POOL_MIN_WORKERS = 2;
|
|
1036
987
|
const POOL_MAX_WORKERS = 4;
|
|
1037
988
|
const POOL_SCALE_THRESHOLD = 0.5;
|
|
1038
|
-
function resolveDefaultWorkerCount() {
|
|
1039
|
-
return POOL_MIN_WORKERS;
|
|
1040
|
-
}
|
|
1041
989
|
const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
|
|
1042
|
-
function getOrCreateTransformWorkerPool() {
|
|
1043
|
-
pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
|
|
1044
|
-
return pool;
|
|
1045
|
-
}
|
|
1046
|
-
export async function shutdownTransformWorkerPool() {
|
|
1047
|
-
if (!pool)
|
|
1048
|
-
return;
|
|
1049
|
-
await pool.close();
|
|
1050
|
-
pool = null;
|
|
1051
|
-
}
|
|
1052
|
-
export function getTransformPoolStats() {
|
|
1053
|
-
if (!pool)
|
|
1054
|
-
return null;
|
|
1055
|
-
return {
|
|
1056
|
-
queueDepth: pool.getQueueDepth(),
|
|
1057
|
-
activeWorkers: pool.getActiveWorkers(),
|
|
1058
|
-
capacity: pool.getCapacity(),
|
|
1059
|
-
};
|
|
1060
|
-
}
|
|
1061
990
|
class WorkerPool {
|
|
1062
991
|
workers = [];
|
|
1063
992
|
capacity;
|
|
1064
|
-
minCapacity;
|
|
1065
|
-
maxCapacity;
|
|
993
|
+
minCapacity = POOL_MIN_WORKERS;
|
|
994
|
+
maxCapacity = POOL_MAX_WORKERS;
|
|
1066
995
|
queue = [];
|
|
1067
996
|
inflight = new Map();
|
|
1068
997
|
timeoutMs;
|
|
1069
998
|
queueMax;
|
|
1070
999
|
closed = false;
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
});
|
|
1000
|
+
constructor(size, timeoutMs) {
|
|
1001
|
+
this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
|
|
1002
|
+
this.timeoutMs = timeoutMs;
|
|
1003
|
+
this.queueMax = this.maxCapacity * 32;
|
|
1076
1004
|
}
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1005
|
+
async transform(html, url, options) {
|
|
1006
|
+
this.ensureOpen();
|
|
1007
|
+
if (options.signal?.aborted)
|
|
1008
|
+
throw abortPolicy.createAbortError(url, 'transform:enqueue');
|
|
1009
|
+
if (this.queue.length >= this.queueMax) {
|
|
1010
|
+
throw new FetchError('Transform worker queue is full', url, 503, {
|
|
1011
|
+
reason: 'queue_full',
|
|
1012
|
+
stage: 'transform:enqueue',
|
|
1013
|
+
});
|
|
1080
1014
|
}
|
|
1015
|
+
return new Promise((resolve, reject) => {
|
|
1016
|
+
const task = this.createPendingTask(html, url, options, resolve, reject);
|
|
1017
|
+
this.queue.push(task);
|
|
1018
|
+
this.drainQueue();
|
|
1019
|
+
});
|
|
1081
1020
|
}
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
return;
|
|
1085
|
-
throw this.createAbortError(url, stage);
|
|
1021
|
+
getQueueDepth() {
|
|
1022
|
+
return this.queue.length;
|
|
1086
1023
|
}
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
return;
|
|
1090
|
-
throw new FetchError('Transform worker queue is full', url, 503, {
|
|
1091
|
-
reason: 'queue_full',
|
|
1092
|
-
stage: 'transform:enqueue',
|
|
1093
|
-
});
|
|
1024
|
+
getActiveWorkers() {
|
|
1025
|
+
return this.workers.filter((s) => s?.busy).length;
|
|
1094
1026
|
}
|
|
1095
|
-
|
|
1096
|
-
|
|
1027
|
+
getCapacity() {
|
|
1028
|
+
return this.capacity;
|
|
1029
|
+
}
|
|
1030
|
+
async close() {
|
|
1031
|
+
if (this.closed)
|
|
1097
1032
|
return;
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1033
|
+
this.closed = true;
|
|
1034
|
+
const terminations = this.workers
|
|
1035
|
+
.map((slot) => slot?.worker.terminate())
|
|
1036
|
+
.filter((p) => p !== undefined);
|
|
1037
|
+
this.workers.fill(undefined);
|
|
1038
|
+
this.workers.length = 0;
|
|
1039
|
+
for (const [id, inflight] of this.inflight.entries()) {
|
|
1040
|
+
clearTimeout(inflight.timer);
|
|
1041
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1042
|
+
inflight.reject(new Error('Transform worker pool closed'));
|
|
1043
|
+
this.inflight.delete(id);
|
|
1103
1044
|
}
|
|
1045
|
+
for (const task of this.queue)
|
|
1046
|
+
task.reject(new Error('Transform worker pool closed'));
|
|
1047
|
+
this.queue.length = 0;
|
|
1048
|
+
await Promise.allSettled(terminations);
|
|
1104
1049
|
}
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
return;
|
|
1109
|
-
slot.busy = false;
|
|
1110
|
-
slot.currentTaskId = null;
|
|
1050
|
+
ensureOpen() {
|
|
1051
|
+
if (this.closed)
|
|
1052
|
+
throw new Error('Transform worker pool closed');
|
|
1111
1053
|
}
|
|
1112
|
-
|
|
1113
|
-
const
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1054
|
+
createPendingTask(html, url, options, resolve, reject) {
|
|
1055
|
+
const id = randomUUID();
|
|
1056
|
+
let abortListener;
|
|
1057
|
+
if (options.signal) {
|
|
1058
|
+
abortListener = () => {
|
|
1059
|
+
this.onAbortSignal(id, url, reject);
|
|
1060
|
+
};
|
|
1061
|
+
options.signal.addEventListener('abort', abortListener, { once: true });
|
|
1062
|
+
}
|
|
1063
|
+
return {
|
|
1064
|
+
id,
|
|
1065
|
+
html,
|
|
1066
|
+
url,
|
|
1067
|
+
includeMetadata: options.includeMetadata,
|
|
1068
|
+
signal: options.signal,
|
|
1069
|
+
abortListener,
|
|
1070
|
+
resolve,
|
|
1071
|
+
reject,
|
|
1072
|
+
};
|
|
1120
1073
|
}
|
|
1121
|
-
|
|
1122
|
-
if (
|
|
1074
|
+
onAbortSignal(id, url, reject) {
|
|
1075
|
+
if (this.closed) {
|
|
1076
|
+
reject(new Error('Transform worker pool closed'));
|
|
1123
1077
|
return;
|
|
1124
|
-
try {
|
|
1125
|
-
slot.worker.postMessage({ type: 'cancel', id });
|
|
1126
1078
|
}
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
}
|
|
1131
|
-
restartWorker(workerIndex, slot) {
|
|
1132
|
-
if (this.closed)
|
|
1079
|
+
const inflight = this.inflight.get(id);
|
|
1080
|
+
if (inflight) {
|
|
1081
|
+
this.abortInflight(id, url, inflight.workerIndex);
|
|
1133
1082
|
return;
|
|
1134
|
-
const target = slot ?? this.workers[workerIndex];
|
|
1135
|
-
if (target) {
|
|
1136
|
-
void target.worker.terminate();
|
|
1137
1083
|
}
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
return false;
|
|
1144
|
-
reject(new Error('Transform worker pool closed'));
|
|
1145
|
-
return true;
|
|
1084
|
+
const queuedIndex = this.queue.findIndex((t) => t.id === id);
|
|
1085
|
+
if (queuedIndex !== -1) {
|
|
1086
|
+
this.queue.splice(queuedIndex, 1);
|
|
1087
|
+
reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
|
|
1088
|
+
}
|
|
1146
1089
|
}
|
|
1147
|
-
|
|
1090
|
+
abortInflight(id, url, workerIndex) {
|
|
1148
1091
|
const slot = this.workers[workerIndex];
|
|
1149
|
-
this.cancelWorkerTask(slot, id);
|
|
1150
|
-
this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
|
|
1151
1092
|
if (slot) {
|
|
1152
|
-
|
|
1093
|
+
try {
|
|
1094
|
+
slot.worker.postMessage({ type: 'cancel', id });
|
|
1095
|
+
}
|
|
1096
|
+
catch {
|
|
1097
|
+
/* ignore */
|
|
1098
|
+
}
|
|
1153
1099
|
}
|
|
1100
|
+
this.failTask(id, abortPolicy.createAbortError(url, 'transform:signal-abort'));
|
|
1101
|
+
if (slot)
|
|
1102
|
+
this.restartWorker(workerIndex, slot);
|
|
1154
1103
|
}
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
if (queuedIndex === -1)
|
|
1104
|
+
clearAbortListener(signal, listener) {
|
|
1105
|
+
if (!signal || !listener)
|
|
1158
1106
|
return;
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
busy: false,
|
|
1166
|
-
currentTaskId: null,
|
|
1167
|
-
};
|
|
1107
|
+
try {
|
|
1108
|
+
signal.removeEventListener('abort', listener);
|
|
1109
|
+
}
|
|
1110
|
+
catch {
|
|
1111
|
+
/* ignore */
|
|
1112
|
+
}
|
|
1168
1113
|
}
|
|
1169
|
-
|
|
1114
|
+
spawnWorker(workerIndex) {
|
|
1115
|
+
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
|
|
1116
|
+
worker.unref();
|
|
1170
1117
|
worker.on('message', (raw) => {
|
|
1171
1118
|
this.onWorkerMessage(workerIndex, raw);
|
|
1172
1119
|
});
|
|
@@ -1176,20 +1123,7 @@ class WorkerPool {
|
|
|
1176
1123
|
worker.on('exit', (code) => {
|
|
1177
1124
|
this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
|
|
1178
1125
|
});
|
|
1179
|
-
|
|
1180
|
-
constructor(size, timeoutMs) {
|
|
1181
|
-
this.minCapacity = POOL_MIN_WORKERS;
|
|
1182
|
-
this.maxCapacity = POOL_MAX_WORKERS;
|
|
1183
|
-
this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
|
|
1184
|
-
this.timeoutMs = timeoutMs;
|
|
1185
|
-
this.queueMax = this.maxCapacity * 32;
|
|
1186
|
-
}
|
|
1187
|
-
spawnWorker(workerIndex) {
|
|
1188
|
-
const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
|
|
1189
|
-
worker.unref();
|
|
1190
|
-
const slot = this.createWorkerSlot(worker);
|
|
1191
|
-
this.registerWorkerHandlers(workerIndex, worker);
|
|
1192
|
-
return slot;
|
|
1126
|
+
return { worker, busy: false, currentTaskId: null };
|
|
1193
1127
|
}
|
|
1194
1128
|
onWorkerBroken(workerIndex, message) {
|
|
1195
1129
|
if (this.closed)
|
|
@@ -1202,19 +1136,14 @@ class WorkerPool {
|
|
|
1202
1136
|
}
|
|
1203
1137
|
this.restartWorker(workerIndex, slot);
|
|
1204
1138
|
}
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
markdown: result.markdown,
|
|
1208
|
-
truncated: result.truncated,
|
|
1209
|
-
title: result.title,
|
|
1210
|
-
});
|
|
1211
|
-
}
|
|
1212
|
-
rejectWorkerError(inflight, error) {
|
|
1213
|
-
if (error.name === 'FetchError') {
|
|
1214
|
-
inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
|
|
1139
|
+
restartWorker(workerIndex, slot) {
|
|
1140
|
+
if (this.closed)
|
|
1215
1141
|
return;
|
|
1216
|
-
|
|
1217
|
-
|
|
1142
|
+
const target = slot ?? this.workers[workerIndex];
|
|
1143
|
+
if (target)
|
|
1144
|
+
void target.worker.terminate();
|
|
1145
|
+
this.workers[workerIndex] = this.spawnWorker(workerIndex);
|
|
1146
|
+
this.drainQueue();
|
|
1218
1147
|
}
|
|
1219
1148
|
onWorkerMessage(workerIndex, raw) {
|
|
1220
1149
|
const parsed = workerMessageSchema.safeParse(raw);
|
|
@@ -1224,63 +1153,48 @@ class WorkerPool {
|
|
|
1224
1153
|
const inflight = this.takeInflight(message.id);
|
|
1225
1154
|
if (!inflight)
|
|
1226
1155
|
return;
|
|
1227
|
-
this.
|
|
1156
|
+
this.markIdle(workerIndex);
|
|
1228
1157
|
if (message.type === 'result') {
|
|
1229
|
-
|
|
1158
|
+
inflight.resolve({
|
|
1159
|
+
markdown: message.result.markdown,
|
|
1160
|
+
truncated: message.result.truncated,
|
|
1161
|
+
title: message.result.title,
|
|
1162
|
+
});
|
|
1230
1163
|
}
|
|
1231
1164
|
else {
|
|
1232
|
-
|
|
1165
|
+
const err = message.error;
|
|
1166
|
+
if (err.name === 'FetchError') {
|
|
1167
|
+
inflight.reject(new FetchError(err.message, err.url, err.statusCode, err.details ?? {}));
|
|
1168
|
+
}
|
|
1169
|
+
else {
|
|
1170
|
+
inflight.reject(new Error(err.message));
|
|
1171
|
+
}
|
|
1233
1172
|
}
|
|
1234
1173
|
this.drainQueue();
|
|
1235
1174
|
}
|
|
1175
|
+
takeInflight(id) {
|
|
1176
|
+
const inflight = this.inflight.get(id);
|
|
1177
|
+
if (!inflight)
|
|
1178
|
+
return null;
|
|
1179
|
+
clearTimeout(inflight.timer);
|
|
1180
|
+
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1181
|
+
this.inflight.delete(id);
|
|
1182
|
+
return inflight;
|
|
1183
|
+
}
|
|
1184
|
+
markIdle(workerIndex) {
|
|
1185
|
+
const slot = this.workers[workerIndex];
|
|
1186
|
+
if (!slot)
|
|
1187
|
+
return;
|
|
1188
|
+
slot.busy = false;
|
|
1189
|
+
slot.currentTaskId = null;
|
|
1190
|
+
}
|
|
1236
1191
|
failTask(id, error) {
|
|
1237
1192
|
const inflight = this.takeInflight(id);
|
|
1238
1193
|
if (!inflight)
|
|
1239
1194
|
return;
|
|
1240
1195
|
inflight.reject(error);
|
|
1241
|
-
this.
|
|
1242
|
-
}
|
|
1243
|
-
handleAbortSignal(id, url, reject) {
|
|
1244
|
-
if (this.rejectIfClosed(reject))
|
|
1245
|
-
return;
|
|
1246
|
-
const inflight = this.inflight.get(id);
|
|
1247
|
-
if (inflight) {
|
|
1248
|
-
this.abortInflightTask(id, url, inflight.workerIndex);
|
|
1249
|
-
return;
|
|
1250
|
-
}
|
|
1251
|
-
this.abortQueuedTask(id, url, reject);
|
|
1252
|
-
}
|
|
1253
|
-
createPendingTask(html, url, options, resolve, reject) {
|
|
1254
|
-
const id = randomUUID();
|
|
1255
|
-
let abortListener;
|
|
1256
|
-
if (options.signal) {
|
|
1257
|
-
abortListener = () => {
|
|
1258
|
-
this.handleAbortSignal(id, url, reject);
|
|
1259
|
-
};
|
|
1260
|
-
options.signal.addEventListener('abort', abortListener, { once: true });
|
|
1261
|
-
}
|
|
1262
|
-
return {
|
|
1263
|
-
id,
|
|
1264
|
-
html,
|
|
1265
|
-
url,
|
|
1266
|
-
includeMetadata: options.includeMetadata,
|
|
1267
|
-
signal: options.signal,
|
|
1268
|
-
abortListener,
|
|
1269
|
-
resolve,
|
|
1270
|
-
reject,
|
|
1271
|
-
};
|
|
1196
|
+
this.markIdle(inflight.workerIndex);
|
|
1272
1197
|
}
|
|
1273
|
-
async transform(html, url, options) {
|
|
1274
|
-
this.ensureOpen();
|
|
1275
|
-
this.ensureNotAborted(options.signal, url, 'transform:enqueue');
|
|
1276
|
-
this.ensureQueueCapacity(url);
|
|
1277
|
-
return new Promise((resolve, reject) => {
|
|
1278
|
-
const task = this.createPendingTask(html, url, options, resolve, reject);
|
|
1279
|
-
this.queue.push(task);
|
|
1280
|
-
this.drainQueue();
|
|
1281
|
-
});
|
|
1282
|
-
}
|
|
1283
|
-
/** Scale capacity up if queue pressure exceeds threshold. */
|
|
1284
1198
|
maybeScaleUp() {
|
|
1285
1199
|
if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
|
|
1286
1200
|
this.capacity < this.maxCapacity) {
|
|
@@ -1288,16 +1202,13 @@ class WorkerPool {
|
|
|
1288
1202
|
}
|
|
1289
1203
|
}
|
|
1290
1204
|
drainQueue() {
|
|
1291
|
-
if (this.closed)
|
|
1292
|
-
return;
|
|
1293
|
-
if (this.queue.length === 0)
|
|
1205
|
+
if (this.closed || this.queue.length === 0)
|
|
1294
1206
|
return;
|
|
1295
1207
|
this.maybeScaleUp();
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
const slot = this.workers[workerIndex];
|
|
1208
|
+
for (let i = 0; i < this.workers.length; i += 1) {
|
|
1209
|
+
const slot = this.workers[i];
|
|
1299
1210
|
if (slot && !slot.busy) {
|
|
1300
|
-
this.
|
|
1211
|
+
this.dispatchFromQueue(i, slot);
|
|
1301
1212
|
if (this.queue.length === 0)
|
|
1302
1213
|
return;
|
|
1303
1214
|
}
|
|
@@ -1306,7 +1217,7 @@ class WorkerPool {
|
|
|
1306
1217
|
const workerIndex = this.workers.length;
|
|
1307
1218
|
const slot = this.spawnWorker(workerIndex);
|
|
1308
1219
|
this.workers.push(slot);
|
|
1309
|
-
this.
|
|
1220
|
+
this.dispatchFromQueue(workerIndex, slot);
|
|
1310
1221
|
if (this.workers.length < this.capacity && this.queue.length > 0) {
|
|
1311
1222
|
setImmediate(() => {
|
|
1312
1223
|
this.drainQueue();
|
|
@@ -1314,39 +1225,28 @@ class WorkerPool {
|
|
|
1314
1225
|
}
|
|
1315
1226
|
}
|
|
1316
1227
|
}
|
|
1317
|
-
|
|
1228
|
+
dispatchFromQueue(workerIndex, slot) {
|
|
1318
1229
|
const task = this.queue.shift();
|
|
1319
1230
|
if (!task)
|
|
1320
1231
|
return;
|
|
1321
|
-
this.
|
|
1322
|
-
|
|
1323
|
-
dispatch(workerIndex, slot, task) {
|
|
1324
|
-
if (this.rejectIfAborted(task))
|
|
1232
|
+
if (this.closed) {
|
|
1233
|
+
task.reject(new Error('Transform worker pool closed'));
|
|
1325
1234
|
return;
|
|
1326
|
-
this.markSlotBusy(slot, task);
|
|
1327
|
-
const timer = this.startTaskTimer(workerIndex, slot, task);
|
|
1328
|
-
this.registerInflightTask(task, timer, workerIndex);
|
|
1329
|
-
try {
|
|
1330
|
-
this.sendTransformMessage(slot, task);
|
|
1331
1235
|
}
|
|
1332
|
-
|
|
1333
|
-
this.
|
|
1236
|
+
if (task.signal?.aborted) {
|
|
1237
|
+
this.clearAbortListener(task.signal, task.abortListener);
|
|
1238
|
+
task.reject(abortPolicy.createAbortError(task.url, 'transform:dispatch'));
|
|
1239
|
+
return;
|
|
1334
1240
|
}
|
|
1335
|
-
}
|
|
1336
|
-
rejectIfAborted(task) {
|
|
1337
|
-
if (!task.signal?.aborted)
|
|
1338
|
-
return false;
|
|
1339
|
-
this.clearAbortListener(task.signal, task.abortListener);
|
|
1340
|
-
task.reject(this.createAbortError(task.url, 'transform:dispatch'));
|
|
1341
|
-
return true;
|
|
1342
|
-
}
|
|
1343
|
-
markSlotBusy(slot, task) {
|
|
1344
1241
|
slot.busy = true;
|
|
1345
1242
|
slot.currentTaskId = task.id;
|
|
1346
|
-
}
|
|
1347
|
-
startTaskTimer(workerIndex, slot, task) {
|
|
1348
1243
|
const timer = setTimeout(() => {
|
|
1349
|
-
|
|
1244
|
+
try {
|
|
1245
|
+
slot.worker.postMessage({ type: 'cancel', id: task.id });
|
|
1246
|
+
}
|
|
1247
|
+
catch {
|
|
1248
|
+
/* ignore */
|
|
1249
|
+
}
|
|
1350
1250
|
const inflight = this.takeInflight(task.id);
|
|
1351
1251
|
if (!inflight)
|
|
1352
1252
|
return;
|
|
@@ -1357,9 +1257,6 @@ class WorkerPool {
|
|
|
1357
1257
|
this.restartWorker(workerIndex, slot);
|
|
1358
1258
|
}, this.timeoutMs);
|
|
1359
1259
|
timer.unref();
|
|
1360
|
-
return timer;
|
|
1361
|
-
}
|
|
1362
|
-
registerInflightTask(task, timer, workerIndex) {
|
|
1363
1260
|
this.inflight.set(task.id, {
|
|
1364
1261
|
resolve: task.resolve,
|
|
1365
1262
|
reject: task.reject,
|
|
@@ -1368,58 +1265,56 @@ class WorkerPool {
|
|
|
1368
1265
|
abortListener: task.abortListener,
|
|
1369
1266
|
workerIndex,
|
|
1370
1267
|
});
|
|
1268
|
+
try {
|
|
1269
|
+
slot.worker.postMessage({
|
|
1270
|
+
type: 'transform',
|
|
1271
|
+
id: task.id,
|
|
1272
|
+
html: task.html,
|
|
1273
|
+
url: task.url,
|
|
1274
|
+
includeMetadata: task.includeMetadata,
|
|
1275
|
+
});
|
|
1276
|
+
}
|
|
1277
|
+
catch (error) {
|
|
1278
|
+
clearTimeout(timer);
|
|
1279
|
+
this.clearAbortListener(task.signal, task.abortListener);
|
|
1280
|
+
this.inflight.delete(task.id);
|
|
1281
|
+
this.markIdle(workerIndex);
|
|
1282
|
+
task.reject(error instanceof Error
|
|
1283
|
+
? error
|
|
1284
|
+
: new Error('Failed to dispatch transform worker message'));
|
|
1285
|
+
this.restartWorker(workerIndex, slot);
|
|
1286
|
+
}
|
|
1371
1287
|
}
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
includeMetadata: task.includeMetadata,
|
|
1379
|
-
});
|
|
1380
|
-
}
|
|
1381
|
-
handleDispatchFailure(workerIndex, slot, task, timer, error) {
|
|
1382
|
-
clearTimeout(timer);
|
|
1383
|
-
this.clearAbortListener(task.signal, task.abortListener);
|
|
1384
|
-
this.inflight.delete(task.id);
|
|
1385
|
-
this.markSlotIdle(workerIndex);
|
|
1386
|
-
const message = error instanceof Error
|
|
1387
|
-
? error
|
|
1388
|
-
: new Error('Failed to dispatch transform worker message');
|
|
1389
|
-
task.reject(message);
|
|
1390
|
-
this.restartWorker(workerIndex, slot);
|
|
1391
|
-
}
|
|
1392
|
-
getQueueDepth() {
|
|
1393
|
-
return this.queue.length;
|
|
1394
|
-
}
|
|
1395
|
-
getActiveWorkers() {
|
|
1396
|
-
return this.workers.filter((s) => s?.busy).length;
|
|
1288
|
+
}
|
|
1289
|
+
class TransformWorkerPoolManager {
|
|
1290
|
+
pool = null;
|
|
1291
|
+
getOrCreate() {
|
|
1292
|
+
this.pool ??= new WorkerPool(POOL_MIN_WORKERS, DEFAULT_TIMEOUT_MS);
|
|
1293
|
+
return this.pool;
|
|
1397
1294
|
}
|
|
1398
|
-
|
|
1399
|
-
|
|
1295
|
+
getStats() {
|
|
1296
|
+
if (!this.pool)
|
|
1297
|
+
return null;
|
|
1298
|
+
return {
|
|
1299
|
+
queueDepth: this.pool.getQueueDepth(),
|
|
1300
|
+
activeWorkers: this.pool.getActiveWorkers(),
|
|
1301
|
+
capacity: this.pool.getCapacity(),
|
|
1302
|
+
};
|
|
1400
1303
|
}
|
|
1401
|
-
async
|
|
1402
|
-
if (this.
|
|
1304
|
+
async shutdown() {
|
|
1305
|
+
if (!this.pool)
|
|
1403
1306
|
return;
|
|
1404
|
-
this.
|
|
1405
|
-
|
|
1406
|
-
.map((slot) => slot?.worker.terminate())
|
|
1407
|
-
.filter((p) => p !== undefined);
|
|
1408
|
-
this.workers.fill(undefined);
|
|
1409
|
-
this.workers.length = 0;
|
|
1410
|
-
for (const [id, inflight] of this.inflight.entries()) {
|
|
1411
|
-
clearTimeout(inflight.timer);
|
|
1412
|
-
this.clearAbortListener(inflight.signal, inflight.abortListener);
|
|
1413
|
-
inflight.reject(new Error('Transform worker pool closed'));
|
|
1414
|
-
this.inflight.delete(id);
|
|
1415
|
-
}
|
|
1416
|
-
for (const task of this.queue) {
|
|
1417
|
-
task.reject(new Error('Transform worker pool closed'));
|
|
1418
|
-
}
|
|
1419
|
-
this.queue.length = 0;
|
|
1420
|
-
await Promise.allSettled(terminations);
|
|
1307
|
+
await this.pool.close();
|
|
1308
|
+
this.pool = null;
|
|
1421
1309
|
}
|
|
1422
1310
|
}
|
|
1311
|
+
const poolManager = new TransformWorkerPoolManager();
|
|
1312
|
+
export function getTransformPoolStats() {
|
|
1313
|
+
return poolManager.getStats();
|
|
1314
|
+
}
|
|
1315
|
+
export async function shutdownTransformWorkerPool() {
|
|
1316
|
+
await poolManager.shutdown();
|
|
1317
|
+
}
|
|
1423
1318
|
function buildWorkerTransformOptions(options) {
|
|
1424
1319
|
return {
|
|
1425
1320
|
includeMetadata: options.includeMetadata,
|
|
@@ -1427,30 +1322,37 @@ function buildWorkerTransformOptions(options) {
|
|
|
1427
1322
|
};
|
|
1428
1323
|
}
|
|
1429
1324
|
async function transformWithWorkerPool(html, url, options) {
|
|
1430
|
-
const
|
|
1431
|
-
return
|
|
1325
|
+
const pool = poolManager.getOrCreate();
|
|
1326
|
+
return pool.transform(html, url, buildWorkerTransformOptions(options));
|
|
1432
1327
|
}
|
|
1433
1328
|
function resolveWorkerFallback(error, html, url, options) {
|
|
1434
|
-
if (error instanceof FetchError)
|
|
1329
|
+
if (error instanceof FetchError)
|
|
1435
1330
|
throw error;
|
|
1436
|
-
|
|
1437
|
-
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1331
|
+
abortPolicy.throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1438
1332
|
return transformHtmlToMarkdownInProcess(html, url, options);
|
|
1439
1333
|
}
|
|
1440
1334
|
export async function transformHtmlToMarkdown(html, url, options) {
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1335
|
+
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1336
|
+
let success = false;
|
|
1337
|
+
try {
|
|
1338
|
+
abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
|
|
1339
|
+
const workerStage = stageTracker.start(url, 'transform:worker');
|
|
1444
1340
|
try {
|
|
1445
1341
|
const result = await transformWithWorkerPool(html, url, options);
|
|
1342
|
+
success = true;
|
|
1446
1343
|
return result;
|
|
1447
1344
|
}
|
|
1448
1345
|
catch (error) {
|
|
1449
1346
|
const fallback = resolveWorkerFallback(error, html, url, options);
|
|
1347
|
+
success = true;
|
|
1450
1348
|
return fallback;
|
|
1451
1349
|
}
|
|
1452
1350
|
finally {
|
|
1453
|
-
|
|
1351
|
+
stageTracker.end(workerStage);
|
|
1454
1352
|
}
|
|
1455
|
-
}
|
|
1353
|
+
}
|
|
1354
|
+
finally {
|
|
1355
|
+
if (success)
|
|
1356
|
+
stageTracker.end(totalStage, { truncated: false });
|
|
1357
|
+
}
|
|
1456
1358
|
}
|