@j0hanz/superfetch 2.4.3 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -14,133 +14,158 @@ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './langua
14
14
  import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isLikelyHtmlContent, isRawTextContent, } from './markdown-cleanup.js';
15
15
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
16
16
  import { isObject } from './type-guards.js';
17
- function getAbortReason(signal) {
18
- if (!isObject(signal))
19
- return undefined;
20
- return 'reason' in signal ? signal.reason : undefined;
21
- }
22
- // DOM accessor helpers moved to ./dom-noise-removal.ts
23
- const CODE_BLOCK = {
24
- fence: '```',
25
- format: (code, language = '') => {
26
- return `\`\`\`${language}\n${code}\n\`\`\``;
27
- },
28
- };
29
- const transformChannel = diagnosticsChannel.channel('superfetch.transform');
30
- const LOG_URL_MAX = 80;
31
- function truncateUrlForLog(url) {
32
- return url.substring(0, LOG_URL_MAX);
33
- }
34
- function publishTransformEvent(event) {
35
- if (!transformChannel.hasSubscribers)
36
- return;
37
- try {
38
- transformChannel.publish(event);
17
+ /* -------------------------------------------------------------------------------------------------
18
+ * Abort policy (single source of truth)
19
+ * ------------------------------------------------------------------------------------------------- */
20
+ class AbortPolicy {
21
+ getAbortReason(signal) {
22
+ if (!isObject(signal))
23
+ return undefined;
24
+ return 'reason' in signal
25
+ ? signal.reason
26
+ : undefined;
39
27
  }
40
- catch {
41
- /* empty */
28
+ isTimeoutReason(reason) {
29
+ return reason instanceof Error && reason.name === 'TimeoutError';
30
+ }
31
+ throwIfAborted(signal, url, stage) {
32
+ if (!signal?.aborted)
33
+ return;
34
+ const reason = this.getAbortReason(signal);
35
+ if (this.isTimeoutReason(reason)) {
36
+ throw new FetchError('Request timeout', url, 504, {
37
+ reason: 'timeout',
38
+ stage,
39
+ });
40
+ }
41
+ throw new FetchError('Request was canceled', url, 499, {
42
+ reason: 'aborted',
43
+ stage,
44
+ });
45
+ }
46
+ createAbortError(url, stage) {
47
+ return new FetchError('Request was canceled', url, 499, {
48
+ reason: 'aborted',
49
+ stage,
50
+ });
42
51
  }
43
52
  }
44
- export function startTransformStage(url, stage, budget) {
45
- if (!transformChannel.hasSubscribers && !budget)
46
- return null;
47
- const remainingBudgetMs = budget
48
- ? budget.totalBudgetMs - budget.elapsedMs
49
- : undefined;
50
- const base = {
51
- stage,
52
- startTime: performance.now(),
53
- url: redactUrl(url),
54
- };
55
- if (remainingBudgetMs !== undefined && budget) {
56
- return {
57
- ...base,
58
- budgetMs: remainingBudgetMs,
59
- totalBudgetMs: budget.totalBudgetMs,
53
+ const abortPolicy = new AbortPolicy();
54
+ /* -------------------------------------------------------------------------------------------------
55
+ * Stage tracking & diagnostics
56
+ * ------------------------------------------------------------------------------------------------- */
57
+ class StageTracker {
58
+ channel = diagnosticsChannel.channel('superfetch.transform');
59
+ start(url, stage, budget) {
60
+ if (!this.channel.hasSubscribers && !budget)
61
+ return null;
62
+ const remainingBudgetMs = budget
63
+ ? budget.totalBudgetMs - budget.elapsedMs
64
+ : undefined;
65
+ const base = {
66
+ stage,
67
+ startTime: performance.now(),
68
+ url: redactUrl(url),
60
69
  };
70
+ if (remainingBudgetMs !== undefined && budget) {
71
+ return {
72
+ ...base,
73
+ budgetMs: remainingBudgetMs,
74
+ totalBudgetMs: budget.totalBudgetMs,
75
+ };
76
+ }
77
+ return base;
78
+ }
79
+ end(context, options) {
80
+ if (!context)
81
+ return 0;
82
+ const durationMs = performance.now() - context.startTime;
83
+ const requestId = getRequestId();
84
+ const operationId = getOperationId();
85
+ if (context.totalBudgetMs !== undefined) {
86
+ const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
87
+ if (durationMs > warnThresholdMs) {
88
+ logWarn('Transform stage exceeded warning threshold', {
89
+ stage: context.stage,
90
+ durationMs: Math.round(durationMs),
91
+ thresholdMs: Math.round(warnThresholdMs),
92
+ url: context.url,
93
+ });
94
+ }
95
+ }
96
+ const event = {
97
+ v: 1,
98
+ type: 'stage',
99
+ stage: context.stage,
100
+ durationMs,
101
+ url: context.url,
102
+ ...(requestId ? { requestId } : {}),
103
+ ...(operationId ? { operationId } : {}),
104
+ ...(options?.truncated !== undefined
105
+ ? { truncated: options.truncated }
106
+ : {}),
107
+ };
108
+ this.publish(event);
109
+ return durationMs;
61
110
  }
62
- return base;
63
- }
64
- export function endTransformStage(context, options) {
65
- if (!context)
66
- return 0;
67
- const durationMs = performance.now() - context.startTime;
68
- const requestId = getRequestId();
69
- const operationId = getOperationId();
70
- if (context.totalBudgetMs !== undefined) {
71
- const warnThresholdMs = context.totalBudgetMs * config.transform.stageWarnRatio;
72
- if (durationMs > warnThresholdMs) {
73
- logWarn('Transform stage exceeded warning threshold', {
74
- stage: context.stage,
75
- durationMs: Math.round(durationMs),
76
- thresholdMs: Math.round(warnThresholdMs),
77
- url: context.url,
111
+ run(url, stage, fn, budget) {
112
+ if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
113
+ throw new FetchError('Transform budget exhausted', url, 504, {
114
+ reason: 'timeout',
115
+ stage: `${stage}:budget_exhausted`,
116
+ elapsedMs: budget.elapsedMs,
117
+ totalBudgetMs: budget.totalBudgetMs,
78
118
  });
79
119
  }
120
+ const ctx = this.start(url, stage, budget);
121
+ try {
122
+ return fn();
123
+ }
124
+ finally {
125
+ this.end(ctx);
126
+ }
80
127
  }
81
- const event = {
82
- v: 1,
83
- type: 'stage',
84
- stage: context.stage,
85
- durationMs,
86
- url: context.url,
87
- ...(requestId ? { requestId } : {}),
88
- ...(operationId ? { operationId } : {}),
89
- ...(options?.truncated !== undefined
90
- ? { truncated: options.truncated }
91
- : {}),
92
- };
93
- publishTransformEvent(event);
94
- return durationMs;
95
- }
96
- function runTransformStage(url, stage, fn, budget) {
97
- if (budget && budget.elapsedMs >= budget.totalBudgetMs) {
98
- throw new FetchError('Transform budget exhausted', url, 504, {
99
- reason: 'timeout',
100
- stage: `${stage}:budget_exhausted`,
101
- elapsedMs: budget.elapsedMs,
102
- totalBudgetMs: budget.totalBudgetMs,
103
- });
104
- }
105
- const context = startTransformStage(url, stage, budget);
106
- try {
107
- return fn();
128
+ async runAsync(url, stage, fn) {
129
+ const ctx = this.start(url, stage);
130
+ try {
131
+ return await fn();
132
+ }
133
+ finally {
134
+ this.end(ctx);
135
+ }
108
136
  }
109
- finally {
110
- endTransformStage(context);
137
+ publish(event) {
138
+ if (!this.channel.hasSubscribers)
139
+ return;
140
+ try {
141
+ this.channel.publish(event);
142
+ }
143
+ catch {
144
+ // Intentionally ignore diagnostics failures
145
+ }
111
146
  }
112
147
  }
113
- function isTimeoutReason(reason) {
114
- return reason instanceof Error && reason.name === 'TimeoutError';
148
+ const stageTracker = new StageTracker();
149
+ /** Backwards-compatible exports */
150
+ export function startTransformStage(url, stage, budget) {
151
+ return stageTracker.start(url, stage, budget);
115
152
  }
116
- function throwIfAborted(signal, url, stage) {
117
- if (!signal)
118
- return;
119
- const { aborted } = signal;
120
- if (!aborted)
121
- return;
122
- const reason = getAbortReason(signal);
123
- if (isTimeoutReason(reason)) {
124
- throw new FetchError('Request timeout', url, 504, {
125
- reason: 'timeout',
126
- stage,
127
- });
128
- }
129
- throw new FetchError('Request was canceled', url, 499, {
130
- reason: 'aborted',
131
- stage,
132
- });
153
+ export function endTransformStage(context, options) {
154
+ return stageTracker.end(context, options);
133
155
  }
156
+ /* -------------------------------------------------------------------------------------------------
157
+ * HTML size guard
158
+ * ------------------------------------------------------------------------------------------------- */
134
159
  function truncateHtml(html) {
135
160
  const maxSize = config.constants.maxHtmlSize;
136
161
  if (html.length <= maxSize) {
137
- return html;
162
+ return { html, truncated: false };
138
163
  }
139
164
  logWarn('HTML content exceeds maximum size, truncating', {
140
165
  size: html.length,
141
166
  maxSize,
142
167
  });
143
- return html.substring(0, maxSize);
168
+ return { html: html.substring(0, maxSize), truncated: true };
144
169
  }
145
170
  const META_PROPERTY_HANDLERS = new Map([
146
171
  [
@@ -200,162 +225,109 @@ const META_NAME_HANDLERS = new Map([
200
225
  },
201
226
  ],
202
227
  ]);
203
- function extractMetadata(document) {
204
- const ctx = {
205
- title: {},
206
- description: {},
207
- };
208
- for (const tag of document.querySelectorAll('meta')) {
209
- const content = tag.getAttribute('content')?.trim();
210
- if (!content)
211
- continue;
212
- const property = tag.getAttribute('property');
213
- if (property) {
214
- META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
228
+ class MetadataExtractor {
229
+ extract(document) {
230
+ const ctx = { title: {}, description: {} };
231
+ for (const tag of document.querySelectorAll('meta')) {
232
+ const content = tag.getAttribute('content')?.trim();
233
+ if (!content)
234
+ continue;
235
+ const property = tag.getAttribute('property');
236
+ if (property)
237
+ META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
238
+ const name = tag.getAttribute('name');
239
+ if (name)
240
+ META_NAME_HANDLERS.get(name)?.(ctx, content);
215
241
  }
216
- const name = tag.getAttribute('name');
217
- if (name) {
218
- META_NAME_HANDLERS.get(name)?.(ctx, content);
242
+ const titleEl = document.querySelector('title');
243
+ if (!ctx.title.standard && titleEl?.textContent) {
244
+ ctx.title.standard = titleEl.textContent.trim();
219
245
  }
220
- }
221
- const titleEl = document.querySelector('title');
222
- if (!ctx.title.standard && titleEl?.textContent) {
223
- ctx.title.standard = titleEl.textContent.trim();
224
- }
225
- const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
226
- const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
227
- const metadata = {};
228
- if (resolvedTitle)
229
- metadata.title = resolvedTitle;
230
- if (resolvedDesc)
231
- metadata.description = resolvedDesc;
232
- if (ctx.author)
233
- metadata.author = ctx.author;
234
- if (ctx.image)
235
- metadata.image = ctx.image;
236
- if (ctx.publishedAt)
237
- metadata.publishedAt = ctx.publishedAt;
238
- if (ctx.modifiedAt)
239
- metadata.modifiedAt = ctx.modifiedAt;
240
- return metadata;
241
- }
246
+ const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
247
+ const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
248
+ const metadata = {};
249
+ if (resolvedTitle)
250
+ metadata.title = resolvedTitle;
251
+ if (resolvedDesc)
252
+ metadata.description = resolvedDesc;
253
+ if (ctx.author)
254
+ metadata.author = ctx.author;
255
+ if (ctx.image)
256
+ metadata.image = ctx.image;
257
+ if (ctx.publishedAt)
258
+ metadata.publishedAt = ctx.publishedAt;
259
+ if (ctx.modifiedAt)
260
+ metadata.modifiedAt = ctx.modifiedAt;
261
+ return metadata;
262
+ }
263
+ }
264
+ const metadataExtractor = new MetadataExtractor();
265
+ /* -------------------------------------------------------------------------------------------------
266
+ * Article extraction (Readability)
267
+ * ------------------------------------------------------------------------------------------------- */
242
268
  function isReadabilityCompatible(doc) {
243
269
  if (!isObject(doc))
244
270
  return false;
245
- return hasDocumentElement(doc) && hasQuerySelectors(doc);
246
- }
247
- function hasDocumentElement(record) {
248
- return 'documentElement' in record;
249
- }
250
- function hasQuerySelectors(record) {
251
- return (typeof record.querySelectorAll === 'function' &&
271
+ const record = doc;
272
+ return ('documentElement' in record &&
273
+ typeof record.querySelectorAll ===
274
+ 'function' &&
252
275
  typeof record.querySelector === 'function');
253
276
  }
254
- function extractArticle(document) {
255
- if (!isReadabilityCompatible(document)) {
256
- logWarn('Document not compatible with Readability');
257
- return null;
258
- }
259
- try {
260
- const doc = document;
261
- const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
262
- const textLength = rawText.replace(/\s+/g, ' ').trim().length;
263
- if (textLength < 100) {
264
- logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
265
- 'This might be a client-side rendered (SPA) application. ' +
266
- 'Content extraction may be incomplete.', { textLength });
267
- }
268
- if (textLength >= 400 && !isProbablyReaderable(doc)) {
277
+ class ArticleExtractor {
278
+ extract(document) {
279
+ if (!isReadabilityCompatible(document)) {
280
+ logWarn('Document not compatible with Readability');
269
281
  return null;
270
282
  }
271
- const reader = new Readability(doc, { maxElemsToParse: 20_000 });
272
- const parsed = reader.parse();
273
- if (!parsed)
283
+ try {
284
+ const doc = document;
285
+ const rawText = doc.querySelector('body')?.textContent ??
286
+ doc.documentElement.textContent;
287
+ const textLength = rawText.replace(/\s+/g, ' ').trim().length;
288
+ if (textLength < 100) {
289
+ logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
290
+ 'This might be a client-side rendered (SPA) application. ' +
291
+ 'Content extraction may be incomplete.', { textLength });
292
+ }
293
+ if (textLength >= 400 && !isProbablyReaderable(doc)) {
294
+ return null;
295
+ }
296
+ const reader = new Readability(doc, { maxElemsToParse: 20_000 });
297
+ const parsed = reader.parse();
298
+ if (!parsed)
299
+ return null;
300
+ return {
301
+ content: parsed.content ?? '',
302
+ textContent: parsed.textContent ?? '',
303
+ ...(parsed.title != null && { title: parsed.title }),
304
+ ...(parsed.byline != null && { byline: parsed.byline }),
305
+ ...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
306
+ ...(parsed.siteName != null && { siteName: parsed.siteName }),
307
+ };
308
+ }
309
+ catch (error) {
310
+ logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
274
311
  return null;
275
- return {
276
- content: parsed.content ?? '',
277
- textContent: parsed.textContent ?? '',
278
- ...(parsed.title != null && { title: parsed.title }),
279
- ...(parsed.byline != null && { byline: parsed.byline }),
280
- ...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
281
- ...(parsed.siteName != null && { siteName: parsed.siteName }),
282
- };
283
- }
284
- catch (error) {
285
- logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
286
- return null;
287
- }
288
- }
289
- export function extractContent(html, url, options = {
290
- extractArticle: true,
291
- }) {
292
- const result = extractContentWithDocument(html, url, options);
293
- return { article: result.article, metadata: result.metadata };
294
- }
295
- function extractContentWithDocument(html, url, options) {
296
- if (!isValidInput(html, url)) {
297
- const { document } = parseHTML('<html></html>');
298
- return { article: null, metadata: {}, document };
299
- }
300
- return tryExtractContent(html, url, options);
301
- }
302
- function extractArticleWithStage(document, url, shouldExtract) {
303
- if (!shouldExtract)
304
- return null;
305
- return runTransformStage(url, 'extract:article', () => resolveArticleExtraction(document, shouldExtract));
306
- }
307
- function handleExtractionFailure(error, url, signal) {
308
- if (error instanceof FetchError) {
309
- throw error;
310
- }
311
- throwIfAborted(signal, url, 'extract:error');
312
- logError('Failed to extract content', error instanceof Error ? error : undefined);
313
- const { document } = parseHTML('<html></html>');
314
- return { article: null, metadata: {}, document };
315
- }
316
- function extractContentStages(html, url, options) {
317
- throwIfAborted(options.signal, url, 'extract:begin');
318
- const truncatedHtml = truncateHtml(html);
319
- const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
320
- throwIfAborted(options.signal, url, 'extract:parsed');
321
- applyBaseUri(document, url);
322
- const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
323
- throwIfAborted(options.signal, url, 'extract:metadata');
324
- const article = extractArticleWithStage(document, url, options.extractArticle);
325
- throwIfAborted(options.signal, url, 'extract:article');
326
- return {
327
- article,
328
- metadata,
329
- document,
330
- ...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
331
- };
332
- }
333
- function tryExtractContent(html, url, options) {
334
- try {
335
- return extractContentStages(html, url, options);
336
- }
337
- catch (error) {
338
- return handleExtractionFailure(error, url, options.signal);
312
+ }
339
313
  }
340
314
  }
341
- function isValidInput(html, url) {
342
- return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
343
- }
315
+ const articleExtractor = new ArticleExtractor();
316
+ /* -------------------------------------------------------------------------------------------------
317
+ * Content extraction orchestration
318
+ * ------------------------------------------------------------------------------------------------- */
344
319
  function validateRequiredString(value, message) {
345
320
  if (typeof value === 'string' && value.length > 0)
346
321
  return true;
347
322
  logWarn(message);
348
323
  return false;
349
324
  }
350
- function resolveArticleExtraction(document, shouldExtract) {
351
- return shouldExtract ? extractArticle(document) : null;
325
+ function isValidInput(html, url) {
326
+ return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
352
327
  }
353
328
  function applyBaseUri(document, url) {
354
329
  try {
355
- Object.defineProperty(document, 'baseURI', {
356
- value: url,
357
- writable: true,
358
- });
330
+ Object.defineProperty(document, 'baseURI', { value: url, writable: true });
359
331
  }
360
332
  catch (error) {
361
333
  logInfo('Failed to set baseURI (non-critical)', {
@@ -364,13 +336,62 @@ function applyBaseUri(document, url) {
364
336
  });
365
337
  }
366
338
  }
339
+ class ContentExtractor {
340
+ extract(html, url, options) {
341
+ if (!isValidInput(html, url)) {
342
+ const { document } = parseHTML('<html></html>');
343
+ return { article: null, metadata: {}, document };
344
+ }
345
+ try {
346
+ abortPolicy.throwIfAborted(options.signal, url, 'extract:begin');
347
+ const { html: limitedHtml, truncated } = truncateHtml(html);
348
+ const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
349
+ abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
350
+ applyBaseUri(document, url);
351
+ const metadata = stageTracker.run(url, 'extract:metadata', () => metadataExtractor.extract(document));
352
+ abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
353
+ const article = options.extractArticle
354
+ ? stageTracker.run(url, 'extract:article', () => articleExtractor.extract(document))
355
+ : null;
356
+ abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
357
+ return {
358
+ article,
359
+ metadata,
360
+ document,
361
+ ...(truncated ? { truncated: true } : {}),
362
+ };
363
+ }
364
+ catch (error) {
365
+ if (error instanceof FetchError)
366
+ throw error;
367
+ abortPolicy.throwIfAborted(options.signal, url, 'extract:error');
368
+ logError('Failed to extract content', error instanceof Error ? error : undefined);
369
+ const { document } = parseHTML('<html></html>');
370
+ return { article: null, metadata: {}, document };
371
+ }
372
+ }
373
+ }
374
+ const contentExtractor = new ContentExtractor();
375
+ /** Backwards-compatible export */
376
+ export function extractContent(html, url, options = {
377
+ extractArticle: true,
378
+ }) {
379
+ const result = contentExtractor.extract(html, url, options);
380
+ return { article: result.article, metadata: result.metadata };
381
+ }
382
+ /* -------------------------------------------------------------------------------------------------
383
+ * Markdown conversion
384
+ * ------------------------------------------------------------------------------------------------- */
385
+ const CODE_BLOCK = {
386
+ fence: '```',
387
+ format: (code, language = '') => `\`\`\`${language}\n${code}\n\`\`\``,
388
+ };
367
389
  function buildInlineCode(content) {
368
390
  let maxBackticks = 0;
369
391
  let currentRun = 0;
370
392
  for (const char of content) {
371
- if (char === '`') {
372
- currentRun++;
373
- }
393
+ if (char === '`')
394
+ currentRun += 1;
374
395
  else {
375
396
  if (currentRun > maxBackticks)
376
397
  maxBackticks = currentRun;
@@ -402,21 +423,25 @@ function deriveAltFromImageUrl(src) {
402
423
  return '';
403
424
  }
404
425
  }
426
+ function hasGetAttribute(value) {
427
+ return (isObject(value) &&
428
+ typeof value.getAttribute === 'function');
429
+ }
405
430
  function isCodeBlock(parent) {
406
431
  if (!isObject(parent))
407
432
  return false;
408
- const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
433
+ const tagName = typeof parent.tagName === 'string'
434
+ ? parent.tagName.toUpperCase()
435
+ : '';
409
436
  return ['PRE', 'WRAPPED-PRE'].includes(tagName);
410
437
  }
411
- function hasGetAttribute(value) {
412
- return isObject(value) && typeof value.getAttribute === 'function';
413
- }
414
- function buildInlineCodeTranslator() {
415
- return {
416
- spaceIfRepeatingChar: true,
417
- noEscape: true,
418
- postprocess: ({ content }) => buildInlineCode(content),
419
- };
438
+ function isAnchor(node) {
439
+ if (!isObject(node))
440
+ return false;
441
+ const tagName = typeof node.tagName === 'string'
442
+ ? node.tagName.toUpperCase()
443
+ : '';
444
+ return tagName === 'A';
420
445
  }
421
446
  function resolveAttributeLanguage(node) {
422
447
  const getAttribute = hasGetAttribute(node)
@@ -426,46 +451,20 @@ function resolveAttributeLanguage(node) {
426
451
  const dataLanguage = getAttribute?.('data-language') ?? '';
427
452
  return resolveLanguageFromAttributes(className, dataLanguage);
428
453
  }
429
- function buildCodeTranslator(ctx) {
430
- if (!isObject(ctx))
431
- return buildInlineCodeTranslator();
432
- const { parent } = ctx;
433
- if (!isCodeBlock(parent))
434
- return buildInlineCodeTranslator();
435
- return {
436
- noEscape: true,
437
- preserveWhitespace: true,
438
- };
439
- }
440
- function buildImageTranslator(ctx) {
441
- if (!isObject(ctx))
442
- return { content: '' };
443
- const { node } = ctx;
444
- const getAttribute = hasGetAttribute(node)
445
- ? node.getAttribute.bind(node)
446
- : undefined;
447
- const src = getAttribute?.('src') ?? '';
448
- const existingAlt = getAttribute?.('alt') ?? '';
449
- const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
450
- return {
451
- content: `![${alt}](${src})`,
452
- };
453
- }
454
454
  function findLanguageFromCodeChild(node) {
455
455
  if (!isObject(node))
456
456
  return undefined;
457
- const { childNodes } = node;
458
- if (!Array.isArray(childNodes))
459
- return undefined;
457
+ const childNodes = Array.isArray(node.childNodes)
458
+ ? node.childNodes
459
+ : [];
460
460
  for (const child of childNodes) {
461
461
  if (!isObject(child))
462
462
  continue;
463
463
  const tagName = typeof child.rawTagName === 'string'
464
464
  ? child.rawTagName.toUpperCase()
465
465
  : '';
466
- if (tagName === 'CODE') {
466
+ if (tagName === 'CODE')
467
467
  return resolveAttributeLanguage(child);
468
- }
469
468
  }
470
469
  return undefined;
471
470
  }
@@ -478,6 +477,37 @@ function createCodeBlockPostprocessor(language) {
478
477
  return CODE_BLOCK.format(trimmed, resolvedLanguage);
479
478
  };
480
479
  }
480
+ function buildInlineCodeTranslator() {
481
+ return {
482
+ spaceIfRepeatingChar: true,
483
+ noEscape: true,
484
+ postprocess: ({ content }) => buildInlineCode(content),
485
+ };
486
+ }
487
+ function buildCodeTranslator(ctx) {
488
+ if (!isObject(ctx))
489
+ return buildInlineCodeTranslator();
490
+ const { parent } = ctx;
491
+ if (!isCodeBlock(parent))
492
+ return buildInlineCodeTranslator();
493
+ return { noEscape: true, preserveWhitespace: true };
494
+ }
495
+ function buildImageTranslator(ctx) {
496
+ if (!isObject(ctx))
497
+ return { content: '' };
498
+ const { node, parent } = ctx;
499
+ const getAttribute = hasGetAttribute(node)
500
+ ? node.getAttribute.bind(node)
501
+ : undefined;
502
+ const src = getAttribute?.('src') ?? '';
503
+ const existingAlt = getAttribute?.('alt') ?? '';
504
+ const alt = existingAlt.trim() || deriveAltFromImageUrl(src);
505
+ const markdown = `![${alt}](${src})`;
506
+ if (isAnchor(parent)) {
507
+ return { content: markdown };
508
+ }
509
+ return { content: `\n\n${markdown}\n\n` };
510
+ }
481
511
  function buildPreTranslator(ctx) {
482
512
  if (!isObject(ctx))
483
513
  return {};
@@ -494,10 +524,9 @@ function createCustomTranslators() {
494
524
  code: (ctx) => buildCodeTranslator(ctx),
495
525
  img: (ctx) => buildImageTranslator(ctx),
496
526
  dl: (ctx) => {
497
- if (!isObject(ctx) || !isObject(ctx.node)) {
527
+ if (!isObject(ctx) || !isObject(ctx.node))
498
528
  return { content: '' };
499
- }
500
- const node = ctx.node;
529
+ const { node } = ctx;
501
530
  const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
502
531
  const items = childNodes
503
532
  .map((child) => {
@@ -520,14 +549,15 @@ function createCustomTranslators() {
520
549
  return { content: items ? `\n${items}\n\n` : '' };
521
550
  },
522
551
  div: (ctx) => {
523
- if (!isObject(ctx) || !isObject(ctx.node)) {
552
+ if (!isObject(ctx) || !isObject(ctx.node))
524
553
  return {};
525
- }
526
- const node = ctx.node;
527
- const className = typeof node.attribs?.class === 'string' ? node.attribs.class : '';
528
- if (!className.includes('type')) {
554
+ const { node } = ctx;
555
+ const getAttribute = hasGetAttribute(node)
556
+ ? node.getAttribute.bind(node)
557
+ : undefined;
558
+ const className = getAttribute?.('class') ?? '';
559
+ if (!className.includes('type'))
529
560
  return {};
530
- }
531
561
  return {
532
562
  postprocess: ({ content }) => {
533
563
  const lines = content.split('\n');
@@ -561,37 +591,41 @@ function createCustomTranslators() {
561
591
  sup: () => ({
562
592
  postprocess: ({ content }) => `^${content}^`,
563
593
  }),
564
- // Note: section translator removed in favor of HTML preprocessing
565
- // See preprocessPropertySections() for the fix to TypeDoc section spacing
594
+ section: () => ({
595
+ postprocess: ({ content }) => `\n\n${content}\n\n`,
596
+ }),
566
597
  pre: (ctx) => buildPreTranslator(ctx),
567
598
  };
568
599
  }
569
- let markdownInstance = null;
570
- function createMarkdownInstance() {
571
- return new NodeHtmlMarkdown({
572
- codeFence: CODE_BLOCK.fence,
573
- codeBlockStyle: 'fenced',
574
- emDelimiter: '_',
575
- bulletMarker: '-',
576
- }, createCustomTranslators());
577
- }
578
- function getMarkdownConverter() {
579
- markdownInstance ??= createMarkdownInstance();
580
- return markdownInstance;
600
+ class MarkdownConverter {
601
+ instance = null;
602
+ translate(html) {
603
+ return this.get().translate(html).trim();
604
+ }
605
+ get() {
606
+ this.instance ??= new NodeHtmlMarkdown({
607
+ codeFence: CODE_BLOCK.fence,
608
+ codeBlockStyle: 'fenced',
609
+ emDelimiter: '_',
610
+ bulletMarker: '-',
611
+ }, createCustomTranslators());
612
+ return this.instance;
613
+ }
581
614
  }
615
+ const markdownConverter = new MarkdownConverter();
582
616
  function preprocessPropertySections(html) {
583
- const result = html.replace(/<\/section>\s*(<section[^>]*class="[^"]*tsd-panel[^"]*tsd-member[^"]*"[^>]*>)/g, '</section><p>&nbsp;</p>$1');
584
- return result;
617
+ return html.replace(/<\/section>\s*(<section[^>]*class="[^"]*tsd-member[^"]*"[^>]*>)/g, '</section><p>&nbsp;</p>$1');
585
618
  }
586
- function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
587
- throwIfAborted(signal, url, 'markdown:begin');
619
+ function translateHtmlToMarkdown(params) {
620
+ const { html, url, signal, document, skipNoiseRemoval } = params;
621
+ abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
588
622
  const cleanedHtml = skipNoiseRemoval
589
623
  ? html
590
- : runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
591
- throwIfAborted(signal, url, 'markdown:cleaned');
592
- const preprocessedHtml = runTransformStage(url, 'markdown:preprocess', () => preprocessPropertySections(cleanedHtml));
593
- const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(preprocessedHtml).trim());
594
- throwIfAborted(signal, url, 'markdown:translated');
624
+ : stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
625
+ abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
626
+ const preprocessedHtml = stageTracker.run(url, 'markdown:preprocess', () => preprocessPropertySections(cleanedHtml));
627
+ const content = stageTracker.run(url, 'markdown:translate', () => markdownConverter.translate(preprocessedHtml));
628
+ abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
595
629
  return cleanupMarkdownArtifacts(content);
596
630
  }
597
631
  function appendMetadataFooter(content, metadata, url) {
@@ -603,77 +637,71 @@ export function htmlToMarkdown(html, metadata, options) {
603
637
  if (!html)
604
638
  return buildMetadataFooter(metadata, url);
605
639
  try {
606
- const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
640
+ const content = translateHtmlToMarkdown({
641
+ html,
642
+ url,
643
+ signal: options?.signal,
644
+ document: options?.document,
645
+ skipNoiseRemoval: options?.skipNoiseRemoval,
646
+ });
607
647
  return appendMetadataFooter(content, metadata, url);
608
648
  }
609
649
  catch (error) {
610
- if (error instanceof FetchError) {
650
+ if (error instanceof FetchError)
611
651
  throw error;
612
- }
613
652
  logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
614
653
  return buildMetadataFooter(metadata, url);
615
654
  }
616
655
  }
656
+ /* -------------------------------------------------------------------------------------------------
657
+ * Raw content shortcut
658
+ * ------------------------------------------------------------------------------------------------- */
617
659
  function shouldPreserveRawContent(url, content) {
618
- if (isRawTextContentUrl(url)) {
660
+ if (isRawTextContentUrl(url))
619
661
  return !isLikelyHtmlContent(content);
620
- }
621
662
  return isRawTextContent(content);
622
663
  }
623
- function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
624
- const title = extractTitleFromRawMarkdown(rawContent);
625
- const content = includeMetadata
626
- ? addSourceToMarkdown(rawContent, url)
627
- : rawContent;
664
+ function buildRawMarkdownPayload(params) {
665
+ const title = extractTitleFromRawMarkdown(params.rawContent);
666
+ const content = params.includeMetadata
667
+ ? addSourceToMarkdown(params.rawContent, params.url)
668
+ : params.rawContent;
628
669
  return { content, title };
629
670
  }
630
- function buildRawMarkdownResult({ rawContent, url, includeMetadata, }) {
631
- const { content, title } = buildRawMarkdownPayload({
632
- rawContent,
633
- url,
634
- includeMetadata,
635
- });
636
- return {
637
- markdown: content,
638
- title,
639
- truncated: false,
640
- };
641
- }
642
- function tryTransformRawContent({ html, url, includeMetadata, }) {
643
- if (!shouldPreserveRawContent(url, html)) {
671
+ function tryTransformRawContent(params) {
672
+ if (!shouldPreserveRawContent(params.url, params.html))
644
673
  return null;
645
- }
646
- logDebug('Preserving raw markdown content', { url: truncateUrlForLog(url) });
647
- return buildRawMarkdownResult({
648
- rawContent: html,
649
- url,
650
- includeMetadata,
674
+ logDebug('Preserving raw markdown content', {
675
+ url: params.url.substring(0, 80),
676
+ });
677
+ const { content, title } = buildRawMarkdownPayload({
678
+ rawContent: params.html,
679
+ url: params.url,
680
+ includeMetadata: params.includeMetadata,
651
681
  });
682
+ return { markdown: content, title, truncated: false };
652
683
  }
684
+ /* -------------------------------------------------------------------------------------------------
685
+ * Quality gates + content source resolution
686
+ * ------------------------------------------------------------------------------------------------- */
653
687
  const MIN_CONTENT_RATIO = 0.3;
654
688
  const MIN_HTML_LENGTH_FOR_GATE = 100;
655
689
  const MIN_HEADING_RETENTION_RATIO = 0.7;
656
690
  const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
657
- /**
658
- * Check if HTML string needs document wrapper for proper parsing.
659
- * Fragments without doctype/html/body tags need wrapping.
660
- */
691
+ const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
692
+ const MAX_TRUNCATED_LINE_RATIO = 0.5;
661
693
  function needsDocumentWrapper(html) {
662
694
  const trimmed = html.trim().toLowerCase();
663
695
  return (!trimmed.startsWith('<!doctype') &&
664
696
  !trimmed.startsWith('<html') &&
665
697
  !trimmed.startsWith('<body'));
666
698
  }
667
- /**
668
- * Wrap HTML fragment in minimal document structure for proper parsing.
669
- */
670
699
  function wrapHtmlFragment(html) {
671
700
  return `<!DOCTYPE html><html><body>${html}</body></html>`;
672
701
  }
673
702
  function resolveHtmlDocument(htmlOrDocument) {
674
- if (typeof htmlOrDocument !== 'string') {
703
+ if (typeof htmlOrDocument !== 'string')
675
704
  return htmlOrDocument;
676
- }
677
705
  const htmlToParse = needsDocumentWrapper(htmlOrDocument)
678
706
  ? wrapHtmlFragment(htmlOrDocument)
679
707
  : htmlOrDocument;
@@ -682,39 +710,26 @@ function resolveHtmlDocument(htmlOrDocument) {
682
710
  function countDomSelector(htmlOrDocument, selector) {
683
711
  return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
684
712
  }
685
- /**
686
- * Count headings using DOM querySelectorAll.
687
- * Handles nested content like <h2><span>Text</span></h2> correctly.
688
- */
689
713
  function countHeadingsDom(htmlOrDocument) {
690
714
  return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
691
715
  }
692
716
  function countCodeBlocksDom(htmlOrDocument) {
693
717
  return countDomSelector(htmlOrDocument, 'pre');
694
718
  }
695
- function cloneDocumentIfNeeded(htmlOrDocument, doc) {
696
- return typeof htmlOrDocument === 'string'
697
- ? doc
698
- : doc.cloneNode(true);
699
- }
700
719
  function stripNonVisibleNodes(doc) {
701
- for (const el of doc.querySelectorAll('script,style,noscript')) {
720
+ for (const el of doc.querySelectorAll('script,style,noscript'))
702
721
  el.remove();
703
- }
704
722
  }
705
723
  function resolveDocumentText(doc) {
706
- // Note: linkedom may return null for body on HTML fragments despite types
707
724
  const body = doc.body;
708
725
  const docElement = doc.documentElement;
709
726
  return body?.textContent ?? docElement?.textContent ?? '';
710
727
  }
711
- /**
712
- * Get visible text length from HTML, excluding script/style/noscript content.
713
- * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
714
- */
715
728
  function getVisibleTextLength(htmlOrDocument) {
716
729
  const doc = resolveHtmlDocument(htmlOrDocument);
717
- const workDoc = cloneDocumentIfNeeded(htmlOrDocument, doc);
730
+ const workDoc = typeof htmlOrDocument === 'string'
731
+ ? doc
732
+ : doc.cloneNode(true);
718
733
  stripNonVisibleNodes(workDoc);
719
734
  const text = resolveDocumentText(workDoc);
720
735
  return text.replace(/\s+/g, ' ').trim().length;
@@ -723,29 +738,18 @@ export function isExtractionSufficient(article, originalHtmlOrDocument) {
723
738
  if (!article)
724
739
  return false;
725
740
  const articleLength = article.textContent.length;
726
- // Use DOM-based visible text length to exclude script/style content
727
741
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
728
742
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
729
743
  return true;
730
744
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
731
745
  }
732
- const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
733
- const MAX_TRUNCATED_LINE_RATIO = 0.5;
734
- /**
735
- * Detect if extracted text has many truncated/incomplete sentences.
736
- * Lines longer than 20 chars that don't end with sentence punctuation
737
- * are considered potentially truncated.
738
- */
739
746
  function hasTruncatedSentences(text) {
740
747
  const lines = text
741
748
  .split('\n')
742
749
  .filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
743
750
  if (lines.length < 3)
744
751
  return false;
745
- const incompleteLines = lines.filter((line) => {
746
- const trimmed = line.trim();
747
- return !/[.!?:;]$/.test(trimmed);
748
- });
752
+ const incompleteLines = lines.filter((line) => !/[.!?:;]$/.test(line.trim()));
749
753
  return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
750
754
  }
751
755
  export function determineContentExtractionSource(article) {
@@ -768,19 +772,13 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
768
772
  else {
769
773
  if (extractedMeta.title !== undefined)
770
774
  metadata.title = extractedMeta.title;
771
- if (extractedMeta.description !== undefined) {
775
+ if (extractedMeta.description !== undefined)
772
776
  metadata.description = extractedMeta.description;
773
- }
774
- if (extractedMeta.author !== undefined) {
777
+ if (extractedMeta.author !== undefined)
775
778
  metadata.author = extractedMeta.author;
776
- }
777
779
  }
778
780
  return metadata;
779
781
  }
780
- /**
781
- * Content root selectors in priority order.
782
- * These identify the main content area on a page.
783
- */
784
782
  const CONTENT_ROOT_SELECTORS = [
785
783
  'main',
786
784
  'article',
@@ -797,75 +795,23 @@ const CONTENT_ROOT_SELECTORS = [
797
795
  '.post-body',
798
796
  '.article-body',
799
797
  ];
800
- /**
801
- * Find the main content root element in a document.
802
- * Returns the innerHTML if found, undefined otherwise.
803
- */
804
798
  function findContentRoot(document) {
805
799
  for (const selector of CONTENT_ROOT_SELECTORS) {
806
800
  const element = document.querySelector(selector);
807
801
  if (!element)
808
802
  continue;
809
- // Check if element has meaningful content
810
803
  const innerHTML = typeof element.innerHTML === 'string'
811
804
  ? element.innerHTML
812
805
  : undefined;
813
- if (innerHTML && innerHTML.trim().length > 100) {
806
+ if (innerHTML && innerHTML.trim().length > 100)
814
807
  return innerHTML;
815
- }
816
808
  }
817
809
  return undefined;
818
810
  }
819
- function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
820
- const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
821
- // If using article content, return it directly
822
- if (useArticleContent && article) {
823
- return {
824
- sourceHtml: article.content,
825
- title: article.title,
826
- metadata,
827
- };
828
- }
829
- // Try content root fallback before using full HTML
830
- if (document) {
831
- // Apply noise removal to HTML first (without passing document) to get cleaned HTML,
832
- // then parse and find content root. This prevents the aggressive DOM stripping that
833
- // happens when noise removal is given the original parsed document.
834
- const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
835
- const { document: cleanedDoc } = parseHTML(cleanedHtml);
836
- const contentRoot = findContentRoot(cleanedDoc);
837
- if (contentRoot) {
838
- logDebug('Using content root fallback instead of full HTML', {
839
- url: truncateUrlForLog(url),
840
- contentLength: contentRoot.length,
841
- });
842
- return {
843
- sourceHtml: contentRoot,
844
- title: extractedMeta.title,
845
- metadata,
846
- // Skip noise removal - this HTML is already from a cleaned document
847
- skipNoiseRemoval: true,
848
- };
849
- }
850
- }
851
- // Fall back to full HTML
852
- return {
853
- sourceHtml: html,
854
- title: extractedMeta.title,
855
- metadata,
856
- ...(document ? { document } : {}),
857
- };
858
- }
859
- function logQualityGateFallback({ safeUrl, articleLength, }) {
860
- logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
861
- url: safeUrl,
862
- articleLength,
863
- });
864
- }
865
811
  function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
866
812
  const articleLength = article.textContent.length;
867
813
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
868
- const safeUrl = truncateUrlForLog(url);
814
+ const safeUrl = url.substring(0, 80);
869
815
  let articleDocument = null;
870
816
  const getArticleDocument = () => {
871
817
  if (articleDocument)
@@ -873,15 +819,16 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
873
819
  articleDocument = resolveHtmlDocument(article.content);
874
820
  return articleDocument;
875
821
  };
876
- // If the document is tiny, don't gate too aggressively.
877
822
  if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
878
823
  const ratio = articleLength / originalLength;
879
824
  if (ratio < MIN_CONTENT_RATIO) {
880
- logQualityGateFallback({ safeUrl, articleLength });
825
+ logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
826
+ url: safeUrl,
827
+ articleLength,
828
+ });
881
829
  return false;
882
830
  }
883
831
  }
884
- // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
885
832
  const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
886
833
  if (originalHeadings > 0) {
887
834
  const articleHeadings = countHeadingsDom(getArticleDocument());
@@ -899,7 +846,6 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
899
846
  if (originalCodeBlocks > 0) {
900
847
  const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
901
848
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
902
- // Always log code block counts for debugging
903
849
  logDebug('Code block retention check', {
904
850
  url: safeUrl,
905
851
  originalCodeBlocks,
@@ -915,100 +861,106 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
915
861
  return false;
916
862
  }
917
863
  }
918
- // Layout extraction issue: truncated/fragmented lines.
919
864
  if (hasTruncatedSentences(article.textContent)) {
920
- logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: safeUrl });
865
+ logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
866
+ url: safeUrl,
867
+ });
921
868
  return false;
922
869
  }
923
870
  return true;
924
871
  }
925
- function resolveContentSource({ html, url, includeMetadata, signal, }) {
926
- const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
872
+ function buildContentSource(params) {
873
+ const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
874
+ const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
875
+ if (useArticleContent && article) {
876
+ return { sourceHtml: article.content, title: article.title, metadata };
877
+ }
878
+ if (document) {
879
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
880
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
881
+ const contentRoot = findContentRoot(cleanedDoc);
882
+ if (contentRoot) {
883
+ logDebug('Using content root fallback instead of full HTML', {
884
+ url: url.substring(0, 80),
885
+ contentLength: contentRoot.length,
886
+ });
887
+ return {
888
+ sourceHtml: contentRoot,
889
+ title: extractedMeta.title,
890
+ metadata,
891
+ skipNoiseRemoval: true,
892
+ };
893
+ }
894
+ }
895
+ return {
896
+ sourceHtml: html,
897
+ title: extractedMeta.title,
898
+ metadata,
899
+ ...(document ? { document } : {}),
900
+ };
901
+ }
902
+ function resolveContentSource(params) {
903
+ const { article, metadata: extractedMeta, document, } = contentExtractor.extract(params.html, params.url, {
927
904
  extractArticle: true,
928
- ...(signal ? { signal } : {}),
905
+ ...(params.signal ? { signal: params.signal } : {}),
929
906
  });
930
- const originalDocument = document;
931
907
  const useArticleContent = article
932
- ? shouldUseArticleContent(article, originalDocument, url)
908
+ ? shouldUseArticleContent(article, document, params.url)
933
909
  : false;
934
910
  return buildContentSource({
935
- html,
936
- url,
911
+ html: params.html,
912
+ url: params.url,
937
913
  article,
938
914
  extractedMeta,
939
- includeMetadata,
915
+ includeMetadata: params.includeMetadata,
940
916
  useArticleContent,
941
917
  document,
942
918
  });
943
919
  }
944
- function tryTransformRawStage(html, url, includeMetadata) {
945
- return runTransformStage(url, 'transform:raw', () => tryTransformRawContent({
946
- html,
947
- url,
948
- includeMetadata,
949
- }));
950
- }
951
- function resolveContentSourceStage(html, url, includeMetadata, signal) {
952
- return runTransformStage(url, 'transform:extract', () => resolveContentSource({
953
- html,
954
- url,
955
- includeMetadata,
956
- ...(signal ? { signal } : {}),
957
- }));
958
- }
920
+ /* -------------------------------------------------------------------------------------------------
921
+ * In-process transform pipeline (public)
922
+ * ------------------------------------------------------------------------------------------------- */
959
923
  function buildMarkdownFromContext(context, url, signal) {
960
- const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
924
+ const content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
961
925
  url,
962
926
  ...(signal ? { signal } : {}),
963
927
  ...(context.document ? { document: context.document } : {}),
964
928
  ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
965
929
  }));
966
- return {
967
- markdown: content,
968
- title: context.title,
969
- truncated: false,
970
- };
930
+ return { markdown: content, title: context.title, truncated: false };
971
931
  }
972
- function runTotalTransformStage(url, fn) {
973
- const totalStage = startTransformStage(url, 'transform:total');
974
- let success = false;
975
- try {
976
- const result = fn();
977
- success = true;
978
- return result;
979
- }
980
- finally {
981
- finalizeTotalTransformStage(totalStage, success);
982
- }
983
- }
984
- function finalizeTotalTransformStage(stage, success) {
985
- if (!success)
986
- return;
987
- endTransformStage(stage, { truncated: false });
988
- }
989
- async function runTotalTransformStageAsync(url, fn) {
990
- const totalStage = startTransformStage(url, 'transform:total');
932
+ export function transformHtmlToMarkdownInProcess(html, url, options) {
933
+ const totalStage = stageTracker.start(url, 'transform:total');
991
934
  let success = false;
992
935
  try {
993
- const result = await fn();
936
+ abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
937
+ const raw = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
938
+ html,
939
+ url,
940
+ includeMetadata: options.includeMetadata,
941
+ }));
942
+ if (raw) {
943
+ success = true;
944
+ return raw;
945
+ }
946
+ const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
947
+ html,
948
+ url,
949
+ includeMetadata: options.includeMetadata,
950
+ ...(options.signal ? { signal: options.signal } : {}),
951
+ }));
952
+ const result = buildMarkdownFromContext(context, url, options.signal);
994
953
  success = true;
995
954
  return result;
996
955
  }
997
956
  finally {
998
- finalizeTotalTransformStage(totalStage, success);
957
+ if (success)
958
+ stageTracker.end(totalStage, { truncated: false });
999
959
  }
1000
960
  }
1001
- export function transformHtmlToMarkdownInProcess(html, url, options) {
1002
- return runTotalTransformStage(url, () => {
1003
- throwIfAborted(options.signal, url, 'transform:begin');
1004
- const raw = tryTransformRawStage(html, url, options.includeMetadata);
1005
- if (raw) {
1006
- return raw;
1007
- }
1008
- const context = resolveContentSourceStage(html, url, options.includeMetadata, options.signal);
1009
- return buildMarkdownFromContext(context, url, options.signal);
1010
- });
1011
- }
961
+ /* -------------------------------------------------------------------------------------------------
962
+ * Worker pool
963
+ * ------------------------------------------------------------------------------------------------- */
1012
964
  const workerMessageSchema = z.discriminatedUnion('type', [
1013
965
  z.object({
1014
966
  type: z.literal('result'),
@@ -1031,142 +983,137 @@ const workerMessageSchema = z.discriminatedUnion('type', [
1031
983
  }),
1032
984
  }),
1033
985
  ]);
1034
- let pool = null;
1035
986
  const POOL_MIN_WORKERS = 2;
1036
987
  const POOL_MAX_WORKERS = 4;
1037
988
  const POOL_SCALE_THRESHOLD = 0.5;
1038
- function resolveDefaultWorkerCount() {
1039
- return POOL_MIN_WORKERS;
1040
- }
1041
989
  const DEFAULT_TIMEOUT_MS = config.transform.timeoutMs;
1042
- function getOrCreateTransformWorkerPool() {
1043
- pool ??= new WorkerPool(resolveDefaultWorkerCount(), DEFAULT_TIMEOUT_MS);
1044
- return pool;
1045
- }
1046
- export async function shutdownTransformWorkerPool() {
1047
- if (!pool)
1048
- return;
1049
- await pool.close();
1050
- pool = null;
1051
- }
1052
- export function getTransformPoolStats() {
1053
- if (!pool)
1054
- return null;
1055
- return {
1056
- queueDepth: pool.getQueueDepth(),
1057
- activeWorkers: pool.getActiveWorkers(),
1058
- capacity: pool.getCapacity(),
1059
- };
1060
- }
1061
990
  class WorkerPool {
1062
991
  workers = [];
1063
992
  capacity;
1064
- minCapacity;
1065
- maxCapacity;
993
+ minCapacity = POOL_MIN_WORKERS;
994
+ maxCapacity = POOL_MAX_WORKERS;
1066
995
  queue = [];
1067
996
  inflight = new Map();
1068
997
  timeoutMs;
1069
998
  queueMax;
1070
999
  closed = false;
1071
- createAbortError(url, stage) {
1072
- return new FetchError('Request was canceled', url, 499, {
1073
- reason: 'aborted',
1074
- stage,
1075
- });
1000
+ constructor(size, timeoutMs) {
1001
+ this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1002
+ this.timeoutMs = timeoutMs;
1003
+ this.queueMax = this.maxCapacity * 32;
1076
1004
  }
1077
- ensureOpen() {
1078
- if (this.closed) {
1079
- throw new Error('Transform worker pool closed');
1005
+ async transform(html, url, options) {
1006
+ this.ensureOpen();
1007
+ if (options.signal?.aborted)
1008
+ throw abortPolicy.createAbortError(url, 'transform:enqueue');
1009
+ if (this.queue.length >= this.queueMax) {
1010
+ throw new FetchError('Transform worker queue is full', url, 503, {
1011
+ reason: 'queue_full',
1012
+ stage: 'transform:enqueue',
1013
+ });
1080
1014
  }
1015
+ return new Promise((resolve, reject) => {
1016
+ const task = this.createPendingTask(html, url, options, resolve, reject);
1017
+ this.queue.push(task);
1018
+ this.drainQueue();
1019
+ });
1081
1020
  }
1082
- ensureNotAborted(signal, url, stage) {
1083
- if (!signal?.aborted)
1084
- return;
1085
- throw this.createAbortError(url, stage);
1021
+ getQueueDepth() {
1022
+ return this.queue.length;
1086
1023
  }
1087
- ensureQueueCapacity(url) {
1088
- if (this.queue.length < this.queueMax)
1089
- return;
1090
- throw new FetchError('Transform worker queue is full', url, 503, {
1091
- reason: 'queue_full',
1092
- stage: 'transform:enqueue',
1093
- });
1024
+ getActiveWorkers() {
1025
+ return this.workers.filter((s) => s?.busy).length;
1094
1026
  }
1095
- clearAbortListener(signal, listener) {
1096
- if (!signal || !listener)
1027
+ getCapacity() {
1028
+ return this.capacity;
1029
+ }
1030
+ async close() {
1031
+ if (this.closed)
1097
1032
  return;
1098
- try {
1099
- signal.removeEventListener('abort', listener);
1100
- }
1101
- catch {
1102
- /* empty */
1033
+ this.closed = true;
1034
+ const terminations = this.workers
1035
+ .map((slot) => slot?.worker.terminate())
1036
+ .filter((p) => p !== undefined);
1037
+ this.workers.fill(undefined);
1038
+ this.workers.length = 0;
1039
+ for (const [id, inflight] of this.inflight.entries()) {
1040
+ clearTimeout(inflight.timer);
1041
+ this.clearAbortListener(inflight.signal, inflight.abortListener);
1042
+ inflight.reject(new Error('Transform worker pool closed'));
1043
+ this.inflight.delete(id);
1103
1044
  }
1045
+ for (const task of this.queue)
1046
+ task.reject(new Error('Transform worker pool closed'));
1047
+ this.queue.length = 0;
1048
+ await Promise.allSettled(terminations);
1104
1049
  }
1105
- markSlotIdle(workerIndex) {
1106
- const slot = this.workers[workerIndex];
1107
- if (!slot)
1108
- return;
1109
- slot.busy = false;
1110
- slot.currentTaskId = null;
1050
+ ensureOpen() {
1051
+ if (this.closed)
1052
+ throw new Error('Transform worker pool closed');
1111
1053
  }
1112
- takeInflight(id) {
1113
- const inflight = this.inflight.get(id);
1114
- if (!inflight)
1115
- return null;
1116
- clearTimeout(inflight.timer);
1117
- this.clearAbortListener(inflight.signal, inflight.abortListener);
1118
- this.inflight.delete(id);
1119
- return inflight;
1054
+ createPendingTask(html, url, options, resolve, reject) {
1055
+ const id = randomUUID();
1056
+ let abortListener;
1057
+ if (options.signal) {
1058
+ abortListener = () => {
1059
+ this.onAbortSignal(id, url, reject);
1060
+ };
1061
+ options.signal.addEventListener('abort', abortListener, { once: true });
1062
+ }
1063
+ return {
1064
+ id,
1065
+ html,
1066
+ url,
1067
+ includeMetadata: options.includeMetadata,
1068
+ signal: options.signal,
1069
+ abortListener,
1070
+ resolve,
1071
+ reject,
1072
+ };
1120
1073
  }
1121
- cancelWorkerTask(slot, id) {
1122
- if (!slot)
1074
+ onAbortSignal(id, url, reject) {
1075
+ if (this.closed) {
1076
+ reject(new Error('Transform worker pool closed'));
1123
1077
  return;
1124
- try {
1125
- slot.worker.postMessage({ type: 'cancel', id });
1126
1078
  }
1127
- catch {
1128
- /* empty */
1129
- }
1130
- }
1131
- restartWorker(workerIndex, slot) {
1132
- if (this.closed)
1079
+ const inflight = this.inflight.get(id);
1080
+ if (inflight) {
1081
+ this.abortInflight(id, url, inflight.workerIndex);
1133
1082
  return;
1134
- const target = slot ?? this.workers[workerIndex];
1135
- if (target) {
1136
- void target.worker.terminate();
1137
1083
  }
1138
- this.workers[workerIndex] = this.spawnWorker(workerIndex);
1139
- this.drainQueue();
1140
- }
1141
- rejectIfClosed(reject) {
1142
- if (!this.closed)
1143
- return false;
1144
- reject(new Error('Transform worker pool closed'));
1145
- return true;
1084
+ const queuedIndex = this.queue.findIndex((t) => t.id === id);
1085
+ if (queuedIndex !== -1) {
1086
+ this.queue.splice(queuedIndex, 1);
1087
+ reject(abortPolicy.createAbortError(url, 'transform:queued-abort'));
1088
+ }
1146
1089
  }
1147
- abortInflightTask(id, url, workerIndex) {
1090
+ abortInflight(id, url, workerIndex) {
1148
1091
  const slot = this.workers[workerIndex];
1149
- this.cancelWorkerTask(slot, id);
1150
- this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
1151
1092
  if (slot) {
1152
- this.restartWorker(workerIndex, slot);
1093
+ try {
1094
+ slot.worker.postMessage({ type: 'cancel', id });
1095
+ }
1096
+ catch {
1097
+ /* ignore */
1098
+ }
1153
1099
  }
1100
+ this.failTask(id, abortPolicy.createAbortError(url, 'transform:signal-abort'));
1101
+ if (slot)
1102
+ this.restartWorker(workerIndex, slot);
1154
1103
  }
1155
- abortQueuedTask(id, url, reject) {
1156
- const queuedIndex = this.queue.findIndex((task) => task.id === id);
1157
- if (queuedIndex === -1)
1104
+ clearAbortListener(signal, listener) {
1105
+ if (!signal || !listener)
1158
1106
  return;
1159
- this.queue.splice(queuedIndex, 1);
1160
- reject(this.createAbortError(url, 'transform:queued-abort'));
1161
- }
1162
- createWorkerSlot(worker) {
1163
- return {
1164
- worker,
1165
- busy: false,
1166
- currentTaskId: null,
1167
- };
1107
+ try {
1108
+ signal.removeEventListener('abort', listener);
1109
+ }
1110
+ catch {
1111
+ /* ignore */
1112
+ }
1168
1113
  }
1169
- registerWorkerHandlers(workerIndex, worker) {
1114
+ spawnWorker(workerIndex) {
1115
+ const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
1116
+ worker.unref();
1170
1117
  worker.on('message', (raw) => {
1171
1118
  this.onWorkerMessage(workerIndex, raw);
1172
1119
  });
@@ -1176,20 +1123,7 @@ class WorkerPool {
1176
1123
  worker.on('exit', (code) => {
1177
1124
  this.onWorkerBroken(workerIndex, `Transform worker exited (code ${code})`);
1178
1125
  });
1179
- }
1180
- constructor(size, timeoutMs) {
1181
- this.minCapacity = POOL_MIN_WORKERS;
1182
- this.maxCapacity = POOL_MAX_WORKERS;
1183
- this.capacity = Math.max(this.minCapacity, Math.min(size, this.maxCapacity));
1184
- this.timeoutMs = timeoutMs;
1185
- this.queueMax = this.maxCapacity * 32;
1186
- }
1187
- spawnWorker(workerIndex) {
1188
- const worker = new Worker(new URL('./workers/transform-worker.js', import.meta.url));
1189
- worker.unref();
1190
- const slot = this.createWorkerSlot(worker);
1191
- this.registerWorkerHandlers(workerIndex, worker);
1192
- return slot;
1126
+ return { worker, busy: false, currentTaskId: null };
1193
1127
  }
1194
1128
  onWorkerBroken(workerIndex, message) {
1195
1129
  if (this.closed)
@@ -1202,19 +1136,14 @@ class WorkerPool {
1202
1136
  }
1203
1137
  this.restartWorker(workerIndex, slot);
1204
1138
  }
1205
- resolveWorkerResult(inflight, result) {
1206
- inflight.resolve({
1207
- markdown: result.markdown,
1208
- truncated: result.truncated,
1209
- title: result.title,
1210
- });
1211
- }
1212
- rejectWorkerError(inflight, error) {
1213
- if (error.name === 'FetchError') {
1214
- inflight.reject(new FetchError(error.message, error.url, error.statusCode, error.details ?? {}));
1139
+ restartWorker(workerIndex, slot) {
1140
+ if (this.closed)
1215
1141
  return;
1216
- }
1217
- inflight.reject(new Error(error.message));
1142
+ const target = slot ?? this.workers[workerIndex];
1143
+ if (target)
1144
+ void target.worker.terminate();
1145
+ this.workers[workerIndex] = this.spawnWorker(workerIndex);
1146
+ this.drainQueue();
1218
1147
  }
1219
1148
  onWorkerMessage(workerIndex, raw) {
1220
1149
  const parsed = workerMessageSchema.safeParse(raw);
@@ -1224,63 +1153,48 @@ class WorkerPool {
1224
1153
  const inflight = this.takeInflight(message.id);
1225
1154
  if (!inflight)
1226
1155
  return;
1227
- this.markSlotIdle(workerIndex);
1156
+ this.markIdle(workerIndex);
1228
1157
  if (message.type === 'result') {
1229
- this.resolveWorkerResult(inflight, message.result);
1158
+ inflight.resolve({
1159
+ markdown: message.result.markdown,
1160
+ truncated: message.result.truncated,
1161
+ title: message.result.title,
1162
+ });
1230
1163
  }
1231
1164
  else {
1232
- this.rejectWorkerError(inflight, message.error);
1165
+ const err = message.error;
1166
+ if (err.name === 'FetchError') {
1167
+ inflight.reject(new FetchError(err.message, err.url, err.statusCode, err.details ?? {}));
1168
+ }
1169
+ else {
1170
+ inflight.reject(new Error(err.message));
1171
+ }
1233
1172
  }
1234
1173
  this.drainQueue();
1235
1174
  }
1175
+ takeInflight(id) {
1176
+ const inflight = this.inflight.get(id);
1177
+ if (!inflight)
1178
+ return null;
1179
+ clearTimeout(inflight.timer);
1180
+ this.clearAbortListener(inflight.signal, inflight.abortListener);
1181
+ this.inflight.delete(id);
1182
+ return inflight;
1183
+ }
1184
+ markIdle(workerIndex) {
1185
+ const slot = this.workers[workerIndex];
1186
+ if (!slot)
1187
+ return;
1188
+ slot.busy = false;
1189
+ slot.currentTaskId = null;
1190
+ }
1236
1191
  failTask(id, error) {
1237
1192
  const inflight = this.takeInflight(id);
1238
1193
  if (!inflight)
1239
1194
  return;
1240
1195
  inflight.reject(error);
1241
- this.markSlotIdle(inflight.workerIndex);
1242
- }
1243
- handleAbortSignal(id, url, reject) {
1244
- if (this.rejectIfClosed(reject))
1245
- return;
1246
- const inflight = this.inflight.get(id);
1247
- if (inflight) {
1248
- this.abortInflightTask(id, url, inflight.workerIndex);
1249
- return;
1250
- }
1251
- this.abortQueuedTask(id, url, reject);
1252
- }
1253
- createPendingTask(html, url, options, resolve, reject) {
1254
- const id = randomUUID();
1255
- let abortListener;
1256
- if (options.signal) {
1257
- abortListener = () => {
1258
- this.handleAbortSignal(id, url, reject);
1259
- };
1260
- options.signal.addEventListener('abort', abortListener, { once: true });
1261
- }
1262
- return {
1263
- id,
1264
- html,
1265
- url,
1266
- includeMetadata: options.includeMetadata,
1267
- signal: options.signal,
1268
- abortListener,
1269
- resolve,
1270
- reject,
1271
- };
1196
+ this.markIdle(inflight.workerIndex);
1272
1197
  }
1273
- async transform(html, url, options) {
1274
- this.ensureOpen();
1275
- this.ensureNotAborted(options.signal, url, 'transform:enqueue');
1276
- this.ensureQueueCapacity(url);
1277
- return new Promise((resolve, reject) => {
1278
- const task = this.createPendingTask(html, url, options, resolve, reject);
1279
- this.queue.push(task);
1280
- this.drainQueue();
1281
- });
1282
- }
1283
- /** Scale capacity up if queue pressure exceeds threshold. */
1284
1198
  maybeScaleUp() {
1285
1199
  if (this.queue.length > this.capacity * POOL_SCALE_THRESHOLD &&
1286
1200
  this.capacity < this.maxCapacity) {
@@ -1288,16 +1202,13 @@ class WorkerPool {
1288
1202
  }
1289
1203
  }
1290
1204
  drainQueue() {
1291
- if (this.closed)
1292
- return;
1293
- if (this.queue.length === 0)
1205
+ if (this.closed || this.queue.length === 0)
1294
1206
  return;
1295
1207
  this.maybeScaleUp();
1296
- // First pass: try to find an idle existing worker
1297
- for (let workerIndex = 0; workerIndex < this.workers.length; workerIndex += 1) {
1298
- const slot = this.workers[workerIndex];
1208
+ for (let i = 0; i < this.workers.length; i += 1) {
1209
+ const slot = this.workers[i];
1299
1210
  if (slot && !slot.busy) {
1300
- this.dispatchQueueTask(workerIndex, slot);
1211
+ this.dispatchFromQueue(i, slot);
1301
1212
  if (this.queue.length === 0)
1302
1213
  return;
1303
1214
  }
@@ -1306,7 +1217,7 @@ class WorkerPool {
1306
1217
  const workerIndex = this.workers.length;
1307
1218
  const slot = this.spawnWorker(workerIndex);
1308
1219
  this.workers.push(slot);
1309
- this.dispatchQueueTask(workerIndex, slot);
1220
+ this.dispatchFromQueue(workerIndex, slot);
1310
1221
  if (this.workers.length < this.capacity && this.queue.length > 0) {
1311
1222
  setImmediate(() => {
1312
1223
  this.drainQueue();
@@ -1314,39 +1225,28 @@ class WorkerPool {
1314
1225
  }
1315
1226
  }
1316
1227
  }
1317
- dispatchQueueTask(workerIndex, slot) {
1228
+ dispatchFromQueue(workerIndex, slot) {
1318
1229
  const task = this.queue.shift();
1319
1230
  if (!task)
1320
1231
  return;
1321
- this.dispatch(workerIndex, slot, task);
1322
- }
1323
- dispatch(workerIndex, slot, task) {
1324
- if (this.rejectIfAborted(task))
1232
+ if (this.closed) {
1233
+ task.reject(new Error('Transform worker pool closed'));
1325
1234
  return;
1326
- this.markSlotBusy(slot, task);
1327
- const timer = this.startTaskTimer(workerIndex, slot, task);
1328
- this.registerInflightTask(task, timer, workerIndex);
1329
- try {
1330
- this.sendTransformMessage(slot, task);
1331
1235
  }
1332
- catch (error) {
1333
- this.handleDispatchFailure(workerIndex, slot, task, timer, error);
1236
+ if (task.signal?.aborted) {
1237
+ this.clearAbortListener(task.signal, task.abortListener);
1238
+ task.reject(abortPolicy.createAbortError(task.url, 'transform:dispatch'));
1239
+ return;
1334
1240
  }
1335
- }
1336
- rejectIfAborted(task) {
1337
- if (!task.signal?.aborted)
1338
- return false;
1339
- this.clearAbortListener(task.signal, task.abortListener);
1340
- task.reject(this.createAbortError(task.url, 'transform:dispatch'));
1341
- return true;
1342
- }
1343
- markSlotBusy(slot, task) {
1344
1241
  slot.busy = true;
1345
1242
  slot.currentTaskId = task.id;
1346
- }
1347
- startTaskTimer(workerIndex, slot, task) {
1348
1243
  const timer = setTimeout(() => {
1349
- this.cancelWorkerTask(slot, task.id);
1244
+ try {
1245
+ slot.worker.postMessage({ type: 'cancel', id: task.id });
1246
+ }
1247
+ catch {
1248
+ /* ignore */
1249
+ }
1350
1250
  const inflight = this.takeInflight(task.id);
1351
1251
  if (!inflight)
1352
1252
  return;
@@ -1357,9 +1257,6 @@ class WorkerPool {
1357
1257
  this.restartWorker(workerIndex, slot);
1358
1258
  }, this.timeoutMs);
1359
1259
  timer.unref();
1360
- return timer;
1361
- }
1362
- registerInflightTask(task, timer, workerIndex) {
1363
1260
  this.inflight.set(task.id, {
1364
1261
  resolve: task.resolve,
1365
1262
  reject: task.reject,
@@ -1368,58 +1265,56 @@ class WorkerPool {
1368
1265
  abortListener: task.abortListener,
1369
1266
  workerIndex,
1370
1267
  });
1268
+ try {
1269
+ slot.worker.postMessage({
1270
+ type: 'transform',
1271
+ id: task.id,
1272
+ html: task.html,
1273
+ url: task.url,
1274
+ includeMetadata: task.includeMetadata,
1275
+ });
1276
+ }
1277
+ catch (error) {
1278
+ clearTimeout(timer);
1279
+ this.clearAbortListener(task.signal, task.abortListener);
1280
+ this.inflight.delete(task.id);
1281
+ this.markIdle(workerIndex);
1282
+ task.reject(error instanceof Error
1283
+ ? error
1284
+ : new Error('Failed to dispatch transform worker message'));
1285
+ this.restartWorker(workerIndex, slot);
1286
+ }
1371
1287
  }
1372
- sendTransformMessage(slot, task) {
1373
- slot.worker.postMessage({
1374
- type: 'transform',
1375
- id: task.id,
1376
- html: task.html,
1377
- url: task.url,
1378
- includeMetadata: task.includeMetadata,
1379
- });
1380
- }
1381
- handleDispatchFailure(workerIndex, slot, task, timer, error) {
1382
- clearTimeout(timer);
1383
- this.clearAbortListener(task.signal, task.abortListener);
1384
- this.inflight.delete(task.id);
1385
- this.markSlotIdle(workerIndex);
1386
- const message = error instanceof Error
1387
- ? error
1388
- : new Error('Failed to dispatch transform worker message');
1389
- task.reject(message);
1390
- this.restartWorker(workerIndex, slot);
1391
- }
1392
- getQueueDepth() {
1393
- return this.queue.length;
1394
- }
1395
- getActiveWorkers() {
1396
- return this.workers.filter((s) => s?.busy).length;
1288
+ }
1289
+ class TransformWorkerPoolManager {
1290
+ pool = null;
1291
+ getOrCreate() {
1292
+ this.pool ??= new WorkerPool(POOL_MIN_WORKERS, DEFAULT_TIMEOUT_MS);
1293
+ return this.pool;
1397
1294
  }
1398
- getCapacity() {
1399
- return this.capacity;
1295
+ getStats() {
1296
+ if (!this.pool)
1297
+ return null;
1298
+ return {
1299
+ queueDepth: this.pool.getQueueDepth(),
1300
+ activeWorkers: this.pool.getActiveWorkers(),
1301
+ capacity: this.pool.getCapacity(),
1302
+ };
1400
1303
  }
1401
- async close() {
1402
- if (this.closed)
1304
+ async shutdown() {
1305
+ if (!this.pool)
1403
1306
  return;
1404
- this.closed = true;
1405
- const terminations = this.workers
1406
- .map((slot) => slot?.worker.terminate())
1407
- .filter((p) => p !== undefined);
1408
- this.workers.fill(undefined);
1409
- this.workers.length = 0;
1410
- for (const [id, inflight] of this.inflight.entries()) {
1411
- clearTimeout(inflight.timer);
1412
- this.clearAbortListener(inflight.signal, inflight.abortListener);
1413
- inflight.reject(new Error('Transform worker pool closed'));
1414
- this.inflight.delete(id);
1415
- }
1416
- for (const task of this.queue) {
1417
- task.reject(new Error('Transform worker pool closed'));
1418
- }
1419
- this.queue.length = 0;
1420
- await Promise.allSettled(terminations);
1307
+ await this.pool.close();
1308
+ this.pool = null;
1421
1309
  }
1422
1310
  }
1311
+ const poolManager = new TransformWorkerPoolManager();
1312
+ export function getTransformPoolStats() {
1313
+ return poolManager.getStats();
1314
+ }
1315
+ export async function shutdownTransformWorkerPool() {
1316
+ await poolManager.shutdown();
1317
+ }
1423
1318
  function buildWorkerTransformOptions(options) {
1424
1319
  return {
1425
1320
  includeMetadata: options.includeMetadata,
@@ -1427,30 +1322,37 @@ function buildWorkerTransformOptions(options) {
1427
1322
  };
1428
1323
  }
1429
1324
  async function transformWithWorkerPool(html, url, options) {
1430
- const poolRef = getOrCreateTransformWorkerPool();
1431
- return poolRef.transform(html, url, buildWorkerTransformOptions(options));
1325
+ const pool = poolManager.getOrCreate();
1326
+ return pool.transform(html, url, buildWorkerTransformOptions(options));
1432
1327
  }
1433
1328
  function resolveWorkerFallback(error, html, url, options) {
1434
- if (error instanceof FetchError) {
1329
+ if (error instanceof FetchError)
1435
1330
  throw error;
1436
- }
1437
- throwIfAborted(options.signal, url, 'transform:worker-fallback');
1331
+ abortPolicy.throwIfAborted(options.signal, url, 'transform:worker-fallback');
1438
1332
  return transformHtmlToMarkdownInProcess(html, url, options);
1439
1333
  }
1440
1334
  export async function transformHtmlToMarkdown(html, url, options) {
1441
- return runTotalTransformStageAsync(url, async () => {
1442
- throwIfAborted(options.signal, url, 'transform:begin');
1443
- const workerStage = startTransformStage(url, 'transform:worker');
1335
+ const totalStage = stageTracker.start(url, 'transform:total');
1336
+ let success = false;
1337
+ try {
1338
+ abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
1339
+ const workerStage = stageTracker.start(url, 'transform:worker');
1444
1340
  try {
1445
1341
  const result = await transformWithWorkerPool(html, url, options);
1342
+ success = true;
1446
1343
  return result;
1447
1344
  }
1448
1345
  catch (error) {
1449
1346
  const fallback = resolveWorkerFallback(error, html, url, options);
1347
+ success = true;
1450
1348
  return fallback;
1451
1349
  }
1452
1350
  finally {
1453
- endTransformStage(workerStage);
1351
+ stageTracker.end(workerStage);
1454
1352
  }
1455
- });
1353
+ }
1354
+ finally {
1355
+ if (success)
1356
+ stageTracker.end(totalStage, { truncated: false });
1357
+ }
1456
1358
  }