@j0hanz/superfetch 2.2.1 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +363 -614
  2. package/dist/cache.d.ts +2 -2
  3. package/dist/cache.d.ts.map +1 -1
  4. package/dist/cache.js +47 -225
  5. package/dist/cache.js.map +1 -1
  6. package/dist/config.d.ts +6 -0
  7. package/dist/config.d.ts.map +1 -1
  8. package/dist/config.js +20 -27
  9. package/dist/config.js.map +1 -1
  10. package/dist/dom-noise-removal.d.ts +6 -0
  11. package/dist/dom-noise-removal.d.ts.map +1 -0
  12. package/dist/dom-noise-removal.js +482 -0
  13. package/dist/dom-noise-removal.js.map +1 -0
  14. package/dist/errors.d.ts.map +1 -1
  15. package/dist/errors.js +8 -5
  16. package/dist/errors.js.map +1 -1
  17. package/dist/fetch.d.ts.map +1 -1
  18. package/dist/fetch.js +26 -32
  19. package/dist/fetch.js.map +1 -1
  20. package/dist/http-native.d.ts +6 -0
  21. package/dist/http-native.d.ts.map +1 -0
  22. package/dist/http-native.js +645 -0
  23. package/dist/http-native.js.map +1 -0
  24. package/dist/http-utils.d.ts +61 -0
  25. package/dist/http-utils.d.ts.map +1 -0
  26. package/dist/http-utils.js +252 -0
  27. package/dist/http-utils.js.map +1 -0
  28. package/dist/index.js +1 -1
  29. package/dist/index.js.map +1 -1
  30. package/dist/instructions.md +41 -39
  31. package/dist/json.d.ts +2 -0
  32. package/dist/json.d.ts.map +1 -0
  33. package/dist/json.js +30 -0
  34. package/dist/json.js.map +1 -0
  35. package/dist/language-detection.d.ts +13 -0
  36. package/dist/language-detection.d.ts.map +1 -0
  37. package/dist/language-detection.js +283 -0
  38. package/dist/language-detection.js.map +1 -0
  39. package/dist/markdown-cleanup.d.ts +19 -0
  40. package/dist/markdown-cleanup.d.ts.map +1 -0
  41. package/dist/markdown-cleanup.js +283 -0
  42. package/dist/markdown-cleanup.js.map +1 -0
  43. package/dist/observability.d.ts +1 -0
  44. package/dist/observability.d.ts.map +1 -1
  45. package/dist/observability.js +10 -0
  46. package/dist/observability.js.map +1 -1
  47. package/dist/tools.js +4 -4
  48. package/dist/transform-types.d.ts +81 -0
  49. package/dist/transform-types.d.ts.map +1 -0
  50. package/dist/transform-types.js +6 -0
  51. package/dist/transform-types.js.map +1 -0
  52. package/dist/transform.d.ts +7 -52
  53. package/dist/transform.d.ts.map +1 -1
  54. package/dist/transform.js +411 -839
  55. package/dist/transform.js.map +1 -1
  56. package/dist/type-guards.d.ts +1 -1
  57. package/dist/type-guards.d.ts.map +1 -1
  58. package/dist/type-guards.js +1 -1
  59. package/dist/type-guards.js.map +1 -1
  60. package/dist/workers/transform-worker.js +23 -24
  61. package/dist/workers/transform-worker.js.map +1 -1
  62. package/package.json +85 -86
  63. package/dist/http.d.ts +0 -90
  64. package/dist/http.d.ts.map +0 -1
  65. package/dist/http.js +0 -1576
  66. package/dist/http.js.map +0 -1
package/dist/transform.js CHANGED
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
8
8
  import { z } from 'zod';
9
9
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
10
10
  import { config } from './config.js';
11
+ import { removeNoiseFromHtml } from './dom-noise-removal.js';
11
12
  import { FetchError, getErrorMessage } from './errors.js';
12
13
  import { isRawTextContentUrl } from './fetch.js';
14
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
15
+ import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
13
16
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
14
- import { isRecord } from './type-guards.js';
17
+ import { isObject } from './type-guards.js';
18
+ // Re-export language detection for backward compatibility
19
+ export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
20
+ // Re-export markdown cleanup for backward compatibility
21
+ export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
22
+ // Re-export DOM noise removal for backward compatibility
23
+ export { removeNoiseFromHtml } from './dom-noise-removal.js';
15
24
  function getAbortReason(signal) {
16
- if (!isRecord(signal))
25
+ if (!isObject(signal))
17
26
  return undefined;
18
27
  return 'reason' in signal ? signal.reason : undefined;
19
28
  }
20
- function getBodyInnerHtml(document) {
21
- if (!isRecord(document))
22
- return undefined;
23
- const { body } = document;
24
- if (!isRecord(body))
25
- return undefined;
26
- const { innerHTML } = body;
27
- return typeof innerHTML === 'string' && innerHTML.length > 0
28
- ? innerHTML
29
- : undefined;
30
- }
31
- function getDocumentToString(document) {
32
- if (!isRecord(document))
33
- return undefined;
34
- if (typeof document.toString !== 'function')
35
- return undefined;
36
- return document.toString.bind(document);
37
- }
38
- function getDocumentElementOuterHtml(document) {
39
- if (!isRecord(document))
40
- return undefined;
41
- const { documentElement } = document;
42
- if (!isRecord(documentElement))
43
- return undefined;
44
- const { outerHTML } = documentElement;
45
- return typeof outerHTML === 'string' && outerHTML.length > 0
46
- ? outerHTML
47
- : undefined;
48
- }
29
+ // DOM accessor helpers moved to ./dom-noise-removal.ts
49
30
  const CODE_BLOCK = {
50
31
  fence: '```',
51
32
  format: (code, language = '') => {
@@ -93,9 +74,13 @@ export function endTransformStage(context, options) {
93
74
  }
94
75
  function runTransformStage(url, stage, fn) {
95
76
  const context = startTransformStage(url, stage);
96
- const result = fn();
97
- endTransformStage(context);
98
- return result;
77
+ try {
78
+ return fn();
79
+ }
80
+ finally {
81
+ // Emit duration even if the stage throws; callers decide how to handle the error.
82
+ endTransformStage(context);
83
+ }
99
84
  }
100
85
  function isTimeoutReason(reason) {
101
86
  return reason instanceof Error && reason.name === 'TimeoutError';
@@ -129,46 +114,105 @@ function truncateHtml(html) {
129
114
  });
130
115
  return html.substring(0, maxSize);
131
116
  }
117
+ const META_PROPERTY_HANDLERS = new Map([
118
+ [
119
+ 'og:title',
120
+ (ctx, c) => {
121
+ ctx.title.og = c;
122
+ },
123
+ ],
124
+ [
125
+ 'og:description',
126
+ (ctx, c) => {
127
+ ctx.description.og = c;
128
+ },
129
+ ],
130
+ [
131
+ 'og:image',
132
+ (ctx, c) => {
133
+ ctx.image = c;
134
+ },
135
+ ],
136
+ [
137
+ 'article:published_time',
138
+ (ctx, c) => {
139
+ ctx.publishedAt = c;
140
+ },
141
+ ],
142
+ [
143
+ 'article:modified_time',
144
+ (ctx, c) => {
145
+ ctx.modifiedAt = c;
146
+ },
147
+ ],
148
+ ]);
149
+ const META_NAME_HANDLERS = new Map([
150
+ [
151
+ 'twitter:title',
152
+ (ctx, c) => {
153
+ ctx.title.twitter = c;
154
+ },
155
+ ],
156
+ [
157
+ 'twitter:description',
158
+ (ctx, c) => {
159
+ ctx.description.twitter = c;
160
+ },
161
+ ],
162
+ [
163
+ 'description',
164
+ (ctx, c) => {
165
+ ctx.description.standard = c;
166
+ },
167
+ ],
168
+ [
169
+ 'author',
170
+ (ctx, c) => {
171
+ ctx.author = c;
172
+ },
173
+ ],
174
+ ]);
132
175
  function extractMetadata(document) {
133
- const title = {};
134
- const description = {};
135
- let author;
176
+ const ctx = {
177
+ title: {},
178
+ description: {},
179
+ };
136
180
  for (const tag of document.querySelectorAll('meta')) {
137
181
  const content = tag.getAttribute('content')?.trim();
138
182
  if (!content)
139
183
  continue;
140
184
  const property = tag.getAttribute('property');
185
+ if (property) {
186
+ META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
187
+ }
141
188
  const name = tag.getAttribute('name');
142
- if (property === 'og:title')
143
- title.og = content;
144
- else if (property === 'og:description')
145
- description.og = content;
146
- else if (name === 'twitter:title')
147
- title.twitter = content;
148
- else if (name === 'twitter:description')
149
- description.twitter = content;
150
- else if (name === 'description')
151
- description.standard = content;
152
- else if (name === 'author')
153
- author = content;
189
+ if (name) {
190
+ META_NAME_HANDLERS.get(name)?.(ctx, content);
191
+ }
154
192
  }
155
193
  const titleEl = document.querySelector('title');
156
- if (!title.standard && titleEl?.textContent) {
157
- title.standard = titleEl.textContent.trim();
194
+ if (!ctx.title.standard && titleEl?.textContent) {
195
+ ctx.title.standard = titleEl.textContent.trim();
158
196
  }
159
- const resolvedTitle = title.og ?? title.twitter ?? title.standard;
160
- const resolvedDesc = description.og ?? description.twitter ?? description.standard;
197
+ const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
198
+ const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
161
199
  const metadata = {};
162
200
  if (resolvedTitle)
163
201
  metadata.title = resolvedTitle;
164
202
  if (resolvedDesc)
165
203
  metadata.description = resolvedDesc;
166
- if (author)
167
- metadata.author = author;
204
+ if (ctx.author)
205
+ metadata.author = ctx.author;
206
+ if (ctx.image)
207
+ metadata.image = ctx.image;
208
+ if (ctx.publishedAt)
209
+ metadata.publishedAt = ctx.publishedAt;
210
+ if (ctx.modifiedAt)
211
+ metadata.modifiedAt = ctx.modifiedAt;
168
212
  return metadata;
169
213
  }
170
214
  function isReadabilityCompatible(doc) {
171
- if (!isRecord(doc))
215
+ if (!isObject(doc))
172
216
  return false;
173
217
  return hasDocumentElement(doc) && hasQuerySelectors(doc);
174
218
  }
@@ -185,14 +229,18 @@ function extractArticle(document) {
185
229
  return null;
186
230
  }
187
231
  try {
188
- const documentClone = document.cloneNode(true);
189
- const rawText = documentClone.querySelector('body')?.textContent ??
190
- documentClone.documentElement.textContent;
232
+ const doc = document;
233
+ const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
191
234
  const textLength = rawText.replace(/\s+/g, ' ').trim().length;
192
- if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
235
+ if (textLength < 100) {
236
+ logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
237
+ 'This might be a client-side rendered (SPA) application. ' +
238
+ 'Content extraction may be incomplete.', { textLength });
239
+ }
240
+ if (textLength >= 400 && !isProbablyReaderable(doc)) {
193
241
  return null;
194
242
  }
195
- const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
243
+ const reader = new Readability(doc, { maxElemsToParse: 20_000 });
196
244
  const parsed = reader.parse();
197
245
  if (!parsed)
198
246
  return null;
@@ -218,7 +266,8 @@ export function extractContent(html, url, options = {
218
266
  }
219
267
  function extractContentWithDocument(html, url, options) {
220
268
  if (!isValidInput(html, url)) {
221
- return { article: null, metadata: {} };
269
+ const { document } = parseHTML('<html></html>');
270
+ return { article: null, metadata: {}, document };
222
271
  }
223
272
  return tryExtractContent(html, url, options);
224
273
  }
@@ -233,7 +282,8 @@ function handleExtractionFailure(error, url, signal) {
233
282
  }
234
283
  throwIfAborted(signal, url, 'extract:error');
235
284
  logError('Failed to extract content', error instanceof Error ? error : undefined);
236
- return { article: null, metadata: {} };
285
+ const { document } = parseHTML('<html></html>');
286
+ return { article: null, metadata: {}, document };
237
287
  }
238
288
  function extractContentStages(html, url, options) {
239
289
  throwIfAborted(options.signal, url, 'extract:begin');
@@ -248,7 +298,8 @@ function extractContentStages(html, url, options) {
248
298
  return {
249
299
  article,
250
300
  metadata,
251
- ...(truncatedHtml.length === html.length ? { document } : {}),
301
+ document,
302
+ ...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
252
303
  };
253
304
  }
254
305
  function tryExtractContent(html, url, options) {
@@ -285,532 +336,7 @@ function applyBaseUri(document, url) {
285
336
  });
286
337
  }
287
338
  }
288
- function containsJsxTag(code) {
289
- for (let index = 0; index < code.length - 1; index += 1) {
290
- if (code[index] !== '<')
291
- continue;
292
- const next = code[index + 1];
293
- if (!next)
294
- continue;
295
- if (next >= 'A' && next <= 'Z')
296
- return true;
297
- }
298
- return false;
299
- }
300
- function containsWord(source, word) {
301
- let startIndex = source.indexOf(word);
302
- while (startIndex !== -1) {
303
- const before = startIndex === 0 ? '' : source[startIndex - 1];
304
- const afterIndex = startIndex + word.length;
305
- const after = afterIndex >= source.length ? '' : source[afterIndex];
306
- if (!isWordChar(before) && !isWordChar(after))
307
- return true;
308
- startIndex = source.indexOf(word, startIndex + word.length);
309
- }
310
- return false;
311
- }
312
- function splitLines(content) {
313
- return content.split('\n');
314
- }
315
- function extractLanguageFromClassName(className) {
316
- const tokens = className.match(/\S+/g);
317
- if (!tokens)
318
- return undefined;
319
- for (const token of tokens) {
320
- const lower = token.toLowerCase();
321
- if (lower.startsWith('language-'))
322
- return token.slice('language-'.length);
323
- if (lower.startsWith('lang-'))
324
- return token.slice('lang-'.length);
325
- if (lower.startsWith('highlight-')) {
326
- return token.slice('highlight-'.length);
327
- }
328
- }
329
- if (tokens.includes('hljs')) {
330
- const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
331
- if (langClass)
332
- return langClass;
333
- }
334
- return undefined;
335
- }
336
- function resolveLanguageFromDataAttribute(dataLang) {
337
- const trimmed = dataLang.trim();
338
- if (!trimmed)
339
- return undefined;
340
- for (const char of trimmed) {
341
- if (!isWordChar(char))
342
- return undefined;
343
- }
344
- return trimmed;
345
- }
346
- function isWordChar(char) {
347
- if (!char)
348
- return false;
349
- const code = char.charCodeAt(0);
350
- return ((code >= 48 && code <= 57) ||
351
- (code >= 65 && code <= 90) ||
352
- (code >= 97 && code <= 122) ||
353
- char === '_');
354
- }
355
- const LANGUAGE_PATTERNS = [
356
- {
357
- language: 'jsx',
358
- pattern: {
359
- keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
360
- custom: (code) => containsJsxTag(code),
361
- },
362
- },
363
- {
364
- language: 'typescript',
365
- pattern: {
366
- wordBoundary: ['interface', 'type'],
367
- custom: (_, lower) => [
368
- ': string',
369
- ':string',
370
- ': number',
371
- ':number',
372
- ': boolean',
373
- ':boolean',
374
- ': void',
375
- ':void',
376
- ': any',
377
- ':any',
378
- ': unknown',
379
- ':unknown',
380
- ': never',
381
- ':never',
382
- ].some((hint) => lower.includes(hint)),
383
- },
384
- },
385
- {
386
- language: 'rust',
387
- pattern: {
388
- regex: /\b(?:fn|impl|struct|enum)\b/,
389
- keywords: ['let mut'],
390
- custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
391
- },
392
- },
393
- {
394
- language: 'javascript',
395
- pattern: {
396
- regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
397
- },
398
- },
399
- {
400
- language: 'python',
401
- pattern: {
402
- regex: /\b(?:def|class|import|from)\b/,
403
- keywords: ['print(', '__name__'],
404
- },
405
- },
406
- {
407
- language: 'bash',
408
- pattern: {
409
- custom: (code) => detectBashIndicators(code),
410
- },
411
- },
412
- {
413
- language: 'css',
414
- pattern: {
415
- regex: /@media|@import|@keyframes/,
416
- custom: (code) => detectCssStructure(code),
417
- },
418
- },
419
- {
420
- language: 'html',
421
- pattern: {
422
- keywords: [
423
- '<!doctype',
424
- '<html',
425
- '<head',
426
- '<body',
427
- '<div',
428
- '<span',
429
- '<p',
430
- '<a',
431
- '<script',
432
- '<style',
433
- ],
434
- },
435
- },
436
- {
437
- language: 'json',
438
- pattern: {
439
- startsWith: ['{', '['],
440
- },
441
- },
442
- {
443
- language: 'yaml',
444
- pattern: {
445
- custom: (code) => detectYamlStructure(code),
446
- },
447
- },
448
- {
449
- language: 'sql',
450
- pattern: {
451
- wordBoundary: [
452
- 'select',
453
- 'insert',
454
- 'update',
455
- 'delete',
456
- 'create',
457
- 'alter',
458
- 'drop',
459
- ],
460
- },
461
- },
462
- {
463
- language: 'go',
464
- pattern: {
465
- wordBoundary: ['package', 'func'],
466
- keywords: ['import "'],
467
- },
468
- },
469
- ];
470
- // Bash detection constants
471
- const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
472
- const BASH_PKG_MANAGERS = [
473
- 'npm',
474
- 'yarn',
475
- 'pnpm',
476
- 'npx',
477
- 'brew',
478
- 'apt',
479
- 'pip',
480
- 'cargo',
481
- 'go',
482
- ];
483
- const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
484
- function isShellPrefix(line) {
485
- return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
486
- }
487
- function matchesBashCommand(line) {
488
- return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
489
- }
490
- function matchesPackageManagerVerb(line) {
491
- for (const mgr of BASH_PKG_MANAGERS) {
492
- if (!line.startsWith(`${mgr} `))
493
- continue;
494
- const rest = line.slice(mgr.length + 1);
495
- if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
496
- return true;
497
- }
498
- }
499
- return false;
500
- }
501
- function detectBashIndicators(code) {
502
- for (const line of splitLines(code)) {
503
- const trimmed = line.trimStart();
504
- if (!trimmed)
505
- continue;
506
- if (isShellPrefix(trimmed) ||
507
- matchesBashCommand(trimmed) ||
508
- matchesPackageManagerVerb(trimmed)) {
509
- return true;
510
- }
511
- }
512
- return false;
513
- }
514
- function detectCssStructure(code) {
515
- for (const line of splitLines(code)) {
516
- const trimmed = line.trimStart();
517
- if (!trimmed)
518
- continue;
519
- const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
520
- trimmed.includes('{');
521
- const isProperty = trimmed.includes(':') && trimmed.includes(';');
522
- if (isSelector || isProperty)
523
- return true;
524
- }
525
- return false;
526
- }
527
- function detectYamlStructure(code) {
528
- for (const line of splitLines(code)) {
529
- const trimmed = line.trim();
530
- if (!trimmed)
531
- continue;
532
- const colonIdx = trimmed.indexOf(':');
533
- if (colonIdx <= 0)
534
- continue;
535
- const after = trimmed[colonIdx + 1];
536
- if (after === ' ' || after === '\t')
537
- return true;
538
- }
539
- return false;
540
- }
541
- function matchesLanguagePattern(code, lower, pattern) {
542
- if (pattern.keywords?.some((kw) => lower.includes(kw)))
543
- return true;
544
- if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
545
- return true;
546
- if (pattern.regex?.test(lower))
547
- return true;
548
- if (pattern.startsWith) {
549
- const trimmed = code.trimStart();
550
- if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
551
- return true;
552
- }
553
- if (pattern.custom?.(code, lower))
554
- return true;
555
- return false;
556
- }
557
- export function detectLanguageFromCode(code) {
558
- const lower = code.toLowerCase();
559
- for (const { language, pattern } of LANGUAGE_PATTERNS) {
560
- if (matchesLanguagePattern(code, lower, pattern))
561
- return language;
562
- }
563
- return undefined;
564
- }
565
- export function resolveLanguageFromAttributes(className, dataLang) {
566
- const classMatch = extractLanguageFromClassName(className);
567
- return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
568
- }
569
- function isElement(node) {
570
- return (isRecord(node) &&
571
- 'getAttribute' in node &&
572
- typeof node.getAttribute === 'function');
573
- }
574
- const STRUCTURAL_TAGS = new Set([
575
- 'script',
576
- 'style',
577
- 'noscript',
578
- 'iframe',
579
- 'form',
580
- 'button',
581
- 'input',
582
- 'select',
583
- 'textarea',
584
- 'svg',
585
- ]);
586
- const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
587
- const NAVIGATION_ROLES = new Set([
588
- 'navigation',
589
- 'banner',
590
- 'complementary',
591
- 'contentinfo',
592
- 'tree',
593
- 'menubar',
594
- 'menu',
595
- 'dialog',
596
- 'alertdialog',
597
- 'search',
598
- ]);
599
- const PROMO_TOKENS = new Set([
600
- 'banner',
601
- 'promo',
602
- 'announcement',
603
- 'cta',
604
- 'callout',
605
- 'advert',
606
- 'ad',
607
- 'ads',
608
- 'sponsor',
609
- 'newsletter',
610
- 'subscribe',
611
- 'cookie',
612
- 'consent',
613
- 'popup',
614
- 'modal',
615
- 'overlay',
616
- 'toast',
617
- 'share',
618
- 'social',
619
- 'related',
620
- 'recommend',
621
- 'comment',
622
- 'breadcrumb',
623
- 'pagination',
624
- 'pager',
625
- 'taglist',
626
- ]);
627
- const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
628
- const FIXED_PATTERN = /\b(fixed|sticky)\b/;
629
- const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
630
- const ISOLATE_PATTERN = /\bisolate\b/;
631
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
632
- const NOISE_MARKERS = [
633
- '<script',
634
- '<style',
635
- '<noscript',
636
- '<iframe',
637
- '<nav',
638
- '<footer',
639
- '<aside',
640
- '<header',
641
- '<form',
642
- '<button',
643
- '<input',
644
- '<select',
645
- '<textarea',
646
- '<svg',
647
- '<canvas',
648
- ' aria-hidden="true"',
649
- " aria-hidden='true'",
650
- ' hidden',
651
- ' role="navigation"',
652
- " role='navigation'",
653
- ' role="banner"',
654
- " role='banner'",
655
- ' role="complementary"',
656
- " role='complementary'",
657
- ' role="contentinfo"',
658
- " role='contentinfo'",
659
- ' role="tree"',
660
- " role='tree'",
661
- ' role="menubar"',
662
- " role='menubar'",
663
- ' role="menu"',
664
- " role='menu'",
665
- ' banner',
666
- ' promo',
667
- ' announcement',
668
- ' cta',
669
- ' callout',
670
- ' advert',
671
- ' newsletter',
672
- ' subscribe',
673
- ' cookie',
674
- ' consent',
675
- ' popup',
676
- ' modal',
677
- ' overlay',
678
- ' toast',
679
- ' fixed',
680
- ' sticky',
681
- ' z-50',
682
- ' z-4',
683
- ' isolate',
684
- ' breadcrumb',
685
- ' pagination',
686
- ];
687
- function mayContainNoise(html) {
688
- const haystack = html.toLowerCase();
689
- return NOISE_MARKERS.some((marker) => haystack.includes(marker));
690
- }
691
- function isFullDocumentHtml(html) {
692
- return HTML_DOCUMENT_MARKERS.test(html);
693
- }
694
- function isStructuralNoiseTag(tagName) {
695
- return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
696
- }
697
- function isElementHidden(element) {
698
- const style = element.getAttribute('style') ?? '';
699
- return (element.getAttribute('hidden') !== null ||
700
- element.getAttribute('aria-hidden') === 'true' ||
701
- /\bdisplay\s*:\s*none\b/i.test(style) ||
702
- /\bvisibility\s*:\s*hidden\b/i.test(style));
703
- }
704
- function hasNoiseRole(role) {
705
- return role !== null && NAVIGATION_ROLES.has(role);
706
- }
707
- function tokenizeIdentifierLikeText(value) {
708
- return value
709
- .toLowerCase()
710
- .replace(/[^a-z0-9]+/g, ' ')
711
- .trim()
712
- .split(' ')
713
- .filter(Boolean);
714
- }
715
- function matchesPromoIdOrClass(className, id) {
716
- const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
717
- return tokens.some((token) => PROMO_TOKENS.has(token));
718
- }
719
- function matchesFixedOrHighZIsolate(className) {
720
- return (FIXED_PATTERN.test(className) ||
721
- (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
722
- }
723
- function readElementMetadata(element) {
724
- return {
725
- tagName: element.tagName.toLowerCase(),
726
- className: element.getAttribute('class') ?? '',
727
- id: element.getAttribute('id') ?? '',
728
- role: element.getAttribute('role'),
729
- isHidden: isElementHidden(element),
730
- };
731
- }
732
- function isBoilerplateHeader({ className, id, role, }) {
733
- if (hasNoiseRole(role))
734
- return true;
735
- const combined = `${className} ${id}`.toLowerCase();
736
- return HEADER_NOISE_PATTERN.test(combined);
737
- }
738
- function isNoiseElement(node) {
739
- const metadata = readElementMetadata(node);
740
- return (isStructuralNoiseTag(metadata.tagName) ||
741
- ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
742
- (metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
743
- metadata.isHidden ||
744
- hasNoiseRole(metadata.role) ||
745
- matchesFixedOrHighZIsolate(metadata.className) ||
746
- matchesPromoIdOrClass(metadata.className, metadata.id));
747
- }
748
- function removeNoiseNodes(nodes) {
749
- for (let index = nodes.length - 1; index >= 0; index -= 1) {
750
- const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
751
- if (!node)
752
- continue;
753
- if (isElement(node) && isNoiseElement(node)) {
754
- node.remove();
755
- }
756
- }
757
- }
758
- function stripNoiseNodes(document) {
759
- // Use targeted selectors for common noise elements instead of querySelectorAll('*')
760
- const targetSelectors = [
761
- 'nav',
762
- 'footer',
763
- 'aside',
764
- 'header[class*="site"]',
765
- 'header[class*="nav"]',
766
- 'header[class*="menu"]',
767
- '[role="banner"]',
768
- '[role="navigation"]',
769
- '[role="dialog"]',
770
- '[style*="display: none"]',
771
- '[style*="display:none"]',
772
- '[hidden]',
773
- '[aria-hidden="true"]',
774
- ].join(',');
775
- const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
776
- // Remove in reverse order to handle nested elements correctly
777
- removeNoiseNodes(potentialNoiseNodes);
778
- // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
779
- const candidateSelectors = [
780
- ...STRUCTURAL_TAGS,
781
- ...ALWAYS_NOISE_TAGS,
782
- 'header',
783
- 'canvas',
784
- '[class]',
785
- '[id]',
786
- '[role]',
787
- '[style]',
788
- ].join(',');
789
- const allElements = document.querySelectorAll(candidateSelectors);
790
- removeNoiseNodes(allElements);
791
- }
792
- function removeNoiseFromHtml(html, document) {
793
- const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
794
- if (!shouldParse)
795
- return html;
796
- try {
797
- const resolvedDocument = document ?? parseHTML(html).document;
798
- stripNoiseNodes(resolvedDocument);
799
- const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
800
- if (bodyInnerHtml)
801
- return bodyInnerHtml;
802
- const docToString = getDocumentToString(resolvedDocument);
803
- if (docToString)
804
- return docToString();
805
- const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
806
- if (documentElementOuterHtml)
807
- return documentElementOuterHtml;
808
- return html;
809
- }
810
- catch {
811
- return html;
812
- }
813
- }
339
+ // DOM noise removal functions moved to ./dom-noise-removal.ts
814
340
  function buildInlineCode(content) {
815
341
  const runs = content.match(/`+/g);
816
342
  let longest = '';
@@ -821,8 +347,11 @@ function buildInlineCode(content) {
821
347
  }
822
348
  }
823
349
  }
350
+ // Use a fence longer than any run of backticks in the content.
824
351
  const delimiter = `\`${longest}`;
825
- const padding = delimiter.length > 1 ? ' ' : '';
352
+ // Only pad when needed to avoid altering code spans unnecessarily.
353
+ // CommonMark recommends padding when the code starts/ends with a backtick.
354
+ const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
826
355
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
827
356
  }
828
357
  function deriveAltFromImageUrl(src) {
@@ -845,16 +374,13 @@ function deriveAltFromImageUrl(src) {
845
374
  }
846
375
  }
847
376
  function isCodeBlock(parent) {
848
- if (!isRecord(parent))
377
+ if (!isObject(parent))
849
378
  return false;
850
379
  const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
851
380
  return ['PRE', 'WRAPPED-PRE'].includes(tagName);
852
381
  }
853
382
  function hasGetAttribute(value) {
854
- return isRecord(value) && typeof value.getAttribute === 'function';
855
- }
856
- function hasCodeBlockTranslators(value) {
857
- return isRecord(value) && isRecord(value.codeBlockTranslators);
383
+ return isObject(value) && typeof value.getAttribute === 'function';
858
384
  }
859
385
  function buildInlineCodeTranslator() {
860
386
  return {
@@ -871,37 +397,19 @@ function resolveAttributeLanguage(node) {
871
397
  const dataLanguage = getAttribute?.('data-language') ?? '';
872
398
  return resolveLanguageFromAttributes(className, dataLanguage);
873
399
  }
874
- function resolveCodeBlockTranslators(visitor) {
875
- const childTranslators = isRecord(visitor) ? visitor.instance : null;
876
- return hasCodeBlockTranslators(childTranslators)
877
- ? childTranslators.codeBlockTranslators
878
- : null;
879
- }
880
- function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
881
- return {
882
- noEscape: true,
883
- preserveWhitespace: true,
884
- ...(codeBlockTranslators
885
- ? { childTranslators: codeBlockTranslators }
886
- : null),
887
- postprocess: ({ content }) => {
888
- const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
889
- return CODE_BLOCK.format(content, language);
890
- },
891
- };
892
- }
893
400
  function buildCodeTranslator(ctx) {
894
- if (!isRecord(ctx))
401
+ if (!isObject(ctx))
895
402
  return buildInlineCodeTranslator();
896
- const { node, parent, visitor } = ctx;
403
+ const { parent } = ctx;
897
404
  if (!isCodeBlock(parent))
898
405
  return buildInlineCodeTranslator();
899
- const attributeLanguage = resolveAttributeLanguage(node);
900
- const codeBlockTranslators = resolveCodeBlockTranslators(visitor);
901
- return buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators);
406
+ return {
407
+ noEscape: true,
408
+ preserveWhitespace: true,
409
+ };
902
410
  }
903
411
  function buildImageTranslator(ctx) {
904
- if (!isRecord(ctx))
412
+ if (!isObject(ctx))
905
413
  return { content: '' };
906
414
  const { node } = ctx;
907
415
  const getAttribute = hasGetAttribute(node)
@@ -914,19 +422,57 @@ function buildImageTranslator(ctx) {
914
422
  content: `![${alt}](${src})`,
915
423
  };
916
424
  }
425
+ function findLanguageFromCodeChild(node) {
426
+ if (!isObject(node))
427
+ return undefined;
428
+ const { childNodes } = node;
429
+ if (!Array.isArray(childNodes))
430
+ return undefined;
431
+ for (const child of childNodes) {
432
+ if (!isObject(child))
433
+ continue;
434
+ const tagName = typeof child.rawTagName === 'string'
435
+ ? child.rawTagName.toUpperCase()
436
+ : '';
437
+ if (tagName === 'CODE') {
438
+ return resolveAttributeLanguage(child);
439
+ }
440
+ }
441
+ return undefined;
442
+ }
443
+ function createCodeBlockPostprocessor(language) {
444
+ return ({ content }) => {
445
+ const trimmed = content.trim();
446
+ if (!trimmed)
447
+ return '';
448
+ const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
449
+ return CODE_BLOCK.format(trimmed, resolvedLanguage);
450
+ };
451
+ }
452
+ function buildPreTranslator(ctx) {
453
+ if (!isObject(ctx))
454
+ return {};
455
+ const { node } = ctx;
456
+ const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
457
+ return {
458
+ noEscape: true,
459
+ preserveWhitespace: true,
460
+ postprocess: createCodeBlockPostprocessor(attributeLanguage),
461
+ };
462
+ }
917
463
  function createCustomTranslators() {
918
464
  return {
919
465
  code: (ctx) => buildCodeTranslator(ctx),
920
466
  img: (ctx) => buildImageTranslator(ctx),
921
467
  dl: (ctx) => {
922
- if (!isRecord(ctx) || !isRecord(ctx.node)) {
468
+ if (!isObject(ctx) || !isObject(ctx.node)) {
923
469
  return { content: '' };
924
470
  }
925
471
  const node = ctx.node;
926
472
  const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
927
473
  const items = childNodes
928
474
  .map((child) => {
929
- if (!isRecord(child))
475
+ if (!isObject(child))
930
476
  return '';
931
477
  const nodeName = typeof child.nodeName === 'string'
932
478
  ? child.nodeName.toUpperCase()
@@ -956,6 +502,8 @@ function createCustomTranslators() {
956
502
  sup: () => ({
957
503
  postprocess: ({ content }) => `^${content}^`,
958
504
  }),
505
+ // Fix #6: Handle <pre> without <code> - wrap in fenced code block
506
+ pre: (ctx) => buildPreTranslator(ctx),
959
507
  };
960
508
  }
961
509
  let markdownInstance = null;
@@ -971,9 +519,11 @@ function getMarkdownConverter() {
971
519
  markdownInstance ??= createMarkdownInstance();
972
520
  return markdownInstance;
973
521
  }
974
- function translateHtmlToMarkdown(html, url, signal, document) {
522
+ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
975
523
  throwIfAborted(signal, url, 'markdown:begin');
976
- const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document));
524
+ const cleanedHtml = skipNoiseRemoval
525
+ ? html
526
+ : runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
977
527
  throwIfAborted(signal, url, 'markdown:cleaned');
978
528
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
979
529
  throwIfAborted(signal, url, 'markdown:translated');
@@ -989,151 +539,18 @@ export function htmlToMarkdown(html, metadata, options) {
989
539
  if (!html)
990
540
  return buildMetadataFooter(metadata, url);
991
541
  try {
992
- const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
542
+ const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
993
543
  return appendMetadataFooter(content, metadata, url);
994
544
  }
995
545
  catch (error) {
996
546
  if (error instanceof FetchError) {
997
547
  throw error;
998
548
  }
549
+ logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
999
550
  return buildMetadataFooter(metadata, url);
1000
551
  }
1001
552
  }
1002
- function cleanupMarkdownArtifacts(content) {
1003
- let result = content;
1004
- const fixOrphanHeadings = (text) => {
1005
- return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
1006
- if (typeof prefix !== 'string' ||
1007
- typeof hashes !== 'string' ||
1008
- typeof heading !== 'string') {
1009
- return match;
1010
- }
1011
- if (heading.length > 150) {
1012
- return match;
1013
- }
1014
- const trimmedPrefix = prefix.trim();
1015
- if (trimmedPrefix === '') {
1016
- return `${hashes} ${heading}\n\n`;
1017
- }
1018
- return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
1019
- });
1020
- };
1021
- result = fixOrphanHeadings(result);
1022
- result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
1023
- const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
1024
- result = result.replace(zeroWidthAnchorLink, '');
1025
- result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
1026
- result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
1027
- result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
1028
- result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
1029
- result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
1030
- result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
1031
- const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
1032
- const lines = result.split('\n');
1033
- const filtered = [];
1034
- let skipTocBlock = false;
1035
- for (let i = 0; i < lines.length; i += 1) {
1036
- const line = lines[i] ?? '';
1037
- const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
1038
- const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
1039
- if (tocLinkLine.test(line)) {
1040
- const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
1041
- const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
1042
- if (prevIsToc || nextIsToc) {
1043
- skipTocBlock = true;
1044
- continue;
1045
- }
1046
- }
1047
- else if (line.trim() === '' && skipTocBlock) {
1048
- skipTocBlock = false;
1049
- continue;
1050
- }
1051
- else {
1052
- skipTocBlock = false;
1053
- }
1054
- filtered.push(line);
1055
- }
1056
- result = filtered.join('\n');
1057
- result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
1058
- result = result.replace(/^Was this page helpful\??\s*$/gim, '');
1059
- result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
1060
- result = result.replace(/\\([[]])/g, '$1');
1061
- result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
1062
- result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
1063
- result = result.replace(/\n{3,}/g, '\n\n');
1064
- return result.trim();
1065
- }
1066
- const HEADING_KEYWORDS = new Set([
1067
- 'overview',
1068
- 'introduction',
1069
- 'summary',
1070
- 'conclusion',
1071
- 'prerequisites',
1072
- 'requirements',
1073
- 'installation',
1074
- 'configuration',
1075
- 'usage',
1076
- 'features',
1077
- 'limitations',
1078
- 'troubleshooting',
1079
- 'faq',
1080
- 'resources',
1081
- 'references',
1082
- 'changelog',
1083
- 'license',
1084
- 'acknowledgments',
1085
- 'appendix',
1086
- ]);
1087
- function isLikelyHeadingLine(line) {
1088
- const trimmed = line.trim();
1089
- if (!trimmed || trimmed.length > 80)
1090
- return false;
1091
- if (/^#{1,6}\s/.test(trimmed))
1092
- return false;
1093
- if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
1094
- return false;
1095
- if (/[.!?]$/.test(trimmed))
1096
- return false;
1097
- if (/^\[.*\]\(.*\)$/.test(trimmed))
1098
- return false;
1099
- if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
1100
- return true;
1101
- }
1102
- const words = trimmed.split(/\s+/);
1103
- if (words.length >= 2 && words.length <= 6) {
1104
- const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
1105
- if (isTitleCase)
1106
- return true;
1107
- }
1108
- if (words.length === 1) {
1109
- const lower = trimmed.toLowerCase();
1110
- if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
1111
- return true;
1112
- }
1113
- }
1114
- return false;
1115
- }
1116
- function promoteOrphanHeadings(markdown) {
1117
- const lines = markdown.split('\n');
1118
- const result = [];
1119
- for (let i = 0; i < lines.length; i += 1) {
1120
- const line = lines[i] ?? '';
1121
- const prevLine = i > 0 ? lines[i - 1] : '';
1122
- const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
1123
- const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
1124
- const isPrecededByBlank = prevLine?.trim() === '';
1125
- if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
1126
- const trimmed = line.trim();
1127
- const isExample = /^example:\s/i.test(trimmed);
1128
- const prefix = isExample ? '### ' : '## ';
1129
- result.push(prefix + trimmed);
1130
- }
1131
- else {
1132
- result.push(line);
1133
- }
1134
- }
1135
- return result.join('\n');
1136
- }
553
+ // Markdown cleanup functions moved to ./markdown-cleanup.ts
1137
554
  function formatFetchedDate(isoString) {
1138
555
  try {
1139
556
  const date = new Date(isoString);
@@ -1382,54 +799,114 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
1382
799
  const MIN_CONTENT_RATIO = 0.3;
1383
800
  const MIN_HTML_LENGTH_FOR_GATE = 100;
1384
801
  const MIN_HEADING_RETENTION_RATIO = 0.7;
1385
- function countHeadings(html) {
1386
- if (!html)
1387
- return 0;
1388
- // Match opening heading tags <h1> through <h6>
1389
- const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
1390
- const matches = html.match(headingPattern);
1391
- return matches ? matches.length : 0;
1392
- }
1393
- function isHeadingStructurePreserved(article, originalHtml) {
1394
- if (!article)
1395
- return false;
1396
- // Cache heading counts to avoid duplicate regex matching
1397
- const originalHeadingCount = countHeadings(originalHtml);
1398
- const articleHeadingCount = countHeadings(article.content);
1399
- // If original has no headings, structure is trivially preserved
1400
- if (originalHeadingCount === 0)
1401
- return true;
1402
- // If article lost >50% of headings, structure is broken
1403
- const retentionRatio = articleHeadingCount / originalHeadingCount;
1404
- return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
1405
- }
1406
- function stripHtmlTagsForLength(html) {
1407
- let result = '';
1408
- let inTag = false;
1409
- for (const char of html) {
1410
- if (char === '<') {
1411
- inTag = true;
1412
- }
1413
- else if (char === '>') {
1414
- inTag = false;
802
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
803
+ /**
804
+ * Count headings using DOM querySelectorAll.
805
+ * Handles nested content like <h2><span>Text</span></h2> correctly.
806
+ */
807
+ function countHeadingsDom(htmlOrDocument) {
808
+ if (typeof htmlOrDocument === 'string') {
809
+ // Wrap fragments in document structure for proper parsing
810
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
811
+ ? wrapHtmlFragment(htmlOrDocument)
812
+ : htmlOrDocument;
813
+ const { document: doc } = parseHTML(htmlToParse);
814
+ return doc.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
815
+ }
816
+ return htmlOrDocument.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
817
+ }
818
+ function countCodeBlocksDom(htmlOrDocument) {
819
+ if (typeof htmlOrDocument === 'string') {
820
+ // Wrap fragments in document structure for proper parsing
821
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
822
+ ? wrapHtmlFragment(htmlOrDocument)
823
+ : htmlOrDocument;
824
+ const { document: doc } = parseHTML(htmlToParse);
825
+ return doc.querySelectorAll('pre').length;
826
+ }
827
+ return htmlOrDocument.querySelectorAll('pre').length;
828
+ }
829
+ /**
830
+ * Check if HTML string needs document wrapper for proper parsing.
831
+ * Fragments without doctype/html/body tags need wrapping.
832
+ */
833
+ function needsDocumentWrapper(html) {
834
+ const trimmed = html.trim().toLowerCase();
835
+ return (!trimmed.startsWith('<!doctype') &&
836
+ !trimmed.startsWith('<html') &&
837
+ !trimmed.startsWith('<body'));
838
+ }
839
+ /**
840
+ * Wrap HTML fragment in minimal document structure for proper parsing.
841
+ */
842
+ function wrapHtmlFragment(html) {
843
+ return `<!DOCTYPE html><html><body>${html}</body></html>`;
844
+ }
845
+ /**
846
+ * Get visible text length from HTML, excluding script/style/noscript content.
847
+ * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
848
+ */
849
+ function getVisibleTextLength(htmlOrDocument) {
850
+ // For string input, parse the HTML
851
+ if (typeof htmlOrDocument === 'string') {
852
+ // Wrap fragments in document structure for proper parsing
853
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
854
+ ? wrapHtmlFragment(htmlOrDocument)
855
+ : htmlOrDocument;
856
+ const { document: doc } = parseHTML(htmlToParse);
857
+ // Remove non-visible content that inflates text length
858
+ for (const el of doc.querySelectorAll('script,style,noscript')) {
859
+ el.remove();
1415
860
  }
1416
- else if (!inTag) {
1417
- result += char;
1418
- }
1419
- }
1420
- return result;
1421
- }
1422
- export function isExtractionSufficient(article, originalHtml) {
861
+ // Get text content from body or documentElement
862
+ // Note: linkedom may return null for body on HTML fragments despite types
863
+ const body = doc.body;
864
+ const docElement = doc.documentElement;
865
+ const text = body?.textContent ?? docElement?.textContent ?? '';
866
+ return text.replace(/\s+/g, ' ').trim().length;
867
+ }
868
+ // For Document input, clone to avoid mutation
869
+ const workDoc = htmlOrDocument.cloneNode(true);
870
+ // Remove non-visible content that inflates text length
871
+ for (const el of workDoc.querySelectorAll('script,style,noscript')) {
872
+ el.remove();
873
+ }
874
+ // Get text content from body or documentElement
875
+ // Note: linkedom may return null for body on HTML fragments despite types
876
+ const body = workDoc.body;
877
+ const docElement = workDoc.documentElement;
878
+ const text = body?.textContent ?? docElement?.textContent ?? '';
879
+ return text.replace(/\s+/g, ' ').trim().length;
880
+ }
881
+ export function isExtractionSufficient(article, originalHtmlOrDocument) {
1423
882
  if (!article)
1424
883
  return false;
1425
884
  const articleLength = article.textContent.length;
1426
- const originalLength = stripHtmlTagsForLength(originalHtml)
1427
- .replace(/\s+/g, ' ')
1428
- .trim().length;
885
+ // Use DOM-based visible text length to exclude script/style content
886
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
1429
887
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
1430
888
  return true;
1431
889
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
1432
890
  }
891
+ const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
892
+ const MAX_TRUNCATED_LINE_RATIO = 0.5;
893
+ /**
894
+ * Detect if extracted text has many truncated/incomplete sentences.
895
+ * Lines longer than 20 chars that don't end with sentence punctuation
896
+ * are considered potentially truncated.
897
+ */
898
+ function hasTruncatedSentences(text) {
899
+ const lines = text
900
+ .split('\n')
901
+ .filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
902
+ if (lines.length < 3)
903
+ return false;
904
+ const incompleteLines = lines.filter((line) => {
905
+ const trimmed = line.trim();
906
+ return !/[.!?:;]$/.test(trimmed);
907
+ });
908
+ return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
909
+ }
1433
910
  export function determineContentExtractionSource(article) {
1434
911
  return article !== null;
1435
912
  }
@@ -1459,17 +936,84 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
1459
936
  }
1460
937
  return metadata;
1461
938
  }
939
+ /**
940
+ * Content root selectors in priority order.
941
+ * These identify the main content area on a page.
942
+ */
943
+ const CONTENT_ROOT_SELECTORS = [
944
+ 'main',
945
+ 'article',
946
+ '[role="main"]',
947
+ '#content',
948
+ '#main-content',
949
+ '.content',
950
+ '.main-content',
951
+ '.post-content',
952
+ '.article-content',
953
+ '.entry-content',
954
+ '[itemprop="articleBody"]',
955
+ '[data-content]',
956
+ '.post-body',
957
+ '.article-body',
958
+ ];
959
+ /**
960
+ * Find the main content root element in a document.
961
+ * Returns the innerHTML if found, undefined otherwise.
962
+ */
963
+ function findContentRoot(document) {
964
+ for (const selector of CONTENT_ROOT_SELECTORS) {
965
+ const element = document.querySelector(selector);
966
+ if (!element)
967
+ continue;
968
+ // Check if element has meaningful content
969
+ const innerHTML = typeof element.innerHTML === 'string'
970
+ ? element.innerHTML
971
+ : undefined;
972
+ if (innerHTML && innerHTML.trim().length > 100) {
973
+ return innerHTML;
974
+ }
975
+ }
976
+ return undefined;
977
+ }
1462
978
  function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
1463
979
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
1464
- const source = {
1465
- sourceHtml: useArticleContent && article ? article.content : html,
1466
- title: useArticleContent && article ? article.title : extractedMeta.title,
980
+ // If using article content, return it directly
981
+ if (useArticleContent && article) {
982
+ return {
983
+ sourceHtml: article.content,
984
+ title: article.title,
985
+ metadata,
986
+ };
987
+ }
988
+ // Try content root fallback before using full HTML
989
+ if (document) {
990
+ // Apply noise removal to HTML first (without passing document) to get cleaned HTML,
991
+ // then parse and find content root. This prevents the aggressive DOM stripping that
992
+ // happens when noise removal is given the original parsed document.
993
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
994
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
995
+ const contentRoot = findContentRoot(cleanedDoc);
996
+ if (contentRoot) {
997
+ logDebug('Using content root fallback instead of full HTML', {
998
+ url: url.substring(0, 80),
999
+ contentLength: contentRoot.length,
1000
+ });
1001
+ return {
1002
+ sourceHtml: contentRoot,
1003
+ title: extractedMeta.title,
1004
+ metadata,
1005
+ // Skip noise removal - this HTML is already from a cleaned document
1006
+ skipNoiseRemoval: true,
1007
+ };
1008
+ }
1009
+ }
1010
+ // Fall back to full HTML
1011
+ return {
1012
+ sourceHtml: html,
1013
+ title: extractedMeta.title,
1467
1014
  metadata,
1015
+ ...(document ? { document } : {}),
1468
1016
  };
1469
- if (!useArticleContent && document) {
1470
- return { ...source, document };
1471
- }
1472
- return source;
1473
1017
  }
1474
1018
  function logQualityGateFallback({ url, articleLength, }) {
1475
1019
  logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
@@ -1477,22 +1021,54 @@ function logQualityGateFallback({ url, articleLength, }) {
1477
1021
  articleLength,
1478
1022
  });
1479
1023
  }
1480
- function shouldUseArticleContent(article, html, url) {
1481
- // Check content sufficiency (length-based quality gate)
1482
- if (!isExtractionSufficient(article, html)) {
1483
- logQualityGateFallback({
1484
- url,
1485
- articleLength: article.textContent.length,
1486
- });
1487
- return false;
1024
+ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1025
+ const articleLength = article.textContent.length;
1026
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
1027
+ // If the document is tiny, don't gate too aggressively.
1028
+ if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
1029
+ const ratio = articleLength / originalLength;
1030
+ if (ratio < MIN_CONTENT_RATIO) {
1031
+ logQualityGateFallback({ url, articleLength });
1032
+ return false;
1033
+ }
1488
1034
  }
1489
- // Check heading structure preservation
1490
- if (!isHeadingStructurePreserved(article, html)) {
1491
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1035
+ // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
1036
+ const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
1037
+ if (originalHeadings > 0) {
1038
+ const articleHeadings = countHeadingsDom(article.content);
1039
+ const retentionRatio = articleHeadings / originalHeadings;
1040
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
1041
+ logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1042
+ url: url.substring(0, 80),
1043
+ originalHeadings,
1044
+ articleHeadings,
1045
+ });
1046
+ return false;
1047
+ }
1048
+ }
1049
+ const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
1050
+ if (originalCodeBlocks > 0) {
1051
+ const articleCodeBlocks = countCodeBlocksDom(article.content);
1052
+ const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
1053
+ // Always log code block counts for debugging
1054
+ logDebug('Code block retention check', {
1492
1055
  url: url.substring(0, 80),
1493
- originalHeadings: countHeadings(html),
1494
- articleHeadings: countHeadings(article.content),
1056
+ originalCodeBlocks,
1057
+ articleCodeBlocks,
1058
+ codeRetentionRatio,
1495
1059
  });
1060
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
1061
+ logDebug('Quality gate: Readability removed code blocks, using full HTML', {
1062
+ url: url.substring(0, 80),
1063
+ originalCodeBlocks,
1064
+ articleCodeBlocks,
1065
+ });
1066
+ return false;
1067
+ }
1068
+ }
1069
+ // Layout extraction issue: truncated/fragmented lines.
1070
+ if (hasTruncatedSentences(article.textContent)) {
1071
+ logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
1496
1072
  return false;
1497
1073
  }
1498
1074
  return true;
@@ -1502,8 +1078,9 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1502
1078
  extractArticle: true,
1503
1079
  ...(signal ? { signal } : {}),
1504
1080
  });
1081
+ const originalDocument = parseHTML(html).document;
1505
1082
  const useArticleContent = article
1506
- ? shouldUseArticleContent(article, html, url)
1083
+ ? shouldUseArticleContent(article, originalDocument, url)
1507
1084
  : false;
1508
1085
  return buildContentSource({
1509
1086
  html,
@@ -1512,7 +1089,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1512
1089
  extractedMeta,
1513
1090
  includeMetadata,
1514
1091
  useArticleContent,
1515
- ...(document ? { document } : {}),
1092
+ document,
1516
1093
  });
1517
1094
  }
1518
1095
  function tryTransformRawStage(html, url, includeMetadata) {
@@ -1535,6 +1112,7 @@ function buildMarkdownFromContext(context, url, signal) {
1535
1112
  url,
1536
1113
  ...(signal ? { signal } : {}),
1537
1114
  ...(context.document ? { document: context.document } : {}),
1115
+ ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1538
1116
  }));
1539
1117
  return {
1540
1118
  markdown: content,
@@ -1628,6 +1206,12 @@ class WorkerPool {
1628
1206
  timeoutMs;
1629
1207
  queueMax;
1630
1208
  closed = false;
1209
+ createAbortError(url, stage) {
1210
+ return new FetchError('Request was canceled', url, 499, {
1211
+ reason: 'aborted',
1212
+ stage,
1213
+ });
1214
+ }
1631
1215
  ensureOpen() {
1632
1216
  if (this.closed) {
1633
1217
  throw new Error('Transform worker pool closed');
@@ -1636,10 +1220,7 @@ class WorkerPool {
1636
1220
  ensureNotAborted(signal, url, stage) {
1637
1221
  if (!signal?.aborted)
1638
1222
  return;
1639
- throw new FetchError('Request was canceled', url, 499, {
1640
- reason: 'aborted',
1641
- stage,
1642
- });
1223
+ throw this.createAbortError(url, stage);
1643
1224
  }
1644
1225
  ensureQueueCapacity(url) {
1645
1226
  if (this.queue.length < this.queueMax)
@@ -1704,10 +1285,7 @@ class WorkerPool {
1704
1285
  abortInflightTask(id, url, workerIndex) {
1705
1286
  const slot = this.workers[workerIndex];
1706
1287
  this.cancelWorkerTask(slot, id);
1707
- this.failTask(id, new FetchError('Request was canceled', url, 499, {
1708
- reason: 'aborted',
1709
- stage: 'transform:signal-abort',
1710
- }));
1288
+ this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
1711
1289
  if (slot) {
1712
1290
  this.restartWorker(workerIndex, slot);
1713
1291
  }
@@ -1717,10 +1295,7 @@ class WorkerPool {
1717
1295
  if (queuedIndex === -1)
1718
1296
  return;
1719
1297
  this.queue.splice(queuedIndex, 1);
1720
- reject(new FetchError('Request was canceled', url, 499, {
1721
- reason: 'aborted',
1722
- stage: 'transform:queued-abort',
1723
- }));
1298
+ reject(this.createAbortError(url, 'transform:queued-abort'));
1724
1299
  }
1725
1300
  createWorkerSlot(worker) {
1726
1301
  return {
@@ -1876,10 +1451,7 @@ class WorkerPool {
1876
1451
  if (!task.signal?.aborted)
1877
1452
  return false;
1878
1453
  this.clearAbortListener(task.signal, task.abortListener);
1879
- task.reject(new FetchError('Request was canceled', task.url, 499, {
1880
- reason: 'aborted',
1881
- stage: 'transform:dispatch',
1882
- }));
1454
+ task.reject(this.createAbortError(task.url, 'transform:dispatch'));
1883
1455
  return true;
1884
1456
  }
1885
1457
  markSlotBusy(slot, task) {