@j0hanz/superfetch 2.2.0 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +363 -614
  2. package/dist/cache.d.ts +2 -2
  3. package/dist/cache.d.ts.map +1 -1
  4. package/dist/cache.js +49 -227
  5. package/dist/cache.js.map +1 -1
  6. package/dist/config.d.ts +6 -0
  7. package/dist/config.d.ts.map +1 -1
  8. package/dist/config.js +20 -27
  9. package/dist/config.js.map +1 -1
  10. package/dist/dom-noise-removal.d.ts +6 -0
  11. package/dist/dom-noise-removal.d.ts.map +1 -0
  12. package/dist/dom-noise-removal.js +482 -0
  13. package/dist/dom-noise-removal.js.map +1 -0
  14. package/dist/errors.d.ts.map +1 -1
  15. package/dist/errors.js +8 -5
  16. package/dist/errors.js.map +1 -1
  17. package/dist/fetch.d.ts.map +1 -1
  18. package/dist/fetch.js +26 -32
  19. package/dist/fetch.js.map +1 -1
  20. package/dist/http-native.d.ts +6 -0
  21. package/dist/http-native.d.ts.map +1 -0
  22. package/dist/http-native.js +645 -0
  23. package/dist/http-native.js.map +1 -0
  24. package/dist/http-utils.d.ts +61 -0
  25. package/dist/http-utils.d.ts.map +1 -0
  26. package/dist/http-utils.js +252 -0
  27. package/dist/http-utils.js.map +1 -0
  28. package/dist/index.js +1 -1
  29. package/dist/index.js.map +1 -1
  30. package/dist/instructions.md +41 -39
  31. package/dist/json.d.ts +2 -0
  32. package/dist/json.d.ts.map +1 -0
  33. package/dist/json.js +30 -0
  34. package/dist/json.js.map +1 -0
  35. package/dist/language-detection.d.ts +13 -0
  36. package/dist/language-detection.d.ts.map +1 -0
  37. package/dist/language-detection.js +283 -0
  38. package/dist/language-detection.js.map +1 -0
  39. package/dist/markdown-cleanup.d.ts +19 -0
  40. package/dist/markdown-cleanup.d.ts.map +1 -0
  41. package/dist/markdown-cleanup.js +283 -0
  42. package/dist/markdown-cleanup.js.map +1 -0
  43. package/dist/observability.d.ts +1 -0
  44. package/dist/observability.d.ts.map +1 -1
  45. package/dist/observability.js +10 -0
  46. package/dist/observability.js.map +1 -1
  47. package/dist/tools.d.ts.map +1 -1
  48. package/dist/tools.js +23 -8
  49. package/dist/tools.js.map +1 -1
  50. package/dist/transform-types.d.ts +81 -0
  51. package/dist/transform-types.d.ts.map +1 -0
  52. package/dist/transform-types.js +6 -0
  53. package/dist/transform-types.js.map +1 -0
  54. package/dist/transform.d.ts +8 -52
  55. package/dist/transform.d.ts.map +1 -1
  56. package/dist/transform.js +419 -825
  57. package/dist/transform.js.map +1 -1
  58. package/dist/type-guards.d.ts +1 -1
  59. package/dist/type-guards.d.ts.map +1 -1
  60. package/dist/type-guards.js +1 -1
  61. package/dist/type-guards.js.map +1 -1
  62. package/dist/workers/transform-worker.js +23 -24
  63. package/dist/workers/transform-worker.js.map +1 -1
  64. package/package.json +85 -86
  65. package/dist/http.d.ts +0 -90
  66. package/dist/http.d.ts.map +0 -1
  67. package/dist/http.js +0 -1576
  68. package/dist/http.js.map +0 -1
package/dist/transform.js CHANGED
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
8
8
  import { z } from 'zod';
9
9
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
10
10
  import { config } from './config.js';
11
+ import { removeNoiseFromHtml } from './dom-noise-removal.js';
11
12
  import { FetchError, getErrorMessage } from './errors.js';
12
13
  import { isRawTextContentUrl } from './fetch.js';
14
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
15
+ import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
13
16
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
14
- import { isRecord } from './type-guards.js';
17
+ import { isObject } from './type-guards.js';
18
+ // Re-export language detection for backward compatibility
19
+ export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
20
+ // Re-export markdown cleanup for backward compatibility
21
+ export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
22
+ // Re-export DOM noise removal for backward compatibility
23
+ export { removeNoiseFromHtml } from './dom-noise-removal.js';
15
24
  function getAbortReason(signal) {
16
- if (!isRecord(signal))
25
+ if (!isObject(signal))
17
26
  return undefined;
18
27
  return 'reason' in signal ? signal.reason : undefined;
19
28
  }
20
- function getBodyInnerHtml(document) {
21
- if (!isRecord(document))
22
- return undefined;
23
- const { body } = document;
24
- if (!isRecord(body))
25
- return undefined;
26
- const { innerHTML } = body;
27
- return typeof innerHTML === 'string' && innerHTML.length > 0
28
- ? innerHTML
29
- : undefined;
30
- }
31
- function getDocumentToString(document) {
32
- if (!isRecord(document))
33
- return undefined;
34
- if (typeof document.toString !== 'function')
35
- return undefined;
36
- return document.toString.bind(document);
37
- }
38
- function getDocumentElementOuterHtml(document) {
39
- if (!isRecord(document))
40
- return undefined;
41
- const { documentElement } = document;
42
- if (!isRecord(documentElement))
43
- return undefined;
44
- const { outerHTML } = documentElement;
45
- return typeof outerHTML === 'string' && outerHTML.length > 0
46
- ? outerHTML
47
- : undefined;
48
- }
29
+ // DOM accessor helpers moved to ./dom-noise-removal.ts
49
30
  const CODE_BLOCK = {
50
31
  fence: '```',
51
32
  format: (code, language = '') => {
@@ -93,9 +74,13 @@ export function endTransformStage(context, options) {
93
74
  }
94
75
  function runTransformStage(url, stage, fn) {
95
76
  const context = startTransformStage(url, stage);
96
- const result = fn();
97
- endTransformStage(context);
98
- return result;
77
+ try {
78
+ return fn();
79
+ }
80
+ finally {
81
+ // Emit duration even if the stage throws; callers decide how to handle the error.
82
+ endTransformStage(context);
83
+ }
99
84
  }
100
85
  function isTimeoutReason(reason) {
101
86
  return reason instanceof Error && reason.name === 'TimeoutError';
@@ -129,46 +114,105 @@ function truncateHtml(html) {
129
114
  });
130
115
  return html.substring(0, maxSize);
131
116
  }
117
+ const META_PROPERTY_HANDLERS = new Map([
118
+ [
119
+ 'og:title',
120
+ (ctx, c) => {
121
+ ctx.title.og = c;
122
+ },
123
+ ],
124
+ [
125
+ 'og:description',
126
+ (ctx, c) => {
127
+ ctx.description.og = c;
128
+ },
129
+ ],
130
+ [
131
+ 'og:image',
132
+ (ctx, c) => {
133
+ ctx.image = c;
134
+ },
135
+ ],
136
+ [
137
+ 'article:published_time',
138
+ (ctx, c) => {
139
+ ctx.publishedAt = c;
140
+ },
141
+ ],
142
+ [
143
+ 'article:modified_time',
144
+ (ctx, c) => {
145
+ ctx.modifiedAt = c;
146
+ },
147
+ ],
148
+ ]);
149
+ const META_NAME_HANDLERS = new Map([
150
+ [
151
+ 'twitter:title',
152
+ (ctx, c) => {
153
+ ctx.title.twitter = c;
154
+ },
155
+ ],
156
+ [
157
+ 'twitter:description',
158
+ (ctx, c) => {
159
+ ctx.description.twitter = c;
160
+ },
161
+ ],
162
+ [
163
+ 'description',
164
+ (ctx, c) => {
165
+ ctx.description.standard = c;
166
+ },
167
+ ],
168
+ [
169
+ 'author',
170
+ (ctx, c) => {
171
+ ctx.author = c;
172
+ },
173
+ ],
174
+ ]);
132
175
  function extractMetadata(document) {
133
- const title = {};
134
- const description = {};
135
- let author;
176
+ const ctx = {
177
+ title: {},
178
+ description: {},
179
+ };
136
180
  for (const tag of document.querySelectorAll('meta')) {
137
181
  const content = tag.getAttribute('content')?.trim();
138
182
  if (!content)
139
183
  continue;
140
184
  const property = tag.getAttribute('property');
185
+ if (property) {
186
+ META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
187
+ }
141
188
  const name = tag.getAttribute('name');
142
- if (property === 'og:title')
143
- title.og = content;
144
- else if (property === 'og:description')
145
- description.og = content;
146
- else if (name === 'twitter:title')
147
- title.twitter = content;
148
- else if (name === 'twitter:description')
149
- description.twitter = content;
150
- else if (name === 'description')
151
- description.standard = content;
152
- else if (name === 'author')
153
- author = content;
189
+ if (name) {
190
+ META_NAME_HANDLERS.get(name)?.(ctx, content);
191
+ }
154
192
  }
155
193
  const titleEl = document.querySelector('title');
156
- if (!title.standard && titleEl?.textContent) {
157
- title.standard = titleEl.textContent.trim();
194
+ if (!ctx.title.standard && titleEl?.textContent) {
195
+ ctx.title.standard = titleEl.textContent.trim();
158
196
  }
159
- const resolvedTitle = title.og ?? title.twitter ?? title.standard;
160
- const resolvedDesc = description.og ?? description.twitter ?? description.standard;
197
+ const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
198
+ const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
161
199
  const metadata = {};
162
200
  if (resolvedTitle)
163
201
  metadata.title = resolvedTitle;
164
202
  if (resolvedDesc)
165
203
  metadata.description = resolvedDesc;
166
- if (author)
167
- metadata.author = author;
204
+ if (ctx.author)
205
+ metadata.author = ctx.author;
206
+ if (ctx.image)
207
+ metadata.image = ctx.image;
208
+ if (ctx.publishedAt)
209
+ metadata.publishedAt = ctx.publishedAt;
210
+ if (ctx.modifiedAt)
211
+ metadata.modifiedAt = ctx.modifiedAt;
168
212
  return metadata;
169
213
  }
170
214
  function isReadabilityCompatible(doc) {
171
- if (!isRecord(doc))
215
+ if (!isObject(doc))
172
216
  return false;
173
217
  return hasDocumentElement(doc) && hasQuerySelectors(doc);
174
218
  }
@@ -185,14 +229,18 @@ function extractArticle(document) {
185
229
  return null;
186
230
  }
187
231
  try {
188
- const documentClone = document.cloneNode(true);
189
- const rawText = documentClone.body.textContent ||
190
- documentClone.documentElement.textContent;
232
+ const doc = document;
233
+ const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
191
234
  const textLength = rawText.replace(/\s+/g, ' ').trim().length;
192
- if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
235
+ if (textLength < 100) {
236
+ logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
237
+ 'This might be a client-side rendered (SPA) application. ' +
238
+ 'Content extraction may be incomplete.', { textLength });
239
+ }
240
+ if (textLength >= 400 && !isProbablyReaderable(doc)) {
193
241
  return null;
194
242
  }
195
- const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
243
+ const reader = new Readability(doc, { maxElemsToParse: 20_000 });
196
244
  const parsed = reader.parse();
197
245
  if (!parsed)
198
246
  return null;
@@ -213,8 +261,13 @@ function extractArticle(document) {
213
261
  export function extractContent(html, url, options = {
214
262
  extractArticle: true,
215
263
  }) {
264
+ const result = extractContentWithDocument(html, url, options);
265
+ return { article: result.article, metadata: result.metadata };
266
+ }
267
+ function extractContentWithDocument(html, url, options) {
216
268
  if (!isValidInput(html, url)) {
217
- return { article: null, metadata: {} };
269
+ const { document } = parseHTML('<html></html>');
270
+ return { article: null, metadata: {}, document };
218
271
  }
219
272
  return tryExtractContent(html, url, options);
220
273
  }
@@ -229,11 +282,13 @@ function handleExtractionFailure(error, url, signal) {
229
282
  }
230
283
  throwIfAborted(signal, url, 'extract:error');
231
284
  logError('Failed to extract content', error instanceof Error ? error : undefined);
232
- return { article: null, metadata: {} };
285
+ const { document } = parseHTML('<html></html>');
286
+ return { article: null, metadata: {}, document };
233
287
  }
234
288
  function extractContentStages(html, url, options) {
235
289
  throwIfAborted(options.signal, url, 'extract:begin');
236
- const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
290
+ const truncatedHtml = truncateHtml(html);
291
+ const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
237
292
  throwIfAborted(options.signal, url, 'extract:parsed');
238
293
  applyBaseUri(document, url);
239
294
  const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
@@ -243,6 +298,8 @@ function extractContentStages(html, url, options) {
243
298
  return {
244
299
  article,
245
300
  metadata,
301
+ document,
302
+ ...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
246
303
  };
247
304
  }
248
305
  function tryExtractContent(html, url, options) {
@@ -279,522 +336,7 @@ function applyBaseUri(document, url) {
279
336
  });
280
337
  }
281
338
  }
282
- function containsJsxTag(code) {
283
- for (let index = 0; index < code.length - 1; index += 1) {
284
- if (code[index] !== '<')
285
- continue;
286
- const next = code[index + 1];
287
- if (!next)
288
- continue;
289
- if (next >= 'A' && next <= 'Z')
290
- return true;
291
- }
292
- return false;
293
- }
294
- function containsWord(source, word) {
295
- let startIndex = source.indexOf(word);
296
- while (startIndex !== -1) {
297
- const before = startIndex === 0 ? '' : source[startIndex - 1];
298
- const afterIndex = startIndex + word.length;
299
- const after = afterIndex >= source.length ? '' : source[afterIndex];
300
- if (!isWordChar(before) && !isWordChar(after))
301
- return true;
302
- startIndex = source.indexOf(word, startIndex + word.length);
303
- }
304
- return false;
305
- }
306
- function splitLines(content) {
307
- return content.split('\n');
308
- }
309
- function extractLanguageFromClassName(className) {
310
- const tokens = className.match(/\S+/g);
311
- if (!tokens)
312
- return undefined;
313
- for (const token of tokens) {
314
- const lower = token.toLowerCase();
315
- if (lower.startsWith('language-'))
316
- return token.slice('language-'.length);
317
- if (lower.startsWith('lang-'))
318
- return token.slice('lang-'.length);
319
- if (lower.startsWith('highlight-')) {
320
- return token.slice('highlight-'.length);
321
- }
322
- }
323
- if (tokens.includes('hljs')) {
324
- const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
325
- if (langClass)
326
- return langClass;
327
- }
328
- return undefined;
329
- }
330
- function resolveLanguageFromDataAttribute(dataLang) {
331
- const trimmed = dataLang.trim();
332
- if (!trimmed)
333
- return undefined;
334
- for (const char of trimmed) {
335
- if (!isWordChar(char))
336
- return undefined;
337
- }
338
- return trimmed;
339
- }
340
- function isWordChar(char) {
341
- if (!char)
342
- return false;
343
- const code = char.charCodeAt(0);
344
- return ((code >= 48 && code <= 57) ||
345
- (code >= 65 && code <= 90) ||
346
- (code >= 97 && code <= 122) ||
347
- char === '_');
348
- }
349
- const LANGUAGE_PATTERNS = [
350
- {
351
- language: 'jsx',
352
- pattern: {
353
- keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
354
- custom: (code) => containsJsxTag(code),
355
- },
356
- },
357
- {
358
- language: 'typescript',
359
- pattern: {
360
- wordBoundary: ['interface', 'type'],
361
- custom: (_, lower) => [
362
- ': string',
363
- ':string',
364
- ': number',
365
- ':number',
366
- ': boolean',
367
- ':boolean',
368
- ': void',
369
- ':void',
370
- ': any',
371
- ':any',
372
- ': unknown',
373
- ':unknown',
374
- ': never',
375
- ':never',
376
- ].some((hint) => lower.includes(hint)),
377
- },
378
- },
379
- {
380
- language: 'rust',
381
- pattern: {
382
- regex: /\b(?:fn|impl|struct|enum)\b/,
383
- keywords: ['let mut'],
384
- custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
385
- },
386
- },
387
- {
388
- language: 'javascript',
389
- pattern: {
390
- regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
391
- },
392
- },
393
- {
394
- language: 'python',
395
- pattern: {
396
- regex: /\b(?:def|class|import|from)\b/,
397
- keywords: ['print(', '__name__'],
398
- },
399
- },
400
- {
401
- language: 'bash',
402
- pattern: {
403
- custom: (code) => detectBashIndicators(code),
404
- },
405
- },
406
- {
407
- language: 'css',
408
- pattern: {
409
- regex: /@media|@import|@keyframes/,
410
- custom: (code) => detectCssStructure(code),
411
- },
412
- },
413
- {
414
- language: 'html',
415
- pattern: {
416
- keywords: [
417
- '<!doctype',
418
- '<html',
419
- '<head',
420
- '<body',
421
- '<div',
422
- '<span',
423
- '<p',
424
- '<a',
425
- '<script',
426
- '<style',
427
- ],
428
- },
429
- },
430
- {
431
- language: 'json',
432
- pattern: {
433
- startsWith: ['{', '['],
434
- },
435
- },
436
- {
437
- language: 'yaml',
438
- pattern: {
439
- custom: (code) => detectYamlStructure(code),
440
- },
441
- },
442
- {
443
- language: 'sql',
444
- pattern: {
445
- wordBoundary: [
446
- 'select',
447
- 'insert',
448
- 'update',
449
- 'delete',
450
- 'create',
451
- 'alter',
452
- 'drop',
453
- ],
454
- },
455
- },
456
- {
457
- language: 'go',
458
- pattern: {
459
- wordBoundary: ['package', 'func'],
460
- keywords: ['import "'],
461
- },
462
- },
463
- ];
464
- // Bash detection constants
465
- const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
466
- const BASH_PKG_MANAGERS = [
467
- 'npm',
468
- 'yarn',
469
- 'pnpm',
470
- 'npx',
471
- 'brew',
472
- 'apt',
473
- 'pip',
474
- 'cargo',
475
- 'go',
476
- ];
477
- const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
478
- function isShellPrefix(line) {
479
- return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
480
- }
481
- function matchesBashCommand(line) {
482
- return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
483
- }
484
- function matchesPackageManagerVerb(line) {
485
- for (const mgr of BASH_PKG_MANAGERS) {
486
- if (!line.startsWith(`${mgr} `))
487
- continue;
488
- const rest = line.slice(mgr.length + 1);
489
- if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
490
- return true;
491
- }
492
- }
493
- return false;
494
- }
495
- function detectBashIndicators(code) {
496
- for (const line of splitLines(code)) {
497
- const trimmed = line.trimStart();
498
- if (!trimmed)
499
- continue;
500
- if (isShellPrefix(trimmed) ||
501
- matchesBashCommand(trimmed) ||
502
- matchesPackageManagerVerb(trimmed)) {
503
- return true;
504
- }
505
- }
506
- return false;
507
- }
508
- function detectCssStructure(code) {
509
- for (const line of splitLines(code)) {
510
- const trimmed = line.trimStart();
511
- if (!trimmed)
512
- continue;
513
- const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
514
- trimmed.includes('{');
515
- const isProperty = trimmed.includes(':') && trimmed.includes(';');
516
- if (isSelector || isProperty)
517
- return true;
518
- }
519
- return false;
520
- }
521
- function detectYamlStructure(code) {
522
- for (const line of splitLines(code)) {
523
- const trimmed = line.trim();
524
- if (!trimmed)
525
- continue;
526
- const colonIdx = trimmed.indexOf(':');
527
- if (colonIdx <= 0)
528
- continue;
529
- const after = trimmed[colonIdx + 1];
530
- if (after === ' ' || after === '\t')
531
- return true;
532
- }
533
- return false;
534
- }
535
- function matchesLanguagePattern(code, lower, pattern) {
536
- if (pattern.keywords?.some((kw) => lower.includes(kw)))
537
- return true;
538
- if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
539
- return true;
540
- if (pattern.regex?.test(lower))
541
- return true;
542
- if (pattern.startsWith) {
543
- const trimmed = code.trimStart();
544
- if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
545
- return true;
546
- }
547
- if (pattern.custom?.(code, lower))
548
- return true;
549
- return false;
550
- }
551
- export function detectLanguageFromCode(code) {
552
- const lower = code.toLowerCase();
553
- for (const { language, pattern } of LANGUAGE_PATTERNS) {
554
- if (matchesLanguagePattern(code, lower, pattern))
555
- return language;
556
- }
557
- return undefined;
558
- }
559
- export function resolveLanguageFromAttributes(className, dataLang) {
560
- const classMatch = extractLanguageFromClassName(className);
561
- return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
562
- }
563
- function isElement(node) {
564
- return (isRecord(node) &&
565
- 'getAttribute' in node &&
566
- typeof node.getAttribute === 'function');
567
- }
568
- const STRUCTURAL_TAGS = new Set([
569
- 'script',
570
- 'style',
571
- 'noscript',
572
- 'iframe',
573
- 'form',
574
- 'button',
575
- 'input',
576
- 'select',
577
- 'textarea',
578
- 'svg',
579
- ]);
580
- const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
581
- const NAVIGATION_ROLES = new Set([
582
- 'navigation',
583
- 'banner',
584
- 'complementary',
585
- 'contentinfo',
586
- 'tree',
587
- 'menubar',
588
- 'menu',
589
- 'dialog',
590
- 'alertdialog',
591
- 'search',
592
- ]);
593
- const PROMO_TOKENS = new Set([
594
- 'banner',
595
- 'promo',
596
- 'announcement',
597
- 'cta',
598
- 'callout',
599
- 'advert',
600
- 'ad',
601
- 'ads',
602
- 'sponsor',
603
- 'newsletter',
604
- 'subscribe',
605
- 'cookie',
606
- 'consent',
607
- 'popup',
608
- 'modal',
609
- 'overlay',
610
- 'toast',
611
- 'share',
612
- 'social',
613
- 'related',
614
- 'recommend',
615
- 'comment',
616
- 'breadcrumb',
617
- 'pagination',
618
- 'pager',
619
- 'taglist',
620
- ]);
621
- const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
622
- const FIXED_PATTERN = /\b(fixed|sticky)\b/;
623
- const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
624
- const ISOLATE_PATTERN = /\bisolate\b/;
625
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
626
- const NOISE_MARKERS = [
627
- '<script',
628
- '<style',
629
- '<noscript',
630
- '<iframe',
631
- '<nav',
632
- '<footer',
633
- '<aside',
634
- '<header',
635
- '<form',
636
- '<button',
637
- '<input',
638
- '<select',
639
- '<textarea',
640
- '<svg',
641
- '<canvas',
642
- ' aria-hidden="true"',
643
- " aria-hidden='true'",
644
- ' hidden',
645
- ' role="navigation"',
646
- " role='navigation'",
647
- ' role="banner"',
648
- " role='banner'",
649
- ' role="complementary"',
650
- " role='complementary'",
651
- ' role="contentinfo"',
652
- " role='contentinfo'",
653
- ' role="tree"',
654
- " role='tree'",
655
- ' role="menubar"',
656
- " role='menubar'",
657
- ' role="menu"',
658
- " role='menu'",
659
- ' banner',
660
- ' promo',
661
- ' announcement',
662
- ' cta',
663
- ' callout',
664
- ' advert',
665
- ' newsletter',
666
- ' subscribe',
667
- ' cookie',
668
- ' consent',
669
- ' popup',
670
- ' modal',
671
- ' overlay',
672
- ' toast',
673
- ' fixed',
674
- ' sticky',
675
- ' z-50',
676
- ' z-4',
677
- ' isolate',
678
- ' breadcrumb',
679
- ' pagination',
680
- ];
681
- function mayContainNoise(html) {
682
- const haystack = html.toLowerCase();
683
- return NOISE_MARKERS.some((marker) => haystack.includes(marker));
684
- }
685
- function isFullDocumentHtml(html) {
686
- return HTML_DOCUMENT_MARKERS.test(html);
687
- }
688
- function isStructuralNoiseTag(tagName) {
689
- return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
690
- }
691
- function isElementHidden(element) {
692
- const style = element.getAttribute('style') ?? '';
693
- return (element.getAttribute('hidden') !== null ||
694
- element.getAttribute('aria-hidden') === 'true' ||
695
- /\bdisplay\s*:\s*none\b/i.test(style) ||
696
- /\bvisibility\s*:\s*hidden\b/i.test(style));
697
- }
698
- function hasNoiseRole(role) {
699
- return role !== null && NAVIGATION_ROLES.has(role);
700
- }
701
- function tokenizeIdentifierLikeText(value) {
702
- return value
703
- .toLowerCase()
704
- .replace(/[^a-z0-9]+/g, ' ')
705
- .trim()
706
- .split(' ')
707
- .filter(Boolean);
708
- }
709
- function matchesPromoIdOrClass(className, id) {
710
- const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
711
- return tokens.some((token) => PROMO_TOKENS.has(token));
712
- }
713
- function matchesFixedOrHighZIsolate(className) {
714
- return (FIXED_PATTERN.test(className) ||
715
- (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
716
- }
717
- function readElementMetadata(element) {
718
- return {
719
- tagName: element.tagName.toLowerCase(),
720
- className: element.getAttribute('class') ?? '',
721
- id: element.getAttribute('id') ?? '',
722
- role: element.getAttribute('role'),
723
- isHidden: isElementHidden(element),
724
- };
725
- }
726
- function isBoilerplateHeader({ className, id, role, }) {
727
- if (hasNoiseRole(role))
728
- return true;
729
- const combined = `${className} ${id}`.toLowerCase();
730
- return HEADER_NOISE_PATTERN.test(combined);
731
- }
732
- function isNoiseElement(node) {
733
- const metadata = readElementMetadata(node);
734
- return (isStructuralNoiseTag(metadata.tagName) ||
735
- ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
736
- (metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
737
- metadata.isHidden ||
738
- hasNoiseRole(metadata.role) ||
739
- matchesFixedOrHighZIsolate(metadata.className) ||
740
- matchesPromoIdOrClass(metadata.className, metadata.id));
741
- }
742
- function removeNoiseNodes(nodes) {
743
- for (let index = nodes.length - 1; index >= 0; index -= 1) {
744
- const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
745
- if (!node)
746
- continue;
747
- if (isElement(node) && isNoiseElement(node)) {
748
- node.remove();
749
- }
750
- }
751
- }
752
- function stripNoiseNodes(document) {
753
- // Use targeted selectors for common noise elements instead of querySelectorAll('*')
754
- const targetSelectors = [
755
- 'nav',
756
- 'footer',
757
- 'aside',
758
- 'header[class*="site"]',
759
- 'header[class*="nav"]',
760
- 'header[class*="menu"]',
761
- '[role="banner"]',
762
- '[role="navigation"]',
763
- '[role="dialog"]',
764
- '[style*="display: none"]',
765
- '[style*="display:none"]',
766
- '[hidden]',
767
- '[aria-hidden="true"]',
768
- ].join(',');
769
- const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
770
- // Remove in reverse order to handle nested elements correctly
771
- removeNoiseNodes(potentialNoiseNodes);
772
- // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
773
- const allElements = document.querySelectorAll('*');
774
- removeNoiseNodes(allElements);
775
- }
776
- function removeNoiseFromHtml(html) {
777
- const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
778
- if (!shouldParse)
779
- return html;
780
- try {
781
- const { document } = parseHTML(html);
782
- stripNoiseNodes(document);
783
- const bodyInnerHtml = getBodyInnerHtml(document);
784
- if (bodyInnerHtml)
785
- return bodyInnerHtml;
786
- const docToString = getDocumentToString(document);
787
- if (docToString)
788
- return docToString();
789
- const documentElementOuterHtml = getDocumentElementOuterHtml(document);
790
- if (documentElementOuterHtml)
791
- return documentElementOuterHtml;
792
- return html;
793
- }
794
- catch {
795
- return html;
796
- }
797
- }
339
+ // DOM noise removal functions moved to ./dom-noise-removal.ts
798
340
  function buildInlineCode(content) {
799
341
  const runs = content.match(/`+/g);
800
342
  let longest = '';
@@ -805,8 +347,11 @@ function buildInlineCode(content) {
805
347
  }
806
348
  }
807
349
  }
350
+ // Use a fence longer than any run of backticks in the content.
808
351
  const delimiter = `\`${longest}`;
809
- const padding = delimiter.length > 1 ? ' ' : '';
352
+ // Only pad when needed to avoid altering code spans unnecessarily.
353
+ // CommonMark recommends padding when the code starts/ends with a backtick.
354
+ const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
810
355
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
811
356
  }
812
357
  function deriveAltFromImageUrl(src) {
@@ -829,16 +374,13 @@ function deriveAltFromImageUrl(src) {
829
374
  }
830
375
  }
831
376
  function isCodeBlock(parent) {
832
- if (!isRecord(parent))
377
+ if (!isObject(parent))
833
378
  return false;
834
379
  const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
835
380
  return ['PRE', 'WRAPPED-PRE'].includes(tagName);
836
381
  }
837
382
  function hasGetAttribute(value) {
838
- return isRecord(value) && typeof value.getAttribute === 'function';
839
- }
840
- function hasCodeBlockTranslators(value) {
841
- return isRecord(value) && isRecord(value.codeBlockTranslators);
383
+ return isObject(value) && typeof value.getAttribute === 'function';
842
384
  }
843
385
  function buildInlineCodeTranslator() {
844
386
  return {
@@ -855,37 +397,19 @@ function resolveAttributeLanguage(node) {
855
397
  const dataLanguage = getAttribute?.('data-language') ?? '';
856
398
  return resolveLanguageFromAttributes(className, dataLanguage);
857
399
  }
858
- function resolveCodeBlockTranslators(visitor) {
859
- const childTranslators = isRecord(visitor) ? visitor.instance : null;
860
- return hasCodeBlockTranslators(childTranslators)
861
- ? childTranslators.codeBlockTranslators
862
- : null;
863
- }
864
- function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
865
- return {
866
- noEscape: true,
867
- preserveWhitespace: true,
868
- ...(codeBlockTranslators
869
- ? { childTranslators: codeBlockTranslators }
870
- : null),
871
- postprocess: ({ content }) => {
872
- const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
873
- return CODE_BLOCK.format(content, language);
874
- },
875
- };
876
- }
877
400
  function buildCodeTranslator(ctx) {
878
- if (!isRecord(ctx))
401
+ if (!isObject(ctx))
879
402
  return buildInlineCodeTranslator();
880
- const { node, parent, visitor } = ctx;
403
+ const { parent } = ctx;
881
404
  if (!isCodeBlock(parent))
882
405
  return buildInlineCodeTranslator();
883
- const attributeLanguage = resolveAttributeLanguage(node);
884
- const codeBlockTranslators = resolveCodeBlockTranslators(visitor);
885
- return buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators);
406
+ return {
407
+ noEscape: true,
408
+ preserveWhitespace: true,
409
+ };
886
410
  }
887
411
  function buildImageTranslator(ctx) {
888
- if (!isRecord(ctx))
412
+ if (!isObject(ctx))
889
413
  return { content: '' };
890
414
  const { node } = ctx;
891
415
  const getAttribute = hasGetAttribute(node)
@@ -898,19 +422,57 @@ function buildImageTranslator(ctx) {
898
422
  content: `![${alt}](${src})`,
899
423
  };
900
424
  }
425
+ function findLanguageFromCodeChild(node) {
426
+ if (!isObject(node))
427
+ return undefined;
428
+ const { childNodes } = node;
429
+ if (!Array.isArray(childNodes))
430
+ return undefined;
431
+ for (const child of childNodes) {
432
+ if (!isObject(child))
433
+ continue;
434
+ const tagName = typeof child.rawTagName === 'string'
435
+ ? child.rawTagName.toUpperCase()
436
+ : '';
437
+ if (tagName === 'CODE') {
438
+ return resolveAttributeLanguage(child);
439
+ }
440
+ }
441
+ return undefined;
442
+ }
443
+ function createCodeBlockPostprocessor(language) {
444
+ return ({ content }) => {
445
+ const trimmed = content.trim();
446
+ if (!trimmed)
447
+ return '';
448
+ const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
449
+ return CODE_BLOCK.format(trimmed, resolvedLanguage);
450
+ };
451
+ }
452
+ function buildPreTranslator(ctx) {
453
+ if (!isObject(ctx))
454
+ return {};
455
+ const { node } = ctx;
456
+ const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
457
+ return {
458
+ noEscape: true,
459
+ preserveWhitespace: true,
460
+ postprocess: createCodeBlockPostprocessor(attributeLanguage),
461
+ };
462
+ }
901
463
  function createCustomTranslators() {
902
464
  return {
903
465
  code: (ctx) => buildCodeTranslator(ctx),
904
466
  img: (ctx) => buildImageTranslator(ctx),
905
467
  dl: (ctx) => {
906
- if (!isRecord(ctx) || !isRecord(ctx.node)) {
468
+ if (!isObject(ctx) || !isObject(ctx.node)) {
907
469
  return { content: '' };
908
470
  }
909
471
  const node = ctx.node;
910
472
  const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
911
473
  const items = childNodes
912
474
  .map((child) => {
913
- if (!isRecord(child))
475
+ if (!isObject(child))
914
476
  return '';
915
477
  const nodeName = typeof child.nodeName === 'string'
916
478
  ? child.nodeName.toUpperCase()
@@ -940,6 +502,8 @@ function createCustomTranslators() {
940
502
  sup: () => ({
941
503
  postprocess: ({ content }) => `^${content}^`,
942
504
  }),
505
+ // Fix #6: Handle <pre> without <code> - wrap in fenced code block
506
+ pre: (ctx) => buildPreTranslator(ctx),
943
507
  };
944
508
  }
945
509
  let markdownInstance = null;
@@ -955,9 +519,11 @@ function getMarkdownConverter() {
955
519
  markdownInstance ??= createMarkdownInstance();
956
520
  return markdownInstance;
957
521
  }
958
- function translateHtmlToMarkdown(html, url, signal) {
522
+ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
959
523
  throwIfAborted(signal, url, 'markdown:begin');
960
- const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
524
+ const cleanedHtml = skipNoiseRemoval
525
+ ? html
526
+ : runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
961
527
  throwIfAborted(signal, url, 'markdown:cleaned');
962
528
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
963
529
  throwIfAborted(signal, url, 'markdown:translated');
@@ -973,151 +539,18 @@ export function htmlToMarkdown(html, metadata, options) {
973
539
  if (!html)
974
540
  return buildMetadataFooter(metadata, url);
975
541
  try {
976
- const content = translateHtmlToMarkdown(html, url, options?.signal);
542
+ const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
977
543
  return appendMetadataFooter(content, metadata, url);
978
544
  }
979
545
  catch (error) {
980
546
  if (error instanceof FetchError) {
981
547
  throw error;
982
548
  }
549
+ logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
983
550
  return buildMetadataFooter(metadata, url);
984
551
  }
985
552
  }
986
- function cleanupMarkdownArtifacts(content) {
987
- let result = content;
988
- const fixOrphanHeadings = (text) => {
989
- return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
990
- if (typeof prefix !== 'string' ||
991
- typeof hashes !== 'string' ||
992
- typeof heading !== 'string') {
993
- return match;
994
- }
995
- if (heading.length > 150) {
996
- return match;
997
- }
998
- const trimmedPrefix = prefix.trim();
999
- if (trimmedPrefix === '') {
1000
- return `${hashes} ${heading}\n\n`;
1001
- }
1002
- return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
1003
- });
1004
- };
1005
- result = fixOrphanHeadings(result);
1006
- result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
1007
- const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
1008
- result = result.replace(zeroWidthAnchorLink, '');
1009
- result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
1010
- result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
1011
- result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
1012
- result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
1013
- result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
1014
- result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
1015
- const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
1016
- const lines = result.split('\n');
1017
- const filtered = [];
1018
- let skipTocBlock = false;
1019
- for (let i = 0; i < lines.length; i += 1) {
1020
- const line = lines[i] ?? '';
1021
- const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
1022
- const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
1023
- if (tocLinkLine.test(line)) {
1024
- const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
1025
- const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
1026
- if (prevIsToc || nextIsToc) {
1027
- skipTocBlock = true;
1028
- continue;
1029
- }
1030
- }
1031
- else if (line.trim() === '' && skipTocBlock) {
1032
- skipTocBlock = false;
1033
- continue;
1034
- }
1035
- else {
1036
- skipTocBlock = false;
1037
- }
1038
- filtered.push(line);
1039
- }
1040
- result = filtered.join('\n');
1041
- result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
1042
- result = result.replace(/^Was this page helpful\??\s*$/gim, '');
1043
- result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
1044
- result = result.replace(/\\([[]])/g, '$1');
1045
- result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
1046
- result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
1047
- result = result.replace(/\n{3,}/g, '\n\n');
1048
- return result.trim();
1049
- }
1050
- const HEADING_KEYWORDS = new Set([
1051
- 'overview',
1052
- 'introduction',
1053
- 'summary',
1054
- 'conclusion',
1055
- 'prerequisites',
1056
- 'requirements',
1057
- 'installation',
1058
- 'configuration',
1059
- 'usage',
1060
- 'features',
1061
- 'limitations',
1062
- 'troubleshooting',
1063
- 'faq',
1064
- 'resources',
1065
- 'references',
1066
- 'changelog',
1067
- 'license',
1068
- 'acknowledgments',
1069
- 'appendix',
1070
- ]);
1071
- function isLikelyHeadingLine(line) {
1072
- const trimmed = line.trim();
1073
- if (!trimmed || trimmed.length > 80)
1074
- return false;
1075
- if (/^#{1,6}\s/.test(trimmed))
1076
- return false;
1077
- if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
1078
- return false;
1079
- if (/[.!?]$/.test(trimmed))
1080
- return false;
1081
- if (/^\[.*\]\(.*\)$/.test(trimmed))
1082
- return false;
1083
- if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
1084
- return true;
1085
- }
1086
- const words = trimmed.split(/\s+/);
1087
- if (words.length >= 2 && words.length <= 6) {
1088
- const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
1089
- if (isTitleCase)
1090
- return true;
1091
- }
1092
- if (words.length === 1) {
1093
- const lower = trimmed.toLowerCase();
1094
- if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
1095
- return true;
1096
- }
1097
- }
1098
- return false;
1099
- }
1100
- function promoteOrphanHeadings(markdown) {
1101
- const lines = markdown.split('\n');
1102
- const result = [];
1103
- for (let i = 0; i < lines.length; i += 1) {
1104
- const line = lines[i] ?? '';
1105
- const prevLine = i > 0 ? lines[i - 1] : '';
1106
- const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
1107
- const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
1108
- const isPrecededByBlank = prevLine?.trim() === '';
1109
- if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
1110
- const trimmed = line.trim();
1111
- const isExample = /^example:\s/i.test(trimmed);
1112
- const prefix = isExample ? '### ' : '## ';
1113
- result.push(prefix + trimmed);
1114
- }
1115
- else {
1116
- result.push(line);
1117
- }
1118
- }
1119
- return result.join('\n');
1120
- }
553
+ // Markdown cleanup functions moved to ./markdown-cleanup.ts
1121
554
  function formatFetchedDate(isoString) {
1122
555
  try {
1123
556
  const date = new Date(isoString);
@@ -1366,54 +799,114 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
1366
799
  const MIN_CONTENT_RATIO = 0.3;
1367
800
  const MIN_HTML_LENGTH_FOR_GATE = 100;
1368
801
  const MIN_HEADING_RETENTION_RATIO = 0.7;
1369
- function countHeadings(html) {
1370
- if (!html)
1371
- return 0;
1372
- // Match opening heading tags <h1> through <h6>
1373
- const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
1374
- const matches = html.match(headingPattern);
1375
- return matches ? matches.length : 0;
1376
- }
1377
- function isHeadingStructurePreserved(article, originalHtml) {
1378
- if (!article)
1379
- return false;
1380
- // Cache heading counts to avoid duplicate regex matching
1381
- const originalHeadingCount = countHeadings(originalHtml);
1382
- const articleHeadingCount = countHeadings(article.content);
1383
- // If original has no headings, structure is trivially preserved
1384
- if (originalHeadingCount === 0)
1385
- return true;
1386
- // If article lost >50% of headings, structure is broken
1387
- const retentionRatio = articleHeadingCount / originalHeadingCount;
1388
- return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
1389
- }
1390
- function stripHtmlTagsForLength(html) {
1391
- let result = '';
1392
- let inTag = false;
1393
- for (const char of html) {
1394
- if (char === '<') {
1395
- inTag = true;
802
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
// @param htmlOrDocument - HTML string, or an already-parsed object exposing querySelectorAll.
// @returns number of h1-h6 elements found.
// String input is parsed with parseHTML on every call; a caller that already holds a
// parsed document can pass it instead to avoid the re-parse.
803
+ /**
804
+ * Count headings using DOM querySelectorAll.
805
+ * Handles nested content like <h2><span>Text</span></h2> correctly.
806
+ */
807
+ function countHeadingsDom(htmlOrDocument) {
808
+ if (typeof htmlOrDocument === 'string') {
809
+ // Wrap fragments in document structure for proper parsing
810
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
811
+ ? wrapHtmlFragment(htmlOrDocument)
812
+ : htmlOrDocument;
813
+ const { document: doc } = parseHTML(htmlToParse);
814
+ return doc.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
815
+ }
// Non-string input is queried directly and is not modified.
816
+ return htmlOrDocument.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
817
+ }
// Counts `pre` elements; this is what the code-block retention gate in
// shouldUseArticleContent() compares between the original page and the article.
// Inline `code` elements that are not inside a `pre` are not counted.
// @param htmlOrDocument - HTML string, or an already-parsed object exposing querySelectorAll.
// @returns number of `pre` elements found.
818
+ function countCodeBlocksDom(htmlOrDocument) {
819
+ if (typeof htmlOrDocument === 'string') {
820
+ // Wrap fragments in document structure for proper parsing
821
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
822
+ ? wrapHtmlFragment(htmlOrDocument)
823
+ : htmlOrDocument;
824
+ const { document: doc } = parseHTML(htmlToParse);
825
+ return doc.querySelectorAll('pre').length;
826
+ }
// Non-string input is queried directly and is not modified.
827
+ return htmlOrDocument.querySelectorAll('pre').length;
828
+ }
// Pure string check: compares the trimmed, lower-cased input against three prefixes.
// Input that starts with anything else (for example a leading comment or `<head>`) is
// reported as a fragment, so the counting helpers in this file will wrap it.
// @param html - HTML source text.
// @returns true when the text does not start with `<!doctype`, `<html` or `<body`.
829
+ /**
830
+ * Check if HTML string needs document wrapper for proper parsing.
831
+ * Fragments without doctype/html/body tags need wrapping.
832
+ */
833
+ function needsDocumentWrapper(html) {
834
+ const trimmed = html.trim().toLowerCase();
835
+ return (!trimmed.startsWith('<!doctype') &&
836
+ !trimmed.startsWith('<html') &&
837
+ !trimmed.startsWith('<body'));
838
+ }
// @param html - fragment markup; interpolated verbatim (no escaping or validation).
// @returns the fragment placed inside a doctype + <html><body> … </body></html> shell.
839
+ /**
840
+ * Wrap HTML fragment in minimal document structure for proper parsing.
841
+ */
842
+ function wrapHtmlFragment(html) {
843
+ return `<!DOCTYPE html><html><body>${html}</body></html>`;
844
+ }
// @param htmlOrDocument - HTML string, or a parsed document object (the non-string branch
//        calls cloneNode, querySelectorAll and reads body / documentElement).
// @returns length of the text after collapsing every whitespace run to one space and trimming.
// The string branch removes nodes only from its own freshly parsed document; the document
// branch works on a deep clone so the caller's document keeps its script/style/noscript nodes.
845
+ /**
846
+ * Get visible text length from HTML, excluding script/style/noscript content.
847
+ * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
848
+ */
849
+ function getVisibleTextLength(htmlOrDocument) {
850
+ // For string input, parse the HTML
851
+ if (typeof htmlOrDocument === 'string') {
852
+ // Wrap fragments in document structure for proper parsing
853
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
854
+ ? wrapHtmlFragment(htmlOrDocument)
855
+ : htmlOrDocument;
856
+ const { document: doc } = parseHTML(htmlToParse);
857
+ // Remove non-visible content that inflates text length
858
+ for (const el of doc.querySelectorAll('script,style,noscript')) {
859
+ el.remove();
1396
860
  }
// (diff view) The "-" rows that follow are the tail of the removed stripHtmlTagsForLength()
// and the old isExtractionSufficient() header; they are not part of this function.
1397
- else if (char === '>') {
1398
- inTag = false;
1399
- }
1400
- else if (!inTag) {
1401
- result += char;
1402
- }
1403
- }
1404
- return result;
1405
- }
1406
- export function isExtractionSufficient(article, originalHtml) {
861
+ // Get text content from body or documentElement
862
+ // Note: linkedom may return null for body on HTML fragments despite types
863
+ const body = doc.body;
864
+ const docElement = doc.documentElement;
// Falls back to '' when neither body nor documentElement provides text.
865
+ const text = body?.textContent ?? docElement?.textContent ?? '';
866
+ return text.replace(/\s+/g, ' ').trim().length;
867
+ }
868
+ // For Document input, clone to avoid mutation
869
+ const workDoc = htmlOrDocument.cloneNode(true);
870
+ // Remove non-visible content that inflates text length
871
+ for (const el of workDoc.querySelectorAll('script,style,noscript')) {
872
+ el.remove();
873
+ }
874
+ // Get text content from body or documentElement
875
+ // Note: linkedom may return null for body on HTML fragments despite types
876
+ const body = workDoc.body;
877
+ const docElement = workDoc.documentElement;
878
+ const text = body?.textContent ?? docElement?.textContent ?? '';
879
+ return text.replace(/\s+/g, ' ').trim().length;
880
+ }
// Length-based quality gate for an extracted article.
// @param article - extraction result (reads `article.textContent`); a falsy value means
//        no article was extracted.
// @param originalHtmlOrDocument - original page, as an HTML string or a parsed document.
// @returns false when there is no article; true when the original has fewer than
//          MIN_HTML_LENGTH_FOR_GATE visible characters (the gate is skipped); otherwise
//          whether articleLength / originalLength is at least MIN_CONTENT_RATIO.
881
+ export function isExtractionSufficient(article, originalHtmlOrDocument) {
1407
882
  if (!article)
1408
883
  return false;
1409
884
  const articleLength = article.textContent.length;
1410
- const originalLength = stripHtmlTagsForLength(originalHtml)
1411
- .replace(/\s+/g, ' ')
1412
- .trim().length;
885
+ // Use DOM-based visible text length to exclude script/style content
886
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
// The early return below also keeps the division from running with originalLength === 0.
1413
887
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
1414
888
  return true;
1415
889
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
1416
890
  }
// Heuristic thresholds for hasTruncatedSentences():
// - only lines whose trimmed length exceeds MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK are sampled;
// - the text is flagged when strictly more than MAX_TRUNCATED_LINE_RATIO of the sampled
//   lines look incomplete.
891
+ const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
892
+ const MAX_TRUNCATED_LINE_RATIO = 0.5;
893
+ /**
894
+ * Detect if extracted text has many truncated/incomplete sentences.
895
+ * Lines longer than 20 chars that don't end with sentence punctuation
896
+ * are considered potentially truncated.
897
+ */
// @param text - plain text; split on '\n' only (a trailing '\r' is removed by trim()).
// @returns true when the incomplete share of sampled lines exceeds the ratio above.
898
+ function hasTruncatedSentences(text) {
899
+ const lines = text
900
+ .split('\n')
901
+ .filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
// With fewer than three sampled lines the text is never flagged.
902
+ if (lines.length < 3)
903
+ return false;
904
+ const incompleteLines = lines.filter((line) => {
905
+ const trimmed = line.trim();
// A line counts as complete only if its last character is one of . ! ? : ;
906
+ return !/[.!?:;]$/.test(trimmed);
907
+ });
908
+ return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
909
+ }
1417
910
  export function determineContentExtractionSource(article) {
1418
911
  return article !== null;
1419
912
  }
@@ -1443,12 +936,83 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
1443
936
  }
1444
937
  return metadata;
1445
938
  }
1446
- function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
// The selector list below is consumed by findContentRoot(), which tries the entries in
// array order and stops at the first one that yields enough markup.
939
+ /**
940
+ * Content root selectors in priority order.
941
+ * These identify the main content area on a page.
942
+ */
943
+ const CONTENT_ROOT_SELECTORS = [
944
+ 'main',
945
+ 'article',
946
+ '[role="main"]',
947
+ '#content',
948
+ '#main-content',
949
+ '.content',
950
+ '.main-content',
951
+ '.post-content',
952
+ '.article-content',
953
+ '.entry-content',
954
+ '[itemprop="articleBody"]',
955
+ '[data-content]',
956
+ '.post-body',
957
+ '.article-body',
958
+ ];
959
+ /**
960
+ * Find the main content root element in a document.
961
+ * Returns the innerHTML if found, undefined otherwise.
962
+ */
// @param document - object exposing querySelector (a parsed document).
// @returns the innerHTML string of the first acceptable match, or undefined.
// For each selector only the single element returned by querySelector is examined.
963
+ function findContentRoot(document) {
964
+ for (const selector of CONTENT_ROOT_SELECTORS) {
965
+ const element = document.querySelector(selector);
966
+ if (!element)
967
+ continue;
968
+ // Check if element has meaningful content
969
+ const innerHTML = typeof element.innerHTML === 'string'
970
+ ? element.innerHTML
971
+ : undefined;
// A match with 100 or fewer characters of trimmed markup is skipped and the next
// selector in the list is tried.
972
+ if (innerHTML && innerHTML.trim().length > 100) {
973
+ return innerHTML;
974
+ }
975
+ }
976
+ return undefined;
977
+ }
// Chooses the HTML that will be converted to markdown and returns it together with the
// title and metadata block. Three outcomes, in order:
//   1. `useArticleContent` and `article` are both truthy: article.content / article.title.
//   2. `document` is truthy and a content root is found in a freshly cleaned and re-parsed
//      copy of `html`: that root's innerHTML, with `skipNoiseRemoval: true`.
//   3. otherwise: the full `html`, plus `document` when one was supplied.
// In outcome 2, `document` only acts as a switch; the lookup runs on `cleanedDoc`, not on
// the passed document, and the returned object carries no `document` field.
978
+ function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
1447
979
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
980
+ // If using article content, return it directly
981
+ if (useArticleContent && article) {
982
+ return {
983
+ sourceHtml: article.content,
984
+ title: article.title,
985
+ metadata,
986
+ };
987
+ }
988
+ // Try content root fallback before using full HTML
989
+ if (document) {
990
+ // Apply noise removal to HTML first (without passing document) to get cleaned HTML,
991
+ // then parse and find content root. This prevents the aggressive DOM stripping that
992
+ // happens when noise removal is given the original parsed document.
993
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
994
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
995
+ const contentRoot = findContentRoot(cleanedDoc);
996
+ if (contentRoot) {
// Only the first 80 characters of the URL are logged.
997
+ logDebug('Using content root fallback instead of full HTML', {
998
+ url: url.substring(0, 80),
999
+ contentLength: contentRoot.length,
1000
+ });
1001
+ return {
1002
+ sourceHtml: contentRoot,
1003
+ title: extractedMeta.title,
1004
+ metadata,
1005
+ // Skip noise removal - this HTML is already from a cleaned document
1006
+ skipNoiseRemoval: true,
1007
+ };
1008
+ }
1009
+ }
1010
+ // Fall back to full HTML
1448
1011
  return {
1449
- sourceHtml: useArticleContent && article ? article.content : html,
1450
- title: useArticleContent && article ? article.title : extractedMeta.title,
1012
+ sourceHtml: html,
1013
+ title: extractedMeta.title,
1451
1014
  metadata,
// The `document` key is added only when a document was supplied, so it is never
// present with an undefined value.
1015
+ ...(document ? { document } : {}),
1452
1016
  };
1453
1017
  }
1454
1018
  function logQualityGateFallback({ url, articleLength, }) {
@@ -1457,33 +1021,66 @@ function logQualityGateFallback({ url, articleLength, }) {
1457
1021
  articleLength,
1458
1022
  });
1459
1023
  }
// Decides whether the extracted article is good enough to replace the full page.
// (diff view) "-" rows belong to the previous implementation; "+" and context rows form
// the current one, which applies four gates in order and returns false at the first failure:
//   1. length: article text / visible original text must reach MIN_CONTENT_RATIO
//      (skipped when the original has fewer than MIN_HTML_LENGTH_FOR_GATE visible chars);
//   2. headings: article must keep at least MIN_HEADING_RETENTION_RATIO of the h1-h6 count
//      (skipped when the original has no headings);
//   3. code blocks: article must keep at least MIN_CODE_BLOCK_RETENTION_RATIO of the
//      `pre` count (skipped when the original has none);
//   4. truncation: hasTruncatedSentences(article.textContent) must be false.
// @param article - must be non-null; `textContent` and `content` are read unguarded
//        (the visible caller only invokes this when an article exists).
// @param originalHtmlOrDocument - original page as an HTML string or a parsed document.
// @param url - used for logging only; truncated to 80 characters in debug logs.
// @returns true when every applicable gate passes.
1460
- function shouldUseArticleContent(article, html, url) {
1461
- // Check content sufficiency (length-based quality gate)
1462
- if (!isExtractionSufficient(article, html)) {
1463
- logQualityGateFallback({
1464
- url,
1465
- articleLength: article.textContent.length,
1466
- });
1467
- return false;
1024
+ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1025
+ const articleLength = article.textContent.length;
1026
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
1027
+ // If the document is tiny, don't gate too aggressively.
1028
+ if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
1029
+ const ratio = articleLength / originalLength;
1030
+ if (ratio < MIN_CONTENT_RATIO) {
1031
+ logQualityGateFallback({ url, articleLength });
1032
+ return false;
1033
+ }
1468
1034
  }
1469
- // Check heading structure preservation
1470
- if (!isHeadingStructurePreserved(article, html)) {
1471
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1035
+ // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
1036
+ const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
1037
+ if (originalHeadings > 0) {
// article.content is a string here, so countHeadingsDom parses it on this call.
1038
+ const articleHeadings = countHeadingsDom(article.content);
1039
+ const retentionRatio = articleHeadings / originalHeadings;
1040
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
1041
+ logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1042
+ url: url.substring(0, 80),
1043
+ originalHeadings,
1044
+ articleHeadings,
1045
+ });
1046
+ return false;
1047
+ }
1048
+ }
1049
+ const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
1050
+ if (originalCodeBlocks > 0) {
1051
+ const articleCodeBlocks = countCodeBlocksDom(article.content);
1052
+ const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
1053
+ // Always log code block counts for debugging
1054
+ logDebug('Code block retention check', {
1472
1055
  url: url.substring(0, 80),
1473
- originalHeadings: countHeadings(html),
1474
- articleHeadings: countHeadings(article.content),
1056
+ originalCodeBlocks,
1057
+ articleCodeBlocks,
1058
+ codeRetentionRatio,
1475
1059
  });
1060
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
1061
+ logDebug('Quality gate: Readability removed code blocks, using full HTML', {
1062
+ url: url.substring(0, 80),
1063
+ originalCodeBlocks,
1064
+ articleCodeBlocks,
1065
+ });
1066
+ return false;
1067
+ }
1068
+ }
1069
+ // Layout extraction issue: truncated/fragmented lines.
1070
+ if (hasTruncatedSentences(article.textContent)) {
1071
+ logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
1476
1072
  return false;
1477
1073
  }
1478
1074
  return true;
1479
1075
  }
1480
1076
  function resolveContentSource({ html, url, includeMetadata, signal, }) {
1481
- const { article, metadata: extractedMeta } = extractContent(html, url, {
1077
+ const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
1482
1078
  extractArticle: true,
1483
1079
  ...(signal ? { signal } : {}),
1484
1080
  });
1081
+ const originalDocument = parseHTML(html).document;
1485
1082
  const useArticleContent = article
1486
- ? shouldUseArticleContent(article, html, url)
1083
+ ? shouldUseArticleContent(article, originalDocument, url)
1487
1084
  : false;
1488
1085
  return buildContentSource({
1489
1086
  html,
@@ -1492,6 +1089,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1492
1089
  extractedMeta,
1493
1090
  includeMetadata,
1494
1091
  useArticleContent,
1092
+ document,
1495
1093
  });
1496
1094
  }
1497
1095
  function tryTransformRawStage(html, url, includeMetadata) {
@@ -1513,6 +1111,8 @@ function buildMarkdownFromContext(context, url, signal) {
1513
1111
  const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
1514
1112
  url,
1515
1113
  ...(signal ? { signal } : {}),
1114
+ ...(context.document ? { document: context.document } : {}),
1115
+ ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1516
1116
  }));
1517
1117
  return {
1518
1118
  markdown: content,
@@ -1606,6 +1206,12 @@ class WorkerPool {
1606
1206
  timeoutMs;
1607
1207
  queueMax;
1608
1208
  closed = false;
// Single place where the pool builds its cancellation error, replacing the inline
// `new FetchError(...)` copies shown as removed rows elsewhere in this diff.
// @param url - URL of the task being cancelled.
// @param stage - label identifying where the cancellation was noticed
//        (e.g. 'transform:dispatch', 'transform:queued-abort').
// @returns a FetchError with message 'Request was canceled', code 499 and details
//          { reason: 'aborted', stage }.
// NOTE(review): 499 is presumably used as a "client closed request" status - confirm
// against the FetchError definition.
1209
+ createAbortError(url, stage) {
1210
+ return new FetchError('Request was canceled', url, 499, {
1211
+ reason: 'aborted',
1212
+ stage,
1213
+ });
1214
+ }
1609
1215
  ensureOpen() {
1610
1216
  if (this.closed) {
1611
1217
  throw new Error('Transform worker pool closed');
@@ -1614,10 +1220,7 @@ class WorkerPool {
// Guard that throws when the caller's AbortSignal has already fired.
// Does nothing when `signal` is missing or not aborted; otherwise throws the error
// produced by this.createAbortError(url, stage).
// (diff view) The "-" rows are the previous inline FetchError construction.
1614
1220
  ensureNotAborted(signal, url, stage) {
1615
1221
  if (!signal?.aborted)
1616
1222
  return;
1617
- throw new FetchError('Request was canceled', url, 499, {
1618
- reason: 'aborted',
1619
- stage,
1620
- });
1223
+ throw this.createAbortError(url, stage);
1621
1224
  }
1622
1225
  ensureQueueCapacity(url) {
1623
1226
  if (this.queue.length < this.queueMax)
@@ -1682,10 +1285,7 @@ class WorkerPool {
1682
1285
  abortInflightTask(id, url, workerIndex) {
1683
1286
  const slot = this.workers[workerIndex];
1684
1287
  this.cancelWorkerTask(slot, id);
1685
- this.failTask(id, new FetchError('Request was canceled', url, 499, {
1686
- reason: 'aborted',
1687
- stage: 'transform:signal-abort',
1688
- }));
1288
+ this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
1689
1289
  if (slot) {
1690
1290
  this.restartWorker(workerIndex, slot);
1691
1291
  }
@@ -1695,10 +1295,7 @@ class WorkerPool {
1695
1295
  if (queuedIndex === -1)
1696
1296
  return;
1697
1297
  this.queue.splice(queuedIndex, 1);
1698
- reject(new FetchError('Request was canceled', url, 499, {
1699
- reason: 'aborted',
1700
- stage: 'transform:queued-abort',
1701
- }));
1298
+ reject(this.createAbortError(url, 'transform:queued-abort'));
1702
1299
  }
1703
1300
  createWorkerSlot(worker) {
1704
1301
  return {
@@ -1854,10 +1451,7 @@ class WorkerPool {
1854
1451
  if (!task.signal?.aborted)
1855
1452
  return false;
1856
1453
  this.clearAbortListener(task.signal, task.abortListener);
1857
- task.reject(new FetchError('Request was canceled', task.url, 499, {
1858
- reason: 'aborted',
1859
- stage: 'transform:dispatch',
1860
- }));
1454
+ task.reject(this.createAbortError(task.url, 'transform:dispatch'));
1861
1455
  return true;
1862
1456
  }
1863
1457
  markSlotBusy(slot, task) {