@j0hanz/superfetch 2.2.1 → 2.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +243 -494
  2. package/dist/cache.d.ts +2 -3
  3. package/dist/cache.js +51 -241
  4. package/dist/config.d.ts +6 -1
  5. package/dist/config.js +29 -34
  6. package/dist/crypto.d.ts +0 -1
  7. package/dist/crypto.js +0 -1
  8. package/dist/dom-noise-removal.d.ts +5 -0
  9. package/dist/dom-noise-removal.js +485 -0
  10. package/dist/errors.d.ts +0 -1
  11. package/dist/errors.js +8 -6
  12. package/dist/fetch.d.ts +0 -1
  13. package/dist/fetch.js +71 -61
  14. package/dist/host-normalization.d.ts +1 -0
  15. package/dist/host-normalization.js +47 -0
  16. package/dist/http-native.d.ts +5 -0
  17. package/dist/http-native.js +693 -0
  18. package/dist/index.d.ts +0 -1
  19. package/dist/index.js +1 -2
  20. package/dist/instructions.md +22 -20
  21. package/dist/json.d.ts +1 -0
  22. package/dist/json.js +29 -0
  23. package/dist/language-detection.d.ts +12 -0
  24. package/dist/language-detection.js +291 -0
  25. package/dist/markdown-cleanup.d.ts +18 -0
  26. package/dist/markdown-cleanup.js +283 -0
  27. package/dist/mcp-validator.d.ts +14 -0
  28. package/dist/mcp-validator.js +22 -0
  29. package/dist/mcp.d.ts +0 -1
  30. package/dist/mcp.js +0 -1
  31. package/dist/observability.d.ts +1 -1
  32. package/dist/observability.js +15 -3
  33. package/dist/server-tuning.d.ts +9 -0
  34. package/dist/server-tuning.js +30 -0
  35. package/dist/session.d.ts +36 -0
  36. package/dist/session.js +159 -0
  37. package/dist/tools.d.ts +0 -1
  38. package/dist/tools.js +23 -33
  39. package/dist/transform-types.d.ts +80 -0
  40. package/dist/transform-types.js +5 -0
  41. package/dist/transform.d.ts +7 -53
  42. package/dist/transform.js +434 -856
  43. package/dist/type-guards.d.ts +1 -2
  44. package/dist/type-guards.js +1 -2
  45. package/dist/workers/transform-worker.d.ts +0 -1
  46. package/dist/workers/transform-worker.js +52 -43
  47. package/package.json +11 -12
  48. package/dist/cache.d.ts.map +0 -1
  49. package/dist/cache.js.map +0 -1
  50. package/dist/config.d.ts.map +0 -1
  51. package/dist/config.js.map +0 -1
  52. package/dist/crypto.d.ts.map +0 -1
  53. package/dist/crypto.js.map +0 -1
  54. package/dist/errors.d.ts.map +0 -1
  55. package/dist/errors.js.map +0 -1
  56. package/dist/fetch.d.ts.map +0 -1
  57. package/dist/fetch.js.map +0 -1
  58. package/dist/http.d.ts +0 -90
  59. package/dist/http.d.ts.map +0 -1
  60. package/dist/http.js +0 -1576
  61. package/dist/http.js.map +0 -1
  62. package/dist/index.d.ts.map +0 -1
  63. package/dist/index.js.map +0 -1
  64. package/dist/mcp.d.ts.map +0 -1
  65. package/dist/mcp.js.map +0 -1
  66. package/dist/observability.d.ts.map +0 -1
  67. package/dist/observability.js.map +0 -1
  68. package/dist/tools.d.ts.map +0 -1
  69. package/dist/tools.js.map +0 -1
  70. package/dist/transform.d.ts.map +0 -1
  71. package/dist/transform.js.map +0 -1
  72. package/dist/type-guards.d.ts.map +0 -1
  73. package/dist/type-guards.js.map +0 -1
  74. package/dist/workers/transform-worker.d.ts.map +0 -1
  75. package/dist/workers/transform-worker.js.map +0 -1
package/dist/transform.js CHANGED
@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
8
8
  import { z } from 'zod';
9
9
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
10
10
  import { config } from './config.js';
11
+ import { removeNoiseFromHtml } from './dom-noise-removal.js';
11
12
  import { FetchError, getErrorMessage } from './errors.js';
12
13
  import { isRawTextContentUrl } from './fetch.js';
14
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
15
+ import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
13
16
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
14
- import { isRecord } from './type-guards.js';
17
+ import { isObject } from './type-guards.js';
18
+ // Re-export language detection for backward compatibility
19
+ export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
20
+ // Re-export markdown cleanup for backward compatibility
21
+ export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
22
+ // Re-export DOM noise removal for backward compatibility
23
+ export { removeNoiseFromHtml } from './dom-noise-removal.js';
15
24
  function getAbortReason(signal) {
16
- if (!isRecord(signal))
25
+ if (!isObject(signal))
17
26
  return undefined;
18
27
  return 'reason' in signal ? signal.reason : undefined;
19
28
  }
20
- function getBodyInnerHtml(document) {
21
- if (!isRecord(document))
22
- return undefined;
23
- const { body } = document;
24
- if (!isRecord(body))
25
- return undefined;
26
- const { innerHTML } = body;
27
- return typeof innerHTML === 'string' && innerHTML.length > 0
28
- ? innerHTML
29
- : undefined;
30
- }
31
- function getDocumentToString(document) {
32
- if (!isRecord(document))
33
- return undefined;
34
- if (typeof document.toString !== 'function')
35
- return undefined;
36
- return document.toString.bind(document);
37
- }
38
- function getDocumentElementOuterHtml(document) {
39
- if (!isRecord(document))
40
- return undefined;
41
- const { documentElement } = document;
42
- if (!isRecord(documentElement))
43
- return undefined;
44
- const { outerHTML } = documentElement;
45
- return typeof outerHTML === 'string' && outerHTML.length > 0
46
- ? outerHTML
47
- : undefined;
48
- }
29
+ // DOM accessor helpers moved to ./dom-noise-removal.ts
49
30
  const CODE_BLOCK = {
50
31
  fence: '```',
51
32
  format: (code, language = '') => {
@@ -53,6 +34,10 @@ const CODE_BLOCK = {
53
34
  },
54
35
  };
55
36
  const transformChannel = diagnosticsChannel.channel('superfetch.transform');
37
+ const LOG_URL_MAX = 80;
38
+ function truncateUrlForLog(url) {
39
+ return url.substring(0, LOG_URL_MAX);
40
+ }
56
41
  function publishTransformEvent(event) {
57
42
  if (!transformChannel.hasSubscribers)
58
43
  return;
@@ -93,9 +78,13 @@ export function endTransformStage(context, options) {
93
78
  }
94
79
  function runTransformStage(url, stage, fn) {
95
80
  const context = startTransformStage(url, stage);
96
- const result = fn();
97
- endTransformStage(context);
98
- return result;
81
+ try {
82
+ return fn();
83
+ }
84
+ finally {
85
+ // Emit duration even if the stage throws; callers decide how to handle the error.
86
+ endTransformStage(context);
87
+ }
99
88
  }
100
89
  function isTimeoutReason(reason) {
101
90
  return reason instanceof Error && reason.name === 'TimeoutError';
@@ -129,46 +118,105 @@ function truncateHtml(html) {
129
118
  });
130
119
  return html.substring(0, maxSize);
131
120
  }
121
+ const META_PROPERTY_HANDLERS = new Map([
122
+ [
123
+ 'og:title',
124
+ (ctx, c) => {
125
+ ctx.title.og = c;
126
+ },
127
+ ],
128
+ [
129
+ 'og:description',
130
+ (ctx, c) => {
131
+ ctx.description.og = c;
132
+ },
133
+ ],
134
+ [
135
+ 'og:image',
136
+ (ctx, c) => {
137
+ ctx.image = c;
138
+ },
139
+ ],
140
+ [
141
+ 'article:published_time',
142
+ (ctx, c) => {
143
+ ctx.publishedAt = c;
144
+ },
145
+ ],
146
+ [
147
+ 'article:modified_time',
148
+ (ctx, c) => {
149
+ ctx.modifiedAt = c;
150
+ },
151
+ ],
152
+ ]);
153
+ const META_NAME_HANDLERS = new Map([
154
+ [
155
+ 'twitter:title',
156
+ (ctx, c) => {
157
+ ctx.title.twitter = c;
158
+ },
159
+ ],
160
+ [
161
+ 'twitter:description',
162
+ (ctx, c) => {
163
+ ctx.description.twitter = c;
164
+ },
165
+ ],
166
+ [
167
+ 'description',
168
+ (ctx, c) => {
169
+ ctx.description.standard = c;
170
+ },
171
+ ],
172
+ [
173
+ 'author',
174
+ (ctx, c) => {
175
+ ctx.author = c;
176
+ },
177
+ ],
178
+ ]);
132
179
  function extractMetadata(document) {
133
- const title = {};
134
- const description = {};
135
- let author;
180
+ const ctx = {
181
+ title: {},
182
+ description: {},
183
+ };
136
184
  for (const tag of document.querySelectorAll('meta')) {
137
185
  const content = tag.getAttribute('content')?.trim();
138
186
  if (!content)
139
187
  continue;
140
188
  const property = tag.getAttribute('property');
189
+ if (property) {
190
+ META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
191
+ }
141
192
  const name = tag.getAttribute('name');
142
- if (property === 'og:title')
143
- title.og = content;
144
- else if (property === 'og:description')
145
- description.og = content;
146
- else if (name === 'twitter:title')
147
- title.twitter = content;
148
- else if (name === 'twitter:description')
149
- description.twitter = content;
150
- else if (name === 'description')
151
- description.standard = content;
152
- else if (name === 'author')
153
- author = content;
193
+ if (name) {
194
+ META_NAME_HANDLERS.get(name)?.(ctx, content);
195
+ }
154
196
  }
155
197
  const titleEl = document.querySelector('title');
156
- if (!title.standard && titleEl?.textContent) {
157
- title.standard = titleEl.textContent.trim();
198
+ if (!ctx.title.standard && titleEl?.textContent) {
199
+ ctx.title.standard = titleEl.textContent.trim();
158
200
  }
159
- const resolvedTitle = title.og ?? title.twitter ?? title.standard;
160
- const resolvedDesc = description.og ?? description.twitter ?? description.standard;
201
+ const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
202
+ const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
161
203
  const metadata = {};
162
204
  if (resolvedTitle)
163
205
  metadata.title = resolvedTitle;
164
206
  if (resolvedDesc)
165
207
  metadata.description = resolvedDesc;
166
- if (author)
167
- metadata.author = author;
208
+ if (ctx.author)
209
+ metadata.author = ctx.author;
210
+ if (ctx.image)
211
+ metadata.image = ctx.image;
212
+ if (ctx.publishedAt)
213
+ metadata.publishedAt = ctx.publishedAt;
214
+ if (ctx.modifiedAt)
215
+ metadata.modifiedAt = ctx.modifiedAt;
168
216
  return metadata;
169
217
  }
170
218
  function isReadabilityCompatible(doc) {
171
- if (!isRecord(doc))
219
+ if (!isObject(doc))
172
220
  return false;
173
221
  return hasDocumentElement(doc) && hasQuerySelectors(doc);
174
222
  }
@@ -185,14 +233,18 @@ function extractArticle(document) {
185
233
  return null;
186
234
  }
187
235
  try {
188
- const documentClone = document.cloneNode(true);
189
- const rawText = documentClone.querySelector('body')?.textContent ??
190
- documentClone.documentElement.textContent;
236
+ const doc = document;
237
+ const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
191
238
  const textLength = rawText.replace(/\s+/g, ' ').trim().length;
192
- if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
239
+ if (textLength < 100) {
240
+ logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
241
+ 'This might be a client-side rendered (SPA) application. ' +
242
+ 'Content extraction may be incomplete.', { textLength });
243
+ }
244
+ if (textLength >= 400 && !isProbablyReaderable(doc)) {
193
245
  return null;
194
246
  }
195
- const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
247
+ const reader = new Readability(doc, { maxElemsToParse: 20_000 });
196
248
  const parsed = reader.parse();
197
249
  if (!parsed)
198
250
  return null;
@@ -218,7 +270,8 @@ export function extractContent(html, url, options = {
218
270
  }
219
271
  function extractContentWithDocument(html, url, options) {
220
272
  if (!isValidInput(html, url)) {
221
- return { article: null, metadata: {} };
273
+ const { document } = parseHTML('<html></html>');
274
+ return { article: null, metadata: {}, document };
222
275
  }
223
276
  return tryExtractContent(html, url, options);
224
277
  }
@@ -233,7 +286,8 @@ function handleExtractionFailure(error, url, signal) {
233
286
  }
234
287
  throwIfAborted(signal, url, 'extract:error');
235
288
  logError('Failed to extract content', error instanceof Error ? error : undefined);
236
- return { article: null, metadata: {} };
289
+ const { document } = parseHTML('<html></html>');
290
+ return { article: null, metadata: {}, document };
237
291
  }
238
292
  function extractContentStages(html, url, options) {
239
293
  throwIfAborted(options.signal, url, 'extract:begin');
@@ -248,7 +302,8 @@ function extractContentStages(html, url, options) {
248
302
  return {
249
303
  article,
250
304
  metadata,
251
- ...(truncatedHtml.length === html.length ? { document } : {}),
305
+ document,
306
+ ...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
252
307
  };
253
308
  }
254
309
  function tryExtractContent(html, url, options) {
@@ -285,532 +340,7 @@ function applyBaseUri(document, url) {
285
340
  });
286
341
  }
287
342
  }
288
- function containsJsxTag(code) {
289
- for (let index = 0; index < code.length - 1; index += 1) {
290
- if (code[index] !== '<')
291
- continue;
292
- const next = code[index + 1];
293
- if (!next)
294
- continue;
295
- if (next >= 'A' && next <= 'Z')
296
- return true;
297
- }
298
- return false;
299
- }
300
- function containsWord(source, word) {
301
- let startIndex = source.indexOf(word);
302
- while (startIndex !== -1) {
303
- const before = startIndex === 0 ? '' : source[startIndex - 1];
304
- const afterIndex = startIndex + word.length;
305
- const after = afterIndex >= source.length ? '' : source[afterIndex];
306
- if (!isWordChar(before) && !isWordChar(after))
307
- return true;
308
- startIndex = source.indexOf(word, startIndex + word.length);
309
- }
310
- return false;
311
- }
312
- function splitLines(content) {
313
- return content.split('\n');
314
- }
315
- function extractLanguageFromClassName(className) {
316
- const tokens = className.match(/\S+/g);
317
- if (!tokens)
318
- return undefined;
319
- for (const token of tokens) {
320
- const lower = token.toLowerCase();
321
- if (lower.startsWith('language-'))
322
- return token.slice('language-'.length);
323
- if (lower.startsWith('lang-'))
324
- return token.slice('lang-'.length);
325
- if (lower.startsWith('highlight-')) {
326
- return token.slice('highlight-'.length);
327
- }
328
- }
329
- if (tokens.includes('hljs')) {
330
- const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
331
- if (langClass)
332
- return langClass;
333
- }
334
- return undefined;
335
- }
336
- function resolveLanguageFromDataAttribute(dataLang) {
337
- const trimmed = dataLang.trim();
338
- if (!trimmed)
339
- return undefined;
340
- for (const char of trimmed) {
341
- if (!isWordChar(char))
342
- return undefined;
343
- }
344
- return trimmed;
345
- }
346
- function isWordChar(char) {
347
- if (!char)
348
- return false;
349
- const code = char.charCodeAt(0);
350
- return ((code >= 48 && code <= 57) ||
351
- (code >= 65 && code <= 90) ||
352
- (code >= 97 && code <= 122) ||
353
- char === '_');
354
- }
355
- const LANGUAGE_PATTERNS = [
356
- {
357
- language: 'jsx',
358
- pattern: {
359
- keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
360
- custom: (code) => containsJsxTag(code),
361
- },
362
- },
363
- {
364
- language: 'typescript',
365
- pattern: {
366
- wordBoundary: ['interface', 'type'],
367
- custom: (_, lower) => [
368
- ': string',
369
- ':string',
370
- ': number',
371
- ':number',
372
- ': boolean',
373
- ':boolean',
374
- ': void',
375
- ':void',
376
- ': any',
377
- ':any',
378
- ': unknown',
379
- ':unknown',
380
- ': never',
381
- ':never',
382
- ].some((hint) => lower.includes(hint)),
383
- },
384
- },
385
- {
386
- language: 'rust',
387
- pattern: {
388
- regex: /\b(?:fn|impl|struct|enum)\b/,
389
- keywords: ['let mut'],
390
- custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
391
- },
392
- },
393
- {
394
- language: 'javascript',
395
- pattern: {
396
- regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
397
- },
398
- },
399
- {
400
- language: 'python',
401
- pattern: {
402
- regex: /\b(?:def|class|import|from)\b/,
403
- keywords: ['print(', '__name__'],
404
- },
405
- },
406
- {
407
- language: 'bash',
408
- pattern: {
409
- custom: (code) => detectBashIndicators(code),
410
- },
411
- },
412
- {
413
- language: 'css',
414
- pattern: {
415
- regex: /@media|@import|@keyframes/,
416
- custom: (code) => detectCssStructure(code),
417
- },
418
- },
419
- {
420
- language: 'html',
421
- pattern: {
422
- keywords: [
423
- '<!doctype',
424
- '<html',
425
- '<head',
426
- '<body',
427
- '<div',
428
- '<span',
429
- '<p',
430
- '<a',
431
- '<script',
432
- '<style',
433
- ],
434
- },
435
- },
436
- {
437
- language: 'json',
438
- pattern: {
439
- startsWith: ['{', '['],
440
- },
441
- },
442
- {
443
- language: 'yaml',
444
- pattern: {
445
- custom: (code) => detectYamlStructure(code),
446
- },
447
- },
448
- {
449
- language: 'sql',
450
- pattern: {
451
- wordBoundary: [
452
- 'select',
453
- 'insert',
454
- 'update',
455
- 'delete',
456
- 'create',
457
- 'alter',
458
- 'drop',
459
- ],
460
- },
461
- },
462
- {
463
- language: 'go',
464
- pattern: {
465
- wordBoundary: ['package', 'func'],
466
- keywords: ['import "'],
467
- },
468
- },
469
- ];
470
- // Bash detection constants
471
- const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
472
- const BASH_PKG_MANAGERS = [
473
- 'npm',
474
- 'yarn',
475
- 'pnpm',
476
- 'npx',
477
- 'brew',
478
- 'apt',
479
- 'pip',
480
- 'cargo',
481
- 'go',
482
- ];
483
- const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
484
- function isShellPrefix(line) {
485
- return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
486
- }
487
- function matchesBashCommand(line) {
488
- return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
489
- }
490
- function matchesPackageManagerVerb(line) {
491
- for (const mgr of BASH_PKG_MANAGERS) {
492
- if (!line.startsWith(`${mgr} `))
493
- continue;
494
- const rest = line.slice(mgr.length + 1);
495
- if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
496
- return true;
497
- }
498
- }
499
- return false;
500
- }
501
- function detectBashIndicators(code) {
502
- for (const line of splitLines(code)) {
503
- const trimmed = line.trimStart();
504
- if (!trimmed)
505
- continue;
506
- if (isShellPrefix(trimmed) ||
507
- matchesBashCommand(trimmed) ||
508
- matchesPackageManagerVerb(trimmed)) {
509
- return true;
510
- }
511
- }
512
- return false;
513
- }
514
- function detectCssStructure(code) {
515
- for (const line of splitLines(code)) {
516
- const trimmed = line.trimStart();
517
- if (!trimmed)
518
- continue;
519
- const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
520
- trimmed.includes('{');
521
- const isProperty = trimmed.includes(':') && trimmed.includes(';');
522
- if (isSelector || isProperty)
523
- return true;
524
- }
525
- return false;
526
- }
527
- function detectYamlStructure(code) {
528
- for (const line of splitLines(code)) {
529
- const trimmed = line.trim();
530
- if (!trimmed)
531
- continue;
532
- const colonIdx = trimmed.indexOf(':');
533
- if (colonIdx <= 0)
534
- continue;
535
- const after = trimmed[colonIdx + 1];
536
- if (after === ' ' || after === '\t')
537
- return true;
538
- }
539
- return false;
540
- }
541
- function matchesLanguagePattern(code, lower, pattern) {
542
- if (pattern.keywords?.some((kw) => lower.includes(kw)))
543
- return true;
544
- if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
545
- return true;
546
- if (pattern.regex?.test(lower))
547
- return true;
548
- if (pattern.startsWith) {
549
- const trimmed = code.trimStart();
550
- if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
551
- return true;
552
- }
553
- if (pattern.custom?.(code, lower))
554
- return true;
555
- return false;
556
- }
557
- export function detectLanguageFromCode(code) {
558
- const lower = code.toLowerCase();
559
- for (const { language, pattern } of LANGUAGE_PATTERNS) {
560
- if (matchesLanguagePattern(code, lower, pattern))
561
- return language;
562
- }
563
- return undefined;
564
- }
565
- export function resolveLanguageFromAttributes(className, dataLang) {
566
- const classMatch = extractLanguageFromClassName(className);
567
- return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
568
- }
569
- function isElement(node) {
570
- return (isRecord(node) &&
571
- 'getAttribute' in node &&
572
- typeof node.getAttribute === 'function');
573
- }
574
- const STRUCTURAL_TAGS = new Set([
575
- 'script',
576
- 'style',
577
- 'noscript',
578
- 'iframe',
579
- 'form',
580
- 'button',
581
- 'input',
582
- 'select',
583
- 'textarea',
584
- 'svg',
585
- ]);
586
- const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
587
- const NAVIGATION_ROLES = new Set([
588
- 'navigation',
589
- 'banner',
590
- 'complementary',
591
- 'contentinfo',
592
- 'tree',
593
- 'menubar',
594
- 'menu',
595
- 'dialog',
596
- 'alertdialog',
597
- 'search',
598
- ]);
599
- const PROMO_TOKENS = new Set([
600
- 'banner',
601
- 'promo',
602
- 'announcement',
603
- 'cta',
604
- 'callout',
605
- 'advert',
606
- 'ad',
607
- 'ads',
608
- 'sponsor',
609
- 'newsletter',
610
- 'subscribe',
611
- 'cookie',
612
- 'consent',
613
- 'popup',
614
- 'modal',
615
- 'overlay',
616
- 'toast',
617
- 'share',
618
- 'social',
619
- 'related',
620
- 'recommend',
621
- 'comment',
622
- 'breadcrumb',
623
- 'pagination',
624
- 'pager',
625
- 'taglist',
626
- ]);
627
- const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
628
- const FIXED_PATTERN = /\b(fixed|sticky)\b/;
629
- const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
630
- const ISOLATE_PATTERN = /\bisolate\b/;
631
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
632
- const NOISE_MARKERS = [
633
- '<script',
634
- '<style',
635
- '<noscript',
636
- '<iframe',
637
- '<nav',
638
- '<footer',
639
- '<aside',
640
- '<header',
641
- '<form',
642
- '<button',
643
- '<input',
644
- '<select',
645
- '<textarea',
646
- '<svg',
647
- '<canvas',
648
- ' aria-hidden="true"',
649
- " aria-hidden='true'",
650
- ' hidden',
651
- ' role="navigation"',
652
- " role='navigation'",
653
- ' role="banner"',
654
- " role='banner'",
655
- ' role="complementary"',
656
- " role='complementary'",
657
- ' role="contentinfo"',
658
- " role='contentinfo'",
659
- ' role="tree"',
660
- " role='tree'",
661
- ' role="menubar"',
662
- " role='menubar'",
663
- ' role="menu"',
664
- " role='menu'",
665
- ' banner',
666
- ' promo',
667
- ' announcement',
668
- ' cta',
669
- ' callout',
670
- ' advert',
671
- ' newsletter',
672
- ' subscribe',
673
- ' cookie',
674
- ' consent',
675
- ' popup',
676
- ' modal',
677
- ' overlay',
678
- ' toast',
679
- ' fixed',
680
- ' sticky',
681
- ' z-50',
682
- ' z-4',
683
- ' isolate',
684
- ' breadcrumb',
685
- ' pagination',
686
- ];
687
- function mayContainNoise(html) {
688
- const haystack = html.toLowerCase();
689
- return NOISE_MARKERS.some((marker) => haystack.includes(marker));
690
- }
691
- function isFullDocumentHtml(html) {
692
- return HTML_DOCUMENT_MARKERS.test(html);
693
- }
694
- function isStructuralNoiseTag(tagName) {
695
- return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
696
- }
697
- function isElementHidden(element) {
698
- const style = element.getAttribute('style') ?? '';
699
- return (element.getAttribute('hidden') !== null ||
700
- element.getAttribute('aria-hidden') === 'true' ||
701
- /\bdisplay\s*:\s*none\b/i.test(style) ||
702
- /\bvisibility\s*:\s*hidden\b/i.test(style));
703
- }
704
- function hasNoiseRole(role) {
705
- return role !== null && NAVIGATION_ROLES.has(role);
706
- }
707
- function tokenizeIdentifierLikeText(value) {
708
- return value
709
- .toLowerCase()
710
- .replace(/[^a-z0-9]+/g, ' ')
711
- .trim()
712
- .split(' ')
713
- .filter(Boolean);
714
- }
715
- function matchesPromoIdOrClass(className, id) {
716
- const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
717
- return tokens.some((token) => PROMO_TOKENS.has(token));
718
- }
719
- function matchesFixedOrHighZIsolate(className) {
720
- return (FIXED_PATTERN.test(className) ||
721
- (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
722
- }
723
- function readElementMetadata(element) {
724
- return {
725
- tagName: element.tagName.toLowerCase(),
726
- className: element.getAttribute('class') ?? '',
727
- id: element.getAttribute('id') ?? '',
728
- role: element.getAttribute('role'),
729
- isHidden: isElementHidden(element),
730
- };
731
- }
732
- function isBoilerplateHeader({ className, id, role, }) {
733
- if (hasNoiseRole(role))
734
- return true;
735
- const combined = `${className} ${id}`.toLowerCase();
736
- return HEADER_NOISE_PATTERN.test(combined);
737
- }
738
- function isNoiseElement(node) {
739
- const metadata = readElementMetadata(node);
740
- return (isStructuralNoiseTag(metadata.tagName) ||
741
- ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
742
- (metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
743
- metadata.isHidden ||
744
- hasNoiseRole(metadata.role) ||
745
- matchesFixedOrHighZIsolate(metadata.className) ||
746
- matchesPromoIdOrClass(metadata.className, metadata.id));
747
- }
748
- function removeNoiseNodes(nodes) {
749
- for (let index = nodes.length - 1; index >= 0; index -= 1) {
750
- const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
751
- if (!node)
752
- continue;
753
- if (isElement(node) && isNoiseElement(node)) {
754
- node.remove();
755
- }
756
- }
757
- }
758
- function stripNoiseNodes(document) {
759
- // Use targeted selectors for common noise elements instead of querySelectorAll('*')
760
- const targetSelectors = [
761
- 'nav',
762
- 'footer',
763
- 'aside',
764
- 'header[class*="site"]',
765
- 'header[class*="nav"]',
766
- 'header[class*="menu"]',
767
- '[role="banner"]',
768
- '[role="navigation"]',
769
- '[role="dialog"]',
770
- '[style*="display: none"]',
771
- '[style*="display:none"]',
772
- '[hidden]',
773
- '[aria-hidden="true"]',
774
- ].join(',');
775
- const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
776
- // Remove in reverse order to handle nested elements correctly
777
- removeNoiseNodes(potentialNoiseNodes);
778
- // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
779
- const candidateSelectors = [
780
- ...STRUCTURAL_TAGS,
781
- ...ALWAYS_NOISE_TAGS,
782
- 'header',
783
- 'canvas',
784
- '[class]',
785
- '[id]',
786
- '[role]',
787
- '[style]',
788
- ].join(',');
789
- const allElements = document.querySelectorAll(candidateSelectors);
790
- removeNoiseNodes(allElements);
791
- }
792
- function removeNoiseFromHtml(html, document) {
793
- const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
794
- if (!shouldParse)
795
- return html;
796
- try {
797
- const resolvedDocument = document ?? parseHTML(html).document;
798
- stripNoiseNodes(resolvedDocument);
799
- const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
800
- if (bodyInnerHtml)
801
- return bodyInnerHtml;
802
- const docToString = getDocumentToString(resolvedDocument);
803
- if (docToString)
804
- return docToString();
805
- const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
806
- if (documentElementOuterHtml)
807
- return documentElementOuterHtml;
808
- return html;
809
- }
810
- catch {
811
- return html;
812
- }
813
- }
343
+ // DOM noise removal functions moved to ./dom-noise-removal.ts
814
344
  function buildInlineCode(content) {
815
345
  const runs = content.match(/`+/g);
816
346
  let longest = '';
@@ -821,8 +351,11 @@ function buildInlineCode(content) {
821
351
  }
822
352
  }
823
353
  }
354
+ // Use a fence longer than any run of backticks in the content.
824
355
  const delimiter = `\`${longest}`;
825
- const padding = delimiter.length > 1 ? ' ' : '';
356
+ // Only pad when needed to avoid altering code spans unnecessarily.
357
+ // CommonMark recommends padding when the code starts/ends with a backtick.
358
+ const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
826
359
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
827
360
  }
828
361
  function deriveAltFromImageUrl(src) {
@@ -845,16 +378,13 @@ function deriveAltFromImageUrl(src) {
845
378
  }
846
379
  }
847
380
  function isCodeBlock(parent) {
848
- if (!isRecord(parent))
381
+ if (!isObject(parent))
849
382
  return false;
850
383
  const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
851
384
  return ['PRE', 'WRAPPED-PRE'].includes(tagName);
852
385
  }
853
386
  function hasGetAttribute(value) {
854
- return isRecord(value) && typeof value.getAttribute === 'function';
855
- }
856
- function hasCodeBlockTranslators(value) {
857
- return isRecord(value) && isRecord(value.codeBlockTranslators);
387
+ return isObject(value) && typeof value.getAttribute === 'function';
858
388
  }
859
389
  function buildInlineCodeTranslator() {
860
390
  return {
@@ -871,37 +401,19 @@ function resolveAttributeLanguage(node) {
871
401
  const dataLanguage = getAttribute?.('data-language') ?? '';
872
402
  return resolveLanguageFromAttributes(className, dataLanguage);
873
403
  }
874
- function resolveCodeBlockTranslators(visitor) {
875
- const childTranslators = isRecord(visitor) ? visitor.instance : null;
876
- return hasCodeBlockTranslators(childTranslators)
877
- ? childTranslators.codeBlockTranslators
878
- : null;
879
- }
880
- function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
881
- return {
882
- noEscape: true,
883
- preserveWhitespace: true,
884
- ...(codeBlockTranslators
885
- ? { childTranslators: codeBlockTranslators }
886
- : null),
887
- postprocess: ({ content }) => {
888
- const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
889
- return CODE_BLOCK.format(content, language);
890
- },
891
- };
892
- }
893
404
  function buildCodeTranslator(ctx) {
894
- if (!isRecord(ctx))
405
+ if (!isObject(ctx))
895
406
  return buildInlineCodeTranslator();
896
- const { node, parent, visitor } = ctx;
407
+ const { parent } = ctx;
897
408
  if (!isCodeBlock(parent))
898
409
  return buildInlineCodeTranslator();
899
- const attributeLanguage = resolveAttributeLanguage(node);
900
- const codeBlockTranslators = resolveCodeBlockTranslators(visitor);
901
- return buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators);
410
+ return {
411
+ noEscape: true,
412
+ preserveWhitespace: true,
413
+ };
902
414
  }
903
415
  function buildImageTranslator(ctx) {
904
- if (!isRecord(ctx))
416
+ if (!isObject(ctx))
905
417
  return { content: '' };
906
418
  const { node } = ctx;
907
419
  const getAttribute = hasGetAttribute(node)
@@ -914,19 +426,57 @@ function buildImageTranslator(ctx) {
914
426
  content: `![${alt}](${src})`,
915
427
  };
916
428
  }
429
+ function findLanguageFromCodeChild(node) {
430
+ if (!isObject(node))
431
+ return undefined;
432
+ const { childNodes } = node;
433
+ if (!Array.isArray(childNodes))
434
+ return undefined;
435
+ for (const child of childNodes) {
436
+ if (!isObject(child))
437
+ continue;
438
+ const tagName = typeof child.rawTagName === 'string'
439
+ ? child.rawTagName.toUpperCase()
440
+ : '';
441
+ if (tagName === 'CODE') {
442
+ return resolveAttributeLanguage(child);
443
+ }
444
+ }
445
+ return undefined;
446
+ }
447
+ function createCodeBlockPostprocessor(language) {
448
+ return ({ content }) => {
449
+ const trimmed = content.trim();
450
+ if (!trimmed)
451
+ return '';
452
+ const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
453
+ return CODE_BLOCK.format(trimmed, resolvedLanguage);
454
+ };
455
+ }
456
+ function buildPreTranslator(ctx) {
457
+ if (!isObject(ctx))
458
+ return {};
459
+ const { node } = ctx;
460
+ const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
461
+ return {
462
+ noEscape: true,
463
+ preserveWhitespace: true,
464
+ postprocess: createCodeBlockPostprocessor(attributeLanguage),
465
+ };
466
+ }
917
467
  function createCustomTranslators() {
918
468
  return {
919
469
  code: (ctx) => buildCodeTranslator(ctx),
920
470
  img: (ctx) => buildImageTranslator(ctx),
921
471
  dl: (ctx) => {
922
- if (!isRecord(ctx) || !isRecord(ctx.node)) {
472
+ if (!isObject(ctx) || !isObject(ctx.node)) {
923
473
  return { content: '' };
924
474
  }
925
475
  const node = ctx.node;
926
476
  const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
927
477
  const items = childNodes
928
478
  .map((child) => {
929
- if (!isRecord(child))
479
+ if (!isObject(child))
930
480
  return '';
931
481
  const nodeName = typeof child.nodeName === 'string'
932
482
  ? child.nodeName.toUpperCase()
@@ -956,6 +506,8 @@ function createCustomTranslators() {
956
506
  sup: () => ({
957
507
  postprocess: ({ content }) => `^${content}^`,
958
508
  }),
509
+ // Fix #6: Handle <pre> without <code> - wrap in fenced code block
510
+ pre: (ctx) => buildPreTranslator(ctx),
959
511
  };
960
512
  }
961
513
  let markdownInstance = null;
@@ -971,9 +523,11 @@ function getMarkdownConverter() {
971
523
  markdownInstance ??= createMarkdownInstance();
972
524
  return markdownInstance;
973
525
  }
974
- function translateHtmlToMarkdown(html, url, signal, document) {
526
+ function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
975
527
  throwIfAborted(signal, url, 'markdown:begin');
976
- const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document));
528
+ const cleanedHtml = skipNoiseRemoval
529
+ ? html
530
+ : runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
977
531
  throwIfAborted(signal, url, 'markdown:cleaned');
978
532
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
979
533
  throwIfAborted(signal, url, 'markdown:translated');
@@ -989,151 +543,18 @@ export function htmlToMarkdown(html, metadata, options) {
989
543
  if (!html)
990
544
  return buildMetadataFooter(metadata, url);
991
545
  try {
992
- const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
546
+ const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
993
547
  return appendMetadataFooter(content, metadata, url);
994
548
  }
995
549
  catch (error) {
996
550
  if (error instanceof FetchError) {
997
551
  throw error;
998
552
  }
553
+ logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
999
554
  return buildMetadataFooter(metadata, url);
1000
555
  }
1001
556
  }
1002
- function cleanupMarkdownArtifacts(content) {
1003
- let result = content;
1004
- const fixOrphanHeadings = (text) => {
1005
- return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
1006
- if (typeof prefix !== 'string' ||
1007
- typeof hashes !== 'string' ||
1008
- typeof heading !== 'string') {
1009
- return match;
1010
- }
1011
- if (heading.length > 150) {
1012
- return match;
1013
- }
1014
- const trimmedPrefix = prefix.trim();
1015
- if (trimmedPrefix === '') {
1016
- return `${hashes} ${heading}\n\n`;
1017
- }
1018
- return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
1019
- });
1020
- };
1021
- result = fixOrphanHeadings(result);
1022
- result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
1023
- const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
1024
- result = result.replace(zeroWidthAnchorLink, '');
1025
- result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
1026
- result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
1027
- result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
1028
- result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
1029
- result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
1030
- result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
1031
- const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
1032
- const lines = result.split('\n');
1033
- const filtered = [];
1034
- let skipTocBlock = false;
1035
- for (let i = 0; i < lines.length; i += 1) {
1036
- const line = lines[i] ?? '';
1037
- const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
1038
- const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
1039
- if (tocLinkLine.test(line)) {
1040
- const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
1041
- const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
1042
- if (prevIsToc || nextIsToc) {
1043
- skipTocBlock = true;
1044
- continue;
1045
- }
1046
- }
1047
- else if (line.trim() === '' && skipTocBlock) {
1048
- skipTocBlock = false;
1049
- continue;
1050
- }
1051
- else {
1052
- skipTocBlock = false;
1053
- }
1054
- filtered.push(line);
1055
- }
1056
- result = filtered.join('\n');
1057
- result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
1058
- result = result.replace(/^Was this page helpful\??\s*$/gim, '');
1059
- result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
1060
- result = result.replace(/\\([[]])/g, '$1');
1061
- result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
1062
- result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
1063
- result = result.replace(/\n{3,}/g, '\n\n');
1064
- return result.trim();
1065
- }
1066
- const HEADING_KEYWORDS = new Set([
1067
- 'overview',
1068
- 'introduction',
1069
- 'summary',
1070
- 'conclusion',
1071
- 'prerequisites',
1072
- 'requirements',
1073
- 'installation',
1074
- 'configuration',
1075
- 'usage',
1076
- 'features',
1077
- 'limitations',
1078
- 'troubleshooting',
1079
- 'faq',
1080
- 'resources',
1081
- 'references',
1082
- 'changelog',
1083
- 'license',
1084
- 'acknowledgments',
1085
- 'appendix',
1086
- ]);
1087
- function isLikelyHeadingLine(line) {
1088
- const trimmed = line.trim();
1089
- if (!trimmed || trimmed.length > 80)
1090
- return false;
1091
- if (/^#{1,6}\s/.test(trimmed))
1092
- return false;
1093
- if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
1094
- return false;
1095
- if (/[.!?]$/.test(trimmed))
1096
- return false;
1097
- if (/^\[.*\]\(.*\)$/.test(trimmed))
1098
- return false;
1099
- if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
1100
- return true;
1101
- }
1102
- const words = trimmed.split(/\s+/);
1103
- if (words.length >= 2 && words.length <= 6) {
1104
- const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
1105
- if (isTitleCase)
1106
- return true;
1107
- }
1108
- if (words.length === 1) {
1109
- const lower = trimmed.toLowerCase();
1110
- if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
1111
- return true;
1112
- }
1113
- }
1114
- return false;
1115
- }
1116
- function promoteOrphanHeadings(markdown) {
1117
- const lines = markdown.split('\n');
1118
- const result = [];
1119
- for (let i = 0; i < lines.length; i += 1) {
1120
- const line = lines[i] ?? '';
1121
- const prevLine = i > 0 ? lines[i - 1] : '';
1122
- const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
1123
- const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
1124
- const isPrecededByBlank = prevLine?.trim() === '';
1125
- if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
1126
- const trimmed = line.trim();
1127
- const isExample = /^example:\s/i.test(trimmed);
1128
- const prefix = isExample ? '### ' : '## ';
1129
- result.push(prefix + trimmed);
1130
- }
1131
- else {
1132
- result.push(line);
1133
- }
1134
- }
1135
- return result.join('\n');
1136
- }
557
+ // Markdown cleanup functions moved to ./markdown-cleanup.ts
1137
558
  function formatFetchedDate(isoString) {
1138
559
  try {
1139
560
  const date = new Date(isoString);
@@ -1363,13 +784,9 @@ function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
1363
784
  : rawContent;
1364
785
  return { content, title };
1365
786
  }
1366
- function tryTransformRawContent({ html, url, includeMetadata, }) {
1367
- if (!shouldPreserveRawContent(url, html)) {
1368
- return null;
1369
- }
1370
- logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
787
+ function buildRawMarkdownResult({ rawContent, url, includeMetadata, }) {
1371
788
  const { content, title } = buildRawMarkdownPayload({
1372
- rawContent: html,
789
+ rawContent,
1373
790
  url,
1374
791
  includeMetadata,
1375
792
  });
@@ -1379,57 +796,115 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
1379
796
  truncated: false,
1380
797
  };
1381
798
  }
799
+ function tryTransformRawContent({ html, url, includeMetadata, }) {
800
+ if (!shouldPreserveRawContent(url, html)) {
801
+ return null;
802
+ }
803
+ logDebug('Preserving raw markdown content', { url: truncateUrlForLog(url) });
804
+ return buildRawMarkdownResult({
805
+ rawContent: html,
806
+ url,
807
+ includeMetadata,
808
+ });
809
+ }
1382
810
  const MIN_CONTENT_RATIO = 0.3;
1383
811
  const MIN_HTML_LENGTH_FOR_GATE = 100;
1384
812
  const MIN_HEADING_RETENTION_RATIO = 0.7;
1385
- function countHeadings(html) {
1386
- if (!html)
1387
- return 0;
1388
- // Match opening heading tags <h1> through <h6>
1389
- const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
1390
- const matches = html.match(headingPattern);
1391
- return matches ? matches.length : 0;
1392
- }
1393
- function isHeadingStructurePreserved(article, originalHtml) {
1394
- if (!article)
1395
- return false;
1396
- // Cache heading counts to avoid duplicate regex matching
1397
- const originalHeadingCount = countHeadings(originalHtml);
1398
- const articleHeadingCount = countHeadings(article.content);
1399
- // If original has no headings, structure is trivially preserved
1400
- if (originalHeadingCount === 0)
1401
- return true;
1402
- // If article lost >50% of headings, structure is broken
1403
- const retentionRatio = articleHeadingCount / originalHeadingCount;
1404
- return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
1405
- }
1406
- function stripHtmlTagsForLength(html) {
1407
- let result = '';
1408
- let inTag = false;
1409
- for (const char of html) {
1410
- if (char === '<') {
1411
- inTag = true;
1412
- }
1413
- else if (char === '>') {
1414
- inTag = false;
1415
- }
1416
- else if (!inTag) {
1417
- result += char;
1418
- }
1419
- }
1420
- return result;
1421
- }
1422
- export function isExtractionSufficient(article, originalHtml) {
813
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
814
+ /**
815
+ * Check if HTML string needs document wrapper for proper parsing.
816
+ * Fragments without doctype/html/body tags need wrapping.
817
+ */
818
+ function needsDocumentWrapper(html) {
819
+ const trimmed = html.trim().toLowerCase();
820
+ return (!trimmed.startsWith('<!doctype') &&
821
+ !trimmed.startsWith('<html') &&
822
+ !trimmed.startsWith('<body'));
823
+ }
824
+ /**
825
+ * Wrap HTML fragment in minimal document structure for proper parsing.
826
+ */
827
+ function wrapHtmlFragment(html) {
828
+ return `<!DOCTYPE html><html><body>${html}</body></html>`;
829
+ }
830
+ function resolveHtmlDocument(htmlOrDocument) {
831
+ if (typeof htmlOrDocument !== 'string') {
832
+ return htmlOrDocument;
833
+ }
834
+ const htmlToParse = needsDocumentWrapper(htmlOrDocument)
835
+ ? wrapHtmlFragment(htmlOrDocument)
836
+ : htmlOrDocument;
837
+ return parseHTML(htmlToParse).document;
838
+ }
839
+ function countDomSelector(htmlOrDocument, selector) {
840
+ return resolveHtmlDocument(htmlOrDocument).querySelectorAll(selector).length;
841
+ }
842
+ /**
843
+ * Count headings using DOM querySelectorAll.
844
+ * Handles nested content like <h2><span>Text</span></h2> correctly.
845
+ */
846
+ function countHeadingsDom(htmlOrDocument) {
847
+ return countDomSelector(htmlOrDocument, 'h1,h2,h3,h4,h5,h6');
848
+ }
849
+ function countCodeBlocksDom(htmlOrDocument) {
850
+ return countDomSelector(htmlOrDocument, 'pre');
851
+ }
852
+ function cloneDocumentIfNeeded(htmlOrDocument, doc) {
853
+ return typeof htmlOrDocument === 'string'
854
+ ? doc
855
+ : doc.cloneNode(true);
856
+ }
857
+ function stripNonVisibleNodes(doc) {
858
+ for (const el of doc.querySelectorAll('script,style,noscript')) {
859
+ el.remove();
860
+ }
861
+ }
862
+ function resolveDocumentText(doc) {
863
+ // Note: linkedom may return null for body on HTML fragments despite types
864
+ const body = doc.body;
865
+ const docElement = doc.documentElement;
866
+ return body?.textContent ?? docElement?.textContent ?? '';
867
+ }
868
+ /**
869
+ * Get visible text length from HTML, excluding script/style/noscript content.
870
+ * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
871
+ */
872
+ function getVisibleTextLength(htmlOrDocument) {
873
+ const doc = resolveHtmlDocument(htmlOrDocument);
874
+ const workDoc = cloneDocumentIfNeeded(htmlOrDocument, doc);
875
+ stripNonVisibleNodes(workDoc);
876
+ const text = resolveDocumentText(workDoc);
877
+ return text.replace(/\s+/g, ' ').trim().length;
878
+ }
879
+ export function isExtractionSufficient(article, originalHtmlOrDocument) {
1423
880
  if (!article)
1424
881
  return false;
1425
882
  const articleLength = article.textContent.length;
1426
- const originalLength = stripHtmlTagsForLength(originalHtml)
1427
- .replace(/\s+/g, ' ')
1428
- .trim().length;
883
+ // Use DOM-based visible text length to exclude script/style content
884
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
1429
885
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
1430
886
  return true;
1431
887
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
1432
888
  }
889
+ const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
890
+ const MAX_TRUNCATED_LINE_RATIO = 0.5;
891
+ /**
892
+ * Detect if extracted text has many truncated/incomplete sentences.
893
+ * Lines longer than 20 chars that don't end with sentence punctuation
894
+ * are considered potentially truncated.
895
+ */
896
+ function hasTruncatedSentences(text) {
897
+ const lines = text
898
+ .split('\n')
899
+ .filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
900
+ if (lines.length < 3)
901
+ return false;
902
+ const incompleteLines = lines.filter((line) => {
903
+ const trimmed = line.trim();
904
+ return !/[.!?:;]$/.test(trimmed);
905
+ });
906
+ return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
907
+ }
1433
908
  export function determineContentExtractionSource(article) {
1434
909
  return article !== null;
1435
910
  }
@@ -1459,40 +934,147 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
1459
934
  }
1460
935
  return metadata;
1461
936
  }
937
+ /**
938
+ * Content root selectors in priority order.
939
+ * These identify the main content area on a page.
940
+ */
941
+ const CONTENT_ROOT_SELECTORS = [
942
+ 'main',
943
+ 'article',
944
+ '[role="main"]',
945
+ '#content',
946
+ '#main-content',
947
+ '.content',
948
+ '.main-content',
949
+ '.post-content',
950
+ '.article-content',
951
+ '.entry-content',
952
+ '[itemprop="articleBody"]',
953
+ '[data-content]',
954
+ '.post-body',
955
+ '.article-body',
956
+ ];
957
+ /**
958
+ * Find the main content root element in a document.
959
+ * Returns the innerHTML if found, undefined otherwise.
960
+ */
961
+ function findContentRoot(document) {
962
+ for (const selector of CONTENT_ROOT_SELECTORS) {
963
+ const element = document.querySelector(selector);
964
+ if (!element)
965
+ continue;
966
+ // Check if element has meaningful content
967
+ const innerHTML = typeof element.innerHTML === 'string'
968
+ ? element.innerHTML
969
+ : undefined;
970
+ if (innerHTML && innerHTML.trim().length > 100) {
971
+ return innerHTML;
972
+ }
973
+ }
974
+ return undefined;
975
+ }
1462
976
  function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
1463
977
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
1464
- const source = {
1465
- sourceHtml: useArticleContent && article ? article.content : html,
1466
- title: useArticleContent && article ? article.title : extractedMeta.title,
978
+ // If using article content, return it directly
979
+ if (useArticleContent && article) {
980
+ return {
981
+ sourceHtml: article.content,
982
+ title: article.title,
983
+ metadata,
984
+ };
985
+ }
986
+ // Try content root fallback before using full HTML
987
+ if (document) {
988
+ // Apply noise removal to HTML first (without passing document) to get cleaned HTML,
989
+ // then parse and find content root. This prevents the aggressive DOM stripping that
990
+ // happens when noise removal is given the original parsed document.
991
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
992
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
993
+ const contentRoot = findContentRoot(cleanedDoc);
994
+ if (contentRoot) {
995
+ logDebug('Using content root fallback instead of full HTML', {
996
+ url: truncateUrlForLog(url),
997
+ contentLength: contentRoot.length,
998
+ });
999
+ return {
1000
+ sourceHtml: contentRoot,
1001
+ title: extractedMeta.title,
1002
+ metadata,
1003
+ // Skip noise removal - this HTML is already from a cleaned document
1004
+ skipNoiseRemoval: true,
1005
+ };
1006
+ }
1007
+ }
1008
+ // Fall back to full HTML
1009
+ return {
1010
+ sourceHtml: html,
1011
+ title: extractedMeta.title,
1467
1012
  metadata,
1013
+ ...(document ? { document } : {}),
1468
1014
  };
1469
- if (!useArticleContent && document) {
1470
- return { ...source, document };
1471
- }
1472
- return source;
1473
1015
  }
1474
- function logQualityGateFallback({ url, articleLength, }) {
1016
+ function logQualityGateFallback({ safeUrl, articleLength, }) {
1475
1017
  logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
1476
- url: url.substring(0, 80),
1018
+ url: safeUrl,
1477
1019
  articleLength,
1478
1020
  });
1479
1021
  }
1480
- function shouldUseArticleContent(article, html, url) {
1481
- // Check content sufficiency (length-based quality gate)
1482
- if (!isExtractionSufficient(article, html)) {
1483
- logQualityGateFallback({
1484
- url,
1485
- articleLength: article.textContent.length,
1486
- });
1487
- return false;
1022
+ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
1023
+ const articleLength = article.textContent.length;
1024
+ const originalLength = getVisibleTextLength(originalHtmlOrDocument);
1025
+ const safeUrl = truncateUrlForLog(url);
1026
+ let articleDocument = null;
1027
+ const getArticleDocument = () => {
1028
+ if (articleDocument)
1029
+ return articleDocument;
1030
+ articleDocument = resolveHtmlDocument(article.content);
1031
+ return articleDocument;
1032
+ };
1033
+ // If the document is tiny, don't gate too aggressively.
1034
+ if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
1035
+ const ratio = articleLength / originalLength;
1036
+ if (ratio < MIN_CONTENT_RATIO) {
1037
+ logQualityGateFallback({ safeUrl, articleLength });
1038
+ return false;
1039
+ }
1488
1040
  }
1489
- // Check heading structure preservation
1490
- if (!isHeadingStructurePreserved(article, html)) {
1491
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1492
- url: url.substring(0, 80),
1493
- originalHeadings: countHeadings(html),
1494
- articleHeadings: countHeadings(article.content),
1041
+ // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
1042
+ const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
1043
+ if (originalHeadings > 0) {
1044
+ const articleHeadings = countHeadingsDom(getArticleDocument());
1045
+ const retentionRatio = articleHeadings / originalHeadings;
1046
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
1047
+ logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1048
+ url: safeUrl,
1049
+ originalHeadings,
1050
+ articleHeadings,
1051
+ });
1052
+ return false;
1053
+ }
1054
+ }
1055
+ const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
1056
+ if (originalCodeBlocks > 0) {
1057
+ const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
1058
+ const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
1059
+ // Always log code block counts for debugging
1060
+ logDebug('Code block retention check', {
1061
+ url: safeUrl,
1062
+ originalCodeBlocks,
1063
+ articleCodeBlocks,
1064
+ codeRetentionRatio,
1495
1065
  });
1066
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
1067
+ logDebug('Quality gate: Readability removed code blocks, using full HTML', {
1068
+ url: safeUrl,
1069
+ originalCodeBlocks,
1070
+ articleCodeBlocks,
1071
+ });
1072
+ return false;
1073
+ }
1074
+ }
1075
+ // Layout extraction issue: truncated/fragmented lines.
1076
+ if (hasTruncatedSentences(article.textContent)) {
1077
+ logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: safeUrl });
1496
1078
  return false;
1497
1079
  }
1498
1080
  return true;
@@ -1502,8 +1084,9 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1502
1084
  extractArticle: true,
1503
1085
  ...(signal ? { signal } : {}),
1504
1086
  });
1087
+ const originalDocument = document;
1505
1088
  const useArticleContent = article
1506
- ? shouldUseArticleContent(article, html, url)
1089
+ ? shouldUseArticleContent(article, originalDocument, url)
1507
1090
  : false;
1508
1091
  return buildContentSource({
1509
1092
  html,
@@ -1512,7 +1095,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1512
1095
  extractedMeta,
1513
1096
  includeMetadata,
1514
1097
  useArticleContent,
1515
- ...(document ? { document } : {}),
1098
+ document,
1516
1099
  });
1517
1100
  }
1518
1101
  function tryTransformRawStage(html, url, includeMetadata) {
@@ -1535,6 +1118,7 @@ function buildMarkdownFromContext(context, url, signal) {
1535
1118
  url,
1536
1119
  ...(signal ? { signal } : {}),
1537
1120
  ...(context.document ? { document: context.document } : {}),
1121
+ ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
1538
1122
  }));
1539
1123
  return {
1540
1124
  markdown: content,
@@ -1551,11 +1135,14 @@ function runTotalTransformStage(url, fn) {
1551
1135
  return result;
1552
1136
  }
1553
1137
  finally {
1554
- if (success) {
1555
- endTransformStage(totalStage, { truncated: false });
1556
- }
1138
+ finalizeTotalTransformStage(totalStage, success);
1557
1139
  }
1558
1140
  }
1141
+ function finalizeTotalTransformStage(stage, success) {
1142
+ if (!success)
1143
+ return;
1144
+ endTransformStage(stage, { truncated: false });
1145
+ }
1559
1146
  async function runTotalTransformStageAsync(url, fn) {
1560
1147
  const totalStage = startTransformStage(url, 'transform:total');
1561
1148
  let success = false;
@@ -1565,9 +1152,7 @@ async function runTotalTransformStageAsync(url, fn) {
1565
1152
  return result;
1566
1153
  }
1567
1154
  finally {
1568
- if (success) {
1569
- endTransformStage(totalStage, { truncated: false });
1570
- }
1155
+ finalizeTotalTransformStage(totalStage, success);
1571
1156
  }
1572
1157
  }
1573
1158
  export function transformHtmlToMarkdownInProcess(html, url, options) {
@@ -1628,6 +1213,12 @@ class WorkerPool {
1628
1213
  timeoutMs;
1629
1214
  queueMax;
1630
1215
  closed = false;
1216
+ createAbortError(url, stage) {
1217
+ return new FetchError('Request was canceled', url, 499, {
1218
+ reason: 'aborted',
1219
+ stage,
1220
+ });
1221
+ }
1631
1222
  ensureOpen() {
1632
1223
  if (this.closed) {
1633
1224
  throw new Error('Transform worker pool closed');
@@ -1636,10 +1227,7 @@ class WorkerPool {
1636
1227
  ensureNotAborted(signal, url, stage) {
1637
1228
  if (!signal?.aborted)
1638
1229
  return;
1639
- throw new FetchError('Request was canceled', url, 499, {
1640
- reason: 'aborted',
1641
- stage,
1642
- });
1230
+ throw this.createAbortError(url, stage);
1643
1231
  }
1644
1232
  ensureQueueCapacity(url) {
1645
1233
  if (this.queue.length < this.queueMax)
@@ -1704,10 +1292,7 @@ class WorkerPool {
1704
1292
  abortInflightTask(id, url, workerIndex) {
1705
1293
  const slot = this.workers[workerIndex];
1706
1294
  this.cancelWorkerTask(slot, id);
1707
- this.failTask(id, new FetchError('Request was canceled', url, 499, {
1708
- reason: 'aborted',
1709
- stage: 'transform:signal-abort',
1710
- }));
1295
+ this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
1711
1296
  if (slot) {
1712
1297
  this.restartWorker(workerIndex, slot);
1713
1298
  }
@@ -1717,10 +1302,7 @@ class WorkerPool {
1717
1302
  if (queuedIndex === -1)
1718
1303
  return;
1719
1304
  this.queue.splice(queuedIndex, 1);
1720
- reject(new FetchError('Request was canceled', url, 499, {
1721
- reason: 'aborted',
1722
- stage: 'transform:queued-abort',
1723
- }));
1305
+ reject(this.createAbortError(url, 'transform:queued-abort'));
1724
1306
  }
1725
1307
  createWorkerSlot(worker) {
1726
1308
  return {
@@ -1876,10 +1458,7 @@ class WorkerPool {
1876
1458
  if (!task.signal?.aborted)
1877
1459
  return false;
1878
1460
  this.clearAbortListener(task.signal, task.abortListener);
1879
- task.reject(new FetchError('Request was canceled', task.url, 499, {
1880
- reason: 'aborted',
1881
- stage: 'transform:dispatch',
1882
- }));
1461
+ task.reject(this.createAbortError(task.url, 'transform:dispatch'));
1883
1462
  return true;
1884
1463
  }
1885
1464
  markSlotBusy(slot, task) {
@@ -1984,4 +1563,3 @@ export async function transformHtmlToMarkdown(html, url, options) {
1984
1563
  }
1985
1564
  });
1986
1565
  }
1987
- //# sourceMappingURL=transform.js.map