@j0hanz/superfetch 2.1.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -129,83 +129,41 @@ function truncateHtml(html) {
129
129
  });
130
130
  return html.substring(0, maxSize);
131
131
  }
132
- function createMetaCollectorState() {
133
- return {
134
- title: {},
135
- description: {},
136
- author: {},
137
- };
138
- }
139
- function resolveMetaField(state, field) {
140
- const sources = state[field];
141
- return sources.og ?? sources.twitter ?? sources.standard;
142
- }
143
- function parseOpenGraphKey(property) {
144
- if (!property?.startsWith('og:'))
145
- return null;
146
- const key = property.replace('og:', '');
147
- return key === 'title' || key === 'description' ? key : null;
148
- }
149
- function parseTwitterKey(name) {
150
- if (!name?.startsWith('twitter:'))
151
- return null;
152
- const key = name.replace('twitter:', '');
153
- return key === 'title' || key === 'description' ? key : null;
154
- }
155
- function parseStandardKey(name) {
156
- if (name === 'description')
157
- return 'description';
158
- if (name === 'author')
159
- return 'author';
160
- return null;
161
- }
162
- function collectMetaTag(state, tag) {
163
- const content = tag.getAttribute('content')?.trim();
164
- if (!content)
165
- return;
166
- const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
167
- if (ogKey) {
168
- state[ogKey].og = content;
169
- return;
170
- }
171
- const name = tag.getAttribute('name');
172
- const twitterKey = parseTwitterKey(name);
173
- if (twitterKey) {
174
- state[twitterKey].twitter = content;
175
- return;
176
- }
177
- const standardKey = parseStandardKey(name);
178
- if (standardKey) {
179
- state[standardKey].standard = content;
180
- }
181
- }
182
- function scanMetaTags(document, state) {
183
- const metaTags = document.querySelectorAll('meta');
184
- for (const tag of metaTags) {
185
- collectMetaTag(state, tag);
132
+ function extractMetadata(document) {
133
+ const title = {};
134
+ const description = {};
135
+ let author;
136
+ for (const tag of document.querySelectorAll('meta')) {
137
+ const content = tag.getAttribute('content')?.trim();
138
+ if (!content)
139
+ continue;
140
+ const property = tag.getAttribute('property');
141
+ const name = tag.getAttribute('name');
142
+ if (property === 'og:title')
143
+ title.og = content;
144
+ else if (property === 'og:description')
145
+ description.og = content;
146
+ else if (name === 'twitter:title')
147
+ title.twitter = content;
148
+ else if (name === 'twitter:description')
149
+ description.twitter = content;
150
+ else if (name === 'description')
151
+ description.standard = content;
152
+ else if (name === 'author')
153
+ author = content;
186
154
  }
187
- }
188
- function ensureTitleFallback(document, state) {
189
- if (state.title.standard)
190
- return;
191
155
  const titleEl = document.querySelector('title');
192
- if (titleEl?.textContent) {
193
- state.title.standard = titleEl.textContent.trim();
156
+ if (!title.standard && titleEl?.textContent) {
157
+ title.standard = titleEl.textContent.trim();
194
158
  }
195
- }
196
- function extractMetadata(document) {
197
- const state = createMetaCollectorState();
198
- scanMetaTags(document, state);
199
- ensureTitleFallback(document, state);
159
+ const resolvedTitle = title.og ?? title.twitter ?? title.standard;
160
+ const resolvedDesc = description.og ?? description.twitter ?? description.standard;
200
161
  const metadata = {};
201
- const title = resolveMetaField(state, 'title');
202
- const description = resolveMetaField(state, 'description');
203
- const author = resolveMetaField(state, 'author');
204
- if (title !== undefined)
205
- metadata.title = title;
206
- if (description !== undefined)
207
- metadata.description = description;
208
- if (author !== undefined)
162
+ if (resolvedTitle)
163
+ metadata.title = resolvedTitle;
164
+ if (resolvedDesc)
165
+ metadata.description = resolvedDesc;
166
+ if (author)
209
167
  metadata.author = author;
210
168
  return metadata;
211
169
  }
@@ -226,9 +184,6 @@ function extractArticle(document) {
226
184
  logWarn('Document not compatible with Readability');
227
185
  return null;
228
186
  }
229
- return mapParsedArticle(parseReadabilityArticle(document));
230
- }
231
- function parseReadabilityArticle(document) {
232
187
  try {
233
188
  const documentClone = document.cloneNode(true);
234
189
  const rawText = documentClone.body.textContent ||
@@ -238,54 +193,31 @@ function parseReadabilityArticle(document) {
238
193
  return null;
239
194
  }
240
195
  const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
241
- return reader.parse();
196
+ const parsed = reader.parse();
197
+ if (!parsed)
198
+ return null;
199
+ return {
200
+ content: parsed.content ?? '',
201
+ textContent: parsed.textContent ?? '',
202
+ ...(parsed.title != null && { title: parsed.title }),
203
+ ...(parsed.byline != null && { byline: parsed.byline }),
204
+ ...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
205
+ ...(parsed.siteName != null && { siteName: parsed.siteName }),
206
+ };
242
207
  }
243
208
  catch (error) {
244
- logError('Failed to extract article with Readability', asError(error));
209
+ logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
245
210
  return null;
246
211
  }
247
212
  }
248
- function asError(error) {
249
- if (error instanceof Error) {
250
- return error;
251
- }
252
- return undefined;
253
- }
254
- function mapParsedArticle(parsed) {
255
- return parsed ? mapReadabilityResult(parsed) : null;
256
- }
257
- function mapReadabilityResult(parsed) {
258
- return {
259
- content: parsed.content ?? '',
260
- textContent: parsed.textContent ?? '',
261
- ...buildOptionalArticleFields(parsed),
262
- };
263
- }
264
- function buildOptionalArticleFields(parsed) {
265
- const optional = {};
266
- addOptionalField(optional, 'title', parsed.title);
267
- addOptionalField(optional, 'byline', parsed.byline);
268
- addOptionalField(optional, 'excerpt', parsed.excerpt);
269
- addOptionalField(optional, 'siteName', parsed.siteName);
270
- return optional;
271
- }
272
- function addOptionalField(target, key, value) {
273
- if (value == null)
274
- return;
275
- target[key] = value;
276
- }
277
213
  export function extractContent(html, url, options = {
278
214
  extractArticle: true,
279
215
  }) {
280
- const emptyResult = createEmptyExtractionResult();
281
216
  if (!isValidInput(html, url)) {
282
- return emptyResult;
217
+ return { article: null, metadata: {} };
283
218
  }
284
219
  return tryExtractContent(html, url, options);
285
220
  }
286
- function createEmptyExtractionResult() {
287
- return { article: null, metadata: {} };
288
- }
289
221
  function extractArticleWithStage(document, url, shouldExtract) {
290
222
  if (!shouldExtract)
291
223
  return null;
@@ -297,7 +229,7 @@ function handleExtractionFailure(error, url, signal) {
297
229
  }
298
230
  throwIfAborted(signal, url, 'extract:error');
299
231
  logError('Failed to extract content', error instanceof Error ? error : undefined);
300
- return createEmptyExtractionResult();
232
+ return { article: null, metadata: {} };
301
233
  }
302
234
  function extractContentStages(html, url, options) {
303
235
  throwIfAborted(options.signal, url, 'extract:begin');
@@ -325,14 +257,11 @@ function isValidInput(html, url) {
325
257
  return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
326
258
  }
327
259
  function validateRequiredString(value, message) {
328
- if (isNonEmptyString(value))
260
+ if (typeof value === 'string' && value.length > 0)
329
261
  return true;
330
262
  logWarn(message);
331
263
  return false;
332
264
  }
333
- function isNonEmptyString(value) {
334
- return typeof value === 'string' && value.length > 0;
335
- }
336
265
  function resolveArticleExtraction(document, shouldExtract) {
337
266
  return shouldExtract ? extractArticle(document) : null;
338
267
  }
@@ -417,7 +346,124 @@ function isWordChar(char) {
417
346
  (code >= 97 && code <= 122) ||
418
347
  char === '_');
419
348
  }
420
- const BASH_PACKAGE_MANAGERS = [
349
+ const LANGUAGE_PATTERNS = [
350
+ {
351
+ language: 'jsx',
352
+ pattern: {
353
+ keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
354
+ custom: (code) => containsJsxTag(code),
355
+ },
356
+ },
357
+ {
358
+ language: 'typescript',
359
+ pattern: {
360
+ wordBoundary: ['interface', 'type'],
361
+ custom: (_, lower) => [
362
+ ': string',
363
+ ':string',
364
+ ': number',
365
+ ':number',
366
+ ': boolean',
367
+ ':boolean',
368
+ ': void',
369
+ ':void',
370
+ ': any',
371
+ ':any',
372
+ ': unknown',
373
+ ':unknown',
374
+ ': never',
375
+ ':never',
376
+ ].some((hint) => lower.includes(hint)),
377
+ },
378
+ },
379
+ {
380
+ language: 'rust',
381
+ pattern: {
382
+ regex: /\b(?:fn|impl|struct|enum)\b/,
383
+ keywords: ['let mut'],
384
+ custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
385
+ },
386
+ },
387
+ {
388
+ language: 'javascript',
389
+ pattern: {
390
+ regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
391
+ },
392
+ },
393
+ {
394
+ language: 'python',
395
+ pattern: {
396
+ regex: /\b(?:def|class|import|from)\b/,
397
+ keywords: ['print(', '__name__'],
398
+ },
399
+ },
400
+ {
401
+ language: 'bash',
402
+ pattern: {
403
+ custom: (code) => detectBashIndicators(code),
404
+ },
405
+ },
406
+ {
407
+ language: 'css',
408
+ pattern: {
409
+ regex: /@media|@import|@keyframes/,
410
+ custom: (code) => detectCssStructure(code),
411
+ },
412
+ },
413
+ {
414
+ language: 'html',
415
+ pattern: {
416
+ keywords: [
417
+ '<!doctype',
418
+ '<html',
419
+ '<head',
420
+ '<body',
421
+ '<div',
422
+ '<span',
423
+ '<p',
424
+ '<a',
425
+ '<script',
426
+ '<style',
427
+ ],
428
+ },
429
+ },
430
+ {
431
+ language: 'json',
432
+ pattern: {
433
+ startsWith: ['{', '['],
434
+ },
435
+ },
436
+ {
437
+ language: 'yaml',
438
+ pattern: {
439
+ custom: (code) => detectYamlStructure(code),
440
+ },
441
+ },
442
+ {
443
+ language: 'sql',
444
+ pattern: {
445
+ wordBoundary: [
446
+ 'select',
447
+ 'insert',
448
+ 'update',
449
+ 'delete',
450
+ 'create',
451
+ 'alter',
452
+ 'drop',
453
+ ],
454
+ },
455
+ },
456
+ {
457
+ language: 'go',
458
+ pattern: {
459
+ wordBoundary: ['package', 'func'],
460
+ keywords: ['import "'],
461
+ },
462
+ },
463
+ ];
464
+ // Bash detection constants
465
+ const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
466
+ const BASH_PKG_MANAGERS = [
421
467
  'npm',
422
468
  'yarn',
423
469
  'pnpm',
@@ -429,184 +475,83 @@ const BASH_PACKAGE_MANAGERS = [
429
475
  'go',
430
476
  ];
431
477
  const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
432
- const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
433
- function detectBash(code) {
434
- const lines = splitLines(code);
435
- for (const line of lines) {
436
- const trimmed = line.trimStart();
437
- if (!trimmed)
478
+ function isShellPrefix(line) {
479
+ return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
480
+ }
481
+ function matchesBashCommand(line) {
482
+ return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
483
+ }
484
+ function matchesPackageManagerVerb(line) {
485
+ for (const mgr of BASH_PKG_MANAGERS) {
486
+ if (!line.startsWith(`${mgr} `))
438
487
  continue;
439
- if (isBashIndicator(trimmed))
488
+ const rest = line.slice(mgr.length + 1);
489
+ if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
440
490
  return true;
491
+ }
441
492
  }
442
493
  return false;
443
494
  }
444
- function startsWithCommand(line, commands) {
445
- return commands.some((command) => line === command || line.startsWith(`${command} `));
446
- }
447
- function isBashIndicator(line) {
448
- return (isShebang(line) ||
449
- isPromptLine(line) ||
450
- startsWithCommand(line, BASH_COMMANDS) ||
451
- startsWithPackageManagerCommand(line));
452
- }
453
- function isShebang(line) {
454
- return line.startsWith('#!');
455
- }
456
- function isPromptLine(line) {
457
- return line.startsWith('$ ') || line.startsWith('# ');
458
- }
459
- function startsWithPackageManagerCommand(line) {
460
- return BASH_PACKAGE_MANAGERS.some((manager) => {
461
- if (!line.startsWith(`${manager} `))
462
- return false;
463
- const rest = line.slice(manager.length + 1);
464
- return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
465
- });
466
- }
467
- const TYPE_HINTS = [
468
- 'string',
469
- 'number',
470
- 'boolean',
471
- 'void',
472
- 'any',
473
- 'unknown',
474
- 'never',
475
- ];
476
- const HTML_TAGS = [
477
- '<!doctype',
478
- '<html',
479
- '<head',
480
- '<body',
481
- '<div',
482
- '<span',
483
- '<p',
484
- '<a',
485
- '<script',
486
- '<style',
487
- ];
488
- const SQL_KEYWORDS = [
489
- 'select',
490
- 'insert',
491
- 'update',
492
- 'delete',
493
- 'create',
494
- 'alter',
495
- 'drop',
496
- ];
497
- const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
498
- const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
499
- const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
500
- const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
501
- const CODE_DETECTORS = [
502
- { language: 'jsx', detect: detectJsx },
503
- { language: 'typescript', detect: detectTypescript },
504
- { language: 'rust', detect: detectRust },
505
- { language: 'javascript', detect: detectJavascript },
506
- { language: 'python', detect: detectPython },
507
- { language: 'bash', detect: detectBash },
508
- { language: 'css', detect: detectCss },
509
- { language: 'html', detect: detectHtml },
510
- { language: 'json', detect: detectJson },
511
- { language: 'yaml', detect: detectYaml },
512
- { language: 'sql', detect: detectSql },
513
- { language: 'go', detect: detectGo },
514
- ];
515
- function detectJsx(code) {
516
- const lower = code.toLowerCase();
517
- if (lower.includes('classname='))
518
- return true;
519
- if (lower.includes('jsx:'))
520
- return true;
521
- if (lower.includes("from 'react'") || lower.includes('from "react"')) {
522
- return true;
495
+ function detectBashIndicators(code) {
496
+ for (const line of splitLines(code)) {
497
+ const trimmed = line.trimStart();
498
+ if (!trimmed)
499
+ continue;
500
+ if (isShellPrefix(trimmed) ||
501
+ matchesBashCommand(trimmed) ||
502
+ matchesPackageManagerVerb(trimmed)) {
503
+ return true;
504
+ }
523
505
  }
524
- return containsJsxTag(code);
525
- }
526
- function detectTypescript(code) {
527
- const lower = code.toLowerCase();
528
- if (containsWord(lower, 'interface'))
529
- return true;
530
- if (containsWord(lower, 'type'))
531
- return true;
532
- return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
533
- }
534
- function detectRust(code) {
535
- const lower = code.toLowerCase();
536
- return (RUST_WORD_REGEX.test(lower) ||
537
- lower.includes('let mut') ||
538
- (lower.includes('use ') && lower.includes('::')));
539
- }
540
- function detectJavascript(code) {
541
- const lower = code.toLowerCase();
542
- return JS_WORD_REGEX.test(lower);
543
- }
544
- function detectPython(code) {
545
- const lower = code.toLowerCase();
546
- return (PYTHON_WORD_REGEX.test(lower) ||
547
- lower.includes('print(') ||
548
- lower.includes('__name__'));
506
+ return false;
549
507
  }
550
- function detectCss(code) {
551
- const lower = code.toLowerCase();
552
- if (CSS_DIRECTIVE_REGEX.test(lower))
553
- return true;
554
- const lines = splitLines(code);
555
- for (const line of lines) {
508
+ function detectCssStructure(code) {
509
+ for (const line of splitLines(code)) {
556
510
  const trimmed = line.trimStart();
557
511
  if (!trimmed)
558
512
  continue;
559
- if (isCssSelectorLine(trimmed) || isCssPropertyLine(trimmed))
513
+ const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
514
+ trimmed.includes('{');
515
+ const isProperty = trimmed.includes(':') && trimmed.includes(';');
516
+ if (isSelector || isProperty)
560
517
  return true;
561
518
  }
562
519
  return false;
563
520
  }
564
- function detectHtml(code) {
565
- const lower = code.toLowerCase();
566
- return HTML_TAGS.some((tag) => lower.includes(tag));
567
- }
568
- function detectJson(code) {
569
- const trimmed = code.trimStart();
570
- if (!trimmed)
571
- return false;
572
- return trimmed.startsWith('{') || trimmed.startsWith('[');
573
- }
574
- function detectYaml(code) {
575
- const lines = splitLines(code);
576
- for (const line of lines) {
521
+ function detectYamlStructure(code) {
522
+ for (const line of splitLines(code)) {
577
523
  const trimmed = line.trim();
578
524
  if (!trimmed)
579
525
  continue;
580
- const colonIndex = trimmed.indexOf(':');
581
- if (colonIndex <= 0)
526
+ const colonIdx = trimmed.indexOf(':');
527
+ if (colonIdx <= 0)
582
528
  continue;
583
- const after = trimmed[colonIndex + 1];
529
+ const after = trimmed[colonIdx + 1];
584
530
  if (after === ' ' || after === '\t')
585
531
  return true;
586
532
  }
587
533
  return false;
588
534
  }
589
- function detectSql(code) {
590
- const lower = code.toLowerCase();
591
- return SQL_KEYWORDS.some((keyword) => containsWord(lower, keyword));
592
- }
593
- function detectGo(code) {
594
- const lower = code.toLowerCase();
595
- return (containsWord(lower, 'package') ||
596
- containsWord(lower, 'func') ||
597
- lower.includes('import "'));
598
- }
599
- function isCssSelectorLine(line) {
600
- if (!line.startsWith('.') && !line.startsWith('#'))
601
- return false;
602
- return line.includes('{');
603
- }
604
- function isCssPropertyLine(line) {
605
- return line.includes(':') && line.includes(';');
535
+ function matchesLanguagePattern(code, lower, pattern) {
536
+ if (pattern.keywords?.some((kw) => lower.includes(kw)))
537
+ return true;
538
+ if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
539
+ return true;
540
+ if (pattern.regex?.test(lower))
541
+ return true;
542
+ if (pattern.startsWith) {
543
+ const trimmed = code.trimStart();
544
+ if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
545
+ return true;
546
+ }
547
+ if (pattern.custom?.(code, lower))
548
+ return true;
549
+ return false;
606
550
  }
607
551
  export function detectLanguageFromCode(code) {
608
- for (const { language, detect } of CODE_DETECTORS) {
609
- if (detect(code))
552
+ const lower = code.toLowerCase();
553
+ for (const { language, pattern } of LANGUAGE_PATTERNS) {
554
+ if (matchesLanguagePattern(code, lower, pattern))
610
555
  return language;
611
556
  }
612
557
  return undefined;
@@ -630,6 +575,7 @@ const STRUCTURAL_TAGS = new Set([
630
575
  'input',
631
576
  'select',
632
577
  'textarea',
578
+ 'svg',
633
579
  ]);
634
580
  const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
635
581
  const NAVIGATION_ROLES = new Set([
@@ -642,6 +588,7 @@ const NAVIGATION_ROLES = new Set([
642
588
  'menu',
643
589
  'dialog',
644
590
  'alertdialog',
591
+ 'search',
645
592
  ]);
646
593
  const PROMO_TOKENS = new Set([
647
594
  'banner',
@@ -669,6 +616,7 @@ const PROMO_TOKENS = new Set([
669
616
  'breadcrumb',
670
617
  'pagination',
671
618
  'pager',
619
+ 'taglist',
672
620
  ]);
673
621
  const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
674
622
  const FIXED_PATTERN = /\b(fixed|sticky)\b/;
@@ -727,6 +675,8 @@ const NOISE_MARKERS = [
727
675
  ' z-50',
728
676
  ' z-4',
729
677
  ' isolate',
678
+ ' breadcrumb',
679
+ ' pagination',
730
680
  ];
731
681
  function mayContainNoise(html) {
732
682
  const haystack = html.toLowerCase();
@@ -760,11 +710,9 @@ function matchesPromoIdOrClass(className, id) {
760
710
  const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
761
711
  return tokens.some((token) => PROMO_TOKENS.has(token));
762
712
  }
763
- function matchesHighZIsolate(className) {
764
- return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
765
- }
766
713
  function matchesFixedOrHighZIsolate(className) {
767
- return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
714
+ return (FIXED_PATTERN.test(className) ||
715
+ (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
768
716
  }
769
717
  function readElementMetadata(element) {
770
718
  return {
@@ -791,8 +739,7 @@ function isNoiseElement(node) {
791
739
  matchesFixedOrHighZIsolate(metadata.className) ||
792
740
  matchesPromoIdOrClass(metadata.className, metadata.id));
793
741
  }
794
- function stripNoiseNodes(document) {
795
- const nodes = document.querySelectorAll('*');
742
+ function removeNoiseNodes(nodes) {
796
743
  for (let index = nodes.length - 1; index >= 0; index -= 1) {
797
744
  const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
798
745
  if (!node)
@@ -802,6 +749,30 @@ function stripNoiseNodes(document) {
802
749
  }
803
750
  }
804
751
  }
752
+ function stripNoiseNodes(document) {
753
+ // First pass: use targeted selectors for common noise elements before the full querySelectorAll('*') sweep
754
+ const targetSelectors = [
755
+ 'nav',
756
+ 'footer',
757
+ 'aside',
758
+ 'header[class*="site"]',
759
+ 'header[class*="nav"]',
760
+ 'header[class*="menu"]',
761
+ '[role="banner"]',
762
+ '[role="navigation"]',
763
+ '[role="dialog"]',
764
+ '[style*="display: none"]',
765
+ '[style*="display:none"]',
766
+ '[hidden]',
767
+ '[aria-hidden="true"]',
768
+ ].join(',');
769
+ const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
770
+ // Remove in reverse order to handle nested elements correctly
771
+ removeNoiseNodes(potentialNoiseNodes);
772
+ // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
773
+ const allElements = document.querySelectorAll('*');
774
+ removeNoiseNodes(allElements);
775
+ }
805
776
  function removeNoiseFromHtml(html) {
806
777
  const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
807
778
  if (!shouldParse)
@@ -826,7 +797,14 @@ function removeNoiseFromHtml(html) {
826
797
  }
827
798
  function buildInlineCode(content) {
828
799
  const runs = content.match(/`+/g);
829
- const longest = runs?.sort((a, b) => b.length - a.length)[0] ?? '';
800
+ let longest = '';
801
+ if (runs) {
802
+ for (const run of runs) {
803
+ if (run.length > longest.length) {
804
+ longest = run;
805
+ }
806
+ }
807
+ }
830
808
  const delimiter = `\`${longest}`;
831
809
  const padding = delimiter.length > 1 ? ' ' : '';
832
810
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
@@ -983,11 +961,8 @@ function translateHtmlToMarkdown(html, url, signal) {
983
961
  throwIfAborted(signal, url, 'markdown:cleaned');
984
962
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
985
963
  throwIfAborted(signal, url, 'markdown:translated');
986
- let finalMarkdown = cleanupMarkdownArtifacts(content);
987
- finalMarkdown = normalizeBlockSpacing(finalMarkdown);
988
- finalMarkdown = normalizeTableWhitespace(finalMarkdown);
989
- finalMarkdown = normalizeLineEndings(finalMarkdown);
990
- return finalMarkdown;
964
+ const cleaned = cleanupMarkdownArtifacts(content);
965
+ return promoteOrphanHeadings(cleaned);
991
966
  }
992
967
  function appendMetadataFooter(content, metadata, url) {
993
968
  const footer = buildMetadataFooter(metadata, url);
@@ -1010,37 +985,146 @@ export function htmlToMarkdown(html, metadata, options) {
1010
985
  }
1011
986
  function cleanupMarkdownArtifacts(content) {
1012
987
  let result = content;
988
+ const fixOrphanHeadings = (text) => {
989
+ return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
990
+ if (typeof prefix !== 'string' ||
991
+ typeof hashes !== 'string' ||
992
+ typeof heading !== 'string') {
993
+ return match;
994
+ }
995
+ if (heading.length > 150) {
996
+ return match;
997
+ }
998
+ const trimmedPrefix = prefix.trim();
999
+ if (trimmedPrefix === '') {
1000
+ return `${hashes} ${heading}\n\n`;
1001
+ }
1002
+ return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
1003
+ });
1004
+ };
1005
+ result = fixOrphanHeadings(result);
1013
1006
  result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
1014
1007
  const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
1015
1008
  result = result.replace(zeroWidthAnchorLink, '');
1009
+ result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
1010
+ result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
1011
+ result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
1012
+ result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
1013
+ result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
1014
+ result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
1015
+ const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
1016
+ const lines = result.split('\n');
1017
+ const filtered = [];
1018
+ let skipTocBlock = false;
1019
+ for (let i = 0; i < lines.length; i += 1) {
1020
+ const line = lines[i] ?? '';
1021
+ const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
1022
+ const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
1023
+ if (tocLinkLine.test(line)) {
1024
+ const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
1025
+ const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
1026
+ if (prevIsToc || nextIsToc) {
1027
+ skipTocBlock = true;
1028
+ continue;
1029
+ }
1030
+ }
1031
+ else if (line.trim() === '' && skipTocBlock) {
1032
+ skipTocBlock = false;
1033
+ continue;
1034
+ }
1035
+ else {
1036
+ skipTocBlock = false;
1037
+ }
1038
+ filtered.push(line);
1039
+ }
1040
+ result = filtered.join('\n');
1016
1041
  result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
1017
1042
  result = result.replace(/^Was this page helpful\??\s*$/gim, '');
1043
+ result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
1044
+ result = result.replace(/\\([[]])/g, '$1');
1045
+ result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
1046
+ result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
1018
1047
  result = result.replace(/\n{3,}/g, '\n\n');
1019
1048
  return result.trim();
1020
1049
  }
1021
- function normalizeBlockSpacing(markdown) {
1022
- return markdown
1023
- .replace(/(\n#{1,6} .+)\n(?!\n)/g, '$1\n\n')
1024
- .replace(/\n{3,}/g, '\n\n');
1025
- }
1026
- function normalizeTableWhitespace(markdown) {
1027
- return markdown.replace(/\|([^|\n]+)\|/g, (_match, content) => {
1028
- const trimmed = typeof content === 'string' ? content.trim() : '';
1029
- return `| ${trimmed} |`;
1030
- });
1050
+ const HEADING_KEYWORDS = new Set([
1051
+ 'overview',
1052
+ 'introduction',
1053
+ 'summary',
1054
+ 'conclusion',
1055
+ 'prerequisites',
1056
+ 'requirements',
1057
+ 'installation',
1058
+ 'configuration',
1059
+ 'usage',
1060
+ 'features',
1061
+ 'limitations',
1062
+ 'troubleshooting',
1063
+ 'faq',
1064
+ 'resources',
1065
+ 'references',
1066
+ 'changelog',
1067
+ 'license',
1068
+ 'acknowledgments',
1069
+ 'appendix',
1070
+ ]);
1071
+ function isLikelyHeadingLine(line) {
1072
+ const trimmed = line.trim();
1073
+ if (!trimmed || trimmed.length > 80)
1074
+ return false;
1075
+ if (/^#{1,6}\s/.test(trimmed))
1076
+ return false;
1077
+ if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
1078
+ return false;
1079
+ if (/[.!?]$/.test(trimmed))
1080
+ return false;
1081
+ if (/^\[.*\]\(.*\)$/.test(trimmed))
1082
+ return false;
1083
+ if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
1084
+ return true;
1085
+ }
1086
+ const words = trimmed.split(/\s+/);
1087
+ if (words.length >= 2 && words.length <= 6) {
1088
+ const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
1089
+ if (isTitleCase)
1090
+ return true;
1091
+ }
1092
+ if (words.length === 1) {
1093
+ const lower = trimmed.toLowerCase();
1094
+ if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
1095
+ return true;
1096
+ }
1097
+ }
1098
+ return false;
1031
1099
  }
1032
- function normalizeLineEndings(markdown) {
1033
- return markdown.replace(/\r\n/g, '\n');
1100
+ function promoteOrphanHeadings(markdown) {
1101
+ const lines = markdown.split('\n');
1102
+ const result = [];
1103
+ for (let i = 0; i < lines.length; i += 1) {
1104
+ const line = lines[i] ?? '';
1105
+ const prevLine = i > 0 ? lines[i - 1] : '';
1106
+ const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
1107
+ const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
1108
+ const isPrecededByBlank = prevLine?.trim() === '';
1109
+ if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
1110
+ const trimmed = line.trim();
1111
+ const isExample = /^example:\s/i.test(trimmed);
1112
+ const prefix = isExample ? '### ' : '## ';
1113
+ result.push(prefix + trimmed);
1114
+ }
1115
+ else {
1116
+ result.push(line);
1117
+ }
1118
+ }
1119
+ return result.join('\n');
1034
1120
  }
1035
1121
  function formatFetchedDate(isoString) {
1036
1122
  try {
1037
1123
  const date = new Date(isoString);
1038
- const options = {
1039
- year: 'numeric',
1040
- month: 'short',
1041
- day: 'numeric',
1042
- };
1043
- return date.toLocaleDateString('en-US', options);
1124
+ const day = String(date.getDate()).padStart(2, '0');
1125
+ const month = String(date.getMonth() + 1).padStart(2, '0');
1126
+ const year = date.getFullYear();
1127
+ return `${day}-${month}-${year}`;
1044
1128
  }
1045
1129
  catch {
1046
1130
  return isoString;
@@ -1049,20 +1133,24 @@ function formatFetchedDate(isoString) {
1049
1133
  function buildMetadataFooter(metadata, fallbackUrl) {
1050
1134
  if (!metadata)
1051
1135
  return '';
1052
- const lines = [];
1136
+ const lines = ['---', ''];
1137
+ const url = metadata.url || fallbackUrl;
1138
+ const parts = [];
1053
1139
  if (metadata.title)
1054
- lines.push(`> *${metadata.title}*`);
1055
- if (metadata.description)
1056
- lines.push(`> *${metadata.description}*`);
1140
+ parts.push(`_${metadata.title}_`);
1057
1141
  if (metadata.author)
1058
- lines.push(`> *${metadata.author}*`);
1059
- if (metadata.url)
1060
- lines.push(`> *<${metadata.url}>*`);
1061
- else if (fallbackUrl)
1062
- lines.push(`> *<${fallbackUrl}>*`);
1142
+ parts.push(`_${metadata.author}_`);
1143
+ if (url)
1144
+ parts.push(`[_Original Source_](${url})`);
1063
1145
  if (metadata.fetchedAt) {
1064
1146
  const formattedDate = formatFetchedDate(metadata.fetchedAt);
1065
- lines.push(`> *${formattedDate}*`);
1147
+ parts.push(`_${formattedDate}_`);
1148
+ }
1149
+ if (parts.length > 0) {
1150
+ lines.push(` ${parts.join(' | ')}`);
1151
+ }
1152
+ if (metadata.description) {
1153
+ lines.push(` <sub>${metadata.description}</sub>`);
1066
1154
  }
1067
1155
  return lines.join('\n');
1068
1156
  }
@@ -1277,69 +1365,82 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
1277
1365
  }
1278
1366
  const MIN_CONTENT_RATIO = 0.3;
1279
1367
  const MIN_HTML_LENGTH_FOR_GATE = 100;
1280
- function stripHtmlTags(html) {
1281
- const parts = [];
1368
+ const MIN_HEADING_RETENTION_RATIO = 0.7;
1369
+ function countHeadings(html) {
1370
+ if (!html)
1371
+ return 0;
1372
+ // Match complete <h1> through <h6> elements (opening tag, text with no nested tags, closing tag)
1373
+ const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
1374
+ const matches = html.match(headingPattern);
1375
+ return matches ? matches.length : 0;
1376
+ }
1377
+ function isHeadingStructurePreserved(article, originalHtml) {
1378
+ if (!article)
1379
+ return false;
1380
+ // Cache heading counts to avoid duplicate regex matching
1381
+ const originalHeadingCount = countHeadings(originalHtml);
1382
+ const articleHeadingCount = countHeadings(article.content);
1383
+ // If original has no headings, structure is trivially preserved
1384
+ if (originalHeadingCount === 0)
1385
+ return true;
1386
+ // If article retained less than MIN_HEADING_RETENTION_RATIO (70%) of headings, structure is broken
1387
+ const retentionRatio = articleHeadingCount / originalHeadingCount;
1388
+ return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
1389
+ }
1390
+ function stripHtmlTagsForLength(html) {
1391
+ let result = '';
1282
1392
  let inTag = false;
1283
1393
  for (const char of html) {
1284
1394
  if (char === '<') {
1285
1395
  inTag = true;
1286
- continue;
1287
1396
  }
1288
- if (char === '>') {
1397
+ else if (char === '>') {
1289
1398
  inTag = false;
1290
- continue;
1291
1399
  }
1292
- if (!inTag) {
1293
- parts.push(char);
1400
+ else if (!inTag) {
1401
+ result += char;
1294
1402
  }
1295
1403
  }
1296
- return parts.join('');
1297
- }
1298
- function estimateTextLength(html) {
1299
- return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
1404
+ return result;
1300
1405
  }
1301
1406
  export function isExtractionSufficient(article, originalHtml) {
1302
1407
  if (!article)
1303
1408
  return false;
1304
1409
  const articleLength = article.textContent.length;
1305
- const originalLength = estimateTextLength(originalHtml);
1410
+ const originalLength = stripHtmlTagsForLength(originalHtml)
1411
+ .replace(/\s+/g, ' ')
1412
+ .trim().length;
1306
1413
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
1307
1414
  return true;
1308
1415
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
1309
1416
  }
1310
1417
  export function determineContentExtractionSource(article) {
1311
- return !!article;
1312
- }
1313
- function applyArticleMetadata(metadata, article) {
1314
- if (article.title !== undefined)
1315
- metadata.title = article.title;
1316
- if (article.byline !== undefined)
1317
- metadata.author = article.byline;
1318
- }
1319
- function applyExtractedMetadata(metadata, extractedMeta) {
1320
- if (extractedMeta.title !== undefined)
1321
- metadata.title = extractedMeta.title;
1322
- if (extractedMeta.description !== undefined) {
1323
- metadata.description = extractedMeta.description;
1324
- }
1325
- if (extractedMeta.author !== undefined) {
1326
- metadata.author = extractedMeta.author;
1327
- }
1418
+ return article !== null;
1328
1419
  }
1329
1420
  export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
1330
1421
  if (!includeMetadata)
1331
1422
  return undefined;
1332
- const now = new Date().toISOString();
1333
1423
  const metadata = {
1334
1424
  type: 'metadata',
1335
1425
  url,
1336
- fetchedAt: now,
1426
+ fetchedAt: new Date().toISOString(),
1337
1427
  };
1338
1428
  if (shouldExtractFromArticle && article) {
1339
- applyArticleMetadata(metadata, article);
1340
- return metadata;
1429
+ if (article.title !== undefined)
1430
+ metadata.title = article.title;
1431
+ if (article.byline !== undefined)
1432
+ metadata.author = article.byline;
1433
+ }
1434
+ else {
1435
+ if (extractedMeta.title !== undefined)
1436
+ metadata.title = extractedMeta.title;
1437
+ if (extractedMeta.description !== undefined) {
1438
+ metadata.description = extractedMeta.description;
1439
+ }
1440
+ if (extractedMeta.author !== undefined) {
1441
+ metadata.author = extractedMeta.author;
1442
+ }
1341
1443
  }
1342
- applyExtractedMetadata(metadata, extractedMeta);
1343
1444
  return metadata;
1344
1445
  }
1345
1446
  function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
@@ -1357,17 +1458,24 @@ function logQualityGateFallback({ url, articleLength, }) {
1357
1458
  });
1358
1459
  }
1359
1460
  function shouldUseArticleContent(article, html, url) {
1360
- const shouldExtractFromArticle = determineContentExtractionSource(article);
1361
- if (!shouldExtractFromArticle)
1461
+ // Check content sufficiency (length-based quality gate)
1462
+ if (!isExtractionSufficient(article, html)) {
1463
+ logQualityGateFallback({
1464
+ url,
1465
+ articleLength: article.textContent.length,
1466
+ });
1362
1467
  return false;
1363
- if (isExtractionSufficient(article, html)) {
1364
- return true;
1365
1468
  }
1366
- logQualityGateFallback({
1367
- url,
1368
- articleLength: article.textContent.length,
1369
- });
1370
- return false;
1469
+ // Check heading structure preservation
1470
+ if (!isHeadingStructurePreserved(article, html)) {
1471
+ logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1472
+ url: url.substring(0, 80),
1473
+ originalHeadings: countHeadings(html),
1474
+ articleHeadings: countHeadings(article.content),
1475
+ });
1476
+ return false;
1477
+ }
1478
+ return true;
1371
1479
  }
1372
1480
  function resolveContentSource({ html, url, includeMetadata, signal, }) {
1373
1481
  const { article, metadata: extractedMeta } = extractContent(html, url, {