@j0hanz/superfetch 2.1.8 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/transform.js CHANGED
@@ -129,83 +129,41 @@ function truncateHtml(html) {
129
129
  });
130
130
  return html.substring(0, maxSize);
131
131
  }
132
- function createMetaCollectorState() {
133
- return {
134
- title: {},
135
- description: {},
136
- author: {},
137
- };
138
- }
139
- function resolveMetaField(state, field) {
140
- const sources = state[field];
141
- return sources.og ?? sources.twitter ?? sources.standard;
142
- }
143
- function parseOpenGraphKey(property) {
144
- if (!property?.startsWith('og:'))
145
- return null;
146
- const key = property.replace('og:', '');
147
- return key === 'title' || key === 'description' ? key : null;
148
- }
149
- function parseTwitterKey(name) {
150
- if (!name?.startsWith('twitter:'))
151
- return null;
152
- const key = name.replace('twitter:', '');
153
- return key === 'title' || key === 'description' ? key : null;
154
- }
155
- function parseStandardKey(name) {
156
- if (name === 'description')
157
- return 'description';
158
- if (name === 'author')
159
- return 'author';
160
- return null;
161
- }
162
- function collectMetaTag(state, tag) {
163
- const content = tag.getAttribute('content')?.trim();
164
- if (!content)
165
- return;
166
- const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
167
- if (ogKey) {
168
- state[ogKey].og = content;
169
- return;
170
- }
171
- const name = tag.getAttribute('name');
172
- const twitterKey = parseTwitterKey(name);
173
- if (twitterKey) {
174
- state[twitterKey].twitter = content;
175
- return;
176
- }
177
- const standardKey = parseStandardKey(name);
178
- if (standardKey) {
179
- state[standardKey].standard = content;
180
- }
181
- }
182
- function scanMetaTags(document, state) {
183
- const metaTags = document.querySelectorAll('meta');
184
- for (const tag of metaTags) {
185
- collectMetaTag(state, tag);
132
+ function extractMetadata(document) {
133
+ const title = {};
134
+ const description = {};
135
+ let author;
136
+ for (const tag of document.querySelectorAll('meta')) {
137
+ const content = tag.getAttribute('content')?.trim();
138
+ if (!content)
139
+ continue;
140
+ const property = tag.getAttribute('property');
141
+ const name = tag.getAttribute('name');
142
+ if (property === 'og:title')
143
+ title.og = content;
144
+ else if (property === 'og:description')
145
+ description.og = content;
146
+ else if (name === 'twitter:title')
147
+ title.twitter = content;
148
+ else if (name === 'twitter:description')
149
+ description.twitter = content;
150
+ else if (name === 'description')
151
+ description.standard = content;
152
+ else if (name === 'author')
153
+ author = content;
186
154
  }
187
- }
188
- function ensureTitleFallback(document, state) {
189
- if (state.title.standard)
190
- return;
191
155
  const titleEl = document.querySelector('title');
192
- if (titleEl?.textContent) {
193
- state.title.standard = titleEl.textContent.trim();
156
+ if (!title.standard && titleEl?.textContent) {
157
+ title.standard = titleEl.textContent.trim();
194
158
  }
195
- }
196
- function extractMetadata(document) {
197
- const state = createMetaCollectorState();
198
- scanMetaTags(document, state);
199
- ensureTitleFallback(document, state);
159
+ const resolvedTitle = title.og ?? title.twitter ?? title.standard;
160
+ const resolvedDesc = description.og ?? description.twitter ?? description.standard;
200
161
  const metadata = {};
201
- const title = resolveMetaField(state, 'title');
202
- const description = resolveMetaField(state, 'description');
203
- const author = resolveMetaField(state, 'author');
204
- if (title !== undefined)
205
- metadata.title = title;
206
- if (description !== undefined)
207
- metadata.description = description;
208
- if (author !== undefined)
162
+ if (resolvedTitle)
163
+ metadata.title = resolvedTitle;
164
+ if (resolvedDesc)
165
+ metadata.description = resolvedDesc;
166
+ if (author)
209
167
  metadata.author = author;
210
168
  return metadata;
211
169
  }
@@ -226,66 +184,44 @@ function extractArticle(document) {
226
184
  logWarn('Document not compatible with Readability');
227
185
  return null;
228
186
  }
229
- return mapParsedArticle(parseReadabilityArticle(document));
230
- }
231
- function parseReadabilityArticle(document) {
232
187
  try {
233
188
  const documentClone = document.cloneNode(true);
234
- const rawText = documentClone.body.textContent ||
189
+ const rawText = documentClone.querySelector('body')?.textContent ??
235
190
  documentClone.documentElement.textContent;
236
191
  const textLength = rawText.replace(/\s+/g, ' ').trim().length;
237
192
  if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
238
193
  return null;
239
194
  }
240
195
  const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
241
- return reader.parse();
196
+ const parsed = reader.parse();
197
+ if (!parsed)
198
+ return null;
199
+ return {
200
+ content: parsed.content ?? '',
201
+ textContent: parsed.textContent ?? '',
202
+ ...(parsed.title != null && { title: parsed.title }),
203
+ ...(parsed.byline != null && { byline: parsed.byline }),
204
+ ...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
205
+ ...(parsed.siteName != null && { siteName: parsed.siteName }),
206
+ };
242
207
  }
243
208
  catch (error) {
244
- logError('Failed to extract article with Readability', asError(error));
209
+ logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
245
210
  return null;
246
211
  }
247
212
  }
248
- function asError(error) {
249
- if (error instanceof Error) {
250
- return error;
251
- }
252
- return undefined;
253
- }
254
- function mapParsedArticle(parsed) {
255
- return parsed ? mapReadabilityResult(parsed) : null;
256
- }
257
- function mapReadabilityResult(parsed) {
258
- return {
259
- content: parsed.content ?? '',
260
- textContent: parsed.textContent ?? '',
261
- ...buildOptionalArticleFields(parsed),
262
- };
263
- }
264
- function buildOptionalArticleFields(parsed) {
265
- const optional = {};
266
- addOptionalField(optional, 'title', parsed.title);
267
- addOptionalField(optional, 'byline', parsed.byline);
268
- addOptionalField(optional, 'excerpt', parsed.excerpt);
269
- addOptionalField(optional, 'siteName', parsed.siteName);
270
- return optional;
271
- }
272
- function addOptionalField(target, key, value) {
273
- if (value == null)
274
- return;
275
- target[key] = value;
276
- }
277
213
  export function extractContent(html, url, options = {
278
214
  extractArticle: true,
279
215
  }) {
280
- const emptyResult = createEmptyExtractionResult();
216
+ const result = extractContentWithDocument(html, url, options);
217
+ return { article: result.article, metadata: result.metadata };
218
+ }
219
+ function extractContentWithDocument(html, url, options) {
281
220
  if (!isValidInput(html, url)) {
282
- return emptyResult;
221
+ return { article: null, metadata: {} };
283
222
  }
284
223
  return tryExtractContent(html, url, options);
285
224
  }
286
- function createEmptyExtractionResult() {
287
- return { article: null, metadata: {} };
288
- }
289
225
  function extractArticleWithStage(document, url, shouldExtract) {
290
226
  if (!shouldExtract)
291
227
  return null;
@@ -297,11 +233,12 @@ function handleExtractionFailure(error, url, signal) {
297
233
  }
298
234
  throwIfAborted(signal, url, 'extract:error');
299
235
  logError('Failed to extract content', error instanceof Error ? error : undefined);
300
- return createEmptyExtractionResult();
236
+ return { article: null, metadata: {} };
301
237
  }
302
238
  function extractContentStages(html, url, options) {
303
239
  throwIfAborted(options.signal, url, 'extract:begin');
304
- const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
240
+ const truncatedHtml = truncateHtml(html);
241
+ const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
305
242
  throwIfAborted(options.signal, url, 'extract:parsed');
306
243
  applyBaseUri(document, url);
307
244
  const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
@@ -311,6 +248,7 @@ function extractContentStages(html, url, options) {
311
248
  return {
312
249
  article,
313
250
  metadata,
251
+ ...(truncatedHtml.length === html.length ? { document } : {}),
314
252
  };
315
253
  }
316
254
  function tryExtractContent(html, url, options) {
@@ -325,14 +263,11 @@ function isValidInput(html, url) {
325
263
  return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
326
264
  }
327
265
  function validateRequiredString(value, message) {
328
- if (isNonEmptyString(value))
266
+ if (typeof value === 'string' && value.length > 0)
329
267
  return true;
330
268
  logWarn(message);
331
269
  return false;
332
270
  }
333
- function isNonEmptyString(value) {
334
- return typeof value === 'string' && value.length > 0;
335
- }
336
271
  function resolveArticleExtraction(document, shouldExtract) {
337
272
  return shouldExtract ? extractArticle(document) : null;
338
273
  }
@@ -417,7 +352,124 @@ function isWordChar(char) {
417
352
  (code >= 97 && code <= 122) ||
418
353
  char === '_');
419
354
  }
420
- const BASH_PACKAGE_MANAGERS = [
355
+ const LANGUAGE_PATTERNS = [
356
+ {
357
+ language: 'jsx',
358
+ pattern: {
359
+ keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
360
+ custom: (code) => containsJsxTag(code),
361
+ },
362
+ },
363
+ {
364
+ language: 'typescript',
365
+ pattern: {
366
+ wordBoundary: ['interface', 'type'],
367
+ custom: (_, lower) => [
368
+ ': string',
369
+ ':string',
370
+ ': number',
371
+ ':number',
372
+ ': boolean',
373
+ ':boolean',
374
+ ': void',
375
+ ':void',
376
+ ': any',
377
+ ':any',
378
+ ': unknown',
379
+ ':unknown',
380
+ ': never',
381
+ ':never',
382
+ ].some((hint) => lower.includes(hint)),
383
+ },
384
+ },
385
+ {
386
+ language: 'rust',
387
+ pattern: {
388
+ regex: /\b(?:fn|impl|struct|enum)\b/,
389
+ keywords: ['let mut'],
390
+ custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
391
+ },
392
+ },
393
+ {
394
+ language: 'javascript',
395
+ pattern: {
396
+ regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
397
+ },
398
+ },
399
+ {
400
+ language: 'python',
401
+ pattern: {
402
+ regex: /\b(?:def|class|import|from)\b/,
403
+ keywords: ['print(', '__name__'],
404
+ },
405
+ },
406
+ {
407
+ language: 'bash',
408
+ pattern: {
409
+ custom: (code) => detectBashIndicators(code),
410
+ },
411
+ },
412
+ {
413
+ language: 'css',
414
+ pattern: {
415
+ regex: /@media|@import|@keyframes/,
416
+ custom: (code) => detectCssStructure(code),
417
+ },
418
+ },
419
+ {
420
+ language: 'html',
421
+ pattern: {
422
+ keywords: [
423
+ '<!doctype',
424
+ '<html',
425
+ '<head',
426
+ '<body',
427
+ '<div',
428
+ '<span',
429
+ '<p',
430
+ '<a',
431
+ '<script',
432
+ '<style',
433
+ ],
434
+ },
435
+ },
436
+ {
437
+ language: 'json',
438
+ pattern: {
439
+ startsWith: ['{', '['],
440
+ },
441
+ },
442
+ {
443
+ language: 'yaml',
444
+ pattern: {
445
+ custom: (code) => detectYamlStructure(code),
446
+ },
447
+ },
448
+ {
449
+ language: 'sql',
450
+ pattern: {
451
+ wordBoundary: [
452
+ 'select',
453
+ 'insert',
454
+ 'update',
455
+ 'delete',
456
+ 'create',
457
+ 'alter',
458
+ 'drop',
459
+ ],
460
+ },
461
+ },
462
+ {
463
+ language: 'go',
464
+ pattern: {
465
+ wordBoundary: ['package', 'func'],
466
+ keywords: ['import "'],
467
+ },
468
+ },
469
+ ];
470
+ // Bash detection constants
471
+ const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
472
+ const BASH_PKG_MANAGERS = [
421
473
  'npm',
422
474
  'yarn',
423
475
  'pnpm',
@@ -429,184 +481,83 @@ const BASH_PACKAGE_MANAGERS = [
429
481
  'go',
430
482
  ];
431
483
  const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
432
- const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
433
- function detectBash(code) {
434
- const lines = splitLines(code);
435
- for (const line of lines) {
436
- const trimmed = line.trimStart();
437
- if (!trimmed)
484
+ function isShellPrefix(line) {
485
+ return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
486
+ }
487
+ function matchesBashCommand(line) {
488
+ return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
489
+ }
490
+ function matchesPackageManagerVerb(line) {
491
+ for (const mgr of BASH_PKG_MANAGERS) {
492
+ if (!line.startsWith(`${mgr} `))
438
493
  continue;
439
- if (isBashIndicator(trimmed))
494
+ const rest = line.slice(mgr.length + 1);
495
+ if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
440
496
  return true;
497
+ }
441
498
  }
442
499
  return false;
443
500
  }
444
- function startsWithCommand(line, commands) {
445
- return commands.some((command) => line === command || line.startsWith(`${command} `));
446
- }
447
- function isBashIndicator(line) {
448
- return (isShebang(line) ||
449
- isPromptLine(line) ||
450
- startsWithCommand(line, BASH_COMMANDS) ||
451
- startsWithPackageManagerCommand(line));
452
- }
453
- function isShebang(line) {
454
- return line.startsWith('#!');
455
- }
456
- function isPromptLine(line) {
457
- return line.startsWith('$ ') || line.startsWith('# ');
458
- }
459
- function startsWithPackageManagerCommand(line) {
460
- return BASH_PACKAGE_MANAGERS.some((manager) => {
461
- if (!line.startsWith(`${manager} `))
462
- return false;
463
- const rest = line.slice(manager.length + 1);
464
- return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
465
- });
466
- }
467
- const TYPE_HINTS = [
468
- 'string',
469
- 'number',
470
- 'boolean',
471
- 'void',
472
- 'any',
473
- 'unknown',
474
- 'never',
475
- ];
476
- const HTML_TAGS = [
477
- '<!doctype',
478
- '<html',
479
- '<head',
480
- '<body',
481
- '<div',
482
- '<span',
483
- '<p',
484
- '<a',
485
- '<script',
486
- '<style',
487
- ];
488
- const SQL_KEYWORDS = [
489
- 'select',
490
- 'insert',
491
- 'update',
492
- 'delete',
493
- 'create',
494
- 'alter',
495
- 'drop',
496
- ];
497
- const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
498
- const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
499
- const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
500
- const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
501
- const CODE_DETECTORS = [
502
- { language: 'jsx', detect: detectJsx },
503
- { language: 'typescript', detect: detectTypescript },
504
- { language: 'rust', detect: detectRust },
505
- { language: 'javascript', detect: detectJavascript },
506
- { language: 'python', detect: detectPython },
507
- { language: 'bash', detect: detectBash },
508
- { language: 'css', detect: detectCss },
509
- { language: 'html', detect: detectHtml },
510
- { language: 'json', detect: detectJson },
511
- { language: 'yaml', detect: detectYaml },
512
- { language: 'sql', detect: detectSql },
513
- { language: 'go', detect: detectGo },
514
- ];
515
- function detectJsx(code) {
516
- const lower = code.toLowerCase();
517
- if (lower.includes('classname='))
518
- return true;
519
- if (lower.includes('jsx:'))
520
- return true;
521
- if (lower.includes("from 'react'") || lower.includes('from "react"')) {
522
- return true;
501
+ function detectBashIndicators(code) {
502
+ for (const line of splitLines(code)) {
503
+ const trimmed = line.trimStart();
504
+ if (!trimmed)
505
+ continue;
506
+ if (isShellPrefix(trimmed) ||
507
+ matchesBashCommand(trimmed) ||
508
+ matchesPackageManagerVerb(trimmed)) {
509
+ return true;
510
+ }
523
511
  }
524
- return containsJsxTag(code);
525
- }
526
- function detectTypescript(code) {
527
- const lower = code.toLowerCase();
528
- if (containsWord(lower, 'interface'))
529
- return true;
530
- if (containsWord(lower, 'type'))
531
- return true;
532
- return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
533
- }
534
- function detectRust(code) {
535
- const lower = code.toLowerCase();
536
- return (RUST_WORD_REGEX.test(lower) ||
537
- lower.includes('let mut') ||
538
- (lower.includes('use ') && lower.includes('::')));
539
- }
540
- function detectJavascript(code) {
541
- const lower = code.toLowerCase();
542
- return JS_WORD_REGEX.test(lower);
543
- }
544
- function detectPython(code) {
545
- const lower = code.toLowerCase();
546
- return (PYTHON_WORD_REGEX.test(lower) ||
547
- lower.includes('print(') ||
548
- lower.includes('__name__'));
512
+ return false;
549
513
  }
550
- function detectCss(code) {
551
- const lower = code.toLowerCase();
552
- if (CSS_DIRECTIVE_REGEX.test(lower))
553
- return true;
554
- const lines = splitLines(code);
555
- for (const line of lines) {
514
+ function detectCssStructure(code) {
515
+ for (const line of splitLines(code)) {
556
516
  const trimmed = line.trimStart();
557
517
  if (!trimmed)
558
518
  continue;
559
- if (isCssSelectorLine(trimmed) || isCssPropertyLine(trimmed))
519
+ const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
520
+ trimmed.includes('{');
521
+ const isProperty = trimmed.includes(':') && trimmed.includes(';');
522
+ if (isSelector || isProperty)
560
523
  return true;
561
524
  }
562
525
  return false;
563
526
  }
564
- function detectHtml(code) {
565
- const lower = code.toLowerCase();
566
- return HTML_TAGS.some((tag) => lower.includes(tag));
567
- }
568
- function detectJson(code) {
569
- const trimmed = code.trimStart();
570
- if (!trimmed)
571
- return false;
572
- return trimmed.startsWith('{') || trimmed.startsWith('[');
573
- }
574
- function detectYaml(code) {
575
- const lines = splitLines(code);
576
- for (const line of lines) {
527
+ function detectYamlStructure(code) {
528
+ for (const line of splitLines(code)) {
577
529
  const trimmed = line.trim();
578
530
  if (!trimmed)
579
531
  continue;
580
- const colonIndex = trimmed.indexOf(':');
581
- if (colonIndex <= 0)
532
+ const colonIdx = trimmed.indexOf(':');
533
+ if (colonIdx <= 0)
582
534
  continue;
583
- const after = trimmed[colonIndex + 1];
535
+ const after = trimmed[colonIdx + 1];
584
536
  if (after === ' ' || after === '\t')
585
537
  return true;
586
538
  }
587
539
  return false;
588
540
  }
589
- function detectSql(code) {
590
- const lower = code.toLowerCase();
591
- return SQL_KEYWORDS.some((keyword) => containsWord(lower, keyword));
592
- }
593
- function detectGo(code) {
594
- const lower = code.toLowerCase();
595
- return (containsWord(lower, 'package') ||
596
- containsWord(lower, 'func') ||
597
- lower.includes('import "'));
598
- }
599
- function isCssSelectorLine(line) {
600
- if (!line.startsWith('.') && !line.startsWith('#'))
601
- return false;
602
- return line.includes('{');
603
- }
604
- function isCssPropertyLine(line) {
605
- return line.includes(':') && line.includes(';');
541
+ function matchesLanguagePattern(code, lower, pattern) {
542
+ if (pattern.keywords?.some((kw) => lower.includes(kw)))
543
+ return true;
544
+ if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
545
+ return true;
546
+ if (pattern.regex?.test(lower))
547
+ return true;
548
+ if (pattern.startsWith) {
549
+ const trimmed = code.trimStart();
550
+ if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
551
+ return true;
552
+ }
553
+ if (pattern.custom?.(code, lower))
554
+ return true;
555
+ return false;
606
556
  }
607
557
  export function detectLanguageFromCode(code) {
608
- for (const { language, detect } of CODE_DETECTORS) {
609
- if (detect(code))
558
+ const lower = code.toLowerCase();
559
+ for (const { language, pattern } of LANGUAGE_PATTERNS) {
560
+ if (matchesLanguagePattern(code, lower, pattern))
610
561
  return language;
611
562
  }
612
563
  return undefined;
@@ -630,6 +581,7 @@ const STRUCTURAL_TAGS = new Set([
630
581
  'input',
631
582
  'select',
632
583
  'textarea',
584
+ 'svg',
633
585
  ]);
634
586
  const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
635
587
  const NAVIGATION_ROLES = new Set([
@@ -642,6 +594,7 @@ const NAVIGATION_ROLES = new Set([
642
594
  'menu',
643
595
  'dialog',
644
596
  'alertdialog',
597
+ 'search',
645
598
  ]);
646
599
  const PROMO_TOKENS = new Set([
647
600
  'banner',
@@ -669,6 +622,7 @@ const PROMO_TOKENS = new Set([
669
622
  'breadcrumb',
670
623
  'pagination',
671
624
  'pager',
625
+ 'taglist',
672
626
  ]);
673
627
  const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
674
628
  const FIXED_PATTERN = /\b(fixed|sticky)\b/;
@@ -727,6 +681,8 @@ const NOISE_MARKERS = [
727
681
  ' z-50',
728
682
  ' z-4',
729
683
  ' isolate',
684
+ ' breadcrumb',
685
+ ' pagination',
730
686
  ];
731
687
  function mayContainNoise(html) {
732
688
  const haystack = html.toLowerCase();
@@ -760,11 +716,9 @@ function matchesPromoIdOrClass(className, id) {
760
716
  const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
761
717
  return tokens.some((token) => PROMO_TOKENS.has(token));
762
718
  }
763
- function matchesHighZIsolate(className) {
764
- return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
765
- }
766
719
  function matchesFixedOrHighZIsolate(className) {
767
- return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
720
+ return (FIXED_PATTERN.test(className) ||
721
+ (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
768
722
  }
769
723
  function readElementMetadata(element) {
770
724
  return {
@@ -791,8 +745,7 @@ function isNoiseElement(node) {
791
745
  matchesFixedOrHighZIsolate(metadata.className) ||
792
746
  matchesPromoIdOrClass(metadata.className, metadata.id));
793
747
  }
794
- function stripNoiseNodes(document) {
795
- const nodes = document.querySelectorAll('*');
748
+ function removeNoiseNodes(nodes) {
796
749
  for (let index = nodes.length - 1; index >= 0; index -= 1) {
797
750
  const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
798
751
  if (!node)
@@ -802,20 +755,54 @@ function stripNoiseNodes(document) {
802
755
  }
803
756
  }
804
757
  }
805
- function removeNoiseFromHtml(html) {
758
+ function stripNoiseNodes(document) {
759
+ // Use targeted selectors for common noise elements instead of querySelectorAll('*')
760
+ const targetSelectors = [
761
+ 'nav',
762
+ 'footer',
763
+ 'aside',
764
+ 'header[class*="site"]',
765
+ 'header[class*="nav"]',
766
+ 'header[class*="menu"]',
767
+ '[role="banner"]',
768
+ '[role="navigation"]',
769
+ '[role="dialog"]',
770
+ '[style*="display: none"]',
771
+ '[style*="display:none"]',
772
+ '[hidden]',
773
+ '[aria-hidden="true"]',
774
+ ].join(',');
775
+ const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
776
+ // Remove in reverse order to handle nested elements correctly
777
+ removeNoiseNodes(potentialNoiseNodes);
778
+ // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
779
+ const candidateSelectors = [
780
+ ...STRUCTURAL_TAGS,
781
+ ...ALWAYS_NOISE_TAGS,
782
+ 'header',
783
+ 'canvas',
784
+ '[class]',
785
+ '[id]',
786
+ '[role]',
787
+ '[style]',
788
+ ].join(',');
789
+ const allElements = document.querySelectorAll(candidateSelectors);
790
+ removeNoiseNodes(allElements);
791
+ }
792
+ function removeNoiseFromHtml(html, document) {
806
793
  const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
807
794
  if (!shouldParse)
808
795
  return html;
809
796
  try {
810
- const { document } = parseHTML(html);
811
- stripNoiseNodes(document);
812
- const bodyInnerHtml = getBodyInnerHtml(document);
797
+ const resolvedDocument = document ?? parseHTML(html).document;
798
+ stripNoiseNodes(resolvedDocument);
799
+ const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
813
800
  if (bodyInnerHtml)
814
801
  return bodyInnerHtml;
815
- const docToString = getDocumentToString(document);
802
+ const docToString = getDocumentToString(resolvedDocument);
816
803
  if (docToString)
817
804
  return docToString();
818
- const documentElementOuterHtml = getDocumentElementOuterHtml(document);
805
+ const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
819
806
  if (documentElementOuterHtml)
820
807
  return documentElementOuterHtml;
821
808
  return html;
@@ -826,7 +813,14 @@ function removeNoiseFromHtml(html) {
826
813
  }
827
814
  function buildInlineCode(content) {
828
815
  const runs = content.match(/`+/g);
829
- const longest = runs?.sort((a, b) => b.length - a.length)[0] ?? '';
816
+ let longest = '';
817
+ if (runs) {
818
+ for (const run of runs) {
819
+ if (run.length > longest.length) {
820
+ longest = run;
821
+ }
822
+ }
823
+ }
830
824
  const delimiter = `\`${longest}`;
831
825
  const padding = delimiter.length > 1 ? ' ' : '';
832
826
  return `${delimiter}${padding}${content}${padding}${delimiter}`;
@@ -977,17 +971,14 @@ function getMarkdownConverter() {
977
971
  markdownInstance ??= createMarkdownInstance();
978
972
  return markdownInstance;
979
973
  }
980
- function translateHtmlToMarkdown(html, url, signal) {
974
+ function translateHtmlToMarkdown(html, url, signal, document) {
981
975
  throwIfAborted(signal, url, 'markdown:begin');
982
- const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
976
+ const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document));
983
977
  throwIfAborted(signal, url, 'markdown:cleaned');
984
978
  const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
985
979
  throwIfAborted(signal, url, 'markdown:translated');
986
- let finalMarkdown = cleanupMarkdownArtifacts(content);
987
- finalMarkdown = normalizeBlockSpacing(finalMarkdown);
988
- finalMarkdown = normalizeTableWhitespace(finalMarkdown);
989
- finalMarkdown = normalizeLineEndings(finalMarkdown);
990
- return finalMarkdown;
980
+ const cleaned = cleanupMarkdownArtifacts(content);
981
+ return promoteOrphanHeadings(cleaned);
991
982
  }
992
983
  function appendMetadataFooter(content, metadata, url) {
993
984
  const footer = buildMetadataFooter(metadata, url);
@@ -998,7 +989,7 @@ export function htmlToMarkdown(html, metadata, options) {
998
989
  if (!html)
999
990
  return buildMetadataFooter(metadata, url);
1000
991
  try {
1001
- const content = translateHtmlToMarkdown(html, url, options?.signal);
992
+ const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
1002
993
  return appendMetadataFooter(content, metadata, url);
1003
994
  }
1004
995
  catch (error) {
@@ -1010,37 +1001,146 @@ export function htmlToMarkdown(html, metadata, options) {
1010
1001
  }
1011
1002
  function cleanupMarkdownArtifacts(content) {
1012
1003
  let result = content;
1004
+ const fixOrphanHeadings = (text) => {
1005
+ return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
1006
+ if (typeof prefix !== 'string' ||
1007
+ typeof hashes !== 'string' ||
1008
+ typeof heading !== 'string') {
1009
+ return match;
1010
+ }
1011
+ if (heading.length > 150) {
1012
+ return match;
1013
+ }
1014
+ const trimmedPrefix = prefix.trim();
1015
+ if (trimmedPrefix === '') {
1016
+ return `${hashes} ${heading}\n\n`;
1017
+ }
1018
+ return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
1019
+ });
1020
+ };
1021
+ result = fixOrphanHeadings(result);
1013
1022
  result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
1014
1023
  const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
1015
1024
  result = result.replace(zeroWidthAnchorLink, '');
1025
+ result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
1026
+ result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
1027
+ result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
1028
+ result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
1029
+ result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
1030
+ result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
1031
+ const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
1032
+ const lines = result.split('\n');
1033
+ const filtered = [];
1034
+ let skipTocBlock = false;
1035
+ for (let i = 0; i < lines.length; i += 1) {
1036
+ const line = lines[i] ?? '';
1037
+ const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
1038
+ const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
1039
+ if (tocLinkLine.test(line)) {
1040
+ const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
1041
+ const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
1042
+ if (prevIsToc || nextIsToc) {
1043
+ skipTocBlock = true;
1044
+ continue;
1045
+ }
1046
+ }
1047
+ else if (line.trim() === '' && skipTocBlock) {
1048
+ skipTocBlock = false;
1049
+ continue;
1050
+ }
1051
+ else {
1052
+ skipTocBlock = false;
1053
+ }
1054
+ filtered.push(line);
1055
+ }
1056
+ result = filtered.join('\n');
1016
1057
  result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
1017
1058
  result = result.replace(/^Was this page helpful\??\s*$/gim, '');
1059
+ result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
1060
+ result = result.replace(/\\([[]])/g, '$1');
1061
+ result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
1062
+ result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
1018
1063
  result = result.replace(/\n{3,}/g, '\n\n');
1019
1064
  return result.trim();
1020
1065
  }
1021
- function normalizeBlockSpacing(markdown) {
1022
- return markdown
1023
- .replace(/(\n#{1,6} .+)\n(?!\n)/g, '$1\n\n')
1024
- .replace(/\n{3,}/g, '\n\n');
1025
- }
1026
- function normalizeTableWhitespace(markdown) {
1027
- return markdown.replace(/\|([^|\n]+)\|/g, (_match, content) => {
1028
- const trimmed = typeof content === 'string' ? content.trim() : '';
1029
- return `| ${trimmed} |`;
1030
- });
1066
+ const HEADING_KEYWORDS = new Set([
1067
+ 'overview',
1068
+ 'introduction',
1069
+ 'summary',
1070
+ 'conclusion',
1071
+ 'prerequisites',
1072
+ 'requirements',
1073
+ 'installation',
1074
+ 'configuration',
1075
+ 'usage',
1076
+ 'features',
1077
+ 'limitations',
1078
+ 'troubleshooting',
1079
+ 'faq',
1080
+ 'resources',
1081
+ 'references',
1082
+ 'changelog',
1083
+ 'license',
1084
+ 'acknowledgments',
1085
+ 'appendix',
1086
+ ]);
1087
+ function isLikelyHeadingLine(line) {
1088
+ const trimmed = line.trim();
1089
+ if (!trimmed || trimmed.length > 80)
1090
+ return false;
1091
+ if (/^#{1,6}\s/.test(trimmed))
1092
+ return false;
1093
+ if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
1094
+ return false;
1095
+ if (/[.!?]$/.test(trimmed))
1096
+ return false;
1097
+ if (/^\[.*\]\(.*\)$/.test(trimmed))
1098
+ return false;
1099
+ if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
1100
+ return true;
1101
+ }
1102
+ const words = trimmed.split(/\s+/);
1103
+ if (words.length >= 2 && words.length <= 6) {
1104
+ const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
1105
+ if (isTitleCase)
1106
+ return true;
1107
+ }
1108
+ if (words.length === 1) {
1109
+ const lower = trimmed.toLowerCase();
1110
+ if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
1111
+ return true;
1112
+ }
1113
+ }
1114
+ return false;
1031
1115
  }
1032
- function normalizeLineEndings(markdown) {
1033
- return markdown.replace(/\r\n/g, '\n');
1116
+ function promoteOrphanHeadings(markdown) {
1117
+ const lines = markdown.split('\n');
1118
+ const result = [];
1119
+ for (let i = 0; i < lines.length; i += 1) {
1120
+ const line = lines[i] ?? '';
1121
+ const prevLine = i > 0 ? lines[i - 1] : '';
1122
+ const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
1123
+ const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
1124
+ const isPrecededByBlank = prevLine?.trim() === '';
1125
+ if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
1126
+ const trimmed = line.trim();
1127
+ const isExample = /^example:\s/i.test(trimmed);
1128
+ const prefix = isExample ? '### ' : '## ';
1129
+ result.push(prefix + trimmed);
1130
+ }
1131
+ else {
1132
+ result.push(line);
1133
+ }
1134
+ }
1135
+ return result.join('\n');
1034
1136
  }
1035
1137
  function formatFetchedDate(isoString) {
1036
1138
  try {
1037
1139
  const date = new Date(isoString);
1038
- const options = {
1039
- year: 'numeric',
1040
- month: 'short',
1041
- day: 'numeric',
1042
- };
1043
- return date.toLocaleDateString('en-US', options);
1140
+ const day = String(date.getDate()).padStart(2, '0');
1141
+ const month = String(date.getMonth() + 1).padStart(2, '0');
1142
+ const year = date.getFullYear();
1143
+ return `${day}-${month}-${year}`;
1044
1144
  }
1045
1145
  catch {
1046
1146
  return isoString;
@@ -1049,20 +1149,24 @@ function formatFetchedDate(isoString) {
1049
1149
  function buildMetadataFooter(metadata, fallbackUrl) {
1050
1150
  if (!metadata)
1051
1151
  return '';
1052
- const lines = [];
1152
+ const lines = ['---', ''];
1153
+ const url = metadata.url || fallbackUrl;
1154
+ const parts = [];
1053
1155
  if (metadata.title)
1054
- lines.push(`> *${metadata.title}*`);
1055
- if (metadata.description)
1056
- lines.push(`> *${metadata.description}*`);
1156
+ parts.push(`_${metadata.title}_`);
1057
1157
  if (metadata.author)
1058
- lines.push(`> *${metadata.author}*`);
1059
- if (metadata.url)
1060
- lines.push(`> *<${metadata.url}>*`);
1061
- else if (fallbackUrl)
1062
- lines.push(`> *<${fallbackUrl}>*`);
1158
+ parts.push(`_${metadata.author}_`);
1159
+ if (url)
1160
+ parts.push(`[_Original Source_](${url})`);
1063
1161
  if (metadata.fetchedAt) {
1064
1162
  const formattedDate = formatFetchedDate(metadata.fetchedAt);
1065
- lines.push(`> *${formattedDate}*`);
1163
+ parts.push(`_${formattedDate}_`);
1164
+ }
1165
+ if (parts.length > 0) {
1166
+ lines.push(` ${parts.join(' | ')}`);
1167
+ }
1168
+ if (metadata.description) {
1169
+ lines.push(` <sub>${metadata.description}</sub>`);
1066
1170
  }
1067
1171
  return lines.join('\n');
1068
1172
  }
@@ -1277,78 +1381,95 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
1277
1381
  }
1278
1382
  const MIN_CONTENT_RATIO = 0.3;
1279
1383
  const MIN_HTML_LENGTH_FOR_GATE = 100;
1280
- function stripHtmlTags(html) {
1281
- const parts = [];
1384
+ const MIN_HEADING_RETENTION_RATIO = 0.7;
1385
/**
 * Counts the heading elements (`<h1>` through `<h6>`) in an HTML string.
 *
 * Only opening tags are matched, so headings that contain nested markup
 * (links, `<code>`, `<span>`, ...) are counted as well. Tags that merely start
 * with "h" (`<header>`, `<hr>`, `<html>`, `<head>`) do not match.
 *
 * @param {string | null | undefined} html - HTML to inspect.
 * @returns {number} Number of heading opening tags; 0 for empty or missing input.
 */
function countHeadings(html) {
    if (!html)
        return 0;
    const openingHeadingTag = /<h[1-6](?:\s[^>]*)?>/gi;
    return html.match(openingHeadingTag)?.length ?? 0;
}
1393
/**
 * Checks whether the extracted article kept enough of the page's headings.
 *
 * @param {{ content: string } | null | undefined} article - Extracted article, if any.
 * @param {string} originalHtml - HTML of the full page.
 * @returns {boolean} `false` without an article; `true` when the page has no
 *   headings; otherwise whether the article retains at least
 *   `MIN_HEADING_RETENTION_RATIO` of the page's heading count.
 */
function isHeadingStructurePreserved(article, originalHtml) {
    if (article) {
        const headingsBefore = countHeadings(originalHtml);
        const headingsAfter = countHeadings(article.content);
        // A page without headings has no structure that could be lost.
        return (headingsBefore === 0 ||
            headingsAfter / headingsBefore >= MIN_HEADING_RETENTION_RATIO);
    }
    return false;
}
1406
/**
 * Removes everything between `<` and `>` from an HTML string.
 *
 * This is a single-pass character scan used only to estimate text length; it
 * is not an HTML parser. The angle brackets themselves are always dropped,
 * including a stray `>` that appears outside of any tag.
 *
 * @param {string} html - Markup to strip.
 * @returns {string} The characters found outside of tags, in their original order.
 */
function stripHtmlTagsForLength(html) {
    const kept = [];
    let insideTag = false;
    for (const ch of html) {
        switch (ch) {
            case '<':
                insideTag = true;
                break;
            case '>':
                insideTag = false;
                break;
            default:
                if (!insideTag) {
                    kept.push(ch);
                }
        }
    }
    return kept.join('');
}
1301
1422
  export function isExtractionSufficient(article, originalHtml) {
1302
1423
  if (!article)
1303
1424
  return false;
1304
1425
  const articleLength = article.textContent.length;
1305
- const originalLength = estimateTextLength(originalHtml);
1426
+ const originalLength = stripHtmlTagsForLength(originalHtml)
1427
+ .replace(/\s+/g, ' ')
1428
+ .trim().length;
1306
1429
  if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
1307
1430
  return true;
1308
1431
  return articleLength / originalLength >= MIN_CONTENT_RATIO;
1309
1432
  }
1310
1433
  export function determineContentExtractionSource(article) {
1311
- return !!article;
1312
- }
1313
- function applyArticleMetadata(metadata, article) {
1314
- if (article.title !== undefined)
1315
- metadata.title = article.title;
1316
- if (article.byline !== undefined)
1317
- metadata.author = article.byline;
1318
- }
1319
- function applyExtractedMetadata(metadata, extractedMeta) {
1320
- if (extractedMeta.title !== undefined)
1321
- metadata.title = extractedMeta.title;
1322
- if (extractedMeta.description !== undefined) {
1323
- metadata.description = extractedMeta.description;
1324
- }
1325
- if (extractedMeta.author !== undefined) {
1326
- metadata.author = extractedMeta.author;
1327
- }
1434
+ return article !== null;
1328
1435
  }
1329
1436
  export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
1330
1437
  if (!includeMetadata)
1331
1438
  return undefined;
1332
- const now = new Date().toISOString();
1333
1439
  const metadata = {
1334
1440
  type: 'metadata',
1335
1441
  url,
1336
- fetchedAt: now,
1442
+ fetchedAt: new Date().toISOString(),
1337
1443
  };
1338
1444
  if (shouldExtractFromArticle && article) {
1339
- applyArticleMetadata(metadata, article);
1340
- return metadata;
1445
+ if (article.title !== undefined)
1446
+ metadata.title = article.title;
1447
+ if (article.byline !== undefined)
1448
+ metadata.author = article.byline;
1449
+ }
1450
+ else {
1451
+ if (extractedMeta.title !== undefined)
1452
+ metadata.title = extractedMeta.title;
1453
+ if (extractedMeta.description !== undefined) {
1454
+ metadata.description = extractedMeta.description;
1455
+ }
1456
+ if (extractedMeta.author !== undefined) {
1457
+ metadata.author = extractedMeta.author;
1458
+ }
1341
1459
  }
1342
- applyExtractedMetadata(metadata, extractedMeta);
1343
1460
  return metadata;
1344
1461
  }
1345
- function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
1462
/**
 * Assembles the HTML, title and metadata handed to the Markdown conversion.
 *
 * When the article is used, its content and title are taken; otherwise the
 * full page HTML and the page-level title are used. The `document` value is
 * attached only on the full-page path and only when one was supplied.
 *
 * @param {object} params
 * @param {string} params.html - HTML of the full page.
 * @param {string} params.url - URL the page was fetched from.
 * @param {object | null | undefined} params.article - Extracted article, if any.
 * @param {object} params.extractedMeta - Metadata read from the page.
 * @param {boolean} params.includeMetadata - Whether to build a metadata block.
 * @param {boolean} params.useArticleContent - Whether the article should be the source.
 * @param {object} [params.document] - Optional document object passed through unchanged.
 * @returns {object} `{ sourceHtml, title, metadata }`, plus `document` where applicable.
 */
function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
    const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
    const isArticleSource = Boolean(useArticleContent && article);
    const base = {
        sourceHtml: isArticleSource ? article.content : html,
        title: isArticleSource ? article.title : extractedMeta.title,
        metadata,
    };
    return !useArticleContent && document ? { ...base, document } : base;
}
1353
1474
  function logQualityGateFallback({ url, articleLength, }) {
1354
1475
  logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
@@ -1357,20 +1478,27 @@ function logQualityGateFallback({ url, articleLength, }) {
1357
1478
  });
1358
1479
  }
1359
1480
  function shouldUseArticleContent(article, html, url) {
1360
- const shouldExtractFromArticle = determineContentExtractionSource(article);
1361
- if (!shouldExtractFromArticle)
1481
+ // Check content sufficiency (length-based quality gate)
1482
+ if (!isExtractionSufficient(article, html)) {
1483
+ logQualityGateFallback({
1484
+ url,
1485
+ articleLength: article.textContent.length,
1486
+ });
1362
1487
  return false;
1363
- if (isExtractionSufficient(article, html)) {
1364
- return true;
1365
1488
  }
1366
- logQualityGateFallback({
1367
- url,
1368
- articleLength: article.textContent.length,
1369
- });
1370
- return false;
1489
+ // Check heading structure preservation
1490
+ if (!isHeadingStructurePreserved(article, html)) {
1491
+ logDebug('Quality gate: Readability broke heading structure, using full HTML', {
1492
+ url: url.substring(0, 80),
1493
+ originalHeadings: countHeadings(html),
1494
+ articleHeadings: countHeadings(article.content),
1495
+ });
1496
+ return false;
1497
+ }
1498
+ return true;
1371
1499
  }
1372
1500
  function resolveContentSource({ html, url, includeMetadata, signal, }) {
1373
- const { article, metadata: extractedMeta } = extractContent(html, url, {
1501
+ const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
1374
1502
  extractArticle: true,
1375
1503
  ...(signal ? { signal } : {}),
1376
1504
  });
@@ -1384,6 +1512,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
1384
1512
  extractedMeta,
1385
1513
  includeMetadata,
1386
1514
  useArticleContent,
1515
+ ...(document ? { document } : {}),
1387
1516
  });
1388
1517
  }
1389
1518
  function tryTransformRawStage(html, url, includeMetadata) {
@@ -1405,6 +1534,7 @@ function buildMarkdownFromContext(context, url, signal) {
1405
1534
  const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
1406
1535
  url,
1407
1536
  ...(signal ? { signal } : {}),
1537
+ ...(context.document ? { document: context.document } : {}),
1408
1538
  }));
1409
1539
  return {
1410
1540
  markdown: content,