npm - @j0hanz/superfetch - Versions diffs - 2.1.8 → 2.2.1 - Mend

@j0hanz/superfetch 2.1.8 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/README.md +33 -34
package/dist/cache.d.ts.map +1 -1
package/dist/cache.js +57 -14
package/dist/cache.js.map +1 -1
package/dist/config.d.ts +3 -0
package/dist/config.d.ts.map +1 -1
package/dist/config.js +6 -0
package/dist/config.js.map +1 -1
package/dist/errors.d.ts.map +1 -1
package/dist/errors.js +14 -1
package/dist/errors.js.map +1 -1
package/dist/fetch.d.ts.map +1 -1
package/dist/fetch.js +6 -3
package/dist/fetch.js.map +1 -1
package/dist/http.d.ts +1 -1
package/dist/http.d.ts.map +1 -1
package/dist/http.js +50 -25
package/dist/http.js.map +1 -1
package/dist/index.js +8 -11
package/dist/index.js.map +1 -1
package/dist/mcp.d.ts.map +1 -1
package/dist/mcp.js +6 -5
package/dist/mcp.js.map +1 -1
package/dist/observability.d.ts.map +1 -1
package/dist/observability.js +9 -12
package/dist/observability.js.map +1 -1
package/dist/tools.d.ts.map +1 -1
package/dist/tools.js +45 -32
package/dist/tools.js.map +1 -1
package/dist/transform.d.ts +1 -0
package/dist/transform.d.ts.map +1 -1
package/dist/transform.js +498 -368
package/dist/transform.js.map +1 -1
package/dist/type-guards.js +1 -1
package/dist/type-guards.js.map +1 -1
package/package.json +1 -1

package/dist/transform.js CHANGED Viewed

@@ -129,83 +129,41 @@ function truncateHtml(html) {
     });
     return html.substring(0, maxSize);
 }
-function createMetaCollectorState() {
-    return {
-        title: {},
-        description: {},
-        author: {},
-    };
-}
-function resolveMetaField(state, field) {
-    const sources = state[field];
-    return sources.og ?? sources.twitter ?? sources.standard;
-}
-function parseOpenGraphKey(property) {
-    if (!property?.startsWith('og:'))
-        return null;
-    const key = property.replace('og:', '');
-    return key === 'title' || key === 'description' ? key : null;
-}
-function parseTwitterKey(name) {
-    if (!name?.startsWith('twitter:'))
-        return null;
-    const key = name.replace('twitter:', '');
-    return key === 'title' || key === 'description' ? key : null;
-}
-function parseStandardKey(name) {
-    if (name === 'description')
-        return 'description';
-    if (name === 'author')
-        return 'author';
-    return null;
-}
-function collectMetaTag(state, tag) {
-    const content = tag.getAttribute('content')?.trim();
-    if (!content)
-        return;
-    const ogKey = parseOpenGraphKey(tag.getAttribute('property'));
-    if (ogKey) {
-        state[ogKey].og = content;
-        return;
-    }
-    const name = tag.getAttribute('name');
-    const twitterKey = parseTwitterKey(name);
-    if (twitterKey) {
-        state[twitterKey].twitter = content;
-        return;
-    }
-    const standardKey = parseStandardKey(name);
-    if (standardKey) {
-        state[standardKey].standard = content;
-    }
-}
-function scanMetaTags(document, state) {
-    const metaTags = document.querySelectorAll('meta');
-    for (const tag of metaTags) {
-        collectMetaTag(state, tag);
+function extractMetadata(document) {
+    const title = {};
+    const description = {};
+    let author;
+    for (const tag of document.querySelectorAll('meta')) {
+        const content = tag.getAttribute('content')?.trim();
+        if (!content)
+            continue;
+        const property = tag.getAttribute('property');
+        const name = tag.getAttribute('name');
+        if (property === 'og:title')
+            title.og = content;
+        else if (property === 'og:description')
+            description.og = content;
+        else if (name === 'twitter:title')
+            title.twitter = content;
+        else if (name === 'twitter:description')
+            description.twitter = content;
+        else if (name === 'description')
+            description.standard = content;
+        else if (name === 'author')
+            author = content;
     }
-}
-function ensureTitleFallback(document, state) {
-    if (state.title.standard)
-        return;
     const titleEl = document.querySelector('title');
-    if (titleEl?.textContent) {
-        state.title.standard = titleEl.textContent.trim();
+    if (!title.standard && titleEl?.textContent) {
+        title.standard = titleEl.textContent.trim();
     }
-}
-function extractMetadata(document) {
-    const state = createMetaCollectorState();
-    scanMetaTags(document, state);
-    ensureTitleFallback(document, state);
+    const resolvedTitle = title.og ?? title.twitter ?? title.standard;
+    const resolvedDesc = description.og ?? description.twitter ?? description.standard;
     const metadata = {};
-    const title = resolveMetaField(state, 'title');
-    const description = resolveMetaField(state, 'description');
-    const author = resolveMetaField(state, 'author');
-    if (title !== undefined)
-        metadata.title = title;
-    if (description !== undefined)
-        metadata.description = description;
-    if (author !== undefined)
+    if (resolvedTitle)
+        metadata.title = resolvedTitle;
+    if (resolvedDesc)
+        metadata.description = resolvedDesc;
+    if (author)
         metadata.author = author;
     return metadata;
 }
@@ -226,66 +184,44 @@ function extractArticle(document) {
         logWarn('Document not compatible with Readability');
         return null;
     }
-    return mapParsedArticle(parseReadabilityArticle(document));
-}
-function parseReadabilityArticle(document) {
     try {
         const documentClone = document.cloneNode(true);
-        const rawText = documentClone.body.textContent ||
+        const rawText = documentClone.querySelector('body')?.textContent ??
             documentClone.documentElement.textContent;
         const textLength = rawText.replace(/\s+/g, ' ').trim().length;
         if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
             return null;
         }
         const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
-        return reader.parse();
+        const parsed = reader.parse();
+        if (!parsed)
+            return null;
+        return {
+            content: parsed.content ?? '',
+            textContent: parsed.textContent ?? '',
+            ...(parsed.title != null && { title: parsed.title }),
+            ...(parsed.byline != null && { byline: parsed.byline }),
+            ...(parsed.excerpt != null && { excerpt: parsed.excerpt }),
+            ...(parsed.siteName != null && { siteName: parsed.siteName }),
+        };
     }
     catch (error) {
-        logError('Failed to extract article with Readability', asError(error));
+        logError('Failed to extract article with Readability', error instanceof Error ? error : undefined);
         return null;
     }
 }
-function asError(error) {
-    if (error instanceof Error) {
-        return error;
-    }
-    return undefined;
-}
-function mapParsedArticle(parsed) {
-    return parsed ? mapReadabilityResult(parsed) : null;
-}
-function mapReadabilityResult(parsed) {
-    return {
-        content: parsed.content ?? '',
-        textContent: parsed.textContent ?? '',
-        ...buildOptionalArticleFields(parsed),
-    };
-}
-function buildOptionalArticleFields(parsed) {
-    const optional = {};
-    addOptionalField(optional, 'title', parsed.title);
-    addOptionalField(optional, 'byline', parsed.byline);
-    addOptionalField(optional, 'excerpt', parsed.excerpt);
-    addOptionalField(optional, 'siteName', parsed.siteName);
-    return optional;
-}
-function addOptionalField(target, key, value) {
-    if (value == null)
-        return;
-    target[key] = value;
-}
 export function extractContent(html, url, options = {
     extractArticle: true,
 }) {
-    const emptyResult = createEmptyExtractionResult();
+    const result = extractContentWithDocument(html, url, options);
+    return { article: result.article, metadata: result.metadata };
+}
+function extractContentWithDocument(html, url, options) {
     if (!isValidInput(html, url)) {
-        return emptyResult;
+        return { article: null, metadata: {} };
     }
     return tryExtractContent(html, url, options);
 }
-function createEmptyExtractionResult() {
-    return { article: null, metadata: {} };
-}
 function extractArticleWithStage(document, url, shouldExtract) {
     if (!shouldExtract)
         return null;
@@ -297,11 +233,12 @@ function handleExtractionFailure(error, url, signal) {
     }
     throwIfAborted(signal, url, 'extract:error');
     logError('Failed to extract content', error instanceof Error ? error : undefined);
-    return createEmptyExtractionResult();
+    return { article: null, metadata: {} };
 }
 function extractContentStages(html, url, options) {
     throwIfAborted(options.signal, url, 'extract:begin');
-    const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
+    const truncatedHtml = truncateHtml(html);
+    const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
     throwIfAborted(options.signal, url, 'extract:parsed');
     applyBaseUri(document, url);
     const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
@@ -311,6 +248,7 @@ function extractContentStages(html, url, options) {
     return {
         article,
         metadata,
+        ...(truncatedHtml.length === html.length ? { document } : {}),
     };
 }
 function tryExtractContent(html, url, options) {
@@ -325,14 +263,11 @@ function isValidInput(html, url) {
     return (validateRequiredString(html, 'extractContent called with invalid HTML input') && validateRequiredString(url, 'extractContent called with invalid URL'));
 }
 function validateRequiredString(value, message) {
-    if (isNonEmptyString(value))
+    if (typeof value === 'string' && value.length > 0)
         return true;
     logWarn(message);
     return false;
 }
-function isNonEmptyString(value) {
-    return typeof value === 'string' && value.length > 0;
-}
 function resolveArticleExtraction(document, shouldExtract) {
     return shouldExtract ? extractArticle(document) : null;
 }
@@ -417,7 +352,124 @@ function isWordChar(char) {
         (code >= 97 && code <= 122) ||
         char === '_');
 }
-const BASH_PACKAGE_MANAGERS = [
+const LANGUAGE_PATTERNS = [
+    {
+        language: 'jsx',
+        pattern: {
+            keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
+            custom: (code) => containsJsxTag(code),
+        },
+    },
+    {
+        language: 'typescript',
+        pattern: {
+            wordBoundary: ['interface', 'type'],
+            custom: (_, lower) => [
+                ': string',
+                ':string',
+                ': number',
+                ':number',
+                ': boolean',
+                ':boolean',
+                ': void',
+                ':void',
+                ': any',
+                ':any',
+                ': unknown',
+                ':unknown',
+                ': never',
+                ':never',
+            ].some((hint) => lower.includes(hint)),
+        },
+    },
+    {
+        language: 'rust',
+        pattern: {
+            regex: /\b(?:fn|impl|struct|enum)\b/,
+            keywords: ['let mut'],
+            custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
+        },
+    },
+    {
+        language: 'javascript',
+        pattern: {
+            regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
+        },
+    },
+    {
+        language: 'python',
+        pattern: {
+            regex: /\b(?:def|class|import|from)\b/,
+            keywords: ['print(', '__name__'],
+        },
+    },
+    {
+        language: 'bash',
+        pattern: {
+            custom: (code) => detectBashIndicators(code),
+        },
+    },
+    {
+        language: 'css',
+        pattern: {
+            regex: /@media|@import|@keyframes/,
+            custom: (code) => detectCssStructure(code),
+        },
+    },
+    {
+        language: 'html',
+        pattern: {
+            keywords: [
+                '<!doctype',
+                '<html',
+                '<head',
+                '<body',
+                '<div',
+                '<span',
+                '<p',
+                '<a',
+                '<script',
+                '<style',
+            ],
+        },
+    },
+    {
+        language: 'json',
+        pattern: {
+            startsWith: ['{', '['],
+        },
+    },
+    {
+        language: 'yaml',
+        pattern: {
+            custom: (code) => detectYamlStructure(code),
+        },
+    },
+    {
+        language: 'sql',
+        pattern: {
+            wordBoundary: [
+                'select',
+                'insert',
+                'update',
+                'delete',
+                'create',
+                'alter',
+                'drop',
+            ],
+        },
+    },
+    {
+        language: 'go',
+        pattern: {
+            wordBoundary: ['package', 'func'],
+            keywords: ['import "'],
+        },
+    },
+];
+// Bash detection constants
+const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
+const BASH_PKG_MANAGERS = [
     'npm',
     'yarn',
     'pnpm',
@@ -429,184 +481,83 @@ const BASH_PACKAGE_MANAGERS = [
     'go',
 ];
 const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
-const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
-function detectBash(code) {
-    const lines = splitLines(code);
-    for (const line of lines) {
-        const trimmed = line.trimStart();
-        if (!trimmed)
+function isShellPrefix(line) {
+    return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
+}
+function matchesBashCommand(line) {
+    return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
+}
+function matchesPackageManagerVerb(line) {
+    for (const mgr of BASH_PKG_MANAGERS) {
+        if (!line.startsWith(`${mgr} `))
             continue;
-        if (isBashIndicator(trimmed))
+        const rest = line.slice(mgr.length + 1);
+        if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
             return true;
+        }
     }
     return false;
 }
-function startsWithCommand(line, commands) {
-    return commands.some((command) => line === command || line.startsWith(`${command} `));
-}
-function isBashIndicator(line) {
-    return (isShebang(line) ||
-        isPromptLine(line) ||
-        startsWithCommand(line, BASH_COMMANDS) ||
-        startsWithPackageManagerCommand(line));
-}
-function isShebang(line) {
-    return line.startsWith('#!');
-}
-function isPromptLine(line) {
-    return line.startsWith('$ ') || line.startsWith('# ');
-}
-function startsWithPackageManagerCommand(line) {
-    return BASH_PACKAGE_MANAGERS.some((manager) => {
-        if (!line.startsWith(`${manager} `))
-            return false;
-        const rest = line.slice(manager.length + 1);
-        return BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `));
-    });
-}
-const TYPE_HINTS = [
-    'string',
-    'number',
-    'boolean',
-    'void',
-    'any',
-    'unknown',
-    'never',
-];
-const HTML_TAGS = [
-    '<!doctype',
-    '<html',
-    '<head',
-    '<body',
-    '<div',
-    '<span',
-    '<p',
-    '<a',
-    '<script',
-    '<style',
-];
-const SQL_KEYWORDS = [
-    'select',
-    'insert',
-    'update',
-    'delete',
-    'create',
-    'alter',
-    'drop',
-];
-const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
-const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
-const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
-const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
-const CODE_DETECTORS = [
-    { language: 'jsx', detect: detectJsx },
-    { language: 'typescript', detect: detectTypescript },
-    { language: 'rust', detect: detectRust },
-    { language: 'javascript', detect: detectJavascript },
-    { language: 'python', detect: detectPython },
-    { language: 'bash', detect: detectBash },
-    { language: 'css', detect: detectCss },
-    { language: 'html', detect: detectHtml },
-    { language: 'json', detect: detectJson },
-    { language: 'yaml', detect: detectYaml },
-    { language: 'sql', detect: detectSql },
-    { language: 'go', detect: detectGo },
-];
-function detectJsx(code) {
-    const lower = code.toLowerCase();
-    if (lower.includes('classname='))
-        return true;
-    if (lower.includes('jsx:'))
-        return true;
-    if (lower.includes("from 'react'") || lower.includes('from "react"')) {
-        return true;
+function detectBashIndicators(code) {
+    for (const line of splitLines(code)) {
+        const trimmed = line.trimStart();
+        if (!trimmed)
+            continue;
+        if (isShellPrefix(trimmed) ||
+            matchesBashCommand(trimmed) ||
+            matchesPackageManagerVerb(trimmed)) {
+            return true;
+        }
     }
-    return containsJsxTag(code);
-}
-function detectTypescript(code) {
-    const lower = code.toLowerCase();
-    if (containsWord(lower, 'interface'))
-        return true;
-    if (containsWord(lower, 'type'))
-        return true;
-    return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
-}
-function detectRust(code) {
-    const lower = code.toLowerCase();
-    return (RUST_WORD_REGEX.test(lower) ||
-        lower.includes('let mut') ||
-        (lower.includes('use ') && lower.includes('::')));
-}
-function detectJavascript(code) {
-    const lower = code.toLowerCase();
-    return JS_WORD_REGEX.test(lower);
-}
-function detectPython(code) {
-    const lower = code.toLowerCase();
-    return (PYTHON_WORD_REGEX.test(lower) ||
-        lower.includes('print(') ||
-        lower.includes('__name__'));
+    return false;
 }
-function detectCss(code) {
-    const lower = code.toLowerCase();
-    if (CSS_DIRECTIVE_REGEX.test(lower))
-        return true;
-    const lines = splitLines(code);
-    for (const line of lines) {
+function detectCssStructure(code) {
+    for (const line of splitLines(code)) {
         const trimmed = line.trimStart();
         if (!trimmed)
             continue;
-        if (isCssSelectorLine(trimmed) || isCssPropertyLine(trimmed))
+        const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
+            trimmed.includes('{');
+        const isProperty = trimmed.includes(':') && trimmed.includes(';');
+        if (isSelector || isProperty)
             return true;
     }
     return false;
 }
-function detectHtml(code) {
-    const lower = code.toLowerCase();
-    return HTML_TAGS.some((tag) => lower.includes(tag));
-}
-function detectJson(code) {
-    const trimmed = code.trimStart();
-    if (!trimmed)
-        return false;
-    return trimmed.startsWith('{') || trimmed.startsWith('[');
-}
-function detectYaml(code) {
-    const lines = splitLines(code);
-    for (const line of lines) {
+function detectYamlStructure(code) {
+    for (const line of splitLines(code)) {
         const trimmed = line.trim();
         if (!trimmed)
             continue;
-        const colonIndex = trimmed.indexOf(':');
-        if (colonIndex <= 0)
+        const colonIdx = trimmed.indexOf(':');
+        if (colonIdx <= 0)
             continue;
-        const after = trimmed[colonIndex + 1];
+        const after = trimmed[colonIdx + 1];
         if (after === ' ' || after === '\t')
             return true;
     }
     return false;
 }
-function detectSql(code) {
-    const lower = code.toLowerCase();
-    return SQL_KEYWORDS.some((keyword) => containsWord(lower, keyword));
-}
-function detectGo(code) {
-    const lower = code.toLowerCase();
-    return (containsWord(lower, 'package') ||
-        containsWord(lower, 'func') ||
-        lower.includes('import "'));
-}
-function isCssSelectorLine(line) {
-    if (!line.startsWith('.') && !line.startsWith('#'))
-        return false;
-    return line.includes('{');
-}
-function isCssPropertyLine(line) {
-    return line.includes(':') && line.includes(';');
+function matchesLanguagePattern(code, lower, pattern) {
+    if (pattern.keywords?.some((kw) => lower.includes(kw)))
+        return true;
+    if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
+        return true;
+    if (pattern.regex?.test(lower))
+        return true;
+    if (pattern.startsWith) {
+        const trimmed = code.trimStart();
+        if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
+            return true;
+    }
+    if (pattern.custom?.(code, lower))
+        return true;
+    return false;
 }
 export function detectLanguageFromCode(code) {
-    for (const { language, detect } of CODE_DETECTORS) {
-        if (detect(code))
+    const lower = code.toLowerCase();
+    for (const { language, pattern } of LANGUAGE_PATTERNS) {
+        if (matchesLanguagePattern(code, lower, pattern))
             return language;
     }
     return undefined;
@@ -630,6 +581,7 @@ const STRUCTURAL_TAGS = new Set([
     'input',
     'select',
     'textarea',
+    'svg',
 ]);
 const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
 const NAVIGATION_ROLES = new Set([
@@ -642,6 +594,7 @@ const NAVIGATION_ROLES = new Set([
     'menu',
     'dialog',
     'alertdialog',
+    'search',
 ]);
 const PROMO_TOKENS = new Set([
     'banner',
@@ -669,6 +622,7 @@ const PROMO_TOKENS = new Set([
     'breadcrumb',
     'pagination',
     'pager',
+    'taglist',
 ]);
 const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
 const FIXED_PATTERN = /\b(fixed|sticky)\b/;
@@ -727,6 +681,8 @@ const NOISE_MARKERS = [
     ' z-50',
     ' z-4',
     ' isolate',
+    ' breadcrumb',
+    ' pagination',
 ];
 function mayContainNoise(html) {
     const haystack = html.toLowerCase();
@@ -760,11 +716,9 @@ function matchesPromoIdOrClass(className, id) {
     const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
     return tokens.some((token) => PROMO_TOKENS.has(token));
 }
-function matchesHighZIsolate(className) {
-    return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
-}
 function matchesFixedOrHighZIsolate(className) {
-    return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
+    return (FIXED_PATTERN.test(className) ||
+        (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
 }
 function readElementMetadata(element) {
     return {
@@ -791,8 +745,7 @@ function isNoiseElement(node) {
         matchesFixedOrHighZIsolate(metadata.className) ||
         matchesPromoIdOrClass(metadata.className, metadata.id));
 }
-function stripNoiseNodes(document) {
-    const nodes = document.querySelectorAll('*');
+function removeNoiseNodes(nodes) {
     for (let index = nodes.length - 1; index >= 0; index -= 1) {
         const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
         if (!node)
@@ -802,20 +755,54 @@ function stripNoiseNodes(document) {
         }
     }
 }
-function removeNoiseFromHtml(html) {
+function stripNoiseNodes(document) {
+    // Use targeted selectors for common noise elements instead of querySelectorAll('*')
+    const targetSelectors = [
+        'nav',
+        'footer',
+        'aside',
+        'header[class*="site"]',
+        'header[class*="nav"]',
+        'header[class*="menu"]',
+        '[role="banner"]',
+        '[role="navigation"]',
+        '[role="dialog"]',
+        '[style*="display: none"]',
+        '[style*="display:none"]',
+        '[hidden]',
+        '[aria-hidden="true"]',
+    ].join(',');
+    const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
+    // Remove in reverse order to handle nested elements correctly
+    removeNoiseNodes(potentialNoiseNodes);
+    // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
+    const candidateSelectors = [
+        ...STRUCTURAL_TAGS,
+        ...ALWAYS_NOISE_TAGS,
+        'header',
+        'canvas',
+        '[class]',
+        '[id]',
+        '[role]',
+        '[style]',
+    ].join(',');
+    const allElements = document.querySelectorAll(candidateSelectors);
+    removeNoiseNodes(allElements);
+}
+function removeNoiseFromHtml(html, document) {
     const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
     if (!shouldParse)
         return html;
     try {
-        const { document } = parseHTML(html);
-        stripNoiseNodes(document);
-        const bodyInnerHtml = getBodyInnerHtml(document);
+        const resolvedDocument = document ?? parseHTML(html).document;
+        stripNoiseNodes(resolvedDocument);
+        const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
         if (bodyInnerHtml)
             return bodyInnerHtml;
-        const docToString = getDocumentToString(document);
+        const docToString = getDocumentToString(resolvedDocument);
         if (docToString)
             return docToString();
-        const documentElementOuterHtml = getDocumentElementOuterHtml(document);
+        const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
         if (documentElementOuterHtml)
             return documentElementOuterHtml;
         return html;
@@ -826,7 +813,14 @@ function removeNoiseFromHtml(html) {
 }
 function buildInlineCode(content) {
     const runs = content.match(/`+/g);
-    const longest = runs?.sort((a, b) => b.length - a.length)[0] ?? '';
+    let longest = '';
+    if (runs) {
+        for (const run of runs) {
+            if (run.length > longest.length) {
+                longest = run;
+            }
+        }
+    }
     const delimiter = `\`${longest}`;
     const padding = delimiter.length > 1 ? ' ' : '';
     return `${delimiter}${padding}${content}${padding}${delimiter}`;
@@ -977,17 +971,14 @@ function getMarkdownConverter() {
     markdownInstance ??= createMarkdownInstance();
     return markdownInstance;
 }
-function translateHtmlToMarkdown(html, url, signal) {
+function translateHtmlToMarkdown(html, url, signal, document) {
     throwIfAborted(signal, url, 'markdown:begin');
-    const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
+    const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document));
     throwIfAborted(signal, url, 'markdown:cleaned');
     const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
     throwIfAborted(signal, url, 'markdown:translated');
-    let finalMarkdown = cleanupMarkdownArtifacts(content);
-    finalMarkdown = normalizeBlockSpacing(finalMarkdown);
-    finalMarkdown = normalizeTableWhitespace(finalMarkdown);
-    finalMarkdown = normalizeLineEndings(finalMarkdown);
-    return finalMarkdown;
+    const cleaned = cleanupMarkdownArtifacts(content);
+    return promoteOrphanHeadings(cleaned);
 }
 function appendMetadataFooter(content, metadata, url) {
     const footer = buildMetadataFooter(metadata, url);
@@ -998,7 +989,7 @@ export function htmlToMarkdown(html, metadata, options) {
     if (!html)
         return buildMetadataFooter(metadata, url);
     try {
-        const content = translateHtmlToMarkdown(html, url, options?.signal);
+        const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document);
         return appendMetadataFooter(content, metadata, url);
     }
     catch (error) {
@@ -1010,37 +1001,146 @@ export function htmlToMarkdown(html, metadata, options) {
 }
 function cleanupMarkdownArtifacts(content) {
     let result = content;
+    const fixOrphanHeadings = (text) => {
+        return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
+            if (typeof prefix !== 'string' ||
+                typeof hashes !== 'string' ||
+                typeof heading !== 'string') {
+                return match;
+            }
+            if (heading.length > 150) {
+                return match;
+            }
+            const trimmedPrefix = prefix.trim();
+            if (trimmedPrefix === '') {
+                return `${hashes} ${heading}\n\n`;
+            }
+            return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
+        });
+    };
+    result = fixOrphanHeadings(result);
     result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
     const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
     result = result.replace(zeroWidthAnchorLink, '');
+    result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
+    result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
+    result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
+    result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
+    result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
+    result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
+    const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
+    const lines = result.split('\n');
+    const filtered = [];
+    let skipTocBlock = false;
+    for (let i = 0; i < lines.length; i += 1) {
+        const line = lines[i] ?? '';
+        const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
+        const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
+        if (tocLinkLine.test(line)) {
+            const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
+            const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
+            if (prevIsToc || nextIsToc) {
+                skipTocBlock = true;
+                continue;
+            }
+        }
+        else if (line.trim() === '' && skipTocBlock) {
+            skipTocBlock = false;
+            continue;
+        }
+        else {
+            skipTocBlock = false;
+        }
+        filtered.push(line);
+    }
+    result = filtered.join('\n');
     result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
     result = result.replace(/^Was this page helpful\??\s*$/gim, '');
+    result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
+    result = result.replace(/\\([[]])/g, '$1');
+    result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
+    result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
     result = result.replace(/\n{3,}/g, '\n\n');
     return result.trim();
 }
-function normalizeBlockSpacing(markdown) {
-    return markdown
-        .replace(/(\n#{1,6} .+)\n(?!\n)/g, '$1\n\n')
-        .replace(/\n{3,}/g, '\n\n');
-}
-function normalizeTableWhitespace(markdown) {
-    return markdown.replace(/\|([^|\n]+)\|/g, (_match, content) => {
-        const trimmed = typeof content === 'string' ? content.trim() : '';
-        return `| ${trimmed} |`;
-    });
+const HEADING_KEYWORDS = new Set([
+    'overview',
+    'introduction',
+    'summary',
+    'conclusion',
+    'prerequisites',
+    'requirements',
+    'installation',
+    'configuration',
+    'usage',
+    'features',
+    'limitations',
+    'troubleshooting',
+    'faq',
+    'resources',
+    'references',
+    'changelog',
+    'license',
+    'acknowledgments',
+    'appendix',
+]);
+function isLikelyHeadingLine(line) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.length > 80)
+        return false;
+    if (/^#{1,6}\s/.test(trimmed))
+        return false;
+    if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
+        return false;
+    if (/[.!?]$/.test(trimmed))
+        return false;
+    if (/^\[.*\]\(.*\)$/.test(trimmed))
+        return false;
+    if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
+        return true;
+    }
+    const words = trimmed.split(/\s+/);
+    if (words.length >= 2 && words.length <= 6) {
+        const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
+        if (isTitleCase)
+            return true;
+    }
+    if (words.length === 1) {
+        const lower = trimmed.toLowerCase();
+        if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
+            return true;
+        }
+    }
+    return false;
 }
-function normalizeLineEndings(markdown) {
-    return markdown.replace(/\r\n/g, '\n');
+function promoteOrphanHeadings(markdown) {
+    const lines = markdown.split('\n');
+    const result = [];
+    for (let i = 0; i < lines.length; i += 1) {
+        const line = lines[i] ?? '';
+        const prevLine = i > 0 ? lines[i - 1] : '';
+        const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
+        const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
+        const isPrecededByBlank = prevLine?.trim() === '';
+        if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
+            const trimmed = line.trim();
+            const isExample = /^example:\s/i.test(trimmed);
+            const prefix = isExample ? '### ' : '## ';
+            result.push(prefix + trimmed);
+        }
+        else {
+            result.push(line);
+        }
+    }
+    return result.join('\n');
 }
 function formatFetchedDate(isoString) {
     try {
         const date = new Date(isoString);
-        const options = {
-            year: 'numeric',
-            month: 'short',
-            day: 'numeric',
-        };
-        return date.toLocaleDateString('en-US', options);
+        const day = String(date.getDate()).padStart(2, '0');
+        const month = String(date.getMonth() + 1).padStart(2, '0');
+        const year = date.getFullYear();
+        return `${day}-${month}-${year}`;
     }
     catch {
         return isoString;
@@ -1049,20 +1149,24 @@ function formatFetchedDate(isoString) {
 function buildMetadataFooter(metadata, fallbackUrl) {
     if (!metadata)
         return '';
-    const lines = [];
+    const lines = ['---', ''];
+    const url = metadata.url || fallbackUrl;
+    const parts = [];
     if (metadata.title)
-        lines.push(`> *${metadata.title}*`);
-    if (metadata.description)
-        lines.push(`> *${metadata.description}*`);
+        parts.push(`_${metadata.title}_`);
     if (metadata.author)
-        lines.push(`> *${metadata.author}*`);
-    if (metadata.url)
-        lines.push(`> *<${metadata.url}>*`);
-    else if (fallbackUrl)
-        lines.push(`> *<${fallbackUrl}>*`);
+        parts.push(`_${metadata.author}_`);
+    if (url)
+        parts.push(`[_Original Source_](${url})`);
     if (metadata.fetchedAt) {
         const formattedDate = formatFetchedDate(metadata.fetchedAt);
-        lines.push(`> *${formattedDate}*`);
+        parts.push(`_${formattedDate}_`);
+    }
+    if (parts.length > 0) {
+        lines.push(` ${parts.join(' | ')}`);
+    }
+    if (metadata.description) {
+        lines.push(` <sub>${metadata.description}</sub>`);
     }
     return lines.join('\n');
 }
@@ -1277,78 +1381,95 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
 }
 const MIN_CONTENT_RATIO = 0.3;
 const MIN_HTML_LENGTH_FOR_GATE = 100;
-function stripHtmlTags(html) {
-    const parts = [];
+const MIN_HEADING_RETENTION_RATIO = 0.7;
+function countHeadings(html) {
+    if (!html)
+        return 0;
+    // Match opening heading tags <h1> through <h6>
+    const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
+    const matches = html.match(headingPattern);
+    return matches ? matches.length : 0;
+}
+function isHeadingStructurePreserved(article, originalHtml) {
+    if (!article)
+        return false;
+    // Cache heading counts to avoid duplicate regex matching
+    const originalHeadingCount = countHeadings(originalHtml);
+    const articleHeadingCount = countHeadings(article.content);
+    // If original has no headings, structure is trivially preserved
+    if (originalHeadingCount === 0)
+        return true;
+    // If article lost >50% of headings, structure is broken
+    const retentionRatio = articleHeadingCount / originalHeadingCount;
+    return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
+}
+function stripHtmlTagsForLength(html) {
+    let result = '';
     let inTag = false;
     for (const char of html) {
         if (char === '<') {
             inTag = true;
-            continue;
         }
-        if (char === '>') {
+        else if (char === '>') {
             inTag = false;
-            continue;
         }
-        if (!inTag) {
-            parts.push(char);
+        else if (!inTag) {
+            result += char;
         }
     }
-    return parts.join('');
-}
-function estimateTextLength(html) {
-    return stripHtmlTags(html).replace(/\s+/g, ' ').trim().length;
+    return result;
 }
 export function isExtractionSufficient(article, originalHtml) {
     if (!article)
         return false;
     const articleLength = article.textContent.length;
-    const originalLength = estimateTextLength(originalHtml);
+    const originalLength = stripHtmlTagsForLength(originalHtml)
+        .replace(/\s+/g, ' ')
+        .trim().length;
     if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
         return true;
     return articleLength / originalLength >= MIN_CONTENT_RATIO;
 }
 export function determineContentExtractionSource(article) {
-    return !!article;
-}
-function applyArticleMetadata(metadata, article) {
-    if (article.title !== undefined)
-        metadata.title = article.title;
-    if (article.byline !== undefined)
-        metadata.author = article.byline;
-}
-function applyExtractedMetadata(metadata, extractedMeta) {
-    if (extractedMeta.title !== undefined)
-        metadata.title = extractedMeta.title;
-    if (extractedMeta.description !== undefined) {
-        metadata.description = extractedMeta.description;
-    }
-    if (extractedMeta.author !== undefined) {
-        metadata.author = extractedMeta.author;
-    }
+    return article !== null;
 }
 export function createContentMetadataBlock(url, article, extractedMeta, shouldExtractFromArticle, includeMetadata) {
     if (!includeMetadata)
         return undefined;
-    const now = new Date().toISOString();
     const metadata = {
         type: 'metadata',
         url,
-        fetchedAt: now,
+        fetchedAt: new Date().toISOString(),
     };
     if (shouldExtractFromArticle && article) {
-        applyArticleMetadata(metadata, article);
-        return metadata;
+        if (article.title !== undefined)
+            metadata.title = article.title;
+        if (article.byline !== undefined)
+            metadata.author = article.byline;
+    }
+    else {
+        if (extractedMeta.title !== undefined)
+            metadata.title = extractedMeta.title;
+        if (extractedMeta.description !== undefined) {
+            metadata.description = extractedMeta.description;
+        }
+        if (extractedMeta.author !== undefined) {
+            metadata.author = extractedMeta.author;
+        }
     }
-    applyExtractedMetadata(metadata, extractedMeta);
     return metadata;
 }
-function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
+function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
     const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
-    return {
+    const source = {
         sourceHtml: useArticleContent && article ? article.content : html,
         title: useArticleContent && article ? article.title : extractedMeta.title,
         metadata,
     };
+    if (!useArticleContent && document) {
+        return { ...source, document };
+    }
+    return source;
 }
 function logQualityGateFallback({ url, articleLength, }) {
     logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
@@ -1357,20 +1478,27 @@ function logQualityGateFallback({ url, articleLength, }) {
     });
 }
 function shouldUseArticleContent(article, html, url) {
-    const shouldExtractFromArticle = determineContentExtractionSource(article);
-    if (!shouldExtractFromArticle)
+    // Check content sufficiency (length-based quality gate)
+    if (!isExtractionSufficient(article, html)) {
+        logQualityGateFallback({
+            url,
+            articleLength: article.textContent.length,
+        });
         return false;
-    if (isExtractionSufficient(article, html)) {
-        return true;
     }
-    logQualityGateFallback({
-        url,
-        articleLength: article.textContent.length,
-    });
-    return false;
+    // Check heading structure preservation
+    if (!isHeadingStructurePreserved(article, html)) {
+        logDebug('Quality gate: Readability broke heading structure, using full HTML', {
+            url: url.substring(0, 80),
+            originalHeadings: countHeadings(html),
+            articleHeadings: countHeadings(article.content),
+        });
+        return false;
+    }
+    return true;
 }
 function resolveContentSource({ html, url, includeMetadata, signal, }) {
-    const { article, metadata: extractedMeta } = extractContent(html, url, {
+    const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
         extractArticle: true,
         ...(signal ? { signal } : {}),
     });
@@ -1384,6 +1512,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
         extractedMeta,
         includeMetadata,
         useArticleContent,
+        ...(document ? { document } : {}),
     });
 }
 function tryTransformRawStage(html, url, includeMetadata) {
@@ -1405,6 +1534,7 @@ function buildMarkdownFromContext(context, url, signal) {
     const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
         url,
         ...(signal ? { signal } : {}),
+        ...(context.document ? { document: context.document } : {}),
     }));
     return {
         markdown: content,