npm - @j0hanz/superfetch - Versions diffs - 2.2.0 → 2.2.2 - Mend

@j0hanz/superfetch 2.2.0 → 2.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (68)

package/README.md +363 -614
package/dist/cache.d.ts +2 -2
package/dist/cache.d.ts.map +1 -1
package/dist/cache.js +49 -227
package/dist/cache.js.map +1 -1
package/dist/config.d.ts +6 -0
package/dist/config.d.ts.map +1 -1
package/dist/config.js +20 -27
package/dist/config.js.map +1 -1
package/dist/dom-noise-removal.d.ts +6 -0
package/dist/dom-noise-removal.d.ts.map +1 -0
package/dist/dom-noise-removal.js +482 -0
package/dist/dom-noise-removal.js.map +1 -0
package/dist/errors.d.ts.map +1 -1
package/dist/errors.js +8 -5
package/dist/errors.js.map +1 -1
package/dist/fetch.d.ts.map +1 -1
package/dist/fetch.js +26 -32
package/dist/fetch.js.map +1 -1
package/dist/http-native.d.ts +6 -0
package/dist/http-native.d.ts.map +1 -0
package/dist/http-native.js +645 -0
package/dist/http-native.js.map +1 -0
package/dist/http-utils.d.ts +61 -0
package/dist/http-utils.d.ts.map +1 -0
package/dist/http-utils.js +252 -0
package/dist/http-utils.js.map +1 -0
package/dist/index.js +1 -1
package/dist/index.js.map +1 -1
package/dist/instructions.md +41 -39
package/dist/json.d.ts +2 -0
package/dist/json.d.ts.map +1 -0
package/dist/json.js +30 -0
package/dist/json.js.map +1 -0
package/dist/language-detection.d.ts +13 -0
package/dist/language-detection.d.ts.map +1 -0
package/dist/language-detection.js +283 -0
package/dist/language-detection.js.map +1 -0
package/dist/markdown-cleanup.d.ts +19 -0
package/dist/markdown-cleanup.d.ts.map +1 -0
package/dist/markdown-cleanup.js +283 -0
package/dist/markdown-cleanup.js.map +1 -0
package/dist/observability.d.ts +1 -0
package/dist/observability.d.ts.map +1 -1
package/dist/observability.js +10 -0
package/dist/observability.js.map +1 -1
package/dist/tools.d.ts.map +1 -1
package/dist/tools.js +23 -8
package/dist/tools.js.map +1 -1
package/dist/transform-types.d.ts +81 -0
package/dist/transform-types.d.ts.map +1 -0
package/dist/transform-types.js +6 -0
package/dist/transform-types.js.map +1 -0
package/dist/transform.d.ts +8 -52
package/dist/transform.d.ts.map +1 -1
package/dist/transform.js +419 -825
package/dist/transform.js.map +1 -1
package/dist/type-guards.d.ts +1 -1
package/dist/type-guards.d.ts.map +1 -1
package/dist/type-guards.js +1 -1
package/dist/type-guards.js.map +1 -1
package/dist/workers/transform-worker.js +23 -24
package/dist/workers/transform-worker.js.map +1 -1
package/package.json +85 -86
package/dist/http.d.ts +0 -90
package/dist/http.d.ts.map +0 -1
package/dist/http.js +0 -1576
package/dist/http.js.map +0 -1

package/dist/transform.js CHANGED Viewed

@@ -8,44 +8,25 @@ import { NodeHtmlMarkdown, } from 'node-html-markdown';
 import { z } from 'zod';
 import { isProbablyReaderable, Readability } from '@mozilla/readability';
 import { config } from './config.js';
+import { removeNoiseFromHtml } from './dom-noise-removal.js';
 import { FetchError, getErrorMessage } from './errors.js';
 import { isRawTextContentUrl } from './fetch.js';
+import { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
+import { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
 import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from './observability.js';
-import { isRecord } from './type-guards.js';
+import { isObject } from './type-guards.js';
+// Re-export language detection for backward compatibility
+export { detectLanguageFromCode, resolveLanguageFromAttributes, } from './language-detection.js';
+// Re-export markdown cleanup for backward compatibility
+export { cleanupMarkdownArtifacts, promoteOrphanHeadings, } from './markdown-cleanup.js';
+// Re-export DOM noise removal for backward compatibility
+export { removeNoiseFromHtml } from './dom-noise-removal.js';
 function getAbortReason(signal) {
-    if (!isRecord(signal))
+    if (!isObject(signal))
         return undefined;
     return 'reason' in signal ? signal.reason : undefined;
 }
-function getBodyInnerHtml(document) {
-    if (!isRecord(document))
-        return undefined;
-    const { body } = document;
-    if (!isRecord(body))
-        return undefined;
-    const { innerHTML } = body;
-    return typeof innerHTML === 'string' && innerHTML.length > 0
-        ? innerHTML
-        : undefined;
-}
-function getDocumentToString(document) {
-    if (!isRecord(document))
-        return undefined;
-    if (typeof document.toString !== 'function')
-        return undefined;
-    return document.toString.bind(document);
-}
-function getDocumentElementOuterHtml(document) {
-    if (!isRecord(document))
-        return undefined;
-    const { documentElement } = document;
-    if (!isRecord(documentElement))
-        return undefined;
-    const { outerHTML } = documentElement;
-    return typeof outerHTML === 'string' && outerHTML.length > 0
-        ? outerHTML
-        : undefined;
-}
+// DOM accessor helpers moved to ./dom-noise-removal.ts
 const CODE_BLOCK = {
     fence: '```',
     format: (code, language = '') => {
@@ -93,9 +74,13 @@ export function endTransformStage(context, options) {
 }
 function runTransformStage(url, stage, fn) {
     const context = startTransformStage(url, stage);
-    const result = fn();
-    endTransformStage(context);
-    return result;
+    try {
+        return fn();
+    }
+    finally {
+        // Emit duration even if the stage throws; callers decide how to handle the error.
+        endTransformStage(context);
+    }
 }
 function isTimeoutReason(reason) {
     return reason instanceof Error && reason.name === 'TimeoutError';
@@ -129,46 +114,105 @@ function truncateHtml(html) {
     });
     return html.substring(0, maxSize);
 }
+const META_PROPERTY_HANDLERS = new Map([
+    [
+        'og:title',
+        (ctx, c) => {
+            ctx.title.og = c;
+        },
+    ],
+    [
+        'og:description',
+        (ctx, c) => {
+            ctx.description.og = c;
+        },
+    ],
+    [
+        'og:image',
+        (ctx, c) => {
+            ctx.image = c;
+        },
+    ],
+    [
+        'article:published_time',
+        (ctx, c) => {
+            ctx.publishedAt = c;
+        },
+    ],
+    [
+        'article:modified_time',
+        (ctx, c) => {
+            ctx.modifiedAt = c;
+        },
+    ],
+]);
+const META_NAME_HANDLERS = new Map([
+    [
+        'twitter:title',
+        (ctx, c) => {
+            ctx.title.twitter = c;
+        },
+    ],
+    [
+        'twitter:description',
+        (ctx, c) => {
+            ctx.description.twitter = c;
+        },
+    ],
+    [
+        'description',
+        (ctx, c) => {
+            ctx.description.standard = c;
+        },
+    ],
+    [
+        'author',
+        (ctx, c) => {
+            ctx.author = c;
+        },
+    ],
+]);
 function extractMetadata(document) {
-    const title = {};
-    const description = {};
-    let author;
+    const ctx = {
+        title: {},
+        description: {},
+    };
     for (const tag of document.querySelectorAll('meta')) {
         const content = tag.getAttribute('content')?.trim();
         if (!content)
             continue;
         const property = tag.getAttribute('property');
+        if (property) {
+            META_PROPERTY_HANDLERS.get(property)?.(ctx, content);
+        }
         const name = tag.getAttribute('name');
-        if (property === 'og:title')
-            title.og = content;
-        else if (property === 'og:description')
-            description.og = content;
-        else if (name === 'twitter:title')
-            title.twitter = content;
-        else if (name === 'twitter:description')
-            description.twitter = content;
-        else if (name === 'description')
-            description.standard = content;
-        else if (name === 'author')
-            author = content;
+        if (name) {
+            META_NAME_HANDLERS.get(name)?.(ctx, content);
+        }
     }
     const titleEl = document.querySelector('title');
-    if (!title.standard && titleEl?.textContent) {
-        title.standard = titleEl.textContent.trim();
+    if (!ctx.title.standard && titleEl?.textContent) {
+        ctx.title.standard = titleEl.textContent.trim();
     }
-    const resolvedTitle = title.og ?? title.twitter ?? title.standard;
-    const resolvedDesc = description.og ?? description.twitter ?? description.standard;
+    const resolvedTitle = ctx.title.og ?? ctx.title.twitter ?? ctx.title.standard;
+    const resolvedDesc = ctx.description.og ?? ctx.description.twitter ?? ctx.description.standard;
     const metadata = {};
     if (resolvedTitle)
         metadata.title = resolvedTitle;
     if (resolvedDesc)
         metadata.description = resolvedDesc;
-    if (author)
-        metadata.author = author;
+    if (ctx.author)
+        metadata.author = ctx.author;
+    if (ctx.image)
+        metadata.image = ctx.image;
+    if (ctx.publishedAt)
+        metadata.publishedAt = ctx.publishedAt;
+    if (ctx.modifiedAt)
+        metadata.modifiedAt = ctx.modifiedAt;
     return metadata;
 }
 function isReadabilityCompatible(doc) {
-    if (!isRecord(doc))
+    if (!isObject(doc))
         return false;
     return hasDocumentElement(doc) && hasQuerySelectors(doc);
 }
@@ -185,14 +229,18 @@ function extractArticle(document) {
         return null;
     }
     try {
-        const documentClone = document.cloneNode(true);
-        const rawText = documentClone.body.textContent ||
-            documentClone.documentElement.textContent;
+        const doc = document;
+        const rawText = doc.querySelector('body')?.textContent ?? doc.documentElement.textContent;
         const textLength = rawText.replace(/\s+/g, ' ').trim().length;
-        if (textLength >= 400 && !isProbablyReaderable(documentClone)) {
+        if (textLength < 100) {
+            logWarn('Very minimal server-rendered content detected (< 100 chars). ' +
+                'This might be a client-side rendered (SPA) application. ' +
+                'Content extraction may be incomplete.', { textLength });
+        }
+        if (textLength >= 400 && !isProbablyReaderable(doc)) {
             return null;
         }
-        const reader = new Readability(documentClone, { maxElemsToParse: 20_000 });
+        const reader = new Readability(doc, { maxElemsToParse: 20_000 });
         const parsed = reader.parse();
         if (!parsed)
             return null;
@@ -213,8 +261,13 @@ function extractArticle(document) {
 export function extractContent(html, url, options = {
     extractArticle: true,
 }) {
+    const result = extractContentWithDocument(html, url, options);
+    return { article: result.article, metadata: result.metadata };
+}
+function extractContentWithDocument(html, url, options) {
     if (!isValidInput(html, url)) {
-        return { article: null, metadata: {} };
+        const { document } = parseHTML('<html></html>');
+        return { article: null, metadata: {}, document };
     }
     return tryExtractContent(html, url, options);
 }
@@ -229,11 +282,13 @@ function handleExtractionFailure(error, url, signal) {
     }
     throwIfAborted(signal, url, 'extract:error');
     logError('Failed to extract content', error instanceof Error ? error : undefined);
-    return { article: null, metadata: {} };
+    const { document } = parseHTML('<html></html>');
+    return { article: null, metadata: {}, document };
 }
 function extractContentStages(html, url, options) {
     throwIfAborted(options.signal, url, 'extract:begin');
-    const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncateHtml(html)));
+    const truncatedHtml = truncateHtml(html);
+    const { document } = runTransformStage(url, 'extract:parse', () => parseHTML(truncatedHtml));
     throwIfAborted(options.signal, url, 'extract:parsed');
     applyBaseUri(document, url);
     const metadata = runTransformStage(url, 'extract:metadata', () => extractMetadata(document));
@@ -243,6 +298,8 @@ function extractContentStages(html, url, options) {
     return {
         article,
         metadata,
+        document,
+        ...(truncatedHtml.length !== html.length ? { truncated: true } : {}),
     };
 }
 function tryExtractContent(html, url, options) {
@@ -279,522 +336,7 @@ function applyBaseUri(document, url) {
         });
     }
 }
-function containsJsxTag(code) {
-    for (let index = 0; index < code.length - 1; index += 1) {
-        if (code[index] !== '<')
-            continue;
-        const next = code[index + 1];
-        if (!next)
-            continue;
-        if (next >= 'A' && next <= 'Z')
-            return true;
-    }
-    return false;
-}
-function containsWord(source, word) {
-    let startIndex = source.indexOf(word);
-    while (startIndex !== -1) {
-        const before = startIndex === 0 ? '' : source[startIndex - 1];
-        const afterIndex = startIndex + word.length;
-        const after = afterIndex >= source.length ? '' : source[afterIndex];
-        if (!isWordChar(before) && !isWordChar(after))
-            return true;
-        startIndex = source.indexOf(word, startIndex + word.length);
-    }
-    return false;
-}
-function splitLines(content) {
-    return content.split('\n');
-}
-function extractLanguageFromClassName(className) {
-    const tokens = className.match(/\S+/g);
-    if (!tokens)
-        return undefined;
-    for (const token of tokens) {
-        const lower = token.toLowerCase();
-        if (lower.startsWith('language-'))
-            return token.slice('language-'.length);
-        if (lower.startsWith('lang-'))
-            return token.slice('lang-'.length);
-        if (lower.startsWith('highlight-')) {
-            return token.slice('highlight-'.length);
-        }
-    }
-    if (tokens.includes('hljs')) {
-        const langClass = tokens.find((t) => t !== 'hljs' && !t.startsWith('hljs-'));
-        if (langClass)
-            return langClass;
-    }
-    return undefined;
-}
-function resolveLanguageFromDataAttribute(dataLang) {
-    const trimmed = dataLang.trim();
-    if (!trimmed)
-        return undefined;
-    for (const char of trimmed) {
-        if (!isWordChar(char))
-            return undefined;
-    }
-    return trimmed;
-}
-function isWordChar(char) {
-    if (!char)
-        return false;
-    const code = char.charCodeAt(0);
-    return ((code >= 48 && code <= 57) ||
-        (code >= 65 && code <= 90) ||
-        (code >= 97 && code <= 122) ||
-        char === '_');
-}
-const LANGUAGE_PATTERNS = [
-    {
-        language: 'jsx',
-        pattern: {
-            keywords: ['classname=', 'jsx:', "from 'react'", 'from "react"'],
-            custom: (code) => containsJsxTag(code),
-        },
-    },
-    {
-        language: 'typescript',
-        pattern: {
-            wordBoundary: ['interface', 'type'],
-            custom: (_, lower) => [
-                ': string',
-                ':string',
-                ': number',
-                ':number',
-                ': boolean',
-                ':boolean',
-                ': void',
-                ':void',
-                ': any',
-                ':any',
-                ': unknown',
-                ':unknown',
-                ': never',
-                ':never',
-            ].some((hint) => lower.includes(hint)),
-        },
-    },
-    {
-        language: 'rust',
-        pattern: {
-            regex: /\b(?:fn|impl|struct|enum)\b/,
-            keywords: ['let mut'],
-            custom: (_, lower) => lower.includes('use ') && lower.includes('::'),
-        },
-    },
-    {
-        language: 'javascript',
-        pattern: {
-            regex: /\b(?:const|let|var|function|class|async|await|export|import)\b/,
-        },
-    },
-    {
-        language: 'python',
-        pattern: {
-            regex: /\b(?:def|class|import|from)\b/,
-            keywords: ['print(', '__name__'],
-        },
-    },
-    {
-        language: 'bash',
-        pattern: {
-            custom: (code) => detectBashIndicators(code),
-        },
-    },
-    {
-        language: 'css',
-        pattern: {
-            regex: /@media|@import|@keyframes/,
-            custom: (code) => detectCssStructure(code),
-        },
-    },
-    {
-        language: 'html',
-        pattern: {
-            keywords: [
-                '<!doctype',
-                '<html',
-                '<head',
-                '<body',
-                '<div',
-                '<span',
-                '<p',
-                '<a',
-                '<script',
-                '<style',
-            ],
-        },
-    },
-    {
-        language: 'json',
-        pattern: {
-            startsWith: ['{', '['],
-        },
-    },
-    {
-        language: 'yaml',
-        pattern: {
-            custom: (code) => detectYamlStructure(code),
-        },
-    },
-    {
-        language: 'sql',
-        pattern: {
-            wordBoundary: [
-                'select',
-                'insert',
-                'update',
-                'delete',
-                'create',
-                'alter',
-                'drop',
-            ],
-        },
-    },
-    {
-        language: 'go',
-        pattern: {
-            wordBoundary: ['package', 'func'],
-            keywords: ['import "'],
-        },
-    },
-];
-// Bash detection constants
-const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
-const BASH_PKG_MANAGERS = [
-    'npm',
-    'yarn',
-    'pnpm',
-    'npx',
-    'brew',
-    'apt',
-    'pip',
-    'cargo',
-    'go',
-];
-const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
-function isShellPrefix(line) {
-    return (line.startsWith('#!') || line.startsWith('$ ') || line.startsWith('# '));
-}
-function matchesBashCommand(line) {
-    return BASH_COMMANDS.some((cmd) => line === cmd || line.startsWith(`${cmd} `));
-}
-function matchesPackageManagerVerb(line) {
-    for (const mgr of BASH_PKG_MANAGERS) {
-        if (!line.startsWith(`${mgr} `))
-            continue;
-        const rest = line.slice(mgr.length + 1);
-        if (BASH_VERBS.some((v) => rest === v || rest.startsWith(`${v} `))) {
-            return true;
-        }
-    }
-    return false;
-}
-function detectBashIndicators(code) {
-    for (const line of splitLines(code)) {
-        const trimmed = line.trimStart();
-        if (!trimmed)
-            continue;
-        if (isShellPrefix(trimmed) ||
-            matchesBashCommand(trimmed) ||
-            matchesPackageManagerVerb(trimmed)) {
-            return true;
-        }
-    }
-    return false;
-}
-function detectCssStructure(code) {
-    for (const line of splitLines(code)) {
-        const trimmed = line.trimStart();
-        if (!trimmed)
-            continue;
-        const isSelector = (trimmed.startsWith('.') || trimmed.startsWith('#')) &&
-            trimmed.includes('{');
-        const isProperty = trimmed.includes(':') && trimmed.includes(';');
-        if (isSelector || isProperty)
-            return true;
-    }
-    return false;
-}
-function detectYamlStructure(code) {
-    for (const line of splitLines(code)) {
-        const trimmed = line.trim();
-        if (!trimmed)
-            continue;
-        const colonIdx = trimmed.indexOf(':');
-        if (colonIdx <= 0)
-            continue;
-        const after = trimmed[colonIdx + 1];
-        if (after === ' ' || after === '\t')
-            return true;
-    }
-    return false;
-}
-function matchesLanguagePattern(code, lower, pattern) {
-    if (pattern.keywords?.some((kw) => lower.includes(kw)))
-        return true;
-    if (pattern.wordBoundary?.some((w) => containsWord(lower, w)))
-        return true;
-    if (pattern.regex?.test(lower))
-        return true;
-    if (pattern.startsWith) {
-        const trimmed = code.trimStart();
-        if (pattern.startsWith.some((prefix) => trimmed.startsWith(prefix)))
-            return true;
-    }
-    if (pattern.custom?.(code, lower))
-        return true;
-    return false;
-}
-export function detectLanguageFromCode(code) {
-    const lower = code.toLowerCase();
-    for (const { language, pattern } of LANGUAGE_PATTERNS) {
-        if (matchesLanguagePattern(code, lower, pattern))
-            return language;
-    }
-    return undefined;
-}
-export function resolveLanguageFromAttributes(className, dataLang) {
-    const classMatch = extractLanguageFromClassName(className);
-    return classMatch ?? resolveLanguageFromDataAttribute(dataLang);
-}
-function isElement(node) {
-    return (isRecord(node) &&
-        'getAttribute' in node &&
-        typeof node.getAttribute === 'function');
-}
-const STRUCTURAL_TAGS = new Set([
-    'script',
-    'style',
-    'noscript',
-    'iframe',
-    'form',
-    'button',
-    'input',
-    'select',
-    'textarea',
-    'svg',
-]);
-const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer', 'aside']);
-const NAVIGATION_ROLES = new Set([
-    'navigation',
-    'banner',
-    'complementary',
-    'contentinfo',
-    'tree',
-    'menubar',
-    'menu',
-    'dialog',
-    'alertdialog',
-    'search',
-]);
-const PROMO_TOKENS = new Set([
-    'banner',
-    'promo',
-    'announcement',
-    'cta',
-    'callout',
-    'advert',
-    'ad',
-    'ads',
-    'sponsor',
-    'newsletter',
-    'subscribe',
-    'cookie',
-    'consent',
-    'popup',
-    'modal',
-    'overlay',
-    'toast',
-    'share',
-    'social',
-    'related',
-    'recommend',
-    'comment',
-    'breadcrumb',
-    'pagination',
-    'pager',
-    'taglist',
-]);
-const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
-const FIXED_PATTERN = /\b(fixed|sticky)\b/;
-const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
-const ISOLATE_PATTERN = /\bisolate\b/;
-const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
-const NOISE_MARKERS = [
-    '<script',
-    '<style',
-    '<noscript',
-    '<iframe',
-    '<nav',
-    '<footer',
-    '<aside',
-    '<header',
-    '<form',
-    '<button',
-    '<input',
-    '<select',
-    '<textarea',
-    '<svg',
-    '<canvas',
-    ' aria-hidden="true"',
-    " aria-hidden='true'",
-    ' hidden',
-    ' role="navigation"',
-    " role='navigation'",
-    ' role="banner"',
-    " role='banner'",
-    ' role="complementary"',
-    " role='complementary'",
-    ' role="contentinfo"',
-    " role='contentinfo'",
-    ' role="tree"',
-    " role='tree'",
-    ' role="menubar"',
-    " role='menubar'",
-    ' role="menu"',
-    " role='menu'",
-    ' banner',
-    ' promo',
-    ' announcement',
-    ' cta',
-    ' callout',
-    ' advert',
-    ' newsletter',
-    ' subscribe',
-    ' cookie',
-    ' consent',
-    ' popup',
-    ' modal',
-    ' overlay',
-    ' toast',
-    ' fixed',
-    ' sticky',
-    ' z-50',
-    ' z-4',
-    ' isolate',
-    ' breadcrumb',
-    ' pagination',
-];
-function mayContainNoise(html) {
-    const haystack = html.toLowerCase();
-    return NOISE_MARKERS.some((marker) => haystack.includes(marker));
-}
-function isFullDocumentHtml(html) {
-    return HTML_DOCUMENT_MARKERS.test(html);
-}
-function isStructuralNoiseTag(tagName) {
-    return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
-}
-function isElementHidden(element) {
-    const style = element.getAttribute('style') ?? '';
-    return (element.getAttribute('hidden') !== null ||
-        element.getAttribute('aria-hidden') === 'true' ||
-        /\bdisplay\s*:\s*none\b/i.test(style) ||
-        /\bvisibility\s*:\s*hidden\b/i.test(style));
-}
-function hasNoiseRole(role) {
-    return role !== null && NAVIGATION_ROLES.has(role);
-}
-function tokenizeIdentifierLikeText(value) {
-    return value
-        .toLowerCase()
-        .replace(/[^a-z0-9]+/g, ' ')
-        .trim()
-        .split(' ')
-        .filter(Boolean);
-}
-function matchesPromoIdOrClass(className, id) {
-    const tokens = tokenizeIdentifierLikeText(`${className} ${id}`);
-    return tokens.some((token) => PROMO_TOKENS.has(token));
-}
-function matchesFixedOrHighZIsolate(className) {
-    return (FIXED_PATTERN.test(className) ||
-        (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
-}
-function readElementMetadata(element) {
-    return {
-        tagName: element.tagName.toLowerCase(),
-        className: element.getAttribute('class') ?? '',
-        id: element.getAttribute('id') ?? '',
-        role: element.getAttribute('role'),
-        isHidden: isElementHidden(element),
-    };
-}
-function isBoilerplateHeader({ className, id, role, }) {
-    if (hasNoiseRole(role))
-        return true;
-    const combined = `${className} ${id}`.toLowerCase();
-    return HEADER_NOISE_PATTERN.test(combined);
-}
-function isNoiseElement(node) {
-    const metadata = readElementMetadata(node);
-    return (isStructuralNoiseTag(metadata.tagName) ||
-        ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
-        (metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
-        metadata.isHidden ||
-        hasNoiseRole(metadata.role) ||
-        matchesFixedOrHighZIsolate(metadata.className) ||
-        matchesPromoIdOrClass(metadata.className, metadata.id));
-}
-function removeNoiseNodes(nodes) {
-    for (let index = nodes.length - 1; index >= 0; index -= 1) {
-        const node = typeof nodes.item === 'function' ? nodes.item(index) : nodes[index];
-        if (!node)
-            continue;
-        if (isElement(node) && isNoiseElement(node)) {
-            node.remove();
-        }
-    }
-}
-function stripNoiseNodes(document) {
-    // Use targeted selectors for common noise elements instead of querySelectorAll('*')
-    const targetSelectors = [
-        'nav',
-        'footer',
-        'aside',
-        'header[class*="site"]',
-        'header[class*="nav"]',
-        'header[class*="menu"]',
-        '[role="banner"]',
-        '[role="navigation"]',
-        '[role="dialog"]',
-        '[style*="display: none"]',
-        '[style*="display:none"]',
-        '[hidden]',
-        '[aria-hidden="true"]',
-    ].join(',');
-    const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
-    // Remove in reverse order to handle nested elements correctly
-    removeNoiseNodes(potentialNoiseNodes);
-    // Second pass: check remaining elements for noise patterns (promo, fixed positioning, etc.)
-    const allElements = document.querySelectorAll('*');
-    removeNoiseNodes(allElements);
-}
-function removeNoiseFromHtml(html) {
-    const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
-    if (!shouldParse)
-        return html;
-    try {
-        const { document } = parseHTML(html);
-        stripNoiseNodes(document);
-        const bodyInnerHtml = getBodyInnerHtml(document);
-        if (bodyInnerHtml)
-            return bodyInnerHtml;
-        const docToString = getDocumentToString(document);
-        if (docToString)
-            return docToString();
-        const documentElementOuterHtml = getDocumentElementOuterHtml(document);
-        if (documentElementOuterHtml)
-            return documentElementOuterHtml;
-        return html;
-    }
-    catch {
-        return html;
-    }
-}
+// DOM noise removal functions moved to ./dom-noise-removal.ts
 function buildInlineCode(content) {
     const runs = content.match(/`+/g);
     let longest = '';
@@ -805,8 +347,11 @@ function buildInlineCode(content) {
             }
         }
     }
+    // Use a fence longer than any run of backticks in the content.
     const delimiter = `\`${longest}`;
-    const padding = delimiter.length > 1 ? ' ' : '';
+    // Only pad when needed to avoid altering code spans unnecessarily.
+    // CommonMark recommends padding when the code starts/ends with a backtick.
+    const padding = content.startsWith('`') || content.endsWith('`') ? ' ' : '';
     return `${delimiter}${padding}${content}${padding}${delimiter}`;
 }
 function deriveAltFromImageUrl(src) {
@@ -829,16 +374,13 @@ function deriveAltFromImageUrl(src) {
     }
 }
 function isCodeBlock(parent) {
-    if (!isRecord(parent))
+    if (!isObject(parent))
         return false;
     const tagName = typeof parent.tagName === 'string' ? parent.tagName.toUpperCase() : '';
     return ['PRE', 'WRAPPED-PRE'].includes(tagName);
 }
 function hasGetAttribute(value) {
-    return isRecord(value) && typeof value.getAttribute === 'function';
-}
-function hasCodeBlockTranslators(value) {
-    return isRecord(value) && isRecord(value.codeBlockTranslators);
+    return isObject(value) && typeof value.getAttribute === 'function';
 }
 function buildInlineCodeTranslator() {
     return {
@@ -855,37 +397,19 @@ function resolveAttributeLanguage(node) {
     const dataLanguage = getAttribute?.('data-language') ?? '';
     return resolveLanguageFromAttributes(className, dataLanguage);
 }
-function resolveCodeBlockTranslators(visitor) {
-    const childTranslators = isRecord(visitor) ? visitor.instance : null;
-    return hasCodeBlockTranslators(childTranslators)
-        ? childTranslators.codeBlockTranslators
-        : null;
-}
-function buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators) {
-    return {
-        noEscape: true,
-        preserveWhitespace: true,
-        ...(codeBlockTranslators
-            ? { childTranslators: codeBlockTranslators }
-            : null),
-        postprocess: ({ content }) => {
-            const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
-            return CODE_BLOCK.format(content, language);
-        },
-    };
-}
 function buildCodeTranslator(ctx) {
-    if (!isRecord(ctx))
+    if (!isObject(ctx))
         return buildInlineCodeTranslator();
-    const { node, parent, visitor } = ctx;
+    const { parent } = ctx;
     if (!isCodeBlock(parent))
         return buildInlineCodeTranslator();
-    const attributeLanguage = resolveAttributeLanguage(node);
-    const codeBlockTranslators = resolveCodeBlockTranslators(visitor);
-    return buildCodeBlockTranslator(attributeLanguage, codeBlockTranslators);
+    return {
+        noEscape: true,
+        preserveWhitespace: true,
+    };
 }
 function buildImageTranslator(ctx) {
-    if (!isRecord(ctx))
+    if (!isObject(ctx))
         return { content: '' };
     const { node } = ctx;
     const getAttribute = hasGetAttribute(node)
@@ -898,19 +422,57 @@ function buildImageTranslator(ctx) {
         content: `![${alt}](${src})`,
     };
 }
+function findLanguageFromCodeChild(node) {
+    if (!isObject(node))
+        return undefined;
+    const { childNodes } = node;
+    if (!Array.isArray(childNodes))
+        return undefined;
+    for (const child of childNodes) {
+        if (!isObject(child))
+            continue;
+        const tagName = typeof child.rawTagName === 'string'
+            ? child.rawTagName.toUpperCase()
+            : '';
+        if (tagName === 'CODE') {
+            return resolveAttributeLanguage(child);
+        }
+    }
+    return undefined;
+}
+function createCodeBlockPostprocessor(language) {
+    return ({ content }) => {
+        const trimmed = content.trim();
+        if (!trimmed)
+            return '';
+        const resolvedLanguage = language ?? detectLanguageFromCode(trimmed) ?? '';
+        return CODE_BLOCK.format(trimmed, resolvedLanguage);
+    };
+}
+function buildPreTranslator(ctx) {
+    if (!isObject(ctx))
+        return {};
+    const { node } = ctx;
+    const attributeLanguage = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
+    return {
+        noEscape: true,
+        preserveWhitespace: true,
+        postprocess: createCodeBlockPostprocessor(attributeLanguage),
+    };
+}
 function createCustomTranslators() {
     return {
         code: (ctx) => buildCodeTranslator(ctx),
         img: (ctx) => buildImageTranslator(ctx),
         dl: (ctx) => {
-            if (!isRecord(ctx) || !isRecord(ctx.node)) {
+            if (!isObject(ctx) || !isObject(ctx.node)) {
                 return { content: '' };
             }
             const node = ctx.node;
             const childNodes = Array.isArray(node.childNodes) ? node.childNodes : [];
             const items = childNodes
                 .map((child) => {
-                if (!isRecord(child))
+                if (!isObject(child))
                     return '';
                 const nodeName = typeof child.nodeName === 'string'
                     ? child.nodeName.toUpperCase()
@@ -940,6 +502,8 @@ function createCustomTranslators() {
         sup: () => ({
             postprocess: ({ content }) => `^${content}^`,
         }),
+        // Fix #6: Handle <pre> without <code> - wrap in fenced code block
+        pre: (ctx) => buildPreTranslator(ctx),
     };
 }
 let markdownInstance = null;
@@ -955,9 +519,11 @@ function getMarkdownConverter() {
     markdownInstance ??= createMarkdownInstance();
     return markdownInstance;
 }
-function translateHtmlToMarkdown(html, url, signal) {
+function translateHtmlToMarkdown(html, url, signal, document, skipNoiseRemoval) {
     throwIfAborted(signal, url, 'markdown:begin');
-    const cleanedHtml = runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
+    const cleanedHtml = skipNoiseRemoval
+        ? html
+        : runTransformStage(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url));
     throwIfAborted(signal, url, 'markdown:cleaned');
     const content = runTransformStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
     throwIfAborted(signal, url, 'markdown:translated');
@@ -973,151 +539,18 @@ export function htmlToMarkdown(html, metadata, options) {
     if (!html)
         return buildMetadataFooter(metadata, url);
     try {
-        const content = translateHtmlToMarkdown(html, url, options?.signal);
+        const content = translateHtmlToMarkdown(html, url, options?.signal, options?.document, options?.skipNoiseRemoval);
         return appendMetadataFooter(content, metadata, url);
     }
     catch (error) {
         if (error instanceof FetchError) {
             throw error;
         }
+        logError('Failed to convert HTML to markdown', error instanceof Error ? error : undefined);
         return buildMetadataFooter(metadata, url);
     }
 }
-function cleanupMarkdownArtifacts(content) {
-    let result = content;
-    const fixOrphanHeadings = (text) => {
-        return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (match, prefix, hashes, heading) => {
-            if (typeof prefix !== 'string' ||
-                typeof hashes !== 'string' ||
-                typeof heading !== 'string') {
-                return match;
-            }
-            if (heading.length > 150) {
-                return match;
-            }
-            const trimmedPrefix = prefix.trim();
-            if (trimmedPrefix === '') {
-                return `${hashes} ${heading}\n\n`;
-            }
-            return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
-        });
-    };
-    result = fixOrphanHeadings(result);
-    result = result.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
-    const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)\s*/g;
-    result = result.replace(zeroWidthAnchorLink, '');
-    result = result.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '');
-    result = result.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '');
-    result = result.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
-    result = result.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
-    result = result.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
-    result = result.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
-    const tocLinkLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
-    const lines = result.split('\n');
-    const filtered = [];
-    let skipTocBlock = false;
-    for (let i = 0; i < lines.length; i += 1) {
-        const line = lines[i] ?? '';
-        const prevLine = i > 0 ? (lines[i - 1] ?? '') : '';
-        const nextLine = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
-        if (tocLinkLine.test(line)) {
-            const prevIsToc = tocLinkLine.test(prevLine) || prevLine.trim() === '';
-            const nextIsToc = tocLinkLine.test(nextLine) || nextLine.trim() === '';
-            if (prevIsToc || nextIsToc) {
-                skipTocBlock = true;
-                continue;
-            }
-        }
-        else if (line.trim() === '' && skipTocBlock) {
-            skipTocBlock = false;
-            continue;
-        }
-        else {
-            skipTocBlock = false;
-        }
-        filtered.push(line);
-    }
-    result = filtered.join('\n');
-    result = result.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[');
-    result = result.replace(/^Was this page helpful\??\s*$/gim, '');
-    result = result.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ');
-    result = result.replace(/\\([[]])/g, '$1');
-    result = result.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
-    result = result.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
-    result = result.replace(/\n{3,}/g, '\n\n');
-    return result.trim();
-}
-const HEADING_KEYWORDS = new Set([
-    'overview',
-    'introduction',
-    'summary',
-    'conclusion',
-    'prerequisites',
-    'requirements',
-    'installation',
-    'configuration',
-    'usage',
-    'features',
-    'limitations',
-    'troubleshooting',
-    'faq',
-    'resources',
-    'references',
-    'changelog',
-    'license',
-    'acknowledgments',
-    'appendix',
-]);
-function isLikelyHeadingLine(line) {
-    const trimmed = line.trim();
-    if (!trimmed || trimmed.length > 80)
-        return false;
-    if (/^#{1,6}\s/.test(trimmed))
-        return false;
-    if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
-        return false;
-    if (/[.!?]$/.test(trimmed))
-        return false;
-    if (/^\[.*\]\(.*\)$/.test(trimmed))
-        return false;
-    if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
-        return true;
-    }
-    const words = trimmed.split(/\s+/);
-    if (words.length >= 2 && words.length <= 6) {
-        const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
-        if (isTitleCase)
-            return true;
-    }
-    if (words.length === 1) {
-        const lower = trimmed.toLowerCase();
-        if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed)) {
-            return true;
-        }
-    }
-    return false;
-}
-function promoteOrphanHeadings(markdown) {
-    const lines = markdown.split('\n');
-    const result = [];
-    for (let i = 0; i < lines.length; i += 1) {
-        const line = lines[i] ?? '';
-        const prevLine = i > 0 ? lines[i - 1] : '';
-        const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
-        const isStandalone = prevLine?.trim() === '' && nextLine?.trim() === '';
-        const isPrecededByBlank = prevLine?.trim() === '';
-        if ((isStandalone || isPrecededByBlank) && isLikelyHeadingLine(line)) {
-            const trimmed = line.trim();
-            const isExample = /^example:\s/i.test(trimmed);
-            const prefix = isExample ? '### ' : '## ';
-            result.push(prefix + trimmed);
-        }
-        else {
-            result.push(line);
-        }
-    }
-    return result.join('\n');
-}
+// Markdown cleanup functions moved to ./markdown-cleanup.ts
 function formatFetchedDate(isoString) {
     try {
         const date = new Date(isoString);
@@ -1366,54 +799,114 @@ function tryTransformRawContent({ html, url, includeMetadata, }) {
 const MIN_CONTENT_RATIO = 0.3;
 const MIN_HTML_LENGTH_FOR_GATE = 100;
 const MIN_HEADING_RETENTION_RATIO = 0.7;
-function countHeadings(html) {
-    if (!html)
-        return 0;
-    // Match opening heading tags <h1> through <h6>
-    const headingPattern = /<h[1-6](?:\s[^>]*)?>([^<]*)<\/h[1-6]>/gi;
-    const matches = html.match(headingPattern);
-    return matches ? matches.length : 0;
-}
-function isHeadingStructurePreserved(article, originalHtml) {
-    if (!article)
-        return false;
-    // Cache heading counts to avoid duplicate regex matching
-    const originalHeadingCount = countHeadings(originalHtml);
-    const articleHeadingCount = countHeadings(article.content);
-    // If original has no headings, structure is trivially preserved
-    if (originalHeadingCount === 0)
-        return true;
-    // If article lost >50% of headings, structure is broken
-    const retentionRatio = articleHeadingCount / originalHeadingCount;
-    return retentionRatio >= MIN_HEADING_RETENTION_RATIO;
-}
-function stripHtmlTagsForLength(html) {
-    let result = '';
-    let inTag = false;
-    for (const char of html) {
-        if (char === '<') {
-            inTag = true;
+// Minimum fraction of the original document's <pre> elements that the extracted
+// article must keep; shouldUseArticleContent rejects the article below this ratio.
+const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
+/**
+ * Count headings using DOM querySelectorAll.
+ * Handles nested content like <h2><span>Text</span></h2> correctly.
+ *
+ * @param htmlOrDocument - Either an HTML string (parsed on every call) or an
+ *   already parsed object exposing `querySelectorAll`, which is queried directly.
+ * @returns Number of elements matching `h1,h2,h3,h4,h5,h6`.
+ */
+function countHeadingsDom(htmlOrDocument) {
+    if (typeof htmlOrDocument === 'string') {
+        // Wrap fragments in document structure for proper parsing
+        const htmlToParse = needsDocumentWrapper(htmlOrDocument)
+            ? wrapHtmlFragment(htmlOrDocument)
+            : htmlOrDocument;
+        const { document: doc } = parseHTML(htmlToParse);
+        return doc.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
+    }
+    return htmlOrDocument.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
+}
+/**
+ * Count code blocks, defined here as `<pre>` elements only (inline or
+ * standalone `<code>` elements are not counted).
+ *
+ * @param htmlOrDocument - Either an HTML string (parsed on every call) or an
+ *   already parsed object exposing `querySelectorAll`, which is queried directly.
+ * @returns Number of `<pre>` elements found.
+ */
+function countCodeBlocksDom(htmlOrDocument) {
+    if (typeof htmlOrDocument === 'string') {
+        // Wrap fragments in document structure for proper parsing
+        const htmlToParse = needsDocumentWrapper(htmlOrDocument)
+            ? wrapHtmlFragment(htmlOrDocument)
+            : htmlOrDocument;
+        const { document: doc } = parseHTML(htmlToParse);
+        return doc.querySelectorAll('pre').length;
+    }
+    return htmlOrDocument.querySelectorAll('pre').length;
+}
+/**
+ * Check if HTML string needs document wrapper for proper parsing.
+ * Fragments without doctype/html/body tags need wrapping.
+ *
+ * This is a prefix check only: the trimmed, lower-cased input must START with
+ * `<!doctype`, `<html` or `<body` to be treated as a full document. Input that
+ * begins with anything else (for example a leading comment or `<head>`) is
+ * reported as needing a wrapper.
+ *
+ * @param html - HTML source string.
+ * @returns `true` when the string does not start with one of those three prefixes.
+ */
+function needsDocumentWrapper(html) {
+    const trimmed = html.trim().toLowerCase();
+    return (!trimmed.startsWith('<!doctype') &&
+        !trimmed.startsWith('<html') &&
+        !trimmed.startsWith('<body'));
+}
+/**
+ * Wrap HTML fragment in minimal document structure for proper parsing.
+ *
+ * @param html - Fragment markup; inserted verbatim, without escaping.
+ * @returns `<!DOCTYPE html><html><body>…</body></html>` with the fragment as
+ *   the body content.
+ */
+function wrapHtmlFragment(html) {
+    return `<!DOCTYPE html><html><body>${html}</body></html>`;
+}
+/**
+ * Get visible text length from HTML, excluding script/style/noscript content.
+ * Fixes the bug where stripHtmlTagsForLength() counted JS/CSS as visible text.
+ *
+ * Both branches measure the same thing: `textContent` of the body (or of the
+ * document element when body is null/undefined), with every whitespace run
+ * collapsed to one space and the result trimmed.
+ *
+ * @param htmlOrDocument - HTML string (parsed here) or a parsed document. A
+ *   document argument is cloned first, so the caller's tree is not modified.
+ * @returns Length of the normalized text; 0 when no text container exists.
+ */
+function getVisibleTextLength(htmlOrDocument) {
+    // For string input, parse the HTML
+    if (typeof htmlOrDocument === 'string') {
+        // Wrap fragments in document structure for proper parsing
+        const htmlToParse = needsDocumentWrapper(htmlOrDocument)
+            ? wrapHtmlFragment(htmlOrDocument)
+            : htmlOrDocument;
+        const { document: doc } = parseHTML(htmlToParse);
+        // Remove non-visible content that inflates text length
+        // (doc was parsed just above, so removing nodes in place affects no caller)
+        for (const el of doc.querySelectorAll('script,style,noscript')) {
+            el.remove();
         }
-        else if (char === '>') {
-            inTag = false;
-        }
-        else if (!inTag) {
-            result += char;
-        }
-    }
-    return result;
-}
-export function isExtractionSufficient(article, originalHtml) {
+        // Get text content from body or documentElement
+        // Note: linkedom may return null for body on HTML fragments despite types
+        const body = doc.body;
+        const docElement = doc.documentElement;
+        const text = body?.textContent ?? docElement?.textContent ?? '';
+        return text.replace(/\s+/g, ' ').trim().length;
+    }
+    // For Document input, clone to avoid mutation
+    const workDoc = htmlOrDocument.cloneNode(true);
+    // Remove non-visible content that inflates text length
+    for (const el of workDoc.querySelectorAll('script,style,noscript')) {
+        el.remove();
+    }
+    // Get text content from body or documentElement
+    // Note: linkedom may return null for body on HTML fragments despite types
+    const body = workDoc.body;
+    const docElement = workDoc.documentElement;
+    const text = body?.textContent ?? docElement?.textContent ?? '';
+    return text.replace(/\s+/g, ' ').trim().length;
+}
+/**
+ * Length-based quality gate for an extracted article.
+ *
+ * @param article - Extracted article, or a falsy value when extraction failed;
+ *   `article.textContent.length` is used as the article length.
+ * @param originalHtmlOrDocument - Original page as an HTML string or parsed
+ *   document; measured with getVisibleTextLength.
+ * @returns `false` when `article` is falsy; `true` when the original's visible
+ *   text is shorter than MIN_HTML_LENGTH_FOR_GATE (the gate is skipped for tiny
+ *   pages); otherwise whether article/original length is at least
+ *   MIN_CONTENT_RATIO.
+ */
+export function isExtractionSufficient(article, originalHtmlOrDocument) {
     if (!article)
         return false;
     const articleLength = article.textContent.length;
-    const originalLength = stripHtmlTagsForLength(originalHtml)
-        .replace(/\s+/g, ' ')
-        .trim().length;
+    // Use DOM-based visible text length to exclude script/style content
+    const originalLength = getVisibleTextLength(originalHtmlOrDocument);
     if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
         return true;
     return articleLength / originalLength >= MIN_CONTENT_RATIO;
 }
+// Lines whose trimmed length is at or below this value are ignored by hasTruncatedSentences.
+const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
+// The text is flagged when MORE than this share of the considered lines looks incomplete.
+const MAX_TRUNCATED_LINE_RATIO = 0.5;
+/**
+ * Detect if extracted text has many truncated/incomplete sentences.
+ * Lines longer than 20 chars (after trimming) that don't end with one of
+ * `.`, `!`, `?`, `:` or `;` are considered potentially truncated.
+ *
+ * @param text - Extracted text; split on '\n'.
+ * @returns `false` when fewer than 3 lines are long enough to be considered;
+ *   otherwise `true` when the incomplete share of those lines is strictly
+ *   greater than MAX_TRUNCATED_LINE_RATIO.
+ */
+function hasTruncatedSentences(text) {
+    const lines = text
+        .split('\n')
+        .filter((line) => line.trim().length > MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK);
+    // Too few qualifying lines to judge.
+    if (lines.length < 3)
+        return false;
+    const incompleteLines = lines.filter((line) => {
+        const trimmed = line.trim();
+        return !/[.!?:;]$/.test(trimmed);
+    });
+    return incompleteLines.length / lines.length > MAX_TRUNCATED_LINE_RATIO;
+}
+/**
+ * Report whether an extracted article is available.
+ *
+ * @param article - Extracted article or `null`.
+ * @returns `true` for anything other than `null`. The comparison is strict, so
+ *   `undefined` also yields `true`.
+ */
 export function determineContentExtractionSource(article) {
     return article !== null;
 }
@@ -1443,12 +936,83 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
     }
     return metadata;
 }
-function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, }) {
+/**
+ * Content root selectors in priority order.
+ * These identify the main content area on a page.
+ * findContentRoot walks this array from first to last and stops at the first
+ * selector whose match has enough content, so earlier entries take precedence.
+ */
+const CONTENT_ROOT_SELECTORS = [
+    'main',
+    'article',
+    '[role="main"]',
+    '#content',
+    '#main-content',
+    '.content',
+    '.main-content',
+    '.post-content',
+    '.article-content',
+    '.entry-content',
+    '[itemprop="articleBody"]',
+    '[data-content]',
+    '.post-body',
+    '.article-body',
+];
+/**
+ * Find the main content root element in a document.
+ * Returns the innerHTML if found, undefined otherwise.
+ *
+ * Selectors from CONTENT_ROOT_SELECTORS are tried in order. For each selector
+ * only the first match (`querySelector`) is inspected: if its `innerHTML` is a
+ * string whose trimmed length exceeds 100 characters, that `innerHTML` is
+ * returned untrimmed. Otherwise the next selector is tried.
+ *
+ * @param document - Parsed document exposing `querySelector`.
+ * @returns The matched element's innerHTML, or `undefined` when no selector
+ *   yields long enough content.
+ */
+function findContentRoot(document) {
+    for (const selector of CONTENT_ROOT_SELECTORS) {
+        const element = document.querySelector(selector);
+        if (!element)
+            continue;
+        // Check if element has meaningful content
+        const innerHTML = typeof element.innerHTML === 'string'
+            ? element.innerHTML
+            : undefined;
+        if (innerHTML && innerHTML.trim().length > 100) {
+            return innerHTML;
+        }
+    }
+    return undefined;
+}
+/**
+ * Choose the HTML that will be converted to markdown, with its title and metadata.
+ *
+ * Outcomes, in priority order:
+ *  1. `useArticleContent` and `article` are both truthy: the article's
+ *     content and title are returned.
+ *  2. A `document` was supplied and a content root is found in the
+ *     noise-cleaned HTML: that root's innerHTML is returned with
+ *     `skipNoiseRemoval: true`.
+ *  3. Otherwise the full `html` is returned, forwarding `document` when present.
+ *
+ * In case 2 `document` only acts as a switch: the content root is searched in
+ * a fresh parse of the cleaned HTML, not in `document` itself.
+ *
+ * @returns `{ sourceHtml, title, metadata }` plus, depending on the branch,
+ *   `skipNoiseRemoval` or `document`.
+ */
+function buildContentSource({ html, url, article, extractedMeta, includeMetadata, useArticleContent, document, }) {
     const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
+    // If using article content, return it directly
+    if (useArticleContent && article) {
+        return {
+            sourceHtml: article.content,
+            title: article.title,
+            metadata,
+        };
+    }
+    // Try content root fallback before using full HTML
+    if (document) {
+        // Apply noise removal to HTML first (without passing document) to get cleaned HTML,
+        // then parse and find content root. This prevents the aggressive DOM stripping that
+        // happens when noise removal is given the original parsed document.
+        const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
+        const { document: cleanedDoc } = parseHTML(cleanedHtml);
+        const contentRoot = findContentRoot(cleanedDoc);
+        if (contentRoot) {
+            logDebug('Using content root fallback instead of full HTML', {
+                url: url.substring(0, 80),
+                contentLength: contentRoot.length,
+            });
+            return {
+                sourceHtml: contentRoot,
+                title: extractedMeta.title,
+                metadata,
+                // Skip noise removal - this HTML is already from a cleaned document
+                skipNoiseRemoval: true,
+            };
+        }
+    }
+    // Fall back to full HTML
     return {
-        sourceHtml: useArticleContent && article ? article.content : html,
-        title: useArticleContent && article ? article.title : extractedMeta.title,
+        sourceHtml: html,
+        title: extractedMeta.title,
         metadata,
+        ...(document ? { document } : {}),
     };
 }
 function logQualityGateFallback({ url, articleLength, }) {
@@ -1457,33 +1021,66 @@ function logQualityGateFallback({ url, articleLength, }) {
         articleLength,
     });
 }
-function shouldUseArticleContent(article, html, url) {
-    // Check content sufficiency (length-based quality gate)
-    if (!isExtractionSufficient(article, html)) {
-        logQualityGateFallback({
-            url,
-            articleLength: article.textContent.length,
-        });
-        return false;
+/**
+ * Decide whether the extracted article is good enough to use instead of the
+ * original page. Gates run in this order and the first failure returns `false`:
+ *  1. Length: applied only when the original's visible text is at least
+ *     MIN_HTML_LENGTH_FOR_GATE long; fails when article/original length is
+ *     below MIN_CONTENT_RATIO.
+ *  2. Headings: when the original has any h1-h6, the article must retain at
+ *     least MIN_HEADING_RETENTION_RATIO of them.
+ *  3. Code blocks: when the original has any `<pre>`, the article must retain
+ *     at least MIN_CODE_BLOCK_RETENTION_RATIO of them.
+ *  4. Truncation: fails when hasTruncatedSentences flags the article text.
+ *
+ * @param article - Extracted article; must be non-null, since `textContent`
+ *   and `content` are read without a guard.
+ * @param originalHtmlOrDocument - Original page as HTML string or parsed document.
+ * @param url - Used for log output only (truncated to 80 characters there).
+ * @returns `true` only when every gate passes.
+ */
+function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
+    const articleLength = article.textContent.length;
+    const originalLength = getVisibleTextLength(originalHtmlOrDocument);
+    // If the document is tiny, don't gate too aggressively.
+    if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
+        const ratio = articleLength / originalLength;
+        if (ratio < MIN_CONTENT_RATIO) {
+            logQualityGateFallback({ url, articleLength });
+            return false;
+        }
     }
-    // Check heading structure preservation
-    if (!isHeadingStructurePreserved(article, html)) {
-        logDebug('Quality gate: Readability broke heading structure, using full HTML', {
+    // Heading structure retention (compute counts once to avoid repeated DOM queries/parses).
+    const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
+    if (originalHeadings > 0) {
+        const articleHeadings = countHeadingsDom(article.content);
+        const retentionRatio = articleHeadings / originalHeadings;
+        if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
+            logDebug('Quality gate: Readability broke heading structure, using full HTML', {
+                url: url.substring(0, 80),
+                originalHeadings,
+                articleHeadings,
+            });
+            return false;
+        }
+    }
+    const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
+    if (originalCodeBlocks > 0) {
+        const articleCodeBlocks = countCodeBlocksDom(article.content);
+        const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
+        // Always log code block counts for debugging
+        logDebug('Code block retention check', {
             url: url.substring(0, 80),
-            originalHeadings: countHeadings(html),
-            articleHeadings: countHeadings(article.content),
+            originalCodeBlocks,
+            articleCodeBlocks,
+            codeRetentionRatio,
         });
+        if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
+            logDebug('Quality gate: Readability removed code blocks, using full HTML', {
+                url: url.substring(0, 80),
+                originalCodeBlocks,
+                articleCodeBlocks,
+            });
+            return false;
+        }
+    }
+    // Layout extraction issue: truncated/fragmented lines.
+    if (hasTruncatedSentences(article.textContent)) {
+        logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', { url: url.substring(0, 80) });
         return false;
     }
     return true;
 }
 function resolveContentSource({ html, url, includeMetadata, signal, }) {
-    const { article, metadata: extractedMeta } = extractContent(html, url, {
+    const { article, metadata: extractedMeta, document, } = extractContentWithDocument(html, url, {
         extractArticle: true,
         ...(signal ? { signal } : {}),
     });
+    const originalDocument = parseHTML(html).document;
     const useArticleContent = article
-        ? shouldUseArticleContent(article, html, url)
+        ? shouldUseArticleContent(article, originalDocument, url)
         : false;
     return buildContentSource({
         html,
@@ -1492,6 +1089,7 @@ function resolveContentSource({ html, url, includeMetadata, signal, }) {
         extractedMeta,
         includeMetadata,
         useArticleContent,
+        document,
     });
 }
 function tryTransformRawStage(html, url, includeMetadata) {
@@ -1513,6 +1111,8 @@ function buildMarkdownFromContext(context, url, signal) {
     const content = runTransformStage(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
         url,
         ...(signal ? { signal } : {}),
+        ...(context.document ? { document: context.document } : {}),
+        ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
     }));
     return {
         markdown: content,
@@ -1606,6 +1206,12 @@ class WorkerPool {
     timeoutMs;
     queueMax;
     closed = false;
+    /**
+     * Build the FetchError used for every cancellation path in this pool.
+     *
+     * @param url - URL of the canceled request.
+     * @param stage - Label of the point where the abort was observed
+     *   (e.g. 'transform:dispatch').
+     * @returns A FetchError with message 'Request was canceled', code 499 and
+     *   details `{ reason: 'aborted', stage }`.
+     *   NOTE(review): 499 is presumably an HTTP-style status - confirm against
+     *   the FetchError constructor.
+     */
+    createAbortError(url, stage) {
+        return new FetchError('Request was canceled', url, 499, {
+            reason: 'aborted',
+            stage,
+        });
+    }
     ensureOpen() {
         if (this.closed) {
             throw new Error('Transform worker pool closed');
@@ -1614,10 +1220,7 @@ class WorkerPool {
+    /**
+     * Throw the pool's abort error when the given signal is already aborted.
+     * A missing signal (null/undefined) is treated as not aborted.
+     *
+     * @param signal - Optional abort signal; only `aborted` is read.
+     * @param url - Forwarded to createAbortError.
+     * @param stage - Forwarded to createAbortError.
+     * @throws The error returned by `createAbortError(url, stage)`.
+     */
     ensureNotAborted(signal, url, stage) {
         if (!signal?.aborted)
             return;
-        throw new FetchError('Request was canceled', url, 499, {
-            reason: 'aborted',
-            stage,
-        });
+        throw this.createAbortError(url, stage);
     }
     ensureQueueCapacity(url) {
         if (this.queue.length < this.queueMax)
@@ -1682,10 +1285,7 @@ class WorkerPool {
     abortInflightTask(id, url, workerIndex) {
         const slot = this.workers[workerIndex];
         this.cancelWorkerTask(slot, id);
-        this.failTask(id, new FetchError('Request was canceled', url, 499, {
-            reason: 'aborted',
-            stage: 'transform:signal-abort',
-        }));
+        this.failTask(id, this.createAbortError(url, 'transform:signal-abort'));
         if (slot) {
             this.restartWorker(workerIndex, slot);
         }
@@ -1695,10 +1295,7 @@ class WorkerPool {
         if (queuedIndex === -1)
             return;
         this.queue.splice(queuedIndex, 1);
-        reject(new FetchError('Request was canceled', url, 499, {
-            reason: 'aborted',
-            stage: 'transform:queued-abort',
-        }));
+        reject(this.createAbortError(url, 'transform:queued-abort'));
     }
     createWorkerSlot(worker) {
         return {
@@ -1854,10 +1451,7 @@ class WorkerPool {
         if (!task.signal?.aborted)
             return false;
         this.clearAbortListener(task.signal, task.abortListener);
-        task.reject(new FetchError('Request was canceled', task.url, 499, {
-            reason: 'aborted',
-            stage: 'transform:dispatch',
-        }));
+        task.reject(this.createAbortError(task.url, 'transform:dispatch'));
         return true;
     }
     markSlotBusy(slot, task) {