PyPI - html-to-markdown - Versions diffs - 2.0.0__cp310-abi3-macosx_11_0_arm64.whl - Mend

html-to-markdown 2.0.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (17) hide show

html_to_markdown/__init__.py +48 -0
html_to_markdown/__main__.py +16 -0
html_to_markdown/_html_to_markdown.abi3.so +0 -0
html_to_markdown/_rust.pyi +79 -0
html_to_markdown/api.py +100 -0
html_to_markdown/bin/html-to-markdown +0 -0
html_to_markdown/cli.py +9 -0
html_to_markdown/cli_proxy.py +144 -0
html_to_markdown/exceptions.py +81 -0
html_to_markdown/options.py +211 -0
html_to_markdown/py.typed +0 -0
html_to_markdown/v1_compat.py +161 -0
html_to_markdown-2.0.0.data/scripts/html-to-markdown +0 -0
html_to_markdown-2.0.0.dist-info/METADATA +422 -0
html_to_markdown-2.0.0.dist-info/RECORD +17 -0
html_to_markdown-2.0.0.dist-info/WHEEL +6 -0
html_to_markdown-2.0.0.dist-info/licenses/LICENSE +21 -0

html_to_markdown/__init__.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""html-to-markdown: Convert HTML to Markdown using Rust backend.
+This package provides high-performance HTML to Markdown conversion
+powered by Rust with a clean Python API.
+V2 API (current):
+    from html_to_markdown import convert, ConversionOptions
+    options = ConversionOptions(heading_style="atx")
+    markdown = convert(html, options)
+V1 API (backward compatibility):
+    from html_to_markdown import convert_to_markdown
+    markdown = convert_to_markdown(html, heading_style="atx")
+"""
+from html_to_markdown.api import convert
+from html_to_markdown.exceptions import (
+    ConflictingOptionsError,
+    EmptyHtmlError,
+    HtmlToMarkdownError,
+    InvalidParserError,
+    MissingDependencyError,
+)
+from html_to_markdown.options import (
+    ConversionOptions,
+    ParsingOptions,
+    PreprocessingOptions,
+)
+from html_to_markdown.v1_compat import convert_to_markdown, convert_to_markdown_stream, markdownify
+__all__ = [
+    "ConflictingOptionsError",
+    "ConversionOptions",
+    "EmptyHtmlError",
+    "HtmlToMarkdownError",
+    "InvalidParserError",
+    "MissingDependencyError",
+    "ParsingOptions",
+    "PreprocessingOptions",
+    "convert",
+    "convert_to_markdown",
+    "convert_to_markdown_stream",
+    "markdownify",
+]
+__version__ = "2.0.0"

html_to_markdown/__main__.py ADDED Viewed

@@ -0,0 +1,16 @@
+import sys
+from html_to_markdown.cli_proxy import main
+def cli() -> None:
+    """Run the CLI proxy with the process arguments and print its output.
+    Passes ``sys.argv[1:]`` to ``main`` and writes the returned text to stdout
+    without appending a newline. A ``ValueError`` or ``FileNotFoundError`` raised
+    by ``main`` is printed to stderr and the process exits with status 1.
+    """
+    try:
+        result = main(sys.argv[1:])
+        print(result, end="")  # noqa: T201
+    except (ValueError, FileNotFoundError) as e:
+        print(str(e), file=sys.stderr)  # noqa: T201
+        sys.exit(1)
+if __name__ == "__main__":
+    cli()

html_to_markdown/_html_to_markdown.abi3.so ADDED Viewed

Binary file

html_to_markdown/_rust.pyi ADDED Viewed

@@ -0,0 +1,79 @@
+# Type stub for the options class that api.py constructs as ``_rust.ConversionOptions``.
+# NOTE(review): api.py also passes escape_ascii, hocr_extract_tables,
+# hocr_table_column_threshold, hocr_table_row_threshold_ratio, code_block_style,
+# keep_inline_images_in, debug and strip_tags, none of which are declared here;
+# the defaults below also differ from the dataclass defaults in options.py.
+# Confirm this stub against the compiled binding.
+class ConversionOptions:
+    heading_style: str
+    list_indent_type: str
+    list_indent_width: int
+    bullets: str
+    strong_em_symbol: str
+    escape_asterisks: bool
+    escape_underscores: bool
+    escape_misc: bool
+    code_language: str
+    autolinks: bool
+    default_title: bool
+    br_in_tables: bool
+    highlight_style: str
+    extract_metadata: bool
+    whitespace_mode: str
+    strip_newlines: bool
+    wrap: bool
+    wrap_width: int
+    convert_as_inline: bool
+    sub_symbol: str
+    sup_symbol: str
+    newline_style: str
+    preprocessing: PreprocessingOptions
+    parsing: ParsingOptions
+    def __init__(
+        self,
+        heading_style: str = "underlined",
+        list_indent_type: str = "spaces",
+        list_indent_width: int = 4,
+        bullets: str = "*+-",
+        strong_em_symbol: str = "*",
+        escape_asterisks: bool = True,
+        escape_underscores: bool = True,
+        escape_misc: bool = True,
+        code_language: str = "",
+        autolinks: bool = True,
+        default_title: bool = False,
+        br_in_tables: bool = False,
+        highlight_style: str = "double-equal",
+        extract_metadata: bool = True,
+        whitespace_mode: str = "normalized",
+        strip_newlines: bool = False,
+        wrap: bool = False,
+        wrap_width: int = 80,
+        convert_as_inline: bool = False,
+        sub_symbol: str = "",
+        sup_symbol: str = "",
+        newline_style: str = "spaces",
+        preprocessing: PreprocessingOptions | None = None,
+        parsing: ParsingOptions | None = None,
+    ) -> None: ...
+# Type stub for the preprocessing options class that api.py constructs as
+# ``_rust.PreprocessingOptions`` with exactly these four keyword arguments.
+class PreprocessingOptions:
+    enabled: bool
+    preset: str
+    remove_navigation: bool
+    remove_forms: bool
+    def __init__(
+        self,
+        enabled: bool = False,
+        preset: str = "standard",
+        remove_navigation: bool = True,
+        remove_forms: bool = True,
+    ) -> None: ...
+# Type stub for the parsing options class that api.py constructs as
+# ``_rust.ParsingOptions`` with ``encoding`` and ``parser`` keyword arguments.
+class ParsingOptions:
+    encoding: str
+    parser: str | None
+    def __init__(
+        self,
+        encoding: str = "utf-8",
+        parser: str | None = None,
+    ) -> None: ...
+# Type stub for the conversion entry point; api.py calls it as
+# ``_rust.convert(html, rust_options)`` and treats the result as ``str``.
+def convert(html: str, options: ConversionOptions | None = None) -> str: ...

html_to_markdown/api.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""New v2 functional API for HTML to Markdown conversion.
+This module provides the new functional API with dataclass-based options,
+using the Rust backend for conversion.
+"""
+from __future__ import annotations
+import html_to_markdown._html_to_markdown as _rust  # type: ignore[import-not-found]
+from html_to_markdown.options import (
+    ConversionOptions,
+    ParsingOptions,
+    PreprocessingOptions,
+)
+def convert(
+    html: str,
+    options: ConversionOptions | None = None,
+    preprocessing: PreprocessingOptions | None = None,
+    parsing: ParsingOptions | None = None,
+) -> str:
+    """Convert HTML to Markdown using Rust backend.
+    This is the main entry point for the v2 API. It copies the Python dataclass
+    options field by field into the corresponding ``_rust`` option objects and
+    calls ``_rust.convert``.
+    Only the fields named in the calls below are forwarded. ``code_language_callback``,
+    ``convert`` and ``custom_converters`` on ``options``, ``excluded_navigation_classes``
+    and ``extra_navigation_classes`` on ``preprocessing``, and ``detect_encoding`` on
+    ``parsing`` are not passed to the backend by this function.
+    Args:
+        html: HTML string to convert
+        options: Conversion options (a default ``ConversionOptions()`` is used if None)
+        preprocessing: HTML preprocessing options (a default ``PreprocessingOptions()`` is used if None)
+        parsing: HTML parsing options (a default ``ParsingOptions()`` is used if None)
+    Returns:
+        Markdown string
+    Example:
+        >>> from html_to_markdown import convert, ConversionOptions
+        >>> options = ConversionOptions(heading_style="atx", list_indent_width=2)
+        >>> markdown = convert("<h1>Title</h1>", options)
+        >>> print(markdown)
+        # Title
+        <BLANKLINE>
+    """
+    if options is None:
+        options = ConversionOptions()
+    if preprocessing is None:
+        preprocessing = PreprocessingOptions()
+    if parsing is None:
+        parsing = ParsingOptions()
+    rust_preprocessing = _rust.PreprocessingOptions(
+        enabled=preprocessing.enabled,
+        preset=preprocessing.preset,
+        remove_navigation=preprocessing.remove_navigation,
+        remove_forms=preprocessing.remove_forms,
+    )
+    rust_parsing = _rust.ParsingOptions(
+        encoding=parsing.encoding,
+        parser=parsing.parser,
+    )
+    rust_options = _rust.ConversionOptions(
+        heading_style=options.heading_style,
+        list_indent_type=options.list_indent_type,
+        list_indent_width=options.list_indent_width,
+        bullets=options.bullets,
+        strong_em_symbol=options.strong_em_symbol,
+        escape_asterisks=options.escape_asterisks,
+        escape_underscores=options.escape_underscores,
+        escape_misc=options.escape_misc,
+        escape_ascii=options.escape_ascii,
+        code_language=options.code_language,
+        autolinks=options.autolinks,
+        default_title=options.default_title,
+        br_in_tables=options.br_in_tables,
+        hocr_extract_tables=options.hocr_extract_tables,
+        hocr_table_column_threshold=options.hocr_table_column_threshold,
+        hocr_table_row_threshold_ratio=options.hocr_table_row_threshold_ratio,
+        highlight_style=options.highlight_style,
+        extract_metadata=options.extract_metadata,
+        whitespace_mode=options.whitespace_mode,
+        strip_newlines=options.strip_newlines,
+        wrap=options.wrap,
+        wrap_width=options.wrap_width,
+        convert_as_inline=options.convert_as_inline,
+        sub_symbol=options.sub_symbol,
+        sup_symbol=options.sup_symbol,
+        newline_style=options.newline_style,
+        code_block_style=options.code_block_style,
+        keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],  # set -> list; None or empty becomes []
+        preprocessing=rust_preprocessing,
+        parsing=rust_parsing,
+        debug=options.debug,
+        strip_tags=list(options.strip_tags) if options.strip_tags else [],  # set -> list; None or empty becomes []
+    )
+    result: str = _rust.convert(html, rust_options)
+    return result

html_to_markdown/bin/html-to-markdown ADDED Viewed

Binary file

html_to_markdown/cli.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""CLI wrapper that proxies to Rust CLI binary.
+This module provides backwards compatibility for code that imports
+from html_to_markdown.cli. The actual CLI implementation is in Rust.
+"""
+from html_to_markdown.cli_proxy import main
+__all__ = ["main"]

html_to_markdown/cli_proxy.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""CLI proxy that calls the Rust CLI binary.
+This module provides a Python wrapper around the Rust CLI binary,
+allowing the Python package to use the high-performance Rust implementation
+for command-line operations. It also provides v1 -> v2 CLI argument translation.
+"""
+import subprocess
+import sys
+from pathlib import Path
+from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
+def find_cli_binary() -> Path:
+    """Find the html-to-markdown CLI binary.
+    The binary is named ``html-to-markdown.exe`` when ``sys.platform`` is
+    ``"win32"`` and ``html-to-markdown`` otherwise. Candidate locations are
+    checked in this order and the first existing regular file wins:
+    1. ``<parent of this package>/target/release/<binary>``
+    2. ``<this package>/bin/<binary>``
+    3. ``<this package>/<binary>``
+    Returns:
+        Path to the CLI binary
+    Raises:
+        FileNotFoundError: If the binary cannot be found
+    """
+    binary_name = "html-to-markdown.exe" if sys.platform == "win32" else "html-to-markdown"
+    possible_locations = [
+        Path(__file__).parent.parent / "target" / "release" / binary_name,
+        Path(__file__).parent / "bin" / binary_name,
+        Path(__file__).parent / binary_name,
+    ]
+    for location in possible_locations:
+        if location.exists() and location.is_file():
+            return location
+    msg = "html-to-markdown CLI binary not found. Please install or build the package."
+    raise FileNotFoundError(msg)
+def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
+    """Translate v1 CLI arguments to v2 Rust CLI arguments.
+    Each argument is examined independently, in order:
+    - ``--strip`` and ``--convert`` raise ``RemovedV1FlagError``
+    - the ``--no-*`` flags listed below are dropped from the output
+    - ``--preprocess-html`` is renamed to ``--preprocess``
+    - every other argument, including the positive boolean flags, is copied unchanged
+    Because arguments are not paired with their values, a value that happens to
+    equal one of the flags above is treated like that flag.
+    Args:
+        argv: v1 CLI arguments
+    Returns:
+        Translated v2 CLI arguments
+    Raises:
+        RemovedV1FlagError: If a v1 flag has been removed in v2
+    """
+    translated = []
+    i = 0
+    while i < len(argv):
+        arg = argv[i]
+        # Error on removed/unsupported v1 features
+        if arg in ("--strip", "--convert"):
+            raise RemovedV1FlagError(
+                flag=arg,
+                reason=f"{arg} option has been removed in v2.",
+                migration="Remove this flag from your command. The feature is no longer available.",
+            )
+        # These flags are redundant (match v2 defaults) but we accept them for v1 compatibility
+        # Silently skip - no RedundantV1FlagError is raised for them here
+        if arg in (
+            "--no-escape-asterisks",
+            "--no-escape-underscores",
+            "--no-escape-misc",
+            "--no-wrap",
+            "--no-autolinks",
+            "--no-extract-metadata",
+        ):
+            # Skip this flag - matches Rust CLI defaults
+            pass
+        # Flag name translations
+        elif arg == "--preprocess-html":
+            translated.append("--preprocess")
+        # Positive flags are passed through (same effect as the final else branch)
+        elif arg in (
+            "--escape-asterisks",
+            "--escape-underscores",
+            "--escape-misc",
+            "--autolinks",
+            "--extract-metadata",
+            "--wrap",
+        ):
+            translated.append(arg)
+        # All other args pass through unchanged
+        else:
+            translated.append(arg)
+        i += 1
+    return translated
+def main(argv: list[str]) -> str:
+    """Run the Rust CLI with the given arguments.
+    Locates the binary, translates v1 CLI arguments to v2 format, runs the binary
+    with captured text output, and returns its stdout.
+    Exits with status 1 (after writing to stderr) when argument translation raises
+    RemovedV1FlagError, RedundantV1FlagError or ValueError. When the binary returns a
+    non-zero status, its stderr is forwarded and the process exits with that status.
+    Args:
+        argv: Command line arguments (without program name)
+    Returns:
+        Output from the CLI
+    Raises:
+        FileNotFoundError: If the binary cannot be found; the lookup runs before the
+            ``try`` block, so this propagates to the caller instead of exiting here
+    """
+    cli_binary = find_cli_binary()
+    try:
+        translated_args = translate_v1_args_to_v2(argv)
+    except (RemovedV1FlagError, RedundantV1FlagError) as e:
+        # Format the error nicely for CLI users
+        sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
+        sys.stderr.write(f"   {e.reason}\n\n")
+        sys.stderr.write(f"   💡 {e.migration}\n\n")
+        sys.exit(1)
+    except ValueError as e:
+        sys.stderr.write(f"Error: {e}\n")
+        sys.exit(1)
+    # Arguments are passed as a list, so no shell is involved in running the binary
+    result = subprocess.run(  # noqa: S603
+        [str(cli_binary), *translated_args],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        sys.stderr.write(result.stderr)
+        sys.exit(result.returncode)
+    return result.stdout

html_to_markdown/exceptions.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""Exception classes for html-to-markdown."""
+from __future__ import annotations
+class HtmlToMarkdownError(Exception):
+    """Base exception for html-to-markdown errors.
+    Every other exception class in this module derives from it, directly or indirectly.
+    """
+class MissingDependencyError(HtmlToMarkdownError):
+    """Raised when a required dependency is not installed.
+    Args:
+        dependency: Name of the missing dependency; stored as ``self.dependency``
+        install_command: Optional command appended to the message as an install hint;
+            stored as ``self.install_command``
+    """
+    def __init__(self, dependency: str, install_command: str | None = None) -> None:
+        self.dependency = dependency
+        self.install_command = install_command
+        message = f"{dependency} is not installed."
+        if install_command:
+            message += f" Install with: {install_command}"
+        super().__init__(message)
+class InvalidParserError(HtmlToMarkdownError):
+    """Raised when an invalid HTML parser is specified.
+    Args:
+        parser: The rejected parser name; stored as ``self.parser``
+        available_parsers: Parser names listed in the message, comma separated;
+            stored as ``self.available_parsers``
+    """
+    def __init__(self, parser: str, available_parsers: list[str]) -> None:
+        self.parser = parser
+        self.available_parsers = available_parsers
+        message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
+        super().__init__(message)
+class EmptyHtmlError(HtmlToMarkdownError):
+    """Raised when the input HTML is empty.
+    Takes no arguments; the message is fixed.
+    """
+    def __init__(self) -> None:
+        super().__init__("The input HTML is empty.")
+class ConflictingOptionsError(HtmlToMarkdownError):
+    """Raised when conflicting options are specified.
+    Args:
+        option1: Name of the first option; stored as ``self.option1``
+        option2: Name of the second option; stored as ``self.option2``
+    """
+    def __init__(self, option1: str, option2: str) -> None:
+        self.option1 = option1
+        self.option2 = option2
+        super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
+class InvalidEncodingError(HtmlToMarkdownError):
+    """Raised when an invalid encoding is specified.
+    Args:
+        encoding: The rejected encoding name; it appears in the message only and,
+            unlike the other exceptions in this module, is not stored as an attribute
+    """
+    def __init__(self, encoding: str) -> None:
+        super().__init__(f"The specified encoding ({encoding}) is not valid.")
+class UnsupportedV1FeatureError(HtmlToMarkdownError):
+    """Raised when a v1 feature is not supported in v2.
+    The three arguments are stored as attributes of the same name and combined
+    into a multi-line exception message.
+    Args:
+        flag: The CLI flag or feature that is not supported
+        reason: Why the feature is not supported
+        migration: How to migrate away from this feature
+    """
+    def __init__(self, flag: str, reason: str, migration: str) -> None:
+        self.flag = flag
+        self.reason = reason
+        self.migration = migration
+        message = f"'{flag}' is not supported in v2.\n\nReason: {reason}\n\nMigration: {migration}"
+        super().__init__(message)
+class RemovedV1FlagError(UnsupportedV1FeatureError):
+    """Raised when a CLI flag has been completely removed in v2.
+    Adds no behavior of its own; it inherits the ``flag``, ``reason`` and
+    ``migration`` constructor and attributes from UnsupportedV1FeatureError.
+    """
+class RedundantV1FlagError(UnsupportedV1FeatureError):
+    """Raised when a v1 flag is redundant in v2 because it's the default behavior.
+    Adds no behavior of its own; it inherits the ``flag``, ``reason`` and
+    ``migration`` constructor and attributes from UnsupportedV1FeatureError.
+    """

html_to_markdown/options.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""Configuration options for HTML to Markdown conversion.
+This module provides dataclass-based configuration for the v2 API.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, Protocol
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from bs4 import Tag
+class ConverterFunction(Protocol):
+    """Protocol for custom converter functions.
+    Converter functions receive keyword-only arguments including the HTML tag,
+    processed text content, and any conversion options needed.
+    ``Tag`` is imported from ``bs4`` only under ``TYPE_CHECKING``, so in this
+    module it is used for annotations only.
+    Example:
+        >>> def custom_link_converter(*, tag: Tag, text: str, autolinks: bool, **kwargs: Any) -> str:
+        ...     href = tag.get("href", "")
+        ...     return f"[{text}]({href})"
+    """
+    def __call__(self, *, tag: Tag, text: str, **kwargs: Any) -> str:
+        """Convert an HTML element to Markdown.
+        Args:
+            tag: BeautifulSoup Tag object representing the HTML element
+            text: Processed text content of the element's children
+            **kwargs: Additional conversion options (varies by converter)
+        Returns:
+            Markdown string representation of the element
+        """
+        ...
+@dataclass
+class ConversionOptions:
+    """Main conversion configuration.
+    This class groups all conversion-related options together, replacing
+    the large number of keyword arguments in the v1 API.
+    NOTE(review): ``code_language_callback``, ``convert`` and ``custom_converters``
+    are declared here but are not among the fields that ``api.convert`` forwards to
+    the Rust backend - confirm whether they have any effect.
+    Example:
+        >>> options = ConversionOptions(
+        ...     heading_style="atx",
+        ...     list_indent_width=2,
+        ...     escape_asterisks=True,
+        ... )
+        >>> from html_to_markdown import convert
+        >>> markdown = convert("<h1>Title</h1>", options)
+    """
+    heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
+    """Style for headings: 'atx' (#) is CommonMark default, 'underlined' (===), or 'atx_closed' (# #)."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation for lists."""
+    list_indent_width: int = 2
+    """Number of spaces for list indentation (CommonMark uses 2 spaces, ignored if list_indent_type='tabs')."""
+    bullets: str = "-*+"
+    """Characters to use for unordered list bullets (cycles through -, *, + for nested levels). CommonMark compliant."""
+    strong_em_symbol: Literal["*", "_"] = "*"
+    """Symbol for strong/emphasis formatting."""
+    escape_asterisks: bool = False
+    """Escape asterisk characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""
+    escape_underscores: bool = False
+    """Escape underscore characters in text to prevent accidental formatting. Default False for minimal escaping (CommonMark)."""
+    escape_misc: bool = False
+    """Escape miscellaneous Markdown characters. Default False for minimal escaping (CommonMark)."""
+    escape_ascii: bool = False
+    """Escape all ASCII punctuation (for CommonMark spec compliance tests). Disabled by default for minimal escaping."""
+    code_language: str = ""
+    """Default language for code blocks."""
+    code_language_callback: Callable[[Tag], str] | None = None
+    """Callback to determine code language from element."""
+    autolinks: bool = True
+    """Convert bare URLs to automatic links."""
+    default_title: bool = False
+    """Add a default title if none exists."""
+    keep_inline_images_in: set[str] | None = None
+    """Parent tag names where images should remain inline."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
+    hocr_extract_tables: bool = True
+    """Enable table extraction from hOCR (HTML-based OCR) documents."""
+    hocr_table_column_threshold: int = 50
+    """Pixel threshold for detecting column boundaries in hOCR tables."""
+    hocr_table_row_threshold_ratio: float = 0.5
+    """Row height ratio threshold for detecting row boundaries in hOCR tables."""
+    highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
+    """Style for highlighting <mark> elements."""
+    extract_metadata: bool = True
+    """Extract metadata from HTML head and include as comment."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """How to handle whitespace: 'normalized' or 'strict'."""
+    strip_newlines: bool = False
+    """Remove newlines from HTML before processing."""
+    wrap: bool = False
+    """Enable text wrapping."""
+    wrap_width: int = 80
+    """Column width for text wrapping."""
+    convert: set[str] | None = None
+    """HTML tags to convert to Markdown (None = all supported tags). v1 compatibility only."""
+    strip_tags: set[str] | None = None
+    """HTML tags to strip from output (output only text content, no markdown conversion)."""
+    convert_as_inline: bool = False
+    """Treat block elements as inline during conversion."""
+    sub_symbol: str = ""
+    """Symbol for subscript text."""
+    sup_symbol: str = ""
+    """Symbol for superscript text."""
+    newline_style: Literal["spaces", "backslash"] = "spaces"
+    """Style for newlines: 'spaces' (two trailing spaces, CommonMark default) or 'backslash' (\\). Both are equally CommonMark compliant."""
+    code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
+    """Style for code blocks: 'backticks' (```, better whitespace preservation), 'indented' (4 spaces), or 'tildes' (~~~). All are CommonMark compliant."""
+    custom_converters: dict[str, Callable[..., str]] | None = None
+    """Custom converter functions for specific HTML elements."""
+    debug: bool = False
+    """Enable debug mode with diagnostic warnings about unhandled elements and hOCR processing."""
+@dataclass
+class PreprocessingOptions:
+    """HTML preprocessing configuration.
+    Controls how HTML is cleaned and preprocessed before conversion.
+    NOTE(review): ``excluded_navigation_classes`` and ``extra_navigation_classes``
+    are declared here but are not among the fields that ``api.convert`` forwards to
+    the Rust backend - confirm whether they have any effect.
+    Example:
+        >>> options = PreprocessingOptions(
+        ...     enabled=True,
+        ...     preset="aggressive",
+        ...     remove_navigation=True,
+        ... )
+    """
+    enabled: bool = False
+    """Whether to enable HTML preprocessing (disabled by default for minimal transformation)."""
+    preset: Literal["minimal", "standard", "aggressive"] = "standard"
+    """Preprocessing aggressiveness level."""
+    remove_navigation: bool = True
+    """Remove navigation elements during preprocessing."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    excluded_navigation_classes: set[str] | None = None
+    """Navigation class fragments to keep even when removing navigation."""
+    extra_navigation_classes: set[str] | None = None
+    """Additional navigation class fragments to strip beyond defaults."""
+@dataclass
+class ParsingOptions:
+    """HTML parsing configuration.
+    Only ``encoding`` and ``parser`` are forwarded to the Rust backend by
+    ``api.convert``; ``detect_encoding`` is documented below as not yet implemented.
+    Example:
+        >>> options = ParsingOptions(
+        ...     encoding="utf-8",
+        ...     detect_encoding=True,
+        ... )
+    """
+    encoding: str = "utf-8"
+    """Character encoding for decoding bytes input."""
+    detect_encoding: bool = False
+    """Attempt to detect encoding from HTML (not yet implemented)."""
+    parser: str | None = None
+    """HTML parser to use: 'html.parser', 'lxml', or 'html5lib' (None = auto)."""

html_to_markdown/py.typed ADDED Viewed

File without changes

html_to_markdown/v1_compat.py ADDED Viewed

@@ -0,0 +1,161 @@
+"""V1 API compatibility layer.
+Provides backward compatibility for the v1 convert_to_markdown API
+by translating v1 kwargs to v2 ConversionOptions/PreprocessingOptions/ParsingOptions.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+from html_to_markdown import ConversionOptions, ParsingOptions, PreprocessingOptions
+from html_to_markdown import convert as convert_v2
+def convert_to_markdown(  # noqa: D417
+    html: str,
+    *,
+    heading_style: str = "underlined",
+    list_indent_type: str = "spaces",
+    list_indent_width: int = 4,
+    bullets: str = "*+-",
+    strong_em_symbol: str = "*",
+    escape_asterisks: bool = True,
+    escape_underscores: bool = True,
+    escape_misc: bool = True,
+    code_language: str = "",
+    autolinks: bool = True,
+    default_title: bool = False,
+    br_in_tables: bool = False,
+    hocr_extract_tables: bool = True,
+    hocr_table_column_threshold: int = 50,
+    hocr_table_row_threshold_ratio: float = 0.5,
+    highlight_style: str = "double-equal",
+    extract_metadata: bool = True,
+    whitespace_mode: str = "normalized",
+    strip_newlines: bool = False,
+    wrap: bool = False,
+    wrap_width: int = 80,
+    convert_as_inline: bool = False,
+    sub_symbol: str = "",
+    sup_symbol: str = "",
+    newline_style: str = "spaces",
+    keep_inline_images_in: set[str] | None = None,
+    preprocess: bool = False,
+    preprocessing_preset: str = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
+    parser: str = "html.parser",
+    source_encoding: str = "utf-8",
+    code_language_callback: object | None = None,
+    strip: list[str] | None = None,
+    convert: list[str] | None = None,
+    custom_converters: dict[str, object] | None = None,
+) -> str:
+    """Convert HTML to Markdown (v1 API compatibility).
+    This function provides backward compatibility with the v1 API by accepting
+    the same kwargs and translating them to v2 ConversionOptions,
+    PreprocessingOptions and ParsingOptions before calling the v2 ``convert``.
+    The keyword defaults here are the v1 defaults and differ from the v2
+    ConversionOptions defaults (for example ``heading_style="underlined"``,
+    ``list_indent_width=4``, ``bullets="*+-"`` and escaping enabled).
+    Renamed options: ``preprocess`` -> ``PreprocessingOptions.enabled``,
+    ``preprocessing_preset`` -> ``PreprocessingOptions.preset``,
+    ``source_encoding`` -> ``ParsingOptions.encoding``, ``strip`` -> ``strip_tags``.
+    Note: Some v1 options are not supported in v2:
+    - code_language_callback: Removed in v2
+    - convert: Removed in v2
+    - custom_converters: Not yet implemented in v2
+    Args:
+        html: HTML string to convert
+    Returns:
+        Markdown string
+    Raises:
+        NotImplementedError: If unsupported v1 options are provided
+    """
+    if code_language_callback is not None:
+        raise NotImplementedError(
+            "code_language_callback was removed in v2. Use the code_language option to set a default language."
+        )
+    if convert is not None:
+        raise NotImplementedError("convert option was removed in v2. All supported tags are converted by default.")
+    if custom_converters is not None:
+        raise NotImplementedError("custom_converters is not yet implemented in v2")
+    # V1 behavior: fenced (backticks) code blocks when code_language is set, indented otherwise
+    # The v2 ConversionOptions default is "backticks", so the style is always passed explicitly
+    code_block_style = "backticks" if code_language else "indented"
+    options = ConversionOptions(
+        heading_style=heading_style,  # type: ignore[arg-type]
+        list_indent_type=list_indent_type,  # type: ignore[arg-type]
+        list_indent_width=list_indent_width,
+        bullets=bullets,
+        strong_em_symbol=strong_em_symbol,  # type: ignore[arg-type]
+        escape_asterisks=escape_asterisks,
+        escape_underscores=escape_underscores,
+        escape_misc=escape_misc,
+        code_block_style=code_block_style,  # type: ignore[arg-type]
+        code_language=code_language,
+        autolinks=autolinks,
+        default_title=default_title,
+        br_in_tables=br_in_tables,
+        hocr_extract_tables=hocr_extract_tables,
+        hocr_table_column_threshold=hocr_table_column_threshold,
+        hocr_table_row_threshold_ratio=hocr_table_row_threshold_ratio,
+        highlight_style=highlight_style,  # type: ignore[arg-type]
+        extract_metadata=extract_metadata,
+        whitespace_mode=whitespace_mode,  # type: ignore[arg-type]
+        strip_newlines=strip_newlines,
+        wrap=wrap,
+        wrap_width=wrap_width,
+        convert_as_inline=convert_as_inline,
+        sub_symbol=sub_symbol,
+        sup_symbol=sup_symbol,
+        newline_style=newline_style,  # type: ignore[arg-type]
+        keep_inline_images_in=keep_inline_images_in,
+        strip_tags=set(strip) if strip else None,
+    )
+    preprocessing = PreprocessingOptions(
+        enabled=preprocess,
+        preset=preprocessing_preset,  # type: ignore[arg-type]
+        remove_navigation=remove_navigation,
+        remove_forms=remove_forms,
+    )
+    parsing = ParsingOptions(
+        encoding=source_encoding,
+        parser=parser,
+    )
+    return convert_v2(html, options, preprocessing, parsing)
+def convert_to_markdown_stream(  # noqa: D417
+    html: str,
+    *,
+    chunk_size: int = 4096,
+    **kwargs: object,
+) -> Iterator[str]:
+    """Stream HTML to Markdown conversion (v1 API).
+    Note: Streaming was removed in v2. This function never returns an iterator:
+    its body contains no ``yield``, so the error is raised as soon as it is
+    called, not when the result is iterated. All arguments are ignored.
+    Args:
+        html: HTML string to convert
+        chunk_size: Size of chunks to yield (not used in v2)
+    Raises:
+        NotImplementedError: Streaming was removed in v2
+    """
+    raise NotImplementedError(
+        "Streaming API (convert_to_markdown_stream) was removed in v2 (html5ever does not support streaming). "
+        "Use convert_to_markdown() instead."
+    )
+markdownify = convert_to_markdown
+__all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]

html_to_markdown-2.0.0.data/scripts/html-to-markdown ADDED Viewed

Binary file

html_to_markdown-2.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,422 @@
+Metadata-Version: 2.4
+Name: html-to-markdown
+Version: 2.0.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Rust
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Markup
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Typing :: Typed
+License-File: LICENSE
+Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
+Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
+Home-Page: https://github.com/Goldziher/html-to-markdown
+Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
+License: MIT
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
+Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
+Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
+Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
+# html-to-markdown
+High-performance HTML to Markdown converter: a Rust crate and CLI with Python bindings. Available via PyPI, Homebrew, and Cargo. Cross-platform support for Linux, macOS, and Windows.
+[![PyPI version](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
+[![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
+[![Python Versions](https://img.shields.io/pypi/pyversions/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
+[![Documentation](https://img.shields.io/badge/docs-github-blue)](https://github.com/Goldziher/html-to-markdown)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
+Part of the [Kreuzberg](https://kreuzberg.dev) ecosystem for document intelligence.
+**📚 [Full V2 Documentation](crates/html-to-markdown/README.md)** - Comprehensive guide for Rust, Python, and CLI usage.
+## ⚡ Benchmarks
+### Throughput (Python API)
+Real Wikipedia documents on Apple M1 Pro:
+| Document            | Size  | Latency | Throughput | Docs/sec |
+| ------------------- | ----- | ------- | ---------- | -------- |
+| Lists (Timeline)    | 129KB | 0.62ms  | 208 MB/s   | 1,613    |
+| Tables (Countries)  | 360KB | 2.02ms  | 178 MB/s   | 495      |
+| Mixed (Python wiki) | 656KB | 4.56ms  | 144 MB/s   | 219      |
+**Throughput stays within 144-208 MB/s** across all document sizes tested.
+### Memory Usage
+| Document Size | Memory Delta | Peak RSS | Leak Detection |
+| ------------- | ------------ | -------- | -------------- |
+| 10KB          | < 2 MB       | < 20 MB  | ✅ None        |
+| 50KB          | < 8 MB       | < 35 MB  | ✅ None        |
+| 500KB         | < 40 MB      | < 80 MB  | ✅ None        |
+Memory usage is linear and stable across 50+ repeated conversions.
+**V2 is 19-30x faster** than the v1 Python/BeautifulSoup implementation.
+📊 **[Benchmark Results](BENCHMARK_RESULTS.md)** - Detailed Python API comparison
+📈 **[Performance Analysis](PERFORMANCE.md)** - Rust core benchmarks and profiling
+🔧 **[Benchmarking Guide](BENCHMARKS.md)** - How to run benchmarks
+✅ **[CommonMark Compliance](COMMONMARK_COMPLIANCE.md)** - CommonMark specification compliance
+## Features
+- **🚀 Blazing Fast**: Pure Rust core with ultra-fast `tl` HTML parser
+- **🐍 Python Bindings**: Clean Python API via PyO3 with full type hints
+- **🦀 Native CLI**: Rust CLI binary with comprehensive options
+- **📊 hOCR 1.2 Compliant**: Full support for all 40+ elements and 20+ properties
+- **📝 CommonMark Compliant**: Follows CommonMark specification for list formatting
+- **🎯 Type Safe**: Full type hints and `.pyi` stubs for excellent IDE support
+- **🌍 Cross-Platform**: Wheels for Linux, macOS, Windows (x86_64 + ARM64)
+- **✅ Well-Tested**: 900+ tests with dual Python + Rust coverage
+## Installation
+> **📦 Package Names**: Due to a naming conflict on crates.io, the Rust crate is published as `html-to-markdown-rs`, while the Python package remains `html-to-markdown` on PyPI. The CLI binary name is `html-to-markdown` for both.
+### Python Package
+```bash
+pip install html-to-markdown
+```
+### Rust Library
+```bash
+cargo add html-to-markdown-rs
+```
+### CLI Binary
+#### via Homebrew (macOS/Linux)
+```bash
+brew tap goldziher/tap
+brew install html-to-markdown
+```
+#### via Cargo
+```bash
+cargo install html-to-markdown-cli
+```
+#### Direct Download
+Download pre-built binaries from [GitHub Releases](https://github.com/Goldziher/html-to-markdown/releases).
+## Quick Start
+### Python API
+Clean, type-safe configuration with dataclasses:
+```python
+from html_to_markdown import convert, ConversionOptions
+html = """
+<h1>Welcome</h1>
+<p>This is <strong>fast</strong> Rust-powered conversion!</p>
+<ul>
+    <li>Blazing fast</li>
+    <li>Type safe</li>
+    <li>Easy to use</li>
+</ul>
+"""
+options = ConversionOptions(
+    heading_style="atx",
+    strong_em_symbol="*",
+    bullets="*+-",
+)
+markdown = convert(html, options)
+print(markdown)
+```
+Output:
+```markdown
+# Welcome
+This is **fast** Rust-powered conversion!
+* Blazing fast
++ Type safe
+- Easy to use
+```
+### Rust API
+```rust
+use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
+fn main() {
+    let html = r#"
+        <h1>Welcome</h1>
+        <p>This is <strong>fast</strong> conversion!</p>
+    "#;
+    let options = ConversionOptions {
+        heading_style: HeadingStyle::Atx,
+        ..Default::default()
+    };
+    let markdown = convert(html, Some(options)).unwrap();
+    println!("{}", markdown);
+}
+```
+### CLI Usage
+```bash
+# Convert file
+html-to-markdown input.html > output.md
+# From stdin
+cat input.html | html-to-markdown > output.md
+# With options
+html-to-markdown --heading-style atx --list-indent-width 2 input.html
+# Clean web-scraped content
+html-to-markdown \
+    --preprocess \
+    --preset aggressive \
+    --no-extract-metadata \
+    scraped.html > clean.md
+```
+## Configuration
+### Python: Dataclass Configuration
+```python
+from html_to_markdown import (
+    convert,
+    ConversionOptions,
+    PreprocessingOptions,
+)
+# Conversion settings
+options = ConversionOptions(
+    heading_style="atx",  # "atx", "atx_closed", "underlined"
+    list_indent_width=2,  # Discord/Slack: use 2
+    bullets="*+-",  # Bullet characters
+    strong_em_symbol="*",  # "*" or "_"
+    escape_asterisks=True,  # Escape * in text
+    code_language="python",  # Default code block language
+    extract_metadata=True,  # Extract HTML metadata
+    highlight_style="double-equal",  # "double-equal", "html", "bold"
+)
+# HTML preprocessing
+preprocessing = PreprocessingOptions(
+    enabled=True,
+    preset="standard",  # "minimal", "standard", "aggressive"
+    remove_navigation=True,
+    remove_forms=True,
+)
+markdown = convert(html, options, preprocessing)
+```
+### Python: Legacy API (v1 compatibility)
+For backward compatibility with existing v1 code:
+```python
+from html_to_markdown import convert_to_markdown
+markdown = convert_to_markdown(
+    html,
+    heading_style="atx",
+    list_indent_width=2,
+    preprocess=True,
+    preprocessing_preset="standard",
+)
+```
+## Common Use Cases
+### Discord/Slack Compatible Lists
+```python
+from html_to_markdown import convert, ConversionOptions
+options = ConversionOptions(list_indent_width=2)
+markdown = convert(html, options)
+```
+### Clean Web-Scraped HTML
+```python
+from html_to_markdown import convert, PreprocessingOptions
+preprocessing = PreprocessingOptions(
+    enabled=True,
+    preset="aggressive",  # Heavy cleaning
+    remove_navigation=True,
+    remove_forms=True,
+)
+markdown = convert(html, preprocessing=preprocessing)
+```
+### hOCR 1.2 Support
+**Complete hOCR 1.2 specification compliance** with support for all elements, properties, and metadata:
+```python
+from html_to_markdown import convert, ConversionOptions
+# Option 1: Document structure extraction (NEW in v2)
+# Extracts all hOCR elements and converts to structured markdown
+# Supports: paragraphs, sections, chapters, headers/footers, images, math, etc.
+markdown = convert(hocr_html)
+# Option 2: Legacy table extraction (spatial reconstruction)
+# Reconstructs tables from word bounding boxes
+options = ConversionOptions(
+    hocr_extract_tables=True,
+    hocr_table_column_threshold=50,
+    hocr_table_row_threshold_ratio=0.5,
+)
+markdown = convert(hocr_html, options)
+```
+**Full hOCR 1.2 Spec Coverage:**
+- ✅ **All 40 Element Types** - Logical structure (12), typesetting (6), float (13), inline (6), engine-specific (3)
+- ✅ **All 20+ Properties** - bbox, baseline, textangle, poly, x_wconf, x_confs, x_font, x_fsize, order, cflow, cuts, x_bboxes, image, ppageno, lpageno, scan_res, and more
+- ✅ **All 5 Metadata Fields** - ocr-system, ocr-capabilities, ocr-number-of-pages, ocr-langs, ocr-scripts
+- ✅ **37 Tests** - Complete coverage of all elements and properties
+**Semantic Markdown Conversion:**
+| Element Category | Examples                        | Markdown Output                           |
+| ---------------- | ------------------------------- | ----------------------------------------- |
+| Headings         | `ocr_title`, `ocr_chapter`      | `# Heading`                               |
+| Sections         | `ocr_section`, `ocr_subsection` | `##`, `###`                               |
+| Structure        | `ocr_par`, `ocr_blockquote`     | Paragraphs, `> quotes`                    |
+| Metadata         | `ocr_abstract`, `ocr_author`    | `**Abstract**`, `*Author*`                |
+| Floats           | `ocr_header`, `ocr_footer`      | `*Header*`, `*Footer*`                    |
+| Images           | `ocr_image`, `ocr_photo`        | `![alt](path)` with image property        |
+| Math             | `ocr_math`, `ocr_display`       | `` `formula` ``, ```` ```equation``` ```` |
+| Layout           | `ocr_separator`                 | `---` horizontal rule                     |
+| Inline           | `ocrx_word`, `ocr_dropcap`      | Text, `**Letter**`                        |
+**HTML Entity Handling:** Automatically decodes `&quot;`, `&apos;`, `&lt;`, `&gt;`, `&amp;` in title attributes for proper property parsing.
+## Configuration Reference
+**V2 Defaults (CommonMark-compliant):**
+- `list_indent_width`: 2 (CommonMark standard)
+- `bullets`: "\*+-" (cycles through `*`, `+`, `-` for nested levels)
+- `escape_asterisks`: false (minimal escaping)
+- `escape_underscores`: false (minimal escaping)
+- `escape_misc`: false (minimal escaping)
+- `newline_style`: "spaces" (CommonMark: two trailing spaces)
+- `code_block_style`: "backticks" (fenced code blocks with \`\`\`, better whitespace preservation)
+- `heading_style`: "atx" (CommonMark: `#`)
+- `preprocessing.enabled`: false (no preprocessing by default)
+For complete configuration reference, see **[Full Documentation](crates/html-to-markdown/README.md#configuration-reference)**.
+## Upgrading from v1.x
+### Backward Compatibility
+Existing v1 code works without changes:
+```python
+from html_to_markdown import convert_to_markdown
+markdown = convert_to_markdown(html, heading_style="atx")  # Still works!
+```
+### Modern API (Recommended)
+For new projects, use the dataclass-based API:
+```python
+from html_to_markdown import convert, ConversionOptions
+options = ConversionOptions(heading_style="atx", list_indent_width=2)
+markdown = convert(html, options)
+```
+### What Changed in v2
+**Core Rewrite:**
+- Complete Rust rewrite using `tl` HTML parser
+- 19-30x performance improvement over v1
+- CommonMark-compliant defaults (2-space indents, minimal escaping, ATX headings)
+- No BeautifulSoup or lxml dependencies
+**Removed Features:**
+- `code_language_callback` - use `code_language` for default language
+- `strip` / `convert` options - use `strip_tags` or preprocessing
+- `convert_to_markdown_stream()` - not supported in v2
+**Planned:**
+- `custom_converters` - planned for future release
+See **[CHANGELOG.md](CHANGELOG.md)** for complete v1 vs v2 comparison and migration guide.
+## Kreuzberg Ecosystem
+html-to-markdown is part of the [Kreuzberg](https://kreuzberg.dev) ecosystem, a comprehensive framework for document intelligence and processing. While html-to-markdown focuses on converting HTML to Markdown with maximum performance, Kreuzberg provides a complete solution for:
+- **Document Extraction**: Extract text, images, and metadata from 50+ document formats
+- **OCR Processing**: Multiple OCR backends (Tesseract, EasyOCR, PaddleOCR)
+- **Table Extraction**: Vision-based and OCR-based table detection
+- **Document Classification**: Automatic detection of contracts, forms, invoices, etc.
+- **RAG Pipelines**: Integration with retrieval-augmented generation workflows
+Learn more at [kreuzberg.dev](https://kreuzberg.dev) or join our [Discord community](https://discord.gg/pXxagNK2zN).
+## Contributing
+See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing, and contribution guidelines.
+## License
+MIT License - see [LICENSE](LICENSE) for details.
+## Acknowledgments
+Version 1 started as a fork of [markdownify](https://pypi.org/project/markdownify/), rewritten, extended, and enhanced with better typing and features. Version 2 is a complete Rust rewrite for high performance.
+## Support
+If you find this library useful, consider:
+<a href="https://github.com/sponsors/Goldziher">
+  <img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor" height="32">
+</a>
+Your support helps maintain and improve this library!

html_to_markdown-2.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+html_to_markdown-2.0.0.data/scripts/html-to-markdown,sha256=brVDlJvTJykaMtqOu_ls037UVWdX2UgNIXKcQhmnPTE,3734448
+html_to_markdown-2.0.0.dist-info/RECORD,,
+html_to_markdown-2.0.0.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
+html_to_markdown-2.0.0.dist-info/METADATA,sha256=UbI9PqMoGe0EC3j_1Y6NeTgCU7b3fJ9jW_MOkDAM7yM,14285
+html_to_markdown-2.0.0.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
+html_to_markdown/options.py,sha256=LXOUDqWwuvC-ryE118LttnATDO6-rlogYbbEGVfynhM,7241
+html_to_markdown/_html_to_markdown.abi3.so,sha256=Ff4tPbv2sLfflQxy1P07Yl9put-HpiHKKbNGRujaBug,2989792
+html_to_markdown/__init__.py,sha256=0r7a2ruI_9xqj0Ko-5O4yCGrQ4Nga89qSUY4lTSyiDE,1266
+html_to_markdown/api.py,sha256=0KgVWCDX-pWxrADxxxnqzk5_IhYc4fDxRytgeHttCKQ,3620
+html_to_markdown/_rust.pyi,sha256=6GZ5fXfQ7VqglKB-kSZ395cysOdLIdQidDq6yoAHICA,2141
+html_to_markdown/v1_compat.py,sha256=ThGk8g5rsZ_2gO1pA4_VThiLKuNhu4injClyv2pQmg4,5521
+html_to_markdown/cli.py,sha256=OW6GZAR7adSOfqSaRGx5YqNU3xChAkwG98WHcRhL5ss,254
+html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html_to_markdown/exceptions.py,sha256=0Yrzndw1kSqN-HMnE34TjZzo21iihiD1TZG1k2dmpdI,2626
+html_to_markdown/cli_proxy.py,sha256=nuBMky_q_ArDUKGgWW6Vrxf2JwOa_RgmUPH8qYBIcRQ,4298
+html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
+html_to_markdown/bin/html-to-markdown,sha256=brVDlJvTJykaMtqOu_ls037UVWdX2UgNIXKcQhmnPTE,3734448

html_to_markdown-2.0.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: maturin (1.9.6)
+Root-Is-Purelib: false
+Tag: cp310-abi3-macosx_11_0_arm64
+Generator: delocate 0.13.0

html_to_markdown-2.0.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright 2024-2025 Na'aman Hirschfeld
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.