PyPI - html-to-markdown - Versions diffs - 2.14.4__cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - Mend

html-to-markdown 2.14.4__cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. More details are linked from the original page.

Files changed (17)

html_to_markdown/__init__.py +62 -0
html_to_markdown/__main__.py +16 -0
html_to_markdown/_html_to_markdown.abi3.so +0 -0
html_to_markdown/_html_to_markdown.pyi +196 -0
html_to_markdown/api.py +195 -0
html_to_markdown/bin/html-to-markdown +0 -0
html_to_markdown/cli.py +3 -0
html_to_markdown/cli_proxy.py +142 -0
html_to_markdown/exceptions.py +73 -0
html_to_markdown/options.py +144 -0
html_to_markdown/py.typed +0 -0
html_to_markdown/v1_compat.py +191 -0
html_to_markdown-2.14.4.data/scripts/html-to-markdown +0 -0
html_to_markdown-2.14.4.dist-info/METADATA +634 -0
html_to_markdown-2.14.4.dist-info/RECORD +17 -0
html_to_markdown-2.14.4.dist-info/WHEEL +6 -0
html_to_markdown-2.14.4.dist-info/licenses/LICENSE +21 -0

html_to_markdown/__init__.py ADDED

@@ -0,0 +1,62 @@
+"""html-to-markdown: Convert HTML to Markdown using Rust backend.
+This package provides high-performance HTML to Markdown conversion
+powered by Rust with a clean Python API.
+V2 API (current):
+    from html_to_markdown import convert, ConversionOptions
+    options = ConversionOptions(heading_style="atx")
+    markdown = convert(html, options)
+V1 API (backward compatibility):
+    from html_to_markdown import convert_to_markdown
+    markdown = convert_to_markdown(html, heading_style="atx")
+"""
+from html_to_markdown.api import (
+    InlineImage,
+    InlineImageConfig,
+    InlineImageWarning,
+    MetadataConfig,
+    OptionsHandle,
+    convert,
+    convert_with_handle,
+    convert_with_inline_images,
+    convert_with_metadata,
+    create_options_handle,
+)
+from html_to_markdown.exceptions import (
+    ConflictingOptionsError,
+    EmptyHtmlError,
+    HtmlToMarkdownError,
+    InvalidParserError,
+    MissingDependencyError,
+)
+from html_to_markdown.options import ConversionOptions, PreprocessingOptions
+from html_to_markdown.v1_compat import convert_to_markdown, markdownify
+__all__ = [
+    "ConflictingOptionsError",
+    "ConversionOptions",
+    "EmptyHtmlError",
+    "HtmlToMarkdownError",
+    "InlineImage",
+    "InlineImageConfig",
+    "InlineImageWarning",
+    "InvalidParserError",
+    "MetadataConfig",
+    "MissingDependencyError",
+    "OptionsHandle",
+    "PreprocessingOptions",
+    "convert",
+    "convert_to_markdown",
+    "convert_with_handle",
+    "convert_with_inline_images",
+    "convert_with_metadata",
+    "create_options_handle",
+    "markdownify",
+]
+__version__ = "2.14.4"

html_to_markdown/__main__.py ADDED

@@ -0,0 +1,16 @@
+import sys
+from html_to_markdown.cli_proxy import main
+def cli() -> None:
+    try:
+        result = main(sys.argv[1:])
+        print(result, end="")  # noqa: T201
+    except (ValueError, FileNotFoundError) as e:
+        print(str(e), file=sys.stderr)  # noqa: T201
+        sys.exit(1)
+if __name__ == "__main__":
+    cli()

html_to_markdown/_html_to_markdown.abi3.so ADDED

Binary file

html_to_markdown/_html_to_markdown.pyi ADDED

@@ -0,0 +1,196 @@
+from typing import Literal, TypedDict
+class PreprocessingOptions:
+    enabled: bool
+    preset: Literal["minimal", "standard", "aggressive"]
+    remove_navigation: bool
+    remove_forms: bool
+    def __init__(
+        self,
+        *,
+        enabled: bool = False,
+        preset: Literal["minimal", "standard", "aggressive"] = "standard",
+        remove_navigation: bool = True,
+        remove_forms: bool = True,
+    ) -> None: ...
+class ConversionOptions:
+    heading_style: Literal["underlined", "atx", "atx_closed"]
+    list_indent_type: Literal["spaces", "tabs"]
+    list_indent_width: int
+    bullets: str
+    strong_em_symbol: str
+    escape_asterisks: bool
+    escape_underscores: bool
+    escape_misc: bool
+    escape_ascii: bool
+    code_language: str
+    autolinks: bool
+    default_title: bool
+    br_in_tables: bool
+    hocr_spatial_tables: bool
+    highlight_style: Literal["double-equal", "html", "bold", "none"]
+    extract_metadata: bool
+    whitespace_mode: Literal["normalized", "strict"]
+    strip_newlines: bool
+    wrap: bool
+    wrap_width: int
+    convert_as_inline: bool
+    sub_symbol: str
+    sup_symbol: str
+    newline_style: Literal["spaces", "backslash"]
+    code_block_style: Literal["indented", "backticks", "tildes"]
+    keep_inline_images_in: list[str]
+    preprocessing: PreprocessingOptions
+    encoding: str
+    debug: bool
+    strip_tags: list[str]
+    preserve_tags: list[str]
+    def __init__(
+        self,
+        *,
+        heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined",
+        list_indent_type: Literal["spaces", "tabs"] = "spaces",
+        list_indent_width: int = 4,
+        bullets: str = "*+-",
+        strong_em_symbol: str = "*",
+        escape_asterisks: bool = False,
+        escape_underscores: bool = False,
+        escape_misc: bool = False,
+        escape_ascii: bool = False,
+        code_language: str = "",
+        autolinks: bool = True,
+        default_title: bool = False,
+        br_in_tables: bool = False,
+        hocr_spatial_tables: bool = True,
+        highlight_style: Literal["double-equal", "html", "bold", "none"] = "double-equal",
+        extract_metadata: bool = True,
+        whitespace_mode: Literal["normalized", "strict"] = "normalized",
+        strip_newlines: bool = False,
+        wrap: bool = False,
+        wrap_width: int = 80,
+        convert_as_inline: bool = False,
+        sub_symbol: str = "",
+        sup_symbol: str = "",
+        newline_style: Literal["spaces", "backslash"] = "spaces",
+        code_block_style: Literal["indented", "backticks", "tildes"] = "indented",
+        keep_inline_images_in: list[str] = [],
+        preprocessing: PreprocessingOptions | None = None,
+        encoding: str = "utf-8",
+        debug: bool = False,
+        strip_tags: list[str] = [],
+        preserve_tags: list[str] = [],
+    ) -> None: ...
+class InlineImageConfig:
+    max_decoded_size_bytes: int
+    filename_prefix: str | None
+    capture_svg: bool
+    infer_dimensions: bool
+    def __init__(
+        self,
+        max_decoded_size_bytes: int = ...,
+        filename_prefix: str | None = None,
+        capture_svg: bool = True,
+        infer_dimensions: bool = False,
+    ) -> None: ...
+class ConversionOptionsHandle:
+    def __init__(self, options: ConversionOptions | None = None) -> None: ...
+class InlineImage(TypedDict):
+    data: bytes
+    format: str
+    filename: str | None
+    description: str | None
+    dimensions: tuple[int, int] | None
+    source: Literal["img_data_uri", "svg_element"]
+    attributes: dict[str, str]
+class InlineImageWarning(TypedDict):
+    index: int
+    message: str
+class MetadataConfig:
+    extract_document: bool
+    extract_headers: bool
+    extract_links: bool
+    extract_images: bool
+    extract_structured_data: bool
+    max_structured_data_size: int
+    def __init__(
+        self,
+        *,
+        extract_document: bool = True,
+        extract_headers: bool = True,
+        extract_links: bool = True,
+        extract_images: bool = True,
+        extract_structured_data: bool = True,
+        max_structured_data_size: int = 1_000_000,
+    ) -> None: ...
+class DocumentMetadata(TypedDict):
+    title: str | None
+    description: str | None
+    keywords: list[str]
+    author: str | None
+    canonical_url: str | None
+    base_href: str | None
+    language: str | None
+    text_direction: str | None
+    open_graph: dict[str, str]
+    twitter_card: dict[str, str]
+    meta_tags: dict[str, str]
+class HeaderMetadata(TypedDict):
+    level: int
+    text: str
+    id: str | None
+    depth: int
+    html_offset: int
+class LinkMetadata(TypedDict):
+    href: str
+    text: str
+    title: str | None
+    link_type: str
+    rel: list[str]
+    attributes: dict[str, str]
+class ImageMetadata(TypedDict):
+    src: str
+    alt: str | None
+    title: str | None
+    dimensions: tuple[int, int] | None
+    image_type: str
+    attributes: dict[str, str]
+class StructuredData(TypedDict):
+    data_type: str
+    raw_json: str
+    schema_type: str | None
+class ExtendedMetadata(TypedDict):
+    document: DocumentMetadata
+    headers: list[HeaderMetadata]
+    links: list[LinkMetadata]
+    images: list[ImageMetadata]
+    structured_data: list[StructuredData]
+def convert(html: str, options: ConversionOptions | None = None) -> str: ...
+def convert_with_inline_images(
+    html: str,
+    options: ConversionOptions | None = None,
+    image_config: InlineImageConfig | None = None,
+) -> tuple[str, list[InlineImage], list[InlineImageWarning]]: ...
+def convert_with_metadata(
+    html: str,
+    options: ConversionOptions | None = None,
+    metadata_config: MetadataConfig | None = None,
+) -> tuple[str, ExtendedMetadata]: ...
+def create_options_handle(options: ConversionOptions | None = None) -> ConversionOptionsHandle: ...
+def convert_with_options_handle(html: str, handle: ConversionOptionsHandle) -> str: ...

html_to_markdown/api.py ADDED

@@ -0,0 +1,195 @@
+"""High-level Python API backed by the Rust core."""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Literal, TypedDict
+import html_to_markdown._html_to_markdown as _rust
+from html_to_markdown._html_to_markdown import (
+    ConversionOptionsHandle as OptionsHandle,
+)
+from html_to_markdown._html_to_markdown import (
+    InlineImageConfig,
+    MetadataConfig,
+)
+from html_to_markdown.options import ConversionOptions, PreprocessingOptions
+if TYPE_CHECKING:
+    from html_to_markdown._html_to_markdown import ExtendedMetadata  # pragma: no cover
+else:
+    ExtendedMetadata = dict[str, object]  # type: ignore[assignment]
+class InlineImage(TypedDict):
+    """Inline image extracted during conversion."""
+    data: bytes
+    format: str
+    filename: str | None
+    description: str | None
+    dimensions: tuple[int, int] | None
+    source: Literal["img_data_uri", "svg_element"]
+    attributes: dict[str, str]
+class InlineImageWarning(TypedDict):
+    """Warning produced during inline image extraction."""
+    index: int
+    message: str
+def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
+    return _rust.PreprocessingOptions(
+        enabled=options.enabled,
+        preset=options.preset,
+        remove_navigation=options.remove_navigation,
+        remove_forms=options.remove_forms,
+    )
+def _to_rust_options(
+    options: ConversionOptions,
+    preprocessing: PreprocessingOptions,
+) -> _rust.ConversionOptions:
+    return _rust.ConversionOptions(
+        heading_style=options.heading_style,
+        list_indent_type=options.list_indent_type,
+        list_indent_width=options.list_indent_width,
+        bullets=options.bullets,
+        strong_em_symbol=options.strong_em_symbol,
+        escape_asterisks=options.escape_asterisks,
+        escape_underscores=options.escape_underscores,
+        escape_misc=options.escape_misc,
+        escape_ascii=options.escape_ascii,
+        code_language=options.code_language,
+        autolinks=options.autolinks,
+        default_title=options.default_title,
+        br_in_tables=options.br_in_tables,
+        hocr_spatial_tables=options.hocr_spatial_tables,
+        highlight_style=options.highlight_style,
+        extract_metadata=options.extract_metadata,
+        whitespace_mode=options.whitespace_mode,
+        strip_newlines=options.strip_newlines,
+        wrap=options.wrap,
+        wrap_width=options.wrap_width,
+        convert_as_inline=options.convert_as_inline,
+        sub_symbol=options.sub_symbol,
+        sup_symbol=options.sup_symbol,
+        newline_style=options.newline_style,
+        code_block_style=options.code_block_style,
+        keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
+        preprocessing=_to_rust_preprocessing(preprocessing),
+        encoding=options.encoding,
+        debug=options.debug,
+        strip_tags=list(options.strip_tags) if options.strip_tags else [],
+        preserve_tags=list(options.preserve_tags) if options.preserve_tags else [],
+    )
+def convert(
+    html: str,
+    options: ConversionOptions | None = None,
+    preprocessing: PreprocessingOptions | None = None,
+) -> str:
+    """Convert HTML to Markdown using the Rust backend."""
+    if options is None and preprocessing is None:
+        return _rust.convert(html, None)
+    if options is None:
+        options = ConversionOptions()
+    if preprocessing is None:
+        preprocessing = PreprocessingOptions()
+    rust_options = _to_rust_options(options, preprocessing)
+    return _rust.convert(html, rust_options)
+def convert_with_inline_images(
+    html: str,
+    options: ConversionOptions | None = None,
+    preprocessing: PreprocessingOptions | None = None,
+    image_config: InlineImageConfig | None = None,
+) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
+    """Convert HTML and extract inline images."""
+    if options is None:
+        options = ConversionOptions()
+    if preprocessing is None:
+        preprocessing = PreprocessingOptions()
+    if image_config is None:
+        image_config = InlineImageConfig()
+    rust_options = _to_rust_options(options, preprocessing)
+    markdown, images, warnings = _rust.convert_with_inline_images(html, rust_options, image_config)
+    return markdown, list(images), list(warnings)
+def create_options_handle(
+    options: ConversionOptions | None = None,
+    preprocessing: PreprocessingOptions | None = None,
+) -> OptionsHandle:
+    """Create a reusable ConversionOptions handle backed by Rust."""
+    if options is None:
+        options = ConversionOptions()
+    if preprocessing is None:
+        preprocessing = PreprocessingOptions()
+    rust_options = _to_rust_options(options, preprocessing)
+    return _rust.create_options_handle(rust_options)
+def convert_with_handle(html: str, handle: OptionsHandle) -> str:
+    """Convert HTML using a pre-parsed ConversionOptions handle."""
+    return _rust.convert_with_options_handle(html, handle)
+def convert_with_metadata(
+    html: str,
+    options: ConversionOptions | None = None,
+    preprocessing: PreprocessingOptions | None = None,
+    metadata_config: MetadataConfig | None = None,
+) -> tuple[str, ExtendedMetadata]:
+    """Convert HTML and extract comprehensive metadata.
+    Args:
+        html: HTML string to convert
+        options: Optional conversion configuration
+        preprocessing: Optional preprocessing configuration
+        metadata_config: Optional metadata extraction configuration
+    Returns:
+        Tuple of (markdown, metadata_dict) where metadata_dict contains:
+        - document: Document-level metadata (title, description, lang, etc.)
+        - headers: List of header elements with hierarchy
+        - links: List of extracted hyperlinks with classification
+        - images: List of extracted images with metadata
+        - structured_data: List of JSON-LD, Microdata, or RDFa blocks
+    """
+    if not hasattr(_rust, "convert_with_metadata"):
+        raise ImportError(
+            "convert_with_metadata is missing from the native extension; this indicates a broken/partial installation."
+        )
+    if options is None:
+        options = ConversionOptions()
+    if preprocessing is None:
+        preprocessing = PreprocessingOptions()
+    if metadata_config is None:
+        metadata_config = MetadataConfig()
+    rust_options = _to_rust_options(options, preprocessing)
+    markdown, metadata = _rust.convert_with_metadata(html, rust_options, metadata_config)
+    return markdown, metadata
+__all__ = [
+    "InlineImage",
+    "InlineImageConfig",
+    "InlineImageWarning",
+    "MetadataConfig",
+    "OptionsHandle",
+    "convert",
+    "convert_with_handle",
+    "convert_with_inline_images",
+    "convert_with_metadata",
+    "create_options_handle",
+]

html_to_markdown/bin/html-to-markdown ADDED

Binary file

html_to_markdown/cli.py ADDED

@@ -0,0 +1,3 @@
+from html_to_markdown.cli_proxy import main
+__all__ = ["main"]

html_to_markdown/cli_proxy.py ADDED

@@ -0,0 +1,142 @@
+import subprocess
+import sys
+import warnings
+from pathlib import Path
+from html_to_markdown.exceptions import RedundantV1FlagError, RemovedV1FlagError
+def find_cli_binary() -> Path:
+    """Find the html-to-markdown CLI binary in expected locations.
+    Returns:
+        Path to the CLI binary.
+    Raises:
+        FileNotFoundError: If the binary cannot be found.
+    """
+    binary_name = "html-to-markdown.exe" if sys.platform == "win32" else "html-to-markdown"
+    module_dir = Path(__file__).resolve().parent
+    parent_dirs = list(module_dir.parents)
+    search_roots = []
+    for parent in parent_dirs:
+        candidate = parent / "target" / "release" / binary_name
+        search_roots.append(candidate)
+    possible_locations = [
+        *search_roots,
+        module_dir / "bin" / binary_name,
+        module_dir / binary_name,
+    ]
+    for location in possible_locations:
+        if location.exists() and location.is_file():
+            return location
+    msg = "html-to-markdown CLI binary not found. Please install or build the package."
+    raise FileNotFoundError(msg)
+def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
+    """Translate v1 CLI arguments to v2 format.
+    Args:
+        argv: List of command-line arguments.
+    Returns:
+        Translated list of arguments compatible with v2.
+    Raises:
+        RemovedV1FlagError: If a v1 flag has been removed in v2.
+    """
+    translated = []
+    i = 0
+    while i < len(argv):
+        arg = argv[i]
+        if arg in ("--strip", "--convert"):
+            raise RemovedV1FlagError(
+                flag=arg,
+                reason=f"{arg} option has been removed in v2.",
+                migration="Remove this flag from your command. The feature is no longer available.",
+            )
+        if arg in (
+            "--no-escape-asterisks",
+            "--no-escape-underscores",
+            "--no-escape-misc",
+            "--no-wrap",
+            "--no-autolinks",
+            "--no-extract-metadata",
+        ):
+            warnings.warn(
+                f"'{arg}' is deprecated and redundant in v2. "
+                f"These options are now disabled by default. Remove this flag.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        elif arg == "--preprocess-html":
+            warnings.warn(
+                "'--preprocess-html' is deprecated. Use '--preprocess' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            translated.append("--preprocess")
+        elif arg in (
+            "--escape-asterisks",
+            "--escape-underscores",
+            "--escape-misc",
+            "--autolinks",
+            "--extract-metadata",
+            "--wrap",
+        ):
+            translated.append(arg)
+        else:
+            translated.append(arg)
+        i += 1
+    return translated
+def main(argv: list[str]) -> str:
+    """Execute the CLI proxy.
+    Translates v1 arguments to v2 and invokes the native Rust CLI binary.
+    Args:
+        argv: Command-line arguments.
+    Returns:
+        Stdout from the CLI binary.
+    """
+    cli_binary = find_cli_binary()
+    try:
+        translated_args = translate_v1_args_to_v2(argv)
+    except (RemovedV1FlagError, RedundantV1FlagError) as e:
+        sys.stderr.write(f"\n❌ Error: {e.flag}\n\n")
+        sys.stderr.write(f"   {e.reason}\n\n")
+        sys.stderr.write(f"   💡 {e.migration}\n\n")
+        sys.exit(1)
+    except ValueError as e:
+        sys.stderr.write(f"Error: {e}\n")
+        sys.exit(1)
+    result = subprocess.run(  # noqa: S603
+        [str(cli_binary), *translated_args],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        sys.stderr.write(result.stderr)
+        sys.exit(result.returncode)
+    return result.stdout

html_to_markdown/exceptions.py ADDED

@@ -0,0 +1,73 @@
+from __future__ import annotations
+class HtmlToMarkdownError(Exception):
+    """Base exception for all html-to-markdown errors."""
+class MissingDependencyError(HtmlToMarkdownError):
+    """Raised when a required dependency is not installed."""
+    def __init__(self, dependency: str, install_command: str | None = None) -> None:
+        self.dependency = dependency
+        self.install_command = install_command
+        message = f"{dependency} is not installed."
+        if install_command:
+            message += f" Install with: {install_command}"
+        super().__init__(message)
+class InvalidParserError(HtmlToMarkdownError):
+    """Raised when an invalid parser is specified."""
+    def __init__(self, parser: str, available_parsers: list[str]) -> None:
+        self.parser = parser
+        self.available_parsers = available_parsers
+        message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
+        super().__init__(message)
+class EmptyHtmlError(HtmlToMarkdownError):
+    """Raised when input HTML is empty."""
+    def __init__(self) -> None:
+        super().__init__("The input HTML is empty.")
+class ConflictingOptionsError(HtmlToMarkdownError):
+    """Raised when conflicting configuration options are specified."""
+    def __init__(self, option1: str, option2: str) -> None:
+        self.option1 = option1
+        self.option2 = option2
+        super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
+class InvalidEncodingError(HtmlToMarkdownError):
+    """Raised when an invalid character encoding is specified."""
+    def __init__(self, encoding: str) -> None:
+        super().__init__(f"The specified encoding ({encoding}) is not valid.")
+class UnsupportedV1FeatureError(HtmlToMarkdownError):
+    """Raised when a v1 feature is not supported in v2."""
+    def __init__(self, flag: str, reason: str, migration: str) -> None:
+        self.flag = flag
+        self.reason = reason
+        self.migration = migration
+        message = f"'{flag}' is not supported in v2.\n\nReason: {reason}\n\nMigration: {migration}"
+        super().__init__(message)
+class RemovedV1FlagError(UnsupportedV1FeatureError):
+    """Raised when a v1 flag has been removed in v2."""
+class RedundantV1FlagError(UnsupportedV1FeatureError):
+    """Raised when a v1 flag is redundant in v2."""