PyPI - html-to-markdown - Versions diffs - 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl - Mend

html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (16)

html_to_markdown/__main__.py +0 -1
html_to_markdown/cli.py +101 -45
html_to_markdown/constants.py +3 -0
html_to_markdown/converters.py +31 -502
html_to_markdown/exceptions.py +1 -11
html_to_markdown/preprocessor.py +0 -37
html_to_markdown/processing.py +104 -181
html_to_markdown/utils.py +2 -42
html_to_markdown/whitespace.py +292 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +195 -203
html_to_markdown-1.10.0.dist-info/RECORD +17 -0
html_to_markdown-1.9.1.dist-info/RECORD +0 -16
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0

html_to_markdown/exceptions.py CHANGED Viewed

@@ -1,15 +1,11 @@
-"""Custom exceptions for the html-to-markdown library."""
 from __future__ import annotations
 class HtmlToMarkdownError(Exception):
-    """Base exception for all html-to-markdown errors."""
+    pass
 class MissingDependencyError(HtmlToMarkdownError):
-    """Raised when an optional dependency is required but not installed."""
     def __init__(self, dependency: str, install_command: str | None = None) -> None:
         self.dependency = dependency
         self.install_command = install_command
@@ -22,8 +18,6 @@ class MissingDependencyError(HtmlToMarkdownError):
 class InvalidParserError(HtmlToMarkdownError):
-    """Raised when an invalid parser is specified."""
     def __init__(self, parser: str, available_parsers: list[str]) -> None:
         self.parser = parser
         self.available_parsers = available_parsers
@@ -33,15 +27,11 @@ class InvalidParserError(HtmlToMarkdownError):
 class EmptyHtmlError(HtmlToMarkdownError):
-    """Raised when the input HTML is empty."""
     def __init__(self) -> None:
         super().__init__("The input HTML is empty.")
 class ConflictingOptionsError(HtmlToMarkdownError):
-    """Raised when conflicting options are specified."""
     def __init__(self, option1: str, option2: str) -> None:
         self.option1 = option1
         self.option2 = option2

html_to_markdown/preprocessor.py CHANGED Viewed

@@ -1,5 +1,3 @@
-"""HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
 from __future__ import annotations
 import re
@@ -22,24 +20,6 @@ def preprocess_html(
     custom_tags_to_remove: set[str] | None = None,
     custom_attributes_to_remove: set[str] | None = None,
 ) -> str:
-    """Preprocess HTML to remove unwanted elements and improve quality.
-    Args:
-        html: Raw HTML content to preprocess.
-        remove_navigation: Remove navigation elements and menus.
-        remove_forms: Remove form elements (input, button, select, etc.).
-        remove_scripts: Remove script tags and content.
-        remove_styles: Remove style tags and content.
-        remove_comments: Remove HTML comments.
-        preserve_semantic_structure: Preserve semantic HTML5 elements.
-        preserve_tables: Preserve table structure.
-        preserve_media: Preserve media elements (img, video, audio).
-        custom_tags_to_remove: Additional tags to remove.
-        custom_attributes_to_remove: Additional attributes to remove.
-    Returns:
-        Cleaned HTML ready for conversion to markdown.
-    """
     if not html or not html.strip():  # pragma: no cover
         return html
@@ -83,7 +63,6 @@ def _configure_cleaning_rules(
     custom_tags_to_remove: set[str],
     custom_attributes_to_remove: set[str],
 ) -> dict[str, Any]:
-    """Configure the cleaning rules for nh3."""
     allowed_tags = {
         "p",
         "div",
@@ -254,7 +233,6 @@ def _configure_cleaning_rules(
 def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
-    """Remove elements with navigation-related classes."""
     if not remove_navigation:
         return html
@@ -288,7 +266,6 @@ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
 def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
-    """Remove common navigation patterns that nh3 might miss."""
     if not remove_navigation:
         return html
@@ -329,7 +306,6 @@ def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
 def _remove_wikipedia_navigation_lists(html: str) -> str:
-    """Remove Wikipedia-style navigation lists that appear at the start."""
     patterns = [
         r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
         r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
@@ -342,7 +318,6 @@ def _remove_wikipedia_navigation_lists(html: str) -> str:
 def _fix_whitespace_issues(html: str) -> str:
-    """Fix common whitespace issues in HTML."""
     html = re.sub(r"[ \t]{2,}", " ", html)
     html = re.sub(r"\n\s*\n", "\n\n", html)
@@ -385,18 +360,6 @@ PRESETS: dict[str, dict[str, Any]] = {
 def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
-    """Create preprocessor configuration with a preset.
-    Args:
-        preset: The preset configuration to use (minimal, standard, aggressive).
-        **overrides: Any configuration options to override.
-    Returns:
-        Configuration dict for preprocessor.
-    Raises:
-        ValueError: If preset is unknown.
-    """
     if preset not in PRESETS:
         msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
         raise ValueError(msg)

html_to_markdown/processing.py CHANGED Viewed

@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
     DOUBLE_EQUAL,
     SPACES,
     UNDERLINED,
+    WHITESPACE_NORMALIZED,
     html_heading_re,
-    whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
 from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape
+from html_to_markdown.whitespace import WhitespaceHandler
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -143,6 +144,12 @@ SupportedTag = Literal[
 ]
+def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
+    if list_indent_type == "tabs":
+        return "\t"
+    return " " * list_indent_width
 def _is_nested_tag(el: PageElement) -> bool:
     return isinstance(el, Tag) and el.name in {
         "ol",
@@ -170,6 +177,7 @@ def _process_tag(
     escape_misc: bool,
     escape_underscores: bool,
     strip: set[str] | None,
+    whitespace_handler: WhitespaceHandler,
     context_before: str = "",
 ) -> str:
     should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
@@ -218,6 +226,7 @@ def _process_tag(
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
+                    whitespace_handler=whitespace_handler,
                 )
             )
         elif isinstance(el, Tag):
@@ -232,6 +241,7 @@ def _process_tag(
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
                     strip=strip,
+                    whitespace_handler=whitespace_handler,
                     context_before=(context_before + current_text)[-2:],
                 )
             )
@@ -259,6 +269,7 @@ def _process_text(
     escape_misc: bool,
     escape_asterisks: bool,
     escape_underscores: bool,
+    whitespace_handler: WhitespaceHandler,
 ) -> str:
     text = str(el) or ""
@@ -275,69 +286,9 @@ def _process_text(
         if len(ancestor_names) > 10:
             break
-    if "pre" not in ancestor_names:
-        if text.strip() == "":
-            if "\n" in text:
-                text = ""
-            else:
-                block_elements = {
-                    "p",
-                    "ul",
-                    "ol",
-                    "div",
-                    "blockquote",
-                    "pre",
-                    "h1",
-                    "h2",
-                    "h3",
-                    "h4",
-                    "h5",
-                    "h6",
-                    "table",
-                    "dl",
-                    "hr",
-                    "figure",
-                    "article",
-                    "section",
-                    "nav",
-                    "aside",
-                    "header",
-                    "footer",
-                    "main",
-                    "form",
-                    "fieldset",
-                }
-                prev_sibling = el.previous_sibling
-                next_sibling = el.next_sibling
-                if (
-                    prev_sibling
-                    and hasattr(prev_sibling, "name")
-                    and prev_sibling.name in block_elements
-                    and next_sibling
-                    and hasattr(next_sibling, "name")
-                    and next_sibling.name in block_elements
-                ):
-                    text = ""
-                else:
-                    text = " " if text else ""
-        else:
-            has_leading_space = text.startswith((" ", "\t"))
-            has_trailing_space = text.endswith((" ", "\t"))
-            middle_content = (
-                text[1:-1]
-                if has_leading_space and has_trailing_space
-                else text[1:]
-                if has_leading_space
-                else text[:-1]
-                if has_trailing_space
-                else text
-            )
+    in_pre = bool(ancestor_names.intersection({"pre"}))
-            middle_content = whitespace_re.sub(" ", middle_content.strip())
-            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+    text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
@@ -357,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
 def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
-    """Get set of ancestor tag names for efficient parent checking."""
     elem_id = id(element)
     cache = _ancestor_cache.get()
     if cache is None:
@@ -388,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
 def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
-    """Check if element has any of the specified ancestors efficiently."""
     if isinstance(tag_names, str):
         tag_names = [tag_names]
@@ -414,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
 def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
-    """Extract metadata from HTML document.
-    Args:
-        soup: BeautifulSoup instance of the HTML document.
-    Returns:
-        Dictionary of metadata key-value pairs.
-    """
     metadata = {}
     title_tag = soup.find("title")
@@ -468,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
 def _format_metadata_comment(metadata: dict[str, str]) -> str:
-    """Format metadata as a Markdown comment block.
-    Args:
-        metadata: Dictionary of metadata key-value pairs.
-    Returns:
-        Formatted metadata comment block.
-    """
     if not metadata:
         return ""
@@ -511,64 +444,87 @@ def convert_to_markdown(
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
+    list_indent_type: Literal["spaces", "tabs"] = "spaces",
+    list_indent_width: int = 4,
     newline_style: Literal["spaces", "backslash"] = SPACES,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_forms: bool = True,
+    remove_navigation: bool = True,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
+    whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
     wrap: bool = False,
     wrap_width: int = 80,
-    preprocess_html: bool = False,
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
-    remove_navigation: bool = True,
-    remove_forms: bool = True,
 ) -> str:
-    """Convert HTML to Markdown.
+    """Convert HTML content to Markdown format.
-    Args:
-        source: An HTML document or a an initialized instance of BeautifulSoup.
-        stream_processing: Use streaming processing for large documents. Defaults to False.
-        chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
-        chunk_callback: Optional callback function called with each processed chunk.
-        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
-        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
-                Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
-        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
-        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
-        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
-        code_language_callback: Function to dynamically determine the language for code blocks.
-        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
-        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
-        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
-        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
-        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
-        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
-        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
-        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
-        heading_style: The style to use for Markdown headings. Defaults to "underlined".
-        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
-        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
-        newline_style: Style for handling newlines in text content. Defaults to "spaces".
-        strip: Tags to strip from the output. Defaults to None.
-        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
-        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
-        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
-        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
-        wrap: Wrap text to the specified width. Defaults to False.
-        wrap_width: The number of characters at which to wrap text. Defaults to 80.
-        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
-        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
-        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
-        remove_forms: Remove form elements during preprocessing. Defaults to True.
+    This is the main entry point for converting HTML to Markdown. It supports
+    various customization options for controlling the conversion behavior.
-    Raises:
-        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
-        EmptyHtmlError: When the input HTML is empty.
-        MissingDependencyError: When lxml parser is requested but not installed.
+    Args:
+        source: HTML string or BeautifulSoup object to convert.
+        stream_processing: Enable streaming mode for large documents.
+        chunk_size: Size of chunks for streaming processing.
+        chunk_callback: Callback for processing chunks in streaming mode.
+        progress_callback: Callback for progress updates (current, total).
+        parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
+        autolinks: Convert URLs to automatic links.
+        bullets: Characters to use for unordered list bullets.
+        code_language: Default language for code blocks.
+        code_language_callback: Callback to determine code language from element.
+        convert: HTML tags to convert to Markdown.
+        convert_as_inline: Treat block elements as inline during conversion.
+        custom_converters: Custom converters for specific HTML elements.
+        default_title: Add a default title if none exists.
+        escape_asterisks: Escape asterisk characters in text.
+        escape_misc: Escape miscellaneous Markdown characters.
+        escape_underscores: Escape underscore characters in text.
+        extract_metadata: Extract metadata from HTML head.
+        heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
+        highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
+        keep_inline_images_in: Parent tags where images should remain inline.
+        list_indent_type: Type of indentation for lists ('spaces', 'tabs').
+        list_indent_width: Number of spaces for list indentation.
+        newline_style: Style for newlines ('spaces', 'backslash').
+        preprocess_html: Enable HTML preprocessing to clean up content.
+        preprocessing_preset: Preprocessing aggressiveness level.
+        remove_forms: Remove form elements during preprocessing.
+        remove_navigation: Remove navigation elements during preprocessing.
+        strip: HTML tags to strip from output.
+        strip_newlines: Remove newlines from HTML before processing.
+        strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
+        sub_symbol: Symbol for subscript text.
+        sup_symbol: Symbol for superscript text.
+        whitespace_mode: How to handle whitespace ('normalized', 'strict').
+        wrap: Enable text wrapping.
+        wrap_width: Column width for text wrapping.
     Returns:
-        str: A string of Markdown-formatted text converted from the given HTML.
+        The converted Markdown string.
+    Raises:
+        EmptyHtmlError: If the HTML input is empty.
+        MissingDependencyError: If required dependencies are not installed.
+        ConflictingOptionsError: If conflicting options are provided.
+    Examples:
+        Basic conversion:
+        >>> html = "<h1>Title</h1><p>Content</p>"
+        >>> convert_to_markdown(html)
+        'Title\\n=====\\n\\nContent\\n\\n'
+        With custom options:
+        >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
+        '# Title\\n\\nContent\\n\\n'
+        Discord-compatible lists (2-space indent):
+        >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
+        >>> convert_to_markdown(html, list_indent_width=2)
+        '* Item 1\\n* Item 2\\n\\n'
     """
     if isinstance(source, str):
         if (
@@ -665,6 +621,7 @@ def convert_to_markdown(
             sup_symbol=sup_symbol,
             wrap=wrap,
             wrap_width=wrap_width,
+            whitespace_mode=whitespace_mode,
         ):
             if chunk_callback:
                 chunk_callback(chunk)
@@ -681,9 +638,12 @@ def convert_to_markdown(
     sink = StringSink()
+    whitespace_handler = WhitespaceHandler(whitespace_mode)
     _process_html_core(
         source,
         sink,
+        whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
         bullets=bullets,
@@ -700,6 +660,8 @@ def convert_to_markdown(
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
+        list_indent_type=list_indent_type,
+        list_indent_width=list_indent_width,
         newline_style=newline_style,
         strip=strip,
         strip_newlines=strip_newlines,
@@ -761,34 +723,25 @@ def convert_to_markdown(
 class OutputSink:
-    """Abstract output sink for processed markdown text."""
     def write(self, text: str) -> None:
-        """Write text to the sink."""
         raise NotImplementedError
     def finalize(self) -> None:
-        """Finalize the output."""
+        pass
 class StringSink(OutputSink):
-    """Collects all output into a single string."""
     def __init__(self) -> None:
         self.buffer = StringIO()
     def write(self, text: str) -> None:
-        """Write text to the buffer."""
         self.buffer.write(text)
     def get_result(self) -> str:
-        """Get the complete result string."""
         return self.buffer.getvalue()
 class StreamingSink(OutputSink):
-    """Yields chunks of output for streaming processing."""
     def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
         self.chunk_size = chunk_size
         self.progress_callback = progress_callback
@@ -799,7 +752,6 @@ class StreamingSink(OutputSink):
         self.chunks: list[str] = []
     def write(self, text: str) -> None:
-        """Write text and yield chunks when threshold is reached."""
         if not text:
             return
@@ -822,7 +774,6 @@ class StreamingSink(OutputSink):
         self.buffer_size = len(current_content)
     def finalize(self) -> None:
-        """Finalize and yield any remaining content."""
         if self.buffer_size > 0:
             content = self.buffer.getvalue()
             self.chunks.append(content)
@@ -830,11 +781,9 @@ class StreamingSink(OutputSink):
             self._update_progress()
     def get_chunks(self) -> Generator[str, None, None]:
-        """Get all chunks yielded during processing."""
         yield from self.chunks
     def _find_split_position(self, content: str) -> int:
-        """Find optimal position to split content for chunks."""
         target = self.chunk_size
         lookahead = min(100, len(content) - target)
@@ -847,7 +796,6 @@ class StreamingSink(OutputSink):
         return min(target, len(content))
     def _update_progress(self) -> None:
-        """Update progress if callback is provided."""
         if self.progress_callback:
             self.progress_callback(self.processed_bytes, self.total_bytes)
@@ -856,6 +804,7 @@ def _process_html_core(
     source: str | BeautifulSoup,
     sink: OutputSink,
     *,
+    whitespace_handler: WhitespaceHandler,
     parser: str | None = None,
     autolinks: bool,
     bullets: str,
@@ -872,6 +821,8 @@ def _process_html_core(
     heading_style: Literal["underlined", "atx", "atx_closed"],
     highlight_style: Literal["double-equal", "html", "bold"],
     keep_inline_images_in: Iterable[str] | None,
+    list_indent_type: str,
+    list_indent_width: int,
     newline_style: Literal["spaces", "backslash"],
     strip: str | Iterable[str] | None,
     strip_newlines: bool,
@@ -881,7 +832,6 @@ def _process_html_core(
     wrap: bool,
     wrap_width: int,
 ) -> None:
-    """Core HTML to Markdown processing logic shared by both regular and streaming."""
     token = _ancestor_cache.set({})
     try:
@@ -921,6 +871,8 @@ def _process_html_core(
             heading_style=heading_style,
             highlight_style=highlight_style,
             keep_inline_images_in=keep_inline_images_in,
+            list_indent_type=list_indent_type,
+            list_indent_width=list_indent_width,
             newline_style=newline_style,
             strong_em_symbol=strong_em_symbol,
             sub_symbol=sub_symbol,
@@ -948,6 +900,7 @@ def _process_html_core(
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
+                    whitespace_handler=whitespace_handler,
                 )
                 sink.write(text)
                 context += text
@@ -961,6 +914,7 @@ def _process_html_core(
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
                     strip=_as_optional_set(strip),
+                    whitespace_handler=whitespace_handler,
                     context_before=context[-2:],
                 )
                 sink.write(text)
@@ -992,54 +946,18 @@ def convert_to_markdown_stream(
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
+    list_indent_type: Literal["spaces", "tabs"] = "spaces",
+    list_indent_width: int = 4,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
+    whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
     wrap: bool = False,
     wrap_width: int = 80,
 ) -> Generator[str, None, None]:
-    """Convert HTML to Markdown using streaming/chunked processing.
-    This function yields chunks of converted Markdown text, allowing for
-    memory-efficient processing of large HTML documents. The output is guaranteed
-    to be identical to convert_to_markdown().
-    Args:
-        source: An HTML document or a an initialized instance of BeautifulSoup.
-        chunk_size: Size of chunks to yield (approximate, in characters).
-        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
-        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
-                Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
-        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
-        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
-        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
-        code_language_callback: Function to dynamically determine the language for code blocks.
-        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
-        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
-        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
-        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
-        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
-        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
-        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
-        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
-        heading_style: The style to use for Markdown headings. Defaults to "underlined".
-        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
-        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
-        newline_style: Style for handling newlines in text content. Defaults to "spaces".
-        strip: Tags to strip from the output. Defaults to None.
-        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
-        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
-        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
-        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
-        wrap: Wrap text to the specified width. Defaults to False.
-        wrap_width: The number of characters at which to wrap text. Defaults to 80.
-    Yields:
-        str: Chunks of Markdown-formatted text.
-    """
     sink = StreamingSink(chunk_size, progress_callback)
     if isinstance(source, str):
@@ -1047,9 +965,12 @@ def convert_to_markdown_stream(
     elif isinstance(source, BeautifulSoup):
         sink.total_bytes = len(str(source))
+    whitespace_handler = WhitespaceHandler(whitespace_mode)
     _process_html_core(
         source,
         sink,
+        whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
         bullets=bullets,
@@ -1066,6 +987,8 @@ def convert_to_markdown_stream(
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
+        list_indent_type=list_indent_type,
+        list_indent_width=list_indent_width,
         newline_style=newline_style,
         strip=strip,
         strip_newlines=strip_newlines,

html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl