PyPI - html-to-markdown - Versions diffs - 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl - Mend

html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (16) hide show

html_to_markdown/__main__.py +0 -1
html_to_markdown/cli.py +101 -45
html_to_markdown/constants.py +3 -0
html_to_markdown/converters.py +52 -573
html_to_markdown/exceptions.py +1 -11
html_to_markdown/preprocessor.py +0 -37
html_to_markdown/processing.py +104 -202
html_to_markdown/utils.py +2 -42
html_to_markdown/whitespace.py +292 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
html_to_markdown-1.10.0.dist-info/RECORD +17 -0
html_to_markdown-1.9.0.dist-info/RECORD +0 -16
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0

html_to_markdown/processing.py CHANGED Viewed

@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
     DOUBLE_EQUAL,
     SPACES,
     UNDERLINED,
+    WHITESPACE_NORMALIZED,
     html_heading_re,
-    whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
 from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape
+from html_to_markdown.whitespace import WhitespaceHandler
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -143,6 +144,12 @@ SupportedTag = Literal[
 ]
+def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
+    if list_indent_type == "tabs":
+        return "\t"
+    return " " * list_indent_width
 def _is_nested_tag(el: PageElement) -> bool:
     return isinstance(el, Tag) and el.name in {
         "ol",
@@ -170,6 +177,7 @@ def _process_tag(
     escape_misc: bool,
     escape_underscores: bool,
     strip: set[str] | None,
+    whitespace_handler: WhitespaceHandler,
     context_before: str = "",
 ) -> str:
     should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
@@ -195,18 +203,14 @@ def _process_tag(
     children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
-    # List of tags that return empty string when they have no content
     empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
     for i, el in enumerate(children):
         if isinstance(el, NavigableString):
-            # Check if this is whitespace between empty elements
             if el.strip() == "" and i > 0 and i < len(children) - 1:
                 prev_el = children[i - 1]
                 next_el = children[i + 1]
-                # If previous element was a tag that produced empty output
-                # and next element is also a tag that could be empty, skip this whitespace
                 if (
                     isinstance(prev_el, Tag)
                     and isinstance(next_el, Tag)
@@ -214,7 +218,6 @@ def _process_tag(
                     and next_el.name.lower() in empty_when_no_content_tags
                     and not prev_el.get_text().strip()
                 ):
-                    # Previous tag is empty and next could be empty too, skip this whitespace
                     continue
             text_parts.append(
@@ -223,6 +226,7 @@ def _process_tag(
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
+                    whitespace_handler=whitespace_handler,
                 )
             )
         elif isinstance(el, Tag):
@@ -237,6 +241,7 @@ def _process_tag(
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
                     strip=strip,
+                    whitespace_handler=whitespace_handler,
                     context_before=(context_before + current_text)[-2:],
                 )
             )
@@ -264,6 +269,7 @@ def _process_text(
     escape_misc: bool,
     escape_asterisks: bool,
     escape_underscores: bool,
+    whitespace_handler: WhitespaceHandler,
 ) -> str:
     text = str(el) or ""
@@ -280,76 +286,9 @@ def _process_text(
         if len(ancestor_names) > 10:
             break
-    if "pre" not in ancestor_names:
-        # Special case: if the text is only whitespace
-        if text.strip() == "":
-            # If it contains newlines, it's probably indentation whitespace, return empty
-            if "\n" in text:
-                text = ""
-            else:
-                # Check if this whitespace is between block elements
-                # Define block elements that should not have whitespace between them
-                block_elements = {
-                    "p",
-                    "ul",
-                    "ol",
-                    "div",
-                    "blockquote",
-                    "pre",
-                    "h1",
-                    "h2",
-                    "h3",
-                    "h4",
-                    "h5",
-                    "h6",
-                    "table",
-                    "dl",
-                    "hr",
-                    "figure",
-                    "article",
-                    "section",
-                    "nav",
-                    "aside",
-                    "header",
-                    "footer",
-                    "main",
-                    "form",
-                    "fieldset",
-                }
-                prev_sibling = el.previous_sibling
-                next_sibling = el.next_sibling
-                # Check if whitespace is between block elements
-                if (
-                    prev_sibling
-                    and hasattr(prev_sibling, "name")
-                    and prev_sibling.name in block_elements
-                    and next_sibling
-                    and hasattr(next_sibling, "name")
-                    and next_sibling.name in block_elements
-                ):
-                    # Remove whitespace between block elements
-                    text = ""
-                else:
-                    # Otherwise it's inline whitespace, normalize to single space
-                    text = " " if text else ""
-        else:
-            has_leading_space = text.startswith((" ", "\t"))
-            has_trailing_space = text.endswith((" ", "\t"))
-            middle_content = (
-                text[1:-1]
-                if has_leading_space and has_trailing_space
-                else text[1:]
-                if has_leading_space
-                else text[:-1]
-                if has_trailing_space
-                else text
-            )
+    in_pre = bool(ancestor_names.intersection({"pre"}))
-            middle_content = whitespace_re.sub(" ", middle_content.strip())
-            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+    text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
@@ -369,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
 def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
-    """Get set of ancestor tag names for efficient parent checking."""
     elem_id = id(element)
     cache = _ancestor_cache.get()
     if cache is None:
@@ -400,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
 def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
-    """Check if element has any of the specified ancestors efficiently."""
     if isinstance(tag_names, str):
         tag_names = [tag_names]
@@ -426,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
 def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
-    """Extract metadata from HTML document.
-    Args:
-        soup: BeautifulSoup instance of the HTML document.
-    Returns:
-        Dictionary of metadata key-value pairs.
-    """
     metadata = {}
     title_tag = soup.find("title")
@@ -470,7 +399,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
-    # Extract link relations
     link_relations = {"author", "license", "alternate"}
     for rel_type in link_relations:
         link = soup.find("link", rel=rel_type, href=True)
@@ -481,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
 def _format_metadata_comment(metadata: dict[str, str]) -> str:
-    """Format metadata as a Markdown comment block.
-    Args:
-        metadata: Dictionary of metadata key-value pairs.
-    Returns:
-        Formatted metadata comment block.
-    """
     if not metadata:
         return ""
@@ -524,64 +444,87 @@ def convert_to_markdown(
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
+    list_indent_type: Literal["spaces", "tabs"] = "spaces",
+    list_indent_width: int = 4,
     newline_style: Literal["spaces", "backslash"] = SPACES,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_forms: bool = True,
+    remove_navigation: bool = True,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
+    whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
     wrap: bool = False,
     wrap_width: int = 80,
-    preprocess_html: bool = False,
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
-    remove_navigation: bool = True,
-    remove_forms: bool = True,
 ) -> str:
-    """Convert HTML to Markdown.
+    """Convert HTML content to Markdown format.
-    Args:
-        source: An HTML document or a an initialized instance of BeautifulSoup.
-        stream_processing: Use streaming processing for large documents. Defaults to False.
-        chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
-        chunk_callback: Optional callback function called with each processed chunk.
-        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
-        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
-                Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
-        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
-        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
-        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
-        code_language_callback: Function to dynamically determine the language for code blocks.
-        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
-        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
-        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
-        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
-        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
-        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
-        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
-        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
-        heading_style: The style to use for Markdown headings. Defaults to "underlined".
-        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
-        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
-        newline_style: Style for handling newlines in text content. Defaults to "spaces".
-        strip: Tags to strip from the output. Defaults to None.
-        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
-        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
-        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
-        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
-        wrap: Wrap text to the specified width. Defaults to False.
-        wrap_width: The number of characters at which to wrap text. Defaults to 80.
-        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
-        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
-        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
-        remove_forms: Remove form elements during preprocessing. Defaults to True.
+    This is the main entry point for converting HTML to Markdown. It supports
+    various customization options for controlling the conversion behavior.
-    Raises:
-        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
-        EmptyHtmlError: When the input HTML is empty.
-        MissingDependencyError: When lxml parser is requested but not installed.
+    Args:
+        source: HTML string or BeautifulSoup object to convert.
+        stream_processing: Enable streaming mode for large documents.
+        chunk_size: Size of chunks for streaming processing.
+        chunk_callback: Callback for processing chunks in streaming mode.
+        progress_callback: Callback for progress updates (current, total).
+        parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
+        autolinks: Convert URLs to automatic links.
+        bullets: Characters to use for unordered list bullets.
+        code_language: Default language for code blocks.
+        code_language_callback: Callback to determine code language from element.
+        convert: HTML tags to convert to Markdown.
+        convert_as_inline: Treat block elements as inline during conversion.
+        custom_converters: Custom converters for specific HTML elements.
+        default_title: Add a default title if none exists.
+        escape_asterisks: Escape asterisk characters in text.
+        escape_misc: Escape miscellaneous Markdown characters.
+        escape_underscores: Escape underscore characters in text.
+        extract_metadata: Extract metadata from HTML head.
+        heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
+        highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
+        keep_inline_images_in: Parent tags where images should remain inline.
+        list_indent_type: Type of indentation for lists ('spaces', 'tabs').
+        list_indent_width: Number of spaces for list indentation.
+        newline_style: Style for newlines ('spaces', 'backslash').
+        preprocess_html: Enable HTML preprocessing to clean up content.
+        preprocessing_preset: Preprocessing aggressiveness level.
+        remove_forms: Remove form elements during preprocessing.
+        remove_navigation: Remove navigation elements during preprocessing.
+        strip: HTML tags to strip from output.
+        strip_newlines: Remove newlines from HTML before processing.
+        strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
+        sub_symbol: Symbol for subscript text.
+        sup_symbol: Symbol for superscript text.
+        whitespace_mode: How to handle whitespace ('normalized', 'strict').
+        wrap: Enable text wrapping.
+        wrap_width: Column width for text wrapping.
     Returns:
-        str: A string of Markdown-formatted text converted from the given HTML.
+        The converted Markdown string.
+    Raises:
+        EmptyHtmlError: If the HTML input is empty.
+        MissingDependencyError: If required dependencies are not installed.
+        ConflictingOptionsError: If conflicting options are provided.
+    Examples:
+        Basic conversion:
+        >>> html = "<h1>Title</h1><p>Content</p>"
+        >>> convert_to_markdown(html)
+        'Title\\n=====\\n\\nContent\\n\\n'
+        With custom options:
+        >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
+        '# Title\\n\\nContent\\n\\n'
+        Discord-compatible lists (2-space indent):
+        >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
+        >>> convert_to_markdown(html, list_indent_width=2)
+        '* Item 1\\n* Item 2\\n\\n'
     """
     if isinstance(source, str):
         if (
@@ -595,8 +538,6 @@ def convert_to_markdown(
         if strip_newlines:
             source = source.replace("\n", " ").replace("\r", " ")
-        # Fix lxml parsing of void elements like <wbr>
-        # lxml incorrectly treats them as container tags
         source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
         if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
@@ -680,6 +621,7 @@ def convert_to_markdown(
             sup_symbol=sup_symbol,
             wrap=wrap,
             wrap_width=wrap_width,
+            whitespace_mode=whitespace_mode,
         ):
             if chunk_callback:
                 chunk_callback(chunk)
@@ -696,9 +638,12 @@ def convert_to_markdown(
     sink = StringSink()
+    whitespace_handler = WhitespaceHandler(whitespace_mode)
     _process_html_core(
         source,
         sink,
+        whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
         bullets=bullets,
@@ -715,6 +660,8 @@ def convert_to_markdown(
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
+        list_indent_type=list_indent_type,
+        list_indent_width=list_indent_width,
         newline_style=newline_style,
         strip=strip,
         strip_newlines=strip_newlines,
@@ -737,7 +684,6 @@ def convert_to_markdown(
         if leading_whitespace_match:
             leading_whitespace = leading_whitespace_match.group(0)
-            # Check if input contains list or heading tags
             list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
             if any(tag in original_input for tag in list_heading_tags):
                 leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
@@ -751,19 +697,14 @@ def convert_to_markdown(
     def normalize_spaces_outside_code(text: str) -> str:
         parts = text.split("```")
         for i in range(0, len(parts), 2):
-            # Process each line separately to preserve leading spaces
             lines = parts[i].split("\n")
             processed_lines = []
             for line in lines:
-                # Preserve definition list formatting (: followed by 3 spaces)
                 def_parts = re.split(r"(:\s{3})", line)
                 for j in range(0, len(def_parts), 2):
-                    # Only normalize non-definition-list parts
-                    # Also preserve leading spaces (for list indentation)
                     match = re.match(r"^(\s*)(.*)", def_parts[j])
                     if match:
                         leading_spaces, rest = match.groups()
-                        # Only normalize multiple spaces that are not at the beginning
                         rest = re.sub(r" {3,}", " ", rest)
                         def_parts[j] = leading_spaces + rest
                 processed_lines.append("".join(def_parts))
@@ -782,34 +723,25 @@ def convert_to_markdown(
 class OutputSink:
-    """Abstract output sink for processed markdown text."""
     def write(self, text: str) -> None:
-        """Write text to the sink."""
         raise NotImplementedError
     def finalize(self) -> None:
-        """Finalize the output."""
+        pass
 class StringSink(OutputSink):
-    """Collects all output into a single string."""
     def __init__(self) -> None:
         self.buffer = StringIO()
     def write(self, text: str) -> None:
-        """Write text to the buffer."""
         self.buffer.write(text)
     def get_result(self) -> str:
-        """Get the complete result string."""
         return self.buffer.getvalue()
 class StreamingSink(OutputSink):
-    """Yields chunks of output for streaming processing."""
     def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
         self.chunk_size = chunk_size
         self.progress_callback = progress_callback
@@ -820,7 +752,6 @@ class StreamingSink(OutputSink):
         self.chunks: list[str] = []
     def write(self, text: str) -> None:
-        """Write text and yield chunks when threshold is reached."""
         if not text:
             return
@@ -843,7 +774,6 @@ class StreamingSink(OutputSink):
         self.buffer_size = len(current_content)
     def finalize(self) -> None:
-        """Finalize and yield any remaining content."""
         if self.buffer_size > 0:
             content = self.buffer.getvalue()
             self.chunks.append(content)
@@ -851,11 +781,9 @@ class StreamingSink(OutputSink):
             self._update_progress()
     def get_chunks(self) -> Generator[str, None, None]:
-        """Get all chunks yielded during processing."""
         yield from self.chunks
     def _find_split_position(self, content: str) -> int:
-        """Find optimal position to split content for chunks."""
         target = self.chunk_size
         lookahead = min(100, len(content) - target)
@@ -868,7 +796,6 @@ class StreamingSink(OutputSink):
         return min(target, len(content))
     def _update_progress(self) -> None:
-        """Update progress if callback is provided."""
         if self.progress_callback:
             self.progress_callback(self.processed_bytes, self.total_bytes)
@@ -877,6 +804,7 @@ def _process_html_core(
     source: str | BeautifulSoup,
     sink: OutputSink,
     *,
+    whitespace_handler: WhitespaceHandler,
     parser: str | None = None,
     autolinks: bool,
     bullets: str,
@@ -893,6 +821,8 @@ def _process_html_core(
     heading_style: Literal["underlined", "atx", "atx_closed"],
     highlight_style: Literal["double-equal", "html", "bold"],
     keep_inline_images_in: Iterable[str] | None,
+    list_indent_type: str,
+    list_indent_width: int,
     newline_style: Literal["spaces", "backslash"],
     strip: str | Iterable[str] | None,
     strip_newlines: bool,
@@ -902,7 +832,6 @@ def _process_html_core(
     wrap: bool,
     wrap_width: int,
 ) -> None:
-    """Core HTML to Markdown processing logic shared by both regular and streaming."""
     token = _ancestor_cache.set({})
     try:
@@ -942,6 +871,8 @@ def _process_html_core(
             heading_style=heading_style,
             highlight_style=highlight_style,
             keep_inline_images_in=keep_inline_images_in,
+            list_indent_type=list_indent_type,
+            list_indent_width=list_indent_width,
             newline_style=newline_style,
             strong_em_symbol=strong_em_symbol,
             sub_symbol=sub_symbol,
@@ -969,6 +900,7 @@ def _process_html_core(
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
+                    whitespace_handler=whitespace_handler,
                 )
                 sink.write(text)
                 context += text
@@ -982,6 +914,7 @@ def _process_html_core(
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
                     strip=_as_optional_set(strip),
+                    whitespace_handler=whitespace_handler,
                     context_before=context[-2:],
                 )
                 sink.write(text)
@@ -1013,54 +946,18 @@ def convert_to_markdown_stream(
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
+    list_indent_type: Literal["spaces", "tabs"] = "spaces",
+    list_indent_width: int = 4,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
+    whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
     wrap: bool = False,
     wrap_width: int = 80,
 ) -> Generator[str, None, None]:
-    """Convert HTML to Markdown using streaming/chunked processing.
-    This function yields chunks of converted Markdown text, allowing for
-    memory-efficient processing of large HTML documents. The output is guaranteed
-    to be identical to convert_to_markdown().
-    Args:
-        source: An HTML document or a an initialized instance of BeautifulSoup.
-        chunk_size: Size of chunks to yield (approximate, in characters).
-        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
-        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
-                Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
-        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
-        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
-        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
-        code_language_callback: Function to dynamically determine the language for code blocks.
-        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
-        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
-        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
-        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
-        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
-        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
-        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
-        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
-        heading_style: The style to use for Markdown headings. Defaults to "underlined".
-        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
-        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
-        newline_style: Style for handling newlines in text content. Defaults to "spaces".
-        strip: Tags to strip from the output. Defaults to None.
-        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
-        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
-        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
-        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
-        wrap: Wrap text to the specified width. Defaults to False.
-        wrap_width: The number of characters at which to wrap text. Defaults to 80.
-    Yields:
-        str: Chunks of Markdown-formatted text.
-    """
     sink = StreamingSink(chunk_size, progress_callback)
     if isinstance(source, str):
@@ -1068,9 +965,12 @@ def convert_to_markdown_stream(
     elif isinstance(source, BeautifulSoup):
         sink.total_bytes = len(str(source))
+    whitespace_handler = WhitespaceHandler(whitespace_mode)
     _process_html_core(
         source,
         sink,
+        whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
         bullets=bullets,
@@ -1087,6 +987,8 @@ def convert_to_markdown_stream(
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
+        list_indent_type=list_indent_type,
+        list_indent_width=list_indent_width,
         newline_style=newline_style,
         strip=strip,
         strip_newlines=strip_newlines,

html_to_markdown/utils.py CHANGED Viewed

@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
 def chomp(text: str) -> tuple[str, str, str]:
-    """Simplified whitespace handling for inline elements.
-    For semantic markdown output, preserves leading/trailing spaces as single spaces
-    and normalizes internal whitespace.
-    Args:
-        text: The text to chomp.
-    Returns:
-        A tuple containing the prefix, suffix, and the normalized text.
-    """
     if not text:
         return "", "", ""
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
 def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
-    """Escape special characters in text.
-    Args:
-        text: The text to escape.
-        escape_misc: Whether to escape miscellaneous characters.
-        escape_asterisks: Whether to escape asterisks.
-        escape_underscores: Whether to escape underscores.
-    Returns:
-        The escaped text.
-    """
     if not text:
         return ""
     if escape_misc:
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
     return text
-def indent(*, text: str, level: int) -> str:
-    """Indent text by a given level.
-    Args:
-        text: The text to indent.
-        level: The level of indentation.
-    Returns:
-        The indented text.
-    """
-    return line_beginning_re.sub("\t" * level, text) if text else ""
+def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
+    return line_beginning_re.sub(indent_str * level, text) if text else ""
 def underline(*, text: str, pad_char: str) -> str:
-    """Underline text with a given character.
-    Args:
-        text: The text to underline.
-        pad_char: The character to use for underlining.
-    Returns:
-        The underlined text.
-    """
     text = (text or "").rstrip()
     return f"{text}\n{pad_char * len(text)}\n\n" if text else ""

html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl