PyPI - html-to-markdown - Versions diffs - 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl - Mend

html-to-markdown 1.12.0py3-none-any.whl → 1.13.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (9) hide show

html_to_markdown/converters.py CHANGED Viewed

@@ -39,7 +39,6 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
 def _find_list_item_ancestor(tag: Tag) -> Tag | None:
-    """Find the nearest list item ancestor of a tag."""
     parent = tag.parent
     while parent and parent.name != "li":
         parent = parent.parent
@@ -231,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
     return quote_text
-def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
+def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
     if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
-        return " "
+        return " " + text.strip()
     _ = convert_as_inline
-    return "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
+    newline = "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
+    return newline + text.strip() if text.strip() else newline
 def _convert_hn(
@@ -286,7 +286,6 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
 def _has_block_list_items(tag: Tag) -> bool:
-    """Check if any list items contain block elements."""
     return any(
         any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
         for li in tag.find_all("li", recursive=False)
@@ -294,7 +293,6 @@ def _has_block_list_items(tag: Tag) -> bool:
 def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
-    """Handle indentation for lists nested within list items."""
     prev_p = None
     for child in parent.children:
         if hasattr(child, "name"):
@@ -310,7 +308,6 @@ def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag
 def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
-    """Handle indentation for lists that are direct children of other lists."""
     lines = text.strip().split("\n")
     indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
     result = "\n".join(indented_lines)
@@ -318,7 +315,6 @@ def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> s
 def _add_list_item_spacing(text: str) -> str:
-    """Add extra spacing between list items that contain block content."""
     lines = text.split("\n")
     items_with_blocks = set()
@@ -418,7 +414,10 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
             return "".join(result_parts)
-    return "{} {}\n".format(bullet, (text or "").strip())
+    # Ensure consistent whitespace handling for list items, especially with strip_newlines=True
+    # Strip any leading whitespace that may have been inherited from parent containers
+    clean_text = (text or "").strip()
+    return f"{bullet} {clean_text}\n"
 def _convert_p(
@@ -482,7 +481,6 @@ def _convert_pre(
 def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
-    """Process table cell content, optionally using <br> tags for multi-line content."""
     if br_in_tables:
         block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
@@ -510,7 +508,6 @@ def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
 def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
-    """Get positions of cells with rowspan > 1 from previous row."""
     rowspan_positions = []
     col_pos = 0
@@ -531,7 +528,6 @@ def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
 def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
-    """Handle text adjustment for rows with rowspan cells."""
     converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
     rowspan_set = set(rowspan_positions)
@@ -542,7 +538,6 @@ def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int)
 def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
-    """Determine if this table row should be treated as a header row."""
     return (
         all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
         or (not tag.previous_sibling and parent_name != "tbody")
@@ -555,7 +550,6 @@ def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_paren
 def _calculate_total_colspan(cells: list[Tag]) -> int:
-    """Calculate total colspan for all cells in a row."""
     full_colspan = 0
     for cell in cells:
         if hasattr(cell, "attrs") and "colspan" in cell.attrs:

html_to_markdown/processing.py CHANGED Viewed

@@ -11,7 +11,7 @@ from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Literal, cast
-from bs4 import BeautifulSoup, Comment, Doctype, Tag
+from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
 try:
@@ -179,6 +179,7 @@ def _process_tag(
     strip: set[str] | None,
     whitespace_handler: WhitespaceHandler,
     context_before: str = "",
+    ancestor_names: set[str] | None = None,
 ) -> str:
     should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
     tag_name: SupportedTag | None = (
@@ -186,6 +187,17 @@ def _process_tag(
     )
     text_parts: list[str] = []
+    if ancestor_names is None:
+        ancestor_names = set()
+        current: Tag | None = tag
+        while current and hasattr(current, "name"):
+            if current.name:
+                ancestor_names.add(current.name)
+            current = getattr(current, "parent", None)
+            if len(ancestor_names) > 10:
+                break
     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
     convert_children_as_inline = convert_as_inline or is_heading or is_cell
@@ -201,7 +213,7 @@ def _process_tag(
             if can_extract and isinstance(el, NavigableString) and not el.strip():
                 el.extract()
-    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), tag.children))
     empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
@@ -227,6 +239,7 @@ def _process_tag(
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
                     whitespace_handler=whitespace_handler,
+                    ancestor_names=ancestor_names,
                 )
             )
         elif isinstance(el, Tag):
@@ -243,6 +256,7 @@ def _process_tag(
                     strip=strip,
                     whitespace_handler=whitespace_handler,
                     context_before=(context_before + current_text)[-2:],
+                    ancestor_names=ancestor_names,
                 )
             )
@@ -282,21 +296,23 @@ def _process_text(
     escape_asterisks: bool,
     escape_underscores: bool,
     whitespace_handler: WhitespaceHandler,
+    ancestor_names: set[str] | None = None,
 ) -> str:
     text = str(el) or ""
     parent = el.parent
     parent_name = parent.name if parent else None
-    ancestor_names = set()
-    current = parent
-    while current and hasattr(current, "name"):
-        if current.name:
-            ancestor_names.add(current.name)
-        current = getattr(current, "parent", None)
+    if ancestor_names is None:
+        ancestor_names = set()
+        current = parent
+        while current and hasattr(current, "name"):
+            if current.name:
+                ancestor_names.add(current.name)
+            current = getattr(current, "parent", None)
-        if len(ancestor_names) > 10:
-            break
+            if len(ancestor_names) > 10:
+                break
     in_pre = bool(ancestor_names.intersection({"pre"}))
@@ -469,7 +485,6 @@ def convert_to_markdown(
     wrap_width: int = 80,
 ) -> str:
     """Convert HTML content to Markdown format.
     This is the main entry point for converting HTML to Markdown. It supports
     various customization options for controlling the conversion behavior.
@@ -525,17 +540,21 @@ def convert_to_markdown(
         >>> html = "<h1>Title</h1><p>Content</p>"
         >>> convert_to_markdown(html)
         'Title\\n=====\\n\\nContent\\n\\n'
         With custom options:
         >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
         '# Title\\n\\nContent\\n\\n'
         Discord-compatible lists (2-space indent):
         >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
         >>> convert_to_markdown(html, list_indent_width=2)
         '* Item 1\\n* Item 2\\n\\n'
     """
+    # Initialize original input string for Windows lxml fix
+    original_input_str = None
     if isinstance(source, str):
+        # Store original string for plain text detection (Windows lxml fix)
+        original_input_str = source
         if (
             heading_style == UNDERLINED
             and "Header" in source
@@ -684,23 +703,33 @@ def convert_to_markdown(
     result = sink.get_result()
-    if (
-        "needs_leading_whitespace_fix" in locals()
-        and needs_leading_whitespace_fix
-        and not result.startswith((" ", "\t", "\n", "\r"))
-    ):
+    # Parser-agnostic behavior: handle leading whitespace differences between parsers
+    # lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
+    if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
         original_input = sink.original_source if hasattr(sink, "original_source") else original_source
-        leading_whitespace_match = re.match(r"^[\s]*", original_input)
-        if leading_whitespace_match:
-            leading_whitespace = leading_whitespace_match.group(0)
-            list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
-            if any(tag in original_input for tag in list_heading_tags):
-                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
-                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
+        if isinstance(original_input, str):
+            original_leading_whitespace_match = re.match(r"^[\s]*", original_input)
+            original_leading_whitespace = (
+                original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
+            )
-            if leading_whitespace:
-                result = leading_whitespace + result
+            # Case 1: lxml added leading newlines (like "\n<figure>") - strip them
+            if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
+                result = result.lstrip("\n\r")
+            # Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
+            # However, don't restore whitespace if strip_newlines=True was used, as the user
+            # explicitly requested to remove formatting whitespace
+            elif (
+                not strip_newlines
+                and not result.startswith((" ", "\t"))
+                and original_leading_whitespace.startswith((" ", "\t"))
+            ):
+                # Only restore spaces/tabs, not newlines (which are usually formatting)
+                leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
+                leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
+                if leading_spaces_tabs:
+                    result = leading_spaces_tabs + result
     result = re.sub(r"\n{3,}", "\n\n", result)
@@ -729,6 +758,46 @@ def convert_to_markdown(
     if convert_as_inline:
         result = result.rstrip("\n")
+    # Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
+    # This ensures consistent behavior across platforms when processing plain text
+    # Only apply to cases where lxml adds extra newlines (\n\n) at the end
+    if (
+        "original_input_str" in locals()
+        and original_input_str
+        and not original_input_str.strip().startswith("<")
+        and not original_input_str.strip().endswith(">")
+        and result.endswith("\n\n")
+    ):
+        # Input appears to be plain text, not HTML - normalize trailing newlines only
+        result = result.rstrip("\n")
+    # If the original input contained no block-level elements, normalize any
+    # accidental trailing newlines for cross-platform consistency.
+    # This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
+    # and head-only documents (e.g., "<head>head</head>") where output should
+    # not end with extra blank lines.
+    if "original_input_str" in locals() and original_input_str:
+        from html_to_markdown.whitespace import BLOCK_ELEMENTS  # noqa: PLC0415
+        # Treat additional tags as block-producing for trailing newline purposes.
+        # These may be inline in HTML spec but produce block output in our Markdown conversion.
+        blockish = set(BLOCK_ELEMENTS) | {
+            "textarea",
+            "dialog",
+            "label",
+            "button",
+            "progress",
+            "meter",
+            "output",
+            "math",
+            "audio",
+            "video",
+            "iframe",
+        }
+        block_pattern = r"<(?:" + "|".join(sorted(blockish)) + r")\b"
+        if not re.search(block_pattern, original_input_str, flags=re.IGNORECASE):
+            result = result.rstrip("\n")
     return result
@@ -896,7 +965,7 @@ def _process_html_core(
         elements_to_process = body.children if body and isinstance(body, Tag) else source.children
         context = ""
-        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), elements_to_process):
             if isinstance(el, NavigableString):
                 text = _process_text(
                     el=el,

html_to_markdown/whitespace.py CHANGED Viewed

@@ -6,8 +6,10 @@ import re
 import unicodedata
 from typing import TYPE_CHECKING, Literal
+from bs4.element import NavigableString
 if TYPE_CHECKING:
-    from bs4 import NavigableString, PageElement
+    from bs4 import PageElement
 WhitespaceMode = Literal["normalized", "strict"]
@@ -128,6 +130,8 @@ class WhitespaceHandler:
     def normalize_unicode_spaces(self, text: str) -> str:
         text = self._unicode_spaces.sub(" ", text)
+        text = text.replace("\r\n", "\n")
         normalized = []
         for char in text:
             if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
@@ -250,12 +254,22 @@ class WhitespaceHandler:
         has_leading = (
             has_lead_space
             and original[0] == " "
-            and (self.is_inline_element(prev_sibling) or self.is_block_element(prev_sibling) or prev_sibling is None)
+            and (
+                self.is_inline_element(prev_sibling)
+                or self.is_block_element(prev_sibling)
+                or prev_sibling is None
+                or isinstance(prev_sibling, NavigableString)
+            )
         )
         has_trailing = (
             has_trail_space
             and original[-1] == " "
-            and (self.is_inline_element(next_sibling) or self.is_block_element(next_sibling) or next_sibling is None)
+            and (
+                self.is_inline_element(next_sibling)
+                or self.is_block_element(next_sibling)
+                or next_sibling is None
+                or isinstance(next_sibling, NavigableString)
+            )
         )
         if original and original[0] in "\n\t" and self.is_inline_element(prev_sibling):

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.12.0
+Version: 1.13.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -348,6 +348,50 @@ def show_progress(processed: int, total: int):
 markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
 ```
+#### When to Use Streaming vs Regular Processing
+Based on comprehensive performance analysis, here are our recommendations:
+**📄 Use Regular Processing When:**
+- Files < 100KB (simplicity preferred)
+- Simple scripts and one-off conversions
+- Memory is not a concern
+- You want the simplest API
+**🌊 Use Streaming Processing When:**
+- Files > 100KB (memory efficiency)
+- Processing many files in batch
+- Memory is constrained
+- You need progress reporting
+- You want to process results incrementally
+- Running in production environments
+**📋 Specific Recommendations by File Size:**
+| File Size  | Recommendation                                  | Reason                                 |
+| ---------- | ----------------------------------------------- | -------------------------------------- |
+| < 50KB     | Regular (simplicity) or Streaming (3-5% faster) | Either works well                      |
+| 50KB-100KB | Either (streaming slightly preferred)           | Minimal difference                     |
+| 100KB-1MB  | Streaming preferred                             | Better performance + memory efficiency |
+| > 1MB      | Streaming strongly recommended                  | Significant memory advantages          |
+**🔧 Configuration Recommendations:**
+- **Default chunk_size: 2048 bytes** (optimal performance balance)
+- **For very large files (>10MB)**: Consider `chunk_size=4096`
+- **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
+**📈 Performance Benefits:**
+Streaming provides consistent **3-5% performance improvement** across all file sizes:
+- **Streaming throughput**: ~0.47-0.48 MB/s
+- **Regular throughput**: ~0.44-0.47 MB/s
+- **Memory usage**: Streaming uses less peak memory for large files
+- **Latency**: Streaming allows processing results before completion
 ### Preprocessing API
 The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/RECORD RENAMED Viewed

@@ -2,16 +2,16 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
 html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
 html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
 html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
-html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
+html_to_markdown/converters.py,sha256=l4ZtIhfOdemvaApRjH7qmzHrWNF3PDlBzsT1LRw3n0Y,36022
 html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
 html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
-html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
+html_to_markdown/processing.py,sha256=SjVStbriaOb24ZwCcRp8eqOJ1p5bIVxpCXSMW3vQojs,38059
 html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
-html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
-html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
-html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.12.0.dist-info/RECORD,,
+html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
+html_to_markdown-1.13.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.13.0.dist-info/METADATA,sha256=CIfFx5C69D3lFg3wgajZnMRmQV-7C78ga2zbXKcxcsc,22694
+html_to_markdown-1.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.13.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.13.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.13.0.dist-info/RECORD,,

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.12.0py3-none-any.whl → 1.13.0py3-none-any.whl