PyPI - html-to-markdown - Versions diffs - 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl - Mend

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (8) hide show

html_to_markdown/processing.py CHANGED Viewed

@@ -3,13 +3,13 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
-    from collections.abc import Generator, Mapping
+    from collections.abc import Callable, Generator, Mapping
 import re
 from contextvars import ContextVar
 from io import StringIO
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
@@ -176,7 +176,7 @@ def _process_tag(
     tag_name: SupportedTag | None = (
         cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
     )
-    text = ""
+    text_parts: list[str] = []
     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
@@ -193,27 +193,56 @@ def _process_tag(
             if can_extract and isinstance(el, NavigableString) and not el.strip():
                 el.extract()
-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children):
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+    # List of tags that return empty string when they have no content
+    empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
+    for i, el in enumerate(children):
         if isinstance(el, NavigableString):
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
+            # Check if this is whitespace between empty elements
+            if el.strip() == "" and i > 0 and i < len(children) - 1:
+                prev_el = children[i - 1]
+                next_el = children[i + 1]
+                # If previous element was a tag that produced empty output
+                # and next element is also a tag that could be empty, skip this whitespace
+                if (
+                    isinstance(prev_el, Tag)
+                    and isinstance(next_el, Tag)
+                    and prev_el.name.lower() in empty_when_no_content_tags
+                    and next_el.name.lower() in empty_when_no_content_tags
+                    and not prev_el.get_text().strip()
+                ):
+                    # Previous tag is empty and next could be empty too, skip this whitespace
+                    continue
+            text_parts.append(
+                _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
             )
         elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_children_as_inline,
-                convert=convert,
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=strip,
-                context_before=(context_before + text)[-2:],
+            current_text = "".join(text_parts)
+            text_parts.append(
+                _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_children_as_inline,
+                    convert=convert,
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=strip,
+                    context_before=(context_before + current_text)[-2:],
+                )
             )
+    text = "".join(text_parts)
     if tag_name and should_convert_tag:
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
@@ -252,22 +281,75 @@ def _process_text(
             break
     if "pre" not in ancestor_names:
-        has_leading_space = text.startswith((" ", "\t"))
-        has_trailing_space = text.endswith((" ", "\t"))
-        middle_content = (
-            text[1:-1]
-            if has_leading_space and has_trailing_space
-            else text[1:]
-            if has_leading_space
-            else text[:-1]
-            if has_trailing_space
-            else text
-        )
+        # Special case: if the text is only whitespace
+        if text.strip() == "":
+            # If it contains newlines, it's probably indentation whitespace, return empty
+            if "\n" in text:
+                text = ""
+            else:
+                # Check if this whitespace is between block elements
+                # Define block elements that should not have whitespace between them
+                block_elements = {
+                    "p",
+                    "ul",
+                    "ol",
+                    "div",
+                    "blockquote",
+                    "pre",
+                    "h1",
+                    "h2",
+                    "h3",
+                    "h4",
+                    "h5",
+                    "h6",
+                    "table",
+                    "dl",
+                    "hr",
+                    "figure",
+                    "article",
+                    "section",
+                    "nav",
+                    "aside",
+                    "header",
+                    "footer",
+                    "main",
+                    "form",
+                    "fieldset",
+                }
+                prev_sibling = el.previous_sibling
+                next_sibling = el.next_sibling
+                # Check if whitespace is between block elements
+                if (
+                    prev_sibling
+                    and hasattr(prev_sibling, "name")
+                    and prev_sibling.name in block_elements
+                    and next_sibling
+                    and hasattr(next_sibling, "name")
+                    and next_sibling.name in block_elements
+                ):
+                    # Remove whitespace between block elements
+                    text = ""
+                else:
+                    # Otherwise it's inline whitespace, normalize to single space
+                    text = " " if text else ""
+        else:
+            has_leading_space = text.startswith((" ", "\t"))
+            has_trailing_space = text.endswith((" ", "\t"))
+            middle_content = (
+                text[1:-1]
+                if has_leading_space and has_trailing_space
+                else text[1:]
+                if has_leading_space
+                else text[:-1]
+                if has_trailing_space
+                else text
+            )
-        middle_content = whitespace_re.sub(" ", middle_content.strip())
-        text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+            middle_content = whitespace_re.sub(" ", middle_content.strip())
+            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
@@ -388,7 +470,9 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
-    for rel_type in ["author", "license", "alternate"]:
+    # Extract link relations
+    link_relations = {"author", "license", "alternate"}
+    for rel_type in link_relations:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
             metadata[f"link-{rel_type}"] = link["href"]
@@ -653,7 +737,9 @@ def convert_to_markdown(
         if leading_whitespace_match:
             leading_whitespace = leading_whitespace_match.group(0)
-            if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
+            # Check if input contains list or heading tags
+            list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
+            if any(tag in original_input for tag in list_heading_tags):
                 leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
                 leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
@@ -665,13 +751,23 @@ def convert_to_markdown(
     def normalize_spaces_outside_code(text: str) -> str:
         parts = text.split("```")
         for i in range(0, len(parts), 2):
-            # Preserve definition list formatting (: followed by 3 spaces)
-            # Split by definition list patterns to preserve them
-            def_parts = re.split(r"(:\s{3})", parts[i])
-            for j in range(0, len(def_parts), 2):
-                # Only normalize non-definition-list parts
-                def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
-            parts[i] = "".join(def_parts)
+            # Process each line separately to preserve leading spaces
+            lines = parts[i].split("\n")
+            processed_lines = []
+            for line in lines:
+                # Preserve definition list formatting (: followed by 3 spaces)
+                def_parts = re.split(r"(:\s{3})", line)
+                for j in range(0, len(def_parts), 2):
+                    # Only normalize non-definition-list parts
+                    # Also preserve leading spaces (for list indentation)
+                    match = re.match(r"^(\s*)(.*)", def_parts[j])
+                    if match:
+                        leading_spaces, rest = match.groups()
+                        # Only normalize multiple spaces that are not at the beginning
+                        rest = re.sub(r" {3,}", " ", rest)
+                        def_parts[j] = leading_spaces + rest
+                processed_lines.append("".join(def_parts))
+            parts[i] = "\n".join(processed_lines)
         return "```".join(parts)
     result = normalize_spaces_outside_code(result)

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.8.0
+Version: 1.9.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -28,13 +27,13 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Text Processing :: Markup :: Markdown
 Classifier: Topic :: Utilities
 Classifier: Typing :: Typed
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4>=4.13.4
-Requires-Dist: nh3>=0.2.21
+Requires-Dist: nh3>=0.3
 Provides-Extra: lxml
-Requires-Dist: lxml>=5; extra == "lxml"
+Requires-Dist: lxml>=6; extra == "lxml"
 Dynamic: license-file
 # html-to-markdown
@@ -46,6 +45,7 @@ Python 3.9+.
 ## Features
 - **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
+- **Enhanced Table Support**: Advanced handling of merged cells with rowspan/colspan support for better table representation
 - **Type Safety**: Strict MyPy adherence with comprehensive type hints
 - **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
 - **Streaming Support**: Memory-efficient processing for large documents with progress callbacks
@@ -55,7 +55,7 @@ Python 3.9+.
 - **CLI Tool**: Full-featured command-line interface with all API options exposed
 - **Custom Converters**: Extensible converter system for custom HTML tag handling
 - **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
-- **Extensive Test Coverage**: 100% test coverage requirement with comprehensive test suite
+- **Comprehensive Test Coverage**: 91%+ test coverage with 623+ comprehensive tests
 ## Installation
@@ -203,6 +203,51 @@ print(markdown)
 Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
+### Enhanced Table Support
+The library now provides better handling of complex tables with merged cells:
+```python
+from html_to_markdown import convert_to_markdown
+# HTML table with merged cells
+html = """
+<table>
+    <tr>
+        <th rowspan="2">Category</th>
+        <th colspan="2">Sales Data</th>
+    </tr>
+    <tr>
+        <th>Q1</th>
+        <th>Q2</th>
+    </tr>
+    <tr>
+        <td>Product A</td>
+        <td>$100K</td>
+        <td>$150K</td>
+    </tr>
+</table>
+"""
+markdown = convert_to_markdown(html)
+print(markdown)
+```
+Output:
+```markdown
+| Category | Sales Data |  |
+| --- | --- | --- |
+| | Q1 | Q2 |
+| Product A | $100K | $150K |
+```
+The library handles:
+- **Rowspan**: Inserts empty cells in subsequent rows
+- **Colspan**: Properly manages column spanning
+- **Clean output**: Removes `<colgroup>` and `<col>` elements that have no Markdown equivalent
 ### Key Configuration Options
 | Option              | Type | Default          | Description                                                     |
@@ -438,7 +483,9 @@ This library provides comprehensive support for all modern HTML5 elements:
 ### Table Elements
-- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`, `<col>`, `<colgroup>`
+- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`
+- **Merged cell support**: Handles `rowspan` and `colspan` attributes for complex table layouts
+- **Smart cleanup**: Automatically handles table styling elements for clean Markdown output
 ### Interactive Elements
@@ -457,16 +504,41 @@ This library provides comprehensive support for all modern HTML5 elements:
 - `<math>` (MathML support)
-## Breaking Changes (Major Version)
+## Advanced Table Support
+The library provides sophisticated handling of complex HTML tables, including merged cells and proper structure conversion:
+```python
+from html_to_markdown import convert_to_markdown
+# Complex table with merged cells
+html = """
+<table>
+    <caption>Sales Report</caption>
+    <tr>
+        <th rowspan="2">Product</th>
+        <th colspan="2">Quarterly Sales</th>
+    </tr>
+    <tr>
+        <th>Q1</th>
+        <th>Q2</th>
+    </tr>
+    <tr>
+        <td>Widget A</td>
+        <td>$50K</td>
+        <td>$75K</td>
+    </tr>
+</table>
+"""
+result = convert_to_markdown(html)
+```
-This version introduces several breaking changes for improved consistency and functionality:
+**Features:**
-1. **Enhanced Metadata Extraction**: Now enabled by default with comprehensive extraction of title, meta tags, and link relations
-1. **Improved Newline Handling**: Better normalization of excessive newlines (max 2 consecutive)
-1. **Extended HTML5 Support**: Added support for 40+ new HTML5 elements
-1. **Streaming API**: New streaming parameters for large document processing
-1. **Task List Support**: Automatic conversion of HTML checkboxes to GitHub-compatible task lists
-1. **Highlight Styles**: New `highlight_style` parameter with multiple options for `<mark>` elements
+- **Merged cell support**: Handles `rowspan` and `colspan` attributes intelligently
+- **Clean output**: Automatically removes table styling elements that don't translate to Markdown
+- **Structure preservation**: Maintains table hierarchy and relationships
 ## Acknowledgments

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/RECORD RENAMED Viewed

@@ -2,15 +2,15 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
 html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
 html_to_markdown/cli.py,sha256=8xlgSEcnqsSM_dr1TCSgPDAo09YvUtO78PvDFivFFdg,6973
 html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
-html_to_markdown/converters.py,sha256=COC2KqPelJlMCY5eXUS5gdiPOG8Yzx0U719FeXPw3GA,55514
+html_to_markdown/converters.py,sha256=ESOZQSW8qGAG1S9f_iDpPUirKIc9MGz_G0_rqbTCJ30,50018
 html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
 html_to_markdown/preprocessor.py,sha256=S4S1ZfLC_hkJVgmA5atImTyWQDOxfHctPbaep2QtyrQ,11248
-html_to_markdown/processing.py,sha256=wkbhLg42U3aeVQSZFuzGt5irtN037XzRKpCE71QYZXI,36520
+html_to_markdown/processing.py,sha256=iUVZfDG_QmFsY32O3mJZEuyxS2m8cjZaNnsstx2RkQo,40544
 html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 html_to_markdown/utils.py,sha256=QgWPzmpZKFd6wDTe8IY3gbVT3xNzoGV3PBgd17J0O-w,2066
-html_to_markdown-1.8.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.8.0.dist-info/METADATA,sha256=6pgiK4p0A77axLfD8MH1EGgzifP06koVV8KWS_5-iYk,17175
-html_to_markdown-1.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.8.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.8.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.8.0.dist-info/RECORD,,
+html_to_markdown-1.9.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.9.0.dist-info/METADATA,sha256=Rptd2quL9YEGi7Bmh-pgbdPGx-8Ud8EZeZZLQNIMEik,18450
+html_to_markdown-1.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.9.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.9.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.9.0.dist-info/RECORD,,

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl