PyPI - html-to-markdown - Versions diffs - 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl - Mend

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

html_to_markdown/processing.py CHANGED Viewed

@@ -3,13 +3,13 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
-    from collections.abc import Generator, Mapping
+    from collections.abc import Callable, Generator, Mapping
 import re
 from contextvars import ContextVar
 from io import StringIO
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
@@ -176,7 +176,7 @@ def _process_tag(
     tag_name: SupportedTag | None = (
         cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
     )
-    text = ""
+    text_parts: list[str] = []
     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
@@ -193,27 +193,51 @@ def _process_tag(
             if can_extract and isinstance(el, NavigableString) and not el.strip():
                 el.extract()
-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children):
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+    empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
+    for i, el in enumerate(children):
         if isinstance(el, NavigableString):
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
+            if el.strip() == "" and i > 0 and i < len(children) - 1:
+                prev_el = children[i - 1]
+                next_el = children[i + 1]
+                if (
+                    isinstance(prev_el, Tag)
+                    and isinstance(next_el, Tag)
+                    and prev_el.name.lower() in empty_when_no_content_tags
+                    and next_el.name.lower() in empty_when_no_content_tags
+                    and not prev_el.get_text().strip()
+                ):
+                    continue
+            text_parts.append(
+                _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
             )
         elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_children_as_inline,
-                convert=convert,
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=strip,
-                context_before=(context_before + text)[-2:],
+            current_text = "".join(text_parts)
+            text_parts.append(
+                _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_children_as_inline,
+                    convert=convert,
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=strip,
+                    context_before=(context_before + current_text)[-2:],
+                )
             )
+    text = "".join(text_parts)
     if tag_name and should_convert_tag:
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
@@ -252,22 +276,68 @@ def _process_text(
             break
     if "pre" not in ancestor_names:
-        has_leading_space = text.startswith((" ", "\t"))
-        has_trailing_space = text.endswith((" ", "\t"))
-        middle_content = (
-            text[1:-1]
-            if has_leading_space and has_trailing_space
-            else text[1:]
-            if has_leading_space
-            else text[:-1]
-            if has_trailing_space
-            else text
-        )
+        if text.strip() == "":
+            if "\n" in text:
+                text = ""
+            else:
+                block_elements = {
+                    "p",
+                    "ul",
+                    "ol",
+                    "div",
+                    "blockquote",
+                    "pre",
+                    "h1",
+                    "h2",
+                    "h3",
+                    "h4",
+                    "h5",
+                    "h6",
+                    "table",
+                    "dl",
+                    "hr",
+                    "figure",
+                    "article",
+                    "section",
+                    "nav",
+                    "aside",
+                    "header",
+                    "footer",
+                    "main",
+                    "form",
+                    "fieldset",
+                }
+                prev_sibling = el.previous_sibling
+                next_sibling = el.next_sibling
+                if (
+                    prev_sibling
+                    and hasattr(prev_sibling, "name")
+                    and prev_sibling.name in block_elements
+                    and next_sibling
+                    and hasattr(next_sibling, "name")
+                    and next_sibling.name in block_elements
+                ):
+                    text = ""
+                else:
+                    text = " " if text else ""
+        else:
+            has_leading_space = text.startswith((" ", "\t"))
+            has_trailing_space = text.endswith((" ", "\t"))
+            middle_content = (
+                text[1:-1]
+                if has_leading_space and has_trailing_space
+                else text[1:]
+                if has_leading_space
+                else text[:-1]
+                if has_trailing_space
+                else text
+            )
-        middle_content = whitespace_re.sub(" ", middle_content.strip())
-        text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+            middle_content = whitespace_re.sub(" ", middle_content.strip())
+            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
@@ -388,7 +458,8 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
-    for rel_type in ["author", "license", "alternate"]:
+    link_relations = {"author", "license", "alternate"}
+    for rel_type in link_relations:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
             metadata[f"link-{rel_type}"] = link["href"]
@@ -511,8 +582,6 @@ def convert_to_markdown(
         if strip_newlines:
             source = source.replace("\n", " ").replace("\r", " ")
-        # Fix lxml parsing of void elements like <wbr>
-        # lxml incorrectly treats them as container tags
         source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
         if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
@@ -653,7 +722,8 @@ def convert_to_markdown(
         if leading_whitespace_match:
             leading_whitespace = leading_whitespace_match.group(0)
-            if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
+            list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
+            if any(tag in original_input for tag in list_heading_tags):
                 leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
                 leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
@@ -665,13 +735,18 @@ def convert_to_markdown(
     def normalize_spaces_outside_code(text: str) -> str:
         parts = text.split("```")
         for i in range(0, len(parts), 2):
-            # Preserve definition list formatting (: followed by 3 spaces)
-            # Split by definition list patterns to preserve them
-            def_parts = re.split(r"(:\s{3})", parts[i])
-            for j in range(0, len(def_parts), 2):
-                # Only normalize non-definition-list parts
-                def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
-            parts[i] = "".join(def_parts)
+            lines = parts[i].split("\n")
+            processed_lines = []
+            for line in lines:
+                def_parts = re.split(r"(:\s{3})", line)
+                for j in range(0, len(def_parts), 2):
+                    match = re.match(r"^(\s*)(.*)", def_parts[j])
+                    if match:
+                        leading_spaces, rest = match.groups()
+                        rest = re.sub(r" {3,}", " ", rest)
+                        def_parts[j] = leading_spaces + rest
+                processed_lines.append("".join(def_parts))
+            parts[i] = "\n".join(processed_lines)
         return "```".join(parts)
     result = normalize_spaces_outside_code(result)

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.8.0
+Version: 1.9.1
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -28,13 +27,13 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Text Processing :: Markup :: Markdown
 Classifier: Topic :: Utilities
 Classifier: Typing :: Typed
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: beautifulsoup4>=4.13.4
-Requires-Dist: nh3>=0.2.21
+Requires-Dist: beautifulsoup4>=4.13.5
+Requires-Dist: nh3>=0.3
 Provides-Extra: lxml
-Requires-Dist: lxml>=5; extra == "lxml"
+Requires-Dist: lxml>=6.0.1; extra == "lxml"
 Dynamic: license-file
 # html-to-markdown
@@ -43,9 +42,18 @@ A modern, fully typed Python library for converting HTML to Markdown. This libra
 of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
 Python 3.9+.
+## Support This Project
+If you find html-to-markdown useful, please consider sponsoring the development:
+<a href="https://github.com/sponsors/Goldziher"><img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor on GitHub" height="32"></a>
+Your support helps maintain and improve this library for the community! 🚀
 ## Features
 - **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
+- **Enhanced Table Support**: Advanced handling of merged cells with rowspan/colspan support for better table representation
 - **Type Safety**: Strict MyPy adherence with comprehensive type hints
 - **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
 - **Streaming Support**: Memory-efficient processing for large documents with progress callbacks
@@ -55,7 +63,7 @@ Python 3.9+.
 - **CLI Tool**: Full-featured command-line interface with all API options exposed
 - **Custom Converters**: Extensible converter system for custom HTML tag handling
 - **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
-- **Extensive Test Coverage**: 100% test coverage requirement with comprehensive test suite
+- **Comprehensive Test Coverage**: 91%+ test coverage with 623+ comprehensive tests
 ## Installation
@@ -203,6 +211,51 @@ print(markdown)
 Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
+### Enhanced Table Support
+The library now provides better handling of complex tables with merged cells:
+```python
+from html_to_markdown import convert_to_markdown
+# HTML table with merged cells
+html = """
+<table>
+    <tr>
+        <th rowspan="2">Category</th>
+        <th colspan="2">Sales Data</th>
+    </tr>
+    <tr>
+        <th>Q1</th>
+        <th>Q2</th>
+    </tr>
+    <tr>
+        <td>Product A</td>
+        <td>$100K</td>
+        <td>$150K</td>
+    </tr>
+</table>
+"""
+markdown = convert_to_markdown(html)
+print(markdown)
+```
+Output:
+```markdown
+| Category | Sales Data |  |
+| --- | --- | --- |
+| | Q1 | Q2 |
+| Product A | $100K | $150K |
+```
+The library handles:
+- **Rowspan**: Inserts empty cells in subsequent rows
+- **Colspan**: Properly manages column spanning
+- **Clean output**: Removes `<colgroup>` and `<col>` elements that have no Markdown equivalent
 ### Key Configuration Options
 | Option              | Type | Default          | Description                                                     |
@@ -438,7 +491,9 @@ This library provides comprehensive support for all modern HTML5 elements:
 ### Table Elements
-- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`, `<col>`, `<colgroup>`
+- `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`
+- **Merged cell support**: Handles `rowspan` and `colspan` attributes for complex table layouts
+- **Smart cleanup**: Automatically handles table styling elements for clean Markdown output
 ### Interactive Elements
@@ -457,16 +512,41 @@ This library provides comprehensive support for all modern HTML5 elements:
 - `<math>` (MathML support)
-## Breaking Changes (Major Version)
+## Advanced Table Support
+The library provides sophisticated handling of complex HTML tables, including merged cells and proper structure conversion:
+```python
+from html_to_markdown import convert_to_markdown
+# Complex table with merged cells
+html = """
+<table>
+    <caption>Sales Report</caption>
+    <tr>
+        <th rowspan="2">Product</th>
+        <th colspan="2">Quarterly Sales</th>
+    </tr>
+    <tr>
+        <th>Q1</th>
+        <th>Q2</th>
+    </tr>
+    <tr>
+        <td>Widget A</td>
+        <td>$50K</td>
+        <td>$75K</td>
+    </tr>
+</table>
+"""
+result = convert_to_markdown(html)
+```
-This version introduces several breaking changes for improved consistency and functionality:
+**Features:**
-1. **Enhanced Metadata Extraction**: Now enabled by default with comprehensive extraction of title, meta tags, and link relations
-1. **Improved Newline Handling**: Better normalization of excessive newlines (max 2 consecutive)
-1. **Extended HTML5 Support**: Added support for 40+ new HTML5 elements
-1. **Streaming API**: New streaming parameters for large document processing
-1. **Task List Support**: Automatic conversion of HTML checkboxes to GitHub-compatible task lists
-1. **Highlight Styles**: New `highlight_style` parameter with multiple options for `<mark>` elements
+- **Merged cell support**: Handles `rowspan` and `colspan` attributes intelligently
+- **Clean output**: Automatically removes table styling elements that don't translate to Markdown
+- **Structure preservation**: Maintains table hierarchy and relationships
 ## Acknowledgments

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/RECORD RENAMED Viewed

@@ -2,15 +2,15 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
 html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
 html_to_markdown/cli.py,sha256=8xlgSEcnqsSM_dr1TCSgPDAo09YvUtO78PvDFivFFdg,6973
 html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
-html_to_markdown/converters.py,sha256=COC2KqPelJlMCY5eXUS5gdiPOG8Yzx0U719FeXPw3GA,55514
+html_to_markdown/converters.py,sha256=n0OeRnfDc7sH2j5oOuqJQmxySJxRFdfpPRHcrHJXFGE,46869
 html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
 html_to_markdown/preprocessor.py,sha256=S4S1ZfLC_hkJVgmA5atImTyWQDOxfHctPbaep2QtyrQ,11248
-html_to_markdown/processing.py,sha256=wkbhLg42U3aeVQSZFuzGt5irtN037XzRKpCE71QYZXI,36520
+html_to_markdown/processing.py,sha256=ephjzcUJOilId8Z6AScaMY6AKkyNq9N0A1DMt9HfVuk,39068
 html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 html_to_markdown/utils.py,sha256=QgWPzmpZKFd6wDTe8IY3gbVT3xNzoGV3PBgd17J0O-w,2066
-html_to_markdown-1.8.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.8.0.dist-info/METADATA,sha256=6pgiK4p0A77axLfD8MH1EGgzifP06koVV8KWS_5-iYk,17175
-html_to_markdown-1.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.8.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.8.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.8.0.dist-info/RECORD,,
+html_to_markdown-1.9.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.9.1.dist-info/METADATA,sha256=FUHr7dId_1ZQfgKjPcInKwvwBChyxlyPMIwYl0Z4dko,18813
+html_to_markdown-1.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.9.1.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.9.1.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.9.1.dist-info/RECORD,,

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.8.0.dist-info → html_to_markdown-1.9.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl

html-to-markdown 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl