PyPI - html-to-markdown - Versions diffs - 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl - Mend

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (14)

html_to_markdown/cli.py +28 -2
html_to_markdown/converters.py +208 -130
html_to_markdown/exceptions.py +5 -0
html_to_markdown/preprocessor.py +96 -86
html_to_markdown/processing.py +63 -48
html_to_markdown/utils.py +1 -3
html_to_markdown/whitespace.py +23 -33
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/METADATA +143 -2
html_to_markdown-1.12.1.dist-info/RECORD +17 -0
html_to_markdown-1.11.0.dist-info/RECORD +0 -17
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/WHEEL +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/top_level.txt +0 -0

html_to_markdown/cli.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import sys
 from argparse import ArgumentParser, FileType
+from pathlib import Path
 from html_to_markdown.constants import (
     ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
     WHITESPACE_NORMALIZED,
     WHITESPACE_STRICT,
 )
+from html_to_markdown.exceptions import InvalidEncodingError
 from html_to_markdown.processing import convert_to_markdown
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
         help="Parent tags where images remain inline (not converted to alt-text).",
     )
+    parser.add_argument(
+        "--br-in-tables",
+        action="store_true",
+        help="Use <br> tags for line breaks in table cells instead of spaces.",
+    )
     parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
     parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
         help="Keep navigation elements when preprocessing (normally removed).",
     )
+    parser.add_argument(
+        "--source-encoding",
+        type=str,
+        default=None,
+        help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
+    )
     args = parser.parse_args(argv)
     base_args = {
         "autolinks": args.autolinks,
+        "br_in_tables": args.br_in_tables,
         "bullets": args.bullets,
         "code_language": args.code_language,
         "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
         if args.show_progress:
             def progress_callback(processed: int, total: int) -> None:
-                if total > 0:
+                if total > 0:  # pragma: no cover
                     percent = (processed / total) * 100
                     sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
             base_args["progress_callback"] = progress_callback
-    return convert_to_markdown(args.html.read(), **base_args)
+    if args.source_encoding and args.html.name != "<stdin>":
+        args.html.close()
+        try:
+            with Path(args.html.name).open(encoding=args.source_encoding) as f:
+                html_content = f.read()
+        except LookupError as e:
+            raise InvalidEncodingError(args.source_encoding) from e
+    else:
+        html_content = args.html.read()
+    return convert_to_markdown(html_content, **base_args)

html_to_markdown/converters.py CHANGED Viewed

@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Iterable
 import base64
+import re
 from collections.abc import Callable
 from functools import partial
 from inspect import getfullargspec
+from itertools import chain
 from textwrap import fill
 from typing import Any, Literal, TypeVar, cast
@@ -36,6 +38,18 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
     return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
+def _find_list_item_ancestor(tag: Tag) -> Tag | None:
+    parent = tag.parent
+    while parent and parent.name != "li":
+        parent = parent.parent
+    return parent
+BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
+_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
 SupportedElements = Literal[
     "a",
     "abbr",
@@ -216,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
     return quote_text
-def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
+def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
     if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
-        return " "
+        return " " + text.strip()
     _ = convert_as_inline
-    return "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
+    newline = "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
+    return newline + text.strip() if text.strip() else newline
 def _convert_hn(
@@ -270,52 +285,87 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
     return f"![{alt}]({src}{title_part})"
+def _has_block_list_items(tag: Tag) -> bool:
+    return any(
+        any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
+        for li in tag.find_all("li", recursive=False)
+    )
+def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
+    prev_p = None
+    for child in parent.children:
+        if hasattr(child, "name"):
+            if child.name == "p":
+                prev_p = child
+            break
+    if prev_p:
+        lines = text.strip().split("\n")
+        indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+        return "\n" + "\n".join(indented_lines) + "\n"
+    return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
+    lines = text.strip().split("\n")
+    indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+    result = "\n".join(indented_lines)
+    return result + "\n" if not result.endswith("\n") else result
+def _add_list_item_spacing(text: str) -> str:
+    lines = text.split("\n")
+    items_with_blocks = set()
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
+            j = i + 1
+            has_continuation = False
+            while j < len(lines):
+                next_line = lines[j]
+                if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
+                    break
+                if next_line.strip() and next_line.startswith(("  ", "   ", "\t")):
+                    has_continuation = True
+                j += 1
+            if has_continuation and j < len(lines):
+                items_with_blocks.add(j - 1)
+        i += 1
+    if items_with_blocks:
+        processed_lines = list(
+            chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
+        )
+        return "\n".join(processed_lines)
+    return text
 def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
-    before_paragraph = False
-    if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
-        before_paragraph = True
+    before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
-    if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+    has_block_items = _has_block_list_items(tag)
+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
         if parent:
-            prev_p = None
-            for child in parent.children:
-                if hasattr(child, "name"):
-                    if child == tag:
-                        break
-                    if child.name == "p":
-                        prev_p = child
-            if prev_p:
-                lines = text.strip().split("\n")
-                indented_lines = []
-                for line in lines:
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
-                return "\n" + "\n".join(indented_lines) + "\n"
-            return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+            return _handle_nested_list_indentation(text, list_indent_str, parent)
     if tag.parent and tag.parent.name in {"ul", "ol"}:
-        lines = text.strip().split("\n")
-        indented_lines = []
-        for line in lines:
-            if line.strip():
-                indented_lines.append(f"{list_indent_str}{line}")
-            else:
-                indented_lines.append("")
-        result = "\n".join(indented_lines)
-        if not result.endswith("\n"):
-            result += "\n"
-        return result
+        return _handle_direct_nested_list_indentation(text, list_indent_str)
-    return text + ("\n" if before_paragraph else "")
+    if has_block_items:
+        text = _add_list_item_spacing(text)
+    trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
+    return text + trailing_newlines
 def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
@@ -324,10 +374,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
         checked = checkbox.get("checked") is not None
         checkbox_symbol = "[x]" if checked else "[ ]"
-        checkbox_text = text
-        if checkbox.string:
-            checkbox_text = text.replace(str(checkbox.string), "").strip()
-        return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
+        checkbox_text = text.strip()
+        return f"- {checkbox_symbol} {checkbox_text}\n"
     parent = tag.parent
     if parent is not None and parent.name == "ol":
@@ -349,11 +397,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
         bullet = bullets[depth % len(bullets)]
-    has_block_children = any(
-        child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
-        for child in tag.children
-        if hasattr(child, "name")
-    )
+    has_block_children = "\n\n" in text
     if has_block_children:
         paragraphs = text.strip().split("\n\n")
@@ -390,20 +434,13 @@ def _convert_p(
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
     if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+        parent = _find_list_item_ancestor(tag)
         if parent:
             p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
             if p_children and tag != p_children[0]:
-                indented_lines = []
-                for line in text.split("\n"):
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
                 text = "\n".join(indented_lines)
     return f"{text}\n\n" if text else ""
@@ -440,66 +477,65 @@ def _convert_pre(
     return f"\n```{code_language}\n{text}\n```\n"
-def _convert_td(*, tag: Tag, text: str) -> str:
+def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
+    if br_in_tables:
+        block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
+        if len(block_children) > 1:
+            child_contents = []
+            for child in block_children:
+                child_text = child.get_text().strip()
+                if child_text:
+                    child_contents.append(child_text)
+            return "<br>".join(child_contents)
+        return text.strip().replace("\n", "<br>")
+    return text.strip().replace("\n", " ")
+def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan
-def _convert_th(*, tag: Tag, text: str) -> str:
+def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan
-def _convert_tr(*, tag: Tag, text: str) -> str:
-    cells = tag.find_all(["td", "th"])
-    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
-    tag_grand_parent = tag.parent.parent if tag.parent else None
+def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
+    rowspan_positions = []
+    col_pos = 0
-    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
-        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
-        rowspan_positions = []
-        col_pos = 0
-        for prev_cell in prev_cells:
-            rowspan = 1
-            if (
-                "rowspan" in prev_cell.attrs
-                and isinstance(prev_cell["rowspan"], str)
-                and prev_cell["rowspan"].isdigit()
-            ):
-                rowspan = int(prev_cell["rowspan"])
-            if rowspan > 1:
-                rowspan_positions.append(col_pos)
-            colspan = 1
-            if (
-                "colspan" in prev_cell.attrs
-                and isinstance(prev_cell["colspan"], str)
-                and prev_cell["colspan"].isdigit()
-            ):
-                colspan = int(prev_cell["colspan"])
-            col_pos += colspan
+    for prev_cell in prev_cells:
+        rowspan = 1
+        if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
+            rowspan = int(prev_cell["rowspan"])
+        if rowspan > 1:
+            rowspan_positions.append(col_pos)
+        colspan = 1
+        if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
+            colspan = int(prev_cell["colspan"])
+        col_pos += colspan
+    return rowspan_positions, col_pos
-        if rowspan_positions:
-            converted_cells: list[str] = []
-            if text.strip():
-                parts = text.split("|")
-                converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
-            new_cells: list[str] = []
-            cell_index = 0
+def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
+    converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
+    rowspan_set = set(rowspan_positions)
-            for pos in range(col_pos):
-                if pos in rowspan_positions:
-                    new_cells.append(" |")
-                elif cell_index < len(converted_cells):
-                    new_cells.append(converted_cells[cell_index])
-                    cell_index += 1
+    cell_iter = iter(converted_cells)
+    new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
-            text = "".join(new_cells)
+    return "".join(new_cells)
-    is_headrow = (
+def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
+    return (
         all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
         or (not tag.previous_sibling and parent_name != "tbody")
         or (
@@ -508,25 +544,47 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
             and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
         )
     )
+def _calculate_total_colspan(cells: list[Tag]) -> int:
+    full_colspan = 0
+    for cell in cells:
+        if hasattr(cell, "attrs") and "colspan" in cell.attrs:
+            colspan_value = cell.attrs["colspan"]
+            if isinstance(colspan_value, str) and colspan_value.isdigit():
+                full_colspan += int(colspan_value)
+            else:
+                full_colspan += 1
+        else:
+            full_colspan += 1
+    return full_colspan
+def _convert_tr(*, tag: Tag, text: str) -> str:
+    cells = tag.find_all(["td", "th"])
+    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
+    tag_grand_parent = tag.parent.parent if tag.parent else None
+    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
+        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
+        rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
+        if rowspan_positions:
+            text = _handle_rowspan_text(text, rowspan_positions, col_pos)
+    is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
     overline = ""
     underline = ""
     if is_headrow and not tag.previous_sibling:
-        full_colspan = 0
-        for cell in cells:
-            if hasattr(cell, "attrs") and "colspan" in cell.attrs:
-                colspan_value = cell.attrs["colspan"]
-                if isinstance(colspan_value, str) and colspan_value.isdigit():
-                    full_colspan += int(colspan_value)
-                else:
-                    full_colspan += 1
-            else:
-                full_colspan += 1
+        full_colspan = _calculate_total_colspan(cells)
         underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
     elif not tag.previous_sibling and (
         parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
     ):
-        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
-        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
+        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"  # pragma: no cover
+        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"  # pragma: no cover
     return overline + "|" + text + "\n" + underline
@@ -578,10 +636,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
     return f"{text}\n\n" if text.strip() else ""
-def _convert_div(*, text: str, convert_as_inline: bool) -> str:
+def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
     if convert_as_inline:
         return text
+    from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
+        if parent:
+            div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
+            if div_children and tag != div_children[0]:
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
+                indented_text = "\n".join(indented_lines)
+                return f"{indented_text}\n\n" if indented_text.strip() else ""
     return _format_block_element(text)
@@ -603,7 +674,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
     if convert_as_inline:
         return text
-    return f"{text}\n" if text.strip() else ""
+    return _format_block_element(text)
 def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
@@ -616,14 +687,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
     return f"{text.strip()}\n"
-def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
+def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if convert_as_inline:
         return text
-    if not text.strip():
-        return ""
+    has_dt_sibling = False
+    current = tag.previous_sibling
+    while current:
+        if hasattr(current, "name") and current.name and current.name == "dt":
+            has_dt_sibling = True
+            break
+        current = current.previous_sibling
-    return f":   {text.strip()}\n\n"
+    if has_dt_sibling:
+        return f":   {text.strip()}\n\n" if text.strip() else ":   \n\n"
+    return f"{text.strip()}\n\n" if text.strip() else ""
 def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
@@ -648,9 +726,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
 def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    src = tag.get("src", "")
-    if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
+    if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
         src = source_tag.get("src", "")
     if src and isinstance(src, str) and src.strip():
@@ -670,9 +746,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
 def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     _ = text
-    src = tag.get("src", "")
-    if src and isinstance(src, str) and src.strip():
+    if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
         link = f"[{src}]({src})"
         if convert_as_inline:
             return link
@@ -939,7 +1014,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     content = text.strip()
     if content and not content.endswith("\n\n"):
         if content.endswith("\n"):
-            content += "\n"
+            content += "\n"  # pragma: no cover
         else:
             content += "\n\n"
     return content
@@ -997,6 +1072,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
 def create_converters_map(
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Tag], str] | None,
@@ -1029,6 +1105,8 @@ def create_converters_map(
                     kwargs["convert_as_inline"] = convert_as_inline
                 if "list_indent_str" in spec.kwonlyargs:
                     kwargs["list_indent_str"] = list_indent_str
+                if "br_in_tables" in spec.kwonlyargs:
+                    kwargs["br_in_tables"] = br_in_tables
                 return func(**kwargs)
             return func(text)

html_to_markdown/exceptions.py CHANGED Viewed

@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
         self.option2 = option2
         super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
+class InvalidEncodingError(HtmlToMarkdownError):
+    def __init__(self, encoding: str) -> None:
+        super().__init__(f"The specified encoding ({encoding}) is not valid.")

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl