PyPI - html-to-markdown - Versions diffs - 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl - Mend

html-to-markdown 1.11.0 (py3-none-any.whl) → 1.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (14)

html_to_markdown/cli.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import sys
 from argparse import ArgumentParser, FileType
+from pathlib import Path
 from html_to_markdown.constants import (
     ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
     WHITESPACE_NORMALIZED,
     WHITESPACE_STRICT,
 )
+from html_to_markdown.exceptions import InvalidEncodingError
 from html_to_markdown.processing import convert_to_markdown
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
         help="Parent tags where images remain inline (not converted to alt-text).",
     )
+    parser.add_argument(
+        "--br-in-tables",
+        action="store_true",
+        help="Use <br> tags for line breaks in table cells instead of spaces.",
+    )
     parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
     parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
         help="Keep navigation elements when preprocessing (normally removed).",
     )
+    parser.add_argument(
+        "--source-encoding",
+        type=str,
+        default=None,
+        help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
+    )
     args = parser.parse_args(argv)
     base_args = {
         "autolinks": args.autolinks,
+        "br_in_tables": args.br_in_tables,
         "bullets": args.bullets,
         "code_language": args.code_language,
         "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
         if args.show_progress:
             def progress_callback(processed: int, total: int) -> None:
-                if total > 0:
+                if total > 0:  # pragma: no cover
                     percent = (processed / total) * 100
                     sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
             base_args["progress_callback"] = progress_callback
-    return convert_to_markdown(args.html.read(), **base_args)
+    if args.source_encoding and args.html.name != "<stdin>":
+        args.html.close()
+        try:
+            with Path(args.html.name).open(encoding=args.source_encoding) as f:
+                html_content = f.read()
+        except LookupError as e:
+            raise InvalidEncodingError(args.source_encoding) from e
+    else:
+        html_content = args.html.read()
+    return convert_to_markdown(html_content, **base_args)

html_to_markdown/converters.py CHANGED Viewed

@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Iterable
 import base64
+import re
 from collections.abc import Callable
 from functools import partial
 from inspect import getfullargspec
+from itertools import chain
 from textwrap import fill
 from typing import Any, Literal, TypeVar, cast
@@ -36,6 +38,19 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
     return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
+def _find_list_item_ancestor(tag: Tag) -> Tag | None:
+    """Find the nearest list item ancestor of a tag."""
+    parent = tag.parent
+    while parent and parent.name != "li":
+        parent = parent.parent
+    return parent
+BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
+_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
 SupportedElements = Literal[
     "a",
     "abbr",
@@ -270,52 +285,91 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
     return f"![{alt}]({src}{title_part})"
+def _has_block_list_items(tag: Tag) -> bool:
+    """Check if any list items contain block elements."""
+    return any(
+        any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
+        for li in tag.find_all("li", recursive=False)
+    )
+def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
+    """Handle indentation for lists nested within list items."""
+    prev_p = None
+    for child in parent.children:
+        if hasattr(child, "name"):
+            if child.name == "p":
+                prev_p = child
+            break
+    if prev_p:
+        lines = text.strip().split("\n")
+        indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+        return "\n" + "\n".join(indented_lines) + "\n"
+    return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
+    """Handle indentation for lists that are direct children of other lists."""
+    lines = text.strip().split("\n")
+    indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+    result = "\n".join(indented_lines)
+    return result + "\n" if not result.endswith("\n") else result
+def _add_list_item_spacing(text: str) -> str:
+    """Add extra spacing between list items that contain block content."""
+    lines = text.split("\n")
+    items_with_blocks = set()
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
+            j = i + 1
+            has_continuation = False
+            while j < len(lines):
+                next_line = lines[j]
+                if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
+                    break
+                if next_line.strip() and next_line.startswith(("  ", "   ", "\t")):
+                    has_continuation = True
+                j += 1
+            if has_continuation and j < len(lines):
+                items_with_blocks.add(j - 1)
+        i += 1
+    if items_with_blocks:
+        processed_lines = list(
+            chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
+        )
+        return "\n".join(processed_lines)
+    return text
 def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
-    before_paragraph = False
-    if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
-        before_paragraph = True
+    before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
-    if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+    has_block_items = _has_block_list_items(tag)
+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
         if parent:
-            prev_p = None
-            for child in parent.children:
-                if hasattr(child, "name"):
-                    if child == tag:
-                        break
-                    if child.name == "p":
-                        prev_p = child
-            if prev_p:
-                lines = text.strip().split("\n")
-                indented_lines = []
-                for line in lines:
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
-                return "\n" + "\n".join(indented_lines) + "\n"
-            return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+            return _handle_nested_list_indentation(text, list_indent_str, parent)
     if tag.parent and tag.parent.name in {"ul", "ol"}:
-        lines = text.strip().split("\n")
-        indented_lines = []
-        for line in lines:
-            if line.strip():
-                indented_lines.append(f"{list_indent_str}{line}")
-            else:
-                indented_lines.append("")
-        result = "\n".join(indented_lines)
-        if not result.endswith("\n"):
-            result += "\n"
-        return result
+        return _handle_direct_nested_list_indentation(text, list_indent_str)
-    return text + ("\n" if before_paragraph else "")
+    if has_block_items:
+        text = _add_list_item_spacing(text)
+    trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
+    return text + trailing_newlines
 def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
@@ -324,10 +378,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
         checked = checkbox.get("checked") is not None
         checkbox_symbol = "[x]" if checked else "[ ]"
-        checkbox_text = text
-        if checkbox.string:
-            checkbox_text = text.replace(str(checkbox.string), "").strip()
-        return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
+        checkbox_text = text.strip()
+        return f"- {checkbox_symbol} {checkbox_text}\n"
     parent = tag.parent
     if parent is not None and parent.name == "ol":
@@ -349,11 +401,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
         bullet = bullets[depth % len(bullets)]
-    has_block_children = any(
-        child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
-        for child in tag.children
-        if hasattr(child, "name")
-    )
+    has_block_children = "\n\n" in text
     if has_block_children:
         paragraphs = text.strip().split("\n\n")
@@ -390,20 +438,13 @@ def _convert_p(
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
     if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+        parent = _find_list_item_ancestor(tag)
         if parent:
             p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
             if p_children and tag != p_children[0]:
-                indented_lines = []
-                for line in text.split("\n"):
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
                 text = "\n".join(indented_lines)
     return f"{text}\n\n" if text else ""
@@ -440,66 +481,69 @@ def _convert_pre(
     return f"\n```{code_language}\n{text}\n```\n"
-def _convert_td(*, tag: Tag, text: str) -> str:
+def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
+    """Process table cell content, optionally using <br> tags for multi-line content."""
+    if br_in_tables:
+        block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
+        if len(block_children) > 1:
+            child_contents = []
+            for child in block_children:
+                child_text = child.get_text().strip()
+                if child_text:
+                    child_contents.append(child_text)
+            return "<br>".join(child_contents)
+        return text.strip().replace("\n", "<br>")
+    return text.strip().replace("\n", " ")
+def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan
-def _convert_th(*, tag: Tag, text: str) -> str:
+def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan
-def _convert_tr(*, tag: Tag, text: str) -> str:
-    cells = tag.find_all(["td", "th"])
-    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
-    tag_grand_parent = tag.parent.parent if tag.parent else None
+def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
+    """Get positions of cells with rowspan > 1 from previous row."""
+    rowspan_positions = []
+    col_pos = 0
-    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
-        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
-        rowspan_positions = []
-        col_pos = 0
-        for prev_cell in prev_cells:
-            rowspan = 1
-            if (
-                "rowspan" in prev_cell.attrs
-                and isinstance(prev_cell["rowspan"], str)
-                and prev_cell["rowspan"].isdigit()
-            ):
-                rowspan = int(prev_cell["rowspan"])
-            if rowspan > 1:
-                rowspan_positions.append(col_pos)
-            colspan = 1
-            if (
-                "colspan" in prev_cell.attrs
-                and isinstance(prev_cell["colspan"], str)
-                and prev_cell["colspan"].isdigit()
-            ):
-                colspan = int(prev_cell["colspan"])
-            col_pos += colspan
+    for prev_cell in prev_cells:
+        rowspan = 1
+        if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
+            rowspan = int(prev_cell["rowspan"])
+        if rowspan > 1:
+            rowspan_positions.append(col_pos)
+        colspan = 1
+        if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
+            colspan = int(prev_cell["colspan"])
+        col_pos += colspan
+    return rowspan_positions, col_pos
-        if rowspan_positions:
-            converted_cells: list[str] = []
-            if text.strip():
-                parts = text.split("|")
-                converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
-            new_cells: list[str] = []
-            cell_index = 0
+def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
+    """Handle text adjustment for rows with rowspan cells."""
+    converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
+    rowspan_set = set(rowspan_positions)
-            for pos in range(col_pos):
-                if pos in rowspan_positions:
-                    new_cells.append(" |")
-                elif cell_index < len(converted_cells):
-                    new_cells.append(converted_cells[cell_index])
-                    cell_index += 1
+    cell_iter = iter(converted_cells)
+    new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
-            text = "".join(new_cells)
+    return "".join(new_cells)
-    is_headrow = (
+def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
+    """Determine if this table row should be treated as a header row."""
+    return (
         all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
         or (not tag.previous_sibling and parent_name != "tbody")
         or (
@@ -508,25 +552,48 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
             and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
         )
     )
+def _calculate_total_colspan(cells: list[Tag]) -> int:
+    """Calculate total colspan for all cells in a row."""
+    full_colspan = 0
+    for cell in cells:
+        if hasattr(cell, "attrs") and "colspan" in cell.attrs:
+            colspan_value = cell.attrs["colspan"]
+            if isinstance(colspan_value, str) and colspan_value.isdigit():
+                full_colspan += int(colspan_value)
+            else:
+                full_colspan += 1
+        else:
+            full_colspan += 1
+    return full_colspan
+def _convert_tr(*, tag: Tag, text: str) -> str:
+    cells = tag.find_all(["td", "th"])
+    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
+    tag_grand_parent = tag.parent.parent if tag.parent else None
+    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
+        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
+        rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
+        if rowspan_positions:
+            text = _handle_rowspan_text(text, rowspan_positions, col_pos)
+    is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
     overline = ""
     underline = ""
     if is_headrow and not tag.previous_sibling:
-        full_colspan = 0
-        for cell in cells:
-            if hasattr(cell, "attrs") and "colspan" in cell.attrs:
-                colspan_value = cell.attrs["colspan"]
-                if isinstance(colspan_value, str) and colspan_value.isdigit():
-                    full_colspan += int(colspan_value)
-                else:
-                    full_colspan += 1
-            else:
-                full_colspan += 1
+        full_colspan = _calculate_total_colspan(cells)
         underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
     elif not tag.previous_sibling and (
         parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
     ):
-        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
-        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
+        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"  # pragma: no cover
+        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"  # pragma: no cover
     return overline + "|" + text + "\n" + underline
@@ -578,10 +645,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
     return f"{text}\n\n" if text.strip() else ""
-def _convert_div(*, text: str, convert_as_inline: bool) -> str:
+def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
     if convert_as_inline:
         return text
+    from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
+        if parent:
+            div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
+            if div_children and tag != div_children[0]:
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
+                indented_text = "\n".join(indented_lines)
+                return f"{indented_text}\n\n" if indented_text.strip() else ""
     return _format_block_element(text)
@@ -603,7 +683,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
     if convert_as_inline:
         return text
-    return f"{text}\n" if text.strip() else ""
+    return _format_block_element(text)
 def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
@@ -616,14 +696,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
     return f"{text.strip()}\n"
-def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
+def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if convert_as_inline:
         return text
-    if not text.strip():
-        return ""
+    has_dt_sibling = False
+    current = tag.previous_sibling
+    while current:
+        if hasattr(current, "name") and current.name and current.name == "dt":
+            has_dt_sibling = True
+            break
+        current = current.previous_sibling
-    return f":   {text.strip()}\n\n"
+    if has_dt_sibling:
+        return f":   {text.strip()}\n\n" if text.strip() else ":   \n\n"
+    return f"{text.strip()}\n\n" if text.strip() else ""
 def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
@@ -648,9 +735,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
 def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    src = tag.get("src", "")
-    if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
+    if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
         src = source_tag.get("src", "")
     if src and isinstance(src, str) and src.strip():
@@ -670,9 +755,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
 def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     _ = text
-    src = tag.get("src", "")
-    if src and isinstance(src, str) and src.strip():
+    if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
         link = f"[{src}]({src})"
         if convert_as_inline:
             return link
@@ -939,7 +1023,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     content = text.strip()
     if content and not content.endswith("\n\n"):
         if content.endswith("\n"):
-            content += "\n"
+            content += "\n"  # pragma: no cover
         else:
             content += "\n\n"
     return content
@@ -997,6 +1081,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
 def create_converters_map(
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Tag], str] | None,
@@ -1029,6 +1114,8 @@ def create_converters_map(
                     kwargs["convert_as_inline"] = convert_as_inline
                 if "list_indent_str" in spec.kwonlyargs:
                     kwargs["list_indent_str"] = list_indent_str
+                if "br_in_tables" in spec.kwonlyargs:
+                    kwargs["br_in_tables"] = br_in_tables
                 return func(**kwargs)
             return func(text)

html_to_markdown/exceptions.py CHANGED Viewed

@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
         self.option2 = option2
         super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
+class InvalidEncodingError(HtmlToMarkdownError):
+    def __init__(self, encoding: str) -> None:
+        super().__init__(f"The specified encoding ({encoding}) is not valid.")

html_to_markdown/preprocessor.py CHANGED Viewed

@@ -5,6 +5,98 @@ from typing import Any
 import nh3
+BASE_ALLOWED_TAGS = frozenset(
+    {
+        "p",
+        "div",
+        "span",
+        "br",
+        "hr",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "ul",
+        "ol",
+        "li",
+        "dl",
+        "dt",
+        "dd",
+        "strong",
+        "b",
+        "em",
+        "i",
+        "u",
+        "s",
+        "del",
+        "ins",
+        "mark",
+        "small",
+        "sub",
+        "sup",
+        "code",
+        "pre",
+        "kbd",
+        "samp",
+        "var",
+        "abbr",
+        "cite",
+        "dfn",
+        "time",
+        "data",
+        "a",
+        "blockquote",
+        "q",
+    }
+)
+SEMANTIC_STRUCTURE_TAGS = frozenset(
+    {
+        "article",
+        "section",
+        "aside",
+        "header",
+        "footer",
+        "main",
+        "nav",
+        "figure",
+        "figcaption",
+        "details",
+        "summary",
+    }
+)
+TABLE_TAGS = frozenset(
+    {
+        "table",
+        "thead",
+        "tbody",
+        "tfoot",
+        "tr",
+        "td",
+        "th",
+        "caption",
+        "colgroup",
+        "col",
+    }
+)
+MEDIA_TAGS = frozenset(
+    {
+        "img",
+        "picture",
+        "source",
+        "audio",
+        "video",
+        "track",
+        "canvas",
+        "svg",
+        "iframe",
+    }
+)
 def preprocess_html(
     html: str,
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
     custom_tags_to_remove: set[str],
     custom_attributes_to_remove: set[str],
 ) -> dict[str, Any]:
-    allowed_tags = {
-        "p",
-        "div",
-        "span",
-        "br",
-        "hr",
-        "h1",
-        "h2",
-        "h3",
-        "h4",
-        "h5",
-        "h6",
-        "ul",
-        "ol",
-        "li",
-        "dl",
-        "dt",
-        "dd",
-        "strong",
-        "b",
-        "em",
-        "i",
-        "u",
-        "s",
-        "del",
-        "ins",
-        "mark",
-        "small",
-        "sub",
-        "sup",
-        "code",
-        "pre",
-        "kbd",
-        "samp",
-        "var",
-        "abbr",
-        "cite",
-        "dfn",
-        "time",
-        "data",
-        "a",
-        "blockquote",
-        "q",
-    }
+    allowed_tags = set(BASE_ALLOWED_TAGS)
     if preserve_semantic_structure:
-        allowed_tags.update(
-            {
-                "article",
-                "section",
-                "aside",
-                "header",
-                "footer",
-                "main",
-                "nav",
-                "figure",
-                "figcaption",
-                "details",
-                "summary",
-            }
-        )
+        allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
     if preserve_tables:
-        allowed_tags.update(
-            {
-                "table",
-                "thead",
-                "tbody",
-                "tfoot",
-                "tr",
-                "th",
-                "td",
-                "caption",
-                "col",
-                "colgroup",
-            }
-        )
+        allowed_tags.update(TABLE_TAGS)
     if preserve_media:
-        allowed_tags.update(
-            {
-                "img",
-                "picture",
-                "source",
-                "audio",
-                "video",
-                "track",
-                "canvas",
-                "svg",
-                "iframe",
-            }
-        )
+        allowed_tags.update(MEDIA_TAGS)
     allowed_tags -= custom_tags_to_remove

html_to_markdown/processing.py CHANGED Viewed

@@ -17,7 +17,7 @@ from bs4.element import NavigableString, PageElement
 try:
     from html_to_markdown.preprocessor import create_preprocessor
     from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
-except ImportError:
+except ImportError:  # pragma: no cover
     create_preprocessor = None  # type: ignore[assignment]
     preprocess_fn = None  # type: ignore[assignment]
@@ -25,7 +25,7 @@ try:
     import importlib.util
     LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
-except ImportError:
+except ImportError:  # pragma: no cover
     LXML_AVAILABLE = False
 from html_to_markdown.constants import (
@@ -322,7 +322,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
 def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
     elem_id = id(element)
     cache = _ancestor_cache.get()
-    if cache is None:
+    if cache is None:  # pragma: no cover
         cache = {}
         _ancestor_cache.set(cache)
@@ -338,7 +338,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
             ancestor_names.add(current.name)
         parent_id = id(current)
-        if parent_id in cache:
+        if parent_id in cache:  # pragma: no cover
             ancestor_names.update(cache[parent_id])
             break
@@ -386,36 +386,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
         metadata["base-href"] = base_tag["href"]
     for meta in soup.find_all("meta"):
-        if meta.get("name") and meta.get("content") is not None:
-            name = meta["name"]
-            content = meta["content"]
+        if (name := meta.get("name")) and (content := meta.get("content")) is not None:
             if isinstance(name, str) and isinstance(content, str):
-                key = f"meta-{name.lower()}"
-                metadata[key] = content
+                metadata[f"meta-{name.lower()}"] = content
-        elif meta.get("property") and meta.get("content") is not None:
-            prop = meta["property"]
-            content = meta["content"]
+        elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
             if isinstance(prop, str) and isinstance(content, str):
-                key = f"meta-{prop.lower().replace(':', '-')}"
-                metadata[key] = content
+                metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
-        elif meta.get("http-equiv") and meta.get("content") is not None:
-            equiv = meta["http-equiv"]
-            content = meta["content"]
-            if isinstance(equiv, str) and isinstance(content, str):
-                key = f"meta-{equiv.lower()}"
-                metadata[key] = content
+        elif (
+            (equiv := meta.get("http-equiv"))
+            and (content := meta.get("content")) is not None
+            and isinstance(equiv, str)
+            and isinstance(content, str)
+        ):
+            metadata[f"meta-{equiv.lower()}"] = content
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
     link_relations = {"author", "license", "alternate"}
-    for rel_type in link_relations:
-        link = soup.find("link", rel=rel_type, href=True)
-        if link and isinstance(link, Tag) and isinstance(link["href"], str):
-            metadata[f"link-{rel_type}"] = link["href"]
+    link_metadata = {
+        f"link-{rel_type}": link["href"]
+        for rel_type in link_relations
+        if (link := soup.find("link", rel=rel_type, href=True))
+        and isinstance(link, Tag)
+        and isinstance(link["href"], str)
+    }
+    metadata.update(link_metadata)
     return metadata
@@ -424,11 +423,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
     if not metadata:
         return ""
-    lines = ["<!--"]
-    for key, value in sorted(metadata.items()):
-        safe_value = value.replace("-->", "--&gt;")
-        lines.append(f"{key}: {safe_value}")
-    lines.append("-->")
+    lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
     return "\n".join(lines) + "\n\n"
@@ -442,6 +437,7 @@ def convert_to_markdown(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -485,6 +481,7 @@ def convert_to_markdown(
         progress_callback: Callback for progress updates (current, total).
         parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
         autolinks: Convert URLs to automatic links.
+        br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
         bullets: Characters to use for unordered list bullets.
         code_language: Default language for code blocks.
         code_language_callback: Callback to determine code language from element.
@@ -644,7 +641,7 @@ def convert_to_markdown(
         result = re.sub(r"\n{3,}", "\n\n", result)
         if convert_as_inline:
-            result = result.rstrip("\n")
+            result = result.rstrip("\n")  # pragma: no cover
         return result
@@ -658,6 +655,7 @@ def convert_to_markdown(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -819,6 +817,7 @@ def _process_html_core(
     whitespace_handler: WhitespaceHandler,
     parser: str | None = None,
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Any], str] | None,
@@ -849,24 +848,25 @@ def _process_html_core(
     try:
         if isinstance(source, str):
             if strip_newlines:
-                source = source.replace("\n", " ").replace("\r", " ")
+                source = source.replace("\n", " ").replace("\r", " ")  # pragma: no cover
             if "".join(source.split("\n")):
                 if parser is None:
                     parser = "lxml" if LXML_AVAILABLE else "html.parser"
-                if parser == "lxml" and not LXML_AVAILABLE:
+                if parser == "lxml" and not LXML_AVAILABLE:  # pragma: no cover
                     raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
                 source = BeautifulSoup(source, parser)
             else:
                 raise EmptyHtmlError
-        if strip is not None and convert is not None:
+        if strip is not None and convert is not None:  # pragma: no cover
             raise ConflictingOptionsError("strip", "convert")
         converters_map = create_converters_map(
             autolinks=autolinks,
+            br_in_tables=br_in_tables,
             bullets=bullets,
             code_language=code_language,
             code_language_callback=code_language_callback,
@@ -935,6 +935,7 @@ def convert_to_markdown_stream(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -976,6 +977,7 @@ def convert_to_markdown_stream(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -1027,7 +1029,7 @@ def convert_to_markdown_stream(
                 end_pos = search_start + newline_pos + 1
         chunk = combined_result[pos:end_pos]
-        if chunk:
+        if chunk:  # pragma: no cover
             yield chunk
         pos = end_pos

html_to_markdown/utils.py CHANGED Viewed

@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
     prefix = " " if text.startswith((" ", "\t")) else ""
     suffix = " " if text.endswith((" ", "\t")) else ""
-    text = text.strip()
-    return prefix, suffix, text
+    return prefix, suffix, text.strip()
 def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:

html_to_markdown/whitespace.py CHANGED Viewed

@@ -7,7 +7,7 @@ import unicodedata
 from typing import TYPE_CHECKING, Literal
 if TYPE_CHECKING:
-    from bs4 import NavigableString, PageElement, Tag
+    from bs4 import NavigableString, PageElement
 WhitespaceMode = Literal["normalized", "strict"]
@@ -132,7 +132,7 @@ class WhitespaceHandler:
         for char in text:
             if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
                 normalized.append(" ")
-            elif char in ("\r\n", "\r"):
+            elif char == "\r":  # pragma: no cover
                 normalized.append("\n")
             else:
                 normalized.append(char)
@@ -168,15 +168,12 @@ class WhitespaceHandler:
         *,
         in_pre: bool = False,
     ) -> str:
-        if not text:
+        if not text:  # pragma: no cover
             return ""
         if in_pre or self.should_preserve_whitespace(element):
             return text
-        if self.mode == "strict":
-            return text
         text = self.normalize_unicode_spaces(text)
         return self._process_normalized(text, element)
@@ -204,8 +201,8 @@ class WhitespaceHandler:
     def _process_text_with_content(self, text: str, element: NavigableString) -> str:
         original = str(element)
-        has_lead_space = original and original[0] in " \t\n"
-        has_trail_space = original and original[-1] in " \t\n"
+        has_lead_space = bool(original and original[0] in " \t\n")
+        has_trail_space = bool(original and original[-1] in " \t\n")
         text = self._multiple_spaces.sub(" ", text.strip())
@@ -215,9 +212,9 @@ class WhitespaceHandler:
             return self._process_special_inline_containers(text, original)
         if parent and self.is_inline_element(parent):
-            return self._process_inline_element_text(text, original, bool(has_lead_space), bool(has_trail_space))
+            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
-        return self._process_standalone_text(text, original, element, bool(has_lead_space), bool(has_trail_space))
+        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
     def _process_special_inline_containers(self, text: str, original: str) -> str:
         if original and "\n" not in original and "\t" not in original:
@@ -280,24 +277,3 @@ class WhitespaceHandler:
             text = text + "\n\n"
         return text
-    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
-        if self.mode == "strict":
-            return ""
-        tag_name = tag.name.lower() if hasattr(tag, "name") else ""
-        double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
-        single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
-        if tag_name in double_newline_elements:
-            if self.is_block_element(next_sibling):
-                return "\n\n"
-            return "\n"
-        if tag_name in single_newline_elements:
-            return "\n"
-        if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
-            return "\n\n"
-        return ""

{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.11.0
+Version: 1.12.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
 Custom converters take precedence over built-in converters and can be used alongside other configuration options.
+### Streaming API
+For processing large documents with memory constraints, use the streaming API:
+```python
+from html_to_markdown import convert_to_markdown_stream
+# Process large HTML in chunks
+with open("large_document.html", "r") as f:
+    html_content = f.read()
+# Returns a generator that yields markdown chunks
+for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
+    print(chunk, end="")
+```
+With progress tracking:
+```python
+def show_progress(processed: int, total: int):
+    if total > 0:
+        percent = (processed / total) * 100
+        print(f"\rProgress: {percent:.1f}%", end="")
+# Stream with progress callback
+markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
+```
+### Preprocessing API
+The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
+```python
+from html_to_markdown import preprocess_html, create_preprocessor
+# Direct preprocessing with custom options
+cleaned_html = preprocess_html(
+    raw_html,
+    remove_navigation=True,
+    remove_forms=True,
+    remove_scripts=True,
+    remove_styles=True,
+    remove_comments=True,
+    preserve_semantic_structure=True,
+    preserve_tables=True,
+    preserve_media=True,
+)
+markdown = convert_to_markdown(cleaned_html)
+# Create a preprocessor configuration from presets
+config = create_preprocessor(preset="aggressive", preserve_tables=False)  # or "minimal", "standard"  # Override preset settings
+markdown = convert_to_markdown(html, **config)
+```
+### Exception Handling
+The library provides specific exception classes for better error handling:
+````python
+from html_to_markdown import (
+    convert_to_markdown,
+    HtmlToMarkdownError,
+    EmptyHtmlError,
+    InvalidParserError,
+    ConflictingOptionsError,
+    MissingDependencyError
+)
+try:
+    markdown = convert_to_markdown(html, parser='lxml')
+except MissingDependencyError:
+    # lxml not installed
+    markdown = convert_to_markdown(html, parser='html.parser')
+except EmptyHtmlError:
+    print("No HTML content to convert")
+except InvalidParserError as e:
+    print(f"Parser error: {e}")
+except ConflictingOptionsError as e:
+    print(f"Conflicting options: {e}")
+except HtmlToMarkdownError as e:
+    print(f"Conversion error: {e}")
 ## CLI Usage
 Convert HTML files directly from the command line with full access to all API options:
@@ -340,7 +422,7 @@ html_to_markdown \
   --preprocess-html \
   --preprocessing-preset aggressive \
   input.html > output.md
-```
+````
 ### Key CLI Options
@@ -353,6 +435,20 @@ html_to_markdown \
 --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
 --heading-style {atx,atx_closed,underlined} # Header style
 --no-extract-metadata               # Disable metadata extraction
+--br-in-tables                      # Use <br> tags for line breaks in table cells
+--source-encoding ENCODING          # Override auto-detected encoding (rarely needed)
+```
+**File Encoding:**
+The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
+```shell
+# Override auto-detection for Latin-1 encoded file
+html_to_markdown --source-encoding latin-1 input.html > output.md
+# Force UTF-16 encoding when auto-detection fails
+html_to_markdown --source-encoding utf-16 input.html > output.md
 ```
 **All Available Options:**
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
 - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
 - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
+- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
 ### Parser Options

html_to_markdown-1.12.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
+html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
+html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
+html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
+html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
+html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
+html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
+html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
+html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
+html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
+html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
+html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.12.0.dist-info/RECORD,,

html_to_markdown-1.11.0.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
-html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
-html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
-html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
-html_to_markdown/converters.py,sha256=CbChkRIlOPe0d1MK5-txDE56IG4Ea_dcCV6KRCTjeKY,32497
-html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
-html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
-html_to_markdown/processing.py,sha256=sOIIFNyRkRYAH8Q4ehrh66RY71bkvttSuqzXYsMC5JM,34334
-html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
-html_to_markdown/whitespace.py,sha256=EJ0gEsfLB_wZAk5d5qP4UPhPg0pJJ8LZLRRr_QoL01o,8186
-html_to_markdown-1.11.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.11.0.dist-info/METADATA,sha256=Cej6bnqT9JVFzACZvND6Z5-kD0QoabiLi46opAaC11U,17814
-html_to_markdown-1.11.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.11.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.11.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.11.0.dist-info/RECORD,,

{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl