PyPI - html-to-markdown - Versions diffs - 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl - Mend

html-to-markdown: 1.9.0 (py3-none-any.whl) → 1.10.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (16) hide show

html_to_markdown/__main__.py +0 -1
html_to_markdown/cli.py +101 -45
html_to_markdown/constants.py +3 -0
html_to_markdown/converters.py +52 -573
html_to_markdown/exceptions.py +1 -11
html_to_markdown/preprocessor.py +0 -37
html_to_markdown/processing.py +104 -202
html_to_markdown/utils.py +2 -42
html_to_markdown/whitespace.py +292 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
html_to_markdown-1.10.0.dist-info/RECORD +17 -0
html_to_markdown-1.9.0.dist-info/RECORD +0 -16
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0

html_to_markdown/converters.py CHANGED Viewed

@@ -23,17 +23,14 @@ from html_to_markdown.utils import chomp, indent, underline
 def _format_block_element(text: str) -> str:
-    """Format text as a block element with trailing newlines."""
     return f"{text.strip()}\n\n" if text.strip() else ""
 def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
-    """Format text as inline or block element based on context."""
     return text.strip() if convert_as_inline else _format_block_element(text)
 def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
-    """Format text wrapped in markers as a block element."""
     if not end_marker:
         end_marker = start_marker
     return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
@@ -63,6 +60,7 @@ SupportedElements = Literal[
     "details",
     "dfn",
     "dialog",
+    "div",
     "dl",
     "dt",
     "em",
@@ -145,15 +143,6 @@ T = TypeVar("T")
 def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
-    """Create an inline converter for a markup pattern or tag.
-    Args:
-        markup_prefix: The markup prefix to insert.
-    Returns:
-        A function that can be used to convert HTML to Markdown.
-    """
     def implementation(*, tag: Tag, text: str) -> str:
         from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
@@ -200,7 +189,7 @@ def _convert_a(*, tag: Tag, text: str, autolinks: bool, default_title: bool) ->
     return f"{prefix}[{text}]({href}{title_part}){suffix}" if href else text
-def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
+def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_indent_str: str) -> str:
     if convert_as_inline:
         return text
@@ -211,18 +200,16 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
     cite_url = tag.get("cite")
-    # Check if this blockquote is inside a list item
     if _has_ancestor(tag, "li"):
-        # Indent the blockquote by 4 spaces
         lines = text.strip().split("\n")
-        indented_lines = [f"    > {line}" if line.strip() else "" for line in lines]
+        indented_lines = [f"{list_indent_str}> {line}" if line.strip() else "" for line in lines]
         quote_text = "\n".join(indented_lines) + "\n\n"
     else:
         quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
     if cite_url:
         if _has_ancestor(tag, "li"):
-            quote_text += f"    — <{cite_url}>\n\n"
+            quote_text += f"{list_indent_str}— <{cite_url}>\n\n"
         else:
             quote_text += f"— <{cite_url}>\n\n"
@@ -283,23 +270,19 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
     return f"![{alt}]({src}{title_part})"
-def _convert_list(*, tag: Tag, text: str) -> str:
+def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
     before_paragraph = False
     if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
         before_paragraph = True
-    # Check if this list is inside a list item
     if _has_ancestor(tag, "li"):
-        # This is a nested list - needs indentation
-        # But we need to check if it's the first element after a paragraph
         parent = tag.parent
         while parent and parent.name != "li":
             parent = parent.parent
         if parent:
-            # Check if there's a paragraph before this list
             prev_p = None
             for child in parent.children:
                 if hasattr(child, "name"):
@@ -309,22 +292,33 @@ def _convert_list(*, tag: Tag, text: str) -> str:
                         prev_p = child
             if prev_p:
-                # If there's a paragraph before, we need proper indentation
                 lines = text.strip().split("\n")
                 indented_lines = []
                 for line in lines:
                     if line.strip():
-                        indented_lines.append(f"    {line}")
+                        indented_lines.append(f"{list_indent_str}{line}")
                     else:
                         indented_lines.append("")
                 return "\n" + "\n".join(indented_lines) + "\n"
-            # Otherwise use the original tab indentation
-            return "\n" + indent(text=text, level=1).rstrip()
+            return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+    if tag.parent and tag.parent.name in {"ul", "ol"}:
+        lines = text.strip().split("\n")
+        indented_lines = []
+        for line in lines:
+            if line.strip():
+                indented_lines.append(f"{list_indent_str}{line}")
+            else:
+                indented_lines.append("")
+        result = "\n".join(indented_lines)
+        if not result.endswith("\n"):
+            result += "\n"
+        return result
     return text + ("\n" if before_paragraph else "")
-def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
+def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
     checkbox = tag.find("input", {"type": "checkbox"})
     if checkbox and isinstance(checkbox, Tag):
         checked = checkbox.get("checked") is not None
@@ -355,7 +349,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
         bullet = bullets[depth % len(bullets)]
-    # Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
     has_block_children = any(
         child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
         for child in tag.children
@@ -363,29 +356,26 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
     )
     if has_block_children:
-        # Handle multi-paragraph list items
-        # Split by double newlines (paragraph separators)
         paragraphs = text.strip().split("\n\n")
         if paragraphs:
-            # First paragraph goes directly after the bullet
             result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
-            # Subsequent paragraphs need to be indented and separated by blank lines
             for para in paragraphs[1:]:
                 if para.strip():
-                    # Add blank line before the paragraph
                     result_parts.append("\n")
-                    # Indent each line of the paragraph by 4 spaces
-                    result_parts.extend(f"    {line}\n" for line in para.strip().split("\n") if line.strip())
+                    result_parts.extend(
+                        f"{list_indent_str}{line}\n" for line in para.strip().split("\n") if line.strip()
+                    )
             return "".join(result_parts)
-    # Simple case: no block elements, just inline content
     return "{} {}\n".format(bullet, (text or "").strip())
-def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
+def _convert_p(
+    *, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag, list_indent_str: str
+) -> str:
     if convert_as_inline:
         return text
@@ -399,24 +389,19 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
-    # Check if this paragraph is inside a list item
     if _has_ancestor(tag, "li"):
-        # Check if this is the first paragraph in the list item
         parent = tag.parent
         while parent and parent.name != "li":
             parent = parent.parent
         if parent:
-            # Get all direct children that are paragraphs
             p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
-            # If this is not the first paragraph, indent it
             if p_children and tag != p_children[0]:
-                # Indent all lines by 4 spaces
                 indented_lines = []
                 for line in text.split("\n"):
                     if line.strip():
-                        indented_lines.append(f"    {line}")
+                        indented_lines.append(f"{list_indent_str}{line}")
                     else:
                         indented_lines.append("")
                 text = "\n".join(indented_lines)
@@ -425,16 +410,6 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
 def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -> str:
-    """Convert HTML mark element to Markdown highlighting.
-    Args:
-        text: The text content of the mark element.
-        convert_as_inline: Whether to convert as inline content.
-        highlight_style: The style to use for highlighting ("double-equal", "html", "bold").
-    Returns:
-        The converted markdown text.
-    """
     if convert_as_inline:
         return text
@@ -480,13 +455,11 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
     parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
     tag_grand_parent = tag.parent.parent if tag.parent else None
-    # Simple rowspan handling: if previous row had cells with rowspan, add empty cells
     if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
         prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
         rowspan_positions = []
         col_pos = 0
-        # Check which cells in previous row have rowspan > 1
         for prev_cell in prev_cells:
             rowspan = 1
             if (
@@ -497,10 +470,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
                 rowspan = int(prev_cell["rowspan"])
             if rowspan > 1:
-                # This cell spans into current row
                 rowspan_positions.append(col_pos)
-            # Account for colspan
             colspan = 1
             if (
                 "colspan" in prev_cell.attrs
@@ -510,25 +481,22 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
                 colspan = int(prev_cell["colspan"])
             col_pos += colspan
-        # If there are rowspan cells from previous row, add empty cells
         if rowspan_positions:
-            # Build new text with empty cells inserted
-            new_cells = []
+            converted_cells: list[str] = []
+            if text.strip():
+                parts = text.split("|")
+                converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
+            new_cells: list[str] = []
             cell_index = 0
-            for pos in range(col_pos):  # Total columns
+            for pos in range(col_pos):
                 if pos in rowspan_positions:
-                    # Add empty cell for rowspan
                     new_cells.append(" |")
-                elif cell_index < len(cells):
-                    # Add actual cell content
-                    cell = cells[cell_index]
-                    cell_text = cell.get_text().strip().replace("\n", " ")
-                    colspan = _get_colspan(cell)
-                    new_cells.append(f" {cell_text} |" * colspan)
+                elif cell_index < len(converted_cells):
+                    new_cells.append(converted_cells[cell_index])
                     cell_index += 1
-            # Override text with new cell arrangement
             text = "".join(new_cells)
     is_headrow = (
@@ -563,15 +531,6 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
 def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML caption element to emphasized text.
-    Args:
-        text: The text content of the caption element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with caption formatting.
-    """
     if convert_as_inline:
         return text
@@ -582,15 +541,6 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
 def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML thead element preserving table structure.
-    Args:
-        text: The text content of the thead element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text preserving table structure.
-    """
     if convert_as_inline:
         return text
@@ -598,15 +548,6 @@ def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
 def _convert_tbody(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML tbody element preserving table structure.
-    Args:
-        text: The text content of the tbody element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text preserving table structure.
-    """
     if convert_as_inline:
         return text
@@ -614,15 +555,6 @@ def _convert_tbody(*, text: str, convert_as_inline: bool) -> str:
 def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML tfoot element preserving table structure.
-    Args:
-        text: The text content of the tfoot element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text preserving table structure.
-    """
     if convert_as_inline:
         return text
@@ -630,103 +562,41 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
 def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML colgroup element - removes it entirely from Markdown output.
-    Colgroup is a table column grouping element that defines styling for columns.
-    It has no representation in Markdown and should be removed.
-    Args:
-        tag: The colgroup tag element.
-        text: The text content of the colgroup element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        Empty string as colgroup has no Markdown representation.
-    """
     _ = tag, text, convert_as_inline
-    # Colgroup and its contents (col elements) are purely presentational
-    # and have no equivalent in Markdown tables
     return ""
 def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
-    """Convert HTML col element - removes it entirely from Markdown output.
-    Col elements define column properties (width, style) in HTML tables.
-    They have no representation in Markdown and should be removed.
-    Args:
-        tag: The col tag element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        Empty string as col has no Markdown representation.
-    """
     _ = tag, convert_as_inline
-    # Col elements are self-closing and purely presentational
     return ""
 def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML5 semantic elements to block-level Markdown.
-    Args:
-        text: The text content of the semantic element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with proper block spacing.
-    """
     if convert_as_inline:
         return text
     return f"{text}\n\n" if text.strip() else ""
-def _convert_details(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML details element to semantic Markdown.
+def _convert_div(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
+    return text
-    Args:
-        text: The text content of the details element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
+def _convert_details(*, text: str, convert_as_inline: bool) -> str:
     if convert_as_inline:
         return text
-    # Details is a semantic container, return its content
     return _format_block_element(text)
 def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML summary element to emphasized text.
-    Args:
-        text: The text content of the summary element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text as bold heading.
-    """
     if convert_as_inline:
         return text
-    # Summary is like a heading/title
     return _format_wrapped_block(text, "**")
 def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML definition list element.
-    Args:
-        text: The text content of the definition list.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with proper spacing.
-    """
     if convert_as_inline:
         return text
@@ -734,15 +604,6 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
 def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML definition term element.
-    Args:
-        text: The text content of the definition term.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text as a definition term.
-    """
     if convert_as_inline:
         return text
@@ -753,15 +614,6 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
 def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML definition description element.
-    Args:
-        text: The text content of the definition description.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text as a definition description.
-    """
     if convert_as_inline:
         return text
@@ -772,15 +624,6 @@ def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
 def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML cite element to italic text.
-    Args:
-        text: The text content of the cite element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text in italic format.
-    """
     if convert_as_inline:
         return text
@@ -791,15 +634,6 @@ def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
 def _convert_q(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML q element to quoted text.
-    Args:
-        text: The text content of the q element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with quotes.
-    """
     if convert_as_inline:
         return text
@@ -811,33 +645,20 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
 def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML media elements (audio/video) to semantic Markdown.
-    Args:
-        tag: The media tag element.
-        text: The text content of the media element (fallback content).
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (link if src exists, otherwise fallback content).
-    """
     src = tag.get("src", "")
     if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
         src = source_tag.get("src", "")
-    # If we have a src, convert to a link
     if src and isinstance(src, str) and src.strip():
         link = f"[{src}]({src})"
         if convert_as_inline:
             return link
         result = f"{link}\n\n"
-        # Add fallback content if present
         if text.strip():
             result += f"{text.strip()}\n\n"
         return result
-    # No src, just return fallback content
     if text.strip():
         return _format_inline_or_block(text, convert_as_inline)
@@ -845,20 +666,9 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
 def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML iframe element to semantic Markdown.
-    Args:
-        tag: The iframe tag element.
-        text: The text content of the iframe element (usually empty).
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (link if src exists).
-    """
     _ = text
     src = tag.get("src", "")
-    # If we have a src, convert to a link
     if src and isinstance(src, str) and src.strip():
         link = f"[{src}]({src})"
         if convert_as_inline:
@@ -869,16 +679,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML abbr element to text with optional title.
-    Args:
-        tag: The abbr tag element.
-        text: The text content of the abbr element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with optional title annotation.
-    """
     _ = convert_as_inline
     if not text.strip():
         return ""
@@ -891,69 +691,29 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML time element to semantic Markdown.
-    Args:
-        tag: The time tag element.
-        text: The text content of the time element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (content only, no HTML tags).
-    """
     _ = tag
     _ = convert_as_inline
     if not text.strip():
         return ""
-    # Time elements are semantic - just return the content
     return text.strip()
 def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML data element to semantic Markdown.
-    Args:
-        tag: The data tag element.
-        text: The text content of the data element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (content only, no HTML tags).
-    """
     _ = tag
     _ = convert_as_inline
     if not text.strip():
         return ""
-    # Data elements are semantic - just return the content
     return text.strip()
 def _convert_wbr(*, convert_as_inline: bool) -> str:
-    """Convert HTML wbr (word break opportunity) element.
-    Args:
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        Empty string as wbr is just a break opportunity.
-    """
     _ = convert_as_inline
     return ""
 def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML form element to semantic Markdown.
-    Args:
-        tag: The form tag element.
-        text: The text content of the form element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -961,63 +721,31 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     if not text.strip():
         return ""
-    # Forms are just containers, return their content
     return text
 def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML fieldset element to semantic Markdown.
-    Args:
-        text: The text content of the fieldset element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     if convert_as_inline:
         return text
     if not text.strip():
         return ""
-    # Fieldsets are semantic groupings, return their content
     return text
 def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML legend element to emphasized text.
-    Args:
-        text: The text content of the legend element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text as emphasized legend.
-    """
     if convert_as_inline:
         return text
     if not text.strip():
         return ""
-    # Legend is like a heading/title for fieldsets
     return _format_wrapped_block(text, "**")
 def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML label element to Markdown.
-    Args:
-        tag: The label tag element.
-        text: The text content of the label element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The label text content.
-    """
     _ = tag
-    # Labels are just text, return the content
     if not text.strip():
         return ""
@@ -1025,33 +753,12 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
-    """Convert HTML input element to Markdown.
-    Args:
-        tag: The input tag element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        Empty string since input elements have no Markdown representation.
-    """
     _ = tag, convert_as_inline
-    # Input elements have no content and no Markdown equivalent
     return ""
 def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML textarea element to Markdown.
-    Args:
-        tag: The textarea tag element.
-        text: The text content of the textarea element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The text content of the textarea.
-    """
     _ = tag
-    # Return the text content, which is what the user entered
     if not text.strip():
         return ""
@@ -1059,69 +766,33 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML select element to Markdown.
-    Args:
-        tag: The select tag element.
-        text: The text content of the select element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The text content (options) as a comma-separated list.
-    """
     _ = tag
-    # Return the options as text
     if not text.strip():
         return ""
-    # In inline mode, show options separated by commas
     if convert_as_inline:
-        # Remove extra whitespace and join options
         options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
         return ", ".join(options)
-    # In block mode, show as a list
     return _format_block_element(text)
 def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML option element to Markdown.
-    Args:
-        tag: The option tag element.
-        text: The text content of the option element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The option text, potentially with a marker if selected.
-    """
     if not text.strip():
         return ""
-    # Check if this option is selected
     selected = tag.get("selected") is not None
     content = text.strip()
     if convert_as_inline:
         return content
-    # In block mode, mark selected options
     if selected:
         return f"* {content}\n"
     return f"{content}\n"
 def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML optgroup element to semantic Markdown.
-    Args:
-        tag: The optgroup tag element.
-        text: The text content of the optgroup element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with label as heading.
-    """
     if convert_as_inline:
         return text
@@ -1131,7 +802,6 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     label = tag.get("label", "")
     content = text.strip()
-    # If there's a label, show it as a heading
     if label and isinstance(label, str) and label.strip():
         return f"**{label.strip()}**\n{content}\n"
@@ -1139,18 +809,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML button element to Markdown.
-    Args:
-        tag: The button tag element.
-        text: The text content of the button element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The button text content.
-    """
     _ = tag
-    # Buttons are just interactive text, return the text content
     if not text.strip():
         return ""
@@ -1158,16 +817,6 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
 def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML progress element to semantic text.
-    Args:
-        tag: The progress tag element.
-        text: The text content of the progress element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1175,21 +824,10 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     if not text.strip():
         return ""
-    # Progress elements convert to their text content
     return _format_block_element(text)
 def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML meter element to semantic text.
-    Args:
-        tag: The meter tag element.
-        text: The text content of the meter element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1197,21 +835,10 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     if not text.strip():
         return ""
-    # Meter elements convert to their text content
     return _format_block_element(text)
 def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML output element to semantic text.
-    Args:
-        tag: The output tag element.
-        text: The text content of the output element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1219,21 +846,10 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     if not text.strip():
         return ""
-    # Output elements convert to their text content
     return _format_block_element(text)
 def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML datalist element to semantic Markdown.
-    Args:
-        tag: The datalist tag element.
-        text: The text content of the datalist element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1241,20 +857,10 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     if not text.strip():
         return ""
-    # Datalist shows options as a list
     return _format_block_element(text)
 def _convert_ruby(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
-    """Convert HTML ruby element providing pronunciation annotation.
-    Args:
-        text: The text content of the ruby element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text with ruby annotation as fallback text.
-    """
     if not text.strip():
         return ""
@@ -1262,15 +868,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
 def _convert_rb(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
-    """Convert HTML rb (ruby base) element.
-    Args:
-        text: The text content of the rb element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (ruby base text).
-    """
     if not text.strip():
         return ""
@@ -1278,16 +875,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
 def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str:  # noqa: ARG001
-    """Convert HTML rt (ruby text) element for pronunciation.
-    Args:
-        text: The text content of the rt element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The rt tag element.
-    Returns:
-        The converted markdown text with pronunciation in parentheses.
-    """
     content = text.strip()
     prev_sibling = tag.previous_sibling
@@ -1303,15 +890,6 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str:  # noqa
 def _convert_rp(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
-    """Convert HTML rp (ruby parentheses) element for fallback.
-    Args:
-        text: The text content of the rp element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (parentheses for ruby fallback).
-    """
     if not text.strip():
         return ""
@@ -1319,15 +897,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
 def _convert_rtc(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
-    """Convert HTML rtc (ruby text container) element.
-    Args:
-        text: The text content of the rtc element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (ruby text container).
-    """
     if not text.strip():
         return ""
@@ -1335,16 +904,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
 def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert HTML dialog element to semantic Markdown.
-    Args:
-        text: The text content of the dialog element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The dialog tag element.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1352,21 +911,10 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if not text.strip():
         return ""
-    # Dialog is a semantic container, return its content
     return _format_block_element(text)
 def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert HTML menu element to semantic Markdown.
-    Args:
-        text: The text content of the menu element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The menu tag element.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if convert_as_inline:
         return text
@@ -1374,21 +922,10 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if not text.strip():
         return ""
-    # Menu is converted as a list
     return _format_block_element(text)
 def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert HTML figure element to semantic Markdown.
-    Args:
-        text: The text content of the figure element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The figure tag element.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     _ = tag
     if not text.strip():
         return ""
@@ -1396,8 +933,6 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if convert_as_inline:
         return text
-    # Figure is a semantic container, return its content
-    # Make sure there's proper spacing after the figure content
     content = text.strip()
     if content and not content.endswith("\n\n"):
         if content.endswith("\n"):
@@ -1408,55 +943,24 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
 def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
-    """Convert HTML hgroup element to semantic Markdown.
-    Args:
-        text: The text content of the hgroup element.
-        convert_as_inline: Whether to convert as inline content.
-    Returns:
-        The converted markdown text (only content, no HTML tags).
-    """
     if convert_as_inline:
         return text
     if not text.strip():
         return ""
-    # Hgroup is a semantic container for headings, return its content
     return text
 def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert HTML picture element to semantic Markdown.
-    Args:
-        text: The text content of the picture element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The picture tag element.
-    Returns:
-        The converted markdown text (only the img element).
-    """
     _ = tag, convert_as_inline
     if not text.strip():
         return ""
-    # Picture is a container for responsive images, only the img matters for Markdown
     return text.strip()
 def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert SVG element to Markdown image reference.
-    Args:
-        text: The text content of the SVG element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The SVG tag element.
-    Returns:
-        The converted markdown text as an image reference.
-    """
     if convert_as_inline:
         return text.strip()
@@ -1475,16 +979,6 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
 def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
-    """Convert MathML math element preserving mathematical notation.
-    Args:
-        text: The text content of the math element.
-        convert_as_inline: Whether to convert as inline content.
-        tag: The math tag element.
-    Returns:
-        The converted markdown text preserving math structure.
-    """
     if not text.strip():
         return ""
@@ -1507,6 +1001,8 @@ def create_converters_map(
     heading_style: Literal["atx", "atx_closed", "underlined"],
     highlight_style: Literal["double-equal", "html", "bold"],
     keep_inline_images_in: Iterable[str] | None,
+    list_indent_type: str,
+    list_indent_width: int,
     newline_style: str,
     strong_em_symbol: str,
     sub_symbol: str,
@@ -1514,27 +1010,7 @@ def create_converters_map(
     wrap: bool,
     wrap_width: int,
 ) -> ConvertersMap:
-    """Create a mapping of HTML elements to their corresponding conversion functions.
-    Args:
-        autolinks: Whether to convert URLs into links.
-        bullets: The bullet characters to use for unordered lists.
-        code_language: The default code language to use.
-        code_language_callback: A callback to get the code language.
-        default_title: Whether to use the URL as the title for links.
-        heading_style: The style of headings.
-        highlight_style: The style to use for highlighted text (mark elements).
-        keep_inline_images_in: The tags to keep inline images in.
-        newline_style: The style of newlines.
-        strong_em_symbol: The symbol to use for strong and emphasis text.
-        sub_symbol: The symbol to use for subscript text.
-        sup_symbol: The symbol to use for superscript text.
-        wrap: Whether to wrap text.
-        wrap_width: The width to wrap text at.
-    Returns:
-        A mapping of HTML elements to their corresponding conversion functions
-    """
+    list_indent_str = "\t" if list_indent_type == "tabs" else " " * list_indent_width
     def _wrapper(func: Callable[..., T]) -> Callable[[str, Tag], T]:
         spec = getfullargspec(func)
@@ -1548,6 +1024,8 @@ def create_converters_map(
                     kwargs["text"] = text
                 if "convert_as_inline" in spec.kwonlyargs:
                     kwargs["convert_as_inline"] = convert_as_inline
+                if "list_indent_str" in spec.kwonlyargs:
+                    kwargs["list_indent_str"] = list_indent_str
                 return func(**kwargs)
             return func(text)
@@ -1562,7 +1040,7 @@ def create_converters_map(
         "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
         "bdi": _wrapper(_create_inline_converter("")),
         "bdo": _wrapper(_create_inline_converter("")),
-        "blockquote": _wrapper(partial(_convert_blockquote)),
+        "blockquote": _wrapper(partial(_convert_blockquote, list_indent_str=list_indent_str)),
         "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
         "button": _wrapper(_convert_button),
         "caption": _wrapper(_convert_caption),
@@ -1577,6 +1055,7 @@ def create_converters_map(
         "details": _wrapper(_convert_details),
         "dfn": _wrapper(_create_inline_converter("*")),
         "dialog": _wrapper(_convert_dialog),
+        "div": _wrapper(_convert_div),
         "dl": _wrapper(_convert_dl),
         "dt": _wrapper(_convert_dt),
         "em": _wrapper(_create_inline_converter(strong_em_symbol)),
@@ -1602,19 +1081,19 @@ def create_converters_map(
         "kbd": _wrapper(_create_inline_converter("`")),
         "label": _wrapper(_convert_label),
         "legend": _wrapper(_convert_legend),
-        "li": _wrapper(partial(_convert_li, bullets=bullets)),
-        "list": _wrapper(_convert_list),
+        "li": _wrapper(partial(_convert_li, bullets=bullets, list_indent_str=list_indent_str)),
+        "list": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
         "main": _wrapper(_convert_semantic_block),
         "mark": _wrapper(partial(_convert_mark, highlight_style=highlight_style)),
         "math": _wrapper(_convert_math),
         "menu": _wrapper(_convert_menu),
         "meter": _wrapper(_convert_meter),
         "nav": _wrapper(_convert_semantic_block),
-        "ol": _wrapper(_convert_list),
+        "ol": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
         "optgroup": _wrapper(_convert_optgroup),
         "option": _wrapper(_convert_option),
         "output": _wrapper(_convert_output),
-        "p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width)),
+        "p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width, list_indent_str=list_indent_str)),
         "picture": _wrapper(_convert_picture),
         "pre": _wrapper(
             partial(
@@ -1652,7 +1131,7 @@ def create_converters_map(
         "time": _wrapper(_convert_time),
         "tr": _wrapper(_convert_tr),
         "u": _wrapper(_create_inline_converter("")),
-        "ul": _wrapper(_convert_list),
+        "ul": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
         "var": _wrapper(_create_inline_converter("*")),
         "video": _wrapper(_convert_media_element),
         "wbr": _wrapper(_convert_wbr),

html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.9.0 py3-none-any.whl → 1.10.0 py3-none-any.whl