PyPI - html-to-markdown - Versions diffs - 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (16) hide show

html_to_markdown/__main__.py +0 -1
html_to_markdown/cli.py +101 -45
html_to_markdown/constants.py +3 -0
html_to_markdown/converters.py +34 -502
html_to_markdown/exceptions.py +1 -11
html_to_markdown/preprocessor.py +0 -37
html_to_markdown/processing.py +117 -191
html_to_markdown/utils.py +2 -42
html_to_markdown/whitespace.py +303 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/METADATA +196 -204
html_to_markdown-1.11.0.dist-info/RECORD +17 -0
html_to_markdown-1.9.1.dist-info/RECORD +0 -16
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/WHEEL +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/top_level.txt +0 -0

html_to_markdown/whitespace.py ADDED Viewed

@@ -0,0 +1,303 @@
+"""Whitespace handling module for HTML to Markdown conversion."""
+from __future__ import annotations
+import re
+import unicodedata
+from typing import TYPE_CHECKING, Literal
+if TYPE_CHECKING:
+    from bs4 import NavigableString, PageElement, Tag
+# The two whitespace-handling modes accepted by ``WhitespaceHandler(mode=...)``.
+WhitespaceMode = Literal["normalized", "strict"]
+# Tag names that ``WhitespaceHandler.is_block_element`` reports as block-level.
+BLOCK_ELEMENTS = {
+    "address",
+    "article",
+    "aside",
+    "blockquote",
+    "canvas",
+    "datalist",
+    "dd",
+    "details",
+    "div",
+    "dl",
+    "dt",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "header",
+    "hr",
+    "legend",
+    "li",
+    "main",
+    "nav",
+    "noscript",
+    "ol",
+    "option",
+    "p",
+    "pre",
+    "section",
+    "summary",
+    "table",
+    "tfoot",
+    "ul",
+}
+# Tag names whose text (at any nesting depth below them) is returned verbatim;
+# consulted by ``WhitespaceHandler.should_preserve_whitespace``.
+PRESERVE_WHITESPACE_ELEMENTS = {"pre", "script", "style"}
+# Tag names that ``WhitespaceHandler.is_inline_element`` reports as inline.
+# NOTE(review): "script" and "style" also appear in PRESERVE_WHITESPACE_ELEMENTS.
+INLINE_ELEMENTS = {
+    "a",
+    "abbr",
+    "acronym",
+    "audio",
+    "b",
+    "bdi",
+    "bdo",
+    "big",
+    "br",
+    "button",
+    "cite",
+    "code",
+    "data",
+    "dfn",
+    "dialog",
+    "em",
+    "i",
+    "iframe",
+    "img",
+    "input",
+    "kbd",
+    "label",
+    "map",
+    "math",
+    "menu",
+    "meter",
+    "object",
+    "output",
+    "progress",
+    "q",
+    "rb",
+    "rp",
+    "rt",
+    "rtc",
+    "ruby",
+    "samp",
+    "script",
+    "select",
+    "small",
+    "span",
+    "strong",
+    "style",
+    "sub",
+    "sup",
+    "svg",
+    "textarea",
+    "time",
+    "tt",
+    "u",
+    "var",
+    "video",
+    "del",
+    "ins",
+    "mark",
+    "s",
+    "strike",
+    "wbr",
+}
+class WhitespaceHandler:
+    """Decide how whitespace inside HTML text nodes is carried into the output.
+
+    With ``mode="strict"`` text is returned unchanged. With ``mode="normalized"``
+    Unicode spaces and line endings are normalized, runs of spaces/tabs are
+    collapsed, and leading/trailing whitespace is kept or dropped depending on
+    the text node's parent and siblings.
+    """
+    def __init__(self, mode: WhitespaceMode = "normalized") -> None:
+        """Store ``mode`` and pre-compile the regular expressions.
+
+        Args:
+            mode: ``"normalized"`` (default) or ``"strict"``.
+        """
+        self.mode = mode
+        self._multiple_spaces = re.compile(r"[ \t]+")
+        # NOTE(review): the next two patterns are not referenced by any method of
+        # this class; they are kept in case code outside this class reads them.
+        self._multiple_newlines = re.compile(r"\n{2,}")
+        self._leading_trailing_space = re.compile(r"^[ \t]+|[ \t]+$", re.MULTILINE)
+        self._unicode_spaces = re.compile(r"[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]")
+    def normalize_unicode_spaces(self, text: str) -> str:
+        """Return ``text`` with separators turned into spaces and CR/CRLF into LF.
+
+        Every character whose Unicode category is ``Zs``, ``Zl`` or ``Zp`` becomes
+        an ASCII space; ``"\\r\\n"`` and a lone ``"\\r"`` each become one ``"\\n"``.
+        """
+        # Fold line endings on the whole string first: a per-character scan can
+        # never see the two-character "\r\n" and would emit two newlines for it.
+        text = text.replace("\r\n", "\n").replace("\r", "\n")
+        text = self._unicode_spaces.sub(" ", text)
+        return "".join(
+            " " if unicodedata.category(char) in ("Zs", "Zl", "Zp") else char for char in text
+        )
+    def should_preserve_whitespace(self, element: PageElement) -> bool:
+        """Return True when the text of ``element`` must be left untouched.
+
+        That is always the case in strict mode; otherwise it is the case when
+        ``element`` or any of its ancestors (followed through ``.parent``) has a
+        name listed in ``PRESERVE_WHITESPACE_ELEMENTS``.
+        """
+        if self.mode == "strict":
+            return True
+        current: PageElement | None = element
+        # Compare against None explicitly: a text node that is an empty string is
+        # falsy, and a truthiness test would skip the ancestor walk for it.
+        while current is not None:
+            if getattr(current, "name", None) in PRESERVE_WHITESPACE_ELEMENTS:
+                return True
+            current = getattr(current, "parent", None)
+        return False
+    def is_block_element(self, element: PageElement | None) -> bool:
+        """Return True when ``element`` has a ``name`` listed in ``BLOCK_ELEMENTS``."""
+        if not element or not hasattr(element, "name"):
+            return False
+        return element.name in BLOCK_ELEMENTS
+    def is_inline_element(self, element: PageElement | None) -> bool:
+        """Return True when ``element`` has a ``name`` listed in ``INLINE_ELEMENTS``."""
+        if not element or not hasattr(element, "name"):
+            return False
+        return element.name in INLINE_ELEMENTS
+    def process_text_whitespace(
+        self,
+        text: str,
+        element: NavigableString,
+        *,
+        in_pre: bool = False,
+    ) -> str:
+        """Apply the configured whitespace policy to the text of one text node.
+
+        Args:
+            text: The text to process.
+            element: The text node ``text`` came from; its parent and siblings
+                are inspected in normalized mode.
+            in_pre: When True, ``text`` is returned unchanged.
+
+        Returns:
+            ``""`` for empty input; ``text`` unchanged when ``in_pre`` is set, in
+            strict mode, or below a whitespace-preserving element; otherwise the
+            normalized text.
+        """
+        if not text:
+            return ""
+        # Strict mode is tested here directly, so no separate (previously
+        # unreachable) strict check is needed after this guard.
+        if in_pre or self.mode == "strict" or self.should_preserve_whitespace(element):
+            return text
+        text = self.normalize_unicode_spaces(text)
+        return self._process_normalized(text, element)
+    def _process_normalized(self, text: str, element: NavigableString) -> str:
+        """Dispatch on whether ``text`` is whitespace-only or has visible content."""
+        if not text.strip():
+            return self._process_whitespace_only(text, element)
+        return self._process_text_with_content(text, element)
+    def _process_whitespace_only(self, text: str, element: NavigableString) -> str:
+        """Reduce a whitespace-only text node to ``""`` or a single space.
+
+        A single space is returned only when the text contains no newline, at
+        least one adjacent sibling is inline, and the node is not sitting between
+        two block elements; every other case yields ``""``.
+        """
+        prev_sibling = element.previous_sibling
+        next_sibling = element.next_sibling
+        if self.is_block_element(prev_sibling) and self.is_block_element(next_sibling):
+            return ""
+        if "\n" in text:
+            return ""
+        if self.is_inline_element(prev_sibling) or self.is_inline_element(next_sibling):
+            return " "
+        return ""
+    def _process_text_with_content(self, text: str, element: NavigableString) -> str:
+        """Collapse inner whitespace, then restore edge spacing based on context.
+
+        The text is stripped and runs of spaces/tabs become one space. Edge
+        spacing is then re-added by one of three helpers, chosen by the parent:
+        ``ruby``/``select``/``datalist``, any other inline parent, or anything else.
+        """
+        # Edge whitespace is detected on the node's own string, not on ``text``.
+        original = str(element)
+        has_lead_space = bool(original) and original[0] in " \t\n"
+        has_trail_space = bool(original) and original[-1] in " \t\n"
+        text = self._multiple_spaces.sub(" ", text.strip())
+        parent = element.parent
+        if parent and hasattr(parent, "name") and parent.name in {"ruby", "select", "datalist"}:
+            return self._process_special_inline_containers(text, original)
+        if parent and self.is_inline_element(parent):
+            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
+        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
+    def _process_special_inline_containers(self, text: str, original: str) -> str:
+        """Re-add edge spaces for text directly inside ruby/select/datalist.
+
+        Edge spaces are restored only when ``original`` contains neither a newline
+        nor a tab, and only where ``original`` starts/ends with a literal space.
+        """
+        if original and "\n" not in original and "\t" not in original:
+            if original[0] == " ":
+                text = " " + text
+            if original[-1] == " ":
+                text = text + " "
+        return text
+    def _process_inline_element_text(
+        self, text: str, original: str, has_lead_space: bool, has_trail_space: bool
+    ) -> str:
+        """Re-add one edge space where ``original`` starts/ends with a literal space.
+
+        Leading or trailing tabs and newlines in ``original`` are not restored.
+        """
+        if has_lead_space and original[0] == " ":
+            text = " " + text
+        if has_trail_space and original[-1] == " ":
+            text = text + " "
+        return text
+    def _process_standalone_text(
+        self, text: str, original: str, element: NavigableString, has_lead_space: bool, has_trail_space: bool
+    ) -> str:
+        """Re-add edge spacing for text whose parent is not an inline element.
+
+        Per edge: when ``original`` starts/ends with a newline or tab, a space is
+        added only if the adjacent sibling is inline; when it starts/ends with a
+        literal space, a space is added if the adjacent sibling is inline, is a
+        block element, or is missing. ``"\\n\\n"`` is appended when ``original``
+        holds at least two newlines, the next sibling is a block element and the
+        previous sibling is inline or missing.
+        """
+        prev_sibling = element.previous_sibling
+        next_sibling = element.next_sibling
+        prev_is_inline = self.is_inline_element(prev_sibling)
+        next_is_inline = self.is_inline_element(next_sibling)
+        # Decided from the incoming text, before any padding is applied below.
+        multiple_newlines_before_block = bool(
+            original
+            and original.count("\n") >= 2
+            and self.is_block_element(next_sibling)
+            and text.strip()
+            and (prev_is_inline or prev_sibling is None)
+        )
+        if original and original[0] in "\n\t":
+            pad_left = prev_is_inline
+        else:
+            pad_left = (
+                has_lead_space
+                and original[0] == " "
+                and (prev_is_inline or self.is_block_element(prev_sibling) or prev_sibling is None)
+            )
+        if original and original[-1] in "\n\t":
+            pad_right = next_is_inline
+        else:
+            pad_right = (
+                has_trail_space
+                and original[-1] == " "
+                and (next_is_inline or self.is_block_element(next_sibling) or next_sibling is None)
+            )
+        if pad_left:
+            text = " " + text
+        if pad_right:
+            text = text + " "
+        if multiple_newlines_before_block:
+            text = text + "\n\n"
+        return text
+    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
+        """Return the newline string associated with ``tag``.
+
+        Args:
+            tag: The element whose (case-insensitive) name selects the spacing.
+            next_sibling: The element following ``tag``, if any.
+
+        Returns:
+            ``""`` in strict mode. Otherwise ``"\\n\\n"`` for p/div/blockquote/pre/
+            table/ul/ol/dl when ``next_sibling`` is a block element and ``"\\n"``
+            when it is not; ``"\\n"`` for li/dt/dd/tr/td/th; ``"\\n\\n"`` for a
+            two-character name made of ``h`` plus a digit; ``""`` for anything else.
+        """
+        if self.mode == "strict":
+            return ""
+        tag_name = tag.name.lower() if hasattr(tag, "name") else ""
+        double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
+        single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
+        if tag_name in double_newline_elements:
+            if self.is_block_element(next_sibling):
+                return "\n\n"
+            return "\n"
+        if tag_name in single_newline_elements:
+            return "\n"
+        if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
+            return "\n\n"
+        return ""

html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl