PyPI - html-to-markdown - Versions diffs - 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl - Mend

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (14) hide show

html_to_markdown/cli.py +28 -2
html_to_markdown/converters.py +208 -130
html_to_markdown/exceptions.py +5 -0
html_to_markdown/preprocessor.py +96 -86
html_to_markdown/processing.py +63 -48
html_to_markdown/utils.py +1 -3
html_to_markdown/whitespace.py +23 -33
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/METADATA +143 -2
html_to_markdown-1.12.1.dist-info/RECORD +17 -0
html_to_markdown-1.11.0.dist-info/RECORD +0 -17
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/WHEEL +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/entry_points.txt +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/top_level.txt +0 -0

html_to_markdown/preprocessor.py CHANGED Viewed

@@ -5,6 +5,98 @@ from typing import Any
 import nh3
+BASE_ALLOWED_TAGS = frozenset(
+    {
+        "p",
+        "div",
+        "span",
+        "br",
+        "hr",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "ul",
+        "ol",
+        "li",
+        "dl",
+        "dt",
+        "dd",
+        "strong",
+        "b",
+        "em",
+        "i",
+        "u",
+        "s",
+        "del",
+        "ins",
+        "mark",
+        "small",
+        "sub",
+        "sup",
+        "code",
+        "pre",
+        "kbd",
+        "samp",
+        "var",
+        "abbr",
+        "cite",
+        "dfn",
+        "time",
+        "data",
+        "a",
+        "blockquote",
+        "q",
+    }
+)
+SEMANTIC_STRUCTURE_TAGS = frozenset(
+    {
+        "article",
+        "section",
+        "aside",
+        "header",
+        "footer",
+        "main",
+        "nav",
+        "figure",
+        "figcaption",
+        "details",
+        "summary",
+    }
+)
+TABLE_TAGS = frozenset(
+    {
+        "table",
+        "thead",
+        "tbody",
+        "tfoot",
+        "tr",
+        "td",
+        "th",
+        "caption",
+        "colgroup",
+        "col",
+    }
+)
+MEDIA_TAGS = frozenset(
+    {
+        "img",
+        "picture",
+        "source",
+        "audio",
+        "video",
+        "track",
+        "canvas",
+        "svg",
+        "iframe",
+    }
+)
 def preprocess_html(
     html: str,
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
     custom_tags_to_remove: set[str],
     custom_attributes_to_remove: set[str],
 ) -> dict[str, Any]:
-    allowed_tags = {
-        "p",
-        "div",
-        "span",
-        "br",
-        "hr",
-        "h1",
-        "h2",
-        "h3",
-        "h4",
-        "h5",
-        "h6",
-        "ul",
-        "ol",
-        "li",
-        "dl",
-        "dt",
-        "dd",
-        "strong",
-        "b",
-        "em",
-        "i",
-        "u",
-        "s",
-        "del",
-        "ins",
-        "mark",
-        "small",
-        "sub",
-        "sup",
-        "code",
-        "pre",
-        "kbd",
-        "samp",
-        "var",
-        "abbr",
-        "cite",
-        "dfn",
-        "time",
-        "data",
-        "a",
-        "blockquote",
-        "q",
-    }
+    allowed_tags = set(BASE_ALLOWED_TAGS)
     if preserve_semantic_structure:
-        allowed_tags.update(
-            {
-                "article",
-                "section",
-                "aside",
-                "header",
-                "footer",
-                "main",
-                "nav",
-                "figure",
-                "figcaption",
-                "details",
-                "summary",
-            }
-        )
+        allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
     if preserve_tables:
-        allowed_tags.update(
-            {
-                "table",
-                "thead",
-                "tbody",
-                "tfoot",
-                "tr",
-                "th",
-                "td",
-                "caption",
-                "col",
-                "colgroup",
-            }
-        )
+        allowed_tags.update(TABLE_TAGS)
     if preserve_media:
-        allowed_tags.update(
-            {
-                "img",
-                "picture",
-                "source",
-                "audio",
-                "video",
-                "track",
-                "canvas",
-                "svg",
-                "iframe",
-            }
-        )
+        allowed_tags.update(MEDIA_TAGS)
     allowed_tags -= custom_tags_to_remove

html_to_markdown/processing.py CHANGED Viewed

@@ -11,13 +11,13 @@ from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Literal, cast
-from bs4 import BeautifulSoup, Comment, Doctype, Tag
+from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
 try:
     from html_to_markdown.preprocessor import create_preprocessor
     from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
-except ImportError:
+except ImportError:  # pragma: no cover
     create_preprocessor = None  # type: ignore[assignment]
     preprocess_fn = None  # type: ignore[assignment]
@@ -25,7 +25,7 @@ try:
     import importlib.util
     LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
-except ImportError:
+except ImportError:  # pragma: no cover
     LXML_AVAILABLE = False
 from html_to_markdown.constants import (
@@ -179,6 +179,7 @@ def _process_tag(
     strip: set[str] | None,
     whitespace_handler: WhitespaceHandler,
     context_before: str = "",
+    ancestor_names: set[str] | None = None,
 ) -> str:
     should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
     tag_name: SupportedTag | None = (
@@ -186,6 +187,17 @@ def _process_tag(
     )
     text_parts: list[str] = []
+    if ancestor_names is None:
+        ancestor_names = set()
+        current: Tag | None = tag
+        while current and hasattr(current, "name"):
+            if current.name:
+                ancestor_names.add(current.name)
+            current = getattr(current, "parent", None)
+            if len(ancestor_names) > 10:
+                break
     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
     convert_children_as_inline = convert_as_inline or is_heading or is_cell
@@ -201,7 +213,7 @@ def _process_tag(
             if can_extract and isinstance(el, NavigableString) and not el.strip():
                 el.extract()
-    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), tag.children))
     empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
@@ -227,6 +239,7 @@ def _process_tag(
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
                     whitespace_handler=whitespace_handler,
+                    ancestor_names=ancestor_names,
                 )
             )
         elif isinstance(el, Tag):
@@ -243,6 +256,7 @@ def _process_tag(
                     strip=strip,
                     whitespace_handler=whitespace_handler,
                     context_before=(context_before + current_text)[-2:],
+                    ancestor_names=ancestor_names,
                 )
             )
@@ -282,21 +296,23 @@ def _process_text(
     escape_asterisks: bool,
     escape_underscores: bool,
     whitespace_handler: WhitespaceHandler,
+    ancestor_names: set[str] | None = None,
 ) -> str:
     text = str(el) or ""
     parent = el.parent
     parent_name = parent.name if parent else None
-    ancestor_names = set()
-    current = parent
-    while current and hasattr(current, "name"):
-        if current.name:
-            ancestor_names.add(current.name)
-        current = getattr(current, "parent", None)
+    if ancestor_names is None:
+        ancestor_names = set()
+        current = parent
+        while current and hasattr(current, "name"):
+            if current.name:
+                ancestor_names.add(current.name)
+            current = getattr(current, "parent", None)
-        if len(ancestor_names) > 10:
-            break
+            if len(ancestor_names) > 10:
+                break
     in_pre = bool(ancestor_names.intersection({"pre"}))
@@ -322,7 +338,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
 def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
     elem_id = id(element)
     cache = _ancestor_cache.get()
-    if cache is None:
+    if cache is None:  # pragma: no cover
         cache = {}
         _ancestor_cache.set(cache)
@@ -338,7 +354,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
             ancestor_names.add(current.name)
         parent_id = id(current)
-        if parent_id in cache:
+        if parent_id in cache:  # pragma: no cover
             ancestor_names.update(cache[parent_id])
             break
@@ -386,36 +402,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
         metadata["base-href"] = base_tag["href"]
     for meta in soup.find_all("meta"):
-        if meta.get("name") and meta.get("content") is not None:
-            name = meta["name"]
-            content = meta["content"]
+        if (name := meta.get("name")) and (content := meta.get("content")) is not None:
             if isinstance(name, str) and isinstance(content, str):
-                key = f"meta-{name.lower()}"
-                metadata[key] = content
+                metadata[f"meta-{name.lower()}"] = content
-        elif meta.get("property") and meta.get("content") is not None:
-            prop = meta["property"]
-            content = meta["content"]
+        elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
             if isinstance(prop, str) and isinstance(content, str):
-                key = f"meta-{prop.lower().replace(':', '-')}"
-                metadata[key] = content
+                metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
-        elif meta.get("http-equiv") and meta.get("content") is not None:
-            equiv = meta["http-equiv"]
-            content = meta["content"]
-            if isinstance(equiv, str) and isinstance(content, str):
-                key = f"meta-{equiv.lower()}"
-                metadata[key] = content
+        elif (
+            (equiv := meta.get("http-equiv"))
+            and (content := meta.get("content")) is not None
+            and isinstance(equiv, str)
+            and isinstance(content, str)
+        ):
+            metadata[f"meta-{equiv.lower()}"] = content
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
     link_relations = {"author", "license", "alternate"}
-    for rel_type in link_relations:
-        link = soup.find("link", rel=rel_type, href=True)
-        if link and isinstance(link, Tag) and isinstance(link["href"], str):
-            metadata[f"link-{rel_type}"] = link["href"]
+    link_metadata = {
+        f"link-{rel_type}": link["href"]
+        for rel_type in link_relations
+        if (link := soup.find("link", rel=rel_type, href=True))
+        and isinstance(link, Tag)
+        and isinstance(link["href"], str)
+    }
+    metadata.update(link_metadata)
     return metadata
@@ -424,11 +439,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
     if not metadata:
         return ""
-    lines = ["<!--"]
-    for key, value in sorted(metadata.items()):
-        safe_value = value.replace("-->", "--&gt;")
-        lines.append(f"{key}: {safe_value}")
-    lines.append("-->")
+    lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
     return "\n".join(lines) + "\n\n"
@@ -442,6 +453,7 @@ def convert_to_markdown(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -473,7 +485,6 @@ def convert_to_markdown(
     wrap_width: int = 80,
 ) -> str:
     """Convert HTML content to Markdown format.
     This is the main entry point for converting HTML to Markdown. It supports
     various customization options for controlling the conversion behavior.
@@ -485,6 +496,7 @@ def convert_to_markdown(
         progress_callback: Callback for progress updates (current, total).
         parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
         autolinks: Convert URLs to automatic links.
+        br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
         bullets: Characters to use for unordered list bullets.
         code_language: Default language for code blocks.
         code_language_callback: Callback to determine code language from element.
@@ -528,11 +540,9 @@ def convert_to_markdown(
         >>> html = "<h1>Title</h1><p>Content</p>"
         >>> convert_to_markdown(html)
         'Title\\n=====\\n\\nContent\\n\\n'
         With custom options:
         >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
         '# Title\\n\\nContent\\n\\n'
         Discord-compatible lists (2-space indent):
         >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
         >>> convert_to_markdown(html, list_indent_width=2)
@@ -644,7 +654,7 @@ def convert_to_markdown(
         result = re.sub(r"\n{3,}", "\n\n", result)
         if convert_as_inline:
-            result = result.rstrip("\n")
+            result = result.rstrip("\n")  # pragma: no cover
         return result
@@ -658,6 +668,7 @@ def convert_to_markdown(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -819,6 +830,7 @@ def _process_html_core(
     whitespace_handler: WhitespaceHandler,
     parser: str | None = None,
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Any], str] | None,
@@ -849,24 +861,25 @@ def _process_html_core(
     try:
         if isinstance(source, str):
             if strip_newlines:
-                source = source.replace("\n", " ").replace("\r", " ")
+                source = source.replace("\n", " ").replace("\r", " ")  # pragma: no cover
             if "".join(source.split("\n")):
                 if parser is None:
                     parser = "lxml" if LXML_AVAILABLE else "html.parser"
-                if parser == "lxml" and not LXML_AVAILABLE:
+                if parser == "lxml" and not LXML_AVAILABLE:  # pragma: no cover
                     raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
                 source = BeautifulSoup(source, parser)
             else:
                 raise EmptyHtmlError
-        if strip is not None and convert is not None:
+        if strip is not None and convert is not None:  # pragma: no cover
             raise ConflictingOptionsError("strip", "convert")
         converters_map = create_converters_map(
             autolinks=autolinks,
+            br_in_tables=br_in_tables,
             bullets=bullets,
             code_language=code_language,
             code_language_callback=code_language_callback,
@@ -896,7 +909,7 @@ def _process_html_core(
         elements_to_process = body.children if body and isinstance(body, Tag) else source.children
         context = ""
-        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), elements_to_process):
             if isinstance(el, NavigableString):
                 text = _process_text(
                     el=el,
@@ -935,6 +948,7 @@ def convert_to_markdown_stream(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -976,6 +990,7 @@ def convert_to_markdown_stream(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -1027,7 +1042,7 @@ def convert_to_markdown_stream(
                 end_pos = search_start + newline_pos + 1
         chunk = combined_result[pos:end_pos]
-        if chunk:
+        if chunk:  # pragma: no cover
             yield chunk
         pos = end_pos

html_to_markdown/utils.py CHANGED Viewed

@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
     prefix = " " if text.startswith((" ", "\t")) else ""
     suffix = " " if text.endswith((" ", "\t")) else ""
-    text = text.strip()
-    return prefix, suffix, text
+    return prefix, suffix, text.strip()
 def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:

html_to_markdown/whitespace.py CHANGED Viewed

@@ -6,8 +6,10 @@ import re
 import unicodedata
 from typing import TYPE_CHECKING, Literal
+from bs4.element import NavigableString
 if TYPE_CHECKING:
-    from bs4 import NavigableString, PageElement, Tag
+    from bs4 import PageElement
 WhitespaceMode = Literal["normalized", "strict"]
@@ -128,11 +130,13 @@ class WhitespaceHandler:
     def normalize_unicode_spaces(self, text: str) -> str:
         text = self._unicode_spaces.sub(" ", text)
+        text = text.replace("\r\n", "\n")
         normalized = []
         for char in text:
             if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
                 normalized.append(" ")
-            elif char in ("\r\n", "\r"):
+            elif char == "\r":  # pragma: no cover
                 normalized.append("\n")
             else:
                 normalized.append(char)
@@ -168,15 +172,12 @@ class WhitespaceHandler:
         *,
         in_pre: bool = False,
     ) -> str:
-        if not text:
+        if not text:  # pragma: no cover
             return ""
         if in_pre or self.should_preserve_whitespace(element):
             return text
-        if self.mode == "strict":
-            return text
         text = self.normalize_unicode_spaces(text)
         return self._process_normalized(text, element)
@@ -204,8 +205,8 @@ class WhitespaceHandler:
     def _process_text_with_content(self, text: str, element: NavigableString) -> str:
         original = str(element)
-        has_lead_space = original and original[0] in " \t\n"
-        has_trail_space = original and original[-1] in " \t\n"
+        has_lead_space = bool(original and original[0] in " \t\n")
+        has_trail_space = bool(original and original[-1] in " \t\n")
         text = self._multiple_spaces.sub(" ", text.strip())
@@ -215,9 +216,9 @@ class WhitespaceHandler:
             return self._process_special_inline_containers(text, original)
         if parent and self.is_inline_element(parent):
-            return self._process_inline_element_text(text, original, bool(has_lead_space), bool(has_trail_space))
+            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
-        return self._process_standalone_text(text, original, element, bool(has_lead_space), bool(has_trail_space))
+        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
     def _process_special_inline_containers(self, text: str, original: str) -> str:
         if original and "\n" not in original and "\t" not in original:
@@ -253,12 +254,22 @@ class WhitespaceHandler:
         has_leading = (
             has_lead_space
             and original[0] == " "
-            and (self.is_inline_element(prev_sibling) or self.is_block_element(prev_sibling) or prev_sibling is None)
+            and (
+                self.is_inline_element(prev_sibling)
+                or self.is_block_element(prev_sibling)
+                or prev_sibling is None
+                or isinstance(prev_sibling, NavigableString)
+            )
         )
         has_trailing = (
             has_trail_space
             and original[-1] == " "
-            and (self.is_inline_element(next_sibling) or self.is_block_element(next_sibling) or next_sibling is None)
+            and (
+                self.is_inline_element(next_sibling)
+                or self.is_block_element(next_sibling)
+                or next_sibling is None
+                or isinstance(next_sibling, NavigableString)
+            )
         )
         if original and original[0] in "\n\t" and self.is_inline_element(prev_sibling):
@@ -280,24 +291,3 @@ class WhitespaceHandler:
             text = text + "\n\n"
         return text
-    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
-        if self.mode == "strict":
-            return ""
-        tag_name = tag.name.lower() if hasattr(tag, "name") else ""
-        double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
-        single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
-        if tag_name in double_newline_elements:
-            if self.is_block_element(next_sibling):
-                return "\n\n"
-            return "\n"
-        if tag_name in single_newline_elements:
-            return "\n"
-        if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
-            return "\n\n"
-        return ""

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl