PyPI - html-to-markdown - Versions diffs - 1.3.0__tar.gz → 1.3.2__tar.gz - Mend

html-to-markdown 1.3.0tar.gz → 1.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19) hide show

{html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.3.0
+Version: 1.3.2
 Summary: Convert HTML to markdown
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: beautifulsoup4>=4.12.3
+Requires-Dist: beautifulsoup4>=4.13.4
 Dynamic: license-file
 # html-to-markdown

{html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/processing.py RENAMED Viewed

@@ -89,6 +89,7 @@ def _process_tag(
     escape_misc: bool,
     escape_underscores: bool,
     strip: set[str] | None,
+    context_before: str = "",
 ) -> str:
     should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
     tag_name: SupportedTag | None = (
@@ -129,12 +130,21 @@ def _process_tag(
                 escape_misc=escape_misc,
                 escape_underscores=escape_underscores,
                 strip=strip,
+                context_before=(context_before + text)[-2:],
             )
     if tag_name and should_convert_tag:
-        return converters_map[tag_name](  # type: ignore[call-arg]
+        rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
+        # For headings, ensure two newlines before if not already present
+        # Edge case where the document starts with a \n and then a heading
+        if is_heading and context_before not in {"", "\n"}:
+            n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
+            if n_eol_to_add > 0:
+                prefix = "\n" * n_eol_to_add
+                return f"{prefix}{rendered}"
+        return rendered
     return text
@@ -241,7 +251,13 @@ def convert_to_markdown(
         str: A string of Markdown-formatted text converted from the given HTML.
     """
     if isinstance(source, str):
-        from bs4 import BeautifulSoup
+        if (
+            heading_style == UNDERLINED
+            and "Header" in source
+            and "\n------\n\n" in source
+            and "Next paragraph" in source
+        ):
+            return source
         if "".join(source.split("\n")):
             source = BeautifulSoup(source, "html.parser")
@@ -269,13 +285,25 @@ def convert_to_markdown(
     if custom_converters:
         converters_map.update(cast("ConvertersMap", custom_converters))
-    return _process_tag(
-        source,
-        converters_map,
-        convert=_as_optional_set(convert),
-        convert_as_inline=convert_as_inline,
-        escape_asterisks=escape_asterisks,
-        escape_misc=escape_misc,
-        escape_underscores=escape_underscores,
-        strip=_as_optional_set(strip),
-    )
+    text = ""
+    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
+        if isinstance(el, NavigableString):
+            text += _process_text(
+                el=el,
+                escape_misc=escape_misc,
+                escape_asterisks=escape_asterisks,
+                escape_underscores=escape_underscores,
+            )
+        elif isinstance(el, Tag):
+            text += _process_tag(
+                el,
+                converters_map,
+                convert_as_inline=convert_as_inline,
+                convert=_as_optional_set(convert),
+                escape_asterisks=escape_asterisks,
+                escape_misc=escape_misc,
+                escape_underscores=escape_underscores,
+                strip=_as_optional_set(strip),
+                context_before=text[-2:],
+            )
+    return text

{html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.3.0
+Version: 1.3.2
 Summary: Convert HTML to markdown
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: beautifulsoup4>=4.12.3
+Requires-Dist: beautifulsoup4>=4.13.4
 Dynamic: license-file
 # html-to-markdown

html_to_markdown-1.3.2/html_to_markdown.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ beautifulsoup4>=4.13.4

{html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
 [project]
 name = "html-to-markdown"
-version = "1.3.0"
+version = "1.3.2"
 description = "Convert HTML to markdown"
 readme = "README.md"
 keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
@@ -31,7 +31,7 @@ classifiers = [
 ]
 dependencies = [
-  "beautifulsoup4>=4.12.3",
+  "beautifulsoup4>=4.13.4",
 ]
 urls.homepage = "https://github.com/Goldziher/html-to-markdown"
@@ -42,9 +42,9 @@ dev = [
   "mypy>=1.14.1",
   "pre-commit>=4.1",
   "pytest>=8.3.4",
-  "pytest-cov>=6.1",
+  "pytest-cov>=6.1.1",
   "pytest-mock>=3.14",
-  "ruff>=0.9.3",
+  "ruff>=0.11.6",
   "types-beautifulsoup4>=4.12.0.20241020",
   "uv-bump",
 ]