PyPI - html-to-markdown - Versions diffs - 1.3.2__tar.gz → 1.4.0__tar.gz - Mend

html-to-markdown 1.3.2.tar.gz → 1.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19) hide show

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,16 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.3.2
-Summary: Convert HTML to markdown
+Version: 1.4.0
+Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
-Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
-Keywords: converter,html,markdown,text-extraction,text-processing
+Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
+Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
+Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
+Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
+Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing
 Classifier: Topic :: Text Processing :: Markup
 Classifier: Topic :: Text Processing :: Markup :: HTML

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/__main__.py RENAMED Viewed

@@ -1,11 +1,17 @@
 import sys
-if __name__ == "__main__":
-    from html_to_markdown.cli import main
+from html_to_markdown.cli import main
+def cli() -> None:
+    """Main CLI entrypoint."""
     try:
         result = main(sys.argv[1:])
         print(result)  # noqa: T201
     except ValueError as e:
         print(str(e), file=sys.stderr)  # noqa: T201
         sys.exit(1)
+if __name__ == "__main__":
+    cli()

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/cli.py RENAMED Viewed

@@ -1,11 +1,12 @@
-def main(argv: list[str]) -> str:
-    """Command-line entry point."""
-    from argparse import ArgumentParser, FileType
-    from sys import stdin
+from argparse import ArgumentParser, FileType
+from sys import stdin
+from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
+from html_to_markdown.processing import convert_to_markdown
-    from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
-    from html_to_markdown.processing import convert_to_markdown
+def main(argv: list[str]) -> str:
+    """Command-line entry point."""
     parser = ArgumentParser(
         prog="html_to_markdown",
         description="Converts HTML to Markdown.",
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
         help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
     )
+    parser.add_argument(
+        "--strip-newlines",
+        action="store_true",
+        help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
+    )
     args = parser.parse_args(argv)
     return convert_to_markdown(
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
         keep_inline_images_in=args.keep_inline_images_in,
         wrap=args.wrap,
         wrap_width=args.wrap_width,
+        strip_newlines=args.strip_newlines,
     )

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/converters.py RENAMED Viewed

@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
     alt = tag.attrs.get("alt", "")
     src = tag.attrs.get("src", "")
     title = tag.attrs.get("title", "")
+    width = tag.attrs.get("width", "")
+    height = tag.attrs.get("height", "")
     title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
     parent_name = tag.parent.name if tag.parent else ""
-    if convert_as_inline and parent_name not in (keep_inline_images_in or []):
+    # Always preserve images in table cells (td, th) by default
+    default_preserve_in = ["td", "th"]
+    preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
+    if convert_as_inline and parent_name not in preserve_in:
         return alt
+    if width or height:
+        return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
     return f"![{alt}]({src}{title_part})"

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/processing.py RENAMED Viewed

@@ -213,6 +213,7 @@ def convert_to_markdown(
     keep_inline_images_in: Iterable[str] | None = None,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
+    strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
@@ -238,6 +239,7 @@ def convert_to_markdown(
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
         newline_style: Style for handling newlines in text content. Defaults to "spaces".
         strip: Tags to strip from the output. Defaults to None.
+        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
         strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
         sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +261,10 @@ def convert_to_markdown(
         ):
             return source
+        if strip_newlines:
+            # Replace all newlines with spaces before parsing
+            source = source.replace("\n", " ").replace("\r", " ")
         if "".join(source.split("\n")):
             source = BeautifulSoup(source, "html.parser")
         else:

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,16 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.3.2
-Summary: Convert HTML to markdown
+Version: 1.4.0
+Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
-Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
-Keywords: converter,html,markdown,text-extraction,text-processing
+Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
+Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
+Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
+Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
+Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing
 Classifier: Topic :: Text Processing :: Markup
 Classifier: Topic :: Text Processing :: Markup :: HTML

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/SOURCES.txt RENAMED Viewed

@@ -12,5 +12,6 @@ html_to_markdown/utils.py
 html_to_markdown.egg-info/PKG-INFO
 html_to_markdown.egg-info/SOURCES.txt
 html_to_markdown.egg-info/dependency_links.txt
+html_to_markdown.egg-info/entry_points.txt
 html_to_markdown.egg-info/requires.txt
 html_to_markdown.egg-info/top_level.txt

html_to_markdown-1.4.0/html_to_markdown.egg-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+html-to-markdown = html_to_markdown.__main__:cli
+html_to_markdown = html_to_markdown.__main__:cli

{html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/pyproject.toml RENAMED Viewed

@@ -5,14 +5,26 @@ requires = [ "setuptools>=78.1" ]
 [project]
 name = "html-to-markdown"
-version = "1.3.2"
-description = "Convert HTML to markdown"
+version = "1.4.0"
+description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
 readme = "README.md"
-keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
+keywords = [
+  "beautifulsoup",
+  "cli-tool",
+  "converter",
+  "html",
+  "html2markdown",
+  "markdown",
+  "markup",
+  "text-extraction",
+  "text-processing",
+]
 license = { text = "MIT" }
 authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
 requires-python = ">=3.9"
 classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: Console",
   "Intended Audience :: Developers",
   "License :: OSI Approved :: MIT License",
   "Operating System :: OS Independent",
@@ -22,6 +34,8 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: 3.13",
+  "Topic :: Internet :: WWW/HTTP",
+  "Topic :: Software Development :: Libraries :: Python Modules",
   "Topic :: Text Processing",
   "Topic :: Text Processing :: Markup",
   "Topic :: Text Processing :: Markup :: HTML",
@@ -29,23 +43,27 @@ classifiers = [
   "Topic :: Utilities",
   "Typing :: Typed",
 ]
 dependencies = [
   "beautifulsoup4>=4.13.4",
 ]
-urls.homepage = "https://github.com/Goldziher/html-to-markdown"
+urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
+urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
+urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
+urls.Repository = "https://github.com/Goldziher/html-to-markdown.git"
+scripts.html-to-markdown = "html_to_markdown.__main__:cli"
+scripts.html_to_markdown = "html_to_markdown.__main__:cli"
 [dependency-groups]
 dev = [
   "covdefaults>=2.3",
-  "mypy>=1.14.1",
+  "mypy>=1.16.1",
   "pre-commit>=4.1",
-  "pytest>=8.3.4",
-  "pytest-cov>=6.1.1",
-  "pytest-mock>=3.14",
-  "ruff>=0.11.6",
-  "types-beautifulsoup4>=4.12.0.20241020",
+  "pytest>=8.4.1",
+  "pytest-cov>=6.2.1",
+  "pytest-mock>=3.14.1",
+  "ruff>=0.12",
+  "types-beautifulsoup4>=4.12.0.20250516",
   "uv-bump",
 ]
@@ -58,8 +76,6 @@ html_to_markdown = [ "py.typed" ]
 [tool.hatch.build]
 skip-excluded-dirs = true
-scripts.html_to_markdown = "html_to_markdown.__main__:cli"
 [tool.ruff]
 target-version = "py39"
 line-length = 120
@@ -71,16 +87,16 @@ lint.select = [ "ALL" ]
 lint.ignore = [
   "ANN401",  # dynamically typed ANY
   "COM812",  # Conflicts with formatter
-  "D100",    # # pydocstyle - missing docstring in public module
+  "D100",    # pydocstyle - missing docstring in public module
   "D104",    # pydocstyle - missing docstring in public package
   "D107",    # pydocstyle - missing docstring in __init__
   "D205",    # pydocstyle - 1 blank line required between summary line and description
   "E501",    # pycodestyle line too long, handled by ruff format
-  "EM",      # Exception messages,
+  "EM",      # Exception messages
   "FBT",     # Boolean Args
   "FIX",     # we allow todo and fixme comments
   "ISC001",  # Conflicts with formatter
-  "PLR0913", # Pylint - too many arguments.
+  "PLR0913", # Pylint - too many arguments
   "PLR2004", # Magic variables, we allow them
   "TD",      # we allow todo and fixme comments
   "TRY",     # Try except block, rules are too strict
@@ -92,22 +108,32 @@ lint.pydocstyle.convention = "google"
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope = "function"
+testpaths = [ "tests" ]
+filterwarnings = [
+  "error",
+  "ignore::pytest.PytestConfigWarning",
+  "ignore::pytest.PytestUnraisableExceptionWarning",
+  "ignore::ResourceWarning",
+]
 [tool.coverage.run]
+source = [ "html_to_markdown" ]
 omit = [ "tests/*" ]
 plugins = [ "covdefaults" ]
-source = [ "html_to_markdown" ]
 [tool.coverage.report]
-exclude_lines = [ 'if TYPE_CHECKING:' ]
+exclude_lines = [ "if TYPE_CHECKING:" ]
 fail_under = 100
+show_missing = true
 [tool.mypy]
 packages = [ "html_to_markdown", "tests" ]
 python_version = "3.9"
+strict = true
 implicit_reexport = false
 show_error_codes = true
-strict = true
+warn_return_any = true
+warn_unused_configs = true
 [[tool.mypy.overrides]]
 module = "tests.*"