PyPI - html-to-markdown - Versions diffs - 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl - Mend

html-to-markdown 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (11)

html_to_markdown/__main__.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import sys
-def cli():
-    from html_to_markdown.cli import main
+from html_to_markdown.cli import main
+def cli() -> None:
+    """Main CLI entrypoint."""
     try:
         result = main(sys.argv[1:])
         print(result)  # noqa: T201
@@ -10,5 +12,6 @@ def cli():
         print(str(e), file=sys.stderr)  # noqa: T201
         sys.exit(1)
 if __name__ == "__main__":
     cli()

html_to_markdown/cli.py CHANGED Viewed

@@ -1,11 +1,12 @@
-def main(argv: list[str]) -> str:
-    """Command-line entry point."""
-    from argparse import ArgumentParser, FileType
-    from sys import stdin
+from argparse import ArgumentParser, FileType
+from sys import stdin
+from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
+from html_to_markdown.processing import convert_to_markdown
-    from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
-    from html_to_markdown.processing import convert_to_markdown
+def main(argv: list[str]) -> str:
+    """Command-line entry point."""
     parser = ArgumentParser(
         prog="html_to_markdown",
         description="Converts HTML to Markdown.",
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
         help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
     )
+    parser.add_argument(
+        "--strip-newlines",
+        action="store_true",
+        help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
+    )
     args = parser.parse_args(argv)
     return convert_to_markdown(
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
         keep_inline_images_in=args.keep_inline_images_in,
         wrap=args.wrap,
         wrap_width=args.wrap_width,
+        strip_newlines=args.strip_newlines,
     )

html_to_markdown/converters.py CHANGED Viewed

@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
     alt = tag.attrs.get("alt", "")
     src = tag.attrs.get("src", "")
     title = tag.attrs.get("title", "")
+    width = tag.attrs.get("width", "")
+    height = tag.attrs.get("height", "")
     title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
     parent_name = tag.parent.name if tag.parent else ""
-    if convert_as_inline and parent_name not in (keep_inline_images_in or []):
+    # Always preserve images in table cells (td, th) by default
+    default_preserve_in = ["td", "th"]
+    preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
+    if convert_as_inline and parent_name not in preserve_in:
         return alt
+    if width or height:
+        return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
     return f"![{alt}]({src}{title_part})"

html_to_markdown/processing.py CHANGED Viewed

@@ -213,6 +213,7 @@ def convert_to_markdown(
     keep_inline_images_in: Iterable[str] | None = None,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
+    strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
     sub_symbol: str = "",
     sup_symbol: str = "",
@@ -238,6 +239,7 @@ def convert_to_markdown(
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
         newline_style: Style for handling newlines in text content. Defaults to "spaces".
         strip: Tags to strip from the output. Defaults to None.
+        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
         strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
         sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +261,10 @@ def convert_to_markdown(
         ):
             return source
+        if strip_newlines:
+            # Replace all newlines with spaces before parsing
+            source = source.replace("\n", " ").replace("\r", " ")
         if "".join(source.split("\n")):
             source = BeautifulSoup(source, "html.parser")
         else:

{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,16 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.3.3
-Summary: Convert HTML to markdown
+Version: 1.4.0
+Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
-Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
-Keywords: converter,html,markdown,text-extraction,text-processing
+Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
+Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
+Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
+Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
+Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing
 Classifier: Topic :: Text Processing :: Markup
 Classifier: Topic :: Text Processing :: Markup :: HTML

html_to_markdown-1.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,14 @@
+html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
+html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
+html_to_markdown/cli.py,sha256=Kfh2sF_ySE_fQ0qdwvUZ5Rqx-P4Y12uTpG8xF60gAq0,4789
+html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
+html_to_markdown/converters.py,sha256=SHRAV1qIFQQdXSD_TToR_F_t8hw3-amz8rIs2Q84YbQ,12276
+html_to_markdown/processing.py,sha256=mzF6YNqhj2VoRN6_TafnZ4ZndOyFglsZXTNnOl4uvWM,10564
+html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
+html_to_markdown-1.4.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.4.0.dist-info/METADATA,sha256=LmjDer-QQkH6kSlkua1NBhnaKpIu4sVvyJZPPX5PgLk,8229
+html_to_markdown-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.4.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.4.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.4.0.dist-info/RECORD,,

{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,2 +1,3 @@
 [console_scripts]
+html-to-markdown = html_to_markdown.__main__:cli
 html_to_markdown = html_to_markdown.__main__:cli

html_to_markdown-1.3.3.dist-info/RECORD DELETED Viewed

@@ -1,14 +0,0 @@
-html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
-html_to_markdown/__main__.py,sha256=_EeKI8veMWZO7xsl-mBHBE-OmH1vnkVyXuExsOfduFI,286
-html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
-html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
-html_to_markdown/converters.py,sha256=p8arBdejEeuAp9_wIYvp5PuWNBB0M699CgLSEkW3v88,11910
-html_to_markdown/processing.py,sha256=JNCjDgbfuW3YI7mfsj9aHlk2-KriQXJHU8Eo5D9Qj1E,10280
-html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
-html_to_markdown-1.3.3.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.3.3.dist-info/METADATA,sha256=BcfsHs0cSG8Y1BScMsGFOGfN5mrxiu-HA_fJC6DrtFg,7653
-html_to_markdown-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.3.3.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
-html_to_markdown-1.3.3.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.3.3.dist-info/RECORD,,

{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl