PyPI - html-to-markdown - Versions diffs - 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (14)

html_to_markdown/__init__.py +3 -2
html_to_markdown/__main__.py +5 -2
html_to_markdown/cli.py +114 -28
html_to_markdown/constants.py +1 -0
html_to_markdown/converters.py +1646 -105
html_to_markdown/processing.py +499 -13
html_to_markdown-1.5.0.dist-info/METADATA +436 -0
html_to_markdown-1.5.0.dist-info/RECORD +14 -0
{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/entry_points.txt +1 -0
html_to_markdown-1.3.3.dist-info/METADATA +0 -242
html_to_markdown-1.3.3.dist-info/RECORD +0 -14
{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/WHEEL +0 -0
{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/licenses/LICENSE +0 -0
{html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/top_level.txt +0 -0

html_to_markdown/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from html_to_markdown.processing import convert_to_markdown
+from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
+# For backward compatibility and to maintain the existing API
 markdownify = convert_to_markdown
-__all__ = ["convert_to_markdown", "markdownify"]
+__all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]

html_to_markdown/__main__.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import sys
-def cli():
-    from html_to_markdown.cli import main
+from html_to_markdown.cli import main
+def cli() -> None:
+    """Main CLI entrypoint."""
     try:
         result = main(sys.argv[1:])
         print(result)  # noqa: T201
@@ -10,5 +12,6 @@ def cli():
         print(str(e), file=sys.stderr)  # noqa: T201
         sys.exit(1)
 if __name__ == "__main__":
     cli()

html_to_markdown/cli.py CHANGED Viewed

@@ -1,11 +1,21 @@
-def main(argv: list[str]) -> str:
-    """Command-line entry point."""
-    from argparse import ArgumentParser, FileType
-    from sys import stdin
+import sys
+from argparse import ArgumentParser, FileType
+from html_to_markdown.constants import (
+    ASTERISK,
+    ATX,
+    ATX_CLOSED,
+    BACKSLASH,
+    DOUBLE_EQUAL,
+    SPACES,
+    UNDERLINED,
+    UNDERSCORE,
+)
+from html_to_markdown.processing import convert_to_markdown
-    from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
-    from html_to_markdown.processing import convert_to_markdown
+def main(argv: list[str]) -> str:
+    """Command-line entry point."""
     parser = ArgumentParser(
         prog="html_to_markdown",
         description="Converts HTML to Markdown.",
@@ -15,7 +25,7 @@ def main(argv: list[str]) -> str:
         "html",
         nargs="?",
         type=FileType("r"),
-        default=stdin,
+        default=sys.stdin,
         help="The HTML file to convert. Defaults to STDIN if not provided.",
     )
@@ -42,8 +52,8 @@ def main(argv: list[str]) -> str:
     parser.add_argument(
         "--default-title",
-        action="store_false",
-        help="Use this flag to disable setting the link title to its href when no title is provided.",
+        action="store_true",
+        help="Set the link title to its href when no title is provided.",
     )
     parser.add_argument(
@@ -106,6 +116,13 @@ def main(argv: list[str]) -> str:
         help="Disable escaping of '_' characters in text to '\\_'.",
     )
+    parser.add_argument(
+        "--no-escape-misc",
+        dest="escape_misc",
+        action="store_false",
+        help="Disable escaping of miscellaneous characters to prevent conflicts in Markdown.",
+    )
     parser.add_argument(
         "-i",
         "--keep-inline-images-in",
@@ -127,24 +144,93 @@ def main(argv: list[str]) -> str:
         help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
     )
-    args = parser.parse_args(argv)
+    parser.add_argument(
+        "--strip-newlines",
+        action="store_true",
+        help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
+    )
+    parser.add_argument(
+        "--convert-as-inline",
+        action="store_true",
+        help="Treat the content as inline elements (no block elements like paragraphs).",
+    )
+    parser.add_argument(
+        "--no-extract-metadata",
+        dest="extract_metadata",
+        action="store_false",
+        help="Disable extraction of document metadata (title, meta tags) as a comment header.",
+    )
+    parser.add_argument(
+        "--highlight-style",
+        default=DOUBLE_EQUAL,
+        choices=("double-equal", "html", "bold"),
+        help="Style to use for highlighted text (mark elements). Defaults to 'double-equal'.",
+    )
-    return convert_to_markdown(
-        args.html.read(),
-        strip=args.strip,
-        convert=args.convert,
-        autolinks=args.autolinks,
-        default_title=args.default_title,
-        heading_style=args.heading_style,
-        bullets=args.bullets,
-        strong_em_symbol=args.strong_em_symbol,
-        sub_symbol=args.sub_symbol,
-        sup_symbol=args.sup_symbol,
-        newline_style=args.newline_style,
-        code_language=args.code_language,
-        escape_asterisks=args.escape_asterisks,
-        escape_underscores=args.escape_underscores,
-        keep_inline_images_in=args.keep_inline_images_in,
-        wrap=args.wrap,
-        wrap_width=args.wrap_width,
+    parser.add_argument(
+        "--stream-processing",
+        action="store_true",
+        help="Use streaming processing for large documents to reduce memory usage.",
     )
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=1024,
+        help="Size of chunks when using streaming processing. Defaults to 1024 characters.",
+    )
+    parser.add_argument(
+        "--show-progress",
+        action="store_true",
+        help="Show progress information when processing large documents.",
+    )
+    args = parser.parse_args(argv)
+    # Prepare base arguments
+    base_args = {
+        "strip": args.strip,
+        "convert": args.convert,
+        "autolinks": args.autolinks,
+        "default_title": args.default_title,
+        "heading_style": args.heading_style,
+        "bullets": args.bullets,
+        "strong_em_symbol": args.strong_em_symbol,
+        "sub_symbol": args.sub_symbol,
+        "sup_symbol": args.sup_symbol,
+        "newline_style": args.newline_style,
+        "code_language": args.code_language,
+        "escape_asterisks": args.escape_asterisks,
+        "escape_underscores": args.escape_underscores,
+        "escape_misc": args.escape_misc,
+        "keep_inline_images_in": args.keep_inline_images_in,
+        "wrap": args.wrap,
+        "wrap_width": args.wrap_width,
+        "strip_newlines": args.strip_newlines,
+        "convert_as_inline": args.convert_as_inline,
+        "extract_metadata": args.extract_metadata,
+        "highlight_style": args.highlight_style,
+    }
+    # Add streaming parameters only if streaming is enabled
+    if args.stream_processing:
+        base_args["stream_processing"] = True
+        base_args["chunk_size"] = args.chunk_size
+        # Progress callback for CLI
+        if args.show_progress:
+            def progress_callback(processed: int, total: int) -> None:
+                if total > 0:
+                    percent = (processed / total) * 100
+                    # Use sys.stderr to avoid ruff T201 error for progress output
+                    sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
+                    sys.stderr.flush()
+            base_args["progress_callback"] = progress_callback
+    return convert_to_markdown(args.html.read(), **base_args)

html_to_markdown/constants.py CHANGED Viewed

@@ -16,3 +16,4 @@ BACKSLASH: Final = "backslash"
 UNDERLINED: Final = "underlined"
 SPACES: Final = "spaces"
 UNDERSCORE: Final = "_"
+DOUBLE_EQUAL: Final = "double-equal"

html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl