PyPI - html-to-markdown - Versions diffs - 1.0.0__tar.gz - Mend

html-to-markdown 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (12) hide show

html_to_markdown-1.0.0/.gitignore +21 -0
html_to_markdown-1.0.0/LICENSE +22 -0
html_to_markdown-1.0.0/PKG-INFO +194 -0
html_to_markdown-1.0.0/README.md +168 -0
html_to_markdown-1.0.0/html_to_markdown/__init__.py +3 -0
html_to_markdown-1.0.0/html_to_markdown/__main__.py +131 -0
html_to_markdown-1.0.0/html_to_markdown/constants.py +18 -0
html_to_markdown-1.0.0/html_to_markdown/converters.py +380 -0
html_to_markdown-1.0.0/html_to_markdown/processing.py +298 -0
html_to_markdown-1.0.0/html_to_markdown/py.typed +0 -0
html_to_markdown-1.0.0/html_to_markdown/utils.py +72 -0
html_to_markdown-1.0.0/pyproject.toml +137 -0

html_to_markdown-1.0.0/.gitignore ADDED Viewed

@@ -0,0 +1,21 @@
+*$py.class
+*.iml
+*.log
+*.py[cod]
+.coverage
+.env
+.idea/
+.mypy_cache/
+.pdm-build/
+.pdm-python
+.pdm.toml
+.pytest_cache/
+.python-version
+.ruff_cache/
+.tox/
+.venv/
+.vscode/
+__pycache__/
+__pypackages__/
+coverage.xml
+dist/

html_to_markdown-1.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+The MIT License (MIT)
+Copyright 2012-2018 Matthew Tretter
+Copyright 2024 Na'aman Hirschfeld
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

html_to_markdown-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,194 @@
+Metadata-Version: 2.3
+Name: html-to-markdown
+Version: 1.0.0
+Summary: Convert HTML to markdown
+Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: beautifulsoup,converter,html,markdown,text-processing
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Markup
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Utilities
+Classifier: Typing :: Typed
+Requires-Python: >=3.9
+Requires-Dist: beautifulsoup4>=4.12.3
+Description-Content-Type: text/markdown
+# html_to_markdown
+This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
+Python 3.9 and offering strong typing.
+### Differences from Markdownify
+- The refactored codebase uses a strict functional approach - no classes are involved.
+- There is full typing with strict MyPy adherence in place.
+- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
+- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
+  point versioning is no longer aligned.
+## Installation
+```shell
+pip install html_to_markdown
+```
+## Usage
+Convert some HTML to Markdown:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>')  # > '**Yay** [GitHub](http://github.com)'
+```
+Specify tags to exclude:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a'])  # > '**Yay** GitHub'
+```
+\...or specify the tags you want to include:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b'])  # > '**Yay** GitHub'
+```
+# Options
+html_to_markdown supports the following options:
+strip
+:   A list of tags to strip. This option can\'t be used with the
+`convert` option.
+convert
+:   A list of tags to convert. This option can\'t be used with the
+`strip` option.
+autolinks
+:   A boolean indicating whether the \"automatic link\" style should be
+used when a `a` tag\'s contents match its href. Defaults to `True`.
+default_title
+:   A boolean to enable setting the title of a link to its href, if no
+title is given. Defaults to `False`.
+heading_style
+:   Defines how headings should be converted. Accepted values are `ATX`,
+`ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
+`SETEXT`). Defaults to `UNDERLINED`.
+bullets
+:   An iterable (string, list, or tuple) of bullet styles to be used. If
+the iterable only contains one item, it will be used regardless of
+how deeply lists are nested. Otherwise, the bullet will alternate
+based on nesting level. Defaults to `'*+-'`.
+strong_em_symbol
+:   In markdown, both `*` and `_` are used to encode **strong** or
+*emphasized* texts. Either of these symbols can be chosen by the
+options `ASTERISK` (default) or `UNDERSCORE` respectively.
+sub_symbol, sup_symbol
+:   Define the chars that surround `<sub>` and `<sup>` text. Defaults to
+an empty string, because this is non-standard behavior. Could be
+something like `~` and `^` to result in `~sub~` and `^sup^`. If the
+value starts with `<` and ends with `>`, it is treated as an HTML
+tag and a `/` is inserted after the `<` in the string used after the
+text; this allows specifying `<sub>` to use raw HTML in the output
+for subscripts, for example.
+newline_style
+:   Defines the style of marking linebreaks (`<br>`) in markdown. The
+default value `SPACES` of this option will adopt the usual two
+spaces and a newline, while `BACKSLASH` will convert a linebreak to
+`\\n` (a backslash and a newline). While the latter convention is
+non-standard, it is commonly preferred and supported by a lot of
+interpreters.
+code_language
+:   Defines the language that should be assumed for all `<pre>`
+sections. Useful, if all code on a page is in the same programming
+language and should be annotated with `python` or
+similar. Defaults to `''` (empty
+string) and can be any string.
+code_language_callback
+:   When the HTML code contains `pre` tags that in some way provide the
+code language, for example as class, this callback can be used to
+extract the language from the tag and prefix it to the converted
+`pre` tag. The callback gets one single argument, a BeautifulSoup
+object, and returns a string containing the code language, or
+`None`. An example to use the class name as code language could be:
+        def callback(el):
+            return el['class'][0] if el.has_attr('class') else None
+    Defaults to `None`.
+escape_asterisks
+:   If set to `False`, do not escape `*` to `\*` in text. Defaults to
+`True`.
+escape_underscores
+:   If set to `False`, do not escape `_` to `\_` in text. Defaults to
+`True`.
+escape_misc
+:   If set to `False`, do not escape miscellaneous punctuation
+characters that sometimes have Markdown significance in text.
+Defaults to `True`.
+keep_inline_images_in
+:   Images are converted to their alt-text when the images are located
+inside headlines or table cells. If some inline images should be
+converted to markdown images instead, this option can be set to a
+list of parent tags that should be allowed to contain inline images,
+for example `['td']`. Defaults to an empty list.
+wrap, wrap_width
+:   If `wrap` is set to `True`, all text paragraphs are wrapped at
+`wrap_width` characters. Defaults to `False` and `80`. Use with
+`newline_style=BACKSLASH` to keep line breaks in paragraphs.
+Options may be specified as keyword arguments to the `convert_to_markdown`
+function.
+# CLI
+Use `html_to_markdown example.html > example.md` or pipe input from stdin
+(`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
+to see all available options. They are the same as listed above and take
+the same arguments.

html_to_markdown-1.0.0/README.md ADDED Viewed

@@ -0,0 +1,168 @@
+# html_to_markdown
+This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
+Python 3.9 and offering strong typing.
+### Differences from Markdownify
+- The refactored codebase uses a strict functional approach - no classes are involved.
+- There is full typing with strict MyPy adherence in place.
+- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
+- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
+  point versioning is no longer aligned.
+## Installation
+```shell
+pip install html_to_markdown
+```
+## Usage
+Convert some HTML to Markdown:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>')  # > '**Yay** [GitHub](http://github.com)'
+```
+Specify tags to exclude:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a'])  # > '**Yay** GitHub'
+```
+\...or specify the tags you want to include:
+```python
+from html_to_markdown import convert_to_markdown
+convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b'])  # > '**Yay** GitHub'
+```
+# Options
+html_to_markdown supports the following options:
+strip
+:   A list of tags to strip. This option can\'t be used with the
+`convert` option.
+convert
+:   A list of tags to convert. This option can\'t be used with the
+`strip` option.
+autolinks
+:   A boolean indicating whether the \"automatic link\" style should be
+used when a `a` tag\'s contents match its href. Defaults to `True`.
+default_title
+:   A boolean to enable setting the title of a link to its href, if no
+title is given. Defaults to `False`.
+heading_style
+:   Defines how headings should be converted. Accepted values are `ATX`,
+`ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
+`SETEXT`). Defaults to `UNDERLINED`.
+bullets
+:   An iterable (string, list, or tuple) of bullet styles to be used. If
+the iterable only contains one item, it will be used regardless of
+how deeply lists are nested. Otherwise, the bullet will alternate
+based on nesting level. Defaults to `'*+-'`.
+strong_em_symbol
+:   In markdown, both `*` and `_` are used to encode **strong** or
+*emphasized* texts. Either of these symbols can be chosen by the
+options `ASTERISK` (default) or `UNDERSCORE` respectively.
+sub_symbol, sup_symbol
+:   Define the chars that surround `<sub>` and `<sup>` text. Defaults to
+an empty string, because this is non-standard behavior. Could be
+something like `~` and `^` to result in `~sub~` and `^sup^`. If the
+value starts with `<` and ends with `>`, it is treated as an HTML
+tag and a `/` is inserted after the `<` in the string used after the
+text; this allows specifying `<sub>` to use raw HTML in the output
+for subscripts, for example.
+newline_style
+:   Defines the style of marking linebreaks (`<br>`) in markdown. The
+default value `SPACES` of this option will adopt the usual two
+spaces and a newline, while `BACKSLASH` will convert a linebreak to
+`\\n` (a backslash and a newline). While the latter convention is
+non-standard, it is commonly preferred and supported by a lot of
+interpreters.
+code_language
+:   Defines the language that should be assumed for all `<pre>`
+sections. Useful, if all code on a page is in the same programming
+language and should be annotated with `python` or
+similar. Defaults to `''` (empty
+string) and can be any string.
+code_language_callback
+:   When the HTML code contains `pre` tags that in some way provide the
+code language, for example as class, this callback can be used to
+extract the language from the tag and prefix it to the converted
+`pre` tag. The callback gets one single argument, a BeautifulSoup
+object, and returns a string containing the code language, or
+`None`. An example to use the class name as code language could be:
+        def callback(el):
+            return el['class'][0] if el.has_attr('class') else None
+    Defaults to `None`.
+escape_asterisks
+:   If set to `False`, do not escape `*` to `\*` in text. Defaults to
+`True`.
+escape_underscores
+:   If set to `False`, do not escape `_` to `\_` in text. Defaults to
+`True`.
+escape_misc
+:   If set to `False`, do not escape miscellaneous punctuation
+characters that sometimes have Markdown significance in text.
+Defaults to `True`.
+keep_inline_images_in
+:   Images are converted to their alt-text when the images are located
+inside headlines or table cells. If some inline images should be
+converted to markdown images instead, this option can be set to a
+list of parent tags that should be allowed to contain inline images,
+for example `['td']`. Defaults to an empty list.
+wrap, wrap_width
+:   If `wrap` is set to `True`, all text paragraphs are wrapped at
+`wrap_width` characters. Defaults to `False` and `80`. Use with
+`newline_style=BACKSLASH` to keep line breaks in paragraphs.
+Options may be specified as keyword arguments to the `convert_to_markdown`
+function.
+# CLI
+Use `html_to_markdown example.html > example.md` or pipe input from stdin
+(`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
+to see all available options. They are the same as listed above and take
+the same arguments.

html_to_markdown-1.0.0/html_to_markdown/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from html_to_markdown.processing import convert_to_markdown
+__all__ = ["convert_to_markdown"]

html_to_markdown-1.0.0/html_to_markdown/__main__.py ADDED Viewed

@@ -0,0 +1,131 @@
+import argparse
+import sys
+from html_to_markdown import convert_to_markdown
+from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
+def cli(argv: list[str]) -> None:
+    """Command-line interface for html_to_markdown.
+
+    Parse ``argv``, read HTML from the given file (or STDIN when no file is
+    given), convert it with ``convert_to_markdown`` and print the resulting
+    markdown to STDOUT.
+
+    Args:
+        argv: The command-line arguments, without the program name.
+    """
+    parser = argparse.ArgumentParser(
+        prog="html_to_markdown",
+        description="Converts html to markdown.",
+    )
+    parser.add_argument(
+        "html",
+        nargs="?",
+        type=argparse.FileType("r"),
+        default=sys.stdin,
+        help="The html file to convert. Defaults to STDIN if not provided.",
+    )
+    parser.add_argument(
+        "-s",
+        "--strip",
+        nargs="*",
+        help="A list of tags to strip. This option can't be used with the --convert option.",
+    )
+    parser.add_argument(
+        "-c",
+        "--convert",
+        nargs="*",
+        help="A list of tags to convert. This option can't be used with the --strip option.",
+    )
+    # NOTE(review): as a store_true flag this is False unless passed, while the
+    # README documents the option's default as True - confirm which is intended.
+    parser.add_argument(
+        "-a",
+        "--autolinks",
+        action="store_true",
+        help="A boolean indicating whether the 'automatic link' style "
+        "should be used when a 'a' tag's contents match its href.",
+    )
+    # store_true: the option stays off unless the flag is passed, which matches
+    # the help text ("to enable") and the documented default of False.
+    parser.add_argument(
+        "--default-title",
+        action="store_true",
+        help="A boolean to enable setting the title of a link to its href, if no title is given.",
+    )
+    parser.add_argument(
+        "--heading-style",
+        default=UNDERLINED,
+        choices=(ATX, ATX_CLOSED, UNDERLINED),
+        help="Defines how headings should be converted.",
+    )
+    parser.add_argument(
+        "-b",
+        "--bullets",
+        default="*+-",
+        help="A string of bullet styles to use; the bullet will alternate based on nesting level.",
+    )
+    parser.add_argument(
+        "--strong-em-symbol",
+        default=ASTERISK,
+        choices=(ASTERISK, UNDERSCORE),
+        help="Use * or _ to convert strong and italics text",
+    )
+    parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
+    parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
+    parser.add_argument(
+        "--newline-style",
+        default=SPACES,
+        choices=(SPACES, BACKSLASH),
+        help="Defines the style of <br> conversions: two spaces "
+        "or backslash at the end of the line that should break.",
+    )
+    parser.add_argument(
+        "--code-language",
+        default="",
+        help="Defines the language that should be assumed for all '<pre>' sections.",
+    )
+    # The escape options are on by default; these flags switch them off.
+    parser.add_argument(
+        "--no-escape-asterisks",
+        dest="escape_asterisks",
+        action="store_false",
+        help="Do not escape '*' to '\\*' in text.",
+    )
+    parser.add_argument(
+        "--no-escape-underscores",
+        dest="escape_underscores",
+        action="store_false",
+        help="Do not escape '_' to '\\_' in text.",
+    )
+    parser.add_argument(
+        "-i",
+        "--keep-inline-images-in",
+        nargs="*",
+        help="Images are converted to their alt-text when the images are "
+        "located inside headlines or table cells. If some inline images "
+        "should be converted to markdown images instead, this option can "
+        "be set to a list of parent tags that should be allowed to "
+        "contain inline images.",
+    )
+    parser.add_argument(
+        "-w",
+        "--wrap",
+        action="store_true",
+        help="Wrap all text paragraphs at --wrap-width characters.",
+    )
+    parser.add_argument("--wrap-width", type=int, default=80)
+    args = parser.parse_args(argv)
+    result = convert_to_markdown(
+        args.html.read(),
+        strip=args.strip,
+        convert=args.convert,
+        autolinks=args.autolinks,
+        default_title=args.default_title,
+        heading_style=args.heading_style,
+        bullets=args.bullets,
+        strong_em_symbol=args.strong_em_symbol,
+        sub_symbol=args.sub_symbol,
+        sup_symbol=args.sup_symbol,
+        newline_style=args.newline_style,
+        code_language=args.code_language,
+        escape_asterisks=args.escape_asterisks,
+        escape_underscores=args.escape_underscores,
+        keep_inline_images_in=args.keep_inline_images_in,
+        wrap=args.wrap,
+        wrap_width=args.wrap_width,
+    )
+    print(result)  # noqa: T201
+# Script entry point: pass everything after the program name to the CLI.
+if __name__ == "__main__":
+    cli(sys.argv[1:])

html_to_markdown-1.0.0/html_to_markdown/constants.py ADDED Viewed

@@ -0,0 +1,18 @@
+from __future__ import annotations
+import re
+from re import Pattern
+from typing import Final, Literal
+# Matches the text "convert_h" followed by one or more digits; group 1 captures the digits.
+convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
+# Matches the zero-width position at the start of every line (MULTILINE mode).
+line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
+# Matches a run of one or more tab or space characters.
+whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
+# Matches "h" followed by a single digit from 1 to 6; the pattern itself is unanchored.
+html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
+# Strong/emphasis symbol value.
+ASTERISK: Final[Literal["*"]] = "*"
+# Heading-style values.
+ATX: Final[Literal["atx"]] = "atx"
+ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
+# Newline-style value.
+BACKSLASH: Final[Literal["backslash"]] = "backslash"
+# Heading-style value.
+UNDERLINED: Final[Literal["underlined"]] = "underlined"
+# Newline-style value.
+SPACES: Final[Literal["spaces"]] = "spaces"
+# Strong/emphasis symbol value.
+UNDERSCORE: Final[Literal["_"]] = "_"