html-to-markdown 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,21 @@
1
+ *$py.class
2
+ *.iml
3
+ *.log
4
+ *.py[cod]
5
+ .coverage
6
+ .env
7
+ .idea/
8
+ .mypy_cache/
9
+ .pdm-build/
10
+ .pdm-python
11
+ .pdm.toml
12
+ .pytest_cache/
13
+ .python-version
14
+ .ruff_cache/
15
+ .tox/
16
+ .venv/
17
+ .vscode/
18
+ __pycache__/
19
+ __pypackages__/
20
+ coverage.xml
21
+ dist/
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2012-2018 Matthew Tretter
4
+ Copyright 2024 Na'aman Hirschfeld
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.3
2
+ Name: html-to-markdown
3
+ Version: 1.0.0
4
+ Summary: Convert HTML to markdown
5
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: beautifulsoup,converter,html,markdown,text-processing
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Text Processing
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Classifier: Topic :: Utilities
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: beautifulsoup4>=4.12.3
25
+ Description-Content-Type: text/markdown
26
+
27
+ # html_to_markdown
28
+
29
+ This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
30
+ Python 3.9 and offering strong typing.
31
+
32
+ ### Differences from markdownify
33
+
34
+ - The refactored codebase uses a strict functional approach - no classes are involved.
35
+ - There is full typing with strict MyPy adherence in place.
36
+ - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
37
+ - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
38
+ point versioning is no longer aligned.
39
+
40
+ ## Installation
41
+
42
+ ```shell
43
+ pip install html_to_markdown
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ Convert some HTML to Markdown:
49
+
50
+ ```python
51
+ from html_to_markdown import convert_to_markdown
52
+
53
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
54
+ ```
55
+
56
+ Specify tags to exclude:
57
+
58
+ ```python
59
+ from html_to_markdown import convert_to_markdown
60
+
61
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
62
+ ```
63
+
64
+ \...or specify the tags you want to include:
65
+
66
+ ```python
67
+ from html_to_markdown import convert_to_markdown
68
+
69
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
70
+ ```
71
+
72
+ # Options
73
+
74
+ html_to_markdown supports the following options:
75
+
76
+ strip
77
+
78
+ : A list of tags to strip. This option can\'t be used with the
79
+ `convert` option.
80
+
81
+ convert
82
+
83
+ : A list of tags to convert. This option can\'t be used with the
84
+ `strip` option.
85
+
86
+ autolinks
87
+
88
+ : A boolean indicating whether the \"automatic link\" style should be
89
+ used when a `a` tag\'s contents match its href. Defaults to `True`.
90
+
91
+ default_title
92
+
93
+ : A boolean to enable setting the title of a link to its href, if no
94
+ title is given. Defaults to `False`.
95
+
96
+ heading_style
97
+
98
+ : Defines how headings should be converted. Accepted values are `ATX`,
99
+ `ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
100
+ `SETEXT`). Defaults to `UNDERLINED`.
101
+
102
+ bullets
103
+
104
+ : An iterable (string, list, or tuple) of bullet styles to be used. If
105
+ the iterable only contains one item, it will be used regardless of
106
+ how deeply lists are nested. Otherwise, the bullet will alternate
107
+ based on nesting level. Defaults to `'*+-'`.
108
+
109
+ strong_em_symbol
110
+
111
+ : In markdown, both `*` and `_` are used to encode **strong** or
112
+ *emphasized* texts. Either of these symbols can be chosen by the
113
+ options `ASTERISK` (default) or `UNDERSCORE` respectively.
114
+
115
+ sub_symbol, sup_symbol
116
+
117
+ : Define the chars that surround `<sub>` and `<sup>` text. Defaults to
118
+ an empty string, because this is non-standard behavior. Could be
119
+ something like `~` and `^` to result in `~sub~` and `^sup^`. If the
120
+ value starts with `<` and ends with `>`, it is treated as an HTML
121
+ tag and a `/` is inserted after the `<` in the string used after the
122
+ text; this allows specifying `<sub>` to use raw HTML in the output
123
+ for subscripts, for example.
124
+
125
+ newline_style
126
+
127
+ : Defines the style of marking linebreaks (`<br>`) in markdown. The
128
+ default value `SPACES` of this option will adopt the usual two
129
+ spaces and a newline, while `BACKSLASH` will convert a linebreak to
130
+ `\\n` (a backslash and a newline). While the latter convention is
131
+ non-standard, it is commonly preferred and supported by a lot of
132
+ interpreters.
133
+
134
+ code_language
135
+
136
+ : Defines the language that should be assumed for all `<pre>`
137
+ sections. Useful, if all code on a page is in the same programming
138
+ language and should be annotated with `python` or
139
+ similar. Defaults to `''` (empty
140
+ string) and can be any string.
141
+
142
+ code_language_callback
143
+
144
+ : When the HTML code contains `pre` tags that in some way provide the
145
+ code language, for example as class, this callback can be used to
146
+ extract the language from the tag and prefix it to the converted
147
+ `pre` tag. The callback gets one single argument, a BeautifulSoup
148
+ object, and returns a string containing the code language, or
149
+ `None`. An example to use the class name as code language could be:
150
+
151
+ def callback(el):
152
+ return el['class'][0] if el.has_attr('class') else None
153
+
154
+ Defaults to `None`.
155
+
156
+ escape_asterisks
157
+
158
+ : If set to `False`, do not escape `*` to `\*` in text. Defaults to
159
+ `True`.
160
+
161
+ escape_underscores
162
+
163
+ : If set to `False`, do not escape `_` to `\_` in text. Defaults to
164
+ `True`.
165
+
166
+ escape_misc
167
+
168
+ : If set to `False`, do not escape miscellaneous punctuation
169
+ characters that sometimes have Markdown significance in text.
170
+ Defaults to `True`.
171
+
172
+ keep_inline_images_in
173
+
174
+ : Images are converted to their alt-text when the images are located
175
+ inside headlines or table cells. If some inline images should be
176
+ converted to markdown images instead, this option can be set to a
177
+ list of parent tags that should be allowed to contain inline images,
178
+ for example `['td']`. Defaults to an empty list.
179
+
180
+ wrap, wrap_width
181
+
182
+ : If `wrap` is set to `True`, all text paragraphs are wrapped at
183
+ `wrap_width` characters. Defaults to `False` and `80`. Use with
184
+ `newline_style=BACKSLASH` to keep line breaks in paragraphs.
185
+
186
+ Options may be specified as keyword arguments to the `convert_to_markdown`
187
+ function.
188
+
189
+ # CLI
190
+
191
+ Use `html_to_markdown example.html > example.md` or pipe input from stdin
192
+ (`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
193
+ to see all available options. They are the same as listed above and take
194
+ the same arguments.
@@ -0,0 +1,168 @@
1
+ # html_to_markdown
2
+
3
+ This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
4
+ Python 3.9 and offering strong typing.
5
+
6
+ ### Differences from markdownify
7
+
8
+ - The refactored codebase uses a strict functional approach - no classes are involved.
9
+ - There is full typing with strict MyPy adherence in place.
10
+ - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
11
+ - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
12
+ point versioning is no longer aligned.
13
+
14
+ ## Installation
15
+
16
+ ```shell
17
+ pip install html_to_markdown
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ Convert some HTML to Markdown:
23
+
24
+ ```python
25
+ from html_to_markdown import convert_to_markdown
26
+
27
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
28
+ ```
29
+
30
+ Specify tags to exclude:
31
+
32
+ ```python
33
+ from html_to_markdown import convert_to_markdown
34
+
35
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
36
+ ```
37
+
38
+ \...or specify the tags you want to include:
39
+
40
+ ```python
41
+ from html_to_markdown import convert_to_markdown
42
+
43
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
44
+ ```
45
+
46
+ # Options
47
+
48
+ html_to_markdown supports the following options:
49
+
50
+ strip
51
+
52
+ : A list of tags to strip. This option can\'t be used with the
53
+ `convert` option.
54
+
55
+ convert
56
+
57
+ : A list of tags to convert. This option can\'t be used with the
58
+ `strip` option.
59
+
60
+ autolinks
61
+
62
+ : A boolean indicating whether the \"automatic link\" style should be
63
+ used when a `a` tag\'s contents match its href. Defaults to `True`.
64
+
65
+ default_title
66
+
67
+ : A boolean to enable setting the title of a link to its href, if no
68
+ title is given. Defaults to `False`.
69
+
70
+ heading_style
71
+
72
+ : Defines how headings should be converted. Accepted values are `ATX`,
73
+ `ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
74
+ `SETEXT`). Defaults to `UNDERLINED`.
75
+
76
+ bullets
77
+
78
+ : An iterable (string, list, or tuple) of bullet styles to be used. If
79
+ the iterable only contains one item, it will be used regardless of
80
+ how deeply lists are nested. Otherwise, the bullet will alternate
81
+ based on nesting level. Defaults to `'*+-'`.
82
+
83
+ strong_em_symbol
84
+
85
+ : In markdown, both `*` and `_` are used to encode **strong** or
86
+ *emphasized* texts. Either of these symbols can be chosen by the
87
+ options `ASTERISK` (default) or `UNDERSCORE` respectively.
88
+
89
+ sub_symbol, sup_symbol
90
+
91
+ : Define the chars that surround `<sub>` and `<sup>` text. Defaults to
92
+ an empty string, because this is non-standard behavior. Could be
93
+ something like `~` and `^` to result in `~sub~` and `^sup^`. If the
94
+ value starts with `<` and ends with `>`, it is treated as an HTML
95
+ tag and a `/` is inserted after the `<` in the string used after the
96
+ text; this allows specifying `<sub>` to use raw HTML in the output
97
+ for subscripts, for example.
98
+
99
+ newline_style
100
+
101
+ : Defines the style of marking linebreaks (`<br>`) in markdown. The
102
+ default value `SPACES` of this option will adopt the usual two
103
+ spaces and a newline, while `BACKSLASH` will convert a linebreak to
104
+ `\\n` (a backslash and a newline). While the latter convention is
105
+ non-standard, it is commonly preferred and supported by a lot of
106
+ interpreters.
107
+
108
+ code_language
109
+
110
+ : Defines the language that should be assumed for all `<pre>`
111
+ sections. Useful, if all code on a page is in the same programming
112
+ language and should be annotated with `python` or
113
+ similar. Defaults to `''` (empty
114
+ string) and can be any string.
115
+
116
+ code_language_callback
117
+
118
+ : When the HTML code contains `pre` tags that in some way provide the
119
+ code language, for example as class, this callback can be used to
120
+ extract the language from the tag and prefix it to the converted
121
+ `pre` tag. The callback gets one single argument, a BeautifulSoup
122
+ object, and returns a string containing the code language, or
123
+ `None`. An example to use the class name as code language could be:
124
+
125
+ def callback(el):
126
+ return el['class'][0] if el.has_attr('class') else None
127
+
128
+ Defaults to `None`.
129
+
130
+ escape_asterisks
131
+
132
+ : If set to `False`, do not escape `*` to `\*` in text. Defaults to
133
+ `True`.
134
+
135
+ escape_underscores
136
+
137
+ : If set to `False`, do not escape `_` to `\_` in text. Defaults to
138
+ `True`.
139
+
140
+ escape_misc
141
+
142
+ : If set to `False`, do not escape miscellaneous punctuation
143
+ characters that sometimes have Markdown significance in text.
144
+ Defaults to `True`.
145
+
146
+ keep_inline_images_in
147
+
148
+ : Images are converted to their alt-text when the images are located
149
+ inside headlines or table cells. If some inline images should be
150
+ converted to markdown images instead, this option can be set to a
151
+ list of parent tags that should be allowed to contain inline images,
152
+ for example `['td']`. Defaults to an empty list.
153
+
154
+ wrap, wrap_width
155
+
156
+ : If `wrap` is set to `True`, all text paragraphs are wrapped at
157
+ `wrap_width` characters. Defaults to `False` and `80`. Use with
158
+ `newline_style=BACKSLASH` to keep line breaks in paragraphs.
159
+
160
+ Options may be specified as keyword arguments to the `convert_to_markdown`
161
+ function.
162
+
163
+ # CLI
164
+
165
+ Use `html_to_markdown example.html > example.md` or pipe input from stdin
166
+ (`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
167
+ to see all available options. They are the same as listed above and take
168
+ the same arguments.
@@ -0,0 +1,3 @@
1
+ from html_to_markdown.processing import convert_to_markdown
2
+
3
+ __all__ = ["convert_to_markdown"]
@@ -0,0 +1,131 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from html_to_markdown import convert_to_markdown
5
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
6
+
7
+
8
def cli(argv: list[str]) -> None:
    """Run the html_to_markdown command-line interface.

    Parses ``argv``, reads HTML from the given file (or from STDIN when no
    file is given), converts it with ``convert_to_markdown`` and prints the
    resulting markdown to STDOUT.

    Args:
        argv: The command-line arguments, excluding the program name.
    """
    parser = argparse.ArgumentParser(
        prog="html_to_markdown",
        description="Converts html to markdown.",
    )

    parser.add_argument(
        "html",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="The html file to convert. Defaults to STDIN if not provided.",
    )
    parser.add_argument(
        "-s",
        "--strip",
        nargs="*",
        help="A list of tags to strip. This option can't be used with the --convert option.",
    )
    parser.add_argument(
        "-c",
        "--convert",
        nargs="*",
        help="A list of tags to convert. This option can't be used with the --strip option.",
    )
    # NOTE(review): the README documents ``autolinks`` as defaulting to True,
    # but ``store_true`` makes the CLI pass False unless -a is given. Left
    # unchanged here; confirm the intended CLI default.
    parser.add_argument(
        "-a",
        "--autolinks",
        action="store_true",
        help="A boolean indicating whether the 'automatic link' style "
        "should be used when a 'a' tag's contents match its href.",
    )
    # ``store_true`` so that passing the flag *enables* default titles and the
    # option is off by default, matching the help text. The previous
    # ``store_false`` inverted both.
    parser.add_argument(
        "--default-title",
        action="store_true",
        help="A boolean to enable setting the title of a link to its href, if no title is given.",
    )
    parser.add_argument(
        "--heading-style",
        default=UNDERLINED,
        choices=(ATX, ATX_CLOSED, UNDERLINED),
        help="Defines how headings should be converted.",
    )
    parser.add_argument(
        "-b",
        "--bullets",
        default="*+-",
        help="A string of bullet styles to use; the bullet will alternate based on nesting level.",
    )
    parser.add_argument(
        "--strong-em-symbol",
        default=ASTERISK,
        choices=(ASTERISK, UNDERSCORE),
        help="Use * or _ to convert strong and italics text",
    )
    parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
    parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
    parser.add_argument(
        "--newline-style",
        default=SPACES,
        choices=(SPACES, BACKSLASH),
        help="Defines the style of <br> conversions: two spaces "
        "or backslash at the end of the line that should break.",
    )
    parser.add_argument(
        "--code-language",
        default="",
        help="Defines the language that should be assumed for all '<pre>' sections.",
    )
    parser.add_argument(
        "--no-escape-asterisks",
        dest="escape_asterisks",
        action="store_false",
        help="Do not escape '*' to '\\*' in text.",
    )
    parser.add_argument(
        "--no-escape-underscores",
        dest="escape_underscores",
        action="store_false",
        help="Do not escape '_' to '\\_' in text.",
    )
    parser.add_argument(
        "-i",
        "--keep-inline-images-in",
        nargs="*",
        help="Images are converted to their alt-text when the images are "
        "located inside headlines or table cells. If some inline images "
        "should be converted to markdown images instead, this option can "
        "be set to a list of parent tags that should be allowed to "
        "contain inline images.",
    )
    parser.add_argument(
        "-w",
        "--wrap",
        action="store_true",
        help="Wrap all text paragraphs at --wrap-width characters.",
    )
    parser.add_argument("--wrap-width", type=int, default=80)

    args = parser.parse_args(argv)

    try:
        html = args.html.read()
    finally:
        # argparse.FileType opens the input but never closes it. Close it
        # here, leaving the process-wide STDIN stream untouched.
        if args.html is not sys.stdin:
            args.html.close()

    result = convert_to_markdown(
        html,
        strip=args.strip,
        convert=args.convert,
        autolinks=args.autolinks,
        default_title=args.default_title,
        heading_style=args.heading_style,
        bullets=args.bullets,
        strong_em_symbol=args.strong_em_symbol,
        sub_symbol=args.sub_symbol,
        sup_symbol=args.sup_symbol,
        newline_style=args.newline_style,
        code_language=args.code_language,
        escape_asterisks=args.escape_asterisks,
        escape_underscores=args.escape_underscores,
        keep_inline_images_in=args.keep_inline_images_in,
        wrap=args.wrap,
        wrap_width=args.wrap_width,
    )

    print(result)  # noqa: T201


if __name__ == "__main__":
    cli(sys.argv[1:])
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from re import Pattern
5
+ from typing import Final, Literal
6
+
7
# Emphasis markers; the CLI offers these as the choices for --strong-em-symbol.
ASTERISK: Final[Literal["*"]] = "*"
UNDERSCORE: Final[Literal["_"]] = "_"

# Heading styles; the CLI offers these as the choices for --heading-style.
ATX: Final[Literal["atx"]] = "atx"
ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
UNDERLINED: Final[Literal["underlined"]] = "underlined"

# Line-break styles; the CLI offers these as the choices for --newline-style.
SPACES: Final[Literal["spaces"]] = "spaces"
BACKSLASH: Final[Literal["backslash"]] = "backslash"

# Pre-compiled regular expressions.
# "convert_h" followed by one or more digits; the digits are captured.
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
# "h" followed by a single digit from 1 to 6.
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
# The (zero-width) start of every line in a multi-line string.
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", flags=re.MULTILINE)
# A run of one or more tab or space characters.
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")