html-to-markdown 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -0
- html_to_markdown/__main__.py +131 -0
- html_to_markdown/constants.py +18 -0
- html_to_markdown/converters.py +380 -0
- html_to_markdown/processing.py +298 -0
- html_to_markdown/py.typed +0 -0
- html_to_markdown/utils.py +72 -0
- html_to_markdown-1.0.0.dist-info/METADATA +194 -0
- html_to_markdown-1.0.0.dist-info/RECORD +12 -0
- html_to_markdown-1.0.0.dist-info/WHEEL +4 -0
- html_to_markdown-1.0.0.dist-info/entry_points.txt +2 -0
- html_to_markdown-1.0.0.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from html_to_markdown import convert_to_markdown
|
|
5
|
+
from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def cli(argv: list[str]) -> None:
    """Command-line interface for html_to_markdown.

    Parses ``argv``, reads the HTML input, converts it with
    ``convert_to_markdown`` and prints the resulting Markdown to standard output.

    Args:
        argv: The command-line arguments, without the program name.
    """
    parser = argparse.ArgumentParser(
        prog="html_to_markdown",
        description="Converts html to markdown.",
    )

    parser.add_argument(
        "html",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="The html file to convert. Defaults to STDIN if not provided.",
    )

    # The converter honours only one of the two lists, so reject the combination
    # up front instead of silently ignoring --convert.
    tag_filter = parser.add_mutually_exclusive_group()
    tag_filter.add_argument(
        "-s",
        "--strip",
        nargs="*",
        help="A list of tags to strip. This option can't be used with the --convert option.",
    )
    tag_filter.add_argument(
        "-c",
        "--convert",
        nargs="*",
        help="A list of tags to convert. This option can't be used with the --strip option.",
    )

    parser.add_argument(
        "-a",
        "--autolinks",
        action="store_true",
        help="A boolean indicating whether the 'automatic link' style "
        "should be used when a 'a' tag's contents match its href.",
    )
    # store_true: the flag *enables* default titles, as the help text states.
    parser.add_argument(
        "--default-title",
        action="store_true",
        help="A boolean to enable setting the title of a link to its href, if no title is given.",
    )
    parser.add_argument(
        "--heading-style",
        default=UNDERLINED,
        choices=(ATX, ATX_CLOSED, UNDERLINED),
        help="Defines how headings should be converted.",
    )
    parser.add_argument(
        "-b",
        "--bullets",
        default="*+-",
        help="A string of bullet styles to use; the bullet will alternate based on nesting level.",
    )
    parser.add_argument(
        "--strong-em-symbol",
        default=ASTERISK,
        choices=(ASTERISK, UNDERSCORE),
        help="Use * or _ to convert strong and italics text",
    )
    parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
    parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
    parser.add_argument(
        "--newline-style",
        default=SPACES,
        choices=(SPACES, BACKSLASH),
        help="Defines the style of <br> conversions: two spaces "
        "or backslash at the end of the line that should break.",
    )
    parser.add_argument(
        "--code-language",
        default="",
        help="Defines the language that should be assumed for all '<pre>' sections.",
    )
    parser.add_argument(
        "--no-escape-asterisks",
        dest="escape_asterisks",
        action="store_false",
        help="Do not escape '*' to '\\*' in text.",
    )
    parser.add_argument(
        "--no-escape-underscores",
        dest="escape_underscores",
        action="store_false",
        help="Do not escape '_' to '\\_' in text.",
    )
    parser.add_argument(
        "-i",
        "--keep-inline-images-in",
        nargs="*",
        help="Images are converted to their alt-text when the images are "
        "located inside headlines or table cells. If some inline images "
        "should be converted to markdown images instead, this option can "
        "be set to a list of parent tags that should be allowed to "
        "contain inline images.",
    )
    parser.add_argument(
        "-w",
        "--wrap",
        action="store_true",
        help="Wrap all text paragraphs at --wrap-width characters.",
    )
    parser.add_argument("--wrap-width", type=int, default=80)

    args = parser.parse_args(argv)

    # argparse.FileType opens the file eagerly; close it once read, but leave
    # the process-wide stdin stream alone.
    source = args.html
    try:
        html = source.read()
    finally:
        if source is not sys.stdin:
            source.close()

    result = convert_to_markdown(
        html,
        strip=args.strip,
        convert=args.convert,
        autolinks=args.autolinks,
        default_title=args.default_title,
        heading_style=args.heading_style,
        bullets=args.bullets,
        strong_em_symbol=args.strong_em_symbol,
        sub_symbol=args.sub_symbol,
        sup_symbol=args.sup_symbol,
        newline_style=args.newline_style,
        code_language=args.code_language,
        escape_asterisks=args.escape_asterisks,
        escape_underscores=args.escape_underscores,
        keep_inline_images_in=args.keep_inline_images_in,
        wrap=args.wrap,
        wrap_width=args.wrap_width,
    )

    print(result)  # noqa: T201


if __name__ == "__main__":
    cli(sys.argv[1:])
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from re import Pattern
|
|
5
|
+
from typing import Final, Literal
|
|
6
|
+
|
|
7
|
+
# String identifiers for the configurable output styles.
ASTERISK: Final[Literal["*"]] = "*"
UNDERSCORE: Final[Literal["_"]] = "_"
ATX: Final[Literal["atx"]] = "atx"
ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
UNDERLINED: Final[Literal["underlined"]] = "underlined"
BACKSLASH: Final[Literal["backslash"]] = "backslash"
SPACES: Final[Literal["spaces"]] = "spaces"

# Pre-compiled patterns shared by the package.
# Captures the digits in a "convert_h<digits>" name.
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
# Zero-width match at the beginning of every line.
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
# A run of tabs and/or spaces (newlines are not included).
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
# An HTML heading tag name prefix, h1 through h6.
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Mapping
|
|
4
|
+
from functools import partial
|
|
5
|
+
from inspect import getfullargspec
|
|
6
|
+
from textwrap import fill
|
|
7
|
+
from typing import Any, Callable, Literal, TypeVar, cast
|
|
8
|
+
|
|
9
|
+
from bs4.element import Tag
|
|
10
|
+
|
|
11
|
+
from html_to_markdown.constants import (
|
|
12
|
+
ATX_CLOSED,
|
|
13
|
+
BACKSLASH,
|
|
14
|
+
UNDERLINED,
|
|
15
|
+
line_beginning_re,
|
|
16
|
+
)
|
|
17
|
+
from html_to_markdown.utils import chomp, indent, underline
|
|
18
|
+
|
|
19
|
+
# Names of the HTML elements for which a converter is registered.
SupportedElements = Literal[
    "a", "b", "blockquote", "br", "code", "del", "em",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "i", "img",
    "list", "ul", "ol", "li",
    "p", "pre", "script", "style",
    "s", "strong", "samp", "sub", "sup",
    "table", "caption", "figcaption", "td", "th", "tr",
    "kbd",
]
|
|
57
|
+
|
|
58
|
+
ConvertsMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
|
|
59
|
+
|
|
60
|
+
T = TypeVar("T")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
    """Build a converter for a simple inline tag such as b, em or del.

    The returned function wraps the chomped text in ``markup_prefix`` and a
    matching closing marker. When ``markup_prefix`` looks like an HTML tag
    (starts with ``<`` and ends with ``>``) the closing marker is the same
    string with ``/`` inserted after the ``<``; otherwise the closing marker is
    ``markup_prefix`` itself.

    Args:
        markup_prefix: The marker placed in front of the text.

    Returns:
        A keyword-only callable taking ``tag`` and ``text``.
    """
    # The closing marker depends only on markup_prefix, so derive it once.
    looks_like_html_tag = markup_prefix.startswith("<") and markup_prefix.endswith(">")
    closing_marker = "</" + markup_prefix[1:] if looks_like_html_tag else markup_prefix

    def implementation(*, tag: Tag, text: str) -> str:
        # Inside literal/code-like ancestors the text is passed through untouched.
        if tag.find_parent(["pre", "code", "kbd", "samp"]):
            return text

        # Whitespace-only content produces no markup at all.
        if text.strip() == "":
            return ""

        leading, trailing, body = chomp(text)
        return "".join((leading, markup_prefix, body, closing_marker, trailing))

    return cast(Callable[[Tag, str], str], implementation)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_colspan(tag: Tag) -> int:
|
|
90
|
+
colspan = 1
|
|
91
|
+
|
|
92
|
+
if "colspan" in tag.attrs and isinstance(tag["colspan"], str) and tag["colspan"].isdigit():
|
|
93
|
+
colspan = int(tag["colspan"])
|
|
94
|
+
|
|
95
|
+
return colspan
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _convert_a(*, tag: Tag, text: str, autolinks: bool, default_title: bool) -> str:
    """Convert an ``a`` element to a Markdown link.

    Args:
        tag: The anchor element; its ``href`` and ``title`` attributes are read.
        text: The already converted content of the element.
        autolinks: Whether to emit ``<href>`` when the link text equals the href
            and no title applies.
        default_title: Whether to use the href as title when no title is given.

    Returns:
        The Markdown link, the bare text when there is no href, or an empty
        string when the element has no text.
    """
    prefix, suffix, text = chomp(text)
    if not text:
        return ""

    href = tag.get("href")
    title = tag.get("title")

    # Underscores in text nodes were escaped earlier; undo that for the comparison.
    if autolinks and text.replace(r"\_", "_") == href and not title and not default_title:
        return f"<{href}>"

    if default_title and not title:
        title = href

    # Only a non-empty string yields a title part; an empty title="" attribute
    # must not produce a dangling `""` (same rule as for images).
    title_part = ' "{}"'.format(title.replace('"', r"\"")) if isinstance(title, str) and title else ""
    return f"{prefix}[{text}]({href}{title_part}){suffix}" if href else text
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _convert_blockquote(*, text: str, convert_as_inline: bool) -> str:
    """Convert a ``blockquote`` element.

    Args:
        text: The already converted content of the element.
        convert_as_inline: When true the text is returned unchanged.

    Returns:
        The stripped text with ``> `` inserted at the start of every line,
        preceded by one newline and followed by two; empty text yields "".
    """
    if convert_as_inline:
        return text
    if not text:
        return ""

    quoted = line_beginning_re.sub("> ", text.strip())
    return "\n" + quoted + "\n\n"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _convert_br(*, convert_as_inline: bool, newline_style: str) -> str:
    """Convert a ``br`` element to a Markdown hard line break.

    Args:
        convert_as_inline: When true the break is dropped entirely.
        newline_style: ``"backslash"`` (case-insensitive) selects a trailing
            backslash; any other value selects trailing spaces.

    Returns:
        The line-break markup, or an empty string in inline mode.
    """
    if convert_as_inline:
        return ""

    if newline_style.lower() == BACKSLASH:
        return "\\\n"

    # A Markdown hard break needs at least TWO trailing spaces before the
    # newline; a single space is rendered as an ordinary soft break.
    return "  \n"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _convert_hn(
    *,
    n: int,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    text: str,
    convert_as_inline: bool,
) -> str:
    """Convert a heading element of level ``n``.

    Args:
        n: The heading level.
        heading_style: The requested heading style.
        text: The already converted content of the heading.
        convert_as_inline: When true the text is returned unchanged.

    Returns:
        The heading markup. Levels 1 and 2 use the ``underline`` helper in the
        underlined style; every other case uses ``#`` markers.
    """
    if convert_as_inline:
        return text

    heading = text.strip()

    if n <= 2 and heading_style == UNDERLINED:
        pad_char = "=" if n == 1 else "-"
        return underline(text=heading, pad_char=pad_char)

    marker = "#" * n
    # Only the closed ATX style repeats the marker after the text.
    closing = f" {marker}" if heading_style == ATX_CLOSED else ""
    return f"{marker} {heading}{closing}\n\n"
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str] | None) -> str:
|
|
150
|
+
alt = tag.attrs.get("alt", None) or ""
|
|
151
|
+
src = tag.attrs.get("src", None) or ""
|
|
152
|
+
title = tag.attrs.get("title", None) or ""
|
|
153
|
+
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
154
|
+
parent_name = tag.parent.name if tag.parent else ""
|
|
155
|
+
if convert_as_inline and parent_name not in (keep_inline_images_in or []):
|
|
156
|
+
return alt
|
|
157
|
+
|
|
158
|
+
return f""
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _convert_list(*, tag: Tag, text: str) -> str:
    """Convert a list container element.

    Args:
        tag: The list element.
        text: The already converted list items.

    Returns:
        For a list nested inside an ``li``: a newline followed by the text
        indented one level, with trailing whitespace removed. Otherwise the
        text, plus a newline when the next sibling is something other than
        another ``ul``/``ol``.
    """
    follower = tag.next_sibling
    before_paragraph = bool(follower) and getattr(follower, "name", None) not in {"ul", "ol"}

    # Walk up from the element itself until an li or the root is reached.
    node = tag
    while node.name != "li" and node.parent:
        node = node.parent

    if node.name == "li":
        return "\n" + indent(text=text, level=1).rstrip()

    if before_paragraph:
        return text + "\n"
    return text
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
185
|
+
parent = tag.parent
|
|
186
|
+
if parent is not None and parent.name == "ol":
|
|
187
|
+
start = (
|
|
188
|
+
int(cast(str, parent["start"]))
|
|
189
|
+
if isinstance(parent.get("start"), str) and str(parent.get("start")).isnumeric()
|
|
190
|
+
else 1
|
|
191
|
+
)
|
|
192
|
+
bullet = "%s." % (start + parent.index(tag))
|
|
193
|
+
else:
|
|
194
|
+
depth = -1
|
|
195
|
+
while tag:
|
|
196
|
+
if tag.name == "ul":
|
|
197
|
+
depth += 1
|
|
198
|
+
if not tag.parent:
|
|
199
|
+
break
|
|
200
|
+
|
|
201
|
+
tag = tag.parent
|
|
202
|
+
|
|
203
|
+
bullet = bullets[depth % len(bullets)]
|
|
204
|
+
return "{} {}\n".format(bullet, (text or "").strip())
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
|
|
208
|
+
if convert_as_inline:
|
|
209
|
+
return text
|
|
210
|
+
|
|
211
|
+
if wrap:
|
|
212
|
+
text = fill(
|
|
213
|
+
text,
|
|
214
|
+
width=wrap_width,
|
|
215
|
+
break_long_words=False,
|
|
216
|
+
break_on_hyphens=False,
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
return f"{text}\n\n" if text else ""
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _convert_pre(
|
|
223
|
+
*,
|
|
224
|
+
tag: Tag,
|
|
225
|
+
text: str,
|
|
226
|
+
code_language: str,
|
|
227
|
+
code_language_callback: Callable[[Tag], str] | None,
|
|
228
|
+
) -> str:
|
|
229
|
+
if not text:
|
|
230
|
+
return ""
|
|
231
|
+
|
|
232
|
+
if code_language_callback:
|
|
233
|
+
code_language = code_language_callback(tag) or code_language
|
|
234
|
+
|
|
235
|
+
return f"\n```{code_language}\n{text}\n```\n"
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _convert_td(*, tag: Tag, text: str) -> str:
    """Convert a ``td`` element to a table cell.

    Args:
        tag: The cell element; its column span sets the number of separators.
        text: The already converted content of the cell.

    Returns:
        A space, the stripped text with newlines replaced by spaces, and one
        `` |`` separator per spanned column.
    """
    flattened = text.strip().replace("\n", " ")
    separators = " |" * _get_colspan(tag)
    return f" {flattened}{separators}"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _convert_th(*, tag: Tag, text: str) -> str:
    """Convert a ``th`` element to a table cell.

    Args:
        tag: The header cell element; its column span sets the number of separators.
        text: The already converted content of the cell.

    Returns:
        A space, the stripped text with newlines replaced by spaces, and one
        `` |`` separator per spanned column.
    """
    single_line = " ".join(text.strip().split("\n"))
    span = _get_colspan(tag)
    return " " + single_line + " |" * span
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _convert_tr(*, tag: Tag, text: str) -> str:
    """Convert a ``tr`` element to a Markdown table row.

    A header separator line is appended after a first row that counts as a
    header row. For a first row that is not a header row, an empty header and
    separator are placed above it instead.

    Args:
        tag: The row element.
        text: The already converted cells of the row.

    Returns:
        The row, including any header/separator lines it requires.
    """
    cells = tag.find_all(["td", "th"])
    parent = tag.parent
    parent_name = parent.name if parent else ""
    grand_parent = parent.parent if parent else None
    is_first_row = not tag.previous_sibling

    is_headrow = (
        all(cell.name == "th" for cell in cells)
        or (is_first_row and parent_name != "tbody")
        or (
            is_first_row
            and parent_name == "tbody"
            and (not grand_parent or len(grand_parent.find_all(["thead"])) < 1)
        )
    )

    # Each cell emits one " |" per spanned column, so header lines must be as
    # wide as the sum of the colspans, not the number of cells. Using the shared
    # helper also tolerates non-string attribute values.
    total_columns = sum(_get_colspan(cell) for cell in cells)

    overline = ""
    separator = ""
    if is_headrow and is_first_row:
        # first row and is headline: print headline underline
        separator = "| " + " | ".join(["---"] * total_columns) + " |" + "\n"
    elif is_first_row and (
        parent_name == "table" or (parent_name == "tbody" and not cast(Tag, parent).previous_sibling)
    ):
        # first row, not headline, and:
        # - the parent is table or
        # - the parent is tbody at the beginning of a table.
        # print empty headline above this row
        overline += "| " + " | ".join([""] * total_columns) + " |" + "\n"
        overline += "| " + " | ".join(["---"] * total_columns) + " |" + "\n"
    return overline + "|" + text + "\n" + separator
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def create_converters_map(
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Tag], str] | None,
    default_title: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> ConvertsMap:
    """Build the table that maps HTML element names to conversion functions.

    Every entry is a callable taking the keyword arguments ``text``, ``tag``
    and ``convert_as_inline``; the options below are bound into the entries.

    Args:
        autolinks: Whether the ``<href>`` shortcut may be used for links.
        bullets: The bullet characters to use for unordered lists.
        code_language: The default code language to use.
        code_language_callback: A callback to get the code language.
        default_title: Whether to use the URL as the title for links.
        heading_style: The style of headings.
        keep_inline_images_in: The tags to keep inline images in.
        newline_style: The style of newlines.
        strong_em_symbol: The symbol to use for strong and emphasis text.
        sub_symbol: The symbol to use for subscript text.
        sup_symbol: The symbol to use for superscript text.
        wrap: Whether to wrap text.
        wrap_width: The width to wrap text at.

    Returns:
        A mapping of HTML elements to their corresponding conversion functions
    """

    def _wrapper(func: Callable[..., T]) -> Callable[[str, Tag], T]:
        # Determine once which keyword-only parameters the converter declares.
        accepted = frozenset(getfullargspec(func).kwonlyargs)

        def _inner(*, text: str, tag: Tag, convert_as_inline: bool) -> T:
            if not accepted:
                # Converters without keyword-only parameters receive the text positionally.
                return func(text)
            candidates: dict[str, Any] = {
                "tag": tag,
                "text": text,
                "convert_as_inline": convert_as_inline,
            }
            # Pass only the arguments the converter actually declares.
            return func(**{key: value for key, value in candidates.items() if key in accepted})

        return cast(Callable[[str, Tag], T], _inner)

    def _inline(markup: str) -> Callable[[str, Tag], str]:
        # Converter for an inline element wrapped in the given markup.
        return _wrapper(_create_inline_converter(markup))

    def _heading(level: int) -> Callable[[str, Tag], str]:
        # Converter for a heading of the given level.
        return _wrapper(partial(_convert_hn, n=level, heading_style=heading_style))

    return {
        "a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
        "b": _inline(2 * strong_em_symbol),
        "blockquote": _wrapper(_convert_blockquote),
        "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
        "code": _inline("`"),
        "del": _inline("~~"),
        "em": _inline(strong_em_symbol),
        "h1": _heading(1),
        "h2": _heading(2),
        "h3": _heading(3),
        "h4": _heading(4),
        "h5": _heading(5),
        "h6": _heading(6),
        "hr": _wrapper(lambda _: "\n\n---\n\n"),
        "i": _inline(strong_em_symbol),
        "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
        "list": _wrapper(_convert_list),
        "ul": _wrapper(_convert_list),
        "ol": _wrapper(_convert_list),
        "li": _wrapper(partial(_convert_li, bullets=bullets)),
        "p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width)),
        "pre": _wrapper(
            partial(
                _convert_pre,
                code_language=code_language,
                code_language_callback=code_language_callback,
            )
        ),
        "script": _wrapper(lambda _: ""),
        "style": _wrapper(lambda _: ""),
        "s": _inline("~~"),
        "strong": _inline(strong_em_symbol * 2),
        "samp": _inline("`"),
        "sub": _inline(sub_symbol),
        "sup": _inline(sup_symbol),
        "table": _wrapper(lambda text: f"\n\n{text}\n"),
        "caption": _wrapper(lambda text: f"{text}\n"),
        "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
        "td": _wrapper(_convert_td),
        "th": _wrapper(_convert_th),
        "tr": _wrapper(_convert_tr),
        "kbd": _inline("`"),
    }
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
|
|
6
|
+
|
|
7
|
+
from html_to_markdown.constants import (
|
|
8
|
+
ASTERISK,
|
|
9
|
+
SPACES,
|
|
10
|
+
UNDERLINED,
|
|
11
|
+
html_heading_re,
|
|
12
|
+
whitespace_re,
|
|
13
|
+
)
|
|
14
|
+
from html_to_markdown.converters import ConvertsMap, create_converters_map
|
|
15
|
+
from html_to_markdown.utils import escape
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Iterable
|
|
19
|
+
|
|
20
|
+
from bs4 import PageElement
|
|
21
|
+
|
|
22
|
+
# Tag names that can be looked up in the converters map.
SupportedTag = Literal[
    "a", "b", "blockquote", "br", "code", "del", "em",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "i", "img",
    "list", "ul", "ol", "li",
    "p", "pre", "script", "style",
    "s", "strong", "samp", "sub", "sup",
    "table", "caption", "figcaption", "td", "th", "tr",
    "kbd",
]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_nested_tag(el: PageElement) -> bool:
    """Tell whether ``el`` is a list or table structure element.

    Args:
        el: The element to inspect.

    Returns:
        True when ``el`` is a ``Tag`` named ol, ul, li, table, thead, tbody,
        tfoot, tr, td or th; False otherwise.
    """
    if not isinstance(el, Tag):
        return False

    structural_names = {"ol", "ul", "li", "table", "thead", "tbody", "tfoot", "tr", "td", "th"}
    return el.name in structural_names
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _process_tag(
    tag: Tag,
    *,
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Any], str] | None,
    convert: Iterable[str] | None,
    convert_as_inline: bool = False,
    converters_map: ConvertsMap | None = None,
    default_title: bool,
    escape_asterisks: bool,
    escape_misc: bool,
    escape_underscores: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strip: Iterable[str] | None,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> str:
    """Recursively convert ``tag`` and its descendants to Markdown.

    Children are converted first; the resulting text is then handed to the
    converter registered for ``tag`` when one exists and the tag passes the
    ``strip``/``convert`` filter. Otherwise the children's text is returned.

    Note:
        Whitespace-only text nodes inside list and table structure elements
        are removed from the tree, so ``tag`` is modified in place.

    Args:
        tag: The element to convert.
        convert_as_inline: Whether ``tag`` itself is converted as inline content.
        converters_map: The converters to use; built from the options when omitted
            and reused for all descendants.
        strip: Tag names that must not be converted.
        convert: Tag names that are the only ones to be converted.
        escape_asterisks: Forwarded to the text escaping.
        escape_misc: Forwarded to the text escaping.
        escape_underscores: Forwarded to the text escaping.
        autolinks: Converter option, see ``create_converters_map``.
        bullets: Converter option, see ``create_converters_map``.
        code_language: Converter option, see ``create_converters_map``.
        code_language_callback: Converter option, see ``create_converters_map``.
        default_title: Converter option, see ``create_converters_map``.
        heading_style: Converter option, see ``create_converters_map``.
        keep_inline_images_in: Converter option, see ``create_converters_map``.
        newline_style: Converter option, see ``create_converters_map``.
        strong_em_symbol: Converter option, see ``create_converters_map``.
        sub_symbol: Converter option, see ``create_converters_map``.
        sup_symbol: Converter option, see ``create_converters_map``.
        wrap: Converter option, see ``create_converters_map``.
        wrap_width: Converter option, see ``create_converters_map``.

    Returns:
        The Markdown for ``tag``.
    """
    if converters_map is None:
        converters_map = create_converters_map(
            autolinks=autolinks,
            bullets=bullets,
            code_language=code_language,
            code_language_callback=code_language_callback,
            default_title=default_title,
            heading_style=heading_style,
            keep_inline_images_in=keep_inline_images_in,
            newline_style=newline_style,
            strong_em_symbol=strong_em_symbol,
            sub_symbol=sub_symbol,
            sup_symbol=sup_symbol,
            wrap=wrap,
            wrap_width=wrap_width,
        )

    is_heading = html_heading_re.match(tag.name) is not None
    is_cell = tag.name in {"td", "th"}
    # Headings and table cells must stay on one line, so their content is inline.
    convert_children_as_inline = convert_as_inline or is_heading or is_cell

    if _is_nested_tag(tag):
        # Iterate over a snapshot: extract() removes the node from the tag's
        # children, and mutating that sequence while iterating it would skip
        # the sibling that follows each removed node.
        for el in list(tag.children):
            can_extract = (
                not el.previous_sibling
                or not el.next_sibling
                or _is_nested_tag(el.previous_sibling)
                or _is_nested_tag(el.next_sibling)
            )
            if can_extract and isinstance(el, NavigableString) and not el.strip():
                el.extract()

    parts: list[str] = []
    for el in tag.children:
        if isinstance(el, (Comment, Doctype)):
            continue

        if isinstance(el, NavigableString):
            parts.append(
                _process_text(
                    el=el,
                    escape_misc=escape_misc,
                    escape_asterisks=escape_asterisks,
                    escape_underscores=escape_underscores,
                )
            )
        elif isinstance(el, Tag):
            parts.append(
                _process_tag(
                    tag=el,
                    convert_as_inline=convert_children_as_inline,
                    strip=strip,
                    convert=convert,
                    escape_misc=escape_misc,
                    escape_asterisks=escape_asterisks,
                    escape_underscores=escape_underscores,
                    converters_map=converters_map,
                    autolinks=autolinks,
                    bullets=bullets,
                    code_language=code_language,
                    code_language_callback=code_language_callback,
                    default_title=default_title,
                    heading_style=heading_style,
                    keep_inline_images_in=keep_inline_images_in,
                    newline_style=newline_style,
                    strong_em_symbol=strong_em_symbol,
                    sub_symbol=sub_symbol,
                    sup_symbol=sup_symbol,
                    wrap=wrap,
                    wrap_width=wrap_width,
                )
            )
    text = "".join(parts)

    lowered_name = tag.name.lower()
    tag_name: SupportedTag | None = cast(SupportedTag, lowered_name) if lowered_name in converters_map else None

    if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
        return converters_map[tag_name](  # type: ignore[call-arg]
            tag=tag, text=text, convert_as_inline=convert_as_inline
        )

    return text
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _process_text(
    *,
    el: NavigableString,
    escape_misc: bool,
    escape_asterisks: bool,
    escape_underscores: bool,
) -> str:
    """Convert a text node to Markdown text.

    Args:
        el: The text node.
        escape_misc: Whether to escape miscellaneous characters.
        escape_asterisks: Whether to escape asterisks.
        escape_underscores: Whether to escape underscores.

    Returns:
        The text, whitespace-normalized and escaped depending on the node's ancestors.
    """
    content = str(el) or ""

    # Outside of <pre>, runs of tabs and spaces collapse into a single space.
    inside_pre = el.find_parent("pre")
    if not inside_pre:
        content = whitespace_re.sub(" ", content)

    # Text inside preformatted or code-like elements is never escaped.
    inside_literal = el.find_parent(["pre", "code", "kbd", "samp"])
    if not inside_literal:
        content = escape(
            text=content,
            escape_misc=escape_misc,
            escape_asterisks=escape_asterisks,
            escape_underscores=escape_underscores,
        )

    # Trailing whitespace is dropped when this node is the last one in an li,
    # or when it is directly followed by an embedded list.
    parent = el.parent
    if parent and parent.name == "li":
        follower = el.next_sibling
        if not follower or getattr(follower, "name", None) in {"ul", "ol"}:
            content = content.rstrip()

    return content
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert: Iterable[str] | None) -> bool:
|
|
213
|
+
if strip is not None:
|
|
214
|
+
return tag_name not in strip
|
|
215
|
+
if convert is not None:
|
|
216
|
+
return tag_name in convert
|
|
217
|
+
return True
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _reusable(values: Iterable[str] | None) -> Iterable[str] | None:
    """Return ``values`` in a form that supports repeated membership tests."""
    # An iterator is its own iter(); it would be exhausted by the first `in`
    # test, so snapshot it. Containers (and strings) are returned unchanged.
    if values is not None and iter(values) is values:
        return tuple(values)
    return values


def convert_to_markdown(
    html: str,
    *,
    soup: BeautifulSoup | None = None,
    autolinks: bool = True,
    bullets: str = "*+-",
    code_language: str = "",
    code_language_callback: Callable[[Any], str] | None = None,
    convert: Iterable[str] | None = None,
    default_title: bool = False,
    escape_asterisks: bool = True,
    escape_misc: bool = True,
    escape_underscores: bool = True,
    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
    keep_inline_images_in: Iterable[str] | None = None,
    newline_style: Literal["spaces", "backslash"] = SPACES,
    strip: Iterable[str] | None = None,
    strong_em_symbol: Literal["*", "_"] = ASTERISK,
    sub_symbol: str = "",
    sup_symbol: str = "",
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
) -> str:
    """Convert HTML to Markdown.

    Args:
        html: The HTML to convert. Only parsed when ``soup`` is not given.
        soup: An already parsed document to convert instead of ``html``. It is
            modified in place: whitespace-only text nodes inside list and table
            structures are removed during conversion.
        autolinks: Whether to use the ``<url>`` shortcut for links whose text equals their href.
        bullets: The bullet characters to use for unordered lists.
        code_language: The default code language to use.
        code_language_callback: A callback function to determine the code language.
        convert: The HTML elements to convert. Ignored when ``strip`` is given.
        default_title: Whether to use a link's href as its title when it has none.
        escape_asterisks: Whether to escape asterisks.
        escape_misc: Whether to escape miscellaneous characters.
        escape_underscores: Whether to escape underscores.
        heading_style: The style to use for headings.
        keep_inline_images_in: The tags to keep inline images in.
        newline_style: The style to use for newlines.
        strip: The HTML elements to strip.
        strong_em_symbol: The symbol to use for strong and emphasis.
        sub_symbol: The symbol to use for subscript.
        sup_symbol: The symbol to use for superscript.
        wrap: Whether to wrap text.
        wrap_width: The width to wrap text at.
        convert_as_inline: Whether to convert elements as inline.

    Returns:
        The Markdown.
    """
    if soup is None:
        soup = BeautifulSoup(html, "html.parser")

    # These filters are consulted once per element, so one-shot iterables
    # (generators, map objects, ...) must be materialized first.
    return _process_tag(
        autolinks=autolinks,
        bullets=bullets,
        code_language=code_language,
        code_language_callback=code_language_callback,
        convert=_reusable(convert),
        convert_as_inline=convert_as_inline,
        default_title=default_title,
        escape_asterisks=escape_asterisks,
        escape_misc=escape_misc,
        escape_underscores=escape_underscores,
        heading_style=heading_style,
        keep_inline_images_in=_reusable(keep_inline_images_in),
        newline_style=newline_style,
        strip=_reusable(strip),
        strong_em_symbol=strong_em_symbol,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        tag=soup,
        wrap=wrap,
        wrap_width=wrap_width,
    )
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from html_to_markdown.constants import line_beginning_re
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def chomp(text: str) -> tuple[str, str, str]:
    """Strip inline-tag text while remembering a leading/trailing space.

    Text inside an inline tag such as ``b``, ``a`` or ``em`` may begin or end
    with a space. The text is stripped, and a single space is reported as the
    prefix and/or the suffix so the caller can re-emit it where needed.

    Args:
        text: The text to chomp.

    Returns:
        A tuple containing the prefix, the suffix, and the stripped text.
    """
    # Only a literal space character counts; other whitespace is stripped
    # without producing a prefix or suffix.
    has_leading_space = bool(text) and text[0] == " "
    has_trailing_space = bool(text) and text[-1] == " "
    stripped = text.strip()
    return (
        " " if has_leading_space else "",
        " " if has_trailing_space else "",
        stripped,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
    """Backslash-escape characters in text according to the given flags.

    Args:
        text: The text to escape.
        escape_misc: Whether to escape miscellaneous characters (the set
            matched by the first pattern below, plus a ``.`` or ``)`` that
            directly follows a digit).
        escape_asterisks: Whether to escape asterisks.
        escape_underscores: Whether to escape underscores.

    Returns:
        The escaped text, or an empty string when ``text`` is falsy.
    """
    if not text:
        return ""

    escaped = text
    if escape_misc:
        # The punctuation pass runs first, then the "digit followed by . or )"
        # pass, matching the original order of substitutions.
        punctuation_escaped = re.sub(r"([\\&<`[>~#=+|-])", r"\\\1", escaped)
        escaped = re.sub(r"([0-9])([.)])", r"\1\\\2", punctuation_escaped)

    # Asterisks are handled before underscores, as in the original sequence.
    for enabled, symbol in ((escape_asterisks, "*"), (escape_underscores, "_")):
        if enabled:
            escaped = escaped.replace(symbol, "\\" + symbol)
    return escaped
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def indent(*, text: str, level: int) -> str:
    """Indent text with tab characters.

    Args:
        text: The text to indent.
        level: The level of indentation, i.e. the number of tab characters
            substituted at each match of ``line_beginning_re``.

    Returns:
        The indented text, or an empty string when ``text`` is falsy.
    """
    if not text:
        return ""
    # NOTE(review): line_beginning_re comes from html_to_markdown.constants and
    # presumably matches the start of every line — confirm there.
    tabs = "\t" * level
    return line_beginning_re.sub(tabs, text)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def underline(*, text: str, pad_char: str) -> str:
    """Underline text with a given character.

    Trailing whitespace is removed from the text, then a second line made of
    ``pad_char`` repeated once per character of the trimmed text is appended,
    followed by a blank line.

    Args:
        text: The text to underline.
        pad_char: The character to use for underlining.

    Returns:
        The underlined text, or an empty string when nothing remains after
        trimming.
    """
    trimmed = (text or "").rstrip()
    if not trimmed:
        return ""
    rule = pad_char * len(trimmed)
    return f"{trimmed}\n{rule}\n\n"
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Convert HTML to markdown
|
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: beautifulsoup,converter,html,markdown,text-processing
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# html_to_markdown
|
|
28
|
+
|
|
29
|
+
This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
|
|
30
|
+
Python 3.9 and offering strong typing.
|
|
31
|
+
|
|
32
|
+
### Differences from Markdownify
|
|
33
|
+
|
|
34
|
+
- The refactored codebase uses a strict functional approach - no classes are involved.
|
|
35
|
+
- There is full typing with strict MyPy adherence in place.
|
|
36
|
+
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
|
|
37
|
+
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
|
|
38
|
+
point versioning is no longer aligned.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```shell
|
|
43
|
+
pip install html_to_markdown
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
Convert some HTML to Markdown:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from html_to_markdown import convert_to_markdown
|
|
52
|
+
|
|
53
|
+
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Specify tags to exclude:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from html_to_markdown import convert_to_markdown
|
|
60
|
+
|
|
61
|
+
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
\...or specify the tags you want to include:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from html_to_markdown import convert_to_markdown
|
|
68
|
+
|
|
69
|
+
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
# Options
|
|
73
|
+
|
|
74
|
+
html_to_markdown supports the following options:
|
|
75
|
+
|
|
76
|
+
strip
|
|
77
|
+
|
|
78
|
+
: A list of tags to strip. This option can\'t be used with the
|
|
79
|
+
`convert` option.
|
|
80
|
+
|
|
81
|
+
convert
|
|
82
|
+
|
|
83
|
+
: A list of tags to convert. This option can\'t be used with the
|
|
84
|
+
`strip` option.
|
|
85
|
+
|
|
86
|
+
autolinks
|
|
87
|
+
|
|
88
|
+
: A boolean indicating whether the \"automatic link\" style should be
|
|
89
|
+
used when a `a` tag\'s contents match its href. Defaults to `True`.
|
|
90
|
+
|
|
91
|
+
default_title
|
|
92
|
+
|
|
93
|
+
: A boolean to enable setting the title of a link to its href, if no
|
|
94
|
+
title is given. Defaults to `False`.
|
|
95
|
+
|
|
96
|
+
heading_style
|
|
97
|
+
|
|
98
|
+
: Defines how headings should be converted. Accepted values are `ATX`,
|
|
99
|
+
`ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
|
|
100
|
+
`SETEXT`). Defaults to `UNDERLINED`.
|
|
101
|
+
|
|
102
|
+
bullets
|
|
103
|
+
|
|
104
|
+
: An iterable (string, list, or tuple) of bullet styles to be used. If
|
|
105
|
+
the iterable only contains one item, it will be used regardless of
|
|
106
|
+
how deeply lists are nested. Otherwise, the bullet will alternate
|
|
107
|
+
based on nesting level. Defaults to `'*+-'`.
|
|
108
|
+
|
|
109
|
+
strong_em_symbol
|
|
110
|
+
|
|
111
|
+
: In markdown, both `*` and `_` are used to encode **strong** or
|
|
112
|
+
*emphasized* texts. Either of these symbols can be chosen by the
|
|
113
|
+
options `ASTERISK` (default) or `UNDERSCORE` respectively.
|
|
114
|
+
|
|
115
|
+
sub_symbol, sup_symbol
|
|
116
|
+
|
|
117
|
+
: Define the chars that surround `<sub>` and `<sup>` text. Defaults to
|
|
118
|
+
an empty string, because this is non-standard behavior. Could be
|
|
119
|
+
something like `~` and `^` to result in `~sub~` and `^sup^`. If the
|
|
120
|
+
value starts with `<` and ends with `>`, it is treated as an HTML
|
|
121
|
+
tag and a `/` is inserted after the `<` in the string used after the
|
|
122
|
+
text; this allows specifying `<sub>` to use raw HTML in the output
|
|
123
|
+
for subscripts, for example.
|
|
124
|
+
|
|
125
|
+
newline_style
|
|
126
|
+
|
|
127
|
+
: Defines the style of marking linebreaks (`<br>`) in markdown. The
|
|
128
|
+
default value `SPACES` of this option will adopt the usual two
|
|
129
|
+
spaces and a newline, while `BACKSLASH` will convert a linebreak to
|
|
130
|
+
`\\n` (a backslash and a newline). While the latter convention is
|
|
131
|
+
non-standard, it is commonly preferred and supported by a lot of
|
|
132
|
+
interpreters.
|
|
133
|
+
|
|
134
|
+
code_language
|
|
135
|
+
|
|
136
|
+
: Defines the language that should be assumed for all `<pre>`
|
|
137
|
+
sections. Useful, if all code on a page is in the same programming
|
|
138
|
+
language and should be annotated with `python` or
|
|
139
|
+
similar. Defaults to `''` (empty
|
|
140
|
+
string) and can be any string.
|
|
141
|
+
|
|
142
|
+
code_language_callback
|
|
143
|
+
|
|
144
|
+
: When the HTML code contains `pre` tags that in some way provide the
|
|
145
|
+
code language, for example as class, this callback can be used to
|
|
146
|
+
extract the language from the tag and prefix it to the converted
|
|
147
|
+
`pre` tag. The callback gets one single argument, a BeautifulSoup
|
|
148
|
+
object, and returns a string containing the code language, or
|
|
149
|
+
`None`. An example to use the class name as code language could be:
|
|
150
|
+
|
|
151
|
+
def callback(el):
|
|
152
|
+
return el['class'][0] if el.has_attr('class') else None
|
|
153
|
+
|
|
154
|
+
Defaults to `None`.
|
|
155
|
+
|
|
156
|
+
escape_asterisks
|
|
157
|
+
|
|
158
|
+
: If set to `False`, do not escape `*` to `\*` in text. Defaults to
|
|
159
|
+
`True`.
|
|
160
|
+
|
|
161
|
+
escape_underscores
|
|
162
|
+
|
|
163
|
+
: If set to `False`, do not escape `_` to `\_` in text. Defaults to
|
|
164
|
+
`True`.
|
|
165
|
+
|
|
166
|
+
escape_misc
|
|
167
|
+
|
|
168
|
+
: If set to `False`, do not escape miscellaneous punctuation
|
|
169
|
+
characters that sometimes have Markdown significance in text.
|
|
170
|
+
Defaults to `True`.
|
|
171
|
+
|
|
172
|
+
keep_inline_images_in
|
|
173
|
+
|
|
174
|
+
: Images are converted to their alt-text when the images are located
|
|
175
|
+
inside headlines or table cells. If some inline images should be
|
|
176
|
+
converted to markdown images instead, this option can be set to a
|
|
177
|
+
list of parent tags that should be allowed to contain inline images,
|
|
178
|
+
for example `['td']`. Defaults to an empty list.
|
|
179
|
+
|
|
180
|
+
wrap, wrap_width
|
|
181
|
+
|
|
182
|
+
: If `wrap` is set to `True`, all text paragraphs are wrapped at
|
|
183
|
+
`wrap_width` characters. Defaults to `False` and `80`. Use with
|
|
184
|
+
`newline_style=BACKSLASH` to keep line breaks in paragraphs.
|
|
185
|
+
|
|
186
|
+
Options may be specified as keyword arguments to the `convert_to_markdown`
|
|
187
|
+
function.
|
|
188
|
+
|
|
189
|
+
# CLI
|
|
190
|
+
|
|
191
|
+
Use `html_to_markdown example.html > example.md` or pipe input from stdin
|
|
192
|
+
(`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
|
|
193
|
+
to see all available options. They are the same as listed above and take
|
|
194
|
+
the same arguments.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=_WXeqic-7b6hvidTXkPQwAfLa4YOEAEP-mOUXjx_25k,95
|
|
2
|
+
html_to_markdown/__main__.py,sha256=Wll22XKFmiNSIpdbGzC75b5_Unc3HYOTA6oXA414Tl8,4412
|
|
3
|
+
html_to_markdown/constants.py,sha256=vUjffZ0vFq56jbXF5bBNzomfJwgsp0TWqdUzhkp6bks,687
|
|
4
|
+
html_to_markdown/converters.py,sha256=q1wpzsYl-FRR9qbB983gAkem_-7mgYZ7hOgziofjIDM,12238
|
|
5
|
+
html_to_markdown/processing.py,sha256=9l3zq_kdyvU0TnTk5g4uuYI6Jbu1gY7NQ11u3IBKyFU,9029
|
|
6
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
8
|
+
html_to_markdown-1.0.0.dist-info/METADATA,sha256=0MObULuhTHiyvVcytDBN_liafpyFjgp5brgoWQYEglA,6478
|
|
9
|
+
html_to_markdown-1.0.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
10
|
+
html_to_markdown-1.0.0.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
|
|
11
|
+
html_to_markdown-1.0.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
|
|
12
|
+
html_to_markdown-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2012-2018 Matthew Tretter
|
|
4
|
+
Copyright 2024 Na'aman Hirschfeld
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|