html-to-markdown 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/__main__.py +8 -128
- html_to_markdown/cli.py +150 -0
- html_to_markdown/constants.py +8 -8
- html_to_markdown/converters.py +12 -10
- html_to_markdown/legacy.py +89 -0
- html_to_markdown/processing.py +70 -92
- html_to_markdown-1.2.0.dist-info/METADATA +102 -0
- html_to_markdown-1.2.0.dist-info/RECORD +13 -0
- {html_to_markdown-1.0.0.dist-info → html_to_markdown-1.2.0.dist-info}/WHEEL +1 -1
- html_to_markdown-1.0.0.dist-info/METADATA +0 -194
- html_to_markdown-1.0.0.dist-info/RECORD +0 -12
- html_to_markdown-1.0.0.dist-info/entry_points.txt +0 -2
- {html_to_markdown-1.0.0.dist-info → html_to_markdown-1.2.0.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/__init__.py
CHANGED
html_to_markdown/__main__.py
CHANGED
|
@@ -1,131 +1,11 @@
|
|
|
1
|
-
import argparse
|
|
2
1
|
import sys
|
|
3
2
|
|
|
4
|
-
from html_to_markdown import convert_to_markdown
|
|
5
|
-
from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def cli(argv: list[str]) -> None:
|
|
9
|
-
"""Command-line interface for html_to_markdown."""
|
|
10
|
-
parser = argparse.ArgumentParser(
|
|
11
|
-
prog="html_to_markdown",
|
|
12
|
-
description="Converts html to markdown.",
|
|
13
|
-
)
|
|
14
|
-
|
|
15
|
-
parser.add_argument(
|
|
16
|
-
"html",
|
|
17
|
-
nargs="?",
|
|
18
|
-
type=argparse.FileType("r"),
|
|
19
|
-
default=sys.stdin,
|
|
20
|
-
help="The html file to convert. Defaults to STDIN if not " "provided.",
|
|
21
|
-
)
|
|
22
|
-
parser.add_argument(
|
|
23
|
-
"-s",
|
|
24
|
-
"--strip",
|
|
25
|
-
nargs="*",
|
|
26
|
-
help="A list of tags to strip. This option can't be used with " "the --convert option.",
|
|
27
|
-
)
|
|
28
|
-
parser.add_argument(
|
|
29
|
-
"-c",
|
|
30
|
-
"--convert",
|
|
31
|
-
nargs="*",
|
|
32
|
-
help="A list of tags to convert. This option can't be used with " "the --strip option.",
|
|
33
|
-
)
|
|
34
|
-
parser.add_argument(
|
|
35
|
-
"-a",
|
|
36
|
-
"--autolinks",
|
|
37
|
-
action="store_true",
|
|
38
|
-
help="A boolean indicating whether the 'automatic link' style "
|
|
39
|
-
"should be used when a 'a' tag's contents match its href.",
|
|
40
|
-
)
|
|
41
|
-
parser.add_argument(
|
|
42
|
-
"--default-title",
|
|
43
|
-
action="store_false",
|
|
44
|
-
help="A boolean to enable setting the title of a link to its " "href, if no title is given.",
|
|
45
|
-
)
|
|
46
|
-
parser.add_argument(
|
|
47
|
-
"--heading-style",
|
|
48
|
-
default=UNDERLINED,
|
|
49
|
-
choices=(ATX, ATX_CLOSED, UNDERLINED),
|
|
50
|
-
help="Defines how headings should be converted.",
|
|
51
|
-
)
|
|
52
|
-
parser.add_argument(
|
|
53
|
-
"-b",
|
|
54
|
-
"--bullets",
|
|
55
|
-
default="*+-",
|
|
56
|
-
help="A string of bullet styles to use; the bullet will " "alternate based on nesting level.",
|
|
57
|
-
)
|
|
58
|
-
(
|
|
59
|
-
parser.add_argument(
|
|
60
|
-
"--strong-em-symbol",
|
|
61
|
-
default=ASTERISK,
|
|
62
|
-
choices=(ASTERISK, UNDERSCORE),
|
|
63
|
-
help="Use * or _ to convert strong and italics text",
|
|
64
|
-
),
|
|
65
|
-
)
|
|
66
|
-
parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
|
|
67
|
-
parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
|
|
68
|
-
parser.add_argument(
|
|
69
|
-
"--newline-style",
|
|
70
|
-
default=SPACES,
|
|
71
|
-
choices=(SPACES, BACKSLASH),
|
|
72
|
-
help="Defines the style of <br> conversions: two spaces "
|
|
73
|
-
"or backslash at the and of the line thet should break.",
|
|
74
|
-
)
|
|
75
|
-
parser.add_argument(
|
|
76
|
-
"--code-language", default="", help="Defines the language that should be assumed for all " "'<pre>' sections."
|
|
77
|
-
)
|
|
78
|
-
parser.add_argument(
|
|
79
|
-
"--no-escape-asterisks",
|
|
80
|
-
dest="escape_asterisks",
|
|
81
|
-
action="store_false",
|
|
82
|
-
help="Do not escape '*' to '\\*' in text.",
|
|
83
|
-
)
|
|
84
|
-
parser.add_argument(
|
|
85
|
-
"--no-escape-underscores",
|
|
86
|
-
dest="escape_underscores",
|
|
87
|
-
action="store_false",
|
|
88
|
-
help="Do not escape '_' to '\\_' in text.",
|
|
89
|
-
)
|
|
90
|
-
parser.add_argument(
|
|
91
|
-
"-i",
|
|
92
|
-
"--keep-inline-images-in",
|
|
93
|
-
nargs="*",
|
|
94
|
-
help="Images are converted to their alt-text when the images are "
|
|
95
|
-
"located inside headlines or table cells. If some inline images "
|
|
96
|
-
"should be converted to markdown images instead, this option can "
|
|
97
|
-
"be set to a list of parent tags that should be allowed to "
|
|
98
|
-
"contain inline images.",
|
|
99
|
-
)
|
|
100
|
-
parser.add_argument(
|
|
101
|
-
"-w", "--wrap", action="store_true", help="Wrap all text paragraphs at --wrap-width characters."
|
|
102
|
-
)
|
|
103
|
-
parser.add_argument("--wrap-width", type=int, default=80)
|
|
104
|
-
|
|
105
|
-
args = parser.parse_args(argv)
|
|
106
|
-
|
|
107
|
-
result = convert_to_markdown(
|
|
108
|
-
args.html.read(),
|
|
109
|
-
strip=args.strip,
|
|
110
|
-
convert=args.convert,
|
|
111
|
-
autolinks=args.autolinks,
|
|
112
|
-
default_title=args.default_title,
|
|
113
|
-
heading_style=args.heading_style,
|
|
114
|
-
bullets=args.bullets,
|
|
115
|
-
strong_em_symbol=args.strong_em_symbol,
|
|
116
|
-
sub_symbol=args.sub_symbol,
|
|
117
|
-
sup_symbol=args.sup_symbol,
|
|
118
|
-
newline_style=args.newline_style,
|
|
119
|
-
code_language=args.code_language,
|
|
120
|
-
escape_asterisks=args.escape_asterisks,
|
|
121
|
-
escape_underscores=args.escape_underscores,
|
|
122
|
-
keep_inline_images_in=args.keep_inline_images_in,
|
|
123
|
-
wrap=args.wrap,
|
|
124
|
-
wrap_width=args.wrap_width,
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
print(result) # noqa: T201
|
|
128
|
-
|
|
129
|
-
|
|
130
3
|
if __name__ == "__main__":
|
|
131
|
-
cli
|
|
4
|
+
from html_to_markdown.cli import main
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
result = main(sys.argv[1:])
|
|
8
|
+
print(result) # noqa: T201
|
|
9
|
+
except ValueError as e:
|
|
10
|
+
print(str(e), file=sys.stderr) # noqa: T201
|
|
11
|
+
sys.exit(1)
|
html_to_markdown/cli.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
def main(argv: list[str]) -> str:
    """Command-line entry point.

    Builds the argument parser, parses ``argv``, reads the HTML input and
    forwards it together with the parsed options to ``convert_to_markdown``.

    Args:
        argv: The command-line arguments, without the program name.

    Returns:
        The value returned by ``convert_to_markdown`` for the given input.
    """
    from argparse import ArgumentParser, FileType
    from sys import stdin

    from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
    from html_to_markdown.processing import convert_to_markdown

    parser = ArgumentParser(
        prog="html_to_markdown",
        description="Converts HTML to Markdown.",
    )

    parser.add_argument(
        "html",
        nargs="?",
        type=FileType("r"),
        default=stdin,
        help="The HTML file to convert. Defaults to STDIN if not provided.",
    )

    parser.add_argument(
        "-s",
        "--strip",
        nargs="*",
        help="A list of tags to strip from the conversion. Incompatible with the --convert option.",
    )

    parser.add_argument(
        "-c",
        "--convert",
        nargs="*",
        help="A list of HTML tags to explicitly convert. Incompatible with the --strip option.",
    )

    parser.add_argument(
        "-a",
        "--autolinks",
        action="store_true",
        help="Automatically convert anchor links where the content matches the href.",
    )

    # NOTE: ``store_false`` means ``args.default_title`` is True unless the
    # flag is passed, i.e. passing --default-title switches the option off.
    parser.add_argument(
        "--default-title",
        action="store_false",
        help="Use this flag to disable setting the link title to its href when no title is provided.",
    )

    parser.add_argument(
        "--heading-style",
        default=UNDERLINED,
        choices=(ATX, ATX_CLOSED, UNDERLINED),
        help="Defines the heading conversion style: 'atx', 'atx_closed', or 'underlined'. Defaults to 'underlined'.",
    )

    parser.add_argument(
        "-b",
        "--bullets",
        default="*+-",
        help="A string of bullet styles to use for list items. The style alternates based on nesting level. Defaults to '*+-'.",
    )

    parser.add_argument(
        "--strong-em-symbol",
        default=ASTERISK,
        choices=(ASTERISK, UNDERSCORE),
        help="Choose between '*' or '_' for strong and emphasized text. Defaults to '*'.",
    )

    parser.add_argument(
        "--sub-symbol",
        default="",
        help="Define the characters used to surround <sub> text. Defaults to empty.",
    )

    parser.add_argument(
        "--sup-symbol",
        default="",
        help="Define the characters used to surround <sup> text. Defaults to empty.",
    )

    parser.add_argument(
        "--newline-style",
        default=SPACES,
        choices=(SPACES, BACKSLASH),
        help="Specify the <br> conversion style: two spaces (default) or a backslash at the end of the line.",
    )

    parser.add_argument(
        "--code-language",
        default="",
        help="Specify the default language for code blocks inside <pre> tags. Defaults to empty.",
    )

    parser.add_argument(
        "--no-escape-asterisks",
        dest="escape_asterisks",
        action="store_false",
        help="Disable escaping of '*' characters in text to '\\*'.",
    )

    parser.add_argument(
        "--no-escape-underscores",
        dest="escape_underscores",
        action="store_false",
        help="Disable escaping of '_' characters in text to '\\_'.",
    )

    parser.add_argument(
        "-i",
        "--keep-inline-images-in",
        nargs="*",
        help="Specify parent tags where inline images should be preserved as images, rather than converted to alt-text. Defaults to None.",
    )

    parser.add_argument(
        "-w",
        "--wrap",
        action="store_true",
        help="Enable word wrapping for paragraphs at --wrap-width characters.",
    )

    parser.add_argument(
        "--wrap-width",
        type=int,
        default=80,
        help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
    )

    args = parser.parse_args(argv)

    # Read the whole input up front so the handle opened by FileType can be
    # released before converting; stdin is left open because we do not own it.
    try:
        html = args.html.read()
    finally:
        if args.html is not stdin:
            args.html.close()

    return convert_to_markdown(
        html,
        strip=args.strip,
        convert=args.convert,
        autolinks=args.autolinks,
        default_title=args.default_title,
        heading_style=args.heading_style,
        bullets=args.bullets,
        strong_em_symbol=args.strong_em_symbol,
        sub_symbol=args.sub_symbol,
        sup_symbol=args.sup_symbol,
        newline_style=args.newline_style,
        code_language=args.code_language,
        escape_asterisks=args.escape_asterisks,
        escape_underscores=args.escape_underscores,
        keep_inline_images_in=args.keep_inline_images_in,
        wrap=args.wrap,
        wrap_width=args.wrap_width,
    )
|
html_to_markdown/constants.py
CHANGED
|
@@ -2,17 +2,17 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
4
|
from re import Pattern
|
|
5
|
-
from typing import Final
|
|
5
|
+
from typing import Final
|
|
6
6
|
|
|
7
7
|
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
|
|
8
8
|
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
|
|
9
9
|
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
|
|
10
10
|
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
|
|
11
11
|
|
|
12
|
-
ASTERISK: Final
|
|
13
|
-
ATX: Final
|
|
14
|
-
ATX_CLOSED: Final
|
|
15
|
-
BACKSLASH: Final
|
|
16
|
-
UNDERLINED: Final
|
|
17
|
-
SPACES: Final
|
|
18
|
-
UNDERSCORE: Final
|
|
12
|
+
ASTERISK: Final = "*"
|
|
13
|
+
ATX: Final = "atx"
|
|
14
|
+
ATX_CLOSED: Final = "atx_closed"
|
|
15
|
+
BACKSLASH: Final = "backslash"
|
|
16
|
+
UNDERLINED: Final = "underlined"
|
|
17
|
+
SPACES: Final = "spaces"
|
|
18
|
+
UNDERSCORE: Final = "_"
|
html_to_markdown/converters.py
CHANGED
|
@@ -55,17 +55,19 @@ SupportedElements = Literal[
|
|
|
55
55
|
"kbd",
|
|
56
56
|
]
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
|
|
59
59
|
|
|
60
60
|
T = TypeVar("T")
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
64
|
-
"""
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
64
|
+
"""Create an inline converter for a markup pattern or tag.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
markup_prefix: The markup prefix to insert.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
A function that can be used to convert HTML to Markdown.
|
|
69
71
|
"""
|
|
70
72
|
|
|
71
73
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
@@ -147,9 +149,9 @@ def _convert_hn(
|
|
|
147
149
|
|
|
148
150
|
|
|
149
151
|
def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str] | None) -> str:
|
|
150
|
-
alt = tag.attrs.get("alt",
|
|
151
|
-
src = tag.attrs.get("src",
|
|
152
|
-
title = tag.attrs.get("title",
|
|
152
|
+
alt = tag.attrs.get("alt", "")
|
|
153
|
+
src = tag.attrs.get("src", "")
|
|
154
|
+
title = tag.attrs.get("title", "")
|
|
153
155
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
154
156
|
parent_name = tag.parent.name if tag.parent else ""
|
|
155
157
|
if convert_as_inline and parent_name not in (keep_inline_images_in or []):
|
|
@@ -295,7 +297,7 @@ def create_converters_map(
|
|
|
295
297
|
sup_symbol: str,
|
|
296
298
|
wrap: bool,
|
|
297
299
|
wrap_width: int,
|
|
298
|
-
) ->
|
|
300
|
+
) -> ConvertersMap:
|
|
299
301
|
"""Create a mapping of HTML elements to their corresponding conversion functions.
|
|
300
302
|
|
|
301
303
|
Args:
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
|
+
|
|
5
|
+
from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
|
|
6
|
+
from html_to_markdown.converters import create_converters_map
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import Callable, Iterable
|
|
10
|
+
|
|
11
|
+
from bs4 import Tag
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _create_legacy_class(
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Tag], str] | None,
    default_title: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> type:
    """Build the legacy ``Markdownify`` class from a converters map.

    Deprecated: Use the new hooks api instead.

    Every argument is forwarded unchanged, by keyword, to
    ``create_converters_map``. Each entry of the resulting mapping becomes a
    class attribute whose name is the mapping key with one leading
    underscore removed (if present).

    Args:
        autolinks: Forwarded to ``create_converters_map``.
        bullets: Forwarded to ``create_converters_map``.
        code_language: Forwarded to ``create_converters_map``.
        code_language_callback: Forwarded to ``create_converters_map``.
        default_title: Forwarded to ``create_converters_map``.
        heading_style: Forwarded to ``create_converters_map``.
        keep_inline_images_in: Forwarded to ``create_converters_map``.
        newline_style: Forwarded to ``create_converters_map``.
        strong_em_symbol: Forwarded to ``create_converters_map``.
        sub_symbol: Forwarded to ``create_converters_map``.
        sup_symbol: Forwarded to ``create_converters_map``.
        wrap: Forwarded to ``create_converters_map``.
        wrap_width: Forwarded to ``create_converters_map``.

    Returns:
        A new class named ``Markdownify`` with no base classes whose
        attributes are the converter callables.
    """
    converters = create_converters_map(
        autolinks=autolinks,
        bullets=bullets,
        code_language=code_language,
        code_language_callback=code_language_callback,
        default_title=default_title,
        heading_style=heading_style,
        keep_inline_images_in=keep_inline_images_in,
        newline_style=newline_style,
        strong_em_symbol=strong_em_symbol,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        wrap=wrap,
        wrap_width=wrap_width,
    )

    # Class namespace: one attribute per converter, keyed by the public name.
    namespace = {}
    for element_name, converter in converters.items():
        namespace[element_name.removeprefix("_")] = converter

    return type("Markdownify", (), namespace)


# Legacy class-style API, created once at import time with the values below.
Markdownify = _create_legacy_class(
    autolinks=True,
    bullets="*+-",
    code_language="",
    code_language_callback=None,
    default_title=False,
    heading_style=UNDERLINED,
    keep_inline_images_in=None,
    newline_style=SPACES,
    strong_em_symbol=ASTERISK,
    sub_symbol="",
    sup_symbol="",
    wrap=False,
    wrap_width=80,
)
|
html_to_markdown/processing.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from itertools import chain
|
|
3
4
|
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
4
5
|
|
|
5
6
|
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
|
|
@@ -11,7 +12,7 @@ from html_to_markdown.constants import (
|
|
|
11
12
|
html_heading_re,
|
|
12
13
|
whitespace_re,
|
|
13
14
|
)
|
|
14
|
-
from html_to_markdown.converters import
|
|
15
|
+
from html_to_markdown.converters import ConvertersMap, create_converters_map
|
|
15
16
|
from html_to_markdown.utils import escape
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
@@ -76,48 +77,21 @@ def _is_nested_tag(el: PageElement) -> bool:
|
|
|
76
77
|
|
|
77
78
|
def _process_tag(
|
|
78
79
|
tag: Tag,
|
|
80
|
+
converters_map: ConvertersMap,
|
|
79
81
|
*,
|
|
80
|
-
|
|
81
|
-
bullets: str,
|
|
82
|
-
code_language: str,
|
|
83
|
-
code_language_callback: Callable[[Any], str] | None,
|
|
84
|
-
convert: Iterable[str] | None,
|
|
82
|
+
convert: set[str] | None,
|
|
85
83
|
convert_as_inline: bool = False,
|
|
86
|
-
converters_map: ConvertsMap | None = None,
|
|
87
|
-
default_title: bool,
|
|
88
84
|
escape_asterisks: bool,
|
|
89
85
|
escape_misc: bool,
|
|
90
86
|
escape_underscores: bool,
|
|
91
|
-
|
|
92
|
-
keep_inline_images_in: Iterable[str] | None,
|
|
93
|
-
newline_style: str,
|
|
94
|
-
strip: Iterable[str] | None,
|
|
95
|
-
strong_em_symbol: str,
|
|
96
|
-
sub_symbol: str,
|
|
97
|
-
sup_symbol: str,
|
|
98
|
-
wrap: bool,
|
|
99
|
-
wrap_width: int,
|
|
87
|
+
strip: set[str] | None,
|
|
100
88
|
) -> str:
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
autolinks=autolinks,
|
|
104
|
-
bullets=bullets,
|
|
105
|
-
code_language=code_language,
|
|
106
|
-
code_language_callback=code_language_callback,
|
|
107
|
-
default_title=default_title,
|
|
108
|
-
heading_style=heading_style,
|
|
109
|
-
keep_inline_images_in=keep_inline_images_in,
|
|
110
|
-
newline_style=newline_style,
|
|
111
|
-
strong_em_symbol=strong_em_symbol,
|
|
112
|
-
sub_symbol=sub_symbol,
|
|
113
|
-
sup_symbol=sup_symbol,
|
|
114
|
-
wrap=wrap,
|
|
115
|
-
wrap_width=wrap_width,
|
|
116
|
-
)
|
|
117
|
-
|
|
89
|
+
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
90
|
+
tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
|
|
118
91
|
text = ""
|
|
92
|
+
|
|
119
93
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
120
|
-
is_cell =
|
|
94
|
+
is_cell = tag_name in {"td", "th"}
|
|
121
95
|
convert_children_as_inline = convert_as_inline or is_heading or is_cell
|
|
122
96
|
|
|
123
97
|
if _is_nested_tag(tag):
|
|
@@ -141,32 +115,17 @@ def _process_tag(
|
|
|
141
115
|
)
|
|
142
116
|
elif isinstance(el, Tag):
|
|
143
117
|
text += _process_tag(
|
|
144
|
-
|
|
118
|
+
el,
|
|
119
|
+
converters_map,
|
|
145
120
|
convert_as_inline=convert_children_as_inline,
|
|
146
|
-
strip=strip,
|
|
147
121
|
convert=convert,
|
|
148
|
-
escape_misc=escape_misc,
|
|
149
122
|
escape_asterisks=escape_asterisks,
|
|
123
|
+
escape_misc=escape_misc,
|
|
150
124
|
escape_underscores=escape_underscores,
|
|
151
|
-
|
|
152
|
-
autolinks=autolinks,
|
|
153
|
-
bullets=bullets,
|
|
154
|
-
code_language=code_language,
|
|
155
|
-
code_language_callback=code_language_callback,
|
|
156
|
-
default_title=default_title,
|
|
157
|
-
heading_style=heading_style,
|
|
158
|
-
keep_inline_images_in=keep_inline_images_in,
|
|
159
|
-
newline_style=newline_style,
|
|
160
|
-
strong_em_symbol=strong_em_symbol,
|
|
161
|
-
sub_symbol=sub_symbol,
|
|
162
|
-
sup_symbol=sup_symbol,
|
|
163
|
-
wrap=wrap,
|
|
164
|
-
wrap_width=wrap_width,
|
|
125
|
+
strip=strip,
|
|
165
126
|
)
|
|
166
127
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
|
|
128
|
+
if tag_name and should_convert_tag:
|
|
170
129
|
return converters_map[tag_name]( # type: ignore[call-arg]
|
|
171
130
|
tag=tag, text=text, convert_as_inline=convert_as_inline
|
|
172
131
|
)
|
|
@@ -209,7 +168,7 @@ def _process_text(
|
|
|
209
168
|
return text
|
|
210
169
|
|
|
211
170
|
|
|
212
|
-
def _should_convert_tag(*, tag_name: str, strip:
|
|
171
|
+
def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
|
|
213
172
|
if strip is not None:
|
|
214
173
|
return tag_name not in strip
|
|
215
174
|
if convert is not None:
|
|
@@ -217,15 +176,22 @@ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert:
|
|
|
217
176
|
return True
|
|
218
177
|
|
|
219
178
|
|
|
179
|
+
def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
180
|
+
if value is None:
|
|
181
|
+
return None
|
|
182
|
+
if isinstance(value, str):
|
|
183
|
+
return set(",".split(value))
|
|
184
|
+
return {*chain(*[v.split(",") for v in value])}
|
|
185
|
+
|
|
186
|
+
|
|
220
187
|
def convert_to_markdown(
|
|
221
|
-
|
|
188
|
+
source: str | BeautifulSoup,
|
|
222
189
|
*,
|
|
223
|
-
soup: BeautifulSoup | None = None,
|
|
224
190
|
autolinks: bool = True,
|
|
225
191
|
bullets: str = "*+-",
|
|
226
192
|
code_language: str = "",
|
|
227
193
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
228
|
-
convert: Iterable[str] | None = None,
|
|
194
|
+
convert: str | Iterable[str] | None = None,
|
|
229
195
|
default_title: bool = False,
|
|
230
196
|
escape_asterisks: bool = True,
|
|
231
197
|
escape_misc: bool = True,
|
|
@@ -233,7 +199,7 @@ def convert_to_markdown(
|
|
|
233
199
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
234
200
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
235
201
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
236
|
-
strip: Iterable[str] | None = None,
|
|
202
|
+
strip: str | Iterable[str] | None = None,
|
|
237
203
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
238
204
|
sub_symbol: str = "",
|
|
239
205
|
sup_symbol: str = "",
|
|
@@ -244,55 +210,67 @@ def convert_to_markdown(
|
|
|
244
210
|
"""Convert HTML to Markdown.
|
|
245
211
|
|
|
246
212
|
Args:
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
213
|
+
source: An HTML document or an initialized instance of BeautifulSoup.
|
|
214
|
+
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
215
|
+
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
216
|
+
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
217
|
+
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
218
|
+
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
219
|
+
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
220
|
+
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
221
|
+
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
222
|
+
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
223
|
+
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
224
|
+
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
225
|
+
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
226
|
+
strip: Tags to strip from the output. Defaults to None.
|
|
227
|
+
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
228
|
+
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
229
|
+
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
230
|
+
wrap: Wrap text to the specified width. Defaults to False.
|
|
231
|
+
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
232
|
+
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
233
|
+
|
|
234
|
+
Raises:
|
|
235
|
+
ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
|
|
268
236
|
|
|
269
237
|
Returns:
|
|
270
|
-
|
|
238
|
+
str: A string of Markdown-formatted text converted from the given HTML.
|
|
271
239
|
"""
|
|
272
|
-
if
|
|
240
|
+
if isinstance(source, str):
|
|
273
241
|
from bs4 import BeautifulSoup
|
|
274
242
|
|
|
275
|
-
|
|
243
|
+
if "".join(source.split("\n")):
|
|
244
|
+
source = BeautifulSoup(source, "html.parser")
|
|
245
|
+
else:
|
|
246
|
+
raise ValueError("The input HTML is empty.")
|
|
276
247
|
|
|
277
|
-
|
|
248
|
+
if strip is not None and convert is not None:
|
|
249
|
+
raise ValueError("Only one of 'strip' and 'convert' can be specified.")
|
|
250
|
+
|
|
251
|
+
converters_map = create_converters_map(
|
|
278
252
|
autolinks=autolinks,
|
|
279
253
|
bullets=bullets,
|
|
280
254
|
code_language=code_language,
|
|
281
255
|
code_language_callback=code_language_callback,
|
|
282
|
-
convert=convert,
|
|
283
|
-
convert_as_inline=convert_as_inline,
|
|
284
256
|
default_title=default_title,
|
|
285
|
-
escape_asterisks=escape_asterisks,
|
|
286
|
-
escape_misc=escape_misc,
|
|
287
|
-
escape_underscores=escape_underscores,
|
|
288
257
|
heading_style=heading_style,
|
|
289
258
|
keep_inline_images_in=keep_inline_images_in,
|
|
290
259
|
newline_style=newline_style,
|
|
291
|
-
strip=strip,
|
|
292
260
|
strong_em_symbol=strong_em_symbol,
|
|
293
261
|
sub_symbol=sub_symbol,
|
|
294
262
|
sup_symbol=sup_symbol,
|
|
295
|
-
tag=soup,
|
|
296
263
|
wrap=wrap,
|
|
297
264
|
wrap_width=wrap_width,
|
|
298
265
|
)
|
|
266
|
+
|
|
267
|
+
return _process_tag(
|
|
268
|
+
source,
|
|
269
|
+
converters_map,
|
|
270
|
+
convert=_as_optional_set(convert),
|
|
271
|
+
convert_as_inline=convert_as_inline,
|
|
272
|
+
escape_asterisks=escape_asterisks,
|
|
273
|
+
escape_misc=escape_misc,
|
|
274
|
+
escape_underscores=escape_underscores,
|
|
275
|
+
strip=_as_optional_set(strip),
|
|
276
|
+
)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html-to-markdown
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: Convert HTML to markdown
|
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: converter,html,markdown,text-extraction,text-processing
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.9
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# html_to_markdown
|
|
29
|
+
|
|
30
|
+
This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
|
|
31
|
+
Python 3.9 and above.
|
|
32
|
+
|
|
33
|
+
### Differences from Markdownify
|
|
34
|
+
|
|
35
|
+
- The refactored codebase uses a strict functional approach - no classes are involved.
|
|
36
|
+
- There is full typing with strict MyPy adherence and a py.typed file included.
|
|
37
|
+
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
|
|
38
|
+
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
|
|
39
|
+
point versioning is no longer aligned.
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```shell
|
|
44
|
+
pip install html_to_markdown
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
Convert an HTML string to Markdown:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from html_to_markdown import convert_to_markdown
|
|
53
|
+
|
|
54
|
+
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Or pass a pre-configured instance of `BeautifulSoup`:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from bs4 import BeautifulSoup
|
|
61
|
+
from html_to_markdown import convert_to_markdown
|
|
62
|
+
|
|
63
|
+
soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
|
|
64
|
+
|
|
65
|
+
convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Options
|
|
69
|
+
|
|
70
|
+
The `convert_to_markdown` function accepts the following kwargs:
|
|
71
|
+
|
|
72
|
+
- autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
73
|
+
- bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
|
|
74
|
+
- code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
75
|
+
- code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
|
|
76
|
+
- convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
77
|
+
- default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
78
|
+
- escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
|
|
79
|
+
- escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
80
|
+
- escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
|
|
81
|
+
- heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to "
|
|
82
|
+
underlined".
|
|
83
|
+
- keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
|
|
84
|
+
- newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
|
|
85
|
+
- strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
|
|
86
|
+
- strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
|
|
87
|
+
- sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
|
|
88
|
+
- sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
|
|
89
|
+
- wrap (bool): Wrap text to the specified width. Defaults to False.
|
|
90
|
+
- wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
|
|
91
|
+
- convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
92
|
+
|
|
93
|
+
## CLI
|
|
94
|
+
|
|
95
|
+
For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
|
|
96
|
+
pipe input from stdin:
|
|
97
|
+
|
|
98
|
+
```shell
|
|
99
|
+
cat example.html | html_to_markdown > example.md
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=cXm4YOyrAp2HKHMDfnVA5e75zg6wdqpyXugjBYvBMFc,143
|
|
2
|
+
html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
|
|
3
|
+
html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
|
|
4
|
+
html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
|
|
5
|
+
html_to_markdown/converters.py,sha256=hW4RqAbgx0tdTzfUSvAGQg1OgQUmHL1cekZtJLFq_Ns,12080
|
|
6
|
+
html_to_markdown/legacy.py,sha256=vL-MVKPXOue-JJafXFtmGcVIPylwmPOly0CELTSzWRQ,2773
|
|
7
|
+
html_to_markdown/processing.py,sha256=L1wZwUm7WA8wN4GA5zjCStwACb-8S2scQZPbzeHgdY8,8951
|
|
8
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
10
|
+
html_to_markdown-1.2.0.dist-info/METADATA,sha256=Dg2ZibNWNW_GyszXG2bxT-oOtOJc8iryVxlLn38eMww,4709
|
|
11
|
+
html_to_markdown-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
12
|
+
html_to_markdown-1.2.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
|
|
13
|
+
html_to_markdown-1.2.0.dist-info/RECORD,,
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: html-to-markdown
|
|
3
|
-
Version: 1.0.0
|
|
4
|
-
Summary: Convert HTML to markdown
|
|
5
|
-
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Keywords: beautifulsoup,converter,html,markdown,text-processing
|
|
9
|
-
Classifier: Intended Audience :: Developers
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
-
Classifier: Topic :: Text Processing
|
|
18
|
-
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
-
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
-
Classifier: Topic :: Utilities
|
|
22
|
-
Classifier: Typing :: Typed
|
|
23
|
-
Requires-Python: >=3.9
|
|
24
|
-
Requires-Dist: beautifulsoup4>=4.12.3
|
|
25
|
-
Description-Content-Type: text/markdown
|
|
26
|
-
|
|
27
|
-
# html_to_markdown
|
|
28
|
-
|
|
29
|
-
This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
|
|
30
|
-
Python 3.9 and offering strong typing.
|
|
31
|
-
|
|
32
|
-
### Differences from Markdownify
|
|
33
|
-
|
|
34
|
-
- The refactored codebase uses a strict functional approach - no classes are involved.
|
|
35
|
-
- There is full typing with strict MyPy adherence in place.
|
|
36
|
-
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
|
|
37
|
-
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
|
|
38
|
-
point versioning is no longer aligned.
|
|
39
|
-
|
|
40
|
-
## Installation
|
|
41
|
-
|
|
42
|
-
```shell
|
|
43
|
-
pip install html_to_markdown
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
## Usage
|
|
47
|
-
|
|
48
|
-
Convert some HTML to Markdown:
|
|
49
|
-
|
|
50
|
-
```python
|
|
51
|
-
from html_to_markdown import convert_to_markdown
|
|
52
|
-
|
|
53
|
-
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
Specify tags to exclude:
|
|
57
|
-
|
|
58
|
-
```python
|
|
59
|
-
from html_to_markdown import convert_to_markdown
|
|
60
|
-
|
|
61
|
-
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
\...or specify the tags you want to include:
|
|
65
|
-
|
|
66
|
-
```python
|
|
67
|
-
from html_to_markdown import convert_to_markdown
|
|
68
|
-
|
|
69
|
-
convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
# Options
|
|
73
|
-
|
|
74
|
-
html_to_markdown supports the following options:
|
|
75
|
-
|
|
76
|
-
strip
|
|
77
|
-
|
|
78
|
-
: A list of tags to strip. This option can\'t be used with the
|
|
79
|
-
`convert` option.
|
|
80
|
-
|
|
81
|
-
convert
|
|
82
|
-
|
|
83
|
-
: A list of tags to convert. This option can\'t be used with the
|
|
84
|
-
`strip` option.
|
|
85
|
-
|
|
86
|
-
autolinks
|
|
87
|
-
|
|
88
|
-
: A boolean indicating whether the \"automatic link\" style should be
|
|
89
|
-
used when a `a` tag\'s contents match its href. Defaults to `True`.
|
|
90
|
-
|
|
91
|
-
default_title
|
|
92
|
-
|
|
93
|
-
: A boolean to enable setting the title of a link to its href, if no
|
|
94
|
-
title is given. Defaults to `False`.
|
|
95
|
-
|
|
96
|
-
heading_style
|
|
97
|
-
|
|
98
|
-
: Defines how headings should be converted. Accepted values are `ATX`,
|
|
99
|
-
`ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
|
|
100
|
-
`SETEXT`). Defaults to `UNDERLINED`.
|
|
101
|
-
|
|
102
|
-
bullets
|
|
103
|
-
|
|
104
|
-
: An iterable (string, list, or tuple) of bullet styles to be used. If
|
|
105
|
-
the iterable only contains one item, it will be used regardless of
|
|
106
|
-
how deeply lists are nested. Otherwise, the bullet will alternate
|
|
107
|
-
based on nesting level. Defaults to `'*+-'`.
|
|
108
|
-
|
|
109
|
-
strong_em_symbol
|
|
110
|
-
|
|
111
|
-
: In markdown, both `*` and `_` are used to encode **strong** or
|
|
112
|
-
*emphasized* texts. Either of these symbols can be chosen by the
|
|
113
|
-
options `ASTERISK` (default) or `UNDERSCORE` respectively.
|
|
114
|
-
|
|
115
|
-
sub_symbol, sup_symbol
|
|
116
|
-
|
|
117
|
-
: Define the chars that surround `<sub>` and `<sup>` text. Defaults to
|
|
118
|
-
an empty string, because this is non-standard behavior. Could be
|
|
119
|
-
something like `~` and `^` to result in `~sub~` and `^sup^`. If the
|
|
120
|
-
value starts with `<` and ends with `>`, it is treated as an HTML
|
|
121
|
-
tag and a `/` is inserted after the `<` in the string used after the
|
|
122
|
-
text; this allows specifying `<sub>` to use raw HTML in the output
|
|
123
|
-
for subscripts, for example.
|
|
124
|
-
|
|
125
|
-
newline_style
|
|
126
|
-
|
|
127
|
-
: Defines the style of marking linebreaks (`<br>`) in markdown. The
|
|
128
|
-
default value `SPACES` of this option will adopt the usual two
|
|
129
|
-
spaces and a newline, while `BACKSLASH` will convert a linebreak to
|
|
130
|
-
`\\n` (a backslash and a newline). While the latter convention is
|
|
131
|
-
non-standard, it is commonly preferred and supported by a lot of
|
|
132
|
-
interpreters.
|
|
133
|
-
|
|
134
|
-
code_language
|
|
135
|
-
|
|
136
|
-
: Defines the language that should be assumed for all `<pre>`
|
|
137
|
-
sections. Useful, if all code on a page is in the same programming
|
|
138
|
-
language and should be annotated with `` ```python `` or
|
|
139
|
-
similar. Defaults to `''` (empty
|
|
140
|
-
string) and can be any string.
|
|
141
|
-
|
|
142
|
-
code_language_callback
|
|
143
|
-
|
|
144
|
-
: When the HTML code contains `pre` tags that in some way provide the
|
|
145
|
-
code language, for example as class, this callback can be used to
|
|
146
|
-
extract the language from the tag and prefix it to the converted
|
|
147
|
-
`pre` tag. The callback gets one single argument, a BeautifulSoup
|
|
148
|
-
object, and returns a string containing the code language, or
|
|
149
|
-
`None`. An example to use the class name as code language could be:
|
|
150
|
-
|
|
151
|
-
def callback(el):
|
|
152
|
-
return el['class'][0] if el.has_attr('class') else None
|
|
153
|
-
|
|
154
|
-
Defaults to `None`.
|
|
155
|
-
|
|
156
|
-
escape_asterisks
|
|
157
|
-
|
|
158
|
-
: If set to `False`, do not escape `*` to `\*` in text. Defaults to
|
|
159
|
-
`True`.
|
|
160
|
-
|
|
161
|
-
escape_underscores
|
|
162
|
-
|
|
163
|
-
: If set to `False`, do not escape `_` to `\_` in text. Defaults to
|
|
164
|
-
`True`.
|
|
165
|
-
|
|
166
|
-
escape_misc
|
|
167
|
-
|
|
168
|
-
: If set to `False`, do not escape miscellaneous punctuation
|
|
169
|
-
characters that sometimes have Markdown significance in text.
|
|
170
|
-
Defaults to `True`.
|
|
171
|
-
|
|
172
|
-
keep_inline_images_in
|
|
173
|
-
|
|
174
|
-
: Images are converted to their alt-text when the images are located
|
|
175
|
-
inside headlines or table cells. If some inline images should be
|
|
176
|
-
converted to markdown images instead, this option can be set to a
|
|
177
|
-
list of parent tags that should be allowed to contain inline images,
|
|
178
|
-
for example `['td']`. Defaults to an empty list.
|
|
179
|
-
|
|
180
|
-
wrap, wrap_width
|
|
181
|
-
|
|
182
|
-
: If `wrap` is set to `True`, all text paragraphs are wrapped at
|
|
183
|
-
`wrap_width` characters. Defaults to `False` and `80`. Use with
|
|
184
|
-
`newline_style=BACKSLASH` to keep line breaks in paragraphs.
|
|
185
|
-
|
|
186
|
-
Options may be specified as kwargs to the `convert_to_markdown` function, or as
|
|
187
|
-
a nested `Options` class in `MarkdownConverter` subclasses.
|
|
188
|
-
|
|
189
|
-
# CLI
|
|
190
|
-
|
|
191
|
-
Use `html_to_markdown example.html > example.md` or pipe input from stdin
|
|
192
|
-
(`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
|
|
193
|
-
to see all available options. They are the same as listed above and take
|
|
194
|
-
the same arguments.
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=_WXeqic-7b6hvidTXkPQwAfLa4YOEAEP-mOUXjx_25k,95
|
|
2
|
-
html_to_markdown/__main__.py,sha256=Wll22XKFmiNSIpdbGzC75b5_Unc3HYOTA6oXA414Tl8,4412
|
|
3
|
-
html_to_markdown/constants.py,sha256=vUjffZ0vFq56jbXF5bBNzomfJwgsp0TWqdUzhkp6bks,687
|
|
4
|
-
html_to_markdown/converters.py,sha256=q1wpzsYl-FRR9qbB983gAkem_-7mgYZ7hOgziofjIDM,12238
|
|
5
|
-
html_to_markdown/processing.py,sha256=9l3zq_kdyvU0TnTk5g4uuYI6Jbu1gY7NQ11u3IBKyFU,9029
|
|
6
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
8
|
-
html_to_markdown-1.0.0.dist-info/METADATA,sha256=0MObULuhTHiyvVcytDBN_liafpyFjgp5brgoWQYEglA,6478
|
|
9
|
-
html_to_markdown-1.0.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
10
|
-
html_to_markdown-1.0.0.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
|
|
11
|
-
html_to_markdown-1.0.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
|
|
12
|
-
html_to_markdown-1.0.0.dist-info/RECORD,,
|
|
File without changes
|