html-to-markdown 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,3 +1,5 @@
1
1
  from html_to_markdown.processing import convert_to_markdown
2
2
 
3
- __all__ = ["convert_to_markdown"]
3
+ from .legacy import Markdownify
4
+
5
+ __all__ = ["Markdownify", "convert_to_markdown"]
@@ -1,131 +1,11 @@
1
- import argparse
2
1
  import sys
3
2
 
4
- from html_to_markdown import convert_to_markdown
5
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
6
-
7
-
8
- def cli(argv: list[str]) -> None:
9
- """Command-line interface for html_to_markdown."""
10
- parser = argparse.ArgumentParser(
11
- prog="html_to_markdown",
12
- description="Converts html to markdown.",
13
- )
14
-
15
- parser.add_argument(
16
- "html",
17
- nargs="?",
18
- type=argparse.FileType("r"),
19
- default=sys.stdin,
20
- help="The html file to convert. Defaults to STDIN if not " "provided.",
21
- )
22
- parser.add_argument(
23
- "-s",
24
- "--strip",
25
- nargs="*",
26
- help="A list of tags to strip. This option can't be used with " "the --convert option.",
27
- )
28
- parser.add_argument(
29
- "-c",
30
- "--convert",
31
- nargs="*",
32
- help="A list of tags to convert. This option can't be used with " "the --strip option.",
33
- )
34
- parser.add_argument(
35
- "-a",
36
- "--autolinks",
37
- action="store_true",
38
- help="A boolean indicating whether the 'automatic link' style "
39
- "should be used when a 'a' tag's contents match its href.",
40
- )
41
- parser.add_argument(
42
- "--default-title",
43
- action="store_false",
44
- help="A boolean to enable setting the title of a link to its " "href, if no title is given.",
45
- )
46
- parser.add_argument(
47
- "--heading-style",
48
- default=UNDERLINED,
49
- choices=(ATX, ATX_CLOSED, UNDERLINED),
50
- help="Defines how headings should be converted.",
51
- )
52
- parser.add_argument(
53
- "-b",
54
- "--bullets",
55
- default="*+-",
56
- help="A string of bullet styles to use; the bullet will " "alternate based on nesting level.",
57
- )
58
- (
59
- parser.add_argument(
60
- "--strong-em-symbol",
61
- default=ASTERISK,
62
- choices=(ASTERISK, UNDERSCORE),
63
- help="Use * or _ to convert strong and italics text",
64
- ),
65
- )
66
- parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
67
- parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
68
- parser.add_argument(
69
- "--newline-style",
70
- default=SPACES,
71
- choices=(SPACES, BACKSLASH),
72
- help="Defines the style of <br> conversions: two spaces "
73
- "or backslash at the and of the line thet should break.",
74
- )
75
- parser.add_argument(
76
- "--code-language", default="", help="Defines the language that should be assumed for all " "'<pre>' sections."
77
- )
78
- parser.add_argument(
79
- "--no-escape-asterisks",
80
- dest="escape_asterisks",
81
- action="store_false",
82
- help="Do not escape '*' to '\\*' in text.",
83
- )
84
- parser.add_argument(
85
- "--no-escape-underscores",
86
- dest="escape_underscores",
87
- action="store_false",
88
- help="Do not escape '_' to '\\_' in text.",
89
- )
90
- parser.add_argument(
91
- "-i",
92
- "--keep-inline-images-in",
93
- nargs="*",
94
- help="Images are converted to their alt-text when the images are "
95
- "located inside headlines or table cells. If some inline images "
96
- "should be converted to markdown images instead, this option can "
97
- "be set to a list of parent tags that should be allowed to "
98
- "contain inline images.",
99
- )
100
- parser.add_argument(
101
- "-w", "--wrap", action="store_true", help="Wrap all text paragraphs at --wrap-width characters."
102
- )
103
- parser.add_argument("--wrap-width", type=int, default=80)
104
-
105
- args = parser.parse_args(argv)
106
-
107
- result = convert_to_markdown(
108
- args.html.read(),
109
- strip=args.strip,
110
- convert=args.convert,
111
- autolinks=args.autolinks,
112
- default_title=args.default_title,
113
- heading_style=args.heading_style,
114
- bullets=args.bullets,
115
- strong_em_symbol=args.strong_em_symbol,
116
- sub_symbol=args.sub_symbol,
117
- sup_symbol=args.sup_symbol,
118
- newline_style=args.newline_style,
119
- code_language=args.code_language,
120
- escape_asterisks=args.escape_asterisks,
121
- escape_underscores=args.escape_underscores,
122
- keep_inline_images_in=args.keep_inline_images_in,
123
- wrap=args.wrap,
124
- wrap_width=args.wrap_width,
125
- )
126
-
127
- print(result) # noqa: T201
128
-
129
-
130
3
  if __name__ == "__main__":
131
- cli(sys.argv[1:])
4
+ from html_to_markdown.cli import main
5
+
6
+ try:
7
+ result = main(sys.argv[1:])
8
+ print(result) # noqa: T201
9
+ except ValueError as e:
10
+ print(str(e), file=sys.stderr) # noqa: T201
11
+ sys.exit(1)
@@ -0,0 +1,150 @@
1
+ def main(argv: list[str]) -> str:
2
+ """Command-line entry point."""
3
+ from argparse import ArgumentParser, FileType
4
+ from sys import stdin
5
+
6
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
7
+ from html_to_markdown.processing import convert_to_markdown
8
+
9
+ parser = ArgumentParser(
10
+ prog="html_to_markdown",
11
+ description="Converts HTML to Markdown.",
12
+ )
13
+
14
+ parser.add_argument(
15
+ "html",
16
+ nargs="?",
17
+ type=FileType("r"),
18
+ default=stdin,
19
+ help="The HTML file to convert. Defaults to STDIN if not provided.",
20
+ )
21
+
22
+ parser.add_argument(
23
+ "-s",
24
+ "--strip",
25
+ nargs="*",
26
+ help="A list of tags to strip from the conversion. Incompatible with the --convert option.",
27
+ )
28
+
29
+ parser.add_argument(
30
+ "-c",
31
+ "--convert",
32
+ nargs="*",
33
+ help="A list of HTML tags to explicitly convert. Incompatible with the --strip option.",
34
+ )
35
+
36
+ parser.add_argument(
37
+ "-a",
38
+ "--autolinks",
39
+ action="store_true",
40
+ help="Automatically convert anchor links where the content matches the href.",
41
+ )
42
+
43
+ parser.add_argument(
44
+ "--default-title",
45
+ action="store_false",
46
+ help="Use this flag to disable setting the link title to its href when no title is provided.",
47
+ )
48
+
49
+ parser.add_argument(
50
+ "--heading-style",
51
+ default=UNDERLINED,
52
+ choices=(ATX, ATX_CLOSED, UNDERLINED),
53
+ help="Defines the heading conversion style: 'atx', 'atx_closed', or 'underlined'. Defaults to 'underlined'.",
54
+ )
55
+
56
+ parser.add_argument(
57
+ "-b",
58
+ "--bullets",
59
+ default="*+-",
60
+ help="A string of bullet styles to use for list items. The style alternates based on nesting level. Defaults to '*+-'.",
61
+ )
62
+
63
+ parser.add_argument(
64
+ "--strong-em-symbol",
65
+ default=ASTERISK,
66
+ choices=(ASTERISK, UNDERSCORE),
67
+ help="Choose between '*' or '_' for strong and emphasized text. Defaults to '*'.",
68
+ )
69
+
70
+ parser.add_argument(
71
+ "--sub-symbol",
72
+ default="",
73
+ help="Define the characters used to surround <sub> text. Defaults to empty.",
74
+ )
75
+
76
+ parser.add_argument(
77
+ "--sup-symbol",
78
+ default="",
79
+ help="Define the characters used to surround <sup> text. Defaults to empty.",
80
+ )
81
+
82
+ parser.add_argument(
83
+ "--newline-style",
84
+ default=SPACES,
85
+ choices=(SPACES, BACKSLASH),
86
+ help="Specify the <br> conversion style: two spaces (default) or a backslash at the end of the line.",
87
+ )
88
+
89
+ parser.add_argument(
90
+ "--code-language",
91
+ default="",
92
+ help="Specify the default language for code blocks inside <pre> tags. Defaults to empty.",
93
+ )
94
+
95
+ parser.add_argument(
96
+ "--no-escape-asterisks",
97
+ dest="escape_asterisks",
98
+ action="store_false",
99
+ help="Disable escaping of '*' characters in text to '\\*'.",
100
+ )
101
+
102
+ parser.add_argument(
103
+ "--no-escape-underscores",
104
+ dest="escape_underscores",
105
+ action="store_false",
106
+ help="Disable escaping of '_' characters in text to '\\_'.",
107
+ )
108
+
109
+ parser.add_argument(
110
+ "-i",
111
+ "--keep-inline-images-in",
112
+ nargs="*",
113
+ help="Specify parent tags where inline images should be preserved as images, rather than converted to alt-text. Defaults to None.",
114
+ )
115
+
116
+ parser.add_argument(
117
+ "-w",
118
+ "--wrap",
119
+ action="store_true",
120
+ help="Enable word wrapping for paragraphs at --wrap-width characters.",
121
+ )
122
+
123
+ parser.add_argument(
124
+ "--wrap-width",
125
+ type=int,
126
+ default=80,
127
+ help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
128
+ )
129
+
130
+ args = parser.parse_args(argv)
131
+
132
+ return convert_to_markdown(
133
+ args.html.read(),
134
+ strip=args.strip,
135
+ convert=args.convert,
136
+ autolinks=args.autolinks,
137
+ default_title=args.default_title,
138
+ heading_style=args.heading_style,
139
+ bullets=args.bullets,
140
+ strong_em_symbol=args.strong_em_symbol,
141
+ sub_symbol=args.sub_symbol,
142
+ sup_symbol=args.sup_symbol,
143
+ newline_style=args.newline_style,
144
+ code_language=args.code_language,
145
+ escape_asterisks=args.escape_asterisks,
146
+ escape_underscores=args.escape_underscores,
147
+ keep_inline_images_in=args.keep_inline_images_in,
148
+ wrap=args.wrap,
149
+ wrap_width=args.wrap_width,
150
+ )
@@ -2,17 +2,17 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  from re import Pattern
5
- from typing import Final, Literal
5
+ from typing import Final
6
6
 
7
7
  convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")
8
8
  line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)
9
9
  whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")
10
10
  html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")
11
11
 
12
- ASTERISK: Final[Literal["*"]] = "*"
13
- ATX: Final[Literal["atx"]] = "atx"
14
- ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
15
- BACKSLASH: Final[Literal["backslash"]] = "backslash"
16
- UNDERLINED: Final[Literal["underlined"]] = "underlined"
17
- SPACES: Final[Literal["spaces"]] = "spaces"
18
- UNDERSCORE: Final[Literal["_"]] = "_"
12
+ ASTERISK: Final = "*"
13
+ ATX: Final = "atx"
14
+ ATX_CLOSED: Final = "atx_closed"
15
+ BACKSLASH: Final = "backslash"
16
+ UNDERLINED: Final = "underlined"
17
+ SPACES: Final = "spaces"
18
+ UNDERSCORE: Final = "_"
@@ -55,17 +55,19 @@ SupportedElements = Literal[
55
55
  "kbd",
56
56
  ]
57
57
 
58
- ConvertsMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
58
+ ConvertersMap = Mapping[SupportedElements, Callable[[str, Tag], str]]
59
59
 
60
60
  T = TypeVar("T")
61
61
 
62
62
 
63
63
  def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
64
- """This abstracts all simple inline tags like b, em, del, ...
65
- Returns a function that wraps the chomped text in a pair of the string
66
- that is returned by markup_fn, with '/' inserted in the string used after
67
- the text if it looks like an HTML tag. markup_fn is necessary to allow for
68
- references to self.strong_em_symbol etc.
64
+ """Create an inline converter for a markup pattern or tag.
65
+
66
+ Args:
67
+ markup_prefix: The markup prefix to insert.
68
+
69
+ Returns:
70
+ A function that can be used to convert HTML to Markdown.
69
71
  """
70
72
 
71
73
  def implementation(*, tag: Tag, text: str) -> str:
@@ -147,9 +149,9 @@ def _convert_hn(
147
149
 
148
150
 
149
151
  def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str] | None) -> str:
150
- alt = tag.attrs.get("alt", None) or ""
151
- src = tag.attrs.get("src", None) or ""
152
- title = tag.attrs.get("title", None) or ""
152
+ alt = tag.attrs.get("alt", "")
153
+ src = tag.attrs.get("src", "")
154
+ title = tag.attrs.get("title", "")
153
155
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
154
156
  parent_name = tag.parent.name if tag.parent else ""
155
157
  if convert_as_inline and parent_name not in (keep_inline_images_in or []):
@@ -295,7 +297,7 @@ def create_converters_map(
295
297
  sup_symbol: str,
296
298
  wrap: bool,
297
299
  wrap_width: int,
298
- ) -> ConvertsMap:
300
+ ) -> ConvertersMap:
299
301
  """Create a mapping of HTML elements to their corresponding conversion functions.
300
302
 
301
303
  Args:
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ from html_to_markdown.constants import ASTERISK, SPACES, UNDERLINED
6
+ from html_to_markdown.converters import create_converters_map
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable, Iterable
10
+
11
+ from bs4 import Tag
12
+
13
+
14
+ def _create_legacy_class(
15
+ autolinks: bool,
16
+ bullets: str,
17
+ code_language: str,
18
+ code_language_callback: Callable[[Tag], str] | None,
19
+ default_title: bool,
20
+ heading_style: Literal["atx", "atx_closed", "underlined"],
21
+ keep_inline_images_in: Iterable[str] | None,
22
+ newline_style: str,
23
+ strong_em_symbol: str,
24
+ sub_symbol: str,
25
+ sup_symbol: str,
26
+ wrap: bool,
27
+ wrap_width: int,
28
+ ) -> type:
29
+ """Create a legacy class for Markdownify.
30
+
31
+ Deprecated: Use the new hooks api instead.
32
+
33
+ Args:
34
+ autolinks: Whether to convert URLs into links.
35
+ bullets: The bullet characters to use for unordered lists.
36
+ code_language: The default code language to use.
37
+ code_language_callback: A callback to get the code language.
38
+ default_title: Whether to use the URL as the title for links.
39
+ heading_style: The style of headings.
40
+ keep_inline_images_in: The tags to keep inline images in.
41
+ newline_style: The style of newlines.
42
+ strong_em_symbol: The symbol to use for strong and emphasis text.
43
+ sub_symbol: The symbol to use for subscript text.
44
+ sup_symbol: The symbol to use for superscript text.
45
+ wrap: Whether to wrap text.
46
+ wrap_width: The width to wrap text at.
47
+
48
+ Returns:
49
+ A class that can be used to convert HTML to Markdown.
50
+ """
51
+ return type(
52
+ "Markdownify",
53
+ (),
54
+ {
55
+ k.removeprefix("_"): v
56
+ for k, v in create_converters_map(
57
+ autolinks=autolinks,
58
+ bullets=bullets,
59
+ code_language=code_language,
60
+ code_language_callback=code_language_callback,
61
+ default_title=default_title,
62
+ heading_style=heading_style,
63
+ keep_inline_images_in=keep_inline_images_in,
64
+ newline_style=newline_style,
65
+ strong_em_symbol=strong_em_symbol,
66
+ sub_symbol=sub_symbol,
67
+ sup_symbol=sup_symbol,
68
+ wrap=wrap,
69
+ wrap_width=wrap_width,
70
+ ).items()
71
+ },
72
+ )
73
+
74
+
75
+ Markdownify = _create_legacy_class(
76
+ autolinks=True,
77
+ bullets="*+-",
78
+ code_language="",
79
+ code_language_callback=None,
80
+ default_title=False,
81
+ heading_style=UNDERLINED,
82
+ keep_inline_images_in=None,
83
+ newline_style=SPACES,
84
+ strong_em_symbol=ASTERISK,
85
+ sub_symbol="",
86
+ sup_symbol="",
87
+ wrap=False,
88
+ wrap_width=80,
89
+ )
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from itertools import chain
3
4
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
4
5
 
5
6
  from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
@@ -11,7 +12,7 @@ from html_to_markdown.constants import (
11
12
  html_heading_re,
12
13
  whitespace_re,
13
14
  )
14
- from html_to_markdown.converters import ConvertsMap, create_converters_map
15
+ from html_to_markdown.converters import ConvertersMap, create_converters_map
15
16
  from html_to_markdown.utils import escape
16
17
 
17
18
  if TYPE_CHECKING:
@@ -76,48 +77,21 @@ def _is_nested_tag(el: PageElement) -> bool:
76
77
 
77
78
  def _process_tag(
78
79
  tag: Tag,
80
+ converters_map: ConvertersMap,
79
81
  *,
80
- autolinks: bool,
81
- bullets: str,
82
- code_language: str,
83
- code_language_callback: Callable[[Any], str] | None,
84
- convert: Iterable[str] | None,
82
+ convert: set[str] | None,
85
83
  convert_as_inline: bool = False,
86
- converters_map: ConvertsMap | None = None,
87
- default_title: bool,
88
84
  escape_asterisks: bool,
89
85
  escape_misc: bool,
90
86
  escape_underscores: bool,
91
- heading_style: Literal["atx", "atx_closed", "underlined"],
92
- keep_inline_images_in: Iterable[str] | None,
93
- newline_style: str,
94
- strip: Iterable[str] | None,
95
- strong_em_symbol: str,
96
- sub_symbol: str,
97
- sup_symbol: str,
98
- wrap: bool,
99
- wrap_width: int,
87
+ strip: set[str] | None,
100
88
  ) -> str:
101
- if converters_map is None:
102
- converters_map = create_converters_map(
103
- autolinks=autolinks,
104
- bullets=bullets,
105
- code_language=code_language,
106
- code_language_callback=code_language_callback,
107
- default_title=default_title,
108
- heading_style=heading_style,
109
- keep_inline_images_in=keep_inline_images_in,
110
- newline_style=newline_style,
111
- strong_em_symbol=strong_em_symbol,
112
- sub_symbol=sub_symbol,
113
- sup_symbol=sup_symbol,
114
- wrap=wrap,
115
- wrap_width=wrap_width,
116
- )
117
-
89
+ should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
90
+ tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
118
91
  text = ""
92
+
119
93
  is_heading = html_heading_re.match(tag.name) is not None
120
- is_cell = tag.name in {"td", "th"}
94
+ is_cell = tag_name in {"td", "th"}
121
95
  convert_children_as_inline = convert_as_inline or is_heading or is_cell
122
96
 
123
97
  if _is_nested_tag(tag):
@@ -141,32 +115,17 @@ def _process_tag(
141
115
  )
142
116
  elif isinstance(el, Tag):
143
117
  text += _process_tag(
144
- tag=el,
118
+ el,
119
+ converters_map,
145
120
  convert_as_inline=convert_children_as_inline,
146
- strip=strip,
147
121
  convert=convert,
148
- escape_misc=escape_misc,
149
122
  escape_asterisks=escape_asterisks,
123
+ escape_misc=escape_misc,
150
124
  escape_underscores=escape_underscores,
151
- converters_map=converters_map,
152
- autolinks=autolinks,
153
- bullets=bullets,
154
- code_language=code_language,
155
- code_language_callback=code_language_callback,
156
- default_title=default_title,
157
- heading_style=heading_style,
158
- keep_inline_images_in=keep_inline_images_in,
159
- newline_style=newline_style,
160
- strong_em_symbol=strong_em_symbol,
161
- sub_symbol=sub_symbol,
162
- sup_symbol=sup_symbol,
163
- wrap=wrap,
164
- wrap_width=wrap_width,
125
+ strip=strip,
165
126
  )
166
127
 
167
- tag_name: SupportedTag | None = cast(SupportedTag, tag.name.lower()) if tag.name.lower() in converters_map else None
168
-
169
- if tag_name and _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert):
128
+ if tag_name and should_convert_tag:
170
129
  return converters_map[tag_name]( # type: ignore[call-arg]
171
130
  tag=tag, text=text, convert_as_inline=convert_as_inline
172
131
  )
@@ -209,7 +168,7 @@ def _process_text(
209
168
  return text
210
169
 
211
170
 
212
- def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert: Iterable[str] | None) -> bool:
171
+ def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
213
172
  if strip is not None:
214
173
  return tag_name not in strip
215
174
  if convert is not None:
@@ -217,15 +176,22 @@ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert:
217
176
  return True
218
177
 
219
178
 
179
+ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
180
+ if value is None:
181
+ return None
182
+ if isinstance(value, str):
183
+ return set(",".split(value))
184
+ return {*chain(*[v.split(",") for v in value])}
185
+
186
+
220
187
  def convert_to_markdown(
221
- html: str,
188
+ source: str | BeautifulSoup,
222
189
  *,
223
- soup: BeautifulSoup | None = None,
224
190
  autolinks: bool = True,
225
191
  bullets: str = "*+-",
226
192
  code_language: str = "",
227
193
  code_language_callback: Callable[[Any], str] | None = None,
228
- convert: Iterable[str] | None = None,
194
+ convert: str | Iterable[str] | None = None,
229
195
  default_title: bool = False,
230
196
  escape_asterisks: bool = True,
231
197
  escape_misc: bool = True,
@@ -233,7 +199,7 @@ def convert_to_markdown(
233
199
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
234
200
  keep_inline_images_in: Iterable[str] | None = None,
235
201
  newline_style: Literal["spaces", "backslash"] = SPACES,
236
- strip: Iterable[str] | None = None,
202
+ strip: str | Iterable[str] | None = None,
237
203
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
238
204
  sub_symbol: str = "",
239
205
  sup_symbol: str = "",
@@ -244,55 +210,67 @@ def convert_to_markdown(
244
210
  """Convert HTML to Markdown.
245
211
 
246
212
  Args:
247
- html: The HTML to convert.
248
- soup: The BeautifulSoup object to convert.
249
- autolinks: Whether to convert links to Markdown.
250
- bullets: The bullet characters to use for unordered lists.
251
- code_language: The default code language to use.
252
- code_language_callback: A callback function to determine the code language.
253
- convert: The HTML elements to convert.
254
- default_title: Whether to use the default title.
255
- escape_asterisks: Whether to escape asterisks.
256
- escape_misc: Whether to escape miscellaneous characters.
257
- escape_underscores: Whether to escape underscores.
258
- heading_style: The style to use for headings.
259
- keep_inline_images_in: The tags to keep inline images in.
260
- newline_style: The style to use for newlines.
261
- strip: The HTML elements to strip.
262
- strong_em_symbol: The symbol to use for strong and emphasis.
263
- sub_symbol: The symbol to use for subscript.
264
- sup_symbol: The symbol to use for superscript.
265
- wrap: Whether to wrap text.
266
- wrap_width: The width to wrap text at.
267
- convert_as_inline: Whether to convert elements as inline.
213
+ source: An HTML document or an initialized instance of BeautifulSoup.
214
+ autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
215
+ bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
216
+ code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
217
+ code_language_callback: Function to dynamically determine the language for code blocks.
218
+ convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
219
+ default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
220
+ escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
221
+ escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
222
+ escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
223
+ heading_style: The style to use for Markdown headings. Defaults to "underlined".
224
+ keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
225
+ newline_style: Style for handling newlines in text content. Defaults to "spaces".
226
+ strip: Tags to strip from the output. Defaults to None.
227
+ strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
228
+ sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
229
+ sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
230
+ wrap: Wrap text to the specified width. Defaults to False.
231
+ wrap_width: The number of characters at which to wrap text. Defaults to 80.
232
+ convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
233
+
234
+ Raises:
235
+ ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
268
236
 
269
237
  Returns:
270
- The Markdown.
238
+ str: A string of Markdown-formatted text converted from the given HTML.
271
239
  """
272
- if soup is None:
240
+ if isinstance(source, str):
273
241
  from bs4 import BeautifulSoup
274
242
 
275
- soup = BeautifulSoup(html, "html.parser")
243
+ if "".join(source.split("\n")):
244
+ source = BeautifulSoup(source, "html.parser")
245
+ else:
246
+ raise ValueError("The input HTML is empty.")
276
247
 
277
- return _process_tag(
248
+ if strip is not None and convert is not None:
249
+ raise ValueError("Only one of 'strip' and 'convert' can be specified.")
250
+
251
+ converters_map = create_converters_map(
278
252
  autolinks=autolinks,
279
253
  bullets=bullets,
280
254
  code_language=code_language,
281
255
  code_language_callback=code_language_callback,
282
- convert=convert,
283
- convert_as_inline=convert_as_inline,
284
256
  default_title=default_title,
285
- escape_asterisks=escape_asterisks,
286
- escape_misc=escape_misc,
287
- escape_underscores=escape_underscores,
288
257
  heading_style=heading_style,
289
258
  keep_inline_images_in=keep_inline_images_in,
290
259
  newline_style=newline_style,
291
- strip=strip,
292
260
  strong_em_symbol=strong_em_symbol,
293
261
  sub_symbol=sub_symbol,
294
262
  sup_symbol=sup_symbol,
295
- tag=soup,
296
263
  wrap=wrap,
297
264
  wrap_width=wrap_width,
298
265
  )
266
+
267
+ return _process_tag(
268
+ source,
269
+ converters_map,
270
+ convert=_as_optional_set(convert),
271
+ convert_as_inline=convert_as_inline,
272
+ escape_asterisks=escape_asterisks,
273
+ escape_misc=escape_misc,
274
+ escape_underscores=escape_underscores,
275
+ strip=_as_optional_set(strip),
276
+ )
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: html-to-markdown
3
+ Version: 1.2.0
4
+ Summary: Convert HTML to markdown
5
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: converter,html,markdown,text-extraction,text-processing
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Text Processing
19
+ Classifier: Topic :: Text Processing :: Markup
20
+ Classifier: Topic :: Text Processing :: Markup :: HTML
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Requires-Dist: beautifulsoup4>=4.12.3
26
+ Description-Content-Type: text/markdown
27
+
28
+ # html_to_markdown
29
+
30
+ This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
31
+ Python 3.9 and above.
32
+
33
+ ### Differences from Markdownify
34
+
35
+ - The refactored codebase uses a strict functional approach - no classes are involved.
36
+ - There is full typing with strict MyPy adherence and a py.typed file included.
37
+ - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup` instead of html.
38
+ - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
39
+ point versioning is no longer aligned.
40
+
41
+ ## Installation
42
+
43
+ ```shell
44
+ pip install html_to_markdown
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ Convert an HTML string to Markdown:
50
+
51
+ ```python
52
+ from html_to_markdown import convert_to_markdown
53
+
54
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
55
+ ```
56
+
57
+ Or pass a pre-configured instance of `BeautifulSoup`:
58
+
59
+ ```python
60
+ from bs4 import BeautifulSoup
61
+ from html_to_markdown import convert_to_markdown
62
+
63
+ soup = BeautifulSoup('<b>Yay</b> <a href="http://github.com">GitHub</a>', 'lxml') # lxml requires an extra dependency.
64
+
65
+ convert_to_markdown(soup) # > '**Yay** [GitHub](http://github.com)'
66
+ ```
67
+
68
+ ### Options
69
+
70
+ The `convert_to_markdown` function accepts the following kwargs:
71
+
72
+ - autolinks (bool): Automatically convert valid URLs into Markdown links. Defaults to True.
73
+ - bullets (str): A string of characters to use for bullet points in lists. Defaults to '\*+-'.
74
+ - code_language (str): Default language identifier for fenced code blocks. Defaults to an empty string.
75
+ - code_language_callback (Callable[[Any], str] | None): Function to dynamically determine the language for code blocks.
76
+ - convert (Iterable[str] | None): A list of tag names to convert to Markdown. If None, all supported tags are converted.
77
+ - default_title (bool): Use the default title when converting certain elements (e.g., links). Defaults to False.
78
+ - escape_asterisks (bool): Escape asterisks (\*) to prevent unintended Markdown formatting. Defaults to True.
79
+ - escape_misc (bool): Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
80
+ - escape_underscores (bool): Escape underscores (\_) to prevent unintended italic formatting. Defaults to True.
81
+ - heading_style (Literal["underlined", "atx", "atx_closed"]): The style to use for Markdown headings. Defaults to
82
+ "underlined".
83
+ - keep_inline_images_in (Iterable[str] | None): Tags in which inline images should be preserved. Defaults to None.
84
+ - newline_style (Literal["spaces", "backslash"]): Style for handling newlines in text content. Defaults to "spaces".
85
+ - strip (Iterable[str] | None): Tags to strip from the output. Defaults to None.
86
+ - strong_em_symbol (Literal["\*", "\_"]): Symbol to use for strong/emphasized text. Defaults to "\*".
87
+ - sub_symbol (str): Custom symbol for subscript text. Defaults to an empty string.
88
+ - sup_symbol (str): Custom symbol for superscript text. Defaults to an empty string.
89
+ - wrap (bool): Wrap text to the specified width. Defaults to False.
90
+ - wrap_width (int): The number of characters at which to wrap text. Defaults to 80.
91
+ - convert_as_inline (bool): Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
92
+
93
+ ## CLI
94
+
95
+ For compatibility with the original markdownify, a CLI is provided. Use `html_to_markdown example.html > example.md` or
96
+ pipe input from stdin:
97
+
98
+ ```shell
99
+ cat example.html | html_to_markdown > example.md
100
+ ```
101
+
102
+ Use `html_to_markdown -h` to see all available options. They are the same as listed above and take the same arguments.
@@ -0,0 +1,13 @@
1
+ html_to_markdown/__init__.py,sha256=cXm4YOyrAp2HKHMDfnVA5e75zg6wdqpyXugjBYvBMFc,143
2
+ html_to_markdown/__main__.py,sha256=u5xevySlT5eIGyLUaethdDQIKJygaKnc3F2sHWoz75g,264
3
+ html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
+ html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
+ html_to_markdown/converters.py,sha256=hW4RqAbgx0tdTzfUSvAGQg1OgQUmHL1cekZtJLFq_Ns,12080
6
+ html_to_markdown/legacy.py,sha256=vL-MVKPXOue-JJafXFtmGcVIPylwmPOly0CELTSzWRQ,2773
7
+ html_to_markdown/processing.py,sha256=L1wZwUm7WA8wN4GA5zjCStwACb-8S2scQZPbzeHgdY8,8951
8
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
10
+ html_to_markdown-1.2.0.dist-info/METADATA,sha256=Dg2ZibNWNW_GyszXG2bxT-oOtOJc8iryVxlLn38eMww,4709
11
+ html_to_markdown-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
+ html_to_markdown-1.2.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
13
+ html_to_markdown-1.2.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,194 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: html-to-markdown
3
- Version: 1.0.0
4
- Summary: Convert HTML to markdown
5
- Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
- License: MIT
7
- License-File: LICENSE
8
- Keywords: beautifulsoup,converter,html,markdown,text-processing
9
- Classifier: Intended Audience :: Developers
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3.9
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Programming Language :: Python :: 3.13
17
- Classifier: Topic :: Text Processing
18
- Classifier: Topic :: Text Processing :: Markup
19
- Classifier: Topic :: Text Processing :: Markup :: HTML
20
- Classifier: Topic :: Text Processing :: Markup :: Markdown
21
- Classifier: Topic :: Utilities
22
- Classifier: Typing :: Typed
23
- Requires-Python: >=3.9
24
- Requires-Dist: beautifulsoup4>=4.12.3
25
- Description-Content-Type: text/markdown
26
-
27
- # html_to_markdown
28
-
29
- This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
30
- Python 3.9 and offering strong typing.
31
-
32
- ### Differences from Markdownify
33
-
34
- - The refactored codebase uses a strict functional approach - no classes are involved.
35
- - There is full typing with strict MyPy adherence in place.
36
- The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
37
- This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
38
- point versioning is no longer aligned.
39
-
40
- ## Installation
41
-
42
- ```shell
43
- pip install html_to_markdown
44
- ```
45
-
46
- ## Usage
47
-
48
- Convert some HTML to Markdown:
49
-
50
- ```python
51
- from html_to_markdown import convert_to_markdown
52
-
53
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
54
- ```
55
-
56
- Specify tags to exclude:
57
-
58
- ```python
59
- from html_to_markdown import convert_to_markdown
60
-
61
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
62
- ```
63
-
64
- ...or specify the tags you want to include:
65
-
66
- ```python
67
- from html_to_markdown import convert_to_markdown
68
-
69
- convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
70
- ```
71
-
72
- # Options
73
-
74
- html_to_markdown supports the following options:
75
-
76
- strip
77
-
78
- : A list of tags to strip. This option can\'t be used with the
79
- `convert` option.
80
-
81
- convert
82
-
83
- : A list of tags to convert. This option can\'t be used with the
84
- `strip` option.
85
-
86
- autolinks
87
-
88
- : A boolean indicating whether the \"automatic link\" style should be
89
- used when an `a` tag\'s contents match its href. Defaults to `True`.
90
-
91
- default_title
92
-
93
- : A boolean to enable setting the title of a link to its href, if no
94
- title is given. Defaults to `False`.
95
-
96
- heading_style
97
-
98
- : Defines how headings should be converted. Accepted values are `ATX`,
99
- `ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
100
- `SETEXT`). Defaults to `UNDERLINED`.
101
-
102
- bullets
103
-
104
- : An iterable (string, list, or tuple) of bullet styles to be used. If
105
- the iterable only contains one item, it will be used regardless of
106
- how deeply lists are nested. Otherwise, the bullet will alternate
107
- based on nesting level. Defaults to `'*+-'`.
108
-
109
- strong_em_symbol
110
-
111
- : In markdown, both `*` and `_` are used to encode **strong** or
112
- *emphasized* texts. Either of these symbols can be chosen by the
113
- options `ASTERISK` (default) or `UNDERSCORE` respectively.
114
-
115
- sub_symbol, sup_symbol
116
-
117
- : Define the chars that surround `<sub>` and `<sup>` text. Defaults to
118
- an empty string, because this is non-standard behavior. Could be
119
- something like `~` and `^` to result in `~sub~` and `^sup^`. If the
120
- value starts with `<` and ends with `>`, it is treated as an HTML
121
- tag and a `/` is inserted after the `<` in the string used after the
122
- text; this allows specifying `<sub>` to use raw HTML in the output
123
- for subscripts, for example.
124
-
125
- newline_style
126
-
127
- : Defines the style of marking linebreaks (`<br>`) in markdown. The
128
- default value `SPACES` of this option will adopt the usual two
129
- spaces and a newline, while `BACKSLASH` will convert a linebreak to
130
- `\\n` (a backslash and a newline). While the latter convention is
131
- non-standard, it is commonly preferred and supported by a lot of
132
- interpreters.
133
-
134
- code_language
135
-
136
- : Defines the language that should be assumed for all `<pre>`
137
- sections. Useful, if all code on a page is in the same programming
138
- language and should be annotated with `python` or
139
- similar. Defaults to `''` (empty
140
- string) and can be any string.
141
-
142
- code_language_callback
143
-
144
- : When the HTML code contains `pre` tags that in some way provide the
145
- code language, for example as class, this callback can be used to
146
- extract the language from the tag and prefix it to the converted
147
- `pre` tag. The callback gets a single argument, a BeautifulSoup
148
- object, and returns a string containing the code language, or
149
- `None`. An example to use the class name as code language could be:
150
-
151
- def callback(el):
152
- return el['class'][0] if el.has_attr('class') else None
153
-
154
- Defaults to `None`.
155
-
156
- escape_asterisks
157
-
158
- : If set to `False`, do not escape `*` to `\*` in text. Defaults to
159
- `True`.
160
-
161
- escape_underscores
162
-
163
- : If set to `False`, do not escape `_` to `\_` in text. Defaults to
164
- `True`.
165
-
166
- escape_misc
167
-
168
- : If set to `False`, do not escape miscellaneous punctuation
169
- characters that sometimes have Markdown significance in text.
170
- Defaults to `True`.
171
-
172
- keep_inline_images_in
173
-
174
- : Images are converted to their alt-text when the images are located
175
- inside headlines or table cells. If some inline images should be
176
- converted to markdown images instead, this option can be set to a
177
- list of parent tags that should be allowed to contain inline images,
178
- for example `['td']`. Defaults to an empty list.
179
-
180
- wrap, wrap_width
181
-
182
- : If `wrap` is set to `True`, all text paragraphs are wrapped at
183
- `wrap_width` characters. Defaults to `False` and `80`. Use with
184
- `newline_style=BACKSLASH` to keep line breaks in paragraphs.
185
-
186
- Options may be specified as kwargs to the `convert_to_markdown` function, or as
187
- a nested `Options` class in `MarkdownConverter` subclasses.
188
-
189
- # CLI
190
-
191
- Use `html_to_markdown example.html > example.md` or pipe input from stdin
192
- (`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
193
- to see all available options. They are the same as listed above and take
194
- the same arguments.
@@ -1,12 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=_WXeqic-7b6hvidTXkPQwAfLa4YOEAEP-mOUXjx_25k,95
2
- html_to_markdown/__main__.py,sha256=Wll22XKFmiNSIpdbGzC75b5_Unc3HYOTA6oXA414Tl8,4412
3
- html_to_markdown/constants.py,sha256=vUjffZ0vFq56jbXF5bBNzomfJwgsp0TWqdUzhkp6bks,687
4
- html_to_markdown/converters.py,sha256=q1wpzsYl-FRR9qbB983gAkem_-7mgYZ7hOgziofjIDM,12238
5
- html_to_markdown/processing.py,sha256=9l3zq_kdyvU0TnTk5g4uuYI6Jbu1gY7NQ11u3IBKyFU,9029
6
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
8
- html_to_markdown-1.0.0.dist-info/METADATA,sha256=0MObULuhTHiyvVcytDBN_liafpyFjgp5brgoWQYEglA,6478
9
- html_to_markdown-1.0.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
10
- html_to_markdown-1.0.0.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
11
- html_to_markdown-1.0.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
12
- html_to_markdown-1.0.0.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- html_to_markdown = html_to_markdown.__main__:cli