html-to-markdown 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,3 @@
1
+ from html_to_markdown.processing import convert_to_markdown
2
+
3
+ __all__ = ["convert_to_markdown"]
@@ -0,0 +1,131 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from html_to_markdown import convert_to_markdown
5
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
6
+
7
+
8
def cli(argv: list[str]) -> None:
    """Command-line interface for html_to_markdown.

    Parses ``argv``, reads the HTML document from the given file (or STDIN),
    converts it with ``convert_to_markdown`` and prints the result to STDOUT.

    Args:
        argv: The command-line arguments, without the program name.
    """
    parser = argparse.ArgumentParser(
        prog="html_to_markdown",
        description="Converts html to markdown.",
    )

    parser.add_argument(
        "html",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="The html file to convert. Defaults to STDIN if not provided.",
    )

    # --strip and --convert are documented as mutually exclusive, so let
    # argparse reject the combination instead of silently preferring --strip.
    tag_filter = parser.add_mutually_exclusive_group()
    tag_filter.add_argument(
        "-s",
        "--strip",
        nargs="*",
        help="A list of tags to strip. This option can't be used with the --convert option.",
    )
    tag_filter.add_argument(
        "-c",
        "--convert",
        nargs="*",
        help="A list of tags to convert. This option can't be used with the --strip option.",
    )

    parser.add_argument(
        "-a",
        "--autolinks",
        action="store_true",
        help="A boolean indicating whether the 'automatic link' style "
        "should be used when a 'a' tag's contents match its href.",
    )
    # Opt-in flag: it must default to False. With the former "store_false"
    # the option was enabled unless the flag was passed, which also prevented
    # --autolinks from ever taking effect.
    parser.add_argument(
        "--default-title",
        action="store_true",
        help="A boolean to enable setting the title of a link to its href, if no title is given.",
    )
    parser.add_argument(
        "--heading-style",
        default=UNDERLINED,
        choices=(ATX, ATX_CLOSED, UNDERLINED),
        help="Defines how headings should be converted.",
    )
    parser.add_argument(
        "-b",
        "--bullets",
        default="*+-",
        help="A string of bullet styles to use; the bullet will alternate based on nesting level.",
    )
    parser.add_argument(
        "--strong-em-symbol",
        default=ASTERISK,
        choices=(ASTERISK, UNDERSCORE),
        help="Use * or _ to convert strong and italics text",
    )
    parser.add_argument("--sub-symbol", default="", help="Define the chars that surround '<sub>'.")
    parser.add_argument("--sup-symbol", default="", help="Define the chars that surround '<sup>'.")
    parser.add_argument(
        "--newline-style",
        default=SPACES,
        choices=(SPACES, BACKSLASH),
        help="Defines the style of <br> conversions: two spaces "
        "or backslash at the end of the line that should break.",
    )
    parser.add_argument(
        "--code-language",
        default="",
        help="Defines the language that should be assumed for all '<pre>' sections.",
    )
    parser.add_argument(
        "--no-escape-asterisks",
        dest="escape_asterisks",
        action="store_false",
        help="Do not escape '*' to '\\*' in text.",
    )
    parser.add_argument(
        "--no-escape-underscores",
        dest="escape_underscores",
        action="store_false",
        help="Do not escape '_' to '\\_' in text.",
    )
    parser.add_argument(
        "-i",
        "--keep-inline-images-in",
        nargs="*",
        help="Images are converted to their alt-text when the images are "
        "located inside headlines or table cells. If some inline images "
        "should be converted to markdown images instead, this option can "
        "be set to a list of parent tags that should be allowed to "
        "contain inline images.",
    )
    parser.add_argument(
        "-w",
        "--wrap",
        action="store_true",
        help="Wrap all text paragraphs at --wrap-width characters.",
    )
    parser.add_argument("--wrap-width", type=int, default=80)

    args = parser.parse_args(argv)

    result = convert_to_markdown(
        args.html.read(),
        strip=args.strip,
        convert=args.convert,
        autolinks=args.autolinks,
        default_title=args.default_title,
        heading_style=args.heading_style,
        bullets=args.bullets,
        strong_em_symbol=args.strong_em_symbol,
        sub_symbol=args.sub_symbol,
        sup_symbol=args.sup_symbol,
        newline_style=args.newline_style,
        code_language=args.code_language,
        escape_asterisks=args.escape_asterisks,
        escape_underscores=args.escape_underscores,
        keep_inline_images_in=args.keep_inline_images_in,
        wrap=args.wrap,
        wrap_width=args.wrap_width,
    )

    print(result)  # noqa: T201


if __name__ == "__main__":
    cli(sys.argv[1:])
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from re import Pattern
5
+ from typing import Final, Literal
6
+
7
# Pre-compiled regular expressions.
convert_heading_re: Final[Pattern[str]] = re.compile(r"convert_h(\d+)")  # captures the level in "convert_hN"
html_heading_re: Final[Pattern[str]] = re.compile(r"h[1-6]")  # HTML heading tag names h1..h6
line_beginning_re: Final[Pattern[str]] = re.compile(r"^", re.MULTILINE)  # start of every line
whitespace_re: Final[Pattern[str]] = re.compile(r"[\t ]+")  # runs of tabs and spaces

# Heading styles.
ATX: Final[Literal["atx"]] = "atx"
ATX_CLOSED: Final[Literal["atx_closed"]] = "atx_closed"
UNDERLINED: Final[Literal["underlined"]] = "underlined"

# Newline (<br>) styles.
SPACES: Final[Literal["spaces"]] = "spaces"
BACKSLASH: Final[Literal["backslash"]] = "backslash"

# Symbols available for strong / emphasised text.
ASTERISK: Final[Literal["*"]] = "*"
UNDERSCORE: Final[Literal["_"]] = "_"
@@ -0,0 +1,380 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable, Mapping
4
+ from functools import partial
5
+ from inspect import getfullargspec
6
+ from textwrap import fill
7
+ from typing import Any, Callable, Literal, TypeVar, cast
8
+
9
+ from bs4.element import Tag
10
+
11
+ from html_to_markdown.constants import (
12
+ ATX_CLOSED,
13
+ BACKSLASH,
14
+ UNDERLINED,
15
+ line_beginning_re,
16
+ )
17
+ from html_to_markdown.utils import chomp, indent, underline
18
+
19
# Names for which the converters map provides a conversion function.
SupportedElements = Literal[
    "a", "b", "blockquote", "br", "code", "del", "em",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "i", "img",
    "list", "ul", "ol", "li",
    "p", "pre", "script", "style",
    "s", "strong", "samp", "sub", "sup",
    "table", "caption", "figcaption",
    "td", "th", "tr",
    "kbd",
]
57
+
58
# Maps a supported element name to the function that renders it as Markdown.
ConvertsMap = Mapping[SupportedElements, Callable[[str, Tag], str]]

# Return type of a function passed through the converter wrapper.
T = TypeVar("T")
61
+
62
+
63
def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
    """Build a converter for simple inline tags such as b, em or del.

    Args:
        markup_prefix: The markup placed in front of the text. The same string
            closes the text, unless it looks like an HTML tag ("<x>"), in
            which case the closing form "</x>" is used.

    Returns:
        A function taking the keyword arguments ``tag`` and ``text`` that
        returns the text wrapped in the markup. Text located inside pre, code,
        kbd or samp is returned untouched and blank text becomes "".
    """
    looks_like_html = markup_prefix.startswith("<") and markup_prefix.endswith(">")
    closing_markup = f"</{markup_prefix[1:]}" if looks_like_html else markup_prefix

    def implementation(*, tag: Tag, text: str) -> str:
        # Markup is meaningless inside preformatted / code-like ancestors.
        if tag.find_parent(["pre", "code", "kbd", "samp"]):
            return text

        if not text.strip():
            return ""

        # Keep a single surrounding space outside of the markup.
        leading, trailing, stripped = chomp(text)
        return "".join((leading, markup_prefix, stripped, closing_markup, trailing))

    return cast(Callable[[Tag, str], str], implementation)
87
+
88
+
89
+ def _get_colspan(tag: Tag) -> int:
90
+ colspan = 1
91
+
92
+ if "colspan" in tag.attrs and isinstance(tag["colspan"], str) and tag["colspan"].isdigit():
93
+ colspan = int(tag["colspan"])
94
+
95
+ return colspan
96
+
97
+
98
def _convert_a(*, tag: Tag, text: str, autolinks: bool, default_title: bool) -> str:
    """Convert an anchor element to a Markdown link.

    Args:
        tag: The anchor tag; its ``href`` and ``title`` attributes are read.
        text: The already converted content of the anchor.
        autolinks: Whether to emit ``<href>`` when the text equals the href.
        default_title: Whether to use the href as title when none is given.

    Returns:
        The Markdown link, the bare text when there is no href, or "" when
        the text is blank.
    """
    leading, trailing, label = chomp(text)
    if not label:
        return ""

    href = tag.get("href")
    title = tag.get("title")

    # The shortcut form only applies when no title would be rendered.
    wants_autolink = autolinks and not title and not default_title
    if wants_autolink and label.replace(r"\_", "_") == href:
        return f"<{href}>"

    if not href:
        return label

    if default_title and not title:
        title = href

    title_part = ""
    if isinstance(title, str):
        escaped_title = title.replace('"', r"\"")
        title_part = f' "{escaped_title}"'

    return f"{leading}[{label}]({href}{title_part}){trailing}"
114
+
115
+
116
def _convert_blockquote(*, text: str, convert_as_inline: bool) -> str:
    """Convert a blockquote by prefixing every line of its text with "> ".

    Args:
        text: The already converted content of the blockquote.
        convert_as_inline: Whether the element is rendered inline, in which
            case the text is returned unchanged.

    Returns:
        The quoted block surrounded by newlines, or "" for empty text.
    """
    if convert_as_inline:
        return text
    if not text:
        return ""

    quoted = line_beginning_re.sub("> ", text.strip())
    return f"\n{quoted}\n\n"
120
+
121
+
122
def _convert_br(*, convert_as_inline: bool, newline_style: str) -> str:
    """Convert a line break element.

    Args:
        convert_as_inline: Whether the element is rendered inline, in which
            case the break is dropped.
        newline_style: The break style, compared case-insensitively with
            the backslash style; any other value selects trailing spaces.

    Returns:
        A backslash followed by a newline, two spaces followed by a newline,
        or "" when rendered inline.
    """
    if convert_as_inline:
        return ""

    if newline_style.lower() == BACKSLASH:
        return "\\\n"

    # A Markdown hard line break needs two trailing spaces; a single space
    # before the newline is rendered as an ordinary soft break.
    return "  \n"
126
+
127
+
128
def _convert_hn(
    *,
    n: int,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    text: str,
    convert_as_inline: bool,
) -> str:
    """Convert a heading of level ``n``.

    Args:
        n: The heading level.
        heading_style: The heading style. The underlined style only applies
            to levels 1 and 2; deeper levels use hash prefixes.
        text: The already converted content of the heading.
        convert_as_inline: Whether the element is rendered inline, in which
            case the text is returned unchanged.

    Returns:
        The Markdown heading followed by a blank line.
    """
    if convert_as_inline:
        return text

    heading = text.strip()

    if n <= 2 and heading_style == UNDERLINED:
        pad_char = "=" if n == 1 else "-"
        return underline(text=heading, pad_char=pad_char)

    marker = "#" * n
    closing = f" {marker}" if heading_style == ATX_CLOSED else ""
    return f"{marker} {heading}{closing}\n\n"
147
+
148
+
149
+ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str] | None) -> str:
150
+ alt = tag.attrs.get("alt", None) or ""
151
+ src = tag.attrs.get("src", None) or ""
152
+ title = tag.attrs.get("title", None) or ""
153
+ title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
154
+ parent_name = tag.parent.name if tag.parent else ""
155
+ if convert_as_inline and parent_name not in (keep_inline_images_in or []):
156
+ return alt
157
+
158
+ return f"![{alt}]({src}{title_part})"
159
+
160
+
161
def _convert_list(*, tag: Tag, text: str) -> str:
    """Convert a list container (ul / ol).

    Args:
        tag: The list tag.
        text: The already converted list items.

    Returns:
        For a list nested inside a list item, the text on a new line,
        indented one level and right-stripped. Otherwise the text, followed
        by a newline when the next sibling is something other than a list.
    """
    following = tag.next_sibling
    before_paragraph = bool(following) and getattr(following, "name", None) not in {"ul", "ol"}

    # Walk up the tree until a list item or the root is reached.
    node = tag
    while node and node.name != "li" and node.parent:
        node = node.parent
    nested = bool(node) and node.name == "li"

    if nested:
        return "\n" + indent(text=text, level=1).rstrip()

    return f"{text}\n" if before_paragraph else text
182
+
183
+
184
+ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
185
+ parent = tag.parent
186
+ if parent is not None and parent.name == "ol":
187
+ start = (
188
+ int(cast(str, parent["start"]))
189
+ if isinstance(parent.get("start"), str) and str(parent.get("start")).isnumeric()
190
+ else 1
191
+ )
192
+ bullet = "%s." % (start + parent.index(tag))
193
+ else:
194
+ depth = -1
195
+ while tag:
196
+ if tag.name == "ul":
197
+ depth += 1
198
+ if not tag.parent:
199
+ break
200
+
201
+ tag = tag.parent
202
+
203
+ bullet = bullets[depth % len(bullets)]
204
+ return "{} {}\n".format(bullet, (text or "").strip())
205
+
206
+
207
+ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
208
+ if convert_as_inline:
209
+ return text
210
+
211
+ if wrap:
212
+ text = fill(
213
+ text,
214
+ width=wrap_width,
215
+ break_long_words=False,
216
+ break_on_hyphens=False,
217
+ )
218
+
219
+ return f"{text}\n\n" if text else ""
220
+
221
+
222
+ def _convert_pre(
223
+ *,
224
+ tag: Tag,
225
+ text: str,
226
+ code_language: str,
227
+ code_language_callback: Callable[[Tag], str] | None,
228
+ ) -> str:
229
+ if not text:
230
+ return ""
231
+
232
+ if code_language_callback:
233
+ code_language = code_language_callback(tag) or code_language
234
+
235
+ return f"\n```{code_language}\n{text}\n```\n"
236
+
237
+
238
def _convert_td(*, tag: Tag, text: str) -> str:
    """Convert a table data cell.

    Args:
        tag: The cell tag, used to determine its column span.
        text: The already converted content of the cell.

    Returns:
        The stripped, single-line cell text preceded by a space and followed
        by one " |" separator per spanned column.
    """
    cell_text = " ".join(text.strip().split("\n"))
    separators = " |" * _get_colspan(tag)
    return f" {cell_text}{separators}"
241
+
242
+
243
def _convert_th(*, tag: Tag, text: str) -> str:
    """Convert a table header cell.

    Header cells are rendered exactly like data cells.

    Args:
        tag: The cell tag, used to determine its column span.
        text: The already converted content of the cell.

    Returns:
        The stripped, single-line cell text preceded by a space and followed
        by one " |" separator per spanned column.
    """
    return _convert_td(tag=tag, text=text)
246
+
247
+
248
def _convert_tr(*, tag: Tag, text: str) -> str:
    """Convert a table row.

    Args:
        tag: The row tag.
        text: The already converted cells of the row.

    Returns:
        The Markdown row. A first row treated as header is followed by the
        "---" delimiter line; a first row that is not a header is preceded
        by an empty header line and its delimiter line.
    """
    cells = tag.find_all(["td", "th"])
    parent = tag.parent
    parent_name = parent.name if parent else ""
    grand_parent = parent.parent if parent else None
    is_first_row = not tag.previous_sibling

    is_headrow = (
        all(cell.name == "th" for cell in cells)
        or (is_first_row and parent_name != "tbody")
        or (
            is_first_row
            and parent_name == "tbody"
            and (not grand_parent or len(grand_parent.find_all(["thead"])) < 1)
        )
    )

    # Number of Markdown columns the row occupies. Using the shared colspan
    # helper keeps this in step with the separators emitted by each cell.
    column_count = sum(_get_colspan(cell) for cell in cells)
    delimiter_line = "| " + " | ".join(["---"] * column_count) + " |" + "\n"

    header_above = ""
    delimiter_below = ""
    if is_headrow and is_first_row:
        # first row and is headline: print headline underline
        delimiter_below = delimiter_line
    elif is_first_row and (
        parent_name == "table" or (parent_name == "tbody" and parent is not None and not parent.previous_sibling)
    ):
        # first row, not headline, and:
        # - the parent is table or
        # - the parent is tbody at the beginning of a table.
        # print empty headline above this row; it must be as wide as the
        # row itself, so spanned columns are counted rather than cells.
        header_above = "| " + " | ".join([""] * column_count) + " |" + "\n"
        header_above += delimiter_line

    return header_above + "|" + text + "\n" + delimiter_below
282
+
283
+
284
def create_converters_map(
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Tag], str] | None,
    default_title: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> ConvertsMap:
    """Create a mapping of HTML elements to their corresponding conversion functions.

    Args:
        autolinks: Whether to use the ``<url>`` form for links whose text equals their href.
        bullets: The bullet characters to use for unordered lists.
        code_language: The default code language to use.
        code_language_callback: A callback to get the code language.
        default_title: Whether to use the URL as the title for links.
        heading_style: The style of headings.
        keep_inline_images_in: The tags to keep inline images in.
        newline_style: The style of newlines.
        strong_em_symbol: The symbol to use for strong and emphasis text.
        sub_symbol: The symbol to use for subscript text.
        sup_symbol: The symbol to use for superscript text.
        wrap: Whether to wrap text.
        wrap_width: The width to wrap text at.

    Returns:
        A mapping of HTML elements to their corresponding conversion functions
    """

    def _wrapper(func: Callable[..., T]) -> Callable[[str, Tag], T]:
        # Converters declare what they need through keyword-only parameters;
        # a converter without any is called with the text positionally.
        accepted = getfullargspec(func).kwonlyargs

        def _inner(*, text: str, tag: Tag, convert_as_inline: bool) -> T:
            if not accepted:
                return func(text)

            available: dict[str, Any] = {
                "tag": tag,
                "text": text,
                "convert_as_inline": convert_as_inline,
            }
            return func(**{name: value for name, value in available.items() if name in accepted})

        return cast(Callable[[str, Tag], T], _inner)

    def _heading(level: int) -> Callable[[str, Tag], str]:
        return _wrapper(partial(_convert_hn, n=level, heading_style=heading_style))

    # Converters shared by several element names.
    strong = _wrapper(_create_inline_converter(strong_em_symbol * 2))
    emphasis = _wrapper(_create_inline_converter(strong_em_symbol))
    inline_code = _wrapper(_create_inline_converter("`"))
    strikethrough = _wrapper(_create_inline_converter("~~"))
    list_container = _wrapper(_convert_list)
    discard = _wrapper(lambda _: "")

    return {
        "a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
        "b": strong,
        "blockquote": _wrapper(_convert_blockquote),
        "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
        "code": inline_code,
        "del": strikethrough,
        "em": emphasis,
        "h1": _heading(1),
        "h2": _heading(2),
        "h3": _heading(3),
        "h4": _heading(4),
        "h5": _heading(5),
        "h6": _heading(6),
        "hr": _wrapper(lambda _: "\n\n---\n\n"),
        "i": emphasis,
        "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
        "list": list_container,
        "ul": list_container,
        "ol": list_container,
        "li": _wrapper(partial(_convert_li, bullets=bullets)),
        "p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width)),
        "pre": _wrapper(
            partial(
                _convert_pre,
                code_language=code_language,
                code_language_callback=code_language_callback,
            )
        ),
        "script": discard,
        "style": discard,
        "s": strikethrough,
        "strong": strong,
        "samp": inline_code,
        "sub": _wrapper(_create_inline_converter(sub_symbol)),
        "sup": _wrapper(_create_inline_converter(sup_symbol)),
        "table": _wrapper(lambda text: f"\n\n{text}\n"),
        "caption": _wrapper(lambda text: f"{text}\n"),
        "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
        "td": _wrapper(_convert_td),
        "th": _wrapper(_convert_th),
        "tr": _wrapper(_convert_tr),
        "kbd": inline_code,
    }
@@ -0,0 +1,298 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Callable, Literal, cast
4
+
5
+ from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
6
+
7
+ from html_to_markdown.constants import (
8
+ ASTERISK,
9
+ SPACES,
10
+ UNDERLINED,
11
+ html_heading_re,
12
+ whitespace_re,
13
+ )
14
+ from html_to_markdown.converters import ConvertsMap, create_converters_map
15
+ from html_to_markdown.utils import escape
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import Iterable
19
+
20
+ from bs4 import PageElement
21
+
22
# Tag names that have an entry in the converters map.
SupportedTag = Literal[
    "a", "b", "blockquote", "br", "code", "del", "em",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "hr", "i", "img",
    "list", "ul", "ol", "li",
    "p", "pre", "script", "style",
    "s", "strong", "samp", "sub", "sup",
    "table", "caption", "figcaption",
    "td", "th", "tr",
    "kbd",
]
60
+
61
+
62
def _is_nested_tag(el: PageElement) -> bool:
    """Tell whether an element is a list or table structure tag.

    Args:
        el: The element to check.

    Returns:
        True for a tag named ol, ul, li, table, thead, tbody, tfoot, tr, td
        or th; False for any other tag and for non-tag elements.
    """
    if not isinstance(el, Tag):
        return False

    return el.name in {"ol", "ul", "li", "table", "thead", "tbody", "tfoot", "tr", "td", "th"}
75
+
76
+
77
def _process_tag(
    tag: Tag,
    *,
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Any], str] | None,
    convert: Iterable[str] | None,
    convert_as_inline: bool = False,
    converters_map: ConvertsMap | None = None,
    default_title: bool,
    escape_asterisks: bool,
    escape_misc: bool,
    escape_underscores: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strip: Iterable[str] | None,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> str:
    """Recursively convert a tag and its children to Markdown.

    The children are converted first and their output is concatenated; the
    result is then passed to the converter registered for the tag's name, if
    there is one and the strip / convert filters allow it.

    Args:
        tag: The tag to convert.
        autolinks: Passed to the converters map.
        bullets: Passed to the converters map.
        code_language: Passed to the converters map.
        code_language_callback: Passed to the converters map.
        convert: Tag names to convert; see ``_should_convert_tag``.
        convert_as_inline: Whether this tag is rendered inline.
        converters_map: The converters to use. Built from the other options
            when omitted and then reused for all descendants.
        default_title: Passed to the converters map.
        escape_asterisks: Whether to escape asterisks in text nodes.
        escape_misc: Whether to escape miscellaneous characters in text nodes.
        escape_underscores: Whether to escape underscores in text nodes.
        heading_style: Passed to the converters map.
        keep_inline_images_in: Passed to the converters map.
        newline_style: Passed to the converters map.
        strip: Tag names to strip; see ``_should_convert_tag``.
        strong_em_symbol: Passed to the converters map.
        sub_symbol: Passed to the converters map.
        sup_symbol: Passed to the converters map.
        wrap: Passed to the converters map.
        wrap_width: Passed to the converters map.

    Returns:
        The Markdown for the tag, or the plain converted children when the
        tag has no converter or is filtered out.
    """
    if converters_map is None:
        converters_map = create_converters_map(
            autolinks=autolinks,
            bullets=bullets,
            code_language=code_language,
            code_language_callback=code_language_callback,
            default_title=default_title,
            heading_style=heading_style,
            keep_inline_images_in=keep_inline_images_in,
            newline_style=newline_style,
            strong_em_symbol=strong_em_symbol,
            sub_symbol=sub_symbol,
            sup_symbol=sup_symbol,
            wrap=wrap,
            wrap_width=wrap_width,
        )

    # Children of headings and table cells are always rendered inline.
    is_heading = html_heading_re.match(tag.name) is not None
    is_cell = tag.name in {"td", "th"}
    convert_children_as_inline = convert_as_inline or is_heading or is_cell

    if _is_nested_tag(tag):
        # Iterate over a snapshot: extract() removes the node from the tag's
        # children, and removing from the live sequence while iterating it
        # would skip the node that follows each removal.
        for el in list(tag.children):
            can_extract = (
                not el.previous_sibling
                or not el.next_sibling
                or _is_nested_tag(el.previous_sibling)
                or _is_nested_tag(el.next_sibling)
            )
            if can_extract and isinstance(el, NavigableString) and not el.strip():
                el.extract()

    parts: list[str] = []
    for el in tag.children:
        if isinstance(el, (Comment, Doctype)):
            continue

        if isinstance(el, NavigableString):
            parts.append(
                _process_text(
                    el=el,
                    escape_misc=escape_misc,
                    escape_asterisks=escape_asterisks,
                    escape_underscores=escape_underscores,
                )
            )
        elif isinstance(el, Tag):
            parts.append(
                _process_tag(
                    tag=el,
                    convert_as_inline=convert_children_as_inline,
                    strip=strip,
                    convert=convert,
                    escape_misc=escape_misc,
                    escape_asterisks=escape_asterisks,
                    escape_underscores=escape_underscores,
                    converters_map=converters_map,
                    autolinks=autolinks,
                    bullets=bullets,
                    code_language=code_language,
                    code_language_callback=code_language_callback,
                    default_title=default_title,
                    heading_style=heading_style,
                    keep_inline_images_in=keep_inline_images_in,
                    newline_style=newline_style,
                    strong_em_symbol=strong_em_symbol,
                    sub_symbol=sub_symbol,
                    sup_symbol=sup_symbol,
                    wrap=wrap,
                    wrap_width=wrap_width,
                )
            )
    text = "".join(parts)

    # Use the same lower-cased name for the converter lookup and for the
    # strip / convert filter so that both treat the tag identically.
    tag_name = tag.name.lower()
    if tag_name in converters_map and _should_convert_tag(tag_name=tag_name, strip=strip, convert=convert):
        return converters_map[cast(SupportedTag, tag_name)](  # type: ignore[call-arg]
            tag=tag, text=text, convert_as_inline=convert_as_inline
        )

    return text
175
+
176
+
177
def _process_text(
    *,
    el: NavigableString,
    escape_misc: bool,
    escape_asterisks: bool,
    escape_underscores: bool,
) -> str:
    """Convert a text node to Markdown text.

    Args:
        el: The text node.
        escape_misc: Whether to escape miscellaneous characters.
        escape_asterisks: Whether to escape asterisks.
        escape_underscores: Whether to escape underscores.

    Returns:
        The text with whitespace normalized (outside pre), special characters
        escaped (outside pre, code, kbd and samp) and, at the end of a list
        item or right before an embedded list, trailing whitespace removed.
    """
    text = str(el)

    # normalize whitespace if we're not inside a preformatted element
    if not el.find_parent("pre"):
        text = whitespace_re.sub(" ", text)

    # escape special characters if we're not inside a preformatted or code element
    if not el.find_parent(["pre", "code", "kbd", "samp"]):
        text = escape(
            text=text,
            escape_misc=escape_misc,
            escape_asterisks=escape_asterisks,
            escape_underscores=escape_underscores,
        )

    # remove trailing whitespace when the text node sits in a list item and
    # is either its last node or is followed by an embedded list
    parent = el.parent
    if parent and parent.name == "li":
        follower = el.next_sibling
        if not follower or getattr(follower, "name", None) in {"ul", "ol"}:
            return text.rstrip()

    return text
210
+
211
+
212
+ def _should_convert_tag(*, tag_name: str, strip: Iterable[str] | None, convert: Iterable[str] | None) -> bool:
213
+ if strip is not None:
214
+ return tag_name not in strip
215
+ if convert is not None:
216
+ return tag_name in convert
217
+ return True
218
+
219
+
220
def convert_to_markdown(
    html: str,
    *,
    soup: BeautifulSoup | None = None,
    autolinks: bool = True,
    bullets: str = "*+-",
    code_language: str = "",
    code_language_callback: Callable[[Any], str] | None = None,
    convert: Iterable[str] | None = None,
    default_title: bool = False,
    escape_asterisks: bool = True,
    escape_misc: bool = True,
    escape_underscores: bool = True,
    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
    keep_inline_images_in: Iterable[str] | None = None,
    newline_style: Literal["spaces", "backslash"] = SPACES,
    strip: Iterable[str] | None = None,
    strong_em_symbol: Literal["*", "_"] = ASTERISK,
    sub_symbol: str = "",
    sup_symbol: str = "",
    wrap: bool = False,
    wrap_width: int = 80,
    convert_as_inline: bool = False,
) -> str:
    """Convert HTML to Markdown.

    Args:
        html: The HTML to convert. Only parsed when ``soup`` is not given.
        soup: An already parsed document to convert instead of ``html``.
        autolinks: Whether to use the ``<url>`` form for links whose text equals their href.
        bullets: The bullet characters to use for unordered lists.
        code_language: The default code language to use.
        code_language_callback: A callback function to determine the code language.
        convert: The only HTML elements to convert. Ignored when ``strip`` is given.
        default_title: Whether to use a link's href as its title when it has none.
        escape_asterisks: Whether to escape asterisks.
        escape_misc: Whether to escape miscellaneous characters.
        escape_underscores: Whether to escape underscores.
        heading_style: The style to use for headings.
        keep_inline_images_in: The tags to keep inline images in.
        newline_style: The style to use for newlines.
        strip: The HTML elements to leave unconverted.
        strong_em_symbol: The symbol to use for strong and emphasis.
        sub_symbol: The symbol to use for subscript.
        sup_symbol: The symbol to use for superscript.
        wrap: Whether to wrap text.
        wrap_width: The width to wrap text at.
        convert_as_inline: Whether to convert elements as inline.

    Returns:
        The Markdown.
    """
    document = soup
    if document is None:
        from bs4 import BeautifulSoup

        # Parse with the parser bundled with the standard library.
        document = BeautifulSoup(html, "html.parser")

    return _process_tag(
        tag=document,
        convert_as_inline=convert_as_inline,
        strip=strip,
        convert=convert,
        escape_asterisks=escape_asterisks,
        escape_misc=escape_misc,
        escape_underscores=escape_underscores,
        autolinks=autolinks,
        bullets=bullets,
        code_language=code_language,
        code_language_callback=code_language_callback,
        default_title=default_title,
        heading_style=heading_style,
        keep_inline_images_in=keep_inline_images_in,
        newline_style=newline_style,
        strong_em_symbol=strong_em_symbol,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        wrap=wrap,
        wrap_width=wrap_width,
    )
File without changes
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from html_to_markdown.constants import line_beginning_re
6
+
7
+
8
def chomp(text: str) -> tuple[str, str, str]:
    """Strip the text of an inline tag while remembering its outer spaces.

    A leading or trailing space of the text of a tag like b, a, or em is
    reported separately so that it can be emitted outside of the markup.

    Args:
        text: The text to chomp.

    Returns:
        A tuple containing the prefix, suffix, and the stripped text. The
        prefix and suffix are a single space or the empty string.
    """
    prefix = " " if text.startswith(" ") else ""
    suffix = " " if text.endswith(" ") else ""
    return prefix, suffix, text.strip()
22
+
23
+
24
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
    """Escape special characters in text.

    Args:
        text: The text to escape.
        escape_misc: Whether to escape miscellaneous characters.
        escape_asterisks: Whether to escape asterisks.
        escape_underscores: Whether to escape underscores.

    Returns:
        The escaped text.
    """
    if not text:
        return ""

    escaped = text
    if escape_misc:
        # Punctuation first (this covers the backslash itself), then the
        # "." or ")" that directly follows a digit.
        escaped = re.sub(r"([\\&<`[>~#=+|-])", r"\\\1", escaped)
        escaped = re.sub(r"([0-9])([.)])", r"\1\\\2", escaped)

    for enabled, char in ((escape_asterisks, "*"), (escape_underscores, "_")):
        if enabled:
            escaped = escaped.replace(char, "\\" + char)

    return escaped
46
+
47
+
48
def indent(*, text: str, level: int) -> str:
    """Indent text by a given level.

    Args:
        text: The text to indent.
        level: The level of indentation; one tab is inserted per level at
            the beginning of every line.

    Returns:
        The indented text, or "" for empty text.
    """
    if not text:
        return ""

    padding = "\t" * level
    return line_beginning_re.sub(padding, text)
59
+
60
+
61
def underline(*, text: str, pad_char: str) -> str:
    """Underline text with a given character.

    Args:
        text: The text to underline; trailing whitespace is removed first.
        pad_char: The character to use for underlining.

    Returns:
        The text, a line of ``pad_char`` of the same length and a blank
        line, or "" when there is no text.
    """
    heading = (text or "").rstrip()
    if not heading:
        return ""

    rule = pad_char * len(heading)
    return f"{heading}\n{rule}\n\n"
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.3
2
+ Name: html-to-markdown
3
+ Version: 1.0.0
4
+ Summary: Convert HTML to markdown
5
+ Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: beautifulsoup,converter,html,markdown,text-processing
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Text Processing
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Classifier: Topic :: Utilities
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: beautifulsoup4>=4.12.3
25
+ Description-Content-Type: text/markdown
26
+
27
+ # html_to_markdown
28
+
29
+ This library is a refactored and modernized fork of [markdownify](https://pypi.org/project/markdownify/), supporting
30
+ Python 3.9 and offering strong typing.
31
+
32
+ ### Differences from Markdownify
33
+
34
+ - The refactored codebase uses a strict functional approach - no classes are involved.
35
+ - There is full typing with strict MyPy adherence in place.
36
+ - The `convert_to_markdown` function allows passing a pre-configured instance of `BeautifulSoup`.
37
+ - This library's releases follow standard semver. Its version v1.0.0 was branched from markdownify's v0.13.1, at which
38
+ point versioning is no longer aligned.
39
+
40
+ ## Installation
41
+
42
+ ```shell
43
+ pip install html_to_markdown
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ Convert some HTML to Markdown:
49
+
50
+ ```python
51
+ from html_to_markdown import convert_to_markdown
52
+
53
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
54
+ ```
55
+
56
+ Specify tags to exclude:
57
+
58
+ ```python
59
+ from html_to_markdown import convert_to_markdown
60
+
61
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
62
+ ```
63
+
64
+ \...or specify the tags you want to include:
65
+
66
+ ```python
67
+ from html_to_markdown import convert_to_markdown
68
+
69
+ convert_to_markdown('<b>Yay</b> <a href="http://github.com">GitHub</a>', convert=['b']) # > '**Yay** GitHub'
70
+ ```
71
+
72
+ # Options
73
+
74
+ html_to_markdown supports the following options:
75
+
76
+ strip
77
+
78
+ : A list of tags to strip. This option can\'t be used with the
79
+ `convert` option.
80
+
81
+ convert
82
+
83
+ : A list of tags to convert. This option can\'t be used with the
84
+ `strip` option.
85
+
86
+ autolinks
87
+
88
+ : A boolean indicating whether the \"automatic link\" style should be
89
+ used when a `a` tag\'s contents match its href. Defaults to `True`.
90
+
91
+ default_title
92
+
93
+ : A boolean to enable setting the title of a link to its href, if no
94
+ title is given. Defaults to `False`.
95
+
96
+ heading_style
97
+
98
+ : Defines how headings should be converted. Accepted values are `ATX`,
99
+ `ATX_CLOSED`, `SETEXT`, and `UNDERLINED` (which is an alias for
100
+ `SETEXT`). Defaults to `UNDERLINED`.
101
+
102
+ bullets
103
+
104
+ : An iterable (string, list, or tuple) of bullet styles to be used. If
105
+ the iterable only contains one item, it will be used regardless of
106
+ how deeply lists are nested. Otherwise, the bullet will alternate
107
+ based on nesting level. Defaults to `'*+-'`.
108
+
109
+ strong_em_symbol
110
+
111
+ : In markdown, both `*` and `_` are used to encode **strong** or
112
+ *emphasized* texts. Either of these symbols can be chosen by the
113
+ options `ASTERISK` (default) or `UNDERSCORE` respectively.
114
+
115
+ sub_symbol, sup_symbol
116
+
117
+ : Define the chars that surround `<sub>` and `<sup>` text. Defaults to
118
+ an empty string, because this is non-standard behavior. Could be
119
+ something like `~` and `^` to result in `~sub~` and `^sup^`. If the
120
+ value starts with `<` and ends with `>`, it is treated as an HTML
121
+ tag and a `/` is inserted after the `<` in the string used after the
122
+ text; this allows specifying `<sub>` to use raw HTML in the output
123
+ for subscripts, for example.
124
+
125
+ newline_style
126
+
127
+ : Defines the style of marking linebreaks (`<br>`) in markdown. The
128
+ default value `SPACES` of this option will adopt the usual two
129
+ spaces and a newline, while `BACKSLASH` will convert a linebreak to
130
+ `\\n` (a backslash and a newline). While the latter convention is
131
+ non-standard, it is commonly preferred and supported by a lot of
132
+ interpreters.
133
+
134
+ code_language
135
+
136
+ : Defines the language that should be assumed for all `<pre>`
137
+ sections. Useful, if all code on a page is in the same programming
138
+ language and should be annotated with `python` or
139
+ similar. Defaults to `''` (empty
140
+ string) and can be any string.
141
+
142
+ code_language_callback
143
+
144
+ : When the HTML code contains `pre` tags that in some way provide the
145
+ code language, for example as class, this callback can be used to
146
+ extract the language from the tag and prefix it to the converted
147
+ `pre` tag. The callback gets one single argument, a BeautifulSoup
148
+ object, and returns a string containing the code language, or
149
+ `None`. An example to use the class name as code language could be:
150
+
151
+ def callback(el):
152
+ return el['class'][0] if el.has_attr('class') else None
153
+
154
+ Defaults to `None`.
155
+
156
+ escape_asterisks
157
+
158
+ : If set to `False`, do not escape `*` to `\*` in text. Defaults to
159
+ `True`.
160
+
161
+ escape_underscores
162
+
163
+ : If set to `False`, do not escape `_` to `\_` in text. Defaults to
164
+ `True`.
165
+
166
+ escape_misc
167
+
168
+ : If set to `False`, do not escape miscellaneous punctuation
169
+ characters that sometimes have Markdown significance in text.
170
+ Defaults to `True`.
171
+
172
+ keep_inline_images_in
173
+
174
+ : Images are converted to their alt-text when the images are located
175
+ inside headlines or table cells. If some inline images should be
176
+ converted to markdown images instead, this option can be set to a
177
+ list of parent tags that should be allowed to contain inline images,
178
+ for example `['td']`. Defaults to an empty list.
179
+
180
+ wrap, wrap_width
181
+
182
+ : If `wrap` is set to `True`, all text paragraphs are wrapped at
183
+ `wrap_width` characters. Defaults to `False` and `80`. Use with
184
+ `newline_style=BACKSLASH` to keep line breaks in paragraphs.
185
+
186
+ Options may be specified as keyword arguments to the `convert_to_markdown`
187
+ function.
188
+
189
+ # CLI
190
+
191
+ Use `html_to_markdown example.html > example.md` or pipe input from stdin
192
+ (`cat example.html | html_to_markdown > example.md`). Call `html_to_markdown -h`
193
+ to see all available options. They are the same as listed above and take
194
+ the same arguments.
@@ -0,0 +1,12 @@
1
+ html_to_markdown/__init__.py,sha256=_WXeqic-7b6hvidTXkPQwAfLa4YOEAEP-mOUXjx_25k,95
2
+ html_to_markdown/__main__.py,sha256=Wll22XKFmiNSIpdbGzC75b5_Unc3HYOTA6oXA414Tl8,4412
3
+ html_to_markdown/constants.py,sha256=vUjffZ0vFq56jbXF5bBNzomfJwgsp0TWqdUzhkp6bks,687
4
+ html_to_markdown/converters.py,sha256=q1wpzsYl-FRR9qbB983gAkem_-7mgYZ7hOgziofjIDM,12238
5
+ html_to_markdown/processing.py,sha256=9l3zq_kdyvU0TnTk5g4uuYI6Jbu1gY7NQ11u3IBKyFU,9029
6
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
8
+ html_to_markdown-1.0.0.dist-info/METADATA,sha256=0MObULuhTHiyvVcytDBN_liafpyFjgp5brgoWQYEglA,6478
9
+ html_to_markdown-1.0.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
10
+ html_to_markdown-1.0.0.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
11
+ html_to_markdown-1.0.0.dist-info/licenses/LICENSE,sha256=06BS7zd6oPCrbzAqrThGFboRlbssgBsqDJGqKyZW2Og,1117
12
+ html_to_markdown-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.25.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ html_to_markdown = html_to_markdown.__main__:cli
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2012-2018 Matthew Tretter
4
+ Copyright 2024 Na'aman Hirschfeld
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.