html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +31 -502
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -181
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +195 -203
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.1.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/utils.py
CHANGED
|
@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def chomp(text: str) -> tuple[str, str, str]:
|
|
9
|
-
"""Simplified whitespace handling for inline elements.
|
|
10
|
-
|
|
11
|
-
For semantic markdown output, preserves leading/trailing spaces as single spaces
|
|
12
|
-
and normalizes internal whitespace.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
text: The text to chomp.
|
|
16
|
-
|
|
17
|
-
Returns:
|
|
18
|
-
A tuple containing the prefix, suffix, and the normalized text.
|
|
19
|
-
"""
|
|
20
9
|
if not text:
|
|
21
10
|
return "", "", ""
|
|
22
11
|
|
|
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
29
18
|
|
|
30
19
|
|
|
31
20
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
|
32
|
-
"""Escape special characters in text.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
text: The text to escape.
|
|
36
|
-
escape_misc: Whether to escape miscellaneous characters.
|
|
37
|
-
escape_asterisks: Whether to escape asterisks.
|
|
38
|
-
escape_underscores: Whether to escape underscores.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
The escaped text.
|
|
42
|
-
"""
|
|
43
21
|
if not text:
|
|
44
22
|
return ""
|
|
45
23
|
if escape_misc:
|
|
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
|
|
|
52
30
|
return text
|
|
53
31
|
|
|
54
32
|
|
|
55
|
-
def indent(*, text: str, level: int) -> str:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
text: The text to indent.
|
|
60
|
-
level: The level of indentation.
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
The indented text.
|
|
64
|
-
"""
|
|
65
|
-
return line_beginning_re.sub("\t" * level, text) if text else ""
|
|
33
|
+
def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
    """Prefix lines of ``text`` with ``indent_str`` repeated ``level`` times.

    The insertion points are whatever ``line_beginning_re`` matches
    (presumably the start of every line -- the pattern lives in
    ``html_to_markdown.constants``; confirm there).

    Args:
        text: The text to indent.
        level: How many copies of ``indent_str`` to insert at each match.
        indent_str: The unit of indentation; a single tab by default.

    Returns:
        The indented text, or an empty string when ``text`` is empty.
    """
    if not text:
        return ""

    prefix = indent_str * level
    return line_beginning_re.sub(prefix, text)
|
|
66
35
|
|
|
67
36
|
|
|
68
37
|
def underline(*, text: str, pad_char: str) -> str:
    """Return ``text`` followed by a rule built from ``pad_char``.

    Trailing whitespace is stripped from ``text`` first; the rule repeats
    ``pad_char`` once per remaining character and the result ends with a
    blank line.

    Args:
        text: The text to underline.
        pad_char: The string repeated to form the underline.

    Returns:
        ``"<text>\\n<rule>\\n\\n"``, or an empty string when nothing is left
        after stripping.
    """
    stripped = text.rstrip() if text else ""
    if not stripped:
        return ""

    rule = pad_char * len(stripped)
    return f"{stripped}\n{rule}\n\n"
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""Whitespace handling module for HTML to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import unicodedata
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from bs4 import NavigableString, PageElement, Tag
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# The two whitespace-handling modes accepted by WhitespaceHandler.
WhitespaceMode = Literal["normalized", "strict"]


# Tag names that WhitespaceHandler.is_block_element() reports as block-level.
BLOCK_ELEMENTS = {
    # Sectioning and page structure
    "address", "article", "aside", "footer", "header", "main", "nav", "section",
    # Headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # Text blocks and generic containers
    "blockquote", "div", "hr", "p", "pre",
    # Lists
    "dd", "dl", "dt", "li", "ol", "ul",
    # Figures, disclosure widgets and embedded content
    "canvas", "details", "figcaption", "figure", "noscript", "summary",
    # Forms
    "datalist", "fieldset", "form", "legend", "option",
    # Tables
    "table", "tfoot",
}
|
|
56
|
+
|
|
57
|
+
# Tag names whose text (and that of their descendants) keeps its whitespace
# untouched; consulted by WhitespaceHandler.should_preserve_whitespace().
PRESERVE_WHITESPACE_ELEMENTS = {"pre", "script", "style"}

# Tag names that WhitespaceHandler.is_inline_element() reports as inline.
INLINE_ELEMENTS = {
    # Links and phrasing
    "a", "abbr", "acronym", "bdi", "bdo", "cite", "code", "data", "dfn", "kbd", "q", "samp", "span", "time", "var",
    # Emphasis and typographic styling
    "b", "big", "em", "i", "small", "strong", "sub", "sup", "tt", "u",
    # Edits and highlights
    "del", "ins", "mark", "s", "strike",
    # Ruby annotations
    "rb", "rp", "rt", "rtc", "ruby",
    # Line breaking
    "br", "wbr",
    # Form controls and widgets
    "button", "dialog", "input", "label", "menu", "meter", "output", "progress", "select", "textarea",
    # Embedded content and media
    "audio", "iframe", "img", "map", "math", "object", "svg", "video",
    # Script and style containers
    "script", "style",
}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class WhitespaceHandler:
    """Rewrite the whitespace of HTML text nodes for Markdown output.

    Two modes are supported:

    * ``"normalized"`` (default): runs of spaces/tabs are collapsed and the
      leading/trailing whitespace of a text node is kept or dropped depending
      on its parent and sibling elements.
    * ``"strict"``: text is returned unchanged apart from Unicode-space and
      line-ending normalization, and no block spacing is produced.
    """

    # Unicode general categories treated as whitespace: space, line and paragraph separators.
    _SPACE_CATEGORIES = ("Zs", "Zl", "Zp")
    # Parents whose text keeps edge spaces only when the raw text has no newline/tab at all.
    _SPECIAL_INLINE_CONTAINERS = frozenset({"ruby", "select", "datalist"})
    _DOUBLE_NEWLINE_ELEMENTS = frozenset({"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"})
    _SINGLE_NEWLINE_ELEMENTS = frozenset({"li", "dt", "dd", "tr", "td", "th"})

    def __init__(self, mode: WhitespaceMode = "normalized") -> None:
        """Store the mode and compile the patterns used by this handler.

        Args:
            mode: Either ``"normalized"`` or ``"strict"``.
        """
        self.mode = mode
        self._multiple_spaces = re.compile(r"[ \t]+")
        # The next two patterns are not used by the methods below; they are kept
        # because code outside this class may read them (NOTE(review): confirm).
        self._multiple_newlines = re.compile(r"\n{2,}")
        self._leading_trailing_space = re.compile(r"^[ \t]+|[ \t]+$", re.MULTILINE)
        self._unicode_spaces = re.compile(r"[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]")

    def normalize_unicode_spaces(self, text: str) -> str:
        """Map Unicode separators to plain spaces and CR/CRLF to ``"\\n"``.

        Args:
            text: The raw text.

        Returns:
            ``text`` with every character of category Zs, Zl or Zp replaced by
            ``" "``, each ``"\\r\\n"`` pair replaced by a single ``"\\n"`` and
            each remaining ``"\\r"`` replaced by ``"\\n"``.
        """
        # CRLF must be folded as a pair *before* lone CRs are handled; a
        # character-by-character scan can never see "\r\n" as one unit and
        # would turn every Windows line ending into two newlines.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = self._unicode_spaces.sub(" ", text)

        # The only ASCII character in the separator categories is " " itself,
        # so pure-ASCII text needs no per-character category lookup.
        if text.isascii():
            return text

        categories = self._SPACE_CATEGORIES
        return "".join(" " if unicodedata.category(char) in categories else char for char in text)

    def should_preserve_whitespace(self, element: PageElement) -> bool:
        """Tell whether the text of ``element`` must be left untouched.

        Returns:
            ``True`` in strict mode, or when ``element`` or any of its
            ancestors (followed through ``.parent``) has a name listed in
            ``PRESERVE_WHITESPACE_ELEMENTS``; ``False`` otherwise.
        """
        if self.mode == "strict":
            return True

        current: PageElement | None = element
        # Compare against None explicitly: a text node is a string, and an
        # empty one is falsy, which would end the ancestor walk immediately.
        while current is not None:
            if hasattr(current, "name") and current.name in PRESERVE_WHITESPACE_ELEMENTS:
                return True
            current = getattr(current, "parent", None)

        return False

    def is_block_element(self, element: PageElement | None) -> bool:
        """Return ``True`` when ``element`` has a name listed in ``BLOCK_ELEMENTS``."""
        if not element or not hasattr(element, "name"):
            return False
        return element.name in BLOCK_ELEMENTS

    def is_inline_element(self, element: PageElement | None) -> bool:
        """Return ``True`` when ``element`` has a name listed in ``INLINE_ELEMENTS``."""
        if not element or not hasattr(element, "name"):
            return False
        return element.name in INLINE_ELEMENTS

    def process_text_whitespace(
        self,
        text: str,
        element: NavigableString,
        *,
        in_pre: bool = False,
    ) -> str:
        """Normalize the whitespace of one text node according to the mode.

        Args:
            text: The text to process.
            element: The node the text belongs to; its parent and siblings
                decide which edge spaces survive.
            in_pre: When true, the text is returned after Unicode/line-ending
                normalization only.

        Returns:
            The processed text (``""`` for empty input).
        """
        if not text:
            return ""

        text = self.normalize_unicode_spaces(text)

        # should_preserve_whitespace() is always True in strict mode, so strict
        # text never reaches the normalizing branch below.
        if in_pre or self.should_preserve_whitespace(element):
            return text

        return self._process_normalized(text, element)

    def _process_normalized(self, text: str, element: NavigableString) -> str:
        """Dispatch on whether ``text`` is whitespace-only or carries content."""
        if not text.strip():
            return self._process_whitespace_only(text, element)

        return self._process_text_with_content(text, element)

    def _process_whitespace_only(self, text: str, element: NavigableString) -> str:
        """Reduce a whitespace-only node to ``" "`` or ``""``.

        A single space is kept only when the text contains no newline, the node
        is not sandwiched between two block elements, and at least one sibling
        is an inline element.
        """
        prev_sibling = element.previous_sibling
        next_sibling = element.next_sibling

        if self.is_block_element(prev_sibling) and self.is_block_element(next_sibling):
            return ""

        if "\n" in text:
            return ""

        if self.is_inline_element(prev_sibling) or self.is_inline_element(next_sibling):
            return " "

        return ""

    def _process_text_with_content(self, text: str, element: NavigableString) -> str:
        """Collapse inner space/tab runs and decide which edge spaces to keep.

        The edge decision is delegated according to the parent element:
        ruby/select/datalist containers, other inline parents, or anything else.
        """
        original = str(element)

        has_lead_space = bool(original) and original[0] in " \t\n"
        has_trail_space = bool(original) and original[-1] in " \t\n"

        text = self._multiple_spaces.sub(" ", text.strip())

        parent = element.parent

        if parent and hasattr(parent, "name") and parent.name in self._SPECIAL_INLINE_CONTAINERS:
            return self._process_special_inline_containers(text, original)

        if parent and self.is_inline_element(parent):
            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)

        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)

    def _process_special_inline_containers(self, text: str, original: str) -> str:
        """Re-attach edge spaces only if ``original`` holds no newline or tab."""
        if original and "\n" not in original and "\t" not in original:
            if original[0] == " ":
                text = " " + text
            if original[-1] == " ":
                text = text + " "
        return text

    def _process_inline_element_text(
        self, text: str, original: str, has_lead_space: bool, has_trail_space: bool
    ) -> str:
        """Re-attach an edge space when ``original`` starts/ends with a literal space."""
        if has_lead_space and original[0] == " ":
            text = " " + text
        if has_trail_space and original[-1] == " ":
            text = text + " "
        return text

    def _borders_element_or_edge(self, sibling: PageElement | None) -> bool:
        """Return ``True`` when ``sibling`` is absent, inline, or block-level."""
        return sibling is None or self.is_inline_element(sibling) or self.is_block_element(sibling)

    def _process_standalone_text(
        self, text: str, original: str, element: NavigableString, has_lead_space: bool, has_trail_space: bool
    ) -> str:
        """Decide edge spaces for text whose parent is not an inline element.

        For each edge of ``original``:

        * a newline or tab becomes a space only when the adjacent sibling is an
          inline element;
        * a literal space is kept when the adjacent sibling is missing, inline,
          or block-level (i.e. not another text-like node).
        """
        prev_sibling = element.previous_sibling
        next_sibling = element.next_sibling

        # Slices instead of indexing so an empty ``original`` yields "".
        first = original[:1]
        last = original[-1:]

        if first and first in "\n\t":
            add_leading = self.is_inline_element(prev_sibling)
        else:
            add_leading = has_lead_space and first == " " and self._borders_element_or_edge(prev_sibling)

        if last and last in "\n\t":
            add_trailing = self.is_inline_element(next_sibling)
        else:
            add_trailing = has_trail_space and last == " " and self._borders_element_or_edge(next_sibling)

        if add_leading:
            text = " " + text
        if add_trailing:
            text = text + " "

        return text

    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
        """Return the newline spacing associated with ``tag``.

        Args:
            tag: The element to look up (its ``name`` is lower-cased).
            next_sibling: The element following ``tag``, if any.

        Returns:
            ``""`` in strict mode. Otherwise ``"\\n\\n"`` for p/div/blockquote/
            pre/table/ul/ol/dl followed by a block element (``"\\n"`` if not),
            ``"\\n"`` for li/dt/dd/tr/td/th, ``"\\n\\n"`` for any two-character
            name starting with ``h``, and ``""`` for everything else.
        """
        if self.mode == "strict":
            return ""

        tag_name = tag.name.lower() if hasattr(tag, "name") else ""

        if tag_name in self._DOUBLE_NEWLINE_ELEMENTS:
            return "\n\n" if self.is_block_element(next_sibling) else "\n"

        if tag_name in self._SINGLE_NEWLINE_ELEMENTS:
            return "\n"

        # NOTE(review): intended for h1-h6, but this test also matches "hr"
        # (and any other two-letter name starting with "h"). Left unchanged
        # because callers may rely on the spacing returned for <hr>.
        if tag_name.startswith("h") and len(tag_name) == 2:
            return "\n\n"

        return ""
|