html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,303 @@
1
+ """Whitespace handling module for HTML to Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import unicodedata
7
+ from typing import TYPE_CHECKING, Literal
8
+
9
+ if TYPE_CHECKING:
10
+ from bs4 import NavigableString, PageElement, Tag
11
+
12
+
13
# Accepted values for the ``mode`` argument of ``WhitespaceHandler``.
WhitespaceMode = Literal["normalized", "strict"]

# Tag names the whitespace handler treats as block-level
# (consulted by ``WhitespaceHandler.is_block_element``). Alphabetical order.
BLOCK_ELEMENTS = {
    "address", "article", "aside", "blockquote", "canvas", "datalist",
    "dd", "details", "div", "dl", "dt", "fieldset",
    "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "hr", "legend", "li", "main", "nav",
    "noscript", "ol", "option", "p", "pre", "section",
    "summary", "table", "tfoot", "ul"
}

# Text inside any of these tags (at any depth) is returned untouched
# (consulted by ``WhitespaceHandler.should_preserve_whitespace``).
PRESERVE_WHITESPACE_ELEMENTS = {"pre", "script", "style"}

# Tag names the whitespace handler treats as inline
# (consulted by ``WhitespaceHandler.is_inline_element``). Alphabetical order.
INLINE_ELEMENTS = {
    "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button",
    "cite", "code", "data", "del", "dfn", "dialog", "em",
    "i", "iframe", "img", "input", "ins", "kbd", "label",
    "map", "mark", "math", "menu", "meter", "object", "output", "progress",
    "q", "rb", "rp", "rt", "rtc", "ruby",
    "s", "samp", "script", "select", "small", "span", "strike", "strong",
    "style", "sub", "sup", "svg", "textarea", "time", "tt",
    "u", "var", "video", "wbr"
}
118
+
119
+
120
class WhitespaceHandler:
    """Apply whitespace rules to text nodes during HTML to Markdown conversion.

    In ``"strict"`` mode text is returned unchanged and no block spacing is
    produced. In ``"normalized"`` mode Unicode space characters become plain
    spaces, runs of spaces/tabs collapse to one space, and leading/trailing
    whitespace is kept or dropped depending on the neighbouring elements.
    """

    # Unicode general categories (space, line and paragraph separators) that
    # are replaced by a plain space during normalization.
    _SPACE_CATEGORIES = frozenset({"Zs", "Zl", "Zp"})
    # Parents whose text keeps only a literal leading/trailing space.
    _SPECIAL_INLINE_CONTAINERS = frozenset({"ruby", "select", "datalist"})
    # Tags followed by "\n\n" before a block sibling, otherwise by "\n".
    _DOUBLE_NEWLINE_ELEMENTS = frozenset({"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"})
    # Tags always followed by a single "\n".
    _SINGLE_NEWLINE_ELEMENTS = frozenset({"li", "dt", "dd", "tr", "td", "th"})

    def __init__(self, mode: WhitespaceMode = "normalized") -> None:
        """Store the whitespace mode and precompile the regular expressions.

        Args:
            mode: ``"normalized"`` (default) or ``"strict"``.
        """
        self.mode = mode
        self._multiple_spaces = re.compile(r"[ \t]+")
        # The next two patterns are not referenced by any method of this
        # class; they are kept so existing attribute access keeps working.
        self._multiple_newlines = re.compile(r"\n{2,}")
        self._leading_trailing_space = re.compile(r"^[ \t]+|[ \t]+$", re.MULTILINE)
        self._unicode_spaces = re.compile(r"[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]")

    def normalize_unicode_spaces(self, text: str) -> str:
        """Return *text* with Unicode spaces and legacy line endings normalized.

        ``"\\r\\n"`` and a lone ``"\\r"`` each become a single ``"\\n"``; every
        character whose Unicode category is ``Zs``, ``Zl`` or ``Zp`` becomes
        a plain space. All other characters (including tabs and newlines)
        are left as they are.
        """
        # Handle CRLF as a unit *before* lone CR; a per-character scan can
        # never see the two-character sequence and would emit "\n\n" for it.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = self._unicode_spaces.sub(" ", text)

        space_categories = self._SPACE_CATEGORIES
        return "".join(" " if unicodedata.category(char) in space_categories else char for char in text)

    def should_preserve_whitespace(self, element: PageElement) -> bool:
        """Tell whether the text of *element* must be left untouched.

        Returns True in strict mode, or when *element* or any of its
        ancestors has a name listed in ``PRESERVE_WHITESPACE_ELEMENTS``.
        """
        if self.mode == "strict":
            return True

        current: PageElement | None = element
        # Identity check rather than truthiness, so a falsy node (such as an
        # empty string) does not end the ancestor walk early.
        while current is not None:
            if getattr(current, "name", None) in PRESERVE_WHITESPACE_ELEMENTS:
                return True
            current = getattr(current, "parent", None)

        return False

    def is_block_element(self, element: PageElement | None) -> bool:
        """Return True when *element* has a name listed in ``BLOCK_ELEMENTS``.

        ``None`` and nodes without a ``name`` attribute yield False.
        """
        if element is None:
            return False
        return getattr(element, "name", None) in BLOCK_ELEMENTS

    def is_inline_element(self, element: PageElement | None) -> bool:
        """Return True when *element* has a name listed in ``INLINE_ELEMENTS``.

        ``None`` and nodes without a ``name`` attribute yield False.
        """
        if element is None:
            return False
        return getattr(element, "name", None) in INLINE_ELEMENTS

    def process_text_whitespace(
        self,
        text: str,
        element: NavigableString,
        *,
        in_pre: bool = False,
    ) -> str:
        """Apply the configured whitespace rules to one text node.

        Args:
            text: The text to process.
            element: The node the text belongs to; its parent and siblings
                decide which surrounding spaces survive.
            in_pre: When True the text is returned unchanged.

        Returns:
            ``""`` for empty input; *text* unchanged when whitespace must be
            preserved (``in_pre``, strict mode, or a preserving ancestor);
            otherwise the normalized text.
        """
        if not text:
            return ""

        # should_preserve_whitespace() already returns True in strict mode,
        # so no separate strict-mode check is needed afterwards.
        if in_pre or self.should_preserve_whitespace(element):
            return text

        return self._process_normalized(self.normalize_unicode_spaces(text), element)

    def _process_normalized(self, text: str, element: NavigableString) -> str:
        """Dispatch to the whitespace-only or the text-with-content handler."""
        if text.strip():
            return self._process_text_with_content(text, element)
        return self._process_whitespace_only(text, element)

    def _process_whitespace_only(self, text: str, element: NavigableString) -> str:
        """Reduce a whitespace-only node to ``" "`` or ``""``.

        A single space is kept only when the text contains no newline, the
        node is not sandwiched between two block elements, and at least one
        direct sibling is an inline element.
        """
        if "\n" in text:
            return ""

        prev_sibling = element.previous_sibling
        next_sibling = element.next_sibling

        if self.is_block_element(prev_sibling) and self.is_block_element(next_sibling):
            return ""

        if self.is_inline_element(prev_sibling) or self.is_inline_element(next_sibling):
            return " "

        return ""

    def _process_text_with_content(self, text: str, element: NavigableString) -> str:
        """Collapse inner whitespace, then restore edge spaces by context.

        The edges are judged on ``str(element)`` (the raw node text), while
        the returned content is built from the already-normalized *text*.
        """
        original = str(element)

        # Slicing keeps these checks safe even if the raw text is empty.
        has_lead_space = original[:1] in (" ", "\t", "\n")
        has_trail_space = original[-1:] in (" ", "\t", "\n")

        text = self._multiple_spaces.sub(" ", text.strip())

        parent = element.parent
        if parent is None:
            return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)

        if getattr(parent, "name", None) in self._SPECIAL_INLINE_CONTAINERS:
            return self._process_special_inline_containers(text, original)

        if self.is_inline_element(parent):
            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)

        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)

    def _process_special_inline_containers(self, text: str, original: str) -> str:
        """Re-attach literal edge spaces for text inside ruby/select/datalist.

        Edge spaces are restored only when *original* contains neither a
        newline nor a tab anywhere.
        """
        if not original or "\n" in original or "\t" in original:
            return text

        if original[0] == " ":
            text = " " + text
        if original[-1] == " ":
            text = text + " "
        return text

    def _process_inline_element_text(
        self, text: str, original: str, has_lead_space: bool, has_trail_space: bool
    ) -> str:
        """Re-attach a literal leading/trailing space for text in an inline parent.

        Only a plain space at the edge of *original* is restored; an edge tab
        or newline is dropped.
        """
        prefix = " " if has_lead_space and original[0] == " " else ""
        suffix = " " if has_trail_space and original[-1] == " " else ""
        return prefix + text + suffix

    def _process_standalone_text(
        self, text: str, original: str, element: NavigableString, has_lead_space: bool, has_trail_space: bool
    ) -> str:
        """Restore edge spacing for text whose parent is not an inline element.

        Per edge of *original*:

        * a newline or tab becomes one space only when the sibling on that
          side is an inline element;
        * a plain space is kept when the sibling on that side is an inline
          element, a block element, or missing.

        ``"\\n\\n"`` is appended when *original* holds at least two newlines,
        the next sibling is a block element, and the previous sibling is an
        inline element or missing.
        """
        prev_sibling = element.previous_sibling
        next_sibling = element.next_sibling
        first, last = original[:1], original[-1:]

        blank_line_before_block = bool(
            original.count("\n") >= 2
            and self.is_block_element(next_sibling)
            and text.strip()
            and (prev_sibling is None or self.is_inline_element(prev_sibling))
        )

        if first in ("\n", "\t"):
            add_leading = self.is_inline_element(prev_sibling)
        else:
            add_leading = bool(
                has_lead_space
                and first == " "
                and (
                    prev_sibling is None
                    or self.is_inline_element(prev_sibling)
                    or self.is_block_element(prev_sibling)
                )
            )

        if last in ("\n", "\t"):
            add_trailing = self.is_inline_element(next_sibling)
        else:
            add_trailing = bool(
                has_trail_space
                and last == " "
                and (
                    next_sibling is None
                    or self.is_inline_element(next_sibling)
                    or self.is_block_element(next_sibling)
                )
            )

        if add_leading:
            text = " " + text
        if add_trailing:
            text = text + " "
        if blank_line_before_block:
            text = text + "\n\n"

        return text

    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
        """Return the newline(s) to emit after *tag*.

        Args:
            tag: The element just rendered; its name is compared
                case-insensitively.
            next_sibling: The element that follows *tag*, if any.

        Returns:
            ``""`` in strict mode or for tags not listed below; ``"\\n\\n"``
            after a heading (``h`` plus one digit) and after p/div/
            blockquote/pre/table/ul/ol/dl when a block element follows;
            ``"\\n"`` after those same tags otherwise and after
            li/dt/dd/tr/td/th.
        """
        if self.mode == "strict":
            return ""

        # ``or ""`` also covers a node whose name is None.
        tag_name = (getattr(tag, "name", None) or "").lower()

        if tag_name in self._DOUBLE_NEWLINE_ELEMENTS:
            return "\n\n" if self.is_block_element(next_sibling) else "\n"
        if tag_name in self._SINGLE_NEWLINE_ELEMENTS:
            return "\n"
        if len(tag_name) == 2 and tag_name.startswith("h") and tag_name[1].isdigit():
            return "\n\n"

        return ""