html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/cli.py +28 -2
- html_to_markdown/converters.py +208 -130
- html_to_markdown/exceptions.py +5 -0
- html_to_markdown/preprocessor.py +96 -86
- html_to_markdown/processing.py +63 -48
- html_to_markdown/utils.py +1 -3
- html_to_markdown/whitespace.py +23 -33
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/METADATA +143 -2
- html_to_markdown-1.12.1.dist-info/RECORD +17 -0
- html_to_markdown-1.11.0.dist-info/RECORD +0 -17
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/top_level.txt +0 -0
html_to_markdown/preprocessor.py
CHANGED
|
@@ -5,6 +5,98 @@ from typing import Any
|
|
|
5
5
|
|
|
6
6
|
import nh3
|
|
7
7
|
|
|
8
|
+
BASE_ALLOWED_TAGS = frozenset(
|
|
9
|
+
{
|
|
10
|
+
"p",
|
|
11
|
+
"div",
|
|
12
|
+
"span",
|
|
13
|
+
"br",
|
|
14
|
+
"hr",
|
|
15
|
+
"h1",
|
|
16
|
+
"h2",
|
|
17
|
+
"h3",
|
|
18
|
+
"h4",
|
|
19
|
+
"h5",
|
|
20
|
+
"h6",
|
|
21
|
+
"ul",
|
|
22
|
+
"ol",
|
|
23
|
+
"li",
|
|
24
|
+
"dl",
|
|
25
|
+
"dt",
|
|
26
|
+
"dd",
|
|
27
|
+
"strong",
|
|
28
|
+
"b",
|
|
29
|
+
"em",
|
|
30
|
+
"i",
|
|
31
|
+
"u",
|
|
32
|
+
"s",
|
|
33
|
+
"del",
|
|
34
|
+
"ins",
|
|
35
|
+
"mark",
|
|
36
|
+
"small",
|
|
37
|
+
"sub",
|
|
38
|
+
"sup",
|
|
39
|
+
"code",
|
|
40
|
+
"pre",
|
|
41
|
+
"kbd",
|
|
42
|
+
"samp",
|
|
43
|
+
"var",
|
|
44
|
+
"abbr",
|
|
45
|
+
"cite",
|
|
46
|
+
"dfn",
|
|
47
|
+
"time",
|
|
48
|
+
"data",
|
|
49
|
+
"a",
|
|
50
|
+
"blockquote",
|
|
51
|
+
"q",
|
|
52
|
+
}
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
SEMANTIC_STRUCTURE_TAGS = frozenset(
|
|
56
|
+
{
|
|
57
|
+
"article",
|
|
58
|
+
"section",
|
|
59
|
+
"aside",
|
|
60
|
+
"header",
|
|
61
|
+
"footer",
|
|
62
|
+
"main",
|
|
63
|
+
"nav",
|
|
64
|
+
"figure",
|
|
65
|
+
"figcaption",
|
|
66
|
+
"details",
|
|
67
|
+
"summary",
|
|
68
|
+
}
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
TABLE_TAGS = frozenset(
|
|
72
|
+
{
|
|
73
|
+
"table",
|
|
74
|
+
"thead",
|
|
75
|
+
"tbody",
|
|
76
|
+
"tfoot",
|
|
77
|
+
"tr",
|
|
78
|
+
"td",
|
|
79
|
+
"th",
|
|
80
|
+
"caption",
|
|
81
|
+
"colgroup",
|
|
82
|
+
"col",
|
|
83
|
+
}
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
MEDIA_TAGS = frozenset(
|
|
87
|
+
{
|
|
88
|
+
"img",
|
|
89
|
+
"picture",
|
|
90
|
+
"source",
|
|
91
|
+
"audio",
|
|
92
|
+
"video",
|
|
93
|
+
"track",
|
|
94
|
+
"canvas",
|
|
95
|
+
"svg",
|
|
96
|
+
"iframe",
|
|
97
|
+
}
|
|
98
|
+
)
|
|
99
|
+
|
|
8
100
|
|
|
9
101
|
def preprocess_html(
|
|
10
102
|
html: str,
|
|
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
|
|
|
63
155
|
custom_tags_to_remove: set[str],
|
|
64
156
|
custom_attributes_to_remove: set[str],
|
|
65
157
|
) -> dict[str, Any]:
|
|
66
|
-
allowed_tags = {
|
|
67
|
-
"p",
|
|
68
|
-
"div",
|
|
69
|
-
"span",
|
|
70
|
-
"br",
|
|
71
|
-
"hr",
|
|
72
|
-
"h1",
|
|
73
|
-
"h2",
|
|
74
|
-
"h3",
|
|
75
|
-
"h4",
|
|
76
|
-
"h5",
|
|
77
|
-
"h6",
|
|
78
|
-
"ul",
|
|
79
|
-
"ol",
|
|
80
|
-
"li",
|
|
81
|
-
"dl",
|
|
82
|
-
"dt",
|
|
83
|
-
"dd",
|
|
84
|
-
"strong",
|
|
85
|
-
"b",
|
|
86
|
-
"em",
|
|
87
|
-
"i",
|
|
88
|
-
"u",
|
|
89
|
-
"s",
|
|
90
|
-
"del",
|
|
91
|
-
"ins",
|
|
92
|
-
"mark",
|
|
93
|
-
"small",
|
|
94
|
-
"sub",
|
|
95
|
-
"sup",
|
|
96
|
-
"code",
|
|
97
|
-
"pre",
|
|
98
|
-
"kbd",
|
|
99
|
-
"samp",
|
|
100
|
-
"var",
|
|
101
|
-
"abbr",
|
|
102
|
-
"cite",
|
|
103
|
-
"dfn",
|
|
104
|
-
"time",
|
|
105
|
-
"data",
|
|
106
|
-
"a",
|
|
107
|
-
"blockquote",
|
|
108
|
-
"q",
|
|
109
|
-
}
|
|
158
|
+
allowed_tags = set(BASE_ALLOWED_TAGS)
|
|
110
159
|
|
|
111
160
|
if preserve_semantic_structure:
|
|
112
|
-
allowed_tags.update(
|
|
113
|
-
{
|
|
114
|
-
"article",
|
|
115
|
-
"section",
|
|
116
|
-
"aside",
|
|
117
|
-
"header",
|
|
118
|
-
"footer",
|
|
119
|
-
"main",
|
|
120
|
-
"nav",
|
|
121
|
-
"figure",
|
|
122
|
-
"figcaption",
|
|
123
|
-
"details",
|
|
124
|
-
"summary",
|
|
125
|
-
}
|
|
126
|
-
)
|
|
161
|
+
allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
|
|
127
162
|
|
|
128
163
|
if preserve_tables:
|
|
129
|
-
allowed_tags.update(
|
|
130
|
-
{
|
|
131
|
-
"table",
|
|
132
|
-
"thead",
|
|
133
|
-
"tbody",
|
|
134
|
-
"tfoot",
|
|
135
|
-
"tr",
|
|
136
|
-
"th",
|
|
137
|
-
"td",
|
|
138
|
-
"caption",
|
|
139
|
-
"col",
|
|
140
|
-
"colgroup",
|
|
141
|
-
}
|
|
142
|
-
)
|
|
164
|
+
allowed_tags.update(TABLE_TAGS)
|
|
143
165
|
|
|
144
166
|
if preserve_media:
|
|
145
|
-
allowed_tags.update(
|
|
146
|
-
{
|
|
147
|
-
"img",
|
|
148
|
-
"picture",
|
|
149
|
-
"source",
|
|
150
|
-
"audio",
|
|
151
|
-
"video",
|
|
152
|
-
"track",
|
|
153
|
-
"canvas",
|
|
154
|
-
"svg",
|
|
155
|
-
"iframe",
|
|
156
|
-
}
|
|
157
|
-
)
|
|
167
|
+
allowed_tags.update(MEDIA_TAGS)
|
|
158
168
|
|
|
159
169
|
allowed_tags -= custom_tags_to_remove
|
|
160
170
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -11,13 +11,13 @@ from io import StringIO
|
|
|
11
11
|
from itertools import chain
|
|
12
12
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
13
13
|
|
|
14
|
-
from bs4 import BeautifulSoup, Comment, Doctype, Tag
|
|
14
|
+
from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
|
|
15
15
|
from bs4.element import NavigableString, PageElement
|
|
16
16
|
|
|
17
17
|
try:
|
|
18
18
|
from html_to_markdown.preprocessor import create_preprocessor
|
|
19
19
|
from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
|
|
20
|
-
except ImportError:
|
|
20
|
+
except ImportError: # pragma: no cover
|
|
21
21
|
create_preprocessor = None # type: ignore[assignment]
|
|
22
22
|
preprocess_fn = None # type: ignore[assignment]
|
|
23
23
|
|
|
@@ -25,7 +25,7 @@ try:
|
|
|
25
25
|
import importlib.util
|
|
26
26
|
|
|
27
27
|
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
|
|
28
|
-
except ImportError:
|
|
28
|
+
except ImportError: # pragma: no cover
|
|
29
29
|
LXML_AVAILABLE = False
|
|
30
30
|
|
|
31
31
|
from html_to_markdown.constants import (
|
|
@@ -179,6 +179,7 @@ def _process_tag(
|
|
|
179
179
|
strip: set[str] | None,
|
|
180
180
|
whitespace_handler: WhitespaceHandler,
|
|
181
181
|
context_before: str = "",
|
|
182
|
+
ancestor_names: set[str] | None = None,
|
|
182
183
|
) -> str:
|
|
183
184
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
184
185
|
tag_name: SupportedTag | None = (
|
|
@@ -186,6 +187,17 @@ def _process_tag(
|
|
|
186
187
|
)
|
|
187
188
|
text_parts: list[str] = []
|
|
188
189
|
|
|
190
|
+
if ancestor_names is None:
|
|
191
|
+
ancestor_names = set()
|
|
192
|
+
current: Tag | None = tag
|
|
193
|
+
while current and hasattr(current, "name"):
|
|
194
|
+
if current.name:
|
|
195
|
+
ancestor_names.add(current.name)
|
|
196
|
+
current = getattr(current, "parent", None)
|
|
197
|
+
|
|
198
|
+
if len(ancestor_names) > 10:
|
|
199
|
+
break
|
|
200
|
+
|
|
189
201
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
190
202
|
is_cell = tag_name in {"td", "th"}
|
|
191
203
|
convert_children_as_inline = convert_as_inline or is_heading or is_cell
|
|
@@ -201,7 +213,7 @@ def _process_tag(
|
|
|
201
213
|
if can_extract and isinstance(el, NavigableString) and not el.strip():
|
|
202
214
|
el.extract()
|
|
203
215
|
|
|
204
|
-
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
|
|
216
|
+
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), tag.children))
|
|
205
217
|
|
|
206
218
|
empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
|
|
207
219
|
|
|
@@ -227,6 +239,7 @@ def _process_tag(
|
|
|
227
239
|
escape_asterisks=escape_asterisks,
|
|
228
240
|
escape_underscores=escape_underscores,
|
|
229
241
|
whitespace_handler=whitespace_handler,
|
|
242
|
+
ancestor_names=ancestor_names,
|
|
230
243
|
)
|
|
231
244
|
)
|
|
232
245
|
elif isinstance(el, Tag):
|
|
@@ -243,6 +256,7 @@ def _process_tag(
|
|
|
243
256
|
strip=strip,
|
|
244
257
|
whitespace_handler=whitespace_handler,
|
|
245
258
|
context_before=(context_before + current_text)[-2:],
|
|
259
|
+
ancestor_names=ancestor_names,
|
|
246
260
|
)
|
|
247
261
|
)
|
|
248
262
|
|
|
@@ -282,21 +296,23 @@ def _process_text(
|
|
|
282
296
|
escape_asterisks: bool,
|
|
283
297
|
escape_underscores: bool,
|
|
284
298
|
whitespace_handler: WhitespaceHandler,
|
|
299
|
+
ancestor_names: set[str] | None = None,
|
|
285
300
|
) -> str:
|
|
286
301
|
text = str(el) or ""
|
|
287
302
|
|
|
288
303
|
parent = el.parent
|
|
289
304
|
parent_name = parent.name if parent else None
|
|
290
305
|
|
|
291
|
-
ancestor_names
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
306
|
+
if ancestor_names is None:
|
|
307
|
+
ancestor_names = set()
|
|
308
|
+
current = parent
|
|
309
|
+
while current and hasattr(current, "name"):
|
|
310
|
+
if current.name:
|
|
311
|
+
ancestor_names.add(current.name)
|
|
312
|
+
current = getattr(current, "parent", None)
|
|
297
313
|
|
|
298
|
-
|
|
299
|
-
|
|
314
|
+
if len(ancestor_names) > 10:
|
|
315
|
+
break
|
|
300
316
|
|
|
301
317
|
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
302
318
|
|
|
@@ -322,7 +338,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
322
338
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
323
339
|
elem_id = id(element)
|
|
324
340
|
cache = _ancestor_cache.get()
|
|
325
|
-
if cache is None:
|
|
341
|
+
if cache is None: # pragma: no cover
|
|
326
342
|
cache = {}
|
|
327
343
|
_ancestor_cache.set(cache)
|
|
328
344
|
|
|
@@ -338,7 +354,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
338
354
|
ancestor_names.add(current.name)
|
|
339
355
|
|
|
340
356
|
parent_id = id(current)
|
|
341
|
-
if parent_id in cache:
|
|
357
|
+
if parent_id in cache: # pragma: no cover
|
|
342
358
|
ancestor_names.update(cache[parent_id])
|
|
343
359
|
break
|
|
344
360
|
|
|
@@ -386,36 +402,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
386
402
|
metadata["base-href"] = base_tag["href"]
|
|
387
403
|
|
|
388
404
|
for meta in soup.find_all("meta"):
|
|
389
|
-
if meta.get("name") and meta.get("content") is not None:
|
|
390
|
-
name = meta["name"]
|
|
391
|
-
content = meta["content"]
|
|
405
|
+
if (name := meta.get("name")) and (content := meta.get("content")) is not None:
|
|
392
406
|
if isinstance(name, str) and isinstance(content, str):
|
|
393
|
-
|
|
394
|
-
metadata[key] = content
|
|
407
|
+
metadata[f"meta-{name.lower()}"] = content
|
|
395
408
|
|
|
396
|
-
elif meta.get("property") and meta.get("content") is not None:
|
|
397
|
-
prop = meta["property"]
|
|
398
|
-
content = meta["content"]
|
|
409
|
+
elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
|
|
399
410
|
if isinstance(prop, str) and isinstance(content, str):
|
|
400
|
-
|
|
401
|
-
metadata[key] = content
|
|
411
|
+
metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
|
|
402
412
|
|
|
403
|
-
elif
|
|
404
|
-
equiv
|
|
405
|
-
content
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
413
|
+
elif (
|
|
414
|
+
(equiv := meta.get("http-equiv"))
|
|
415
|
+
and (content := meta.get("content")) is not None
|
|
416
|
+
and isinstance(equiv, str)
|
|
417
|
+
and isinstance(content, str)
|
|
418
|
+
):
|
|
419
|
+
metadata[f"meta-{equiv.lower()}"] = content
|
|
409
420
|
|
|
410
421
|
canonical = soup.find("link", rel="canonical", href=True)
|
|
411
422
|
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
412
423
|
metadata["canonical"] = canonical["href"]
|
|
413
424
|
|
|
414
425
|
link_relations = {"author", "license", "alternate"}
|
|
415
|
-
|
|
416
|
-
link
|
|
417
|
-
|
|
418
|
-
|
|
426
|
+
link_metadata = {
|
|
427
|
+
f"link-{rel_type}": link["href"]
|
|
428
|
+
for rel_type in link_relations
|
|
429
|
+
if (link := soup.find("link", rel=rel_type, href=True))
|
|
430
|
+
and isinstance(link, Tag)
|
|
431
|
+
and isinstance(link["href"], str)
|
|
432
|
+
}
|
|
433
|
+
metadata.update(link_metadata)
|
|
419
434
|
|
|
420
435
|
return metadata
|
|
421
436
|
|
|
@@ -424,11 +439,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
424
439
|
if not metadata:
|
|
425
440
|
return ""
|
|
426
441
|
|
|
427
|
-
lines = ["<!--"]
|
|
428
|
-
for key, value in sorted(metadata.items()):
|
|
429
|
-
safe_value = value.replace("-->", "--&gt;")
|
|
430
|
-
lines.append(f"{key}: {safe_value}")
|
|
431
|
-
lines.append("-->")
|
|
442
|
+
lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
|
|
432
443
|
|
|
433
444
|
return "\n".join(lines) + "\n\n"
|
|
434
445
|
|
|
@@ -442,6 +453,7 @@ def convert_to_markdown(
|
|
|
442
453
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
443
454
|
parser: str | None = None,
|
|
444
455
|
autolinks: bool = True,
|
|
456
|
+
br_in_tables: bool = False,
|
|
445
457
|
bullets: str = "*+-",
|
|
446
458
|
code_language: str = "",
|
|
447
459
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -473,7 +485,6 @@ def convert_to_markdown(
|
|
|
473
485
|
wrap_width: int = 80,
|
|
474
486
|
) -> str:
|
|
475
487
|
"""Convert HTML content to Markdown format.
|
|
476
|
-
|
|
477
488
|
This is the main entry point for converting HTML to Markdown. It supports
|
|
478
489
|
various customization options for controlling the conversion behavior.
|
|
479
490
|
|
|
@@ -485,6 +496,7 @@ def convert_to_markdown(
|
|
|
485
496
|
progress_callback: Callback for progress updates (current, total).
|
|
486
497
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
487
498
|
autolinks: Convert URLs to automatic links.
|
|
499
|
+
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
488
500
|
bullets: Characters to use for unordered list bullets.
|
|
489
501
|
code_language: Default language for code blocks.
|
|
490
502
|
code_language_callback: Callback to determine code language from element.
|
|
@@ -528,11 +540,9 @@ def convert_to_markdown(
|
|
|
528
540
|
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
529
541
|
>>> convert_to_markdown(html)
|
|
530
542
|
'Title\\n=====\\n\\nContent\\n\\n'
|
|
531
|
-
|
|
532
543
|
With custom options:
|
|
533
544
|
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
534
545
|
'# Title\\n\\nContent\\n\\n'
|
|
535
|
-
|
|
536
546
|
Discord-compatible lists (2-space indent):
|
|
537
547
|
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
538
548
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
@@ -644,7 +654,7 @@ def convert_to_markdown(
|
|
|
644
654
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
645
655
|
|
|
646
656
|
if convert_as_inline:
|
|
647
|
-
result = result.rstrip("\n")
|
|
657
|
+
result = result.rstrip("\n") # pragma: no cover
|
|
648
658
|
|
|
649
659
|
return result
|
|
650
660
|
|
|
@@ -658,6 +668,7 @@ def convert_to_markdown(
|
|
|
658
668
|
whitespace_handler=whitespace_handler,
|
|
659
669
|
parser=parser,
|
|
660
670
|
autolinks=autolinks,
|
|
671
|
+
br_in_tables=br_in_tables,
|
|
661
672
|
bullets=bullets,
|
|
662
673
|
code_language=code_language,
|
|
663
674
|
code_language_callback=code_language_callback,
|
|
@@ -819,6 +830,7 @@ def _process_html_core(
|
|
|
819
830
|
whitespace_handler: WhitespaceHandler,
|
|
820
831
|
parser: str | None = None,
|
|
821
832
|
autolinks: bool,
|
|
833
|
+
br_in_tables: bool,
|
|
822
834
|
bullets: str,
|
|
823
835
|
code_language: str,
|
|
824
836
|
code_language_callback: Callable[[Any], str] | None,
|
|
@@ -849,24 +861,25 @@ def _process_html_core(
|
|
|
849
861
|
try:
|
|
850
862
|
if isinstance(source, str):
|
|
851
863
|
if strip_newlines:
|
|
852
|
-
source = source.replace("\n", " ").replace("\r", " ")
|
|
864
|
+
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
853
865
|
|
|
854
866
|
if "".join(source.split("\n")):
|
|
855
867
|
if parser is None:
|
|
856
868
|
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
857
869
|
|
|
858
|
-
if parser == "lxml" and not LXML_AVAILABLE:
|
|
870
|
+
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
859
871
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
860
872
|
|
|
861
873
|
source = BeautifulSoup(source, parser)
|
|
862
874
|
else:
|
|
863
875
|
raise EmptyHtmlError
|
|
864
876
|
|
|
865
|
-
if strip is not None and convert is not None:
|
|
877
|
+
if strip is not None and convert is not None: # pragma: no cover
|
|
866
878
|
raise ConflictingOptionsError("strip", "convert")
|
|
867
879
|
|
|
868
880
|
converters_map = create_converters_map(
|
|
869
881
|
autolinks=autolinks,
|
|
882
|
+
br_in_tables=br_in_tables,
|
|
870
883
|
bullets=bullets,
|
|
871
884
|
code_language=code_language,
|
|
872
885
|
code_language_callback=code_language_callback,
|
|
@@ -896,7 +909,7 @@ def _process_html_core(
|
|
|
896
909
|
elements_to_process = body.children if body and isinstance(body, Tag) else source.children
|
|
897
910
|
|
|
898
911
|
context = ""
|
|
899
|
-
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
|
|
912
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), elements_to_process):
|
|
900
913
|
if isinstance(el, NavigableString):
|
|
901
914
|
text = _process_text(
|
|
902
915
|
el=el,
|
|
@@ -935,6 +948,7 @@ def convert_to_markdown_stream(
|
|
|
935
948
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
936
949
|
parser: str | None = None,
|
|
937
950
|
autolinks: bool = True,
|
|
951
|
+
br_in_tables: bool = False,
|
|
938
952
|
bullets: str = "*+-",
|
|
939
953
|
code_language: str = "",
|
|
940
954
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -976,6 +990,7 @@ def convert_to_markdown_stream(
|
|
|
976
990
|
whitespace_handler=whitespace_handler,
|
|
977
991
|
parser=parser,
|
|
978
992
|
autolinks=autolinks,
|
|
993
|
+
br_in_tables=br_in_tables,
|
|
979
994
|
bullets=bullets,
|
|
980
995
|
code_language=code_language,
|
|
981
996
|
code_language_callback=code_language_callback,
|
|
@@ -1027,7 +1042,7 @@ def convert_to_markdown_stream(
|
|
|
1027
1042
|
end_pos = search_start + newline_pos + 1
|
|
1028
1043
|
|
|
1029
1044
|
chunk = combined_result[pos:end_pos]
|
|
1030
|
-
if chunk:
|
|
1045
|
+
if chunk: # pragma: no cover
|
|
1031
1046
|
yield chunk
|
|
1032
1047
|
|
|
1033
1048
|
pos = end_pos
|
html_to_markdown/utils.py
CHANGED
|
@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
12
12
|
prefix = " " if text.startswith((" ", "\t")) else ""
|
|
13
13
|
suffix = " " if text.endswith((" ", "\t")) else ""
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
return prefix, suffix, text
|
|
15
|
+
return prefix, suffix, text.strip()
|
|
18
16
|
|
|
19
17
|
|
|
20
18
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -6,8 +6,10 @@ import re
|
|
|
6
6
|
import unicodedata
|
|
7
7
|
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
|
+
from bs4.element import NavigableString
|
|
10
|
+
|
|
9
11
|
if TYPE_CHECKING:
|
|
10
|
-
from bs4 import
|
|
12
|
+
from bs4 import PageElement
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
WhitespaceMode = Literal["normalized", "strict"]
|
|
@@ -128,11 +130,13 @@ class WhitespaceHandler:
|
|
|
128
130
|
def normalize_unicode_spaces(self, text: str) -> str:
|
|
129
131
|
text = self._unicode_spaces.sub(" ", text)
|
|
130
132
|
|
|
133
|
+
text = text.replace("\r\n", "\n")
|
|
134
|
+
|
|
131
135
|
normalized = []
|
|
132
136
|
for char in text:
|
|
133
137
|
if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
|
|
134
138
|
normalized.append(" ")
|
|
135
|
-
elif char
|
|
139
|
+
elif char == "\r": # pragma: no cover
|
|
136
140
|
normalized.append("\n")
|
|
137
141
|
else:
|
|
138
142
|
normalized.append(char)
|
|
@@ -168,15 +172,12 @@ class WhitespaceHandler:
|
|
|
168
172
|
*,
|
|
169
173
|
in_pre: bool = False,
|
|
170
174
|
) -> str:
|
|
171
|
-
if not text:
|
|
175
|
+
if not text: # pragma: no cover
|
|
172
176
|
return ""
|
|
173
177
|
|
|
174
178
|
if in_pre or self.should_preserve_whitespace(element):
|
|
175
179
|
return text
|
|
176
180
|
|
|
177
|
-
if self.mode == "strict":
|
|
178
|
-
return text
|
|
179
|
-
|
|
180
181
|
text = self.normalize_unicode_spaces(text)
|
|
181
182
|
return self._process_normalized(text, element)
|
|
182
183
|
|
|
@@ -204,8 +205,8 @@ class WhitespaceHandler:
|
|
|
204
205
|
def _process_text_with_content(self, text: str, element: NavigableString) -> str:
|
|
205
206
|
original = str(element)
|
|
206
207
|
|
|
207
|
-
has_lead_space = original and original[0] in " \t\n"
|
|
208
|
-
has_trail_space = original and original[-1] in " \t\n"
|
|
208
|
+
has_lead_space = bool(original and original[0] in " \t\n")
|
|
209
|
+
has_trail_space = bool(original and original[-1] in " \t\n")
|
|
209
210
|
|
|
210
211
|
text = self._multiple_spaces.sub(" ", text.strip())
|
|
211
212
|
|
|
@@ -215,9 +216,9 @@ class WhitespaceHandler:
|
|
|
215
216
|
return self._process_special_inline_containers(text, original)
|
|
216
217
|
|
|
217
218
|
if parent and self.is_inline_element(parent):
|
|
218
|
-
return self._process_inline_element_text(text, original,
|
|
219
|
+
return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
|
|
219
220
|
|
|
220
|
-
return self._process_standalone_text(text, original, element,
|
|
221
|
+
return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
|
|
221
222
|
|
|
222
223
|
def _process_special_inline_containers(self, text: str, original: str) -> str:
|
|
223
224
|
if original and "\n" not in original and "\t" not in original:
|
|
@@ -253,12 +254,22 @@ class WhitespaceHandler:
|
|
|
253
254
|
has_leading = (
|
|
254
255
|
has_lead_space
|
|
255
256
|
and original[0] == " "
|
|
256
|
-
and (
|
|
257
|
+
and (
|
|
258
|
+
self.is_inline_element(prev_sibling)
|
|
259
|
+
or self.is_block_element(prev_sibling)
|
|
260
|
+
or prev_sibling is None
|
|
261
|
+
or isinstance(prev_sibling, NavigableString)
|
|
262
|
+
)
|
|
257
263
|
)
|
|
258
264
|
has_trailing = (
|
|
259
265
|
has_trail_space
|
|
260
266
|
and original[-1] == " "
|
|
261
|
-
and (
|
|
267
|
+
and (
|
|
268
|
+
self.is_inline_element(next_sibling)
|
|
269
|
+
or self.is_block_element(next_sibling)
|
|
270
|
+
or next_sibling is None
|
|
271
|
+
or isinstance(next_sibling, NavigableString)
|
|
272
|
+
)
|
|
262
273
|
)
|
|
263
274
|
|
|
264
275
|
if original and original[0] in "\n\t" and self.is_inline_element(prev_sibling):
|
|
@@ -280,24 +291,3 @@ class WhitespaceHandler:
|
|
|
280
291
|
text = text + "\n\n"
|
|
281
292
|
|
|
282
293
|
return text
|
|
283
|
-
|
|
284
|
-
def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
|
|
285
|
-
if self.mode == "strict":
|
|
286
|
-
return ""
|
|
287
|
-
|
|
288
|
-
tag_name = tag.name.lower() if hasattr(tag, "name") else ""
|
|
289
|
-
|
|
290
|
-
double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
|
|
291
|
-
|
|
292
|
-
single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
|
|
293
|
-
|
|
294
|
-
if tag_name in double_newline_elements:
|
|
295
|
-
if self.is_block_element(next_sibling):
|
|
296
|
-
return "\n\n"
|
|
297
|
-
return "\n"
|
|
298
|
-
if tag_name in single_newline_elements:
|
|
299
|
-
return "\n"
|
|
300
|
-
if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
|
|
301
|
-
return "\n\n"
|
|
302
|
-
|
|
303
|
-
return ""
|