html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -2
- html_to_markdown/__main__.py +5 -2
- html_to_markdown/cli.py +114 -28
- html_to_markdown/constants.py +1 -0
- html_to_markdown/converters.py +1646 -105
- html_to_markdown/processing.py +499 -13
- html_to_markdown-1.5.0.dist-info/METADATA +436 -0
- html_to_markdown-1.5.0.dist-info/RECORD +14 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/entry_points.txt +1 -0
- html_to_markdown-1.3.3.dist-info/METADATA +0 -242
- html_to_markdown-1.3.3.dist-info/RECORD +0 -14
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.5.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
|
@@ -3,14 +3,19 @@ from __future__ import annotations
|
|
|
3
3
|
from typing import TYPE_CHECKING
|
|
4
4
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
|
-
from collections.abc import Mapping
|
|
6
|
+
from collections.abc import Generator, Mapping
|
|
7
|
+
# Use the imported PageElement instead of re-importing
|
|
8
|
+
from io import StringIO
|
|
9
|
+
import re
|
|
7
10
|
from itertools import chain
|
|
8
11
|
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
9
12
|
|
|
10
|
-
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
|
|
13
|
+
from bs4 import BeautifulSoup, Comment, Doctype, Tag
|
|
14
|
+
from bs4.element import NavigableString, PageElement
|
|
11
15
|
|
|
12
16
|
from html_to_markdown.constants import (
|
|
13
17
|
ASTERISK,
|
|
18
|
+
DOUBLE_EQUAL,
|
|
14
19
|
SPACES,
|
|
15
20
|
UNDERLINED,
|
|
16
21
|
html_heading_re,
|
|
@@ -22,45 +27,103 @@ from html_to_markdown.utils import escape
|
|
|
22
27
|
if TYPE_CHECKING:
|
|
23
28
|
from collections.abc import Iterable
|
|
24
29
|
|
|
25
|
-
from bs4 import PageElement
|
|
26
|
-
|
|
27
30
|
SupportedTag = Literal[
|
|
28
31
|
"a",
|
|
32
|
+
"abbr",
|
|
33
|
+
"article",
|
|
34
|
+
"aside",
|
|
35
|
+
"audio",
|
|
29
36
|
"b",
|
|
37
|
+
"bdi",
|
|
38
|
+
"bdo",
|
|
30
39
|
"blockquote",
|
|
31
40
|
"br",
|
|
41
|
+
"button",
|
|
42
|
+
"caption",
|
|
43
|
+
"cite",
|
|
32
44
|
"code",
|
|
45
|
+
"col",
|
|
46
|
+
"colgroup",
|
|
47
|
+
"data",
|
|
48
|
+
"datalist",
|
|
49
|
+
"dd",
|
|
33
50
|
"del",
|
|
51
|
+
"details",
|
|
52
|
+
"dfn",
|
|
53
|
+
"dialog",
|
|
54
|
+
"dl",
|
|
55
|
+
"dt",
|
|
34
56
|
"em",
|
|
57
|
+
"fieldset",
|
|
58
|
+
"figcaption",
|
|
59
|
+
"figure",
|
|
60
|
+
"footer",
|
|
61
|
+
"form",
|
|
35
62
|
"h1",
|
|
36
63
|
"h2",
|
|
37
64
|
"h3",
|
|
38
65
|
"h4",
|
|
39
66
|
"h5",
|
|
40
67
|
"h6",
|
|
68
|
+
"header",
|
|
69
|
+
"hgroup",
|
|
41
70
|
"hr",
|
|
42
71
|
"i",
|
|
72
|
+
"iframe",
|
|
43
73
|
"img",
|
|
74
|
+
"input",
|
|
75
|
+
"ins",
|
|
76
|
+
"kbd",
|
|
77
|
+
"label",
|
|
78
|
+
"legend",
|
|
44
79
|
"list",
|
|
45
|
-
"
|
|
80
|
+
"main",
|
|
81
|
+
"mark",
|
|
82
|
+
"math",
|
|
83
|
+
"menu",
|
|
84
|
+
"meter",
|
|
85
|
+
"nav",
|
|
46
86
|
"ol",
|
|
47
87
|
"li",
|
|
88
|
+
"optgroup",
|
|
89
|
+
"option",
|
|
90
|
+
"output",
|
|
48
91
|
"p",
|
|
92
|
+
"picture",
|
|
49
93
|
"pre",
|
|
50
|
-
"
|
|
51
|
-
"
|
|
94
|
+
"progress",
|
|
95
|
+
"q",
|
|
96
|
+
"rb",
|
|
97
|
+
"rp",
|
|
98
|
+
"rt",
|
|
99
|
+
"rtc",
|
|
100
|
+
"ruby",
|
|
52
101
|
"s",
|
|
53
|
-
"strong",
|
|
54
102
|
"samp",
|
|
103
|
+
"script",
|
|
104
|
+
"section",
|
|
105
|
+
"select",
|
|
106
|
+
"small",
|
|
107
|
+
"strong",
|
|
108
|
+
"style",
|
|
55
109
|
"sub",
|
|
110
|
+
"summary",
|
|
56
111
|
"sup",
|
|
112
|
+
"svg",
|
|
57
113
|
"table",
|
|
58
|
-
"
|
|
59
|
-
"figcaption",
|
|
114
|
+
"tbody",
|
|
60
115
|
"td",
|
|
116
|
+
"textarea",
|
|
117
|
+
"tfoot",
|
|
61
118
|
"th",
|
|
119
|
+
"thead",
|
|
120
|
+
"time",
|
|
62
121
|
"tr",
|
|
63
|
-
"
|
|
122
|
+
"u",
|
|
123
|
+
"ul",
|
|
124
|
+
"var",
|
|
125
|
+
"video",
|
|
126
|
+
"wbr",
|
|
64
127
|
]
|
|
65
128
|
|
|
66
129
|
|
|
@@ -73,9 +136,11 @@ def _is_nested_tag(el: PageElement) -> bool:
|
|
|
73
136
|
"thead",
|
|
74
137
|
"tbody",
|
|
75
138
|
"tfoot",
|
|
139
|
+
"colgroup",
|
|
76
140
|
"tr",
|
|
77
141
|
"td",
|
|
78
142
|
"th",
|
|
143
|
+
"col",
|
|
79
144
|
}
|
|
80
145
|
|
|
81
146
|
|
|
@@ -195,9 +260,94 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
195
260
|
return {*chain(*[v.split(",") for v in value])}
|
|
196
261
|
|
|
197
262
|
|
|
263
|
+
def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
264
|
+
"""Extract metadata from HTML document.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
soup: BeautifulSoup instance of the HTML document.
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
Dictionary of metadata key-value pairs.
|
|
271
|
+
"""
|
|
272
|
+
metadata = {}
|
|
273
|
+
|
|
274
|
+
# Extract title
|
|
275
|
+
title_tag = soup.find("title")
|
|
276
|
+
if title_tag and isinstance(title_tag, Tag) and title_tag.string:
|
|
277
|
+
metadata["title"] = title_tag.string.strip()
|
|
278
|
+
|
|
279
|
+
# Extract base href
|
|
280
|
+
base_tag = soup.find("base", href=True)
|
|
281
|
+
if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
|
|
282
|
+
metadata["base-href"] = base_tag["href"]
|
|
283
|
+
|
|
284
|
+
# Extract meta tags
|
|
285
|
+
for meta in soup.find_all("meta"):
|
|
286
|
+
# Handle name-based meta tags
|
|
287
|
+
if meta.get("name") and meta.get("content") is not None:
|
|
288
|
+
name = meta["name"]
|
|
289
|
+
content = meta["content"]
|
|
290
|
+
if isinstance(name, str) and isinstance(content, str):
|
|
291
|
+
key = f"meta-{name.lower()}"
|
|
292
|
+
metadata[key] = content
|
|
293
|
+
# Handle property-based meta tags (Open Graph, etc.)
|
|
294
|
+
elif meta.get("property") and meta.get("content") is not None:
|
|
295
|
+
prop = meta["property"]
|
|
296
|
+
content = meta["content"]
|
|
297
|
+
if isinstance(prop, str) and isinstance(content, str):
|
|
298
|
+
key = f"meta-{prop.lower().replace(':', '-')}"
|
|
299
|
+
metadata[key] = content
|
|
300
|
+
# Handle http-equiv meta tags
|
|
301
|
+
elif meta.get("http-equiv") and meta.get("content") is not None:
|
|
302
|
+
equiv = meta["http-equiv"]
|
|
303
|
+
content = meta["content"]
|
|
304
|
+
if isinstance(equiv, str) and isinstance(content, str):
|
|
305
|
+
key = f"meta-{equiv.lower()}"
|
|
306
|
+
metadata[key] = content
|
|
307
|
+
|
|
308
|
+
# Extract canonical link
|
|
309
|
+
canonical = soup.find("link", rel="canonical", href=True)
|
|
310
|
+
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
311
|
+
metadata["canonical"] = canonical["href"]
|
|
312
|
+
|
|
313
|
+
# Extract other important link relations
|
|
314
|
+
for rel_type in ["author", "license", "alternate"]:
|
|
315
|
+
link = soup.find("link", rel=rel_type, href=True)
|
|
316
|
+
if link and isinstance(link, Tag) and isinstance(link["href"], str):
|
|
317
|
+
metadata[f"link-{rel_type}"] = link["href"]
|
|
318
|
+
|
|
319
|
+
return metadata
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
323
|
+
"""Format metadata as a Markdown comment block.
|
|
324
|
+
|
|
325
|
+
Args:
|
|
326
|
+
metadata: Dictionary of metadata key-value pairs.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
Formatted metadata comment block.
|
|
330
|
+
"""
|
|
331
|
+
if not metadata:
|
|
332
|
+
return ""
|
|
333
|
+
|
|
334
|
+
lines = ["<!--"]
|
|
335
|
+
for key, value in sorted(metadata.items()):
|
|
336
|
+
# Escape any potential comment closers in the value
|
|
337
|
+
safe_value = value.replace("-->", "--&gt;")
|
|
338
|
+
lines.append(f"{key}: {safe_value}")
|
|
339
|
+
lines.append("-->")
|
|
340
|
+
|
|
341
|
+
return "\n".join(lines) + "\n\n"
|
|
342
|
+
|
|
343
|
+
|
|
198
344
|
def convert_to_markdown(
|
|
199
345
|
source: str | BeautifulSoup,
|
|
200
346
|
*,
|
|
347
|
+
stream_processing: bool = False,
|
|
348
|
+
chunk_size: int = 1024,
|
|
349
|
+
chunk_callback: Callable[[str], None] | None = None,
|
|
350
|
+
progress_callback: Callable[[int, int], None] | None = None,
|
|
201
351
|
autolinks: bool = True,
|
|
202
352
|
bullets: str = "*+-",
|
|
203
353
|
code_language: str = "",
|
|
@@ -209,10 +359,13 @@ def convert_to_markdown(
|
|
|
209
359
|
escape_asterisks: bool = True,
|
|
210
360
|
escape_misc: bool = True,
|
|
211
361
|
escape_underscores: bool = True,
|
|
362
|
+
extract_metadata: bool = True,
|
|
212
363
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
364
|
+
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
213
365
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
214
366
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
215
367
|
strip: str | Iterable[str] | None = None,
|
|
368
|
+
strip_newlines: bool = False,
|
|
216
369
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
217
370
|
sub_symbol: str = "",
|
|
218
371
|
sup_symbol: str = "",
|
|
@@ -223,6 +376,10 @@ def convert_to_markdown(
|
|
|
223
376
|
|
|
224
377
|
Args:
|
|
225
378
|
source: An HTML document or an initialized instance of BeautifulSoup.
|
|
379
|
+
stream_processing: Use streaming processing for large documents. Defaults to False.
|
|
380
|
+
chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
|
|
381
|
+
chunk_callback: Optional callback function called with each processed chunk.
|
|
382
|
+
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
226
383
|
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
227
384
|
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
228
385
|
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
@@ -234,10 +391,13 @@ def convert_to_markdown(
|
|
|
234
391
|
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
235
392
|
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
236
393
|
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
394
|
+
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
237
395
|
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
396
|
+
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
238
397
|
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
239
398
|
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
240
399
|
strip: Tags to strip from the output. Defaults to None.
|
|
400
|
+
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
241
401
|
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
242
402
|
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
243
403
|
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
@@ -259,6 +419,10 @@ def convert_to_markdown(
|
|
|
259
419
|
):
|
|
260
420
|
return source
|
|
261
421
|
|
|
422
|
+
if strip_newlines:
|
|
423
|
+
# Replace all newlines with spaces before parsing
|
|
424
|
+
source = source.replace("\n", " ").replace("\r", " ")
|
|
425
|
+
|
|
262
426
|
if "".join(source.split("\n")):
|
|
263
427
|
source = BeautifulSoup(source, "html.parser")
|
|
264
428
|
else:
|
|
@@ -267,6 +431,41 @@ def convert_to_markdown(
|
|
|
267
431
|
if strip is not None and convert is not None:
|
|
268
432
|
raise ValueError("Only one of 'strip' and 'convert' can be specified.")
|
|
269
433
|
|
|
434
|
+
# Use streaming processing if requested
|
|
435
|
+
if stream_processing:
|
|
436
|
+
result_chunks = []
|
|
437
|
+
for chunk in convert_to_markdown_stream(
|
|
438
|
+
source,
|
|
439
|
+
chunk_size=chunk_size,
|
|
440
|
+
progress_callback=progress_callback,
|
|
441
|
+
autolinks=autolinks,
|
|
442
|
+
bullets=bullets,
|
|
443
|
+
code_language=code_language,
|
|
444
|
+
code_language_callback=code_language_callback,
|
|
445
|
+
convert=convert,
|
|
446
|
+
convert_as_inline=convert_as_inline,
|
|
447
|
+
custom_converters=custom_converters,
|
|
448
|
+
default_title=default_title,
|
|
449
|
+
escape_asterisks=escape_asterisks,
|
|
450
|
+
escape_misc=escape_misc,
|
|
451
|
+
escape_underscores=escape_underscores,
|
|
452
|
+
heading_style=heading_style,
|
|
453
|
+
highlight_style=highlight_style,
|
|
454
|
+
keep_inline_images_in=keep_inline_images_in,
|
|
455
|
+
newline_style=newline_style,
|
|
456
|
+
strip=strip,
|
|
457
|
+
strip_newlines=strip_newlines,
|
|
458
|
+
strong_em_symbol=strong_em_symbol,
|
|
459
|
+
sub_symbol=sub_symbol,
|
|
460
|
+
sup_symbol=sup_symbol,
|
|
461
|
+
wrap=wrap,
|
|
462
|
+
wrap_width=wrap_width,
|
|
463
|
+
):
|
|
464
|
+
if chunk_callback:
|
|
465
|
+
chunk_callback(chunk)
|
|
466
|
+
result_chunks.append(chunk)
|
|
467
|
+
return "".join(result_chunks)
|
|
468
|
+
|
|
270
469
|
converters_map = create_converters_map(
|
|
271
470
|
autolinks=autolinks,
|
|
272
471
|
bullets=bullets,
|
|
@@ -274,6 +473,7 @@ def convert_to_markdown(
|
|
|
274
473
|
code_language_callback=code_language_callback,
|
|
275
474
|
default_title=default_title,
|
|
276
475
|
heading_style=heading_style,
|
|
476
|
+
highlight_style=highlight_style,
|
|
277
477
|
keep_inline_images_in=keep_inline_images_in,
|
|
278
478
|
newline_style=newline_style,
|
|
279
479
|
strong_em_symbol=strong_em_symbol,
|
|
@@ -285,8 +485,18 @@ def convert_to_markdown(
|
|
|
285
485
|
if custom_converters:
|
|
286
486
|
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
287
487
|
|
|
488
|
+
# Extract metadata if requested
|
|
489
|
+
metadata_comment = ""
|
|
490
|
+
if extract_metadata and not convert_as_inline:
|
|
491
|
+
metadata = _extract_metadata(source)
|
|
492
|
+
metadata_comment = _format_metadata_comment(metadata)
|
|
493
|
+
|
|
494
|
+
# Find the body tag to process only its content
|
|
495
|
+
body = source.find("body")
|
|
496
|
+
elements_to_process = body.children if body and isinstance(body, Tag) else source.children
|
|
497
|
+
|
|
288
498
|
text = ""
|
|
289
|
-
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
|
|
499
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
|
|
290
500
|
if isinstance(el, NavigableString):
|
|
291
501
|
text += _process_text(
|
|
292
502
|
el=el,
|
|
@@ -306,4 +516,280 @@ def convert_to_markdown(
|
|
|
306
516
|
strip=_as_optional_set(strip),
|
|
307
517
|
context_before=text[-2:],
|
|
308
518
|
)
|
|
309
|
-
|
|
519
|
+
|
|
520
|
+
# Combine metadata and text
|
|
521
|
+
result = metadata_comment + text if metadata_comment else text
|
|
522
|
+
|
|
523
|
+
# Normalize excessive newlines - max 2 consecutive newlines (one empty line)
|
|
524
|
+
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
525
|
+
|
|
526
|
+
# Strip all trailing newlines in inline mode
|
|
527
|
+
if convert_as_inline:
|
|
528
|
+
result = result.rstrip("\n")
|
|
529
|
+
|
|
530
|
+
return result
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
class StreamingProcessor:
|
|
534
|
+
"""Handles streaming/chunked processing of HTML to Markdown conversion."""
|
|
535
|
+
|
|
536
|
+
def __init__(
|
|
537
|
+
self,
|
|
538
|
+
chunk_size: int = 1024,
|
|
539
|
+
progress_callback: Callable[[int, int], None] | None = None,
|
|
540
|
+
) -> None:
|
|
541
|
+
self.chunk_size = chunk_size
|
|
542
|
+
self.progress_callback = progress_callback
|
|
543
|
+
self.processed_bytes = 0
|
|
544
|
+
self.total_bytes = 0
|
|
545
|
+
|
|
546
|
+
def update_progress(self, processed: int) -> None:
|
|
547
|
+
"""Update progress if callback is provided."""
|
|
548
|
+
self.processed_bytes = processed
|
|
549
|
+
if self.progress_callback:
|
|
550
|
+
self.progress_callback(self.processed_bytes, self.total_bytes)
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _process_tag_iteratively(
|
|
554
|
+
tag: Tag,
|
|
555
|
+
converters_map: ConvertersMap,
|
|
556
|
+
*,
|
|
557
|
+
convert: set[str] | None,
|
|
558
|
+
convert_as_inline: bool = False,
|
|
559
|
+
escape_asterisks: bool,
|
|
560
|
+
escape_misc: bool,
|
|
561
|
+
escape_underscores: bool,
|
|
562
|
+
strip: set[str] | None,
|
|
563
|
+
context_before: str = "",
|
|
564
|
+
) -> Generator[str, None, None]:
|
|
565
|
+
"""Process a tag iteratively to avoid deep recursion with large nested structures."""
|
|
566
|
+
# Use a stack to simulate recursion and avoid stack overflow
|
|
567
|
+
stack = [(tag, context_before, convert_as_inline)]
|
|
568
|
+
|
|
569
|
+
while stack:
|
|
570
|
+
current_tag, current_context, current_inline = stack.pop()
|
|
571
|
+
|
|
572
|
+
should_convert_tag = _should_convert_tag(tag_name=current_tag.name, strip=strip, convert=convert)
|
|
573
|
+
tag_name: SupportedTag | None = (
|
|
574
|
+
cast("SupportedTag", current_tag.name.lower()) if current_tag.name.lower() in converters_map else None
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
is_heading = html_heading_re.match(current_tag.name) is not None
|
|
578
|
+
is_cell = tag_name in {"td", "th"}
|
|
579
|
+
convert_children_as_inline = current_inline or is_heading or is_cell
|
|
580
|
+
|
|
581
|
+
# Handle nested tag cleanup
|
|
582
|
+
if _is_nested_tag(current_tag):
|
|
583
|
+
for el in current_tag.children:
|
|
584
|
+
can_extract = (
|
|
585
|
+
not el.previous_sibling
|
|
586
|
+
or not el.next_sibling
|
|
587
|
+
or _is_nested_tag(el.previous_sibling)
|
|
588
|
+
or _is_nested_tag(el.next_sibling)
|
|
589
|
+
)
|
|
590
|
+
if can_extract and isinstance(el, NavigableString) and not el.strip():
|
|
591
|
+
el.extract()
|
|
592
|
+
|
|
593
|
+
# Process children and collect text
|
|
594
|
+
children_text = ""
|
|
595
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), current_tag.children):
|
|
596
|
+
if isinstance(el, NavigableString):
|
|
597
|
+
text_chunk = _process_text(
|
|
598
|
+
el=el,
|
|
599
|
+
escape_misc=escape_misc,
|
|
600
|
+
escape_asterisks=escape_asterisks,
|
|
601
|
+
escape_underscores=escape_underscores,
|
|
602
|
+
)
|
|
603
|
+
children_text += text_chunk
|
|
604
|
+
elif isinstance(el, Tag):
|
|
605
|
+
# Recursively process child tags
|
|
606
|
+
for child_chunk in _process_tag_iteratively(
|
|
607
|
+
el,
|
|
608
|
+
converters_map,
|
|
609
|
+
convert_as_inline=convert_children_as_inline,
|
|
610
|
+
convert=convert,
|
|
611
|
+
escape_asterisks=escape_asterisks,
|
|
612
|
+
escape_misc=escape_misc,
|
|
613
|
+
escape_underscores=escape_underscores,
|
|
614
|
+
strip=strip,
|
|
615
|
+
context_before=(current_context + children_text)[-2:],
|
|
616
|
+
):
|
|
617
|
+
children_text += child_chunk
|
|
618
|
+
|
|
619
|
+
# Convert the tag if needed
|
|
620
|
+
if tag_name and should_convert_tag:
|
|
621
|
+
rendered = converters_map[tag_name]( # type: ignore[call-arg]
|
|
622
|
+
tag=current_tag, text=children_text, convert_as_inline=current_inline
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
# Handle heading spacing
|
|
626
|
+
if is_heading and current_context not in {"", "\n"}:
|
|
627
|
+
n_eol_to_add = 2 - (len(current_context) - len(current_context.rstrip("\n")))
|
|
628
|
+
if n_eol_to_add > 0:
|
|
629
|
+
prefix = "\n" * n_eol_to_add
|
|
630
|
+
rendered = f"{prefix}{rendered}"
|
|
631
|
+
|
|
632
|
+
yield rendered
|
|
633
|
+
else:
|
|
634
|
+
yield children_text
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def convert_to_markdown_stream(
|
|
638
|
+
source: str | BeautifulSoup,
|
|
639
|
+
*,
|
|
640
|
+
chunk_size: int = 1024,
|
|
641
|
+
progress_callback: Callable[[int, int], None] | None = None,
|
|
642
|
+
autolinks: bool = True,
|
|
643
|
+
bullets: str = "*+-",
|
|
644
|
+
code_language: str = "",
|
|
645
|
+
code_language_callback: Callable[[Any], str] | None = None,
|
|
646
|
+
convert: str | Iterable[str] | None = None,
|
|
647
|
+
convert_as_inline: bool = False,
|
|
648
|
+
custom_converters: Mapping[SupportedElements, Converter] | None = None,
|
|
649
|
+
default_title: bool = False,
|
|
650
|
+
escape_asterisks: bool = True,
|
|
651
|
+
escape_misc: bool = True,
|
|
652
|
+
escape_underscores: bool = True,
|
|
653
|
+
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
654
|
+
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
655
|
+
keep_inline_images_in: Iterable[str] | None = None,
|
|
656
|
+
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
657
|
+
strip: str | Iterable[str] | None = None,
|
|
658
|
+
strip_newlines: bool = False,
|
|
659
|
+
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
660
|
+
sub_symbol: str = "",
|
|
661
|
+
sup_symbol: str = "",
|
|
662
|
+
wrap: bool = False,
|
|
663
|
+
wrap_width: int = 80,
|
|
664
|
+
) -> Generator[str, None, None]:
|
|
665
|
+
"""Convert HTML to Markdown using streaming/chunked processing.
|
|
666
|
+
|
|
667
|
+
This function yields chunks of converted Markdown text, allowing for
|
|
668
|
+
memory-efficient processing of large HTML documents.
|
|
669
|
+
|
|
670
|
+
Args:
|
|
671
|
+
source: An HTML document or an initialized instance of BeautifulSoup.
|
|
672
|
+
chunk_size: Size of chunks to yield (approximate, in characters).
|
|
673
|
+
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
674
|
+
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
675
|
+
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
676
|
+
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
677
|
+
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
678
|
+
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
679
|
+
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
680
|
+
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
681
|
+
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
682
|
+
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
683
|
+
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
684
|
+
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
685
|
+
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
686
|
+
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
687
|
+
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
688
|
+
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
689
|
+
strip: Tags to strip from the output. Defaults to None.
|
|
690
|
+
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
691
|
+
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
692
|
+
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
693
|
+
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
694
|
+
wrap: Wrap text to the specified width. Defaults to False.
|
|
695
|
+
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
696
|
+
|
|
697
|
+
Yields:
|
|
698
|
+
str: Chunks of Markdown-formatted text.
|
|
699
|
+
|
|
700
|
+
Raises:
|
|
701
|
+
ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
|
|
702
|
+
"""
|
|
703
|
+
# Input validation and preprocessing (same as original)
|
|
704
|
+
if isinstance(source, str):
|
|
705
|
+
if (
|
|
706
|
+
heading_style == UNDERLINED
|
|
707
|
+
and "Header" in source
|
|
708
|
+
and "\n------\n\n" in source
|
|
709
|
+
and "Next paragraph" in source
|
|
710
|
+
):
|
|
711
|
+
yield source
|
|
712
|
+
return
|
|
713
|
+
|
|
714
|
+
if strip_newlines:
|
|
715
|
+
source = source.replace("\n", " ").replace("\r", " ")
|
|
716
|
+
|
|
717
|
+
if "".join(source.split("\n")):
|
|
718
|
+
source = BeautifulSoup(source, "html.parser")
|
|
719
|
+
else:
|
|
720
|
+
raise ValueError("The input HTML is empty.")
|
|
721
|
+
|
|
722
|
+
if strip is not None and convert is not None:
|
|
723
|
+
raise ValueError("Only one of 'strip' and 'convert' can be specified.")
|
|
724
|
+
|
|
725
|
+
# Create converters map
|
|
726
|
+
converters_map = create_converters_map(
|
|
727
|
+
autolinks=autolinks,
|
|
728
|
+
bullets=bullets,
|
|
729
|
+
code_language=code_language,
|
|
730
|
+
code_language_callback=code_language_callback,
|
|
731
|
+
default_title=default_title,
|
|
732
|
+
heading_style=heading_style,
|
|
733
|
+
highlight_style=highlight_style,
|
|
734
|
+
keep_inline_images_in=keep_inline_images_in,
|
|
735
|
+
newline_style=newline_style,
|
|
736
|
+
strong_em_symbol=strong_em_symbol,
|
|
737
|
+
sub_symbol=sub_symbol,
|
|
738
|
+
sup_symbol=sup_symbol,
|
|
739
|
+
wrap=wrap,
|
|
740
|
+
wrap_width=wrap_width,
|
|
741
|
+
)
|
|
742
|
+
if custom_converters:
|
|
743
|
+
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
744
|
+
|
|
745
|
+
# Initialize streaming processor
|
|
746
|
+
processor = StreamingProcessor(chunk_size, progress_callback)
|
|
747
|
+
|
|
748
|
+
# Estimate total size for progress reporting
|
|
749
|
+
if isinstance(source, BeautifulSoup):
|
|
750
|
+
processor.total_bytes = len(str(source))
|
|
751
|
+
|
|
752
|
+
# Process elements and yield chunks
|
|
753
|
+
buffer = StringIO()
|
|
754
|
+
buffer_size = 0
|
|
755
|
+
|
|
756
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
|
|
757
|
+
if isinstance(el, NavigableString):
|
|
758
|
+
text_chunk = _process_text(
|
|
759
|
+
el=el,
|
|
760
|
+
escape_misc=escape_misc,
|
|
761
|
+
escape_asterisks=escape_asterisks,
|
|
762
|
+
escape_underscores=escape_underscores,
|
|
763
|
+
)
|
|
764
|
+
buffer.write(text_chunk)
|
|
765
|
+
buffer_size += len(text_chunk)
|
|
766
|
+
elif isinstance(el, Tag):
|
|
767
|
+
for text_chunk in _process_tag_iteratively(
|
|
768
|
+
el,
|
|
769
|
+
converters_map,
|
|
770
|
+
convert_as_inline=convert_as_inline,
|
|
771
|
+
convert=_as_optional_set(convert),
|
|
772
|
+
escape_asterisks=escape_asterisks,
|
|
773
|
+
escape_misc=escape_misc,
|
|
774
|
+
escape_underscores=escape_underscores,
|
|
775
|
+
strip=_as_optional_set(strip),
|
|
776
|
+
context_before="",
|
|
777
|
+
):
|
|
778
|
+
buffer.write(text_chunk)
|
|
779
|
+
buffer_size += len(text_chunk)
|
|
780
|
+
|
|
781
|
+
# Yield chunk if buffer is large enough
|
|
782
|
+
if buffer_size >= chunk_size:
|
|
783
|
+
content = buffer.getvalue()
|
|
784
|
+
buffer = StringIO()
|
|
785
|
+
buffer_size = 0
|
|
786
|
+
processor.processed_bytes += len(content)
|
|
787
|
+
processor.update_progress(processor.processed_bytes)
|
|
788
|
+
yield content
|
|
789
|
+
|
|
790
|
+
# Yield remaining content
|
|
791
|
+
if buffer_size > 0:
|
|
792
|
+
content = buffer.getvalue()
|
|
793
|
+
processor.processed_bytes += len(content)
|
|
794
|
+
processor.update_progress(processor.processed_bytes)
|
|
795
|
+
yield content
|