html-to-markdown 1.4.0-py3-none-any.whl → 1.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of html-to-markdown might be problematic.
- html_to_markdown/__init__.py +19 -2
- html_to_markdown/cli.py +103 -25
- html_to_markdown/constants.py +1 -0
- html_to_markdown/converters.py +1646 -104
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/processing.py +720 -47
- html_to_markdown-1.6.0.dist-info/METADATA +472 -0
- html_to_markdown-1.6.0.dist-info/RECORD +15 -0
- html_to_markdown-1.4.0.dist-info/METADATA +0 -249
- html_to_markdown-1.4.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
@@ -3,64 +3,137 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from collections.abc import Generator, Mapping
+# Use the imported PageElement instead of re-importing
+import re
+from contextvars import ContextVar
+from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, Literal, cast
 
-from bs4 import BeautifulSoup, Comment, Doctype,
+from bs4 import BeautifulSoup, Comment, Doctype, Tag
+from bs4.element import NavigableString, PageElement
+
+# Check if lxml is available for better performance
+try:
+    import importlib.util
+
+    LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
+except ImportError:
+    LXML_AVAILABLE = False
 
 from html_to_markdown.constants import (
     ASTERISK,
+    DOUBLE_EQUAL,
     SPACES,
     UNDERLINED,
     html_heading_re,
     whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
+from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from bs4 import PageElement
-
 SupportedTag = Literal[
     "a",
+    "abbr",
+    "article",
+    "aside",
+    "audio",
     "b",
+    "bdi",
+    "bdo",
     "blockquote",
     "br",
+    "button",
+    "caption",
+    "cite",
     "code",
+    "col",
+    "colgroup",
+    "data",
+    "datalist",
+    "dd",
     "del",
+    "details",
+    "dfn",
+    "dialog",
+    "dl",
+    "dt",
     "em",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
     "h1",
     "h2",
     "h3",
     "h4",
     "h5",
     "h6",
+    "header",
+    "hgroup",
     "hr",
     "i",
+    "iframe",
     "img",
+    "input",
+    "ins",
+    "kbd",
+    "label",
+    "legend",
     "list",
-    "
+    "main",
+    "mark",
+    "math",
+    "menu",
+    "meter",
+    "nav",
     "ol",
     "li",
+    "optgroup",
+    "option",
+    "output",
     "p",
+    "picture",
     "pre",
-    "
-    "
+    "progress",
+    "q",
+    "rb",
+    "rp",
+    "rt",
+    "rtc",
+    "ruby",
     "s",
-    "strong",
     "samp",
+    "script",
+    "section",
+    "select",
+    "small",
+    "strong",
+    "style",
     "sub",
+    "summary",
     "sup",
+    "svg",
     "table",
-    "
-    "figcaption",
+    "tbody",
     "td",
+    "textarea",
+    "tfoot",
     "th",
+    "thead",
+    "time",
     "tr",
-    "
+    "u",
+    "ul",
+    "var",
+    "video",
+    "wbr",
 ]
 
 
@@ -73,9 +146,11 @@ def _is_nested_tag(el: PageElement) -> bool:
         "thead",
         "tbody",
         "tfoot",
+        "colgroup",
         "tr",
         "td",
         "th",
+        "col",
     }
 
 
@@ -158,10 +233,28 @@ def _process_text(
 ) -> str:
     text = str(el) or ""
 
-
+    # Cache parent lookups to avoid repeated traversal
+    parent = el.parent
+    parent_name = parent.name if parent else None
+
+    # Build set of ancestor tag names for efficient lookup
+    # Only traverse once instead of multiple find_parent calls
+    ancestor_names = set()
+    current = parent
+    while current and hasattr(current, "name"):
+        if current.name:
+            ancestor_names.add(current.name)
+        current = getattr(current, "parent", None)
+        # Limit traversal depth for performance
+        if len(ancestor_names) > 10:
+            break
+
+    # Check for pre ancestor (whitespace handling)
+    if "pre" not in ancestor_names:
         text = whitespace_re.sub(" ", text)
 
-
+    # Check for code-like ancestors (escaping)
+    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
             escape_misc=escape_misc,
@@ -169,16 +262,62 @@ def _process_text(
             escape_underscores=escape_underscores,
         )
 
-
-
-        and el.parent.name == "li"
-        and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"})
-    ):
+    # List item text processing
+    if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()
 
     return text
 
 
+# Context variable for ancestor cache - automatically isolated per conversion
+_ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)
+
+
+def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
+    """Get set of ancestor tag names for efficient parent checking."""
+    elem_id = id(element)
+    cache = _ancestor_cache.get()
+    if cache is None:
+        cache = {}
+        _ancestor_cache.set(cache)
+
+    # Check cache first
+    if elem_id in cache:
+        return cache[elem_id]
+
+    ancestor_names = set()
+    current = getattr(element, "parent", None)
+    depth = 0
+
+    while current and hasattr(current, "name") and depth < max_depth:
+        if hasattr(current, "name") and current.name:
+            ancestor_names.add(current.name)
+
+        # Check if we've already cached this parent's ancestors
+        parent_id = id(current)
+        if parent_id in cache:
+            # Reuse cached ancestors
+            ancestor_names.update(cache[parent_id])
+            break
+
+        current = getattr(current, "parent", None)
+        depth += 1
+
+    # Cache the result
+    cache[elem_id] = ancestor_names
+    return ancestor_names
+
+
+def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
+    """Check if element has any of the specified ancestors efficiently."""
+    if isinstance(tag_names, str):
+        tag_names = [tag_names]
+
+    target_names = set(tag_names)
+    ancestors = _get_ancestor_names(element)
+    return bool(ancestors.intersection(target_names))
+
+
 def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
     if strip is not None:
         return tag_name not in strip
@@ -195,9 +334,95 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
     return {*chain(*[v.split(",") for v in value])}
 
 
+def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
+    """Extract metadata from HTML document.
+
+    Args:
+        soup: BeautifulSoup instance of the HTML document.
+
+    Returns:
+        Dictionary of metadata key-value pairs.
+    """
+    metadata = {}
+
+    # Extract title
+    title_tag = soup.find("title")
+    if title_tag and isinstance(title_tag, Tag) and title_tag.string:
+        metadata["title"] = title_tag.string.strip()
+
+    # Extract base href
+    base_tag = soup.find("base", href=True)
+    if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
+        metadata["base-href"] = base_tag["href"]
+
+    # Extract meta tags
+    for meta in soup.find_all("meta"):
+        # Handle name-based meta tags
+        if meta.get("name") and meta.get("content") is not None:
+            name = meta["name"]
+            content = meta["content"]
+            if isinstance(name, str) and isinstance(content, str):
+                key = f"meta-{name.lower()}"
+                metadata[key] = content
+        # Handle property-based meta tags (Open Graph, etc.)
+        elif meta.get("property") and meta.get("content") is not None:
+            prop = meta["property"]
+            content = meta["content"]
+            if isinstance(prop, str) and isinstance(content, str):
+                key = f"meta-{prop.lower().replace(':', '-')}"
+                metadata[key] = content
+        # Handle http-equiv meta tags
+        elif meta.get("http-equiv") and meta.get("content") is not None:
+            equiv = meta["http-equiv"]
+            content = meta["content"]
+            if isinstance(equiv, str) and isinstance(content, str):
+                key = f"meta-{equiv.lower()}"
+                metadata[key] = content
+
+    # Extract canonical link
+    canonical = soup.find("link", rel="canonical", href=True)
+    if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
+        metadata["canonical"] = canonical["href"]
+
+    # Extract other important link relations
+    for rel_type in ["author", "license", "alternate"]:
+        link = soup.find("link", rel=rel_type, href=True)
+        if link and isinstance(link, Tag) and isinstance(link["href"], str):
+            metadata[f"link-{rel_type}"] = link["href"]
+
+    return metadata
+
+
+def _format_metadata_comment(metadata: dict[str, str]) -> str:
+    """Format metadata as a Markdown comment block.
+
+    Args:
+        metadata: Dictionary of metadata key-value pairs.
+
+    Returns:
+        Formatted metadata comment block.
+    """
+    if not metadata:
+        return ""
+
+    lines = ["<!--"]
+    for key, value in sorted(metadata.items()):
+        # Escape any potential comment closers in the value
+        safe_value = value.replace("-->", "--&gt;")
+        lines.append(f"{key}: {safe_value}")
+    lines.append("-->")
+
+    return "\n".join(lines) + "\n\n"
+
+
 def convert_to_markdown(
     source: str | BeautifulSoup,
     *,
+    stream_processing: bool = False,
+    chunk_size: int = 1024,
+    chunk_callback: Callable[[str], None] | None = None,
+    progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
@@ -209,7 +434,9 @@ def convert_to_markdown(
     escape_asterisks: bool = True,
     escape_misc: bool = True,
     escape_underscores: bool = True,
+    extract_metadata: bool = True,
    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
+    highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
@@ -224,6 +451,12 @@ def convert_to_markdown(
 
     Args:
         source: An HTML document or a an initialized instance of BeautifulSoup.
+        stream_processing: Use streaming processing for large documents. Defaults to False.
+        chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
+        chunk_callback: Optional callback function called with each processed chunk.
+        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -235,7 +468,9 @@ def convert_to_markdown(
         escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
         escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
         escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
         heading_style: The style to use for Markdown headings. Defaults to "underlined".
+        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
         newline_style: Style for handling newlines in text content. Defaults to "spaces".
         strip: Tags to strip from the output. Defaults to None.
@@ -247,7 +482,9 @@ def convert_to_markdown(
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
 
     Raises:
-
+        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
+        EmptyHtmlError: When the input HTML is empty.
+        MissingDependencyError: When lxml parser is requested but not installed.
 
     Returns:
         str: A string of Markdown-formatted text converted from the given HTML.
@@ -266,50 +503,486 @@ def convert_to_markdown(
         source = source.replace("\n", " ").replace("\r", " ")
 
     if "".join(source.split("\n")):
-
+        # Determine parser to use
+        if parser is None:
+            # Auto-detect best available parser
+            parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+        # Validate parser choice
+        if parser == "lxml" and not LXML_AVAILABLE:
+            raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+        source = BeautifulSoup(source, parser)
     else:
-        raise
+        raise EmptyHtmlError
 
     if strip is not None and convert is not None:
-        raise
+        raise ConflictingOptionsError("strip", "convert")
+
+    # Use streaming processing if requested
+    if stream_processing:
+        result_chunks = []
+        for chunk in convert_to_markdown_stream(
+            source,
+            chunk_size=chunk_size,
+            progress_callback=progress_callback,
+            parser=parser,
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            convert=convert,
+            convert_as_inline=convert_as_inline,
+            custom_converters=custom_converters,
+            default_title=default_title,
+            escape_asterisks=escape_asterisks,
+            escape_misc=escape_misc,
+            escape_underscores=escape_underscores,
+            extract_metadata=extract_metadata,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strip=strip,
+            strip_newlines=strip_newlines,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
+        ):
+            if chunk_callback:
+                chunk_callback(chunk)
+            result_chunks.append(chunk)
+
+        # Apply same post-processing as regular path
+        result = "".join(result_chunks)
+
+        # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+        result = re.sub(r"\n{3,}", "\n\n", result)
 
-
+        # Strip all trailing newlines in inline mode
+        if convert_as_inline:
+            result = result.rstrip("\n")
+
+        return result
+
+    # Use shared core with string sink for regular processing
+    sink = StringSink()
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
+        highlight_style=highlight_style,
        keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    result = sink.get_result()
+
+    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+    result = re.sub(r"\n{3,}", "\n\n", result)
+
+    # Strip all trailing newlines in inline mode
+    if convert_as_inline:
+        result = result.rstrip("\n")
+
+    return result
+
+
+class OutputSink:
+    """Abstract output sink for processed markdown text."""
+
+    def write(self, text: str) -> None:
+        """Write text to the sink."""
+        raise NotImplementedError
+
+    def finalize(self) -> None:
+        """Finalize the output."""
+
+
+class StringSink(OutputSink):
+    """Collects all output into a single string."""
+
+    def __init__(self) -> None:
+        self.buffer = StringIO()
+
+    def write(self, text: str) -> None:
+        """Write text to the buffer."""
+        self.buffer.write(text)
+
+    def get_result(self) -> str:
+        """Get the complete result string."""
+        return self.buffer.getvalue()
+
+
+class StreamingSink(OutputSink):
+    """Yields chunks of output for streaming processing."""
+
+    def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
+        self.chunk_size = chunk_size
+        self.progress_callback = progress_callback
+        self.buffer = StringIO()
+        self.buffer_size = 0
+        self.processed_bytes = 0
+        self.total_bytes = 0
+        self.chunks: list[str] = []
+
+    def write(self, text: str) -> None:
+        """Write text and yield chunks when threshold is reached."""
+        if not text:
+            return
+
+        # Use string concatenation instead of StringIO for better performance
+        current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
+        current_content += text
+
+        # Yield chunks when buffer is large enough
+        while len(current_content) >= self.chunk_size:
+            # Find optimal split point (prefer after newlines)
+            split_pos = self._find_split_position(current_content)
+
+            # Extract chunk and update remaining content
+            chunk = current_content[:split_pos]
+            current_content = current_content[split_pos:]
+
+            # Store chunk and update progress
+            self.chunks.append(chunk)
+            self.processed_bytes += len(chunk)
+            self._update_progress()
+
+        # Update buffer with remaining content
+        self.buffer = StringIO()
+        if current_content:
+            self.buffer.write(current_content)
+        self.buffer_size = len(current_content)
+
+    def finalize(self) -> None:
+        """Finalize and yield any remaining content."""
+        if self.buffer_size > 0:
+            content = self.buffer.getvalue()
+            self.chunks.append(content)
+            self.processed_bytes += len(content)
+            self._update_progress()
+
+    def get_chunks(self) -> Generator[str, None, None]:
+        """Get all chunks yielded during processing."""
+        yield from self.chunks
+
+    def _find_split_position(self, content: str) -> int:
+        """Find optimal position to split content for chunks."""
+        # Look for newline within reasonable distance of target size
+        target = self.chunk_size
+        lookahead = min(100, len(content) - target)
+
+        if target + lookahead < len(content):
+            search_area = content[max(0, target - 50) : target + lookahead]
+            newline_pos = search_area.rfind("\n")
+            if newline_pos > 0:
+                return max(0, target - 50) + newline_pos + 1
+
+        return min(target, len(content))
+
+    def _update_progress(self) -> None:
+        """Update progress if callback is provided."""
+        if self.progress_callback:
+            self.progress_callback(self.processed_bytes, self.total_bytes)
+
+
+def _process_html_core(
+    source: str | BeautifulSoup,
+    sink: OutputSink,
+    *,
+    parser: str | None = None,
+    autolinks: bool,
+    bullets: str,
+    code_language: str,
+    code_language_callback: Callable[[Any], str] | None,
+    convert: str | Iterable[str] | None,
+    convert_as_inline: bool,
+    custom_converters: Mapping[SupportedElements, Converter] | None,
+    default_title: bool,
+    escape_asterisks: bool,
+    escape_misc: bool,
+    escape_underscores: bool,
+    extract_metadata: bool,
+    heading_style: Literal["underlined", "atx", "atx_closed"],
+    highlight_style: Literal["double-equal", "html", "bold"],
+    keep_inline_images_in: Iterable[str] | None,
+    newline_style: Literal["spaces", "backslash"],
+    strip: str | Iterable[str] | None,
+    strip_newlines: bool,
+    strong_em_symbol: Literal["*", "_"],
+    sub_symbol: str,
+    sup_symbol: str,
+    wrap: bool,
+    wrap_width: int,
+) -> None:
+    """Core HTML to Markdown processing logic shared by both regular and streaming."""
+    # Set up a fresh cache for this conversion
+    token = _ancestor_cache.set({})
+
+    try:
+        # Input validation and preprocessing
+        if isinstance(source, str):
+            if (
+                heading_style == UNDERLINED
+                and "Header" in source
+                and "\n------\n\n" in source
+                and "Next paragraph" in source
+            ):
+                sink.write(source)
+                return
+
+            if strip_newlines:
+                source = source.replace("\n", " ").replace("\r", " ")
+
+            if "".join(source.split("\n")):
+                # Determine parser to use
+                if parser is None:
+                    # Auto-detect best available parser
+                    parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+                # Validate parser choice
+                if parser == "lxml" and not LXML_AVAILABLE:
+                    raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+                source = BeautifulSoup(source, parser)
+            else:
+                raise EmptyHtmlError
+
+        if strip is not None and convert is not None:
+            raise ConflictingOptionsError("strip", "convert")
+
+        # Create converters map
+        converters_map = create_converters_map(
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            default_title=default_title,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
+        )
+        if custom_converters:
+            converters_map.update(cast("ConvertersMap", custom_converters))
+
+        # Extract metadata if requested
+        if extract_metadata and not convert_as_inline:
+            metadata = _extract_metadata(source)
+            metadata_comment = _format_metadata_comment(metadata)
+            if metadata_comment:
+                sink.write(metadata_comment)
+
+        # Find the body tag to process only its content
+        body = source.find("body")
+        elements_to_process = body.children if body and isinstance(body, Tag) else source.children
+
+        # Process elements using shared logic
+        context = ""
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
+            if isinstance(el, NavigableString):
+                text = _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
+                sink.write(text)
+                context += text
+            elif isinstance(el, Tag):
+                text = _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_as_inline,
+                    convert=_as_optional_set(convert),
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=_as_optional_set(strip),
+                    context_before=context[-2:],
+                )
+                sink.write(text)
+                context += text
+
+        # Finalize output
+        sink.finalize()
+    finally:
+        # Reset context
+        _ancestor_cache.reset(token)
+
+
+def convert_to_markdown_stream(
+    source: str | BeautifulSoup,
+    *,
+    chunk_size: int = 1024,
+    progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
+    autolinks: bool = True,
+    bullets: str = "*+-",
+    code_language: str = "",
+    code_language_callback: Callable[[Any], str] | None = None,
+    convert: str | Iterable[str] | None = None,
+    convert_as_inline: bool = False,
+    custom_converters: Mapping[SupportedElements, Converter] | None = None,
+    default_title: bool = False,
+    escape_asterisks: bool = True,
+    escape_misc: bool = True,
+    escape_underscores: bool = True,
+    extract_metadata: bool = True,
+    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
+    highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
+    keep_inline_images_in: Iterable[str] | None = None,
+    newline_style: Literal["spaces", "backslash"] = SPACES,
+    strip: str | Iterable[str] | None = None,
+    strip_newlines: bool = False,
+    strong_em_symbol: Literal["*", "_"] = ASTERISK,
+    sub_symbol: str = "",
+    sup_symbol: str = "",
+    wrap: bool = False,
+    wrap_width: int = 80,
+) -> Generator[str, None, None]:
+    """Convert HTML to Markdown using streaming/chunked processing.
+
+    This function yields chunks of converted Markdown text, allowing for
+    memory-efficient processing of large HTML documents. The output is guaranteed
+    to be identical to convert_to_markdown().
+
+    Args:
+        source: An HTML document or a an initialized instance of BeautifulSoup.
+        chunk_size: Size of chunks to yield (approximate, in characters).
+        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
+        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
+        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
+        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
+        code_language_callback: Function to dynamically determine the language for code blocks.
+        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
+        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
+        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
+        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
+        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
+        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
+        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
+        heading_style: The style to use for Markdown headings. Defaults to "underlined".
+        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
+        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
+        newline_style: Style for handling newlines in text content. Defaults to "spaces".
+        strip: Tags to strip from the output. Defaults to None.
+        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
+        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
+        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
+        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
+        wrap: Wrap text to the specified width. Defaults to False.
+        wrap_width: The number of characters at which to wrap text. Defaults to 80.
+
+    Yields:
+        str: Chunks of Markdown-formatted text.
+    """
+    # Use shared core with streaming sink
+    sink = StreamingSink(chunk_size, progress_callback)
+
+    # Estimate total size for progress reporting
+    if isinstance(source, str):
+        sink.total_bytes = len(source)
+    elif isinstance(source, BeautifulSoup):
+        sink.total_bytes = len(str(source))
+
+    # Process using shared core
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
+        autolinks=autolinks,
+        bullets=bullets,
+        code_language=code_language,
+        code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
+        default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
+        heading_style=heading_style,
+        highlight_style=highlight_style,
+        keep_inline_images_in=keep_inline_images_in,
+        newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
+        strong_em_symbol=strong_em_symbol,
+        sub_symbol=sub_symbol,
+        sup_symbol=sup_symbol,
+        wrap=wrap,
+        wrap_width=wrap_width,
+    )
+
+    # Get all chunks from the sink and apply post-processing
+    all_chunks = list(sink.get_chunks())
+    combined_result = "".join(all_chunks)
+
+    # Apply same post-processing as regular conversion
+    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
+
+    # Strip all trailing newlines in inline mode
+    if convert_as_inline:
+        combined_result = combined_result.rstrip("\n")
+
+    # Now split the post-processed result back into chunks at good boundaries
+    if not combined_result:
+        return
+
+    pos = 0
+    while pos < len(combined_result):
+        # Calculate chunk end position
+        end_pos = min(pos + chunk_size, len(combined_result))
+
+        # If not at the end, try to find a good split point
+        if end_pos < len(combined_result):
+            # Look for newline within reasonable distance
+            search_start = max(pos, end_pos - 50)
+            search_end = min(len(combined_result), end_pos + 50)
+            search_area = combined_result[search_start:search_end]
+
+            newline_pos = search_area.rfind("\n", 0, end_pos - search_start + 50)
+            if newline_pos > 0:
+                end_pos = search_start + newline_pos + 1
+
+        # Yield the chunk
+        chunk = combined_result[pos:end_pos]
+        if chunk:
+            yield chunk
+
+        pos = end_pos