html-to-markdown 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +17 -1
- html_to_markdown/converters.py +14 -7
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/processing.py +391 -198
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.6.0.dist-info}/METADATA +49 -13
- html_to_markdown-1.6.0.dist-info/RECORD +15 -0
- html_to_markdown-1.5.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.6.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.6.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.6.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -1,6 +1,22 @@
|
|
|
1
|
+
from html_to_markdown.exceptions import (
|
|
2
|
+
ConflictingOptionsError,
|
|
3
|
+
EmptyHtmlError,
|
|
4
|
+
HtmlToMarkdownError,
|
|
5
|
+
InvalidParserError,
|
|
6
|
+
MissingDependencyError,
|
|
7
|
+
)
|
|
1
8
|
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
2
9
|
|
|
3
10
|
# For backward compatibility and to maintain the existing API
|
|
4
11
|
markdownify = convert_to_markdown
|
|
5
12
|
|
|
6
|
-
__all__ = [
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ConflictingOptionsError",
|
|
15
|
+
"EmptyHtmlError",
|
|
16
|
+
"HtmlToMarkdownError",
|
|
17
|
+
"InvalidParserError",
|
|
18
|
+
"MissingDependencyError",
|
|
19
|
+
"convert_to_markdown",
|
|
20
|
+
"convert_to_markdown_stream",
|
|
21
|
+
"markdownify",
|
|
22
|
+
]
|
html_to_markdown/converters.py
CHANGED
|
@@ -137,7 +137,10 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
139
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
140
|
-
if
|
|
140
|
+
# Check if we're in a code context - if so, don't apply markup
|
|
141
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
142
|
+
|
|
143
|
+
if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
|
|
141
144
|
return text
|
|
142
145
|
|
|
143
146
|
if not text.strip():
|
|
@@ -200,7 +203,9 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
200
203
|
|
|
201
204
|
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
202
205
|
# Convert br to line break, but handle headings specially
|
|
203
|
-
|
|
206
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
207
|
+
|
|
208
|
+
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
204
209
|
return " " # Convert to space in headings
|
|
205
210
|
|
|
206
211
|
# Always convert br to line break in other contexts
|
|
@@ -676,7 +681,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
676
681
|
return f'"{escaped_text}"'
|
|
677
682
|
|
|
678
683
|
|
|
679
|
-
def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
684
|
+
def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
680
685
|
"""Convert HTML audio element preserving structure with fallback.
|
|
681
686
|
|
|
682
687
|
Args:
|
|
@@ -732,7 +737,7 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
732
737
|
return "<audio />\n\n"
|
|
733
738
|
|
|
734
739
|
|
|
735
|
-
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
740
|
+
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
736
741
|
"""Convert HTML video element preserving structure with fallback.
|
|
737
742
|
|
|
738
743
|
Args:
|
|
@@ -797,7 +802,7 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
797
802
|
return "<video />\n\n"
|
|
798
803
|
|
|
799
804
|
|
|
800
|
-
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
805
|
+
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
801
806
|
"""Convert HTML iframe element preserving structure.
|
|
802
807
|
|
|
803
808
|
Args:
|
|
@@ -1029,7 +1034,7 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1029
1034
|
return f"<label>{text.strip()}</label>\n\n"
|
|
1030
1035
|
|
|
1031
1036
|
|
|
1032
|
-
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1037
|
+
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1033
1038
|
"""Convert HTML input element preserving all relevant attributes.
|
|
1034
1039
|
|
|
1035
1040
|
Args:
|
|
@@ -1043,7 +1048,9 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str: # noq
|
|
|
1043
1048
|
|
|
1044
1049
|
# Special handling for inputs in list items - let _convert_li handle checkboxes
|
|
1045
1050
|
# and ignore other input types in list items (legacy behavior)
|
|
1046
|
-
|
|
1051
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
1052
|
+
|
|
1053
|
+
if _has_ancestor(tag, "li"):
|
|
1047
1054
|
return ""
|
|
1048
1055
|
|
|
1049
1056
|
id_attr = tag.get("id", "")
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Custom exceptions for the html-to-markdown library."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class HtmlToMarkdownError(Exception):
    """Base exception for all html-to-markdown errors."""


class MissingDependencyError(HtmlToMarkdownError):
    """Raised when an optional dependency is required but not installed.

    Attributes:
        dependency: Name of the missing dependency.
        install_command: Command that installs it, or None when not provided.
    """

    def __init__(self, dependency: str, install_command: str | None = None) -> None:
        self.dependency = dependency
        self.install_command = install_command

        message = f"{dependency} is not installed."
        if install_command:
            message += f" Install with: {install_command}"

        super().__init__(message)

    def __reduce__(self) -> tuple[object, ...]:
        # ``self.args`` holds the formatted message, not the constructor
        # arguments, so the default reduction would rebuild the error with the
        # message passed as ``dependency``. Rebuild from the real arguments.
        return (type(self), (self.dependency, self.install_command), vars(self) or None)


class InvalidParserError(HtmlToMarkdownError):
    """Raised when an invalid parser is specified.

    Attributes:
        parser: The rejected parser name.
        available_parsers: Parser names that would have been accepted.
    """

    def __init__(self, parser: str, available_parsers: list[str]) -> None:
        self.parser = parser
        self.available_parsers = available_parsers

        message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
        super().__init__(message)

    def __reduce__(self) -> tuple[object, ...]:
        # Needed for copy/pickle: the default reduction would call
        # ``__init__`` with the single formatted message and raise TypeError.
        return (type(self), (self.parser, self.available_parsers), vars(self) or None)


class EmptyHtmlError(HtmlToMarkdownError):
    """Raised when the input HTML is empty."""

    def __init__(self) -> None:
        super().__init__("The input HTML is empty.")

    def __reduce__(self) -> tuple[object, ...]:
        # ``__init__`` takes no arguments, so the message stored in
        # ``self.args`` must not be passed back on reconstruction.
        return (type(self), (), vars(self) or None)


class ConflictingOptionsError(HtmlToMarkdownError):
    """Raised when conflicting options are specified.

    Attributes:
        option1: Name of the first conflicting option.
        option2: Name of the second conflicting option.
    """

    def __init__(self, option1: str, option2: str) -> None:
        self.option1 = option1
        self.option2 = option2

        super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")

    def __reduce__(self) -> tuple[object, ...]:
        # Needed for copy/pickle: rebuild from both option names rather than
        # from the single formatted message kept in ``self.args``.
        return (type(self), (self.option1, self.option2), vars(self) or None)
|
html_to_markdown/processing.py
CHANGED
|
@@ -5,14 +5,23 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Generator, Mapping
|
|
7
7
|
# Use the imported PageElement instead of re-importing
|
|
8
|
-
from io import StringIO
|
|
9
8
|
import re
|
|
9
|
+
from contextvars import ContextVar
|
|
10
|
+
from io import StringIO
|
|
10
11
|
from itertools import chain
|
|
11
12
|
from typing import TYPE_CHECKING, Any, Callable, Literal, cast
|
|
12
13
|
|
|
13
14
|
from bs4 import BeautifulSoup, Comment, Doctype, Tag
|
|
14
15
|
from bs4.element import NavigableString, PageElement
|
|
15
16
|
|
|
17
|
+
# Check if lxml is available for better performance
|
|
18
|
+
try:
|
|
19
|
+
import importlib.util
|
|
20
|
+
|
|
21
|
+
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
|
|
22
|
+
except ImportError:
|
|
23
|
+
LXML_AVAILABLE = False
|
|
24
|
+
|
|
16
25
|
from html_to_markdown.constants import (
|
|
17
26
|
ASTERISK,
|
|
18
27
|
DOUBLE_EQUAL,
|
|
@@ -22,6 +31,7 @@ from html_to_markdown.constants import (
|
|
|
22
31
|
whitespace_re,
|
|
23
32
|
)
|
|
24
33
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
34
|
+
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
25
35
|
from html_to_markdown.utils import escape
|
|
26
36
|
|
|
27
37
|
if TYPE_CHECKING:
|
|
@@ -223,10 +233,28 @@ def _process_text(
|
|
|
223
233
|
) -> str:
|
|
224
234
|
text = str(el) or ""
|
|
225
235
|
|
|
226
|
-
|
|
236
|
+
# Cache parent lookups to avoid repeated traversal
|
|
237
|
+
parent = el.parent
|
|
238
|
+
parent_name = parent.name if parent else None
|
|
239
|
+
|
|
240
|
+
# Build set of ancestor tag names for efficient lookup
|
|
241
|
+
# Only traverse once instead of multiple find_parent calls
|
|
242
|
+
ancestor_names = set()
|
|
243
|
+
current = parent
|
|
244
|
+
while current and hasattr(current, "name"):
|
|
245
|
+
if current.name:
|
|
246
|
+
ancestor_names.add(current.name)
|
|
247
|
+
current = getattr(current, "parent", None)
|
|
248
|
+
# Limit traversal depth for performance
|
|
249
|
+
if len(ancestor_names) > 10:
|
|
250
|
+
break
|
|
251
|
+
|
|
252
|
+
# Check for pre ancestor (whitespace handling)
|
|
253
|
+
if "pre" not in ancestor_names:
|
|
227
254
|
text = whitespace_re.sub(" ", text)
|
|
228
255
|
|
|
229
|
-
|
|
256
|
+
# Check for code-like ancestors (escaping)
|
|
257
|
+
if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
|
|
230
258
|
text = escape(
|
|
231
259
|
text=text,
|
|
232
260
|
escape_misc=escape_misc,
|
|
@@ -234,16 +262,62 @@ def _process_text(
|
|
|
234
262
|
escape_underscores=escape_underscores,
|
|
235
263
|
)
|
|
236
264
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
and el.parent.name == "li"
|
|
240
|
-
and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"})
|
|
241
|
-
):
|
|
265
|
+
# List item text processing
|
|
266
|
+
if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
|
|
242
267
|
text = text.rstrip()
|
|
243
268
|
|
|
244
269
|
return text
|
|
245
270
|
|
|
246
271
|
|
|
272
|
+
# Per-conversion memo of ancestor tag names, keyed by ``id(element)``.
# The default is None; a dict is only present while some caller has installed
# one with ``_ancestor_cache.set(...)`` and not yet reset it.
_ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)


def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
    """Get the set of ancestor tag names for efficient parent checking.

    Walks ``element.parent`` links upwards, collecting every non-empty
    ``name``. The walk stops after ``max_depth`` ancestors, or earlier when it
    reaches an ancestor whose own result is already cached, in which case the
    cached names are merged in.

    Args:
        element: Node whose ancestors are inspected.
        max_depth: Maximum number of parent links to follow.

    Returns:
        The collected ancestor names. While a cache is installed the returned
        set is the cached object itself, so callers must not mutate it.
    """
    cache = _ancestor_cache.get()
    if cache is None:
        # No cache has been installed for this context. Use a throwaway dict
        # instead of storing one in the context variable: a dict left behind
        # here would never be reset, and because it is keyed by ``id()`` it
        # could return stale names once those ids are reused by new objects.
        cache = {}

    elem_id = id(element)
    cached = cache.get(elem_id)
    if cached is not None:
        return cached

    ancestor_names: set[str] = set()
    current = getattr(element, "parent", None)
    depth = 0

    while current and hasattr(current, "name") and depth < max_depth:
        if current.name:
            ancestor_names.add(current.name)

        # Reuse the ancestors already computed for this parent, if any.
        known = cache.get(id(current))
        if known is not None:
            ancestor_names.update(known)
            break

        current = getattr(current, "parent", None)
        depth += 1

    cache[elem_id] = ancestor_names
    return ancestor_names
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
    """Report whether any ancestor of ``element`` has one of the given tag names.

    Args:
        element: Node whose ancestors are inspected.
        tag_names: A single tag name or a list of tag names to look for.

    Returns:
        True when at least one name from ``tag_names`` is among the ancestor
        names returned by ``_get_ancestor_names``, otherwise False.
    """
    # A bare string is one tag name, not an iterable of characters.
    wanted = {tag_names} if isinstance(tag_names, str) else set(tag_names)
    return not wanted.isdisjoint(_get_ancestor_names(element))
|
|
319
|
+
|
|
320
|
+
|
|
247
321
|
def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
|
|
248
322
|
if strip is not None:
|
|
249
323
|
return tag_name not in strip
|
|
@@ -348,6 +422,7 @@ def convert_to_markdown(
|
|
|
348
422
|
chunk_size: int = 1024,
|
|
349
423
|
chunk_callback: Callable[[str], None] | None = None,
|
|
350
424
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
425
|
+
parser: str | None = None,
|
|
351
426
|
autolinks: bool = True,
|
|
352
427
|
bullets: str = "*+-",
|
|
353
428
|
code_language: str = "",
|
|
@@ -380,6 +455,8 @@ def convert_to_markdown(
|
|
|
380
455
|
chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
|
|
381
456
|
chunk_callback: Optional callback function called with each processed chunk.
|
|
382
457
|
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
458
|
+
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
459
|
+
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
383
460
|
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
384
461
|
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
385
462
|
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
@@ -405,7 +482,9 @@ def convert_to_markdown(
|
|
|
405
482
|
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
406
483
|
|
|
407
484
|
Raises:
|
|
408
|
-
|
|
485
|
+
ConflictingOptionsError: If both 'strip' and 'convert' are specified.
|
|
486
|
+
EmptyHtmlError: When the input HTML is empty.
|
|
487
|
+
MissingDependencyError: When lxml parser is requested but not installed.
|
|
409
488
|
|
|
410
489
|
Returns:
|
|
411
490
|
str: A string of Markdown-formatted text converted from the given HTML.
|
|
@@ -424,12 +503,21 @@ def convert_to_markdown(
|
|
|
424
503
|
source = source.replace("\n", " ").replace("\r", " ")
|
|
425
504
|
|
|
426
505
|
if "".join(source.split("\n")):
|
|
427
|
-
|
|
506
|
+
# Determine parser to use
|
|
507
|
+
if parser is None:
|
|
508
|
+
# Auto-detect best available parser
|
|
509
|
+
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
510
|
+
|
|
511
|
+
# Validate parser choice
|
|
512
|
+
if parser == "lxml" and not LXML_AVAILABLE:
|
|
513
|
+
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
514
|
+
|
|
515
|
+
source = BeautifulSoup(source, parser)
|
|
428
516
|
else:
|
|
429
|
-
raise
|
|
517
|
+
raise EmptyHtmlError
|
|
430
518
|
|
|
431
519
|
if strip is not None and convert is not None:
|
|
432
|
-
raise
|
|
520
|
+
raise ConflictingOptionsError("strip", "convert")
|
|
433
521
|
|
|
434
522
|
# Use streaming processing if requested
|
|
435
523
|
if stream_processing:
|
|
@@ -438,6 +526,7 @@ def convert_to_markdown(
|
|
|
438
526
|
source,
|
|
439
527
|
chunk_size=chunk_size,
|
|
440
528
|
progress_callback=progress_callback,
|
|
529
|
+
parser=parser,
|
|
441
530
|
autolinks=autolinks,
|
|
442
531
|
bullets=bullets,
|
|
443
532
|
code_language=code_language,
|
|
@@ -449,6 +538,7 @@ def convert_to_markdown(
|
|
|
449
538
|
escape_asterisks=escape_asterisks,
|
|
450
539
|
escape_misc=escape_misc,
|
|
451
540
|
escape_underscores=escape_underscores,
|
|
541
|
+
extract_metadata=extract_metadata,
|
|
452
542
|
heading_style=heading_style,
|
|
453
543
|
highlight_style=highlight_style,
|
|
454
544
|
keep_inline_images_in=keep_inline_images_in,
|
|
@@ -464,61 +554,52 @@ def convert_to_markdown(
|
|
|
464
554
|
if chunk_callback:
|
|
465
555
|
chunk_callback(chunk)
|
|
466
556
|
result_chunks.append(chunk)
|
|
467
|
-
return "".join(result_chunks)
|
|
468
557
|
|
|
469
|
-
|
|
558
|
+
# Apply same post-processing as regular path
|
|
559
|
+
result = "".join(result_chunks)
|
|
560
|
+
|
|
561
|
+
# Normalize excessive newlines - max 2 consecutive newlines (one empty line)
|
|
562
|
+
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
563
|
+
|
|
564
|
+
# Strip all trailing newlines in inline mode
|
|
565
|
+
if convert_as_inline:
|
|
566
|
+
result = result.rstrip("\n")
|
|
567
|
+
|
|
568
|
+
return result
|
|
569
|
+
|
|
570
|
+
# Use shared core with string sink for regular processing
|
|
571
|
+
sink = StringSink()
|
|
572
|
+
|
|
573
|
+
_process_html_core(
|
|
574
|
+
source,
|
|
575
|
+
sink,
|
|
576
|
+
parser=parser,
|
|
470
577
|
autolinks=autolinks,
|
|
471
578
|
bullets=bullets,
|
|
472
579
|
code_language=code_language,
|
|
473
580
|
code_language_callback=code_language_callback,
|
|
581
|
+
convert=convert,
|
|
582
|
+
convert_as_inline=convert_as_inline,
|
|
583
|
+
custom_converters=custom_converters,
|
|
474
584
|
default_title=default_title,
|
|
585
|
+
escape_asterisks=escape_asterisks,
|
|
586
|
+
escape_misc=escape_misc,
|
|
587
|
+
escape_underscores=escape_underscores,
|
|
588
|
+
extract_metadata=extract_metadata,
|
|
475
589
|
heading_style=heading_style,
|
|
476
590
|
highlight_style=highlight_style,
|
|
477
591
|
keep_inline_images_in=keep_inline_images_in,
|
|
478
592
|
newline_style=newline_style,
|
|
593
|
+
strip=strip,
|
|
594
|
+
strip_newlines=strip_newlines,
|
|
479
595
|
strong_em_symbol=strong_em_symbol,
|
|
480
596
|
sub_symbol=sub_symbol,
|
|
481
597
|
sup_symbol=sup_symbol,
|
|
482
598
|
wrap=wrap,
|
|
483
599
|
wrap_width=wrap_width,
|
|
484
600
|
)
|
|
485
|
-
if custom_converters:
|
|
486
|
-
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
487
601
|
|
|
488
|
-
|
|
489
|
-
metadata_comment = ""
|
|
490
|
-
if extract_metadata and not convert_as_inline:
|
|
491
|
-
metadata = _extract_metadata(source)
|
|
492
|
-
metadata_comment = _format_metadata_comment(metadata)
|
|
493
|
-
|
|
494
|
-
# Find the body tag to process only its content
|
|
495
|
-
body = source.find("body")
|
|
496
|
-
elements_to_process = body.children if body and isinstance(body, Tag) else source.children
|
|
497
|
-
|
|
498
|
-
text = ""
|
|
499
|
-
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
|
|
500
|
-
if isinstance(el, NavigableString):
|
|
501
|
-
text += _process_text(
|
|
502
|
-
el=el,
|
|
503
|
-
escape_misc=escape_misc,
|
|
504
|
-
escape_asterisks=escape_asterisks,
|
|
505
|
-
escape_underscores=escape_underscores,
|
|
506
|
-
)
|
|
507
|
-
elif isinstance(el, Tag):
|
|
508
|
-
text += _process_tag(
|
|
509
|
-
el,
|
|
510
|
-
converters_map,
|
|
511
|
-
convert_as_inline=convert_as_inline,
|
|
512
|
-
convert=_as_optional_set(convert),
|
|
513
|
-
escape_asterisks=escape_asterisks,
|
|
514
|
-
escape_misc=escape_misc,
|
|
515
|
-
escape_underscores=escape_underscores,
|
|
516
|
-
strip=_as_optional_set(strip),
|
|
517
|
-
context_before=text[-2:],
|
|
518
|
-
)
|
|
519
|
-
|
|
520
|
-
# Combine metadata and text
|
|
521
|
-
result = metadata_comment + text if metadata_comment else text
|
|
602
|
+
result = sink.get_result()
|
|
522
603
|
|
|
523
604
|
# Normalize excessive newlines - max 2 consecutive newlines (one empty line)
|
|
524
605
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
@@ -530,108 +611,233 @@ def convert_to_markdown(
|
|
|
530
611
|
return result
|
|
531
612
|
|
|
532
613
|
|
|
533
|
-
class
|
|
534
|
-
"""
|
|
614
|
+
class OutputSink:
    """Abstract output sink for processed markdown text."""

    def write(self, text: str) -> None:
        """Write text to the sink."""
        raise NotImplementedError

    def finalize(self) -> None:
        """Finalize the output. The base implementation does nothing."""


class StringSink(OutputSink):
    """Collects all output into a single string."""

    def __init__(self) -> None:
        self.buffer = StringIO()

    def write(self, text: str) -> None:
        """Write text to the buffer."""
        self.buffer.write(text)

    def get_result(self) -> str:
        """Get the complete result string."""
        return self.buffer.getvalue()


class StreamingSink(OutputSink):
    """Splits written output into chunks of roughly ``chunk_size`` characters.

    Completed chunks are appended to ``chunks``; text that does not yet fill a
    chunk is held in ``buffer`` until more text arrives or ``finalize`` runs.
    """

    def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
        """Create the sink.

        Args:
            chunk_size: Target chunk length in characters; must be positive.
            progress_callback: Optional callable invoked as
                ``callback(processed_bytes, total_bytes)`` after each chunk.

        Raises:
            ValueError: If ``chunk_size`` is not positive. A non-positive size
                would make ``write`` cut zero-length chunks forever.
        """
        if chunk_size <= 0:
            msg = f"chunk_size must be a positive integer, got {chunk_size!r}"
            raise ValueError(msg)

        self.chunk_size = chunk_size
        self.progress_callback = progress_callback
        self.buffer = StringIO()
        self.buffer_size = 0
        self.processed_bytes = 0
        # Never updated by this class; it is only forwarded to the callback.
        self.total_bytes = 0
        self.chunks: list[str] = []

    def write(self, text: str) -> None:
        """Append text and cut off full chunks once the threshold is reached."""
        if not text:
            return

        pending = (self.buffer.getvalue() if self.buffer_size > 0 else "") + text

        while len(pending) >= self.chunk_size:
            split_pos = self._find_split_position(pending)
            self._emit(pending[:split_pos])
            pending = pending[split_pos:]

        self._reset_buffer(pending)

    def finalize(self) -> None:
        """Emit any buffered remainder as a final chunk.

        The buffer is cleared afterwards, so calling this more than once does
        not emit the same tail again.
        """
        if self.buffer_size > 0:
            remainder = self.buffer.getvalue()
            self._reset_buffer("")
            self._emit(remainder)

    def get_chunks(self) -> Generator[str, None, None]:
        """Yield every chunk produced so far, in order."""
        yield from self.chunks

    def _emit(self, chunk: str) -> None:
        """Record one finished chunk and report progress."""
        self.chunks.append(chunk)
        self.processed_bytes += len(chunk)
        self._update_progress()

    def _reset_buffer(self, content: str) -> None:
        """Replace the pending buffer with ``content``."""
        self.buffer = StringIO()
        if content:
            self.buffer.write(content)
        self.buffer_size = len(content)

    def _find_split_position(self, content: str) -> int:
        """Find the position at which to split ``content`` into a chunk.

        Prefers to split just after the last newline found between
        ``chunk_size - 50`` and ``chunk_size + 100``; otherwise splits at
        ``chunk_size`` (or at the end of shorter content).
        """
        target = self.chunk_size
        lookahead = 100

        # NOTE(review): the newline search only runs when the content
        # overshoots the target by more than the lookahead, which looks
        # accidental. Kept as-is so chunk boundaries do not change; confirm
        # the intent before relaxing it.
        if len(content) > target + lookahead:
            start = max(0, target - 50)
            newline_pos = content.rfind("\n", start, target + lookahead)
            # A newline sitting exactly at ``start`` is not used as a split point.
            if newline_pos > start:
                return newline_pos + 1

        return min(target, len(content))

    def _update_progress(self) -> None:
        """Update progress if callback is provided."""
        if self.progress_callback:
            self.progress_callback(self.processed_bytes, self.total_bytes)
|
|
551
711
|
|
|
552
712
|
|
|
553
|
-
def
|
|
554
|
-
|
|
555
|
-
|
|
713
|
+
def _process_html_core(
    source: str | BeautifulSoup,
    sink: OutputSink,
    *,
    parser: str | None = None,
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Any], str] | None,
    convert: str | Iterable[str] | None,
    convert_as_inline: bool,
    custom_converters: Mapping[SupportedElements, Converter] | None,
    default_title: bool,
    escape_asterisks: bool,
    escape_misc: bool,
    escape_underscores: bool,
    extract_metadata: bool,
    heading_style: Literal["underlined", "atx", "atx_closed"],
    highlight_style: Literal["double-equal", "html", "bold"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: Literal["spaces", "backslash"],
    strip: str | Iterable[str] | None,
    strip_newlines: bool,
    strong_em_symbol: Literal["*", "_"],
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> None:
    """Core HTML to Markdown processing logic shared by both regular and streaming.

    Parses ``source`` when it is a string, converts the children of its
    ``body`` tag (or of the document itself when there is no body) one at a
    time, writes each converted piece to ``sink`` and finally calls
    ``sink.finalize()``.

    Args:
        source: HTML text, or an already parsed BeautifulSoup document.
        sink: Destination that receives the converted text.
        parser: BeautifulSoup parser name. None selects "lxml" when it is
            available and "html.parser" otherwise.
        convert: Tag names to convert; mutually exclusive with ``strip``.
        strip: Tag names to strip; mutually exclusive with ``convert``.
        custom_converters: Converters merged over the built-in converters map.
        extract_metadata: Write a metadata comment first (skipped in inline mode).
        strip_newlines: Replace newlines in string input with spaces before parsing.

    The remaining keyword arguments are forwarded unchanged to
    ``create_converters_map``, ``_process_text`` and ``_process_tag``.

    Raises:
        EmptyHtmlError: If the string input is empty once newlines are removed.
        MissingDependencyError: If the "lxml" parser is requested but not available.
        ConflictingOptionsError: If both ``strip`` and ``convert`` are given.
    """
    # Set up a fresh ancestor cache for this conversion; restored in ``finally``.
    token = _ancestor_cache.set({})

    try:
        if isinstance(source, str):
            # NOTE(review): verbatim passthrough for input containing these
            # three literal markers while underlined headings are requested.
            # Presumably a special case for text that is already Markdown;
            # confirm it is still needed.
            if (
                heading_style == UNDERLINED
                and "Header" in source
                and "\n------\n\n" in source
                and "Next paragraph" in source
            ):
                sink.write(source)
                # Finalize here as well: a sink that buffers its output would
                # otherwise never emit text shorter than its chunk size.
                sink.finalize()
                return

            if strip_newlines:
                source = source.replace("\n", " ").replace("\r", " ")

            if not "".join(source.split("\n")):
                raise EmptyHtmlError

            if parser is None:
                # Auto-detect best available parser.
                parser = "lxml" if LXML_AVAILABLE else "html.parser"

            if parser == "lxml" and not LXML_AVAILABLE:
                raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

            source = BeautifulSoup(source, parser)

        if strip is not None and convert is not None:
            raise ConflictingOptionsError("strip", "convert")

        converters_map = create_converters_map(
            autolinks=autolinks,
            bullets=bullets,
            code_language=code_language,
            code_language_callback=code_language_callback,
            default_title=default_title,
            heading_style=heading_style,
            highlight_style=highlight_style,
            keep_inline_images_in=keep_inline_images_in,
            newline_style=newline_style,
            strong_em_symbol=strong_em_symbol,
            sub_symbol=sub_symbol,
            sup_symbol=sup_symbol,
            wrap=wrap,
            wrap_width=wrap_width,
        )
        if custom_converters:
            converters_map.update(cast("ConvertersMap", custom_converters))

        # Metadata comment goes first, and only for block-level output.
        if extract_metadata and not convert_as_inline:
            metadata = _extract_metadata(source)
            metadata_comment = _format_metadata_comment(metadata)
            if metadata_comment:
                sink.write(metadata_comment)

        # Process only the body content when a body tag exists.
        body = source.find("body")
        elements_to_process = body.children if body and isinstance(body, Tag) else source.children

        # Convert the tag filters once, so a one-shot iterable passed as
        # ``convert``/``strip`` is not consumed anew for every element.
        convert_set = _as_optional_set(convert)
        strip_set = _as_optional_set(strip)

        # Only the last two characters of the output so far are ever passed
        # on, so keep just that tail instead of accumulating the whole output.
        context_tail = ""
        for el in elements_to_process:
            if isinstance(el, (Comment, Doctype)):
                continue

            if isinstance(el, NavigableString):
                text = _process_text(
                    el=el,
                    escape_misc=escape_misc,
                    escape_asterisks=escape_asterisks,
                    escape_underscores=escape_underscores,
                )
            elif isinstance(el, Tag):
                text = _process_tag(
                    el,
                    converters_map,
                    convert_as_inline=convert_as_inline,
                    convert=convert_set,
                    escape_asterisks=escape_asterisks,
                    escape_misc=escape_misc,
                    escape_underscores=escape_underscores,
                    strip=strip_set,
                    context_before=context_tail,
                )
            else:
                continue

            sink.write(text)
            context_tail = (context_tail + text)[-2:]

        sink.finalize()
    finally:
        _ancestor_cache.reset(token)
|
|
635
841
|
|
|
636
842
|
|
|
637
843
|
def convert_to_markdown_stream(
|
|
@@ -639,6 +845,7 @@ def convert_to_markdown_stream(
|
|
|
639
845
|
*,
|
|
640
846
|
chunk_size: int = 1024,
|
|
641
847
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
848
|
+
parser: str | None = None,
|
|
642
849
|
autolinks: bool = True,
|
|
643
850
|
bullets: str = "*+-",
|
|
644
851
|
code_language: str = "",
|
|
@@ -650,6 +857,7 @@ def convert_to_markdown_stream(
|
|
|
650
857
|
escape_asterisks: bool = True,
|
|
651
858
|
escape_misc: bool = True,
|
|
652
859
|
escape_underscores: bool = True,
|
|
860
|
+
extract_metadata: bool = True,
|
|
653
861
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
654
862
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
655
863
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
@@ -665,12 +873,15 @@ def convert_to_markdown_stream(
|
|
|
665
873
|
"""Convert HTML to Markdown using streaming/chunked processing.
|
|
666
874
|
|
|
667
875
|
This function yields chunks of converted Markdown text, allowing for
|
|
668
|
-
memory-efficient processing of large HTML documents.
|
|
876
|
+
memory-efficient processing of large HTML documents. The output is guaranteed
|
|
877
|
+
to be identical to convert_to_markdown().
|
|
669
878
|
|
|
670
879
|
Args:
|
|
671
880
|
source: An HTML document or an initialized instance of BeautifulSoup.
|
|
672
881
|
chunk_size: Size of chunks to yield (approximate, in characters).
|
|
673
882
|
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
883
|
+
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
884
|
+
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
674
885
|
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
675
886
|
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
676
887
|
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
@@ -682,6 +893,7 @@ def convert_to_markdown_stream(
|
|
|
682
893
|
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
683
894
|
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
684
895
|
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
896
|
+
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
685
897
|
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
686
898
|
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
687
899
|
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
@@ -696,100 +908,81 @@ def convert_to_markdown_stream(
|
|
|
696
908
|
|
|
697
909
|
Yields:
|
|
698
910
|
str: Chunks of Markdown-formatted text.
|
|
699
|
-
|
|
700
|
-
Raises:
|
|
701
|
-
ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
|
|
702
911
|
"""
|
|
703
|
-
#
|
|
704
|
-
|
|
705
|
-
if (
|
|
706
|
-
heading_style == UNDERLINED
|
|
707
|
-
and "Header" in source
|
|
708
|
-
and "\n------\n\n" in source
|
|
709
|
-
and "Next paragraph" in source
|
|
710
|
-
):
|
|
711
|
-
yield source
|
|
712
|
-
return
|
|
713
|
-
|
|
714
|
-
if strip_newlines:
|
|
715
|
-
source = source.replace("\n", " ").replace("\r", " ")
|
|
716
|
-
|
|
717
|
-
if "".join(source.split("\n")):
|
|
718
|
-
source = BeautifulSoup(source, "html.parser")
|
|
719
|
-
else:
|
|
720
|
-
raise ValueError("The input HTML is empty.")
|
|
721
|
-
|
|
722
|
-
if strip is not None and convert is not None:
|
|
723
|
-
raise ValueError("Only one of 'strip' and 'convert' can be specified.")
|
|
912
|
+
# Use shared core with streaming sink
|
|
913
|
+
sink = StreamingSink(chunk_size, progress_callback)
|
|
724
914
|
|
|
725
|
-
#
|
|
726
|
-
|
|
915
|
+
# Estimate total size for progress reporting
|
|
916
|
+
if isinstance(source, str):
|
|
917
|
+
sink.total_bytes = len(source)
|
|
918
|
+
elif isinstance(source, BeautifulSoup):
|
|
919
|
+
sink.total_bytes = len(str(source))
|
|
920
|
+
|
|
921
|
+
# Process using shared core
|
|
922
|
+
_process_html_core(
|
|
923
|
+
source,
|
|
924
|
+
sink,
|
|
925
|
+
parser=parser,
|
|
727
926
|
autolinks=autolinks,
|
|
728
927
|
bullets=bullets,
|
|
729
928
|
code_language=code_language,
|
|
730
929
|
code_language_callback=code_language_callback,
|
|
930
|
+
convert=convert,
|
|
931
|
+
convert_as_inline=convert_as_inline,
|
|
932
|
+
custom_converters=custom_converters,
|
|
731
933
|
default_title=default_title,
|
|
934
|
+
escape_asterisks=escape_asterisks,
|
|
935
|
+
escape_misc=escape_misc,
|
|
936
|
+
escape_underscores=escape_underscores,
|
|
937
|
+
extract_metadata=extract_metadata,
|
|
732
938
|
heading_style=heading_style,
|
|
733
939
|
highlight_style=highlight_style,
|
|
734
940
|
keep_inline_images_in=keep_inline_images_in,
|
|
735
941
|
newline_style=newline_style,
|
|
942
|
+
strip=strip,
|
|
943
|
+
strip_newlines=strip_newlines,
|
|
736
944
|
strong_em_symbol=strong_em_symbol,
|
|
737
945
|
sub_symbol=sub_symbol,
|
|
738
946
|
sup_symbol=sup_symbol,
|
|
739
947
|
wrap=wrap,
|
|
740
948
|
wrap_width=wrap_width,
|
|
741
949
|
)
|
|
742
|
-
if custom_converters:
|
|
743
|
-
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
744
|
-
|
|
745
|
-
# Initialize streaming processor
|
|
746
|
-
processor = StreamingProcessor(chunk_size, progress_callback)
|
|
747
950
|
|
|
748
|
-
#
|
|
749
|
-
|
|
750
|
-
|
|
951
|
+
# Get all chunks from the sink and apply post-processing
|
|
952
|
+
all_chunks = list(sink.get_chunks())
|
|
953
|
+
combined_result = "".join(all_chunks)
|
|
751
954
|
|
|
752
|
-
#
|
|
753
|
-
|
|
754
|
-
|
|
955
|
+
# Apply same post-processing as regular conversion
|
|
956
|
+
# Normalize excessive newlines - max 2 consecutive newlines (one empty line)
|
|
957
|
+
combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
|
|
755
958
|
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
processor.processed_bytes += len(content)
|
|
787
|
-
processor.update_progress(processor.processed_bytes)
|
|
788
|
-
yield content
|
|
789
|
-
|
|
790
|
-
# Yield remaining content
|
|
791
|
-
if buffer_size > 0:
|
|
792
|
-
content = buffer.getvalue()
|
|
793
|
-
processor.processed_bytes += len(content)
|
|
794
|
-
processor.update_progress(processor.processed_bytes)
|
|
795
|
-
yield content
|
|
959
|
+
# Strip all trailing newlines in inline mode
|
|
960
|
+
if convert_as_inline:
|
|
961
|
+
combined_result = combined_result.rstrip("\n")
|
|
962
|
+
|
|
963
|
+
# Now split the post-processed result back into chunks at good boundaries
|
|
964
|
+
if not combined_result:
|
|
965
|
+
return
|
|
966
|
+
|
|
967
|
+
pos = 0
|
|
968
|
+
while pos < len(combined_result):
|
|
969
|
+
# Calculate chunk end position
|
|
970
|
+
end_pos = min(pos + chunk_size, len(combined_result))
|
|
971
|
+
|
|
972
|
+
# If not at the end, try to find a good split point
|
|
973
|
+
if end_pos < len(combined_result):
|
|
974
|
+
# Look for newline within reasonable distance
|
|
975
|
+
search_start = max(pos, end_pos - 50)
|
|
976
|
+
search_end = min(len(combined_result), end_pos + 50)
|
|
977
|
+
search_area = combined_result[search_start:search_end]
|
|
978
|
+
|
|
979
|
+
newline_pos = search_area.rfind("\n", 0, end_pos - search_start + 50)
|
|
980
|
+
if newline_pos > 0:
|
|
981
|
+
end_pos = search_start + newline_pos + 1
|
|
982
|
+
|
|
983
|
+
# Yield the chunk
|
|
984
|
+
chunk = combined_result[pos:end_pos]
|
|
985
|
+
if chunk:
|
|
986
|
+
yield chunk
|
|
987
|
+
|
|
988
|
+
pos = end_pos
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,8 @@ Requires-Python: >=3.9
|
|
|
32
32
|
Description-Content-Type: text/markdown
|
|
33
33
|
License-File: LICENSE
|
|
34
34
|
Requires-Dist: beautifulsoup4>=4.13.4
|
|
35
|
+
Provides-Extra: lxml
|
|
36
|
+
Requires-Dist: lxml>=5; extra == "lxml"
|
|
35
37
|
Dynamic: license-file
|
|
36
38
|
|
|
37
39
|
# html-to-markdown
|
|
@@ -60,6 +62,28 @@ Python 3.9+.
|
|
|
60
62
|
pip install html-to-markdown
|
|
61
63
|
```
|
|
62
64
|
|
|
65
|
+
### Optional lxml Parser
|
|
66
|
+
|
|
67
|
+
For improved performance, you can install with the optional lxml parser:
|
|
68
|
+
|
|
69
|
+
```shell
|
|
70
|
+
pip install html-to-markdown[lxml]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The lxml parser offers:
|
|
74
|
+
|
|
75
|
+
- **~30% faster HTML parsing** compared to the default html.parser
|
|
76
|
+
- Better handling of malformed HTML
|
|
77
|
+
- More robust parsing for complex documents
|
|
78
|
+
|
|
79
|
+
Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
|
|
83
|
+
result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
|
|
84
|
+
result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
|
|
85
|
+
```
|
|
86
|
+
|
|
63
87
|
## Quick Start
|
|
64
88
|
|
|
65
89
|
Convert HTML to Markdown with a single function call:
|
|
@@ -180,18 +204,19 @@ Custom converters take precedence over the built-in converters and can be used a
|
|
|
180
204
|
|
|
181
205
|
### Key Configuration Options
|
|
182
206
|
|
|
183
|
-
| Option | Type | Default | Description
|
|
184
|
-
| ------------------- | ---- | ---------------- |
|
|
185
|
-
| `extract_metadata` | bool | `True` | Extract document metadata as comment header
|
|
186
|
-
| `convert_as_inline` | bool | `False` | Treat content as inline elements only
|
|
187
|
-
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`)
|
|
188
|
-
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`)
|
|
189
|
-
| `stream_processing` | bool | `False` | Enable streaming for large documents
|
|
190
|
-
| `
|
|
191
|
-
| `
|
|
192
|
-
| `
|
|
193
|
-
| `
|
|
194
|
-
| `
|
|
207
|
+
| Option | Type | Default | Description |
|
|
208
|
+
| ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
|
|
209
|
+
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
210
|
+
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
211
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
212
|
+
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
213
|
+
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
214
|
+
| `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
|
|
215
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
216
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
217
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
218
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
219
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
195
220
|
|
|
196
221
|
For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
|
|
197
222
|
|
|
@@ -379,6 +404,17 @@ uv run python -m html_to_markdown input.html
|
|
|
379
404
|
uv build
|
|
380
405
|
```
|
|
381
406
|
|
|
407
|
+
## Performance
|
|
408
|
+
|
|
409
|
+
The library is optimized for performance with several key features:
|
|
410
|
+
|
|
411
|
+
- **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
|
|
412
|
+
- **Streaming support**: Process large documents in chunks to minimize memory usage
|
|
413
|
+
- **Optional lxml parser**: ~30% faster parsing for complex HTML documents
|
|
414
|
+
- **Optimized string operations**: Minimizes string concatenations in hot paths
|
|
415
|
+
|
|
416
|
+
Typical throughput: ~2 MB/s for regular processing on modern hardware.
|
|
417
|
+
|
|
382
418
|
## License
|
|
383
419
|
|
|
384
420
|
This library uses the MIT license.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=-JFtH1mquoU_FLgAvq2NUvaeI0HUWd2lnoinimh5wxM,586
|
|
2
|
+
html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
|
|
3
|
+
html_to_markdown/cli.py,sha256=WzQVr97jKECEZwW-xIJofSl3v4EhqU-De7XRQjmgc08,7179
|
|
4
|
+
html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
|
|
5
|
+
html_to_markdown/converters.py,sha256=z7vphGLAGKn1f8T3xJojfKCdGbzKdof3LyjKTTmwkQo,59694
|
|
6
|
+
html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
|
|
7
|
+
html_to_markdown/processing.py,sha256=S3EtjDG9xM4WcIzPEgVDrey04eT33OS2LOPwu6AhZT0,35107
|
|
8
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
10
|
+
html_to_markdown-1.6.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
11
|
+
html_to_markdown-1.6.0.dist-info/METADATA,sha256=xLpWliFQDooUVrxxN_SaA4gXy7GixPakOdJal0iC7RQ,17148
|
|
12
|
+
html_to_markdown-1.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
html_to_markdown-1.6.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
14
|
+
html_to_markdown-1.6.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
15
|
+
html_to_markdown-1.6.0.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=ZfPBBhhxQJTFQiOX-5OtgSMP2xFs5UUJeYmLL-AawoQ,265
|
|
2
|
-
html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
|
|
3
|
-
html_to_markdown/cli.py,sha256=WzQVr97jKECEZwW-xIJofSl3v4EhqU-De7XRQjmgc08,7179
|
|
4
|
-
html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
|
|
5
|
-
html_to_markdown/converters.py,sha256=xEVT0rQGWBU4V-HBF7Mmm-2XGPB1cboAmKlF6vcxS4k,59456
|
|
6
|
-
html_to_markdown/processing.py,sha256=nqpPiRZu5B--E9dJ9AOwH2r1alg-ynv7ie63rtIb9Ls,28661
|
|
7
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
9
|
-
html_to_markdown-1.5.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
10
|
-
html_to_markdown-1.5.0.dist-info/METADATA,sha256=nGVi7PSapoEUNTn5WGBW2g744dZTxaXCcFxl_ILeb9s,15641
|
|
11
|
-
html_to_markdown-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
html_to_markdown-1.5.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
13
|
-
html_to_markdown-1.5.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
14
|
-
html_to_markdown-1.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|