html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +34 -502
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +117 -191
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +303 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/METADATA +196 -204
- html_to_markdown-1.11.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.1.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.11.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
|
@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
|
|
|
33
33
|
DOUBLE_EQUAL,
|
|
34
34
|
SPACES,
|
|
35
35
|
UNDERLINED,
|
|
36
|
+
WHITESPACE_NORMALIZED,
|
|
36
37
|
html_heading_re,
|
|
37
|
-
whitespace_re,
|
|
38
38
|
)
|
|
39
39
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
40
40
|
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
41
41
|
from html_to_markdown.utils import escape
|
|
42
|
+
from html_to_markdown.whitespace import WhitespaceHandler
|
|
42
43
|
|
|
43
44
|
if TYPE_CHECKING:
|
|
44
45
|
from collections.abc import Iterable
|
|
@@ -143,6 +144,12 @@ SupportedTag = Literal[
|
|
|
143
144
|
]
|
|
144
145
|
|
|
145
146
|
|
|
147
|
+
def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
|
|
148
|
+
if list_indent_type == "tabs":
|
|
149
|
+
return "\t"
|
|
150
|
+
return " " * list_indent_width
|
|
151
|
+
|
|
152
|
+
|
|
146
153
|
def _is_nested_tag(el: PageElement) -> bool:
|
|
147
154
|
return isinstance(el, Tag) and el.name in {
|
|
148
155
|
"ol",
|
|
@@ -170,6 +177,7 @@ def _process_tag(
|
|
|
170
177
|
escape_misc: bool,
|
|
171
178
|
escape_underscores: bool,
|
|
172
179
|
strip: set[str] | None,
|
|
180
|
+
whitespace_handler: WhitespaceHandler,
|
|
173
181
|
context_before: str = "",
|
|
174
182
|
) -> str:
|
|
175
183
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
@@ -218,6 +226,7 @@ def _process_tag(
|
|
|
218
226
|
escape_misc=escape_misc,
|
|
219
227
|
escape_asterisks=escape_asterisks,
|
|
220
228
|
escape_underscores=escape_underscores,
|
|
229
|
+
whitespace_handler=whitespace_handler,
|
|
221
230
|
)
|
|
222
231
|
)
|
|
223
232
|
elif isinstance(el, Tag):
|
|
@@ -232,6 +241,7 @@ def _process_tag(
|
|
|
232
241
|
escape_misc=escape_misc,
|
|
233
242
|
escape_underscores=escape_underscores,
|
|
234
243
|
strip=strip,
|
|
244
|
+
whitespace_handler=whitespace_handler,
|
|
235
245
|
context_before=(context_before + current_text)[-2:],
|
|
236
246
|
)
|
|
237
247
|
)
|
|
@@ -248,6 +258,18 @@ def _process_tag(
|
|
|
248
258
|
if n_eol_to_add > 0:
|
|
249
259
|
prefix = "\n" * n_eol_to_add
|
|
250
260
|
return f"{prefix}{rendered}"
|
|
261
|
+
|
|
262
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
263
|
+
|
|
264
|
+
is_block_element = tag.name.lower() in BLOCK_ELEMENTS
|
|
265
|
+
if (
|
|
266
|
+
is_block_element
|
|
267
|
+
and not convert_as_inline
|
|
268
|
+
and context_before
|
|
269
|
+
and not context_before.endswith("\n")
|
|
270
|
+
and rendered.strip()
|
|
271
|
+
):
|
|
272
|
+
return f"\n\n{rendered}"
|
|
251
273
|
return rendered
|
|
252
274
|
|
|
253
275
|
return text
|
|
@@ -259,6 +281,7 @@ def _process_text(
|
|
|
259
281
|
escape_misc: bool,
|
|
260
282
|
escape_asterisks: bool,
|
|
261
283
|
escape_underscores: bool,
|
|
284
|
+
whitespace_handler: WhitespaceHandler,
|
|
262
285
|
) -> str:
|
|
263
286
|
text = str(el) or ""
|
|
264
287
|
|
|
@@ -275,69 +298,9 @@ def _process_text(
|
|
|
275
298
|
if len(ancestor_names) > 10:
|
|
276
299
|
break
|
|
277
300
|
|
|
278
|
-
|
|
279
|
-
if text.strip() == "":
|
|
280
|
-
if "\n" in text:
|
|
281
|
-
text = ""
|
|
282
|
-
else:
|
|
283
|
-
block_elements = {
|
|
284
|
-
"p",
|
|
285
|
-
"ul",
|
|
286
|
-
"ol",
|
|
287
|
-
"div",
|
|
288
|
-
"blockquote",
|
|
289
|
-
"pre",
|
|
290
|
-
"h1",
|
|
291
|
-
"h2",
|
|
292
|
-
"h3",
|
|
293
|
-
"h4",
|
|
294
|
-
"h5",
|
|
295
|
-
"h6",
|
|
296
|
-
"table",
|
|
297
|
-
"dl",
|
|
298
|
-
"hr",
|
|
299
|
-
"figure",
|
|
300
|
-
"article",
|
|
301
|
-
"section",
|
|
302
|
-
"nav",
|
|
303
|
-
"aside",
|
|
304
|
-
"header",
|
|
305
|
-
"footer",
|
|
306
|
-
"main",
|
|
307
|
-
"form",
|
|
308
|
-
"fieldset",
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
prev_sibling = el.previous_sibling
|
|
312
|
-
next_sibling = el.next_sibling
|
|
313
|
-
|
|
314
|
-
if (
|
|
315
|
-
prev_sibling
|
|
316
|
-
and hasattr(prev_sibling, "name")
|
|
317
|
-
and prev_sibling.name in block_elements
|
|
318
|
-
and next_sibling
|
|
319
|
-
and hasattr(next_sibling, "name")
|
|
320
|
-
and next_sibling.name in block_elements
|
|
321
|
-
):
|
|
322
|
-
text = ""
|
|
323
|
-
else:
|
|
324
|
-
text = " " if text else ""
|
|
325
|
-
else:
|
|
326
|
-
has_leading_space = text.startswith((" ", "\t"))
|
|
327
|
-
has_trailing_space = text.endswith((" ", "\t"))
|
|
328
|
-
|
|
329
|
-
middle_content = (
|
|
330
|
-
text[1:-1]
|
|
331
|
-
if has_leading_space and has_trailing_space
|
|
332
|
-
else text[1:]
|
|
333
|
-
if has_leading_space
|
|
334
|
-
else text[:-1]
|
|
335
|
-
if has_trailing_space
|
|
336
|
-
else text
|
|
337
|
-
)
|
|
301
|
+
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
338
302
|
|
|
339
|
-
|
|
340
|
-
text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
|
|
303
|
+
text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
|
|
341
304
|
|
|
342
305
|
if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
|
|
343
306
|
text = escape(
|
|
@@ -357,7 +320,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
357
320
|
|
|
358
321
|
|
|
359
322
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
360
|
-
"""Get set of ancestor tag names for efficient parent checking."""
|
|
361
323
|
elem_id = id(element)
|
|
362
324
|
cache = _ancestor_cache.get()
|
|
363
325
|
if cache is None:
|
|
@@ -388,7 +350,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
388
350
|
|
|
389
351
|
|
|
390
352
|
def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
|
|
391
|
-
"""Check if element has any of the specified ancestors efficiently."""
|
|
392
353
|
if isinstance(tag_names, str):
|
|
393
354
|
tag_names = [tag_names]
|
|
394
355
|
|
|
@@ -409,19 +370,11 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
409
370
|
if value is None:
|
|
410
371
|
return None
|
|
411
372
|
if isinstance(value, str):
|
|
412
|
-
return set(","
|
|
373
|
+
return set(value.split(","))
|
|
413
374
|
return {*chain(*[v.split(",") for v in value])}
|
|
414
375
|
|
|
415
376
|
|
|
416
377
|
def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
417
|
-
"""Extract metadata from HTML document.
|
|
418
|
-
|
|
419
|
-
Args:
|
|
420
|
-
soup: BeautifulSoup instance of the HTML document.
|
|
421
|
-
|
|
422
|
-
Returns:
|
|
423
|
-
Dictionary of metadata key-value pairs.
|
|
424
|
-
"""
|
|
425
378
|
metadata = {}
|
|
426
379
|
|
|
427
380
|
title_tag = soup.find("title")
|
|
@@ -468,14 +421,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
468
421
|
|
|
469
422
|
|
|
470
423
|
def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
471
|
-
"""Format metadata as a Markdown comment block.
|
|
472
|
-
|
|
473
|
-
Args:
|
|
474
|
-
metadata: Dictionary of metadata key-value pairs.
|
|
475
|
-
|
|
476
|
-
Returns:
|
|
477
|
-
Formatted metadata comment block.
|
|
478
|
-
"""
|
|
479
424
|
if not metadata:
|
|
480
425
|
return ""
|
|
481
426
|
|
|
@@ -511,64 +456,87 @@ def convert_to_markdown(
|
|
|
511
456
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
512
457
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
513
458
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
459
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
460
|
+
list_indent_width: int = 4,
|
|
514
461
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
462
|
+
preprocess_html: bool = False,
|
|
463
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
464
|
+
remove_forms: bool = True,
|
|
465
|
+
remove_navigation: bool = True,
|
|
515
466
|
strip: str | Iterable[str] | None = None,
|
|
516
467
|
strip_newlines: bool = False,
|
|
517
468
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
518
469
|
sub_symbol: str = "",
|
|
519
470
|
sup_symbol: str = "",
|
|
471
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
520
472
|
wrap: bool = False,
|
|
521
473
|
wrap_width: int = 80,
|
|
522
|
-
preprocess_html: bool = False,
|
|
523
|
-
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
524
|
-
remove_navigation: bool = True,
|
|
525
|
-
remove_forms: bool = True,
|
|
526
474
|
) -> str:
|
|
527
|
-
"""Convert HTML to Markdown.
|
|
475
|
+
"""Convert HTML content to Markdown format.
|
|
528
476
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
stream_processing: Use streaming processing for large documents. Defaults to False.
|
|
532
|
-
chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
|
|
533
|
-
chunk_callback: Optional callback function called with each processed chunk.
|
|
534
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
535
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
536
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
537
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
538
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
539
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
540
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
541
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
542
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
543
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
544
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
545
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
546
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
547
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
548
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
549
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
550
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
551
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
552
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
553
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
554
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
555
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
556
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
557
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
558
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
559
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
560
|
-
preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
|
|
561
|
-
preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
|
|
562
|
-
remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
|
|
563
|
-
remove_forms: Remove form elements during preprocessing. Defaults to True.
|
|
477
|
+
This is the main entry point for converting HTML to Markdown. It supports
|
|
478
|
+
various customization options for controlling the conversion behavior.
|
|
564
479
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
480
|
+
Args:
|
|
481
|
+
source: HTML string or BeautifulSoup object to convert.
|
|
482
|
+
stream_processing: Enable streaming mode for large documents.
|
|
483
|
+
chunk_size: Size of chunks for streaming processing.
|
|
484
|
+
chunk_callback: Callback for processing chunks in streaming mode.
|
|
485
|
+
progress_callback: Callback for progress updates (current, total).
|
|
486
|
+
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
487
|
+
autolinks: Convert URLs to automatic links.
|
|
488
|
+
bullets: Characters to use for unordered list bullets.
|
|
489
|
+
code_language: Default language for code blocks.
|
|
490
|
+
code_language_callback: Callback to determine code language from element.
|
|
491
|
+
convert: HTML tags to convert to Markdown.
|
|
492
|
+
convert_as_inline: Treat block elements as inline during conversion.
|
|
493
|
+
custom_converters: Custom converters for specific HTML elements.
|
|
494
|
+
default_title: Add a default title if none exists.
|
|
495
|
+
escape_asterisks: Escape asterisk characters in text.
|
|
496
|
+
escape_misc: Escape miscellaneous Markdown characters.
|
|
497
|
+
escape_underscores: Escape underscore characters in text.
|
|
498
|
+
extract_metadata: Extract metadata from HTML head.
|
|
499
|
+
heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
|
|
500
|
+
highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
|
|
501
|
+
keep_inline_images_in: Parent tags where images should remain inline.
|
|
502
|
+
list_indent_type: Type of indentation for lists ('spaces', 'tabs').
|
|
503
|
+
list_indent_width: Number of spaces for list indentation.
|
|
504
|
+
newline_style: Style for newlines ('spaces', 'backslash').
|
|
505
|
+
preprocess_html: Enable HTML preprocessing to clean up content.
|
|
506
|
+
preprocessing_preset: Preprocessing aggressiveness level.
|
|
507
|
+
remove_forms: Remove form elements during preprocessing.
|
|
508
|
+
remove_navigation: Remove navigation elements during preprocessing.
|
|
509
|
+
strip: HTML tags to strip from output.
|
|
510
|
+
strip_newlines: Remove newlines from HTML before processing.
|
|
511
|
+
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
512
|
+
sub_symbol: Symbol for subscript text.
|
|
513
|
+
sup_symbol: Symbol for superscript text.
|
|
514
|
+
whitespace_mode: How to handle whitespace ('normalized', 'strict').
|
|
515
|
+
wrap: Enable text wrapping.
|
|
516
|
+
wrap_width: Column width for text wrapping.
|
|
569
517
|
|
|
570
518
|
Returns:
|
|
571
|
-
|
|
519
|
+
The converted Markdown string.
|
|
520
|
+
|
|
521
|
+
Raises:
|
|
522
|
+
EmptyHtmlError: If the HTML input is empty.
|
|
523
|
+
MissingDependencyError: If required dependencies are not installed.
|
|
524
|
+
ConflictingOptionsError: If conflicting options are provided.
|
|
525
|
+
|
|
526
|
+
Examples:
|
|
527
|
+
Basic conversion:
|
|
528
|
+
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
529
|
+
>>> convert_to_markdown(html)
|
|
530
|
+
'Title\\n=====\\n\\nContent\\n\\n'
|
|
531
|
+
|
|
532
|
+
With custom options:
|
|
533
|
+
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
534
|
+
'# Title\\n\\nContent\\n\\n'
|
|
535
|
+
|
|
536
|
+
Discord-compatible lists (2-space indent):
|
|
537
|
+
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
538
|
+
>>> convert_to_markdown(html, list_indent_width=2)
|
|
539
|
+
'* Item 1\\n* Item 2\\n\\n'
|
|
572
540
|
"""
|
|
573
541
|
if isinstance(source, str):
|
|
574
542
|
if (
|
|
@@ -665,6 +633,7 @@ def convert_to_markdown(
|
|
|
665
633
|
sup_symbol=sup_symbol,
|
|
666
634
|
wrap=wrap,
|
|
667
635
|
wrap_width=wrap_width,
|
|
636
|
+
whitespace_mode=whitespace_mode,
|
|
668
637
|
):
|
|
669
638
|
if chunk_callback:
|
|
670
639
|
chunk_callback(chunk)
|
|
@@ -681,9 +650,12 @@ def convert_to_markdown(
|
|
|
681
650
|
|
|
682
651
|
sink = StringSink()
|
|
683
652
|
|
|
653
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
654
|
+
|
|
684
655
|
_process_html_core(
|
|
685
656
|
source,
|
|
686
657
|
sink,
|
|
658
|
+
whitespace_handler=whitespace_handler,
|
|
687
659
|
parser=parser,
|
|
688
660
|
autolinks=autolinks,
|
|
689
661
|
bullets=bullets,
|
|
@@ -700,6 +672,8 @@ def convert_to_markdown(
|
|
|
700
672
|
heading_style=heading_style,
|
|
701
673
|
highlight_style=highlight_style,
|
|
702
674
|
keep_inline_images_in=keep_inline_images_in,
|
|
675
|
+
list_indent_type=list_indent_type,
|
|
676
|
+
list_indent_width=list_indent_width,
|
|
703
677
|
newline_style=newline_style,
|
|
704
678
|
strip=strip,
|
|
705
679
|
strip_newlines=strip_newlines,
|
|
@@ -761,34 +735,25 @@ def convert_to_markdown(
|
|
|
761
735
|
|
|
762
736
|
|
|
763
737
|
class OutputSink:
|
|
764
|
-
"""Abstract output sink for processed markdown text."""
|
|
765
|
-
|
|
766
738
|
def write(self, text: str) -> None:
|
|
767
|
-
"""Write text to the sink."""
|
|
768
739
|
raise NotImplementedError
|
|
769
740
|
|
|
770
741
|
def finalize(self) -> None:
|
|
771
|
-
|
|
742
|
+
pass
|
|
772
743
|
|
|
773
744
|
|
|
774
745
|
class StringSink(OutputSink):
|
|
775
|
-
"""Collects all output into a single string."""
|
|
776
|
-
|
|
777
746
|
def __init__(self) -> None:
|
|
778
747
|
self.buffer = StringIO()
|
|
779
748
|
|
|
780
749
|
def write(self, text: str) -> None:
|
|
781
|
-
"""Write text to the buffer."""
|
|
782
750
|
self.buffer.write(text)
|
|
783
751
|
|
|
784
752
|
def get_result(self) -> str:
|
|
785
|
-
"""Get the complete result string."""
|
|
786
753
|
return self.buffer.getvalue()
|
|
787
754
|
|
|
788
755
|
|
|
789
756
|
class StreamingSink(OutputSink):
|
|
790
|
-
"""Yields chunks of output for streaming processing."""
|
|
791
|
-
|
|
792
757
|
def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
|
|
793
758
|
self.chunk_size = chunk_size
|
|
794
759
|
self.progress_callback = progress_callback
|
|
@@ -799,7 +764,6 @@ class StreamingSink(OutputSink):
|
|
|
799
764
|
self.chunks: list[str] = []
|
|
800
765
|
|
|
801
766
|
def write(self, text: str) -> None:
|
|
802
|
-
"""Write text and yield chunks when threshold is reached."""
|
|
803
767
|
if not text:
|
|
804
768
|
return
|
|
805
769
|
|
|
@@ -822,7 +786,6 @@ class StreamingSink(OutputSink):
|
|
|
822
786
|
self.buffer_size = len(current_content)
|
|
823
787
|
|
|
824
788
|
def finalize(self) -> None:
|
|
825
|
-
"""Finalize and yield any remaining content."""
|
|
826
789
|
if self.buffer_size > 0:
|
|
827
790
|
content = self.buffer.getvalue()
|
|
828
791
|
self.chunks.append(content)
|
|
@@ -830,11 +793,9 @@ class StreamingSink(OutputSink):
|
|
|
830
793
|
self._update_progress()
|
|
831
794
|
|
|
832
795
|
def get_chunks(self) -> Generator[str, None, None]:
|
|
833
|
-
"""Get all chunks yielded during processing."""
|
|
834
796
|
yield from self.chunks
|
|
835
797
|
|
|
836
798
|
def _find_split_position(self, content: str) -> int:
|
|
837
|
-
"""Find optimal position to split content for chunks."""
|
|
838
799
|
target = self.chunk_size
|
|
839
800
|
lookahead = min(100, len(content) - target)
|
|
840
801
|
|
|
@@ -847,7 +808,6 @@ class StreamingSink(OutputSink):
|
|
|
847
808
|
return min(target, len(content))
|
|
848
809
|
|
|
849
810
|
def _update_progress(self) -> None:
|
|
850
|
-
"""Update progress if callback is provided."""
|
|
851
811
|
if self.progress_callback:
|
|
852
812
|
self.progress_callback(self.processed_bytes, self.total_bytes)
|
|
853
813
|
|
|
@@ -856,6 +816,7 @@ def _process_html_core(
|
|
|
856
816
|
source: str | BeautifulSoup,
|
|
857
817
|
sink: OutputSink,
|
|
858
818
|
*,
|
|
819
|
+
whitespace_handler: WhitespaceHandler,
|
|
859
820
|
parser: str | None = None,
|
|
860
821
|
autolinks: bool,
|
|
861
822
|
bullets: str,
|
|
@@ -872,6 +833,8 @@ def _process_html_core(
|
|
|
872
833
|
heading_style: Literal["underlined", "atx", "atx_closed"],
|
|
873
834
|
highlight_style: Literal["double-equal", "html", "bold"],
|
|
874
835
|
keep_inline_images_in: Iterable[str] | None,
|
|
836
|
+
list_indent_type: str,
|
|
837
|
+
list_indent_width: int,
|
|
875
838
|
newline_style: Literal["spaces", "backslash"],
|
|
876
839
|
strip: str | Iterable[str] | None,
|
|
877
840
|
strip_newlines: bool,
|
|
@@ -881,20 +844,10 @@ def _process_html_core(
|
|
|
881
844
|
wrap: bool,
|
|
882
845
|
wrap_width: int,
|
|
883
846
|
) -> None:
|
|
884
|
-
"""Core HTML to Markdown processing logic shared by both regular and streaming."""
|
|
885
847
|
token = _ancestor_cache.set({})
|
|
886
848
|
|
|
887
849
|
try:
|
|
888
850
|
if isinstance(source, str):
|
|
889
|
-
if (
|
|
890
|
-
heading_style == UNDERLINED
|
|
891
|
-
and "Header" in source
|
|
892
|
-
and "\n------\n\n" in source
|
|
893
|
-
and "Next paragraph" in source
|
|
894
|
-
):
|
|
895
|
-
sink.write(source)
|
|
896
|
-
return
|
|
897
|
-
|
|
898
851
|
if strip_newlines:
|
|
899
852
|
source = source.replace("\n", " ").replace("\r", " ")
|
|
900
853
|
|
|
@@ -921,6 +874,8 @@ def _process_html_core(
|
|
|
921
874
|
heading_style=heading_style,
|
|
922
875
|
highlight_style=highlight_style,
|
|
923
876
|
keep_inline_images_in=keep_inline_images_in,
|
|
877
|
+
list_indent_type=list_indent_type,
|
|
878
|
+
list_indent_width=list_indent_width,
|
|
924
879
|
newline_style=newline_style,
|
|
925
880
|
strong_em_symbol=strong_em_symbol,
|
|
926
881
|
sub_symbol=sub_symbol,
|
|
@@ -948,6 +903,7 @@ def _process_html_core(
|
|
|
948
903
|
escape_misc=escape_misc,
|
|
949
904
|
escape_asterisks=escape_asterisks,
|
|
950
905
|
escape_underscores=escape_underscores,
|
|
906
|
+
whitespace_handler=whitespace_handler,
|
|
951
907
|
)
|
|
952
908
|
sink.write(text)
|
|
953
909
|
context += text
|
|
@@ -961,6 +917,7 @@ def _process_html_core(
|
|
|
961
917
|
escape_misc=escape_misc,
|
|
962
918
|
escape_underscores=escape_underscores,
|
|
963
919
|
strip=_as_optional_set(strip),
|
|
920
|
+
whitespace_handler=whitespace_handler,
|
|
964
921
|
context_before=context[-2:],
|
|
965
922
|
)
|
|
966
923
|
sink.write(text)
|
|
@@ -992,54 +949,18 @@ def convert_to_markdown_stream(
|
|
|
992
949
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
993
950
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
994
951
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
952
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
953
|
+
list_indent_width: int = 4,
|
|
995
954
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
996
955
|
strip: str | Iterable[str] | None = None,
|
|
997
956
|
strip_newlines: bool = False,
|
|
998
957
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
999
958
|
sub_symbol: str = "",
|
|
1000
959
|
sup_symbol: str = "",
|
|
960
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
1001
961
|
wrap: bool = False,
|
|
1002
962
|
wrap_width: int = 80,
|
|
1003
963
|
) -> Generator[str, None, None]:
|
|
1004
|
-
"""Convert HTML to Markdown using streaming/chunked processing.
|
|
1005
|
-
|
|
1006
|
-
This function yields chunks of converted Markdown text, allowing for
|
|
1007
|
-
memory-efficient processing of large HTML documents. The output is guaranteed
|
|
1008
|
-
to be identical to convert_to_markdown().
|
|
1009
|
-
|
|
1010
|
-
Args:
|
|
1011
|
-
source: An HTML document or a an initialized instance of BeautifulSoup.
|
|
1012
|
-
chunk_size: Size of chunks to yield (approximate, in characters).
|
|
1013
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
1014
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
1015
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
1016
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
1017
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
1018
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
1019
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
1020
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
1021
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
1022
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
1023
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
1024
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
1025
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
1026
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
1027
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
1028
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
1029
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
1030
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
1031
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
1032
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
1033
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
1034
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
1035
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
1036
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
1037
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
1038
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
1039
|
-
|
|
1040
|
-
Yields:
|
|
1041
|
-
str: Chunks of Markdown-formatted text.
|
|
1042
|
-
"""
|
|
1043
964
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
1044
965
|
|
|
1045
966
|
if isinstance(source, str):
|
|
@@ -1047,9 +968,12 @@ def convert_to_markdown_stream(
|
|
|
1047
968
|
elif isinstance(source, BeautifulSoup):
|
|
1048
969
|
sink.total_bytes = len(str(source))
|
|
1049
970
|
|
|
971
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
972
|
+
|
|
1050
973
|
_process_html_core(
|
|
1051
974
|
source,
|
|
1052
975
|
sink,
|
|
976
|
+
whitespace_handler=whitespace_handler,
|
|
1053
977
|
parser=parser,
|
|
1054
978
|
autolinks=autolinks,
|
|
1055
979
|
bullets=bullets,
|
|
@@ -1066,6 +990,8 @@ def convert_to_markdown_stream(
|
|
|
1066
990
|
heading_style=heading_style,
|
|
1067
991
|
highlight_style=highlight_style,
|
|
1068
992
|
keep_inline_images_in=keep_inline_images_in,
|
|
993
|
+
list_indent_type=list_indent_type,
|
|
994
|
+
list_indent_width=list_indent_width,
|
|
1069
995
|
newline_style=newline_style,
|
|
1070
996
|
strip=strip,
|
|
1071
997
|
strip_newlines=strip_newlines,
|
html_to_markdown/utils.py
CHANGED
|
@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def chomp(text: str) -> tuple[str, str, str]:
|
|
9
|
-
"""Simplified whitespace handling for inline elements.
|
|
10
|
-
|
|
11
|
-
For semantic markdown output, preserves leading/trailing spaces as single spaces
|
|
12
|
-
and normalizes internal whitespace.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
text: The text to chomp.
|
|
16
|
-
|
|
17
|
-
Returns:
|
|
18
|
-
A tuple containing the prefix, suffix, and the normalized text.
|
|
19
|
-
"""
|
|
20
9
|
if not text:
|
|
21
10
|
return "", "", ""
|
|
22
11
|
|
|
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
29
18
|
|
|
30
19
|
|
|
31
20
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
|
32
|
-
"""Escape special characters in text.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
text: The text to escape.
|
|
36
|
-
escape_misc: Whether to escape miscellaneous characters.
|
|
37
|
-
escape_asterisks: Whether to escape asterisks.
|
|
38
|
-
escape_underscores: Whether to escape underscores.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
The escaped text.
|
|
42
|
-
"""
|
|
43
21
|
if not text:
|
|
44
22
|
return ""
|
|
45
23
|
if escape_misc:
|
|
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
|
|
|
52
30
|
return text
|
|
53
31
|
|
|
54
32
|
|
|
55
|
-
def indent(*, text: str, level: int) -> str:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
text: The text to indent.
|
|
60
|
-
level: The level of indentation.
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
The indented text.
|
|
64
|
-
"""
|
|
65
|
-
return line_beginning_re.sub("\t" * level, text) if text else ""
|
|
33
|
+
def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
|
|
34
|
+
return line_beginning_re.sub(indent_str * level, text) if text else ""
|
|
66
35
|
|
|
67
36
|
|
|
68
37
|
def underline(*, text: str, pad_char: str) -> str:
|
|
69
|
-
"""Underline text with a given character.
|
|
70
|
-
|
|
71
|
-
Args:
|
|
72
|
-
text: The text to underline.
|
|
73
|
-
pad_char: The character to use for underlining.
|
|
74
|
-
|
|
75
|
-
Returns:
|
|
76
|
-
The underlined text.
|
|
77
|
-
"""
|
|
78
38
|
text = (text or "").rstrip()
|
|
79
39
|
return f"{text}\n{pad_char * len(text)}\n\n" if text else ""
|