html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +52 -573
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -202
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.0.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
|
@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
|
|
|
33
33
|
DOUBLE_EQUAL,
|
|
34
34
|
SPACES,
|
|
35
35
|
UNDERLINED,
|
|
36
|
+
WHITESPACE_NORMALIZED,
|
|
36
37
|
html_heading_re,
|
|
37
|
-
whitespace_re,
|
|
38
38
|
)
|
|
39
39
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
40
40
|
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
41
41
|
from html_to_markdown.utils import escape
|
|
42
|
+
from html_to_markdown.whitespace import WhitespaceHandler
|
|
42
43
|
|
|
43
44
|
if TYPE_CHECKING:
|
|
44
45
|
from collections.abc import Iterable
|
|
@@ -143,6 +144,12 @@ SupportedTag = Literal[
|
|
|
143
144
|
]
|
|
144
145
|
|
|
145
146
|
|
|
147
|
+
def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
|
|
148
|
+
if list_indent_type == "tabs":
|
|
149
|
+
return "\t"
|
|
150
|
+
return " " * list_indent_width
|
|
151
|
+
|
|
152
|
+
|
|
146
153
|
def _is_nested_tag(el: PageElement) -> bool:
|
|
147
154
|
return isinstance(el, Tag) and el.name in {
|
|
148
155
|
"ol",
|
|
@@ -170,6 +177,7 @@ def _process_tag(
|
|
|
170
177
|
escape_misc: bool,
|
|
171
178
|
escape_underscores: bool,
|
|
172
179
|
strip: set[str] | None,
|
|
180
|
+
whitespace_handler: WhitespaceHandler,
|
|
173
181
|
context_before: str = "",
|
|
174
182
|
) -> str:
|
|
175
183
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
@@ -195,18 +203,14 @@ def _process_tag(
|
|
|
195
203
|
|
|
196
204
|
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
|
|
197
205
|
|
|
198
|
-
# List of tags that return empty string when they have no content
|
|
199
206
|
empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
|
|
200
207
|
|
|
201
208
|
for i, el in enumerate(children):
|
|
202
209
|
if isinstance(el, NavigableString):
|
|
203
|
-
# Check if this is whitespace between empty elements
|
|
204
210
|
if el.strip() == "" and i > 0 and i < len(children) - 1:
|
|
205
211
|
prev_el = children[i - 1]
|
|
206
212
|
next_el = children[i + 1]
|
|
207
213
|
|
|
208
|
-
# If previous element was a tag that produced empty output
|
|
209
|
-
# and next element is also a tag that could be empty, skip this whitespace
|
|
210
214
|
if (
|
|
211
215
|
isinstance(prev_el, Tag)
|
|
212
216
|
and isinstance(next_el, Tag)
|
|
@@ -214,7 +218,6 @@ def _process_tag(
|
|
|
214
218
|
and next_el.name.lower() in empty_when_no_content_tags
|
|
215
219
|
and not prev_el.get_text().strip()
|
|
216
220
|
):
|
|
217
|
-
# Previous tag is empty and next could be empty too, skip this whitespace
|
|
218
221
|
continue
|
|
219
222
|
|
|
220
223
|
text_parts.append(
|
|
@@ -223,6 +226,7 @@ def _process_tag(
|
|
|
223
226
|
escape_misc=escape_misc,
|
|
224
227
|
escape_asterisks=escape_asterisks,
|
|
225
228
|
escape_underscores=escape_underscores,
|
|
229
|
+
whitespace_handler=whitespace_handler,
|
|
226
230
|
)
|
|
227
231
|
)
|
|
228
232
|
elif isinstance(el, Tag):
|
|
@@ -237,6 +241,7 @@ def _process_tag(
|
|
|
237
241
|
escape_misc=escape_misc,
|
|
238
242
|
escape_underscores=escape_underscores,
|
|
239
243
|
strip=strip,
|
|
244
|
+
whitespace_handler=whitespace_handler,
|
|
240
245
|
context_before=(context_before + current_text)[-2:],
|
|
241
246
|
)
|
|
242
247
|
)
|
|
@@ -264,6 +269,7 @@ def _process_text(
|
|
|
264
269
|
escape_misc: bool,
|
|
265
270
|
escape_asterisks: bool,
|
|
266
271
|
escape_underscores: bool,
|
|
272
|
+
whitespace_handler: WhitespaceHandler,
|
|
267
273
|
) -> str:
|
|
268
274
|
text = str(el) or ""
|
|
269
275
|
|
|
@@ -280,76 +286,9 @@ def _process_text(
|
|
|
280
286
|
if len(ancestor_names) > 10:
|
|
281
287
|
break
|
|
282
288
|
|
|
283
|
-
|
|
284
|
-
# Special case: if the text is only whitespace
|
|
285
|
-
if text.strip() == "":
|
|
286
|
-
# If it contains newlines, it's probably indentation whitespace, return empty
|
|
287
|
-
if "\n" in text:
|
|
288
|
-
text = ""
|
|
289
|
-
else:
|
|
290
|
-
# Check if this whitespace is between block elements
|
|
291
|
-
# Define block elements that should not have whitespace between them
|
|
292
|
-
block_elements = {
|
|
293
|
-
"p",
|
|
294
|
-
"ul",
|
|
295
|
-
"ol",
|
|
296
|
-
"div",
|
|
297
|
-
"blockquote",
|
|
298
|
-
"pre",
|
|
299
|
-
"h1",
|
|
300
|
-
"h2",
|
|
301
|
-
"h3",
|
|
302
|
-
"h4",
|
|
303
|
-
"h5",
|
|
304
|
-
"h6",
|
|
305
|
-
"table",
|
|
306
|
-
"dl",
|
|
307
|
-
"hr",
|
|
308
|
-
"figure",
|
|
309
|
-
"article",
|
|
310
|
-
"section",
|
|
311
|
-
"nav",
|
|
312
|
-
"aside",
|
|
313
|
-
"header",
|
|
314
|
-
"footer",
|
|
315
|
-
"main",
|
|
316
|
-
"form",
|
|
317
|
-
"fieldset",
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
prev_sibling = el.previous_sibling
|
|
321
|
-
next_sibling = el.next_sibling
|
|
322
|
-
|
|
323
|
-
# Check if whitespace is between block elements
|
|
324
|
-
if (
|
|
325
|
-
prev_sibling
|
|
326
|
-
and hasattr(prev_sibling, "name")
|
|
327
|
-
and prev_sibling.name in block_elements
|
|
328
|
-
and next_sibling
|
|
329
|
-
and hasattr(next_sibling, "name")
|
|
330
|
-
and next_sibling.name in block_elements
|
|
331
|
-
):
|
|
332
|
-
# Remove whitespace between block elements
|
|
333
|
-
text = ""
|
|
334
|
-
else:
|
|
335
|
-
# Otherwise it's inline whitespace, normalize to single space
|
|
336
|
-
text = " " if text else ""
|
|
337
|
-
else:
|
|
338
|
-
has_leading_space = text.startswith((" ", "\t"))
|
|
339
|
-
has_trailing_space = text.endswith((" ", "\t"))
|
|
340
|
-
|
|
341
|
-
middle_content = (
|
|
342
|
-
text[1:-1]
|
|
343
|
-
if has_leading_space and has_trailing_space
|
|
344
|
-
else text[1:]
|
|
345
|
-
if has_leading_space
|
|
346
|
-
else text[:-1]
|
|
347
|
-
if has_trailing_space
|
|
348
|
-
else text
|
|
349
|
-
)
|
|
289
|
+
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
350
290
|
|
|
351
|
-
|
|
352
|
-
text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
|
|
291
|
+
text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
|
|
353
292
|
|
|
354
293
|
if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
|
|
355
294
|
text = escape(
|
|
@@ -369,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
369
308
|
|
|
370
309
|
|
|
371
310
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
372
|
-
"""Get set of ancestor tag names for efficient parent checking."""
|
|
373
311
|
elem_id = id(element)
|
|
374
312
|
cache = _ancestor_cache.get()
|
|
375
313
|
if cache is None:
|
|
@@ -400,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
400
338
|
|
|
401
339
|
|
|
402
340
|
def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
|
|
403
|
-
"""Check if element has any of the specified ancestors efficiently."""
|
|
404
341
|
if isinstance(tag_names, str):
|
|
405
342
|
tag_names = [tag_names]
|
|
406
343
|
|
|
@@ -426,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
426
363
|
|
|
427
364
|
|
|
428
365
|
def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
429
|
-
"""Extract metadata from HTML document.
|
|
430
|
-
|
|
431
|
-
Args:
|
|
432
|
-
soup: BeautifulSoup instance of the HTML document.
|
|
433
|
-
|
|
434
|
-
Returns:
|
|
435
|
-
Dictionary of metadata key-value pairs.
|
|
436
|
-
"""
|
|
437
366
|
metadata = {}
|
|
438
367
|
|
|
439
368
|
title_tag = soup.find("title")
|
|
@@ -470,7 +399,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
470
399
|
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
471
400
|
metadata["canonical"] = canonical["href"]
|
|
472
401
|
|
|
473
|
-
# Extract link relations
|
|
474
402
|
link_relations = {"author", "license", "alternate"}
|
|
475
403
|
for rel_type in link_relations:
|
|
476
404
|
link = soup.find("link", rel=rel_type, href=True)
|
|
@@ -481,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
481
409
|
|
|
482
410
|
|
|
483
411
|
def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
484
|
-
"""Format metadata as a Markdown comment block.
|
|
485
|
-
|
|
486
|
-
Args:
|
|
487
|
-
metadata: Dictionary of metadata key-value pairs.
|
|
488
|
-
|
|
489
|
-
Returns:
|
|
490
|
-
Formatted metadata comment block.
|
|
491
|
-
"""
|
|
492
412
|
if not metadata:
|
|
493
413
|
return ""
|
|
494
414
|
|
|
@@ -524,64 +444,87 @@ def convert_to_markdown(
|
|
|
524
444
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
525
445
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
526
446
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
447
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
448
|
+
list_indent_width: int = 4,
|
|
527
449
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
450
|
+
preprocess_html: bool = False,
|
|
451
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
452
|
+
remove_forms: bool = True,
|
|
453
|
+
remove_navigation: bool = True,
|
|
528
454
|
strip: str | Iterable[str] | None = None,
|
|
529
455
|
strip_newlines: bool = False,
|
|
530
456
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
531
457
|
sub_symbol: str = "",
|
|
532
458
|
sup_symbol: str = "",
|
|
459
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
533
460
|
wrap: bool = False,
|
|
534
461
|
wrap_width: int = 80,
|
|
535
|
-
preprocess_html: bool = False,
|
|
536
|
-
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
537
|
-
remove_navigation: bool = True,
|
|
538
|
-
remove_forms: bool = True,
|
|
539
462
|
) -> str:
|
|
540
|
-
"""Convert HTML to Markdown.
|
|
463
|
+
"""Convert HTML content to Markdown format.
|
|
541
464
|
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
stream_processing: Use streaming processing for large documents. Defaults to False.
|
|
545
|
-
chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
|
|
546
|
-
chunk_callback: Optional callback function called with each processed chunk.
|
|
547
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
548
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
549
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
550
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
551
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
552
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
553
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
554
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
555
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
556
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
557
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
558
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
559
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
560
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
561
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
562
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
563
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
564
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
565
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
566
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
567
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
568
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
569
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
570
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
571
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
572
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
573
|
-
preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
|
|
574
|
-
preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
|
|
575
|
-
remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
|
|
576
|
-
remove_forms: Remove form elements during preprocessing. Defaults to True.
|
|
465
|
+
This is the main entry point for converting HTML to Markdown. It supports
|
|
466
|
+
various customization options for controlling the conversion behavior.
|
|
577
467
|
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
468
|
+
Args:
|
|
469
|
+
source: HTML string or BeautifulSoup object to convert.
|
|
470
|
+
stream_processing: Enable streaming mode for large documents.
|
|
471
|
+
chunk_size: Size of chunks for streaming processing.
|
|
472
|
+
chunk_callback: Callback for processing chunks in streaming mode.
|
|
473
|
+
progress_callback: Callback for progress updates (current, total).
|
|
474
|
+
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
475
|
+
autolinks: Convert URLs to automatic links.
|
|
476
|
+
bullets: Characters to use for unordered list bullets.
|
|
477
|
+
code_language: Default language for code blocks.
|
|
478
|
+
code_language_callback: Callback to determine code language from element.
|
|
479
|
+
convert: HTML tags to convert to Markdown.
|
|
480
|
+
convert_as_inline: Treat block elements as inline during conversion.
|
|
481
|
+
custom_converters: Custom converters for specific HTML elements.
|
|
482
|
+
default_title: Add a default title if none exists.
|
|
483
|
+
escape_asterisks: Escape asterisk characters in text.
|
|
484
|
+
escape_misc: Escape miscellaneous Markdown characters.
|
|
485
|
+
escape_underscores: Escape underscore characters in text.
|
|
486
|
+
extract_metadata: Extract metadata from HTML head.
|
|
487
|
+
heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
|
|
488
|
+
highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
|
|
489
|
+
keep_inline_images_in: Parent tags where images should remain inline.
|
|
490
|
+
list_indent_type: Type of indentation for lists ('spaces', 'tabs').
|
|
491
|
+
list_indent_width: Number of spaces for list indentation.
|
|
492
|
+
newline_style: Style for newlines ('spaces', 'backslash').
|
|
493
|
+
preprocess_html: Enable HTML preprocessing to clean up content.
|
|
494
|
+
preprocessing_preset: Preprocessing aggressiveness level.
|
|
495
|
+
remove_forms: Remove form elements during preprocessing.
|
|
496
|
+
remove_navigation: Remove navigation elements during preprocessing.
|
|
497
|
+
strip: HTML tags to strip from output.
|
|
498
|
+
strip_newlines: Remove newlines from HTML before processing.
|
|
499
|
+
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
500
|
+
sub_symbol: Symbol for subscript text.
|
|
501
|
+
sup_symbol: Symbol for superscript text.
|
|
502
|
+
whitespace_mode: How to handle whitespace ('normalized', 'strict').
|
|
503
|
+
wrap: Enable text wrapping.
|
|
504
|
+
wrap_width: Column width for text wrapping.
|
|
582
505
|
|
|
583
506
|
Returns:
|
|
584
|
-
|
|
507
|
+
The converted Markdown string.
|
|
508
|
+
|
|
509
|
+
Raises:
|
|
510
|
+
EmptyHtmlError: If the HTML input is empty.
|
|
511
|
+
MissingDependencyError: If required dependencies are not installed.
|
|
512
|
+
ConflictingOptionsError: If conflicting options are provided.
|
|
513
|
+
|
|
514
|
+
Examples:
|
|
515
|
+
Basic conversion:
|
|
516
|
+
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
517
|
+
>>> convert_to_markdown(html)
|
|
518
|
+
'Title\\n=====\\n\\nContent\\n\\n'
|
|
519
|
+
|
|
520
|
+
With custom options:
|
|
521
|
+
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
522
|
+
'# Title\\n\\nContent\\n\\n'
|
|
523
|
+
|
|
524
|
+
Discord-compatible lists (2-space indent):
|
|
525
|
+
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
526
|
+
>>> convert_to_markdown(html, list_indent_width=2)
|
|
527
|
+
'* Item 1\\n* Item 2\\n\\n'
|
|
585
528
|
"""
|
|
586
529
|
if isinstance(source, str):
|
|
587
530
|
if (
|
|
@@ -595,8 +538,6 @@ def convert_to_markdown(
|
|
|
595
538
|
if strip_newlines:
|
|
596
539
|
source = source.replace("\n", " ").replace("\r", " ")
|
|
597
540
|
|
|
598
|
-
# Fix lxml parsing of void elements like <wbr>
|
|
599
|
-
# lxml incorrectly treats them as container tags
|
|
600
541
|
source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
|
|
601
542
|
|
|
602
543
|
if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
|
|
@@ -680,6 +621,7 @@ def convert_to_markdown(
|
|
|
680
621
|
sup_symbol=sup_symbol,
|
|
681
622
|
wrap=wrap,
|
|
682
623
|
wrap_width=wrap_width,
|
|
624
|
+
whitespace_mode=whitespace_mode,
|
|
683
625
|
):
|
|
684
626
|
if chunk_callback:
|
|
685
627
|
chunk_callback(chunk)
|
|
@@ -696,9 +638,12 @@ def convert_to_markdown(
|
|
|
696
638
|
|
|
697
639
|
sink = StringSink()
|
|
698
640
|
|
|
641
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
642
|
+
|
|
699
643
|
_process_html_core(
|
|
700
644
|
source,
|
|
701
645
|
sink,
|
|
646
|
+
whitespace_handler=whitespace_handler,
|
|
702
647
|
parser=parser,
|
|
703
648
|
autolinks=autolinks,
|
|
704
649
|
bullets=bullets,
|
|
@@ -715,6 +660,8 @@ def convert_to_markdown(
|
|
|
715
660
|
heading_style=heading_style,
|
|
716
661
|
highlight_style=highlight_style,
|
|
717
662
|
keep_inline_images_in=keep_inline_images_in,
|
|
663
|
+
list_indent_type=list_indent_type,
|
|
664
|
+
list_indent_width=list_indent_width,
|
|
718
665
|
newline_style=newline_style,
|
|
719
666
|
strip=strip,
|
|
720
667
|
strip_newlines=strip_newlines,
|
|
@@ -737,7 +684,6 @@ def convert_to_markdown(
|
|
|
737
684
|
if leading_whitespace_match:
|
|
738
685
|
leading_whitespace = leading_whitespace_match.group(0)
|
|
739
686
|
|
|
740
|
-
# Check if input contains list or heading tags
|
|
741
687
|
list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
|
|
742
688
|
if any(tag in original_input for tag in list_heading_tags):
|
|
743
689
|
leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
|
|
@@ -751,19 +697,14 @@ def convert_to_markdown(
|
|
|
751
697
|
def normalize_spaces_outside_code(text: str) -> str:
|
|
752
698
|
parts = text.split("```")
|
|
753
699
|
for i in range(0, len(parts), 2):
|
|
754
|
-
# Process each line separately to preserve leading spaces
|
|
755
700
|
lines = parts[i].split("\n")
|
|
756
701
|
processed_lines = []
|
|
757
702
|
for line in lines:
|
|
758
|
-
# Preserve definition list formatting (: followed by 3 spaces)
|
|
759
703
|
def_parts = re.split(r"(:\s{3})", line)
|
|
760
704
|
for j in range(0, len(def_parts), 2):
|
|
761
|
-
# Only normalize non-definition-list parts
|
|
762
|
-
# Also preserve leading spaces (for list indentation)
|
|
763
705
|
match = re.match(r"^(\s*)(.*)", def_parts[j])
|
|
764
706
|
if match:
|
|
765
707
|
leading_spaces, rest = match.groups()
|
|
766
|
-
# Only normalize multiple spaces that are not at the beginning
|
|
767
708
|
rest = re.sub(r" {3,}", " ", rest)
|
|
768
709
|
def_parts[j] = leading_spaces + rest
|
|
769
710
|
processed_lines.append("".join(def_parts))
|
|
@@ -782,34 +723,25 @@ def convert_to_markdown(
|
|
|
782
723
|
|
|
783
724
|
|
|
784
725
|
class OutputSink:
|
|
785
|
-
"""Abstract output sink for processed markdown text."""
|
|
786
|
-
|
|
787
726
|
def write(self, text: str) -> None:
|
|
788
|
-
"""Write text to the sink."""
|
|
789
727
|
raise NotImplementedError
|
|
790
728
|
|
|
791
729
|
def finalize(self) -> None:
|
|
792
|
-
|
|
730
|
+
pass
|
|
793
731
|
|
|
794
732
|
|
|
795
733
|
class StringSink(OutputSink):
|
|
796
|
-
"""Collects all output into a single string."""
|
|
797
|
-
|
|
798
734
|
def __init__(self) -> None:
|
|
799
735
|
self.buffer = StringIO()
|
|
800
736
|
|
|
801
737
|
def write(self, text: str) -> None:
|
|
802
|
-
"""Write text to the buffer."""
|
|
803
738
|
self.buffer.write(text)
|
|
804
739
|
|
|
805
740
|
def get_result(self) -> str:
|
|
806
|
-
"""Get the complete result string."""
|
|
807
741
|
return self.buffer.getvalue()
|
|
808
742
|
|
|
809
743
|
|
|
810
744
|
class StreamingSink(OutputSink):
|
|
811
|
-
"""Yields chunks of output for streaming processing."""
|
|
812
|
-
|
|
813
745
|
def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
|
|
814
746
|
self.chunk_size = chunk_size
|
|
815
747
|
self.progress_callback = progress_callback
|
|
@@ -820,7 +752,6 @@ class StreamingSink(OutputSink):
|
|
|
820
752
|
self.chunks: list[str] = []
|
|
821
753
|
|
|
822
754
|
def write(self, text: str) -> None:
|
|
823
|
-
"""Write text and yield chunks when threshold is reached."""
|
|
824
755
|
if not text:
|
|
825
756
|
return
|
|
826
757
|
|
|
@@ -843,7 +774,6 @@ class StreamingSink(OutputSink):
|
|
|
843
774
|
self.buffer_size = len(current_content)
|
|
844
775
|
|
|
845
776
|
def finalize(self) -> None:
|
|
846
|
-
"""Finalize and yield any remaining content."""
|
|
847
777
|
if self.buffer_size > 0:
|
|
848
778
|
content = self.buffer.getvalue()
|
|
849
779
|
self.chunks.append(content)
|
|
@@ -851,11 +781,9 @@ class StreamingSink(OutputSink):
|
|
|
851
781
|
self._update_progress()
|
|
852
782
|
|
|
853
783
|
def get_chunks(self) -> Generator[str, None, None]:
|
|
854
|
-
"""Get all chunks yielded during processing."""
|
|
855
784
|
yield from self.chunks
|
|
856
785
|
|
|
857
786
|
def _find_split_position(self, content: str) -> int:
|
|
858
|
-
"""Find optimal position to split content for chunks."""
|
|
859
787
|
target = self.chunk_size
|
|
860
788
|
lookahead = min(100, len(content) - target)
|
|
861
789
|
|
|
@@ -868,7 +796,6 @@ class StreamingSink(OutputSink):
|
|
|
868
796
|
return min(target, len(content))
|
|
869
797
|
|
|
870
798
|
def _update_progress(self) -> None:
|
|
871
|
-
"""Update progress if callback is provided."""
|
|
872
799
|
if self.progress_callback:
|
|
873
800
|
self.progress_callback(self.processed_bytes, self.total_bytes)
|
|
874
801
|
|
|
@@ -877,6 +804,7 @@ def _process_html_core(
|
|
|
877
804
|
source: str | BeautifulSoup,
|
|
878
805
|
sink: OutputSink,
|
|
879
806
|
*,
|
|
807
|
+
whitespace_handler: WhitespaceHandler,
|
|
880
808
|
parser: str | None = None,
|
|
881
809
|
autolinks: bool,
|
|
882
810
|
bullets: str,
|
|
@@ -893,6 +821,8 @@ def _process_html_core(
|
|
|
893
821
|
heading_style: Literal["underlined", "atx", "atx_closed"],
|
|
894
822
|
highlight_style: Literal["double-equal", "html", "bold"],
|
|
895
823
|
keep_inline_images_in: Iterable[str] | None,
|
|
824
|
+
list_indent_type: str,
|
|
825
|
+
list_indent_width: int,
|
|
896
826
|
newline_style: Literal["spaces", "backslash"],
|
|
897
827
|
strip: str | Iterable[str] | None,
|
|
898
828
|
strip_newlines: bool,
|
|
@@ -902,7 +832,6 @@ def _process_html_core(
|
|
|
902
832
|
wrap: bool,
|
|
903
833
|
wrap_width: int,
|
|
904
834
|
) -> None:
|
|
905
|
-
"""Core HTML to Markdown processing logic shared by both regular and streaming."""
|
|
906
835
|
token = _ancestor_cache.set({})
|
|
907
836
|
|
|
908
837
|
try:
|
|
@@ -942,6 +871,8 @@ def _process_html_core(
|
|
|
942
871
|
heading_style=heading_style,
|
|
943
872
|
highlight_style=highlight_style,
|
|
944
873
|
keep_inline_images_in=keep_inline_images_in,
|
|
874
|
+
list_indent_type=list_indent_type,
|
|
875
|
+
list_indent_width=list_indent_width,
|
|
945
876
|
newline_style=newline_style,
|
|
946
877
|
strong_em_symbol=strong_em_symbol,
|
|
947
878
|
sub_symbol=sub_symbol,
|
|
@@ -969,6 +900,7 @@ def _process_html_core(
|
|
|
969
900
|
escape_misc=escape_misc,
|
|
970
901
|
escape_asterisks=escape_asterisks,
|
|
971
902
|
escape_underscores=escape_underscores,
|
|
903
|
+
whitespace_handler=whitespace_handler,
|
|
972
904
|
)
|
|
973
905
|
sink.write(text)
|
|
974
906
|
context += text
|
|
@@ -982,6 +914,7 @@ def _process_html_core(
|
|
|
982
914
|
escape_misc=escape_misc,
|
|
983
915
|
escape_underscores=escape_underscores,
|
|
984
916
|
strip=_as_optional_set(strip),
|
|
917
|
+
whitespace_handler=whitespace_handler,
|
|
985
918
|
context_before=context[-2:],
|
|
986
919
|
)
|
|
987
920
|
sink.write(text)
|
|
@@ -1013,54 +946,18 @@ def convert_to_markdown_stream(
|
|
|
1013
946
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
1014
947
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
1015
948
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
949
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
950
|
+
list_indent_width: int = 4,
|
|
1016
951
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
1017
952
|
strip: str | Iterable[str] | None = None,
|
|
1018
953
|
strip_newlines: bool = False,
|
|
1019
954
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
1020
955
|
sub_symbol: str = "",
|
|
1021
956
|
sup_symbol: str = "",
|
|
957
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
1022
958
|
wrap: bool = False,
|
|
1023
959
|
wrap_width: int = 80,
|
|
1024
960
|
) -> Generator[str, None, None]:
|
|
1025
|
-
"""Convert HTML to Markdown using streaming/chunked processing.
|
|
1026
|
-
|
|
1027
|
-
This function yields chunks of converted Markdown text, allowing for
|
|
1028
|
-
memory-efficient processing of large HTML documents. The output is guaranteed
|
|
1029
|
-
to be identical to convert_to_markdown().
|
|
1030
|
-
|
|
1031
|
-
Args:
|
|
1032
|
-
source: An HTML document or a an initialized instance of BeautifulSoup.
|
|
1033
|
-
chunk_size: Size of chunks to yield (approximate, in characters).
|
|
1034
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
1035
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
1036
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
1037
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
1038
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
1039
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
1040
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
1041
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
1042
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
1043
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
1044
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
1045
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
1046
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
1047
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
1048
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
1049
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
1050
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
1051
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
1052
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
1053
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
1054
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
1055
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
1056
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
1057
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
1058
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
1059
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
1060
|
-
|
|
1061
|
-
Yields:
|
|
1062
|
-
str: Chunks of Markdown-formatted text.
|
|
1063
|
-
"""
|
|
1064
961
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
1065
962
|
|
|
1066
963
|
if isinstance(source, str):
|
|
@@ -1068,9 +965,12 @@ def convert_to_markdown_stream(
|
|
|
1068
965
|
elif isinstance(source, BeautifulSoup):
|
|
1069
966
|
sink.total_bytes = len(str(source))
|
|
1070
967
|
|
|
968
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
969
|
+
|
|
1071
970
|
_process_html_core(
|
|
1072
971
|
source,
|
|
1073
972
|
sink,
|
|
973
|
+
whitespace_handler=whitespace_handler,
|
|
1074
974
|
parser=parser,
|
|
1075
975
|
autolinks=autolinks,
|
|
1076
976
|
bullets=bullets,
|
|
@@ -1087,6 +987,8 @@ def convert_to_markdown_stream(
|
|
|
1087
987
|
heading_style=heading_style,
|
|
1088
988
|
highlight_style=highlight_style,
|
|
1089
989
|
keep_inline_images_in=keep_inline_images_in,
|
|
990
|
+
list_indent_type=list_indent_type,
|
|
991
|
+
list_indent_width=list_indent_width,
|
|
1090
992
|
newline_style=newline_style,
|
|
1091
993
|
strip=strip,
|
|
1092
994
|
strip_newlines=strip_newlines,
|
html_to_markdown/utils.py
CHANGED
|
@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def chomp(text: str) -> tuple[str, str, str]:
|
|
9
|
-
"""Simplified whitespace handling for inline elements.
|
|
10
|
-
|
|
11
|
-
For semantic markdown output, preserves leading/trailing spaces as single spaces
|
|
12
|
-
and normalizes internal whitespace.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
text: The text to chomp.
|
|
16
|
-
|
|
17
|
-
Returns:
|
|
18
|
-
A tuple containing the prefix, suffix, and the normalized text.
|
|
19
|
-
"""
|
|
20
9
|
if not text:
|
|
21
10
|
return "", "", ""
|
|
22
11
|
|
|
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
29
18
|
|
|
30
19
|
|
|
31
20
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
|
32
|
-
"""Escape special characters in text.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
text: The text to escape.
|
|
36
|
-
escape_misc: Whether to escape miscellaneous characters.
|
|
37
|
-
escape_asterisks: Whether to escape asterisks.
|
|
38
|
-
escape_underscores: Whether to escape underscores.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
The escaped text.
|
|
42
|
-
"""
|
|
43
21
|
if not text:
|
|
44
22
|
return ""
|
|
45
23
|
if escape_misc:
|
|
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
|
|
|
52
30
|
return text
|
|
53
31
|
|
|
54
32
|
|
|
55
|
-
def indent(*, text: str, level: int) -> str:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
text: The text to indent.
|
|
60
|
-
level: The level of indentation.
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
The indented text.
|
|
64
|
-
"""
|
|
65
|
-
return line_beginning_re.sub("\t" * level, text) if text else ""
|
|
33
|
+
def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
|
|
34
|
+
return line_beginning_re.sub(indent_str * level, text) if text else ""
|
|
66
35
|
|
|
67
36
|
|
|
68
37
|
def underline(*, text: str, pad_char: str) -> str:
|
|
69
|
-
"""Underline text with a given character.
|
|
70
|
-
|
|
71
|
-
Args:
|
|
72
|
-
text: The text to underline.
|
|
73
|
-
pad_char: The character to use for underlining.
|
|
74
|
-
|
|
75
|
-
Returns:
|
|
76
|
-
The underlined text.
|
|
77
|
-
"""
|
|
78
38
|
text = (text or "").rstrip()
|
|
79
39
|
return f"{text}\n{pad_char * len(text)}\n\n" if text else ""
|