html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +31 -502
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -181
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +195 -203
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.1.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.1.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/exceptions.py
CHANGED
|
@@ -1,15 +1,11 @@
|
|
|
1
|
-
"""Custom exceptions for the html-to-markdown library."""
|
|
2
|
-
|
|
3
1
|
from __future__ import annotations
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
class HtmlToMarkdownError(Exception):
|
|
7
|
-
|
|
5
|
+
pass
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class MissingDependencyError(HtmlToMarkdownError):
|
|
11
|
-
"""Raised when an optional dependency is required but not installed."""
|
|
12
|
-
|
|
13
9
|
def __init__(self, dependency: str, install_command: str | None = None) -> None:
|
|
14
10
|
self.dependency = dependency
|
|
15
11
|
self.install_command = install_command
|
|
@@ -22,8 +18,6 @@ class MissingDependencyError(HtmlToMarkdownError):
|
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
class InvalidParserError(HtmlToMarkdownError):
|
|
25
|
-
"""Raised when an invalid parser is specified."""
|
|
26
|
-
|
|
27
21
|
def __init__(self, parser: str, available_parsers: list[str]) -> None:
|
|
28
22
|
self.parser = parser
|
|
29
23
|
self.available_parsers = available_parsers
|
|
@@ -33,15 +27,11 @@ class InvalidParserError(HtmlToMarkdownError):
|
|
|
33
27
|
|
|
34
28
|
|
|
35
29
|
class EmptyHtmlError(HtmlToMarkdownError):
|
|
36
|
-
"""Raised when the input HTML is empty."""
|
|
37
|
-
|
|
38
30
|
def __init__(self) -> None:
|
|
39
31
|
super().__init__("The input HTML is empty.")
|
|
40
32
|
|
|
41
33
|
|
|
42
34
|
class ConflictingOptionsError(HtmlToMarkdownError):
|
|
43
|
-
"""Raised when conflicting options are specified."""
|
|
44
|
-
|
|
45
35
|
def __init__(self, option1: str, option2: str) -> None:
|
|
46
36
|
self.option1 = option1
|
|
47
37
|
self.option2 = option2
|
html_to_markdown/preprocessor.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
"""HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
|
|
2
|
-
|
|
3
1
|
from __future__ import annotations
|
|
4
2
|
|
|
5
3
|
import re
|
|
@@ -22,24 +20,6 @@ def preprocess_html(
|
|
|
22
20
|
custom_tags_to_remove: set[str] | None = None,
|
|
23
21
|
custom_attributes_to_remove: set[str] | None = None,
|
|
24
22
|
) -> str:
|
|
25
|
-
"""Preprocess HTML to remove unwanted elements and improve quality.
|
|
26
|
-
|
|
27
|
-
Args:
|
|
28
|
-
html: Raw HTML content to preprocess.
|
|
29
|
-
remove_navigation: Remove navigation elements and menus.
|
|
30
|
-
remove_forms: Remove form elements (input, button, select, etc.).
|
|
31
|
-
remove_scripts: Remove script tags and content.
|
|
32
|
-
remove_styles: Remove style tags and content.
|
|
33
|
-
remove_comments: Remove HTML comments.
|
|
34
|
-
preserve_semantic_structure: Preserve semantic HTML5 elements.
|
|
35
|
-
preserve_tables: Preserve table structure.
|
|
36
|
-
preserve_media: Preserve media elements (img, video, audio).
|
|
37
|
-
custom_tags_to_remove: Additional tags to remove.
|
|
38
|
-
custom_attributes_to_remove: Additional attributes to remove.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
Cleaned HTML ready for conversion to markdown.
|
|
42
|
-
"""
|
|
43
23
|
if not html or not html.strip(): # pragma: no cover
|
|
44
24
|
return html
|
|
45
25
|
|
|
@@ -83,7 +63,6 @@ def _configure_cleaning_rules(
|
|
|
83
63
|
custom_tags_to_remove: set[str],
|
|
84
64
|
custom_attributes_to_remove: set[str],
|
|
85
65
|
) -> dict[str, Any]:
|
|
86
|
-
"""Configure the cleaning rules for nh3."""
|
|
87
66
|
allowed_tags = {
|
|
88
67
|
"p",
|
|
89
68
|
"div",
|
|
@@ -254,7 +233,6 @@ def _configure_cleaning_rules(
|
|
|
254
233
|
|
|
255
234
|
|
|
256
235
|
def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
257
|
-
"""Remove elements with navigation-related classes."""
|
|
258
236
|
if not remove_navigation:
|
|
259
237
|
return html
|
|
260
238
|
|
|
@@ -288,7 +266,6 @@ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
|
288
266
|
|
|
289
267
|
|
|
290
268
|
def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
|
|
291
|
-
"""Remove common navigation patterns that nh3 might miss."""
|
|
292
269
|
if not remove_navigation:
|
|
293
270
|
return html
|
|
294
271
|
|
|
@@ -329,7 +306,6 @@ def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
|
|
|
329
306
|
|
|
330
307
|
|
|
331
308
|
def _remove_wikipedia_navigation_lists(html: str) -> str:
|
|
332
|
-
"""Remove Wikipedia-style navigation lists that appear at the start."""
|
|
333
309
|
patterns = [
|
|
334
310
|
r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
|
|
335
311
|
r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
|
|
@@ -342,7 +318,6 @@ def _remove_wikipedia_navigation_lists(html: str) -> str:
|
|
|
342
318
|
|
|
343
319
|
|
|
344
320
|
def _fix_whitespace_issues(html: str) -> str:
|
|
345
|
-
"""Fix common whitespace issues in HTML."""
|
|
346
321
|
html = re.sub(r"[ \t]{2,}", " ", html)
|
|
347
322
|
html = re.sub(r"\n\s*\n", "\n\n", html)
|
|
348
323
|
|
|
@@ -385,18 +360,6 @@ PRESETS: dict[str, dict[str, Any]] = {
|
|
|
385
360
|
|
|
386
361
|
|
|
387
362
|
def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
|
|
388
|
-
"""Create preprocessor configuration with a preset.
|
|
389
|
-
|
|
390
|
-
Args:
|
|
391
|
-
preset: The preset configuration to use (minimal, standard, aggressive).
|
|
392
|
-
**overrides: Any configuration options to override.
|
|
393
|
-
|
|
394
|
-
Returns:
|
|
395
|
-
Configuration dict for preprocessor.
|
|
396
|
-
|
|
397
|
-
Raises:
|
|
398
|
-
ValueError: If preset is unknown.
|
|
399
|
-
"""
|
|
400
363
|
if preset not in PRESETS:
|
|
401
364
|
msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
|
|
402
365
|
raise ValueError(msg)
|
html_to_markdown/processing.py
CHANGED
|
@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
|
|
|
33
33
|
DOUBLE_EQUAL,
|
|
34
34
|
SPACES,
|
|
35
35
|
UNDERLINED,
|
|
36
|
+
WHITESPACE_NORMALIZED,
|
|
36
37
|
html_heading_re,
|
|
37
|
-
whitespace_re,
|
|
38
38
|
)
|
|
39
39
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
40
40
|
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
41
41
|
from html_to_markdown.utils import escape
|
|
42
|
+
from html_to_markdown.whitespace import WhitespaceHandler
|
|
42
43
|
|
|
43
44
|
if TYPE_CHECKING:
|
|
44
45
|
from collections.abc import Iterable
|
|
@@ -143,6 +144,12 @@ SupportedTag = Literal[
|
|
|
143
144
|
]
|
|
144
145
|
|
|
145
146
|
|
|
147
|
+
def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
|
|
148
|
+
if list_indent_type == "tabs":
|
|
149
|
+
return "\t"
|
|
150
|
+
return " " * list_indent_width
|
|
151
|
+
|
|
152
|
+
|
|
146
153
|
def _is_nested_tag(el: PageElement) -> bool:
|
|
147
154
|
return isinstance(el, Tag) and el.name in {
|
|
148
155
|
"ol",
|
|
@@ -170,6 +177,7 @@ def _process_tag(
|
|
|
170
177
|
escape_misc: bool,
|
|
171
178
|
escape_underscores: bool,
|
|
172
179
|
strip: set[str] | None,
|
|
180
|
+
whitespace_handler: WhitespaceHandler,
|
|
173
181
|
context_before: str = "",
|
|
174
182
|
) -> str:
|
|
175
183
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
@@ -218,6 +226,7 @@ def _process_tag(
|
|
|
218
226
|
escape_misc=escape_misc,
|
|
219
227
|
escape_asterisks=escape_asterisks,
|
|
220
228
|
escape_underscores=escape_underscores,
|
|
229
|
+
whitespace_handler=whitespace_handler,
|
|
221
230
|
)
|
|
222
231
|
)
|
|
223
232
|
elif isinstance(el, Tag):
|
|
@@ -232,6 +241,7 @@ def _process_tag(
|
|
|
232
241
|
escape_misc=escape_misc,
|
|
233
242
|
escape_underscores=escape_underscores,
|
|
234
243
|
strip=strip,
|
|
244
|
+
whitespace_handler=whitespace_handler,
|
|
235
245
|
context_before=(context_before + current_text)[-2:],
|
|
236
246
|
)
|
|
237
247
|
)
|
|
@@ -259,6 +269,7 @@ def _process_text(
|
|
|
259
269
|
escape_misc: bool,
|
|
260
270
|
escape_asterisks: bool,
|
|
261
271
|
escape_underscores: bool,
|
|
272
|
+
whitespace_handler: WhitespaceHandler,
|
|
262
273
|
) -> str:
|
|
263
274
|
text = str(el) or ""
|
|
264
275
|
|
|
@@ -275,69 +286,9 @@ def _process_text(
|
|
|
275
286
|
if len(ancestor_names) > 10:
|
|
276
287
|
break
|
|
277
288
|
|
|
278
|
-
|
|
279
|
-
if text.strip() == "":
|
|
280
|
-
if "\n" in text:
|
|
281
|
-
text = ""
|
|
282
|
-
else:
|
|
283
|
-
block_elements = {
|
|
284
|
-
"p",
|
|
285
|
-
"ul",
|
|
286
|
-
"ol",
|
|
287
|
-
"div",
|
|
288
|
-
"blockquote",
|
|
289
|
-
"pre",
|
|
290
|
-
"h1",
|
|
291
|
-
"h2",
|
|
292
|
-
"h3",
|
|
293
|
-
"h4",
|
|
294
|
-
"h5",
|
|
295
|
-
"h6",
|
|
296
|
-
"table",
|
|
297
|
-
"dl",
|
|
298
|
-
"hr",
|
|
299
|
-
"figure",
|
|
300
|
-
"article",
|
|
301
|
-
"section",
|
|
302
|
-
"nav",
|
|
303
|
-
"aside",
|
|
304
|
-
"header",
|
|
305
|
-
"footer",
|
|
306
|
-
"main",
|
|
307
|
-
"form",
|
|
308
|
-
"fieldset",
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
prev_sibling = el.previous_sibling
|
|
312
|
-
next_sibling = el.next_sibling
|
|
313
|
-
|
|
314
|
-
if (
|
|
315
|
-
prev_sibling
|
|
316
|
-
and hasattr(prev_sibling, "name")
|
|
317
|
-
and prev_sibling.name in block_elements
|
|
318
|
-
and next_sibling
|
|
319
|
-
and hasattr(next_sibling, "name")
|
|
320
|
-
and next_sibling.name in block_elements
|
|
321
|
-
):
|
|
322
|
-
text = ""
|
|
323
|
-
else:
|
|
324
|
-
text = " " if text else ""
|
|
325
|
-
else:
|
|
326
|
-
has_leading_space = text.startswith((" ", "\t"))
|
|
327
|
-
has_trailing_space = text.endswith((" ", "\t"))
|
|
328
|
-
|
|
329
|
-
middle_content = (
|
|
330
|
-
text[1:-1]
|
|
331
|
-
if has_leading_space and has_trailing_space
|
|
332
|
-
else text[1:]
|
|
333
|
-
if has_leading_space
|
|
334
|
-
else text[:-1]
|
|
335
|
-
if has_trailing_space
|
|
336
|
-
else text
|
|
337
|
-
)
|
|
289
|
+
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
338
290
|
|
|
339
|
-
|
|
340
|
-
text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
|
|
291
|
+
text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
|
|
341
292
|
|
|
342
293
|
if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
|
|
343
294
|
text = escape(
|
|
@@ -357,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
357
308
|
|
|
358
309
|
|
|
359
310
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
360
|
-
"""Get set of ancestor tag names for efficient parent checking."""
|
|
361
311
|
elem_id = id(element)
|
|
362
312
|
cache = _ancestor_cache.get()
|
|
363
313
|
if cache is None:
|
|
@@ -388,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
388
338
|
|
|
389
339
|
|
|
390
340
|
def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
|
|
391
|
-
"""Check if element has any of the specified ancestors efficiently."""
|
|
392
341
|
if isinstance(tag_names, str):
|
|
393
342
|
tag_names = [tag_names]
|
|
394
343
|
|
|
@@ -414,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
414
363
|
|
|
415
364
|
|
|
416
365
|
def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
417
|
-
"""Extract metadata from HTML document.
|
|
418
|
-
|
|
419
|
-
Args:
|
|
420
|
-
soup: BeautifulSoup instance of the HTML document.
|
|
421
|
-
|
|
422
|
-
Returns:
|
|
423
|
-
Dictionary of metadata key-value pairs.
|
|
424
|
-
"""
|
|
425
366
|
metadata = {}
|
|
426
367
|
|
|
427
368
|
title_tag = soup.find("title")
|
|
@@ -468,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
468
409
|
|
|
469
410
|
|
|
470
411
|
def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
471
|
-
"""Format metadata as a Markdown comment block.
|
|
472
|
-
|
|
473
|
-
Args:
|
|
474
|
-
metadata: Dictionary of metadata key-value pairs.
|
|
475
|
-
|
|
476
|
-
Returns:
|
|
477
|
-
Formatted metadata comment block.
|
|
478
|
-
"""
|
|
479
412
|
if not metadata:
|
|
480
413
|
return ""
|
|
481
414
|
|
|
@@ -511,64 +444,87 @@ def convert_to_markdown(
|
|
|
511
444
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
512
445
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
513
446
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
447
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
448
|
+
list_indent_width: int = 4,
|
|
514
449
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
450
|
+
preprocess_html: bool = False,
|
|
451
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
452
|
+
remove_forms: bool = True,
|
|
453
|
+
remove_navigation: bool = True,
|
|
515
454
|
strip: str | Iterable[str] | None = None,
|
|
516
455
|
strip_newlines: bool = False,
|
|
517
456
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
518
457
|
sub_symbol: str = "",
|
|
519
458
|
sup_symbol: str = "",
|
|
459
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
520
460
|
wrap: bool = False,
|
|
521
461
|
wrap_width: int = 80,
|
|
522
|
-
preprocess_html: bool = False,
|
|
523
|
-
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
524
|
-
remove_navigation: bool = True,
|
|
525
|
-
remove_forms: bool = True,
|
|
526
462
|
) -> str:
|
|
527
|
-
"""Convert HTML to Markdown.
|
|
463
|
+
"""Convert HTML content to Markdown format.
|
|
528
464
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
stream_processing: Use streaming processing for large documents. Defaults to False.
|
|
532
|
-
chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
|
|
533
|
-
chunk_callback: Optional callback function called with each processed chunk.
|
|
534
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
535
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
536
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
537
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
538
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
539
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
540
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
541
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
542
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
543
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
544
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
545
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
546
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
547
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
548
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
549
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
550
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
551
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
552
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
553
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
554
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
555
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
556
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
557
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
558
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
559
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
560
|
-
preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
|
|
561
|
-
preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
|
|
562
|
-
remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
|
|
563
|
-
remove_forms: Remove form elements during preprocessing. Defaults to True.
|
|
465
|
+
This is the main entry point for converting HTML to Markdown. It supports
|
|
466
|
+
various customization options for controlling the conversion behavior.
|
|
564
467
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
468
|
+
Args:
|
|
469
|
+
source: HTML string or BeautifulSoup object to convert.
|
|
470
|
+
stream_processing: Enable streaming mode for large documents.
|
|
471
|
+
chunk_size: Size of chunks for streaming processing.
|
|
472
|
+
chunk_callback: Callback for processing chunks in streaming mode.
|
|
473
|
+
progress_callback: Callback for progress updates (current, total).
|
|
474
|
+
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
475
|
+
autolinks: Convert URLs to automatic links.
|
|
476
|
+
bullets: Characters to use for unordered list bullets.
|
|
477
|
+
code_language: Default language for code blocks.
|
|
478
|
+
code_language_callback: Callback to determine code language from element.
|
|
479
|
+
convert: HTML tags to convert to Markdown.
|
|
480
|
+
convert_as_inline: Treat block elements as inline during conversion.
|
|
481
|
+
custom_converters: Custom converters for specific HTML elements.
|
|
482
|
+
default_title: Add a default title if none exists.
|
|
483
|
+
escape_asterisks: Escape asterisk characters in text.
|
|
484
|
+
escape_misc: Escape miscellaneous Markdown characters.
|
|
485
|
+
escape_underscores: Escape underscore characters in text.
|
|
486
|
+
extract_metadata: Extract metadata from HTML head.
|
|
487
|
+
heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
|
|
488
|
+
highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
|
|
489
|
+
keep_inline_images_in: Parent tags where images should remain inline.
|
|
490
|
+
list_indent_type: Type of indentation for lists ('spaces', 'tabs').
|
|
491
|
+
list_indent_width: Number of spaces for list indentation.
|
|
492
|
+
newline_style: Style for newlines ('spaces', 'backslash').
|
|
493
|
+
preprocess_html: Enable HTML preprocessing to clean up content.
|
|
494
|
+
preprocessing_preset: Preprocessing aggressiveness level.
|
|
495
|
+
remove_forms: Remove form elements during preprocessing.
|
|
496
|
+
remove_navigation: Remove navigation elements during preprocessing.
|
|
497
|
+
strip: HTML tags to strip from output.
|
|
498
|
+
strip_newlines: Remove newlines from HTML before processing.
|
|
499
|
+
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
500
|
+
sub_symbol: Symbol for subscript text.
|
|
501
|
+
sup_symbol: Symbol for superscript text.
|
|
502
|
+
whitespace_mode: How to handle whitespace ('normalized', 'strict').
|
|
503
|
+
wrap: Enable text wrapping.
|
|
504
|
+
wrap_width: Column width for text wrapping.
|
|
569
505
|
|
|
570
506
|
Returns:
|
|
571
|
-
|
|
507
|
+
The converted Markdown string.
|
|
508
|
+
|
|
509
|
+
Raises:
|
|
510
|
+
EmptyHtmlError: If the HTML input is empty.
|
|
511
|
+
MissingDependencyError: If required dependencies are not installed.
|
|
512
|
+
ConflictingOptionsError: If conflicting options are provided.
|
|
513
|
+
|
|
514
|
+
Examples:
|
|
515
|
+
Basic conversion:
|
|
516
|
+
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
517
|
+
>>> convert_to_markdown(html)
|
|
518
|
+
'Title\\n=====\\n\\nContent\\n\\n'
|
|
519
|
+
|
|
520
|
+
With custom options:
|
|
521
|
+
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
522
|
+
'# Title\\n\\nContent\\n\\n'
|
|
523
|
+
|
|
524
|
+
Discord-compatible lists (2-space indent):
|
|
525
|
+
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
526
|
+
>>> convert_to_markdown(html, list_indent_width=2)
|
|
527
|
+
'* Item 1\\n* Item 2\\n\\n'
|
|
572
528
|
"""
|
|
573
529
|
if isinstance(source, str):
|
|
574
530
|
if (
|
|
@@ -665,6 +621,7 @@ def convert_to_markdown(
|
|
|
665
621
|
sup_symbol=sup_symbol,
|
|
666
622
|
wrap=wrap,
|
|
667
623
|
wrap_width=wrap_width,
|
|
624
|
+
whitespace_mode=whitespace_mode,
|
|
668
625
|
):
|
|
669
626
|
if chunk_callback:
|
|
670
627
|
chunk_callback(chunk)
|
|
@@ -681,9 +638,12 @@ def convert_to_markdown(
|
|
|
681
638
|
|
|
682
639
|
sink = StringSink()
|
|
683
640
|
|
|
641
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
642
|
+
|
|
684
643
|
_process_html_core(
|
|
685
644
|
source,
|
|
686
645
|
sink,
|
|
646
|
+
whitespace_handler=whitespace_handler,
|
|
687
647
|
parser=parser,
|
|
688
648
|
autolinks=autolinks,
|
|
689
649
|
bullets=bullets,
|
|
@@ -700,6 +660,8 @@ def convert_to_markdown(
|
|
|
700
660
|
heading_style=heading_style,
|
|
701
661
|
highlight_style=highlight_style,
|
|
702
662
|
keep_inline_images_in=keep_inline_images_in,
|
|
663
|
+
list_indent_type=list_indent_type,
|
|
664
|
+
list_indent_width=list_indent_width,
|
|
703
665
|
newline_style=newline_style,
|
|
704
666
|
strip=strip,
|
|
705
667
|
strip_newlines=strip_newlines,
|
|
@@ -761,34 +723,25 @@ def convert_to_markdown(
|
|
|
761
723
|
|
|
762
724
|
|
|
763
725
|
class OutputSink:
|
|
764
|
-
"""Abstract output sink for processed markdown text."""
|
|
765
|
-
|
|
766
726
|
def write(self, text: str) -> None:
|
|
767
|
-
"""Write text to the sink."""
|
|
768
727
|
raise NotImplementedError
|
|
769
728
|
|
|
770
729
|
def finalize(self) -> None:
|
|
771
|
-
|
|
730
|
+
pass
|
|
772
731
|
|
|
773
732
|
|
|
774
733
|
class StringSink(OutputSink):
|
|
775
|
-
"""Collects all output into a single string."""
|
|
776
|
-
|
|
777
734
|
def __init__(self) -> None:
|
|
778
735
|
self.buffer = StringIO()
|
|
779
736
|
|
|
780
737
|
def write(self, text: str) -> None:
|
|
781
|
-
"""Write text to the buffer."""
|
|
782
738
|
self.buffer.write(text)
|
|
783
739
|
|
|
784
740
|
def get_result(self) -> str:
|
|
785
|
-
"""Get the complete result string."""
|
|
786
741
|
return self.buffer.getvalue()
|
|
787
742
|
|
|
788
743
|
|
|
789
744
|
class StreamingSink(OutputSink):
|
|
790
|
-
"""Yields chunks of output for streaming processing."""
|
|
791
|
-
|
|
792
745
|
def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
|
|
793
746
|
self.chunk_size = chunk_size
|
|
794
747
|
self.progress_callback = progress_callback
|
|
@@ -799,7 +752,6 @@ class StreamingSink(OutputSink):
|
|
|
799
752
|
self.chunks: list[str] = []
|
|
800
753
|
|
|
801
754
|
def write(self, text: str) -> None:
|
|
802
|
-
"""Write text and yield chunks when threshold is reached."""
|
|
803
755
|
if not text:
|
|
804
756
|
return
|
|
805
757
|
|
|
@@ -822,7 +774,6 @@ class StreamingSink(OutputSink):
|
|
|
822
774
|
self.buffer_size = len(current_content)
|
|
823
775
|
|
|
824
776
|
def finalize(self) -> None:
|
|
825
|
-
"""Finalize and yield any remaining content."""
|
|
826
777
|
if self.buffer_size > 0:
|
|
827
778
|
content = self.buffer.getvalue()
|
|
828
779
|
self.chunks.append(content)
|
|
@@ -830,11 +781,9 @@ class StreamingSink(OutputSink):
|
|
|
830
781
|
self._update_progress()
|
|
831
782
|
|
|
832
783
|
def get_chunks(self) -> Generator[str, None, None]:
|
|
833
|
-
"""Get all chunks yielded during processing."""
|
|
834
784
|
yield from self.chunks
|
|
835
785
|
|
|
836
786
|
def _find_split_position(self, content: str) -> int:
|
|
837
|
-
"""Find optimal position to split content for chunks."""
|
|
838
787
|
target = self.chunk_size
|
|
839
788
|
lookahead = min(100, len(content) - target)
|
|
840
789
|
|
|
@@ -847,7 +796,6 @@ class StreamingSink(OutputSink):
|
|
|
847
796
|
return min(target, len(content))
|
|
848
797
|
|
|
849
798
|
def _update_progress(self) -> None:
|
|
850
|
-
"""Update progress if callback is provided."""
|
|
851
799
|
if self.progress_callback:
|
|
852
800
|
self.progress_callback(self.processed_bytes, self.total_bytes)
|
|
853
801
|
|
|
@@ -856,6 +804,7 @@ def _process_html_core(
|
|
|
856
804
|
source: str | BeautifulSoup,
|
|
857
805
|
sink: OutputSink,
|
|
858
806
|
*,
|
|
807
|
+
whitespace_handler: WhitespaceHandler,
|
|
859
808
|
parser: str | None = None,
|
|
860
809
|
autolinks: bool,
|
|
861
810
|
bullets: str,
|
|
@@ -872,6 +821,8 @@ def _process_html_core(
|
|
|
872
821
|
heading_style: Literal["underlined", "atx", "atx_closed"],
|
|
873
822
|
highlight_style: Literal["double-equal", "html", "bold"],
|
|
874
823
|
keep_inline_images_in: Iterable[str] | None,
|
|
824
|
+
list_indent_type: str,
|
|
825
|
+
list_indent_width: int,
|
|
875
826
|
newline_style: Literal["spaces", "backslash"],
|
|
876
827
|
strip: str | Iterable[str] | None,
|
|
877
828
|
strip_newlines: bool,
|
|
@@ -881,7 +832,6 @@ def _process_html_core(
|
|
|
881
832
|
wrap: bool,
|
|
882
833
|
wrap_width: int,
|
|
883
834
|
) -> None:
|
|
884
|
-
"""Core HTML to Markdown processing logic shared by both regular and streaming."""
|
|
885
835
|
token = _ancestor_cache.set({})
|
|
886
836
|
|
|
887
837
|
try:
|
|
@@ -921,6 +871,8 @@ def _process_html_core(
|
|
|
921
871
|
heading_style=heading_style,
|
|
922
872
|
highlight_style=highlight_style,
|
|
923
873
|
keep_inline_images_in=keep_inline_images_in,
|
|
874
|
+
list_indent_type=list_indent_type,
|
|
875
|
+
list_indent_width=list_indent_width,
|
|
924
876
|
newline_style=newline_style,
|
|
925
877
|
strong_em_symbol=strong_em_symbol,
|
|
926
878
|
sub_symbol=sub_symbol,
|
|
@@ -948,6 +900,7 @@ def _process_html_core(
|
|
|
948
900
|
escape_misc=escape_misc,
|
|
949
901
|
escape_asterisks=escape_asterisks,
|
|
950
902
|
escape_underscores=escape_underscores,
|
|
903
|
+
whitespace_handler=whitespace_handler,
|
|
951
904
|
)
|
|
952
905
|
sink.write(text)
|
|
953
906
|
context += text
|
|
@@ -961,6 +914,7 @@ def _process_html_core(
|
|
|
961
914
|
escape_misc=escape_misc,
|
|
962
915
|
escape_underscores=escape_underscores,
|
|
963
916
|
strip=_as_optional_set(strip),
|
|
917
|
+
whitespace_handler=whitespace_handler,
|
|
964
918
|
context_before=context[-2:],
|
|
965
919
|
)
|
|
966
920
|
sink.write(text)
|
|
@@ -992,54 +946,18 @@ def convert_to_markdown_stream(
|
|
|
992
946
|
heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
|
|
993
947
|
highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
|
|
994
948
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
949
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces",
|
|
950
|
+
list_indent_width: int = 4,
|
|
995
951
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
996
952
|
strip: str | Iterable[str] | None = None,
|
|
997
953
|
strip_newlines: bool = False,
|
|
998
954
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
999
955
|
sub_symbol: str = "",
|
|
1000
956
|
sup_symbol: str = "",
|
|
957
|
+
whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
|
|
1001
958
|
wrap: bool = False,
|
|
1002
959
|
wrap_width: int = 80,
|
|
1003
960
|
) -> Generator[str, None, None]:
|
|
1004
|
-
"""Convert HTML to Markdown using streaming/chunked processing.
|
|
1005
|
-
|
|
1006
|
-
This function yields chunks of converted Markdown text, allowing for
|
|
1007
|
-
memory-efficient processing of large HTML documents. The output is guaranteed
|
|
1008
|
-
to be identical to convert_to_markdown().
|
|
1009
|
-
|
|
1010
|
-
Args:
|
|
1011
|
-
source: An HTML document or an initialized instance of BeautifulSoup.
|
|
1012
|
-
chunk_size: Size of chunks to yield (approximate, in characters).
|
|
1013
|
-
progress_callback: Optional callback function called with (processed_bytes, total_bytes).
|
|
1014
|
-
parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
|
|
1015
|
-
Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
|
|
1016
|
-
autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
|
|
1017
|
-
bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
|
|
1018
|
-
code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
|
|
1019
|
-
code_language_callback: Function to dynamically determine the language for code blocks.
|
|
1020
|
-
convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
|
|
1021
|
-
convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
|
|
1022
|
-
custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
|
|
1023
|
-
default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
|
|
1024
|
-
escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
|
|
1025
|
-
escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
|
|
1026
|
-
escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
|
|
1027
|
-
extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
|
|
1028
|
-
heading_style: The style to use for Markdown headings. Defaults to "underlined".
|
|
1029
|
-
highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
|
|
1030
|
-
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
1031
|
-
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
1032
|
-
strip: Tags to strip from the output. Defaults to None.
|
|
1033
|
-
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
1034
|
-
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
1035
|
-
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
1036
|
-
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
1037
|
-
wrap: Wrap text to the specified width. Defaults to False.
|
|
1038
|
-
wrap_width: The number of characters at which to wrap text. Defaults to 80.
|
|
1039
|
-
|
|
1040
|
-
Yields:
|
|
1041
|
-
str: Chunks of Markdown-formatted text.
|
|
1042
|
-
"""
|
|
1043
961
|
sink = StreamingSink(chunk_size, progress_callback)
|
|
1044
962
|
|
|
1045
963
|
if isinstance(source, str):
|
|
@@ -1047,9 +965,12 @@ def convert_to_markdown_stream(
|
|
|
1047
965
|
elif isinstance(source, BeautifulSoup):
|
|
1048
966
|
sink.total_bytes = len(str(source))
|
|
1049
967
|
|
|
968
|
+
whitespace_handler = WhitespaceHandler(whitespace_mode)
|
|
969
|
+
|
|
1050
970
|
_process_html_core(
|
|
1051
971
|
source,
|
|
1052
972
|
sink,
|
|
973
|
+
whitespace_handler=whitespace_handler,
|
|
1053
974
|
parser=parser,
|
|
1054
975
|
autolinks=autolinks,
|
|
1055
976
|
bullets=bullets,
|
|
@@ -1066,6 +987,8 @@ def convert_to_markdown_stream(
|
|
|
1066
987
|
heading_style=heading_style,
|
|
1067
988
|
highlight_style=highlight_style,
|
|
1068
989
|
keep_inline_images_in=keep_inline_images_in,
|
|
990
|
+
list_indent_type=list_indent_type,
|
|
991
|
+
list_indent_width=list_indent_width,
|
|
1069
992
|
newline_style=newline_style,
|
|
1070
993
|
strip=strip,
|
|
1071
994
|
strip_newlines=strip_newlines,
|