html-to-markdown 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,15 +1,11 @@
1
- """Custom exceptions for the html-to-markdown library."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
 
6
4
  class HtmlToMarkdownError(Exception):
7
- """Base exception for all html-to-markdown errors."""
5
+ pass
8
6
 
9
7
 
10
8
  class MissingDependencyError(HtmlToMarkdownError):
11
- """Raised when an optional dependency is required but not installed."""
12
-
13
9
  def __init__(self, dependency: str, install_command: str | None = None) -> None:
14
10
  self.dependency = dependency
15
11
  self.install_command = install_command
@@ -22,8 +18,6 @@ class MissingDependencyError(HtmlToMarkdownError):
22
18
 
23
19
 
24
20
  class InvalidParserError(HtmlToMarkdownError):
25
- """Raised when an invalid parser is specified."""
26
-
27
21
  def __init__(self, parser: str, available_parsers: list[str]) -> None:
28
22
  self.parser = parser
29
23
  self.available_parsers = available_parsers
@@ -33,15 +27,11 @@ class InvalidParserError(HtmlToMarkdownError):
33
27
 
34
28
 
35
29
  class EmptyHtmlError(HtmlToMarkdownError):
36
- """Raised when the input HTML is empty."""
37
-
38
30
  def __init__(self) -> None:
39
31
  super().__init__("The input HTML is empty.")
40
32
 
41
33
 
42
34
  class ConflictingOptionsError(HtmlToMarkdownError):
43
- """Raised when conflicting options are specified."""
44
-
45
35
  def __init__(self, option1: str, option2: str) -> None:
46
36
  self.option1 = option1
47
37
  self.option2 = option2
@@ -1,5 +1,3 @@
1
- """HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import re
@@ -22,24 +20,6 @@ def preprocess_html(
22
20
  custom_tags_to_remove: set[str] | None = None,
23
21
  custom_attributes_to_remove: set[str] | None = None,
24
22
  ) -> str:
25
- """Preprocess HTML to remove unwanted elements and improve quality.
26
-
27
- Args:
28
- html: Raw HTML content to preprocess.
29
- remove_navigation: Remove navigation elements and menus.
30
- remove_forms: Remove form elements (input, button, select, etc.).
31
- remove_scripts: Remove script tags and content.
32
- remove_styles: Remove style tags and content.
33
- remove_comments: Remove HTML comments.
34
- preserve_semantic_structure: Preserve semantic HTML5 elements.
35
- preserve_tables: Preserve table structure.
36
- preserve_media: Preserve media elements (img, video, audio).
37
- custom_tags_to_remove: Additional tags to remove.
38
- custom_attributes_to_remove: Additional attributes to remove.
39
-
40
- Returns:
41
- Cleaned HTML ready for conversion to markdown.
42
- """
43
23
  if not html or not html.strip(): # pragma: no cover
44
24
  return html
45
25
 
@@ -83,7 +63,6 @@ def _configure_cleaning_rules(
83
63
  custom_tags_to_remove: set[str],
84
64
  custom_attributes_to_remove: set[str],
85
65
  ) -> dict[str, Any]:
86
- """Configure the cleaning rules for nh3."""
87
66
  allowed_tags = {
88
67
  "p",
89
68
  "div",
@@ -254,7 +233,6 @@ def _configure_cleaning_rules(
254
233
 
255
234
 
256
235
  def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
257
- """Remove elements with navigation-related classes."""
258
236
  if not remove_navigation:
259
237
  return html
260
238
 
@@ -288,7 +266,6 @@ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
288
266
 
289
267
 
290
268
  def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
291
- """Remove common navigation patterns that nh3 might miss."""
292
269
  if not remove_navigation:
293
270
  return html
294
271
 
@@ -329,7 +306,6 @@ def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
329
306
 
330
307
 
331
308
  def _remove_wikipedia_navigation_lists(html: str) -> str:
332
- """Remove Wikipedia-style navigation lists that appear at the start."""
333
309
  patterns = [
334
310
  r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
335
311
  r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
@@ -342,7 +318,6 @@ def _remove_wikipedia_navigation_lists(html: str) -> str:
342
318
 
343
319
 
344
320
  def _fix_whitespace_issues(html: str) -> str:
345
- """Fix common whitespace issues in HTML."""
346
321
  html = re.sub(r"[ \t]{2,}", " ", html)
347
322
  html = re.sub(r"\n\s*\n", "\n\n", html)
348
323
 
@@ -385,18 +360,6 @@ PRESETS: dict[str, dict[str, Any]] = {
385
360
 
386
361
 
387
362
  def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
388
- """Create preprocessor configuration with a preset.
389
-
390
- Args:
391
- preset: The preset configuration to use (minimal, standard, aggressive).
392
- **overrides: Any configuration options to override.
393
-
394
- Returns:
395
- Configuration dict for preprocessor.
396
-
397
- Raises:
398
- ValueError: If preset is unknown.
399
- """
400
363
  if preset not in PRESETS:
401
364
  msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
402
365
  raise ValueError(msg)
@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
33
33
  DOUBLE_EQUAL,
34
34
  SPACES,
35
35
  UNDERLINED,
36
+ WHITESPACE_NORMALIZED,
36
37
  html_heading_re,
37
- whitespace_re,
38
38
  )
39
39
  from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
40
40
  from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
41
41
  from html_to_markdown.utils import escape
42
+ from html_to_markdown.whitespace import WhitespaceHandler
42
43
 
43
44
  if TYPE_CHECKING:
44
45
  from collections.abc import Iterable
@@ -143,6 +144,12 @@ SupportedTag = Literal[
143
144
  ]
144
145
 
145
146
 
147
+ def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
148
+ if list_indent_type == "tabs":
149
+ return "\t"
150
+ return " " * list_indent_width
151
+
152
+
146
153
  def _is_nested_tag(el: PageElement) -> bool:
147
154
  return isinstance(el, Tag) and el.name in {
148
155
  "ol",
@@ -170,6 +177,7 @@ def _process_tag(
170
177
  escape_misc: bool,
171
178
  escape_underscores: bool,
172
179
  strip: set[str] | None,
180
+ whitespace_handler: WhitespaceHandler,
173
181
  context_before: str = "",
174
182
  ) -> str:
175
183
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
@@ -218,6 +226,7 @@ def _process_tag(
218
226
  escape_misc=escape_misc,
219
227
  escape_asterisks=escape_asterisks,
220
228
  escape_underscores=escape_underscores,
229
+ whitespace_handler=whitespace_handler,
221
230
  )
222
231
  )
223
232
  elif isinstance(el, Tag):
@@ -232,6 +241,7 @@ def _process_tag(
232
241
  escape_misc=escape_misc,
233
242
  escape_underscores=escape_underscores,
234
243
  strip=strip,
244
+ whitespace_handler=whitespace_handler,
235
245
  context_before=(context_before + current_text)[-2:],
236
246
  )
237
247
  )
@@ -259,6 +269,7 @@ def _process_text(
259
269
  escape_misc: bool,
260
270
  escape_asterisks: bool,
261
271
  escape_underscores: bool,
272
+ whitespace_handler: WhitespaceHandler,
262
273
  ) -> str:
263
274
  text = str(el) or ""
264
275
 
@@ -275,69 +286,9 @@ def _process_text(
275
286
  if len(ancestor_names) > 10:
276
287
  break
277
288
 
278
- if "pre" not in ancestor_names:
279
- if text.strip() == "":
280
- if "\n" in text:
281
- text = ""
282
- else:
283
- block_elements = {
284
- "p",
285
- "ul",
286
- "ol",
287
- "div",
288
- "blockquote",
289
- "pre",
290
- "h1",
291
- "h2",
292
- "h3",
293
- "h4",
294
- "h5",
295
- "h6",
296
- "table",
297
- "dl",
298
- "hr",
299
- "figure",
300
- "article",
301
- "section",
302
- "nav",
303
- "aside",
304
- "header",
305
- "footer",
306
- "main",
307
- "form",
308
- "fieldset",
309
- }
310
-
311
- prev_sibling = el.previous_sibling
312
- next_sibling = el.next_sibling
313
-
314
- if (
315
- prev_sibling
316
- and hasattr(prev_sibling, "name")
317
- and prev_sibling.name in block_elements
318
- and next_sibling
319
- and hasattr(next_sibling, "name")
320
- and next_sibling.name in block_elements
321
- ):
322
- text = ""
323
- else:
324
- text = " " if text else ""
325
- else:
326
- has_leading_space = text.startswith((" ", "\t"))
327
- has_trailing_space = text.endswith((" ", "\t"))
328
-
329
- middle_content = (
330
- text[1:-1]
331
- if has_leading_space and has_trailing_space
332
- else text[1:]
333
- if has_leading_space
334
- else text[:-1]
335
- if has_trailing_space
336
- else text
337
- )
289
+ in_pre = bool(ancestor_names.intersection({"pre"}))
338
290
 
339
- middle_content = whitespace_re.sub(" ", middle_content.strip())
340
- text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
291
+ text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
341
292
 
342
293
  if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
343
294
  text = escape(
@@ -357,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
357
308
 
358
309
 
359
310
  def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
360
- """Get set of ancestor tag names for efficient parent checking."""
361
311
  elem_id = id(element)
362
312
  cache = _ancestor_cache.get()
363
313
  if cache is None:
@@ -388,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
388
338
 
389
339
 
390
340
  def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
391
- """Check if element has any of the specified ancestors efficiently."""
392
341
  if isinstance(tag_names, str):
393
342
  tag_names = [tag_names]
394
343
 
@@ -414,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
414
363
 
415
364
 
416
365
  def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
417
- """Extract metadata from HTML document.
418
-
419
- Args:
420
- soup: BeautifulSoup instance of the HTML document.
421
-
422
- Returns:
423
- Dictionary of metadata key-value pairs.
424
- """
425
366
  metadata = {}
426
367
 
427
368
  title_tag = soup.find("title")
@@ -468,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
468
409
 
469
410
 
470
411
  def _format_metadata_comment(metadata: dict[str, str]) -> str:
471
- """Format metadata as a Markdown comment block.
472
-
473
- Args:
474
- metadata: Dictionary of metadata key-value pairs.
475
-
476
- Returns:
477
- Formatted metadata comment block.
478
- """
479
412
  if not metadata:
480
413
  return ""
481
414
 
@@ -511,64 +444,87 @@ def convert_to_markdown(
511
444
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
512
445
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
513
446
  keep_inline_images_in: Iterable[str] | None = None,
447
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
448
+ list_indent_width: int = 4,
514
449
  newline_style: Literal["spaces", "backslash"] = SPACES,
450
+ preprocess_html: bool = False,
451
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
452
+ remove_forms: bool = True,
453
+ remove_navigation: bool = True,
515
454
  strip: str | Iterable[str] | None = None,
516
455
  strip_newlines: bool = False,
517
456
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
518
457
  sub_symbol: str = "",
519
458
  sup_symbol: str = "",
459
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
520
460
  wrap: bool = False,
521
461
  wrap_width: int = 80,
522
- preprocess_html: bool = False,
523
- preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
524
- remove_navigation: bool = True,
525
- remove_forms: bool = True,
526
462
  ) -> str:
527
- """Convert HTML to Markdown.
463
+ """Convert HTML content to Markdown format.
528
464
 
529
- Args:
530
- source: An HTML document or a an initialized instance of BeautifulSoup.
531
- stream_processing: Use streaming processing for large documents. Defaults to False.
532
- chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
533
- chunk_callback: Optional callback function called with each processed chunk.
534
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
535
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
536
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
537
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
538
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
539
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
540
- code_language_callback: Function to dynamically determine the language for code blocks.
541
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
542
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
543
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
544
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
545
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
546
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
547
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
548
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
549
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
550
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
551
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
552
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
553
- strip: Tags to strip from the output. Defaults to None.
554
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
555
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
556
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
557
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
558
- wrap: Wrap text to the specified width. Defaults to False.
559
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
560
- preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
561
- preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
562
- remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
563
- remove_forms: Remove form elements during preprocessing. Defaults to True.
465
+ This is the main entry point for converting HTML to Markdown. It supports
466
+ various customization options for controlling the conversion behavior.
564
467
 
565
- Raises:
566
- ConflictingOptionsError: If both 'strip' and 'convert' are specified.
567
- EmptyHtmlError: When the input HTML is empty.
568
- MissingDependencyError: When lxml parser is requested but not installed.
468
+ Args:
469
+ source: HTML string or BeautifulSoup object to convert.
470
+ stream_processing: Enable streaming mode for large documents.
471
+ chunk_size: Size of chunks for streaming processing.
472
+ chunk_callback: Callback for processing chunks in streaming mode.
473
+ progress_callback: Callback for progress updates (current, total).
474
+ parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
475
+ autolinks: Convert URLs to automatic links.
476
+ bullets: Characters to use for unordered list bullets.
477
+ code_language: Default language for code blocks.
478
+ code_language_callback: Callback to determine code language from element.
479
+ convert: HTML tags to convert to Markdown.
480
+ convert_as_inline: Treat block elements as inline during conversion.
481
+ custom_converters: Custom converters for specific HTML elements.
482
+ default_title: Add a default title if none exists.
483
+ escape_asterisks: Escape asterisk characters in text.
484
+ escape_misc: Escape miscellaneous Markdown characters.
485
+ escape_underscores: Escape underscore characters in text.
486
+ extract_metadata: Extract metadata from HTML head.
487
+ heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
488
+ highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
489
+ keep_inline_images_in: Parent tags where images should remain inline.
490
+ list_indent_type: Type of indentation for lists ('spaces', 'tabs').
491
+ list_indent_width: Number of spaces for list indentation.
492
+ newline_style: Style for newlines ('spaces', 'backslash').
493
+ preprocess_html: Enable HTML preprocessing to clean up content.
494
+ preprocessing_preset: Preprocessing aggressiveness level.
495
+ remove_forms: Remove form elements during preprocessing.
496
+ remove_navigation: Remove navigation elements during preprocessing.
497
+ strip: HTML tags to strip from output.
498
+ strip_newlines: Remove newlines from HTML before processing.
499
+ strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
500
+ sub_symbol: Symbol for subscript text.
501
+ sup_symbol: Symbol for superscript text.
502
+ whitespace_mode: How to handle whitespace ('normalized', 'strict').
503
+ wrap: Enable text wrapping.
504
+ wrap_width: Column width for text wrapping.
569
505
 
570
506
  Returns:
571
- str: A string of Markdown-formatted text converted from the given HTML.
507
+ The converted Markdown string.
508
+
509
+ Raises:
510
+ EmptyHtmlError: If the HTML input is empty.
511
+ MissingDependencyError: If required dependencies are not installed.
512
+ ConflictingOptionsError: If conflicting options are provided.
513
+
514
+ Examples:
515
+ Basic conversion:
516
+ >>> html = "<h1>Title</h1><p>Content</p>"
517
+ >>> convert_to_markdown(html)
518
+ 'Title\\n=====\\n\\nContent\\n\\n'
519
+
520
+ With custom options:
521
+ >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
522
+ '# Title\\n\\nContent\\n\\n'
523
+
524
+ Discord-compatible lists (2-space indent):
525
+ >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
526
+ >>> convert_to_markdown(html, list_indent_width=2)
527
+ '* Item 1\\n* Item 2\\n\\n'
572
528
  """
573
529
  if isinstance(source, str):
574
530
  if (
@@ -665,6 +621,7 @@ def convert_to_markdown(
665
621
  sup_symbol=sup_symbol,
666
622
  wrap=wrap,
667
623
  wrap_width=wrap_width,
624
+ whitespace_mode=whitespace_mode,
668
625
  ):
669
626
  if chunk_callback:
670
627
  chunk_callback(chunk)
@@ -681,9 +638,12 @@ def convert_to_markdown(
681
638
 
682
639
  sink = StringSink()
683
640
 
641
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
642
+
684
643
  _process_html_core(
685
644
  source,
686
645
  sink,
646
+ whitespace_handler=whitespace_handler,
687
647
  parser=parser,
688
648
  autolinks=autolinks,
689
649
  bullets=bullets,
@@ -700,6 +660,8 @@ def convert_to_markdown(
700
660
  heading_style=heading_style,
701
661
  highlight_style=highlight_style,
702
662
  keep_inline_images_in=keep_inline_images_in,
663
+ list_indent_type=list_indent_type,
664
+ list_indent_width=list_indent_width,
703
665
  newline_style=newline_style,
704
666
  strip=strip,
705
667
  strip_newlines=strip_newlines,
@@ -761,34 +723,25 @@ def convert_to_markdown(
761
723
 
762
724
 
763
725
  class OutputSink:
764
- """Abstract output sink for processed markdown text."""
765
-
766
726
  def write(self, text: str) -> None:
767
- """Write text to the sink."""
768
727
  raise NotImplementedError
769
728
 
770
729
  def finalize(self) -> None:
771
- """Finalize the output."""
730
+ pass
772
731
 
773
732
 
774
733
  class StringSink(OutputSink):
775
- """Collects all output into a single string."""
776
-
777
734
  def __init__(self) -> None:
778
735
  self.buffer = StringIO()
779
736
 
780
737
  def write(self, text: str) -> None:
781
- """Write text to the buffer."""
782
738
  self.buffer.write(text)
783
739
 
784
740
  def get_result(self) -> str:
785
- """Get the complete result string."""
786
741
  return self.buffer.getvalue()
787
742
 
788
743
 
789
744
  class StreamingSink(OutputSink):
790
- """Yields chunks of output for streaming processing."""
791
-
792
745
  def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
793
746
  self.chunk_size = chunk_size
794
747
  self.progress_callback = progress_callback
@@ -799,7 +752,6 @@ class StreamingSink(OutputSink):
799
752
  self.chunks: list[str] = []
800
753
 
801
754
  def write(self, text: str) -> None:
802
- """Write text and yield chunks when threshold is reached."""
803
755
  if not text:
804
756
  return
805
757
 
@@ -822,7 +774,6 @@ class StreamingSink(OutputSink):
822
774
  self.buffer_size = len(current_content)
823
775
 
824
776
  def finalize(self) -> None:
825
- """Finalize and yield any remaining content."""
826
777
  if self.buffer_size > 0:
827
778
  content = self.buffer.getvalue()
828
779
  self.chunks.append(content)
@@ -830,11 +781,9 @@ class StreamingSink(OutputSink):
830
781
  self._update_progress()
831
782
 
832
783
  def get_chunks(self) -> Generator[str, None, None]:
833
- """Get all chunks yielded during processing."""
834
784
  yield from self.chunks
835
785
 
836
786
  def _find_split_position(self, content: str) -> int:
837
- """Find optimal position to split content for chunks."""
838
787
  target = self.chunk_size
839
788
  lookahead = min(100, len(content) - target)
840
789
 
@@ -847,7 +796,6 @@ class StreamingSink(OutputSink):
847
796
  return min(target, len(content))
848
797
 
849
798
  def _update_progress(self) -> None:
850
- """Update progress if callback is provided."""
851
799
  if self.progress_callback:
852
800
  self.progress_callback(self.processed_bytes, self.total_bytes)
853
801
 
@@ -856,6 +804,7 @@ def _process_html_core(
856
804
  source: str | BeautifulSoup,
857
805
  sink: OutputSink,
858
806
  *,
807
+ whitespace_handler: WhitespaceHandler,
859
808
  parser: str | None = None,
860
809
  autolinks: bool,
861
810
  bullets: str,
@@ -872,6 +821,8 @@ def _process_html_core(
872
821
  heading_style: Literal["underlined", "atx", "atx_closed"],
873
822
  highlight_style: Literal["double-equal", "html", "bold"],
874
823
  keep_inline_images_in: Iterable[str] | None,
824
+ list_indent_type: str,
825
+ list_indent_width: int,
875
826
  newline_style: Literal["spaces", "backslash"],
876
827
  strip: str | Iterable[str] | None,
877
828
  strip_newlines: bool,
@@ -881,7 +832,6 @@ def _process_html_core(
881
832
  wrap: bool,
882
833
  wrap_width: int,
883
834
  ) -> None:
884
- """Core HTML to Markdown processing logic shared by both regular and streaming."""
885
835
  token = _ancestor_cache.set({})
886
836
 
887
837
  try:
@@ -921,6 +871,8 @@ def _process_html_core(
921
871
  heading_style=heading_style,
922
872
  highlight_style=highlight_style,
923
873
  keep_inline_images_in=keep_inline_images_in,
874
+ list_indent_type=list_indent_type,
875
+ list_indent_width=list_indent_width,
924
876
  newline_style=newline_style,
925
877
  strong_em_symbol=strong_em_symbol,
926
878
  sub_symbol=sub_symbol,
@@ -948,6 +900,7 @@ def _process_html_core(
948
900
  escape_misc=escape_misc,
949
901
  escape_asterisks=escape_asterisks,
950
902
  escape_underscores=escape_underscores,
903
+ whitespace_handler=whitespace_handler,
951
904
  )
952
905
  sink.write(text)
953
906
  context += text
@@ -961,6 +914,7 @@ def _process_html_core(
961
914
  escape_misc=escape_misc,
962
915
  escape_underscores=escape_underscores,
963
916
  strip=_as_optional_set(strip),
917
+ whitespace_handler=whitespace_handler,
964
918
  context_before=context[-2:],
965
919
  )
966
920
  sink.write(text)
@@ -992,54 +946,18 @@ def convert_to_markdown_stream(
992
946
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
993
947
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
994
948
  keep_inline_images_in: Iterable[str] | None = None,
949
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
950
+ list_indent_width: int = 4,
995
951
  newline_style: Literal["spaces", "backslash"] = SPACES,
996
952
  strip: str | Iterable[str] | None = None,
997
953
  strip_newlines: bool = False,
998
954
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
999
955
  sub_symbol: str = "",
1000
956
  sup_symbol: str = "",
957
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
1001
958
  wrap: bool = False,
1002
959
  wrap_width: int = 80,
1003
960
  ) -> Generator[str, None, None]:
1004
- """Convert HTML to Markdown using streaming/chunked processing.
1005
-
1006
- This function yields chunks of converted Markdown text, allowing for
1007
- memory-efficient processing of large HTML documents. The output is guaranteed
1008
- to be identical to convert_to_markdown().
1009
-
1010
- Args:
1011
- source: An HTML document or a an initialized instance of BeautifulSoup.
1012
- chunk_size: Size of chunks to yield (approximate, in characters).
1013
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
1014
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
1015
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
1016
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
1017
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
1018
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
1019
- code_language_callback: Function to dynamically determine the language for code blocks.
1020
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
1021
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
1022
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
1023
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
1024
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
1025
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
1026
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
1027
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
1028
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
1029
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
1030
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
1031
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
1032
- strip: Tags to strip from the output. Defaults to None.
1033
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
1034
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
1035
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
1036
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
1037
- wrap: Wrap text to the specified width. Defaults to False.
1038
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
1039
-
1040
- Yields:
1041
- str: Chunks of Markdown-formatted text.
1042
- """
1043
961
  sink = StreamingSink(chunk_size, progress_callback)
1044
962
 
1045
963
  if isinstance(source, str):
@@ -1047,9 +965,12 @@ def convert_to_markdown_stream(
1047
965
  elif isinstance(source, BeautifulSoup):
1048
966
  sink.total_bytes = len(str(source))
1049
967
 
968
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
969
+
1050
970
  _process_html_core(
1051
971
  source,
1052
972
  sink,
973
+ whitespace_handler=whitespace_handler,
1053
974
  parser=parser,
1054
975
  autolinks=autolinks,
1055
976
  bullets=bullets,
@@ -1066,6 +987,8 @@ def convert_to_markdown_stream(
1066
987
  heading_style=heading_style,
1067
988
  highlight_style=highlight_style,
1068
989
  keep_inline_images_in=keep_inline_images_in,
990
+ list_indent_type=list_indent_type,
991
+ list_indent_width=list_indent_width,
1069
992
  newline_style=newline_style,
1070
993
  strip=strip,
1071
994
  strip_newlines=strip_newlines,