html-to-markdown 1.9.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
33
33
  DOUBLE_EQUAL,
34
34
  SPACES,
35
35
  UNDERLINED,
36
+ WHITESPACE_NORMALIZED,
36
37
  html_heading_re,
37
- whitespace_re,
38
38
  )
39
39
  from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
40
40
  from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
41
41
  from html_to_markdown.utils import escape
42
+ from html_to_markdown.whitespace import WhitespaceHandler
42
43
 
43
44
  if TYPE_CHECKING:
44
45
  from collections.abc import Iterable
@@ -143,6 +144,12 @@ SupportedTag = Literal[
143
144
  ]
144
145
 
145
146
 
147
+ def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
148
+ if list_indent_type == "tabs":
149
+ return "\t"
150
+ return " " * list_indent_width
151
+
152
+
146
153
  def _is_nested_tag(el: PageElement) -> bool:
147
154
  return isinstance(el, Tag) and el.name in {
148
155
  "ol",
@@ -170,6 +177,7 @@ def _process_tag(
170
177
  escape_misc: bool,
171
178
  escape_underscores: bool,
172
179
  strip: set[str] | None,
180
+ whitespace_handler: WhitespaceHandler,
173
181
  context_before: str = "",
174
182
  ) -> str:
175
183
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
@@ -218,6 +226,7 @@ def _process_tag(
218
226
  escape_misc=escape_misc,
219
227
  escape_asterisks=escape_asterisks,
220
228
  escape_underscores=escape_underscores,
229
+ whitespace_handler=whitespace_handler,
221
230
  )
222
231
  )
223
232
  elif isinstance(el, Tag):
@@ -232,6 +241,7 @@ def _process_tag(
232
241
  escape_misc=escape_misc,
233
242
  escape_underscores=escape_underscores,
234
243
  strip=strip,
244
+ whitespace_handler=whitespace_handler,
235
245
  context_before=(context_before + current_text)[-2:],
236
246
  )
237
247
  )
@@ -248,6 +258,18 @@ def _process_tag(
248
258
  if n_eol_to_add > 0:
249
259
  prefix = "\n" * n_eol_to_add
250
260
  return f"{prefix}{rendered}"
261
+
262
+ from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
263
+
264
+ is_block_element = tag.name.lower() in BLOCK_ELEMENTS
265
+ if (
266
+ is_block_element
267
+ and not convert_as_inline
268
+ and context_before
269
+ and not context_before.endswith("\n")
270
+ and rendered.strip()
271
+ ):
272
+ return f"\n\n{rendered}"
251
273
  return rendered
252
274
 
253
275
  return text
@@ -259,6 +281,7 @@ def _process_text(
259
281
  escape_misc: bool,
260
282
  escape_asterisks: bool,
261
283
  escape_underscores: bool,
284
+ whitespace_handler: WhitespaceHandler,
262
285
  ) -> str:
263
286
  text = str(el) or ""
264
287
 
@@ -275,69 +298,9 @@ def _process_text(
275
298
  if len(ancestor_names) > 10:
276
299
  break
277
300
 
278
- if "pre" not in ancestor_names:
279
- if text.strip() == "":
280
- if "\n" in text:
281
- text = ""
282
- else:
283
- block_elements = {
284
- "p",
285
- "ul",
286
- "ol",
287
- "div",
288
- "blockquote",
289
- "pre",
290
- "h1",
291
- "h2",
292
- "h3",
293
- "h4",
294
- "h5",
295
- "h6",
296
- "table",
297
- "dl",
298
- "hr",
299
- "figure",
300
- "article",
301
- "section",
302
- "nav",
303
- "aside",
304
- "header",
305
- "footer",
306
- "main",
307
- "form",
308
- "fieldset",
309
- }
310
-
311
- prev_sibling = el.previous_sibling
312
- next_sibling = el.next_sibling
313
-
314
- if (
315
- prev_sibling
316
- and hasattr(prev_sibling, "name")
317
- and prev_sibling.name in block_elements
318
- and next_sibling
319
- and hasattr(next_sibling, "name")
320
- and next_sibling.name in block_elements
321
- ):
322
- text = ""
323
- else:
324
- text = " " if text else ""
325
- else:
326
- has_leading_space = text.startswith((" ", "\t"))
327
- has_trailing_space = text.endswith((" ", "\t"))
328
-
329
- middle_content = (
330
- text[1:-1]
331
- if has_leading_space and has_trailing_space
332
- else text[1:]
333
- if has_leading_space
334
- else text[:-1]
335
- if has_trailing_space
336
- else text
337
- )
301
+ in_pre = bool(ancestor_names.intersection({"pre"}))
338
302
 
339
- middle_content = whitespace_re.sub(" ", middle_content.strip())
340
- text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
303
+ text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
341
304
 
342
305
  if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
343
306
  text = escape(
@@ -357,7 +320,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
357
320
 
358
321
 
359
322
  def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
360
- """Get set of ancestor tag names for efficient parent checking."""
361
323
  elem_id = id(element)
362
324
  cache = _ancestor_cache.get()
363
325
  if cache is None:
@@ -388,7 +350,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
388
350
 
389
351
 
390
352
  def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
391
- """Check if element has any of the specified ancestors efficiently."""
392
353
  if isinstance(tag_names, str):
393
354
  tag_names = [tag_names]
394
355
 
@@ -409,19 +370,11 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
409
370
  if value is None:
410
371
  return None
411
372
  if isinstance(value, str):
412
- return set(",".split(value))
373
+ return set(value.split(","))
413
374
  return {*chain(*[v.split(",") for v in value])}
414
375
 
415
376
 
416
377
  def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
417
- """Extract metadata from HTML document.
418
-
419
- Args:
420
- soup: BeautifulSoup instance of the HTML document.
421
-
422
- Returns:
423
- Dictionary of metadata key-value pairs.
424
- """
425
378
  metadata = {}
426
379
 
427
380
  title_tag = soup.find("title")
@@ -468,14 +421,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
468
421
 
469
422
 
470
423
  def _format_metadata_comment(metadata: dict[str, str]) -> str:
471
- """Format metadata as a Markdown comment block.
472
-
473
- Args:
474
- metadata: Dictionary of metadata key-value pairs.
475
-
476
- Returns:
477
- Formatted metadata comment block.
478
- """
479
424
  if not metadata:
480
425
  return ""
481
426
 
@@ -511,64 +456,87 @@ def convert_to_markdown(
511
456
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
512
457
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
513
458
  keep_inline_images_in: Iterable[str] | None = None,
459
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
460
+ list_indent_width: int = 4,
514
461
  newline_style: Literal["spaces", "backslash"] = SPACES,
462
+ preprocess_html: bool = False,
463
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
464
+ remove_forms: bool = True,
465
+ remove_navigation: bool = True,
515
466
  strip: str | Iterable[str] | None = None,
516
467
  strip_newlines: bool = False,
517
468
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
518
469
  sub_symbol: str = "",
519
470
  sup_symbol: str = "",
471
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
520
472
  wrap: bool = False,
521
473
  wrap_width: int = 80,
522
- preprocess_html: bool = False,
523
- preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
524
- remove_navigation: bool = True,
525
- remove_forms: bool = True,
526
474
  ) -> str:
527
- """Convert HTML to Markdown.
475
+ """Convert HTML content to Markdown format.
528
476
 
529
- Args:
530
- source: An HTML document or a an initialized instance of BeautifulSoup.
531
- stream_processing: Use streaming processing for large documents. Defaults to False.
532
- chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
533
- chunk_callback: Optional callback function called with each processed chunk.
534
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
535
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
536
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
537
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
538
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
539
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
540
- code_language_callback: Function to dynamically determine the language for code blocks.
541
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
542
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
543
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
544
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
545
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
546
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
547
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
548
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
549
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
550
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
551
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
552
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
553
- strip: Tags to strip from the output. Defaults to None.
554
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
555
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
556
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
557
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
558
- wrap: Wrap text to the specified width. Defaults to False.
559
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
560
- preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
561
- preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
562
- remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
563
- remove_forms: Remove form elements during preprocessing. Defaults to True.
477
+ This is the main entry point for converting HTML to Markdown. It supports
478
+ various customization options for controlling the conversion behavior.
564
479
 
565
- Raises:
566
- ConflictingOptionsError: If both 'strip' and 'convert' are specified.
567
- EmptyHtmlError: When the input HTML is empty.
568
- MissingDependencyError: When lxml parser is requested but not installed.
480
+ Args:
481
+ source: HTML string or BeautifulSoup object to convert.
482
+ stream_processing: Enable streaming mode for large documents.
483
+ chunk_size: Size of chunks for streaming processing.
484
+ chunk_callback: Callback for processing chunks in streaming mode.
485
+ progress_callback: Callback for progress updates (current, total).
486
+ parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
487
+ autolinks: Convert URLs to automatic links.
488
+ bullets: Characters to use for unordered list bullets.
489
+ code_language: Default language for code blocks.
490
+ code_language_callback: Callback to determine code language from element.
491
+ convert: HTML tags to convert to Markdown.
492
+ convert_as_inline: Treat block elements as inline during conversion.
493
+ custom_converters: Custom converters for specific HTML elements.
494
+ default_title: Add a default title if none exists.
495
+ escape_asterisks: Escape asterisk characters in text.
496
+ escape_misc: Escape miscellaneous Markdown characters.
497
+ escape_underscores: Escape underscore characters in text.
498
+ extract_metadata: Extract metadata from HTML head.
499
+ heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
500
+ highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
501
+ keep_inline_images_in: Parent tags where images should remain inline.
502
+ list_indent_type: Type of indentation for lists ('spaces', 'tabs').
503
+ list_indent_width: Number of spaces for list indentation.
504
+ newline_style: Style for newlines ('spaces', 'backslash').
505
+ preprocess_html: Enable HTML preprocessing to clean up content.
506
+ preprocessing_preset: Preprocessing aggressiveness level.
507
+ remove_forms: Remove form elements during preprocessing.
508
+ remove_navigation: Remove navigation elements during preprocessing.
509
+ strip: HTML tags to strip from output.
510
+ strip_newlines: Remove newlines from HTML before processing.
511
+ strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
512
+ sub_symbol: Symbol for subscript text.
513
+ sup_symbol: Symbol for superscript text.
514
+ whitespace_mode: How to handle whitespace ('normalized', 'strict').
515
+ wrap: Enable text wrapping.
516
+ wrap_width: Column width for text wrapping.
569
517
 
570
518
  Returns:
571
- str: A string of Markdown-formatted text converted from the given HTML.
519
+ The converted Markdown string.
520
+
521
+ Raises:
522
+ EmptyHtmlError: If the HTML input is empty.
523
+ MissingDependencyError: If required dependencies are not installed.
524
+ ConflictingOptionsError: If conflicting options are provided.
525
+
526
+ Examples:
527
+ Basic conversion:
528
+ >>> html = "<h1>Title</h1><p>Content</p>"
529
+ >>> convert_to_markdown(html)
530
+ 'Title\\n=====\\n\\nContent\\n\\n'
531
+
532
+ With custom options:
533
+ >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
534
+ '# Title\\n\\nContent\\n\\n'
535
+
536
+ Discord-compatible lists (2-space indent):
537
+ >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
538
+ >>> convert_to_markdown(html, list_indent_width=2)
539
+ '* Item 1\\n* Item 2\\n\\n'
572
540
  """
573
541
  if isinstance(source, str):
574
542
  if (
@@ -665,6 +633,7 @@ def convert_to_markdown(
665
633
  sup_symbol=sup_symbol,
666
634
  wrap=wrap,
667
635
  wrap_width=wrap_width,
636
+ whitespace_mode=whitespace_mode,
668
637
  ):
669
638
  if chunk_callback:
670
639
  chunk_callback(chunk)
@@ -681,9 +650,12 @@ def convert_to_markdown(
681
650
 
682
651
  sink = StringSink()
683
652
 
653
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
654
+
684
655
  _process_html_core(
685
656
  source,
686
657
  sink,
658
+ whitespace_handler=whitespace_handler,
687
659
  parser=parser,
688
660
  autolinks=autolinks,
689
661
  bullets=bullets,
@@ -700,6 +672,8 @@ def convert_to_markdown(
700
672
  heading_style=heading_style,
701
673
  highlight_style=highlight_style,
702
674
  keep_inline_images_in=keep_inline_images_in,
675
+ list_indent_type=list_indent_type,
676
+ list_indent_width=list_indent_width,
703
677
  newline_style=newline_style,
704
678
  strip=strip,
705
679
  strip_newlines=strip_newlines,
@@ -761,34 +735,25 @@ def convert_to_markdown(
761
735
 
762
736
 
763
737
  class OutputSink:
764
- """Abstract output sink for processed markdown text."""
765
-
766
738
  def write(self, text: str) -> None:
767
- """Write text to the sink."""
768
739
  raise NotImplementedError
769
740
 
770
741
  def finalize(self) -> None:
771
- """Finalize the output."""
742
+ pass
772
743
 
773
744
 
774
745
  class StringSink(OutputSink):
775
- """Collects all output into a single string."""
776
-
777
746
  def __init__(self) -> None:
778
747
  self.buffer = StringIO()
779
748
 
780
749
  def write(self, text: str) -> None:
781
- """Write text to the buffer."""
782
750
  self.buffer.write(text)
783
751
 
784
752
  def get_result(self) -> str:
785
- """Get the complete result string."""
786
753
  return self.buffer.getvalue()
787
754
 
788
755
 
789
756
  class StreamingSink(OutputSink):
790
- """Yields chunks of output for streaming processing."""
791
-
792
757
  def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
793
758
  self.chunk_size = chunk_size
794
759
  self.progress_callback = progress_callback
@@ -799,7 +764,6 @@ class StreamingSink(OutputSink):
799
764
  self.chunks: list[str] = []
800
765
 
801
766
  def write(self, text: str) -> None:
802
- """Write text and yield chunks when threshold is reached."""
803
767
  if not text:
804
768
  return
805
769
 
@@ -822,7 +786,6 @@ class StreamingSink(OutputSink):
822
786
  self.buffer_size = len(current_content)
823
787
 
824
788
  def finalize(self) -> None:
825
- """Finalize and yield any remaining content."""
826
789
  if self.buffer_size > 0:
827
790
  content = self.buffer.getvalue()
828
791
  self.chunks.append(content)
@@ -830,11 +793,9 @@ class StreamingSink(OutputSink):
830
793
  self._update_progress()
831
794
 
832
795
  def get_chunks(self) -> Generator[str, None, None]:
833
- """Get all chunks yielded during processing."""
834
796
  yield from self.chunks
835
797
 
836
798
  def _find_split_position(self, content: str) -> int:
837
- """Find optimal position to split content for chunks."""
838
799
  target = self.chunk_size
839
800
  lookahead = min(100, len(content) - target)
840
801
 
@@ -847,7 +808,6 @@ class StreamingSink(OutputSink):
847
808
  return min(target, len(content))
848
809
 
849
810
  def _update_progress(self) -> None:
850
- """Update progress if callback is provided."""
851
811
  if self.progress_callback:
852
812
  self.progress_callback(self.processed_bytes, self.total_bytes)
853
813
 
@@ -856,6 +816,7 @@ def _process_html_core(
856
816
  source: str | BeautifulSoup,
857
817
  sink: OutputSink,
858
818
  *,
819
+ whitespace_handler: WhitespaceHandler,
859
820
  parser: str | None = None,
860
821
  autolinks: bool,
861
822
  bullets: str,
@@ -872,6 +833,8 @@ def _process_html_core(
872
833
  heading_style: Literal["underlined", "atx", "atx_closed"],
873
834
  highlight_style: Literal["double-equal", "html", "bold"],
874
835
  keep_inline_images_in: Iterable[str] | None,
836
+ list_indent_type: str,
837
+ list_indent_width: int,
875
838
  newline_style: Literal["spaces", "backslash"],
876
839
  strip: str | Iterable[str] | None,
877
840
  strip_newlines: bool,
@@ -881,20 +844,10 @@ def _process_html_core(
881
844
  wrap: bool,
882
845
  wrap_width: int,
883
846
  ) -> None:
884
- """Core HTML to Markdown processing logic shared by both regular and streaming."""
885
847
  token = _ancestor_cache.set({})
886
848
 
887
849
  try:
888
850
  if isinstance(source, str):
889
- if (
890
- heading_style == UNDERLINED
891
- and "Header" in source
892
- and "\n------\n\n" in source
893
- and "Next paragraph" in source
894
- ):
895
- sink.write(source)
896
- return
897
-
898
851
  if strip_newlines:
899
852
  source = source.replace("\n", " ").replace("\r", " ")
900
853
 
@@ -921,6 +874,8 @@ def _process_html_core(
921
874
  heading_style=heading_style,
922
875
  highlight_style=highlight_style,
923
876
  keep_inline_images_in=keep_inline_images_in,
877
+ list_indent_type=list_indent_type,
878
+ list_indent_width=list_indent_width,
924
879
  newline_style=newline_style,
925
880
  strong_em_symbol=strong_em_symbol,
926
881
  sub_symbol=sub_symbol,
@@ -948,6 +903,7 @@ def _process_html_core(
948
903
  escape_misc=escape_misc,
949
904
  escape_asterisks=escape_asterisks,
950
905
  escape_underscores=escape_underscores,
906
+ whitespace_handler=whitespace_handler,
951
907
  )
952
908
  sink.write(text)
953
909
  context += text
@@ -961,6 +917,7 @@ def _process_html_core(
961
917
  escape_misc=escape_misc,
962
918
  escape_underscores=escape_underscores,
963
919
  strip=_as_optional_set(strip),
920
+ whitespace_handler=whitespace_handler,
964
921
  context_before=context[-2:],
965
922
  )
966
923
  sink.write(text)
@@ -992,54 +949,18 @@ def convert_to_markdown_stream(
992
949
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
993
950
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
994
951
  keep_inline_images_in: Iterable[str] | None = None,
952
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
953
+ list_indent_width: int = 4,
995
954
  newline_style: Literal["spaces", "backslash"] = SPACES,
996
955
  strip: str | Iterable[str] | None = None,
997
956
  strip_newlines: bool = False,
998
957
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
999
958
  sub_symbol: str = "",
1000
959
  sup_symbol: str = "",
960
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
1001
961
  wrap: bool = False,
1002
962
  wrap_width: int = 80,
1003
963
  ) -> Generator[str, None, None]:
1004
- """Convert HTML to Markdown using streaming/chunked processing.
1005
-
1006
- This function yields chunks of converted Markdown text, allowing for
1007
- memory-efficient processing of large HTML documents. The output is guaranteed
1008
- to be identical to convert_to_markdown().
1009
-
1010
- Args:
1011
- source: An HTML document or a an initialized instance of BeautifulSoup.
1012
- chunk_size: Size of chunks to yield (approximate, in characters).
1013
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
1014
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
1015
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
1016
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
1017
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
1018
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
1019
- code_language_callback: Function to dynamically determine the language for code blocks.
1020
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
1021
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
1022
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
1023
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
1024
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
1025
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
1026
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
1027
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
1028
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
1029
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
1030
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
1031
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
1032
- strip: Tags to strip from the output. Defaults to None.
1033
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
1034
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
1035
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
1036
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
1037
- wrap: Wrap text to the specified width. Defaults to False.
1038
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
1039
-
1040
- Yields:
1041
- str: Chunks of Markdown-formatted text.
1042
- """
1043
964
  sink = StreamingSink(chunk_size, progress_callback)
1044
965
 
1045
966
  if isinstance(source, str):
@@ -1047,9 +968,12 @@ def convert_to_markdown_stream(
1047
968
  elif isinstance(source, BeautifulSoup):
1048
969
  sink.total_bytes = len(str(source))
1049
970
 
971
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
972
+
1050
973
  _process_html_core(
1051
974
  source,
1052
975
  sink,
976
+ whitespace_handler=whitespace_handler,
1053
977
  parser=parser,
1054
978
  autolinks=autolinks,
1055
979
  bullets=bullets,
@@ -1066,6 +990,8 @@ def convert_to_markdown_stream(
1066
990
  heading_style=heading_style,
1067
991
  highlight_style=highlight_style,
1068
992
  keep_inline_images_in=keep_inline_images_in,
993
+ list_indent_type=list_indent_type,
994
+ list_indent_width=list_indent_width,
1069
995
  newline_style=newline_style,
1070
996
  strip=strip,
1071
997
  strip_newlines=strip_newlines,
html_to_markdown/utils.py CHANGED
@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
6
6
 
7
7
 
8
8
  def chomp(text: str) -> tuple[str, str, str]:
9
- """Simplified whitespace handling for inline elements.
10
-
11
- For semantic markdown output, preserves leading/trailing spaces as single spaces
12
- and normalizes internal whitespace.
13
-
14
- Args:
15
- text: The text to chomp.
16
-
17
- Returns:
18
- A tuple containing the prefix, suffix, and the normalized text.
19
- """
20
9
  if not text:
21
10
  return "", "", ""
22
11
 
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
29
18
 
30
19
 
31
20
  def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
32
- """Escape special characters in text.
33
-
34
- Args:
35
- text: The text to escape.
36
- escape_misc: Whether to escape miscellaneous characters.
37
- escape_asterisks: Whether to escape asterisks.
38
- escape_underscores: Whether to escape underscores.
39
-
40
- Returns:
41
- The escaped text.
42
- """
43
21
  if not text:
44
22
  return ""
45
23
  if escape_misc:
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
52
30
  return text
53
31
 
54
32
 
55
- def indent(*, text: str, level: int) -> str:
56
- """Indent text by a given level.
57
-
58
- Args:
59
- text: The text to indent.
60
- level: The level of indentation.
61
-
62
- Returns:
63
- The indented text.
64
- """
65
- return line_beginning_re.sub("\t" * level, text) if text else ""
33
+ def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
34
+ return line_beginning_re.sub(indent_str * level, text) if text else ""
66
35
 
67
36
 
68
37
  def underline(*, text: str, pad_char: str) -> str:
69
- """Underline text with a given character.
70
-
71
- Args:
72
- text: The text to underline.
73
- pad_char: The character to use for underlining.
74
-
75
- Returns:
76
- The underlined text.
77
- """
78
38
  text = (text or "").rstrip()
79
39
  return f"{text}\n{pad_char * len(text)}\n\n" if text else ""