html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -33,12 +33,13 @@ from html_to_markdown.constants import (
33
33
  DOUBLE_EQUAL,
34
34
  SPACES,
35
35
  UNDERLINED,
36
+ WHITESPACE_NORMALIZED,
36
37
  html_heading_re,
37
- whitespace_re,
38
38
  )
39
39
  from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
40
40
  from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
41
41
  from html_to_markdown.utils import escape
42
+ from html_to_markdown.whitespace import WhitespaceHandler
42
43
 
43
44
  if TYPE_CHECKING:
44
45
  from collections.abc import Iterable
@@ -143,6 +144,12 @@ SupportedTag = Literal[
143
144
  ]
144
145
 
145
146
 
147
+ def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
148
+ if list_indent_type == "tabs":
149
+ return "\t"
150
+ return " " * list_indent_width
151
+
152
+
146
153
  def _is_nested_tag(el: PageElement) -> bool:
147
154
  return isinstance(el, Tag) and el.name in {
148
155
  "ol",
@@ -170,6 +177,7 @@ def _process_tag(
170
177
  escape_misc: bool,
171
178
  escape_underscores: bool,
172
179
  strip: set[str] | None,
180
+ whitespace_handler: WhitespaceHandler,
173
181
  context_before: str = "",
174
182
  ) -> str:
175
183
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
@@ -195,18 +203,14 @@ def _process_tag(
195
203
 
196
204
  children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
197
205
 
198
- # List of tags that return empty string when they have no content
199
206
  empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
200
207
 
201
208
  for i, el in enumerate(children):
202
209
  if isinstance(el, NavigableString):
203
- # Check if this is whitespace between empty elements
204
210
  if el.strip() == "" and i > 0 and i < len(children) - 1:
205
211
  prev_el = children[i - 1]
206
212
  next_el = children[i + 1]
207
213
 
208
- # If previous element was a tag that produced empty output
209
- # and next element is also a tag that could be empty, skip this whitespace
210
214
  if (
211
215
  isinstance(prev_el, Tag)
212
216
  and isinstance(next_el, Tag)
@@ -214,7 +218,6 @@ def _process_tag(
214
218
  and next_el.name.lower() in empty_when_no_content_tags
215
219
  and not prev_el.get_text().strip()
216
220
  ):
217
- # Previous tag is empty and next could be empty too, skip this whitespace
218
221
  continue
219
222
 
220
223
  text_parts.append(
@@ -223,6 +226,7 @@ def _process_tag(
223
226
  escape_misc=escape_misc,
224
227
  escape_asterisks=escape_asterisks,
225
228
  escape_underscores=escape_underscores,
229
+ whitespace_handler=whitespace_handler,
226
230
  )
227
231
  )
228
232
  elif isinstance(el, Tag):
@@ -237,6 +241,7 @@ def _process_tag(
237
241
  escape_misc=escape_misc,
238
242
  escape_underscores=escape_underscores,
239
243
  strip=strip,
244
+ whitespace_handler=whitespace_handler,
240
245
  context_before=(context_before + current_text)[-2:],
241
246
  )
242
247
  )
@@ -264,6 +269,7 @@ def _process_text(
264
269
  escape_misc: bool,
265
270
  escape_asterisks: bool,
266
271
  escape_underscores: bool,
272
+ whitespace_handler: WhitespaceHandler,
267
273
  ) -> str:
268
274
  text = str(el) or ""
269
275
 
@@ -280,76 +286,9 @@ def _process_text(
280
286
  if len(ancestor_names) > 10:
281
287
  break
282
288
 
283
- if "pre" not in ancestor_names:
284
- # Special case: if the text is only whitespace
285
- if text.strip() == "":
286
- # If it contains newlines, it's probably indentation whitespace, return empty
287
- if "\n" in text:
288
- text = ""
289
- else:
290
- # Check if this whitespace is between block elements
291
- # Define block elements that should not have whitespace between them
292
- block_elements = {
293
- "p",
294
- "ul",
295
- "ol",
296
- "div",
297
- "blockquote",
298
- "pre",
299
- "h1",
300
- "h2",
301
- "h3",
302
- "h4",
303
- "h5",
304
- "h6",
305
- "table",
306
- "dl",
307
- "hr",
308
- "figure",
309
- "article",
310
- "section",
311
- "nav",
312
- "aside",
313
- "header",
314
- "footer",
315
- "main",
316
- "form",
317
- "fieldset",
318
- }
319
-
320
- prev_sibling = el.previous_sibling
321
- next_sibling = el.next_sibling
322
-
323
- # Check if whitespace is between block elements
324
- if (
325
- prev_sibling
326
- and hasattr(prev_sibling, "name")
327
- and prev_sibling.name in block_elements
328
- and next_sibling
329
- and hasattr(next_sibling, "name")
330
- and next_sibling.name in block_elements
331
- ):
332
- # Remove whitespace between block elements
333
- text = ""
334
- else:
335
- # Otherwise it's inline whitespace, normalize to single space
336
- text = " " if text else ""
337
- else:
338
- has_leading_space = text.startswith((" ", "\t"))
339
- has_trailing_space = text.endswith((" ", "\t"))
340
-
341
- middle_content = (
342
- text[1:-1]
343
- if has_leading_space and has_trailing_space
344
- else text[1:]
345
- if has_leading_space
346
- else text[:-1]
347
- if has_trailing_space
348
- else text
349
- )
289
+ in_pre = bool(ancestor_names.intersection({"pre"}))
350
290
 
351
- middle_content = whitespace_re.sub(" ", middle_content.strip())
352
- text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
291
+ text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
353
292
 
354
293
  if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
355
294
  text = escape(
@@ -369,7 +308,6 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
369
308
 
370
309
 
371
310
  def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
372
- """Get set of ancestor tag names for efficient parent checking."""
373
311
  elem_id = id(element)
374
312
  cache = _ancestor_cache.get()
375
313
  if cache is None:
@@ -400,7 +338,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
400
338
 
401
339
 
402
340
  def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
403
- """Check if element has any of the specified ancestors efficiently."""
404
341
  if isinstance(tag_names, str):
405
342
  tag_names = [tag_names]
406
343
 
@@ -426,14 +363,6 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
426
363
 
427
364
 
428
365
  def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
429
- """Extract metadata from HTML document.
430
-
431
- Args:
432
- soup: BeautifulSoup instance of the HTML document.
433
-
434
- Returns:
435
- Dictionary of metadata key-value pairs.
436
- """
437
366
  metadata = {}
438
367
 
439
368
  title_tag = soup.find("title")
@@ -470,7 +399,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
470
399
  if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
471
400
  metadata["canonical"] = canonical["href"]
472
401
 
473
- # Extract link relations
474
402
  link_relations = {"author", "license", "alternate"}
475
403
  for rel_type in link_relations:
476
404
  link = soup.find("link", rel=rel_type, href=True)
@@ -481,14 +409,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
481
409
 
482
410
 
483
411
  def _format_metadata_comment(metadata: dict[str, str]) -> str:
484
- """Format metadata as a Markdown comment block.
485
-
486
- Args:
487
- metadata: Dictionary of metadata key-value pairs.
488
-
489
- Returns:
490
- Formatted metadata comment block.
491
- """
492
412
  if not metadata:
493
413
  return ""
494
414
 
@@ -524,64 +444,87 @@ def convert_to_markdown(
524
444
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
525
445
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
526
446
  keep_inline_images_in: Iterable[str] | None = None,
447
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
448
+ list_indent_width: int = 4,
527
449
  newline_style: Literal["spaces", "backslash"] = SPACES,
450
+ preprocess_html: bool = False,
451
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
452
+ remove_forms: bool = True,
453
+ remove_navigation: bool = True,
528
454
  strip: str | Iterable[str] | None = None,
529
455
  strip_newlines: bool = False,
530
456
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
531
457
  sub_symbol: str = "",
532
458
  sup_symbol: str = "",
459
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
533
460
  wrap: bool = False,
534
461
  wrap_width: int = 80,
535
- preprocess_html: bool = False,
536
- preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
537
- remove_navigation: bool = True,
538
- remove_forms: bool = True,
539
462
  ) -> str:
540
- """Convert HTML to Markdown.
463
+ """Convert HTML content to Markdown format.
541
464
 
542
- Args:
543
- source: An HTML document or a an initialized instance of BeautifulSoup.
544
- stream_processing: Use streaming processing for large documents. Defaults to False.
545
- chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
546
- chunk_callback: Optional callback function called with each processed chunk.
547
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
548
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
549
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
550
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
551
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
552
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
553
- code_language_callback: Function to dynamically determine the language for code blocks.
554
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
555
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
556
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
557
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
558
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
559
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
560
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
561
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
562
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
563
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
564
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
565
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
566
- strip: Tags to strip from the output. Defaults to None.
567
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
568
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
569
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
570
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
571
- wrap: Wrap text to the specified width. Defaults to False.
572
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
573
- preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
574
- preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
575
- remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
576
- remove_forms: Remove form elements during preprocessing. Defaults to True.
465
+ This is the main entry point for converting HTML to Markdown. It supports
466
+ various customization options for controlling the conversion behavior.
577
467
 
578
- Raises:
579
- ConflictingOptionsError: If both 'strip' and 'convert' are specified.
580
- EmptyHtmlError: When the input HTML is empty.
581
- MissingDependencyError: When lxml parser is requested but not installed.
468
+ Args:
469
+ source: HTML string or BeautifulSoup object to convert.
470
+ stream_processing: Enable streaming mode for large documents.
471
+ chunk_size: Size of chunks for streaming processing.
472
+ chunk_callback: Callback for processing chunks in streaming mode.
473
+ progress_callback: Callback for progress updates (current, total).
474
+ parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
475
+ autolinks: Convert URLs to automatic links.
476
+ bullets: Characters to use for unordered list bullets.
477
+ code_language: Default language for code blocks.
478
+ code_language_callback: Callback to determine code language from element.
479
+ convert: HTML tags to convert to Markdown.
480
+ convert_as_inline: Treat block elements as inline during conversion.
481
+ custom_converters: Custom converters for specific HTML elements.
482
+ default_title: Add a default title if none exists.
483
+ escape_asterisks: Escape asterisk characters in text.
484
+ escape_misc: Escape miscellaneous Markdown characters.
485
+ escape_underscores: Escape underscore characters in text.
486
+ extract_metadata: Extract metadata from HTML head.
487
+ heading_style: Style for headings ('underlined', 'atx', 'atx_closed').
488
+ highlight_style: Style for highlighting ('double-equal', 'html', 'bold').
489
+ keep_inline_images_in: Parent tags where images should remain inline.
490
+ list_indent_type: Type of indentation for lists ('spaces', 'tabs').
491
+ list_indent_width: Number of spaces for list indentation.
492
+ newline_style: Style for newlines ('spaces', 'backslash').
493
+ preprocess_html: Enable HTML preprocessing to clean up content.
494
+ preprocessing_preset: Preprocessing aggressiveness level.
495
+ remove_forms: Remove form elements during preprocessing.
496
+ remove_navigation: Remove navigation elements during preprocessing.
497
+ strip: HTML tags to strip from output.
498
+ strip_newlines: Remove newlines from HTML before processing.
499
+ strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
500
+ sub_symbol: Symbol for subscript text.
501
+ sup_symbol: Symbol for superscript text.
502
+ whitespace_mode: How to handle whitespace ('normalized', 'strict').
503
+ wrap: Enable text wrapping.
504
+ wrap_width: Column width for text wrapping.
582
505
 
583
506
  Returns:
584
- str: A string of Markdown-formatted text converted from the given HTML.
507
+ The converted Markdown string.
508
+
509
+ Raises:
510
+ EmptyHtmlError: If the HTML input is empty.
511
+ MissingDependencyError: If required dependencies are not installed.
512
+ ConflictingOptionsError: If conflicting options are provided.
513
+
514
+ Examples:
515
+ Basic conversion:
516
+ >>> html = "<h1>Title</h1><p>Content</p>"
517
+ >>> convert_to_markdown(html)
518
+ 'Title\\n=====\\n\\nContent\\n\\n'
519
+
520
+ With custom options:
521
+ >>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
522
+ '# Title\\n\\nContent\\n\\n'
523
+
524
+ Discord-compatible lists (2-space indent):
525
+ >>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
526
+ >>> convert_to_markdown(html, list_indent_width=2)
527
+ '* Item 1\\n* Item 2\\n\\n'
585
528
  """
586
529
  if isinstance(source, str):
587
530
  if (
@@ -595,8 +538,6 @@ def convert_to_markdown(
595
538
  if strip_newlines:
596
539
  source = source.replace("\n", " ").replace("\r", " ")
597
540
 
598
- # Fix lxml parsing of void elements like <wbr>
599
- # lxml incorrectly treats them as container tags
600
541
  source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
601
542
 
602
543
  if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
@@ -680,6 +621,7 @@ def convert_to_markdown(
680
621
  sup_symbol=sup_symbol,
681
622
  wrap=wrap,
682
623
  wrap_width=wrap_width,
624
+ whitespace_mode=whitespace_mode,
683
625
  ):
684
626
  if chunk_callback:
685
627
  chunk_callback(chunk)
@@ -696,9 +638,12 @@ def convert_to_markdown(
696
638
 
697
639
  sink = StringSink()
698
640
 
641
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
642
+
699
643
  _process_html_core(
700
644
  source,
701
645
  sink,
646
+ whitespace_handler=whitespace_handler,
702
647
  parser=parser,
703
648
  autolinks=autolinks,
704
649
  bullets=bullets,
@@ -715,6 +660,8 @@ def convert_to_markdown(
715
660
  heading_style=heading_style,
716
661
  highlight_style=highlight_style,
717
662
  keep_inline_images_in=keep_inline_images_in,
663
+ list_indent_type=list_indent_type,
664
+ list_indent_width=list_indent_width,
718
665
  newline_style=newline_style,
719
666
  strip=strip,
720
667
  strip_newlines=strip_newlines,
@@ -737,7 +684,6 @@ def convert_to_markdown(
737
684
  if leading_whitespace_match:
738
685
  leading_whitespace = leading_whitespace_match.group(0)
739
686
 
740
- # Check if input contains list or heading tags
741
687
  list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
742
688
  if any(tag in original_input for tag in list_heading_tags):
743
689
  leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
@@ -751,19 +697,14 @@ def convert_to_markdown(
751
697
  def normalize_spaces_outside_code(text: str) -> str:
752
698
  parts = text.split("```")
753
699
  for i in range(0, len(parts), 2):
754
- # Process each line separately to preserve leading spaces
755
700
  lines = parts[i].split("\n")
756
701
  processed_lines = []
757
702
  for line in lines:
758
- # Preserve definition list formatting (: followed by 3 spaces)
759
703
  def_parts = re.split(r"(:\s{3})", line)
760
704
  for j in range(0, len(def_parts), 2):
761
- # Only normalize non-definition-list parts
762
- # Also preserve leading spaces (for list indentation)
763
705
  match = re.match(r"^(\s*)(.*)", def_parts[j])
764
706
  if match:
765
707
  leading_spaces, rest = match.groups()
766
- # Only normalize multiple spaces that are not at the beginning
767
708
  rest = re.sub(r" {3,}", " ", rest)
768
709
  def_parts[j] = leading_spaces + rest
769
710
  processed_lines.append("".join(def_parts))
@@ -782,34 +723,25 @@ def convert_to_markdown(
782
723
 
783
724
 
784
725
  class OutputSink:
785
- """Abstract output sink for processed markdown text."""
786
-
787
726
  def write(self, text: str) -> None:
788
- """Write text to the sink."""
789
727
  raise NotImplementedError
790
728
 
791
729
  def finalize(self) -> None:
792
- """Finalize the output."""
730
+ pass
793
731
 
794
732
 
795
733
  class StringSink(OutputSink):
796
- """Collects all output into a single string."""
797
-
798
734
  def __init__(self) -> None:
799
735
  self.buffer = StringIO()
800
736
 
801
737
  def write(self, text: str) -> None:
802
- """Write text to the buffer."""
803
738
  self.buffer.write(text)
804
739
 
805
740
  def get_result(self) -> str:
806
- """Get the complete result string."""
807
741
  return self.buffer.getvalue()
808
742
 
809
743
 
810
744
  class StreamingSink(OutputSink):
811
- """Yields chunks of output for streaming processing."""
812
-
813
745
  def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
814
746
  self.chunk_size = chunk_size
815
747
  self.progress_callback = progress_callback
@@ -820,7 +752,6 @@ class StreamingSink(OutputSink):
820
752
  self.chunks: list[str] = []
821
753
 
822
754
  def write(self, text: str) -> None:
823
- """Write text and yield chunks when threshold is reached."""
824
755
  if not text:
825
756
  return
826
757
 
@@ -843,7 +774,6 @@ class StreamingSink(OutputSink):
843
774
  self.buffer_size = len(current_content)
844
775
 
845
776
  def finalize(self) -> None:
846
- """Finalize and yield any remaining content."""
847
777
  if self.buffer_size > 0:
848
778
  content = self.buffer.getvalue()
849
779
  self.chunks.append(content)
@@ -851,11 +781,9 @@ class StreamingSink(OutputSink):
851
781
  self._update_progress()
852
782
 
853
783
  def get_chunks(self) -> Generator[str, None, None]:
854
- """Get all chunks yielded during processing."""
855
784
  yield from self.chunks
856
785
 
857
786
  def _find_split_position(self, content: str) -> int:
858
- """Find optimal position to split content for chunks."""
859
787
  target = self.chunk_size
860
788
  lookahead = min(100, len(content) - target)
861
789
 
@@ -868,7 +796,6 @@ class StreamingSink(OutputSink):
868
796
  return min(target, len(content))
869
797
 
870
798
  def _update_progress(self) -> None:
871
- """Update progress if callback is provided."""
872
799
  if self.progress_callback:
873
800
  self.progress_callback(self.processed_bytes, self.total_bytes)
874
801
 
@@ -877,6 +804,7 @@ def _process_html_core(
877
804
  source: str | BeautifulSoup,
878
805
  sink: OutputSink,
879
806
  *,
807
+ whitespace_handler: WhitespaceHandler,
880
808
  parser: str | None = None,
881
809
  autolinks: bool,
882
810
  bullets: str,
@@ -893,6 +821,8 @@ def _process_html_core(
893
821
  heading_style: Literal["underlined", "atx", "atx_closed"],
894
822
  highlight_style: Literal["double-equal", "html", "bold"],
895
823
  keep_inline_images_in: Iterable[str] | None,
824
+ list_indent_type: str,
825
+ list_indent_width: int,
896
826
  newline_style: Literal["spaces", "backslash"],
897
827
  strip: str | Iterable[str] | None,
898
828
  strip_newlines: bool,
@@ -902,7 +832,6 @@ def _process_html_core(
902
832
  wrap: bool,
903
833
  wrap_width: int,
904
834
  ) -> None:
905
- """Core HTML to Markdown processing logic shared by both regular and streaming."""
906
835
  token = _ancestor_cache.set({})
907
836
 
908
837
  try:
@@ -942,6 +871,8 @@ def _process_html_core(
942
871
  heading_style=heading_style,
943
872
  highlight_style=highlight_style,
944
873
  keep_inline_images_in=keep_inline_images_in,
874
+ list_indent_type=list_indent_type,
875
+ list_indent_width=list_indent_width,
945
876
  newline_style=newline_style,
946
877
  strong_em_symbol=strong_em_symbol,
947
878
  sub_symbol=sub_symbol,
@@ -969,6 +900,7 @@ def _process_html_core(
969
900
  escape_misc=escape_misc,
970
901
  escape_asterisks=escape_asterisks,
971
902
  escape_underscores=escape_underscores,
903
+ whitespace_handler=whitespace_handler,
972
904
  )
973
905
  sink.write(text)
974
906
  context += text
@@ -982,6 +914,7 @@ def _process_html_core(
982
914
  escape_misc=escape_misc,
983
915
  escape_underscores=escape_underscores,
984
916
  strip=_as_optional_set(strip),
917
+ whitespace_handler=whitespace_handler,
985
918
  context_before=context[-2:],
986
919
  )
987
920
  sink.write(text)
@@ -1013,54 +946,18 @@ def convert_to_markdown_stream(
1013
946
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
1014
947
  highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
1015
948
  keep_inline_images_in: Iterable[str] | None = None,
949
+ list_indent_type: Literal["spaces", "tabs"] = "spaces",
950
+ list_indent_width: int = 4,
1016
951
  newline_style: Literal["spaces", "backslash"] = SPACES,
1017
952
  strip: str | Iterable[str] | None = None,
1018
953
  strip_newlines: bool = False,
1019
954
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
1020
955
  sub_symbol: str = "",
1021
956
  sup_symbol: str = "",
957
+ whitespace_mode: Literal["normalized", "strict"] = WHITESPACE_NORMALIZED,
1022
958
  wrap: bool = False,
1023
959
  wrap_width: int = 80,
1024
960
  ) -> Generator[str, None, None]:
1025
- """Convert HTML to Markdown using streaming/chunked processing.
1026
-
1027
- This function yields chunks of converted Markdown text, allowing for
1028
- memory-efficient processing of large HTML documents. The output is guaranteed
1029
- to be identical to convert_to_markdown().
1030
-
1031
- Args:
1032
- source: An HTML document or a an initialized instance of BeautifulSoup.
1033
- chunk_size: Size of chunks to yield (approximate, in characters).
1034
- progress_callback: Optional callback function called with (processed_bytes, total_bytes).
1035
- parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
1036
- Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
1037
- autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
1038
- bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
1039
- code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
1040
- code_language_callback: Function to dynamically determine the language for code blocks.
1041
- convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
1042
- convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
1043
- custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
1044
- default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
1045
- escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
1046
- escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
1047
- escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
1048
- extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
1049
- heading_style: The style to use for Markdown headings. Defaults to "underlined".
1050
- highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
1051
- keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
1052
- newline_style: Style for handling newlines in text content. Defaults to "spaces".
1053
- strip: Tags to strip from the output. Defaults to None.
1054
- strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
1055
- strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
1056
- sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
1057
- sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
1058
- wrap: Wrap text to the specified width. Defaults to False.
1059
- wrap_width: The number of characters at which to wrap text. Defaults to 80.
1060
-
1061
- Yields:
1062
- str: Chunks of Markdown-formatted text.
1063
- """
1064
961
  sink = StreamingSink(chunk_size, progress_callback)
1065
962
 
1066
963
  if isinstance(source, str):
@@ -1068,9 +965,12 @@ def convert_to_markdown_stream(
1068
965
  elif isinstance(source, BeautifulSoup):
1069
966
  sink.total_bytes = len(str(source))
1070
967
 
968
+ whitespace_handler = WhitespaceHandler(whitespace_mode)
969
+
1071
970
  _process_html_core(
1072
971
  source,
1073
972
  sink,
973
+ whitespace_handler=whitespace_handler,
1074
974
  parser=parser,
1075
975
  autolinks=autolinks,
1076
976
  bullets=bullets,
@@ -1087,6 +987,8 @@ def convert_to_markdown_stream(
1087
987
  heading_style=heading_style,
1088
988
  highlight_style=highlight_style,
1089
989
  keep_inline_images_in=keep_inline_images_in,
990
+ list_indent_type=list_indent_type,
991
+ list_indent_width=list_indent_width,
1090
992
  newline_style=newline_style,
1091
993
  strip=strip,
1092
994
  strip_newlines=strip_newlines,
html_to_markdown/utils.py CHANGED
@@ -6,17 +6,6 @@ from html_to_markdown.constants import line_beginning_re
6
6
 
7
7
 
8
8
  def chomp(text: str) -> tuple[str, str, str]:
9
- """Simplified whitespace handling for inline elements.
10
-
11
- For semantic markdown output, preserves leading/trailing spaces as single spaces
12
- and normalizes internal whitespace.
13
-
14
- Args:
15
- text: The text to chomp.
16
-
17
- Returns:
18
- A tuple containing the prefix, suffix, and the normalized text.
19
- """
20
9
  if not text:
21
10
  return "", "", ""
22
11
 
@@ -29,17 +18,6 @@ def chomp(text: str) -> tuple[str, str, str]:
29
18
 
30
19
 
31
20
  def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
32
- """Escape special characters in text.
33
-
34
- Args:
35
- text: The text to escape.
36
- escape_misc: Whether to escape miscellaneous characters.
37
- escape_asterisks: Whether to escape asterisks.
38
- escape_underscores: Whether to escape underscores.
39
-
40
- Returns:
41
- The escaped text.
42
- """
43
21
  if not text:
44
22
  return ""
45
23
  if escape_misc:
@@ -52,28 +30,10 @@ def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_under
52
30
  return text
53
31
 
54
32
 
55
- def indent(*, text: str, level: int) -> str:
56
- """Indent text by a given level.
57
-
58
- Args:
59
- text: The text to indent.
60
- level: The level of indentation.
61
-
62
- Returns:
63
- The indented text.
64
- """
65
- return line_beginning_re.sub("\t" * level, text) if text else ""
33
+ def indent(*, text: str, level: int, indent_str: str = "\t") -> str:
34
+ return line_beginning_re.sub(indent_str * level, text) if text else ""
66
35
 
67
36
 
68
37
  def underline(*, text: str, pad_char: str) -> str:
69
- """Underline text with a given character.
70
-
71
- Args:
72
- text: The text to underline.
73
- pad_char: The character to use for underlining.
74
-
75
- Returns:
76
- The underlined text.
77
- """
78
38
  text = (text or "").rstrip()
79
39
  return f"{text}\n{pad_char * len(text)}\n\n" if text else ""