html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -3,18 +3,24 @@ from __future__ import annotations
 from typing import TYPE_CHECKING

 if TYPE_CHECKING:
-    from collections.abc import Generator, Mapping
-    # Use the imported PageElement instead of re-importing
+    from collections.abc import Callable, Generator, Mapping
+
 import re
 from contextvars import ContextVar
 from io import StringIO
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast

 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement

-# Check if lxml is available for better performance
+try:
+    from html_to_markdown.preprocessor import create_preprocessor
+    from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
+except ImportError:
+    create_preprocessor = None  # type: ignore[assignment]
+    preprocess_fn = None  # type: ignore[assignment]
+
 try:
     import importlib.util

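The preprocessor import is wrapped in try/except so the package degrades gracefully when the optional module is missing; `convert_to_markdown` later checks both names for `None` before using them. A minimal sketch of the same guarded-import pattern (the module name here is hypothetical):

```python
# Guarded optional import: the feature is disabled, not fatal, when absent.
try:
    from optional_extras import enhance  # hypothetical optional dependency
except ImportError:
    enhance = None  # type: ignore[assignment]


def process(data: str) -> str:
    # Mirror the None-check convert_to_markdown performs before calling
    # create_preprocessor/preprocess_fn.
    return enhance(data) if enhance is not None else data
```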
@@ -170,7 +176,7 @@ def _process_tag(
     tag_name: SupportedTag | None = (
         cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
     )
-    text = ""
+    text_parts: list[str] = []

     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
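Replacing the `text` accumulator string with a `text_parts` list sets up the join-once idiom used in the next hunk: appending to a list is amortized O(1), while repeated `+=` on a string can re-copy the growing prefix each time. A tiny illustration:

```python
# Accumulate fragments, then join once: linear in total output size.
parts: list[str] = []
for fragment in ("# Title", "\n\n", "Body text"):
    parts.append(fragment)
text = "".join(parts)
print(text)
# Compare with `text += fragment` in a loop, which may copy the whole
# accumulated string on each iteration (quadratic in the worst case).
```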
@@ -187,33 +193,61 @@ def _process_tag(
         if can_extract and isinstance(el, NavigableString) and not el.strip():
             el.extract()

-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children):
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+
+    # List of tags that return empty string when they have no content
+    empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
+
+    for i, el in enumerate(children):
         if isinstance(el, NavigableString):
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
+            # Check if this is whitespace between empty elements
+            if el.strip() == "" and i > 0 and i < len(children) - 1:
+                prev_el = children[i - 1]
+                next_el = children[i + 1]
+
+                # If previous element was a tag that produced empty output
+                # and next element is also a tag that could be empty, skip this whitespace
+                if (
+                    isinstance(prev_el, Tag)
+                    and isinstance(next_el, Tag)
+                    and prev_el.name.lower() in empty_when_no_content_tags
+                    and next_el.name.lower() in empty_when_no_content_tags
+                    and not prev_el.get_text().strip()
+                ):
+                    # Previous tag is empty and next could be empty too, skip this whitespace
+                    continue
+
+            text_parts.append(
+                _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
             )
         elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_children_as_inline,
-                convert=convert,
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=strip,
-                context_before=(context_before + text)[-2:],
+            current_text = "".join(text_parts)
+            text_parts.append(
+                _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_children_as_inline,
+                    convert=convert,
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=strip,
+                    context_before=(context_before + current_text)[-2:],
+                )
             )

+    text = "".join(text_parts)
+
     if tag_name and should_convert_tag:
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
-        # For headings, ensure two newlines before if not already present
-        # Edge case where the document starts with a \n and then a heading
+
         if is_heading and context_before not in {"", "\n"}:
             n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
             if n_eol_to_add > 0:
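The new branch drops a whitespace-only text node when it sits between two inline tags from `empty_when_no_content_tags` and the preceding tag has no text, preventing a stray space where both tags render to nothing. A standalone re-creation of the condition:

```python
from bs4 import BeautifulSoup, Tag

empty_when_no_content_tags = {"abbr", "var", "dfn"}  # subset for illustration

soup = BeautifulSoup("<p><abbr></abbr> <dfn>x</dfn></p>", "html.parser")
prev_el, ws, next_el = list(soup.p.children)  # tag, " ", tag

skip = (
    isinstance(prev_el, Tag)
    and isinstance(next_el, Tag)
    and prev_el.name.lower() in empty_when_no_content_tags
    and next_el.name.lower() in empty_when_no_content_tags
    and not prev_el.get_text().strip()
)
print(skip)  # True: the " " between the two tags would be skipped
```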
@@ -233,27 +267,90 @@ def _process_text(
 ) -> str:
     text = str(el) or ""

-    # Cache parent lookups to avoid repeated traversal
     parent = el.parent
     parent_name = parent.name if parent else None

-    # Build set of ancestor tag names for efficient lookup
-    # Only traverse once instead of multiple find_parent calls
     ancestor_names = set()
     current = parent
     while current and hasattr(current, "name"):
         if current.name:
             ancestor_names.add(current.name)
         current = getattr(current, "parent", None)
-        # Limit traversal depth for performance
+
         if len(ancestor_names) > 10:
             break

-    # Check for pre ancestor (whitespace handling)
     if "pre" not in ancestor_names:
-        text = whitespace_re.sub(" ", text)
+        # Special case: if the text is only whitespace
+        if text.strip() == "":
+            # If it contains newlines, it's probably indentation whitespace, return empty
+            if "\n" in text:
+                text = ""
+            else:
+                # Check if this whitespace is between block elements
+                # Define block elements that should not have whitespace between them
+                block_elements = {
+                    "p",
+                    "ul",
+                    "ol",
+                    "div",
+                    "blockquote",
+                    "pre",
+                    "h1",
+                    "h2",
+                    "h3",
+                    "h4",
+                    "h5",
+                    "h6",
+                    "table",
+                    "dl",
+                    "hr",
+                    "figure",
+                    "article",
+                    "section",
+                    "nav",
+                    "aside",
+                    "header",
+                    "footer",
+                    "main",
+                    "form",
+                    "fieldset",
+                }
+
+                prev_sibling = el.previous_sibling
+                next_sibling = el.next_sibling
+
+                # Check if whitespace is between block elements
+                if (
+                    prev_sibling
+                    and hasattr(prev_sibling, "name")
+                    and prev_sibling.name in block_elements
+                    and next_sibling
+                    and hasattr(next_sibling, "name")
+                    and next_sibling.name in block_elements
+                ):
+                    # Remove whitespace between block elements
+                    text = ""
+                else:
+                    # Otherwise it's inline whitespace, normalize to single space
+                    text = " " if text else ""
+        else:
+            has_leading_space = text.startswith((" ", "\t"))
+            has_trailing_space = text.endswith((" ", "\t"))
+
+            middle_content = (
+                text[1:-1]
+                if has_leading_space and has_trailing_space
+                else text[1:]
+                if has_leading_space
+                else text[:-1]
+                if has_trailing_space
+                else text
+            )
+
+            middle_content = whitespace_re.sub(" ", middle_content.strip())
+            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")

-    # Check for code-like ancestors (escaping)
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
@@ -262,14 +359,12 @@ def _process_text(
             escape_underscores=escape_underscores,
         )

-    # List item text processing
     if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()

     return text


-# Context variable for ancestor cache - automatically isolated per conversion
 _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)

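Where 1.6.0 collapsed all whitespace with a single `whitespace_re.sub(" ", text)`, the new code special-cases whitespace-only nodes and otherwise collapses only the interior of the string, preserving at most one leading and one trailing space. The edge-preserving step is essentially this (the regex is an assumption standing in for the package's `whitespace_re`):

```python
import re

whitespace_re = re.compile(r"\s+")  # assumed equivalent to the package's pattern


def collapse_keeping_edges(text: str) -> str:
    has_leading = text.startswith((" ", "\t"))
    has_trailing = text.endswith((" ", "\t"))
    middle = whitespace_re.sub(" ", text.strip())  # collapse interior runs
    return (" " if has_leading else "") + middle + (" " if has_trailing else "")


print(repr(collapse_keeping_edges("  foo \t bar ")))  # ' foo bar '
```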
@@ -281,7 +376,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         cache = {}
         _ancestor_cache.set(cache)

-    # Check cache first
     if elem_id in cache:
         return cache[elem_id]

@@ -293,17 +387,14 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         if hasattr(current, "name") and current.name:
             ancestor_names.add(current.name)

-        # Check if we've already cached this parent's ancestors
         parent_id = id(current)
         if parent_id in cache:
-            # Reuse cached ancestors
             ancestor_names.update(cache[parent_id])
             break

         current = getattr(current, "parent", None)
         depth += 1

-    # Cache the result
     cache[elem_id] = ancestor_names
     return ancestor_names

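`_get_ancestor_names` memoizes results by element `id()` in a `ContextVar`, so each conversion context (thread or asyncio task) gets an isolated cache with no locking. The pattern in miniature:

```python
from contextvars import ContextVar

_cache: ContextVar[dict[int, str] | None] = ContextVar("cache", default=None)


def describe(obj: object) -> str:
    cache = _cache.get()
    if cache is None:
        cache = {}
        _cache.set(cache)  # lazily created, scoped to this context
    key = id(obj)
    if key not in cache:
        cache[key] = type(obj).__name__  # stand-in for the expensive ancestor walk
    return cache[key]


print(describe([1, 2]))  # "list" (computed once, then served from the cache)
```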
@@ -345,33 +436,29 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     """
     metadata = {}

-    # Extract title
     title_tag = soup.find("title")
     if title_tag and isinstance(title_tag, Tag) and title_tag.string:
         metadata["title"] = title_tag.string.strip()

-    # Extract base href
     base_tag = soup.find("base", href=True)
     if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
         metadata["base-href"] = base_tag["href"]

-    # Extract meta tags
     for meta in soup.find_all("meta"):
-        # Handle name-based meta tags
         if meta.get("name") and meta.get("content") is not None:
             name = meta["name"]
             content = meta["content"]
             if isinstance(name, str) and isinstance(content, str):
                 key = f"meta-{name.lower()}"
                 metadata[key] = content
-        # Handle property-based meta tags (Open Graph, etc.)
+
         elif meta.get("property") and meta.get("content") is not None:
             prop = meta["property"]
             content = meta["content"]
             if isinstance(prop, str) and isinstance(content, str):
                 key = f"meta-{prop.lower().replace(':', '-')}"
                 metadata[key] = content
-        # Handle http-equiv meta tags
+
         elif meta.get("http-equiv") and meta.get("content") is not None:
             equiv = meta["http-equiv"]
             content = meta["content"]
@@ -379,13 +466,13 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
                 key = f"meta-{equiv.lower()}"
                 metadata[key] = content

-    # Extract canonical link
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]

-    # Extract other important link relations
-    for rel_type in ["author", "license", "alternate"]:
+    # Extract link relations
+    link_relations = {"author", "license", "alternate"}
+    for rel_type in link_relations:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
             metadata[f"link-{rel_type}"] = link["href"]
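These lookups feed `_format_metadata_comment`, which emits the collected keys as a sorted HTML comment ahead of the converted body. Expected shape, inferred from the code above rather than taken from a package fixture:

```python
from html_to_markdown import convert_to_markdown

html = (
    "<html><head>"
    "<title>Example</title>"
    '<meta name="description" content="A page">'
    '<meta property="og:title" content="Example OG">'
    "</head><body><p>Hi</p></body></html>"
)
# The output should begin with roughly:
# <!--
# meta-description: A page
# meta-og-title: Example OG
# title: Example
# -->
print(convert_to_markdown(html, extract_metadata=True))
```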
@@ -407,7 +494,6 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:

     lines = ["<!--"]
     for key, value in sorted(metadata.items()):
-        # Escape any potential comment closers in the value
         safe_value = value.replace("-->", "--&gt;")
         lines.append(f"{key}: {safe_value}")
     lines.append("-->")
@@ -446,6 +532,10 @@ def convert_to_markdown(
     sup_symbol: str = "",
     wrap: bool = False,
     wrap_width: int = 80,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
 ) -> str:
     """Convert HTML to Markdown.

@@ -480,6 +570,10 @@ def convert_to_markdown(
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
         wrap: Wrap text to the specified width. Defaults to False.
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
+        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
+        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
+        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
+        remove_forms: Remove form elements during preprocessing. Defaults to True.

     Raises:
         ConflictingOptionsError: If both 'strip' and 'convert' are specified.
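Usage of the four new keyword arguments, matching the signature and docstring above (effective only when the optional preprocessor module is importable):

```python
from html_to_markdown import convert_to_markdown

markdown = convert_to_markdown(
    "<nav>menu</nav><article><h1>Title</h1><p>Body</p></article>",
    preprocess_html=True,               # opt in; defaults to False
    preprocessing_preset="aggressive",  # "minimal" | "standard" | "aggressive"
    remove_navigation=True,             # strip <nav>-style chrome
    remove_forms=True,                  # strip <form> elements
)
print(markdown)
```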
@@ -499,27 +593,63 @@ def convert_to_markdown(
         return source

     if strip_newlines:
-        # Replace all newlines with spaces before parsing
         source = source.replace("\n", " ").replace("\r", " ")

+    # Fix lxml parsing of void elements like <wbr>
+    # lxml incorrectly treats them as container tags
+    source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
+
+    if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
+        config = create_preprocessor(
+            preset=preprocessing_preset,
+            remove_navigation=remove_navigation,
+            remove_forms=remove_forms,
+        )
+        source = preprocess_fn(source, **config)
+
     if "".join(source.split("\n")):
-        # Determine parser to use
         if parser is None:
-            # Auto-detect best available parser
             parser = "lxml" if LXML_AVAILABLE else "html.parser"

-        # Validate parser choice
         if parser == "lxml" and not LXML_AVAILABLE:
             raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

+        original_source = source if isinstance(source, str) else str(source)
+        needs_leading_whitespace_fix = (
+            parser == "lxml" and isinstance(source, str) and original_source.startswith((" ", "\t", "\n", "\r"))
+        )
+
         source = BeautifulSoup(source, parser)
+
+        if parser == "lxml":
+            body = source.find("body")
+            if body and isinstance(body, Tag):
+                children = list(body.children)
+
+                if (
+                    len(children) == 1
+                    and isinstance(children[0], NavigableString)
+                    and original_source.startswith((" ", "\t", "\n", "\r"))
+                    and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
+                ):
+                    first_child = children[0]
+
+                    leading_ws = ""
+                    for char in original_source:
+                        if char in " \t":
+                            leading_ws += char
+                        else:
+                            break
+
+                    new_text = NavigableString(leading_ws + str(first_child))
+                    first_child.replace_with(new_text)
+                    needs_leading_space_fix = False
     else:
         raise EmptyHtmlError

     if strip is not None and convert is not None:
         raise ConflictingOptionsError("strip", "convert")

-    # Use streaming processing if requested
     if stream_processing:
         result_chunks = []
         for chunk in convert_to_markdown_stream(
@@ -555,19 +685,15 @@ def convert_to_markdown(
             chunk_callback(chunk)
             result_chunks.append(chunk)

-        # Apply same post-processing as regular path
         result = "".join(result_chunks)

-        # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
         result = re.sub(r"\n{3,}", "\n\n", result)

-        # Strip all trailing newlines in inline mode
         if convert_as_inline:
             result = result.rstrip("\n")

         return result

-    # Use shared core with string sink for regular processing
     sink = StringSink()

     _process_html_core(
@@ -601,10 +727,54 @@ def convert_to_markdown(

     result = sink.get_result()

-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+    if (
+        "needs_leading_whitespace_fix" in locals()
+        and needs_leading_whitespace_fix
+        and not result.startswith((" ", "\t", "\n", "\r"))
+    ):
+        original_input = sink.original_source if hasattr(sink, "original_source") else original_source
+        leading_whitespace_match = re.match(r"^[\s]*", original_input)
+        if leading_whitespace_match:
+            leading_whitespace = leading_whitespace_match.group(0)

+            # Check if input contains list or heading tags
+            list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
+            if any(tag in original_input for tag in list_heading_tags):
+                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
+                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
+
+            if leading_whitespace:
+                result = leading_whitespace + result
+
     result = re.sub(r"\n{3,}", "\n\n", result)

-    # Strip all trailing newlines in inline mode
+    def normalize_spaces_outside_code(text: str) -> str:
+        parts = text.split("```")
+        for i in range(0, len(parts), 2):
+            # Process each line separately to preserve leading spaces
+            lines = parts[i].split("\n")
+            processed_lines = []
+            for line in lines:
+                # Preserve definition list formatting (: followed by 3 spaces)
+                def_parts = re.split(r"(:\s{3})", line)
+                for j in range(0, len(def_parts), 2):
+                    # Only normalize non-definition-list parts
+                    # Also preserve leading spaces (for list indentation)
+                    match = re.match(r"^(\s*)(.*)", def_parts[j])
+                    if match:
+                        leading_spaces, rest = match.groups()
+                        # Only normalize multiple spaces that are not at the beginning
+                        rest = re.sub(r" {3,}", " ", rest)
+                        def_parts[j] = leading_spaces + rest
+                processed_lines.append("".join(def_parts))
+            parts[i] = "\n".join(processed_lines)
+        return "```".join(parts)
+
+    result = normalize_spaces_outside_code(result)
+
+    result = re.sub(r"\*\* {2,}", "** ", result)
+    result = re.sub(r" {2,}\*\*", " **", result)
+
     if convert_as_inline:
         result = result.rstrip("\n")

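`normalize_spaces_outside_code` splits on triple-backtick fences so that only even-indexed (non-code) segments are touched; leading indentation and the colon-plus-three-spaces definition-list marker survive, while runs of three or more interior spaces collapse. The two trailing substitutions then tighten spaces around `**`. Their effect in isolation:

```python
import re

result = "a**   b   **c"
result = re.sub(r"\*\* {2,}", "** ", result)  # spaces after an opening **
result = re.sub(r" {2,}\*\*", " **", result)  # spaces before a closing **
print(result)  # a** b **c
```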
@@ -654,25 +824,19 @@ class StreamingSink(OutputSink):
         if not text:
             return

-        # Use string concatenation instead of StringIO for better performance
         current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
         current_content += text

-        # Yield chunks when buffer is large enough
         while len(current_content) >= self.chunk_size:
-            # Find optimal split point (prefer after newlines)
             split_pos = self._find_split_position(current_content)

-            # Extract chunk and update remaining content
             chunk = current_content[:split_pos]
             current_content = current_content[split_pos:]

-            # Store chunk and update progress
             self.chunks.append(chunk)
             self.processed_bytes += len(chunk)
             self._update_progress()

-        # Update buffer with remaining content
         self.buffer = StringIO()
         if current_content:
             self.buffer.write(current_content)
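`write` flushes fixed-size chunks, preferring newline boundaries via `_find_split_position`, and tracks `processed_bytes` for progress reporting. Consuming the stream might look like this; the two-int callback signature is an assumption based on the sink's `processed_bytes`/`total_bytes` fields:

```python
from html_to_markdown import convert_to_markdown_stream


def on_progress(processed: int, total: int) -> None:
    print(f"converted {processed}/{total} bytes")


chunks = convert_to_markdown_stream(
    "<h1>Big page</h1>" + "<p>paragraph</p>" * 1_000,
    chunk_size=4096,
    progress_callback=on_progress,
)
markdown = "".join(chunks)
```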
@@ -692,7 +856,6 @@ class StreamingSink(OutputSink):

     def _find_split_position(self, content: str) -> int:
         """Find optimal position to split content for chunks."""
-        # Look for newline within reasonable distance of target size
         target = self.chunk_size
         lookahead = min(100, len(content) - target)

@@ -740,11 +903,9 @@ def _process_html_core(
     wrap_width: int,
 ) -> None:
     """Core HTML to Markdown processing logic shared by both regular and streaming."""
-    # Set up a fresh cache for this conversion
     token = _ancestor_cache.set({})

     try:
-        # Input validation and preprocessing
         if isinstance(source, str):
             if (
                 heading_style == UNDERLINED
@@ -759,12 +920,9 @@ def _process_html_core(
             source = source.replace("\n", " ").replace("\r", " ")

         if "".join(source.split("\n")):
-            # Determine parser to use
             if parser is None:
-                # Auto-detect best available parser
                 parser = "lxml" if LXML_AVAILABLE else "html.parser"

-            # Validate parser choice
             if parser == "lxml" and not LXML_AVAILABLE:
                 raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

@@ -775,7 +933,6 @@ def _process_html_core(

         if strip is not None and convert is not None:
             raise ConflictingOptionsError("strip", "convert")

-        # Create converters map
         converters_map = create_converters_map(
             autolinks=autolinks,
             bullets=bullets,
@@ -795,18 +952,15 @@ def _process_html_core(
         if custom_converters:
             converters_map.update(cast("ConvertersMap", custom_converters))

-        # Extract metadata if requested
         if extract_metadata and not convert_as_inline:
             metadata = _extract_metadata(source)
             metadata_comment = _format_metadata_comment(metadata)
             if metadata_comment:
                 sink.write(metadata_comment)

-        # Find the body tag to process only its content
         body = source.find("body")
         elements_to_process = body.children if body and isinstance(body, Tag) else source.children

-        # Process elements using shared logic
         context = ""
         for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
             if isinstance(el, NavigableString):
@@ -833,10 +987,8 @@ def _process_html_core(
                 sink.write(text)
                 context += text

-        # Finalize output
         sink.finalize()
     finally:
-        # Reset context
         _ancestor_cache.reset(token)


@@ -909,16 +1061,13 @@ def convert_to_markdown_stream(
     Yields:
         str: Chunks of Markdown-formatted text.
     """
-    # Use shared core with streaming sink
     sink = StreamingSink(chunk_size, progress_callback)

-    # Estimate total size for progress reporting
     if isinstance(source, str):
         sink.total_bytes = len(source)
     elif isinstance(source, BeautifulSoup):
         sink.total_bytes = len(str(source))

-    # Process using shared core
     _process_html_core(
         source,
         sink,
@@ -948,30 +1097,22 @@ def convert_to_markdown_stream(
         wrap_width=wrap_width,
     )

-    # Get all chunks from the sink and apply post-processing
     all_chunks = list(sink.get_chunks())
     combined_result = "".join(all_chunks)

-    # Apply same post-processing as regular conversion
-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
     combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)

-    # Strip all trailing newlines in inline mode
     if convert_as_inline:
         combined_result = combined_result.rstrip("\n")

-    # Now split the post-processed result back into chunks at good boundaries
     if not combined_result:
         return

     pos = 0
     while pos < len(combined_result):
-        # Calculate chunk end position
         end_pos = min(pos + chunk_size, len(combined_result))

-        # If not at the end, try to find a good split point
         if end_pos < len(combined_result):
-            # Look for newline within reasonable distance
             search_start = max(pos, end_pos - 50)
             search_end = min(len(combined_result), end_pos + 50)
             search_area = combined_result[search_start:search_end]
@@ -980,7 +1121,6 @@ def convert_to_markdown_stream(
         if newline_pos > 0:
             end_pos = search_start + newline_pos + 1

-        # Yield the chunk
         chunk = combined_result[pos:end_pos]
         if chunk:
             yield chunk
html_to_markdown/utils.py CHANGED
@@ -6,18 +6,25 @@ from html_to_markdown.constants import line_beginning_re


 def chomp(text: str) -> tuple[str, str, str]:
-    """If the text in an inline tag like b, a, or em contains a leading or trailing
-    space, strip the string and return a space as suffix of prefix, if needed.
+    """Simplified whitespace handling for inline elements.
+
+    For semantic markdown output, preserves leading/trailing spaces as single spaces
+    and normalizes internal whitespace.

     Args:
         text: The text to chomp.

     Returns:
-        A tuple containing the prefix, suffix, and the stripped text.
+        A tuple containing the prefix, suffix, and the normalized text.
     """
-    prefix = " " if text and text[0] == " " else ""
-    suffix = " " if text and text[-1] == " " else ""
+    if not text:
+        return "", "", ""
+
+    prefix = " " if text.startswith((" ", "\t")) else ""
+    suffix = " " if text.endswith((" ", "\t")) else ""
+
     text = text.strip()
+
     return prefix, suffix, text

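The rewritten `chomp` now recognizes tabs at the edges and short-circuits on the empty string. Its behavior on a few inputs, per the new code:

```python
from html_to_markdown.utils import chomp

print(chomp("  bold text "))  # (' ', ' ', 'bold text')
print(chomp("\tword"))        # (' ', '', 'word')
print(chomp(""))              # ('', '', '')
```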