html-to-markdown 1.5.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of html-to-markdown might be problematic.

@@ -4,15 +4,30 @@ from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Mapping
-    # Use the imported PageElement instead of re-importing
-    from io import StringIO
+
 import re
+from contextvars import ContextVar
+from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, Literal, cast
 
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
 
+try:
+    from html_to_markdown.preprocessor import create_preprocessor
+    from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
+except ImportError:
+    create_preprocessor = None  # type: ignore[assignment]
+    preprocess_fn = None  # type: ignore[assignment]
+
+try:
+    import importlib.util
+
+    LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
+except ImportError:
+    LXML_AVAILABLE = False
+
 from html_to_markdown.constants import (
     ASTERISK,
     DOUBLE_EQUAL,
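Two optional-dependency probes appear above: the preprocessor import is wrapped in try/except so the feature silently degrades when the module is missing, and lxml is detected with `importlib.util.find_spec` without actually importing it. A minimal standalone sketch of the same pattern (`default_parser` is illustrative, not a name from the package):

```python
import importlib.util

# find_spec returns None when the top-level package is not installed,
# so availability can be checked without paying the import cost.
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None

# Prefer the faster C-backed parser, fall back to the stdlib one.
default_parser = "lxml" if LXML_AVAILABLE else "html.parser"
print(default_parser)
```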
@@ -22,6 +37,7 @@ from html_to_markdown.constants import (
     whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
+from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape
 
 if TYPE_CHECKING:
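The dedicated exception types replace the generic `ValueError`s used in 1.5.0, so callers can distinguish failure modes. A hedged usage sketch (assuming the exceptions are importable from `html_to_markdown.exceptions`, as the import above shows):

```python
from html_to_markdown import convert_to_markdown
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError

html_text = ""  # illustrative input

try:
    markdown = convert_to_markdown(html_text)
except EmptyHtmlError:
    markdown = ""  # empty input is now a distinct, catchable case
except ConflictingOptionsError:
    raise  # both strip= and convert= were supplied; fix the call site
```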
@@ -202,8 +218,7 @@ def _process_tag(
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
-        # For headings, ensure two newlines before if not already present
-        # Edge case where the document starts with a \n and then a heading
+
         if is_heading and context_before not in {"", "\n"}:
             n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
             if n_eol_to_add > 0:
@@ -223,10 +238,38 @@ def _process_text(
 ) -> str:
     text = str(el) or ""
 
-    if not el.find_parent("pre"):
-        text = whitespace_re.sub(" ", text)
+    parent = el.parent
+    parent_name = parent.name if parent else None
+
+    ancestor_names = set()
+    current = parent
+    while current and hasattr(current, "name"):
+        if current.name:
+            ancestor_names.add(current.name)
+        current = getattr(current, "parent", None)
+
+        if len(ancestor_names) > 10:
+            break
+
+    if "pre" not in ancestor_names:
+        has_leading_space = text.startswith((" ", "\t"))
 
-    if not el.find_parent(["pre", "code", "kbd", "samp"]):
+        has_trailing_space = text.endswith((" ", "\t"))
+
+        middle_content = (
+            text[1:-1]
+            if has_leading_space and has_trailing_space
+            else text[1:]
+            if has_leading_space
+            else text[:-1]
+            if has_trailing_space
+            else text
+        )
+
+        middle_content = whitespace_re.sub(" ", middle_content.strip())
+        text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+
+    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
             escape_misc=escape_misc,
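The rewritten `_process_text` no longer regex-collapses the whole node; it preserves at most one significant leading and trailing space and normalizes only the interior. A standalone sketch of that edge-preserving collapse (assuming `whitespace_re` matches whitespace runs, as the constant's name suggests):

```python
import re

whitespace_re = re.compile(r"\s+")  # assumed equivalent of the library constant

def collapse_keep_edges(text: str) -> str:
    # Record whether the text node carries meaningful inline spacing at its edges.
    has_leading = text.startswith((" ", "\t"))
    has_trailing = text.endswith((" ", "\t"))
    middle = (
        text[1:-1] if has_leading and has_trailing
        else text[1:] if has_leading
        else text[:-1] if has_trailing
        else text
    )
    # Collapse interior whitespace runs, then restore single-space edges.
    middle = whitespace_re.sub(" ", middle.strip())
    return (" " if has_leading else "") + middle + (" " if has_trailing else "")

assert collapse_keep_edges("  foo   bar ") == " foo bar "
assert collapse_keep_edges("foo\n\nbar") == "foo bar"
```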
@@ -234,16 +277,56 @@ def _process_text(
             escape_underscores=escape_underscores,
         )
 
-    if (
-        el.parent
-        and el.parent.name == "li"
-        and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"})
-    ):
+    if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()
 
     return text
 
 
+_ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)
+
+
+def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
+    """Get set of ancestor tag names for efficient parent checking."""
+    elem_id = id(element)
+    cache = _ancestor_cache.get()
+    if cache is None:
+        cache = {}
+        _ancestor_cache.set(cache)
+
+    if elem_id in cache:
+        return cache[elem_id]
+
+    ancestor_names = set()
+    current = getattr(element, "parent", None)
+    depth = 0
+
+    while current and hasattr(current, "name") and depth < max_depth:
+        if hasattr(current, "name") and current.name:
+            ancestor_names.add(current.name)
+
+        parent_id = id(current)
+        if parent_id in cache:
+            ancestor_names.update(cache[parent_id])
+            break
+
+        current = getattr(current, "parent", None)
+        depth += 1
+
+    cache[elem_id] = ancestor_names
+    return ancestor_names
+
+
+def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
+    """Check if element has any of the specified ancestors efficiently."""
+    if isinstance(tag_names, str):
+        tag_names = [tag_names]
+
+    target_names = set(tag_names)
+    ancestors = _get_ancestor_names(element)
+    return bool(ancestors.intersection(target_names))
+
+
 def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
     if strip is not None:
         return tag_name not in strip
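`_get_ancestor_names` memoizes each node's ancestor set in a `ContextVar`-held dict, so repeated `find_parent`-style walks collapse into set lookups, and the cache is isolated per conversion run (and per thread or async task). A reduced sketch of the set/reset discipline that `_process_html_core` applies later in this diff (`run_with_cache` is a hypothetical wrapper, not package API):

```python
from contextvars import ContextVar
from typing import Callable, TypeVar

T = TypeVar("T")

_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("cache", default=None)

def run_with_cache(work: Callable[[], T]) -> T:
    token = _cache.set({})   # install a fresh cache for this run only
    try:
        return work()        # processing code reads/writes _cache.get()
    finally:
        _cache.reset(token)  # restore the prior value even if work() raises
```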
@@ -271,33 +354,29 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     """
     metadata = {}
 
-    # Extract title
     title_tag = soup.find("title")
     if title_tag and isinstance(title_tag, Tag) and title_tag.string:
         metadata["title"] = title_tag.string.strip()
 
-    # Extract base href
     base_tag = soup.find("base", href=True)
     if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
         metadata["base-href"] = base_tag["href"]
 
-    # Extract meta tags
     for meta in soup.find_all("meta"):
-        # Handle name-based meta tags
         if meta.get("name") and meta.get("content") is not None:
             name = meta["name"]
             content = meta["content"]
             if isinstance(name, str) and isinstance(content, str):
                 key = f"meta-{name.lower()}"
                 metadata[key] = content
-        # Handle property-based meta tags (Open Graph, etc.)
+
         elif meta.get("property") and meta.get("content") is not None:
             prop = meta["property"]
             content = meta["content"]
             if isinstance(prop, str) and isinstance(content, str):
                 key = f"meta-{prop.lower().replace(':', '-')}"
                 metadata[key] = content
-        # Handle http-equiv meta tags
+
         elif meta.get("http-equiv") and meta.get("content") is not None:
             equiv = meta["http-equiv"]
             content = meta["content"]
@@ -305,12 +384,10 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
                 key = f"meta-{equiv.lower()}"
                 metadata[key] = content
 
-    # Extract canonical link
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
 
-    # Extract other important link relations
     for rel_type in ["author", "license", "alternate"]:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
@@ -333,7 +410,6 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
 
     lines = ["<!--"]
     for key, value in sorted(metadata.items()):
-        # Escape any potential comment closers in the value
         safe_value = value.replace("-->", "--&gt;")
         lines.append(f"{key}: {safe_value}")
     lines.append("-->")
@@ -348,6 +424,7 @@ def convert_to_markdown(
     chunk_size: int = 1024,
     chunk_callback: Callable[[str], None] | None = None,
     progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
@@ -371,6 +448,10 @@
     sup_symbol: str = "",
     wrap: bool = False,
     wrap_width: int = 80,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
 ) -> str:
     """Convert HTML to Markdown.
 
@@ -380,6 +461,8 @@
         chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
         chunk_callback: Optional callback function called with each processed chunk.
         progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -403,9 +486,15 @@
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
         wrap: Wrap text to the specified width. Defaults to False.
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
+        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
+        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
+        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
+        remove_forms: Remove form elements during preprocessing. Defaults to True.
 
     Raises:
-        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
+        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
+        EmptyHtmlError: When the input HTML is empty.
+        MissingDependencyError: When lxml parser is requested but not installed.
 
     Returns:
         str: A string of Markdown-formatted text converted from the given HTML.
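Put together, the new keyword arguments are used like this; a hedged sketch based only on the signature documented above (preprocessing requires the optional preprocessor module, and lxml only matters if installed):

```python
from html_to_markdown import convert_to_markdown

markdown = convert_to_markdown(
    "<nav>site menu</nav><h1>Title</h1><p>Body   text</p>",
    parser="html.parser",             # force the stdlib parser even if lxml is present
    preprocess_html=True,             # opt in to the new cleanup pass
    preprocessing_preset="standard",  # "minimal" | "standard" | "aggressive"
    remove_navigation=True,           # drop <nav> content before converting
    remove_forms=True,
)
```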
@@ -420,24 +509,70 @@
             return source
 
         if strip_newlines:
-            # Replace all newlines with spaces before parsing
            source = source.replace("\n", " ").replace("\r", " ")
 
+        # Fix lxml parsing of void elements like <wbr>
+        # lxml incorrectly treats them as container tags
+        source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
+
+        if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
+            config = create_preprocessor(
+                preset=preprocessing_preset,
+                remove_navigation=remove_navigation,
+                remove_forms=remove_forms,
+            )
+            source = preprocess_fn(source, **config)
+
         if "".join(source.split("\n")):
-            source = BeautifulSoup(source, "html.parser")
+            if parser is None:
+                parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+            if parser == "lxml" and not LXML_AVAILABLE:
+                raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+            original_source = source if isinstance(source, str) else str(source)
+            needs_leading_whitespace_fix = (
+                parser == "lxml" and isinstance(source, str) and original_source.startswith((" ", "\t", "\n", "\r"))
+            )
+
+            source = BeautifulSoup(source, parser)
+
+            if parser == "lxml":
+                body = source.find("body")
+                if body and isinstance(body, Tag):
+                    children = list(body.children)
+
+                    if (
+                        len(children) == 1
+                        and isinstance(children[0], NavigableString)
+                        and original_source.startswith((" ", "\t", "\n", "\r"))
+                        and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
+                    ):
+                        first_child = children[0]
+
+                        leading_ws = ""
+                        for char in original_source:
+                            if char in " \t":
+                                leading_ws += char
+                            else:
+                                break
+
+                        new_text = NavigableString(leading_ws + str(first_child))
+                        first_child.replace_with(new_text)
+                        needs_leading_space_fix = False
         else:
-            raise ValueError("The input HTML is empty.")
+            raise EmptyHtmlError
 
     if strip is not None and convert is not None:
-        raise ValueError("Only one of 'strip' and 'convert' can be specified.")
+        raise ConflictingOptionsError("strip", "convert")
 
-    # Use streaming processing if requested
     if stream_processing:
         result_chunks = []
         for chunk in convert_to_markdown_stream(
             source,
             chunk_size=chunk_size,
             progress_callback=progress_callback,
+            parser=parser,
             autolinks=autolinks,
             bullets=bullets,
             code_language=code_language,
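Earlier in this hunk, the `<wbr>` rewrite works around lxml treating the void element as a container. The substitution is easy to verify in isolation:

```python
import re

src = "foo<wbr>bar<WBR >baz"
fixed = re.sub(r"<wbr\s*>", "<wbr />", src, flags=re.IGNORECASE)
assert fixed == "foo<wbr />bar<wbr />baz"  # self-closing form parses as void
```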
@@ -449,6 +584,7 @@
             escape_asterisks=escape_asterisks,
             escape_misc=escape_misc,
             escape_underscores=escape_underscores,
+            extract_metadata=extract_metadata,
             heading_style=heading_style,
             highlight_style=highlight_style,
             keep_inline_images_in=keep_inline_images_in,
@@ -464,174 +600,300 @@
         if chunk_callback:
             chunk_callback(chunk)
         result_chunks.append(chunk)
-        return "".join(result_chunks)
 
-    converters_map = create_converters_map(
+        result = "".join(result_chunks)
+
+        result = re.sub(r"\n{3,}", "\n\n", result)
+
+        if convert_as_inline:
+            result = result.rstrip("\n")
+
+        return result
+
+    sink = StringSink()
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))
 
-    # Extract metadata if requested
-    metadata_comment = ""
-    if extract_metadata and not convert_as_inline:
-        metadata = _extract_metadata(source)
-        metadata_comment = _format_metadata_comment(metadata)
+    result = sink.get_result()
 
-    # Find the body tag to process only its content
-    body = source.find("body")
-    elements_to_process = body.children if body and isinstance(body, Tag) else source.children
+    if (
+        "needs_leading_whitespace_fix" in locals()
+        and needs_leading_whitespace_fix
+        and not result.startswith((" ", "\t", "\n", "\r"))
+    ):
+        original_input = sink.original_source if hasattr(sink, "original_source") else original_source
+        leading_whitespace_match = re.match(r"^[\s]*", original_input)
+        if leading_whitespace_match:
+            leading_whitespace = leading_whitespace_match.group(0)
 
-    text = ""
-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
-        if isinstance(el, NavigableString):
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
-            )
-        elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_as_inline,
-                convert=_as_optional_set(convert),
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=_as_optional_set(strip),
-                context_before=text[-2:],
-            )
+            if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
+                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
+                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
 
-    # Combine metadata and text
-    result = metadata_comment + text if metadata_comment else text
+            if leading_whitespace:
+                result = leading_whitespace + result
 
-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
     result = re.sub(r"\n{3,}", "\n\n", result)
 
-    # Strip all trailing newlines in inline mode
+    def normalize_spaces_outside_code(text: str) -> str:
+        parts = text.split("```")
+        for i in range(0, len(parts), 2):
+            # Preserve definition list formatting (: followed by 3 spaces)
+            # Split by definition list patterns to preserve them
+            def_parts = re.split(r"(:\s{3})", parts[i])
+            for j in range(0, len(def_parts), 2):
+                # Only normalize non-definition-list parts
+                def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
+            parts[i] = "".join(def_parts)
+        return "```".join(parts)
+
+    result = normalize_spaces_outside_code(result)
+
+    result = re.sub(r"\*\* {2,}", "** ", result)
+    result = re.sub(r" {2,}\*\*", " **", result)
+
     if convert_as_inline:
         result = result.rstrip("\n")
 
     return result
 
 
-class StreamingProcessor:
-    """Handles streaming/chunked processing of HTML to Markdown conversion."""
+class OutputSink:
+    """Abstract output sink for processed markdown text."""
+
+    def write(self, text: str) -> None:
+        """Write text to the sink."""
+        raise NotImplementedError
+
+    def finalize(self) -> None:
+        """Finalize the output."""
+
+
+class StringSink(OutputSink):
+    """Collects all output into a single string."""
+
+    def __init__(self) -> None:
+        self.buffer = StringIO()
+
+    def write(self, text: str) -> None:
+        """Write text to the buffer."""
+        self.buffer.write(text)
 
-    def __init__(
-        self,
-        chunk_size: int = 1024,
-        progress_callback: Callable[[int, int], None] | None = None,
-    ) -> None:
+    def get_result(self) -> str:
+        """Get the complete result string."""
+        return self.buffer.getvalue()
+
+
+class StreamingSink(OutputSink):
+    """Yields chunks of output for streaming processing."""
+
+    def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
         self.chunk_size = chunk_size
         self.progress_callback = progress_callback
+        self.buffer = StringIO()
+        self.buffer_size = 0
         self.processed_bytes = 0
         self.total_bytes = 0
+        self.chunks: list[str] = []
+
+    def write(self, text: str) -> None:
+        """Write text and yield chunks when threshold is reached."""
+        if not text:
+            return
+
+        current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
+        current_content += text
+
+        while len(current_content) >= self.chunk_size:
+            split_pos = self._find_split_position(current_content)
+
+            chunk = current_content[:split_pos]
+            current_content = current_content[split_pos:]
+
+            self.chunks.append(chunk)
+            self.processed_bytes += len(chunk)
+            self._update_progress()
 
-    def update_progress(self, processed: int) -> None:
+        self.buffer = StringIO()
+        if current_content:
+            self.buffer.write(current_content)
+        self.buffer_size = len(current_content)
+
+    def finalize(self) -> None:
+        """Finalize and yield any remaining content."""
+        if self.buffer_size > 0:
+            content = self.buffer.getvalue()
+            self.chunks.append(content)
+            self.processed_bytes += len(content)
+            self._update_progress()
+
+    def get_chunks(self) -> Generator[str, None, None]:
+        """Get all chunks yielded during processing."""
+        yield from self.chunks
+
+    def _find_split_position(self, content: str) -> int:
+        """Find optimal position to split content for chunks."""
+        target = self.chunk_size
+        lookahead = min(100, len(content) - target)
+
+        if target + lookahead < len(content):
+            search_area = content[max(0, target - 50) : target + lookahead]
+            newline_pos = search_area.rfind("\n")
+            if newline_pos > 0:
+                return max(0, target - 50) + newline_pos + 1
+
+        return min(target, len(content))
+
+    def _update_progress(self) -> None:
         """Update progress if callback is provided."""
-        self.processed_bytes = processed
         if self.progress_callback:
             self.progress_callback(self.processed_bytes, self.total_bytes)
 
 
-def _process_tag_iteratively(
-    tag: Tag,
-    converters_map: ConvertersMap,
+def _process_html_core(
+    source: str | BeautifulSoup,
+    sink: OutputSink,
     *,
-    convert: set[str] | None,
-    convert_as_inline: bool = False,
+    parser: str | None = None,
+    autolinks: bool,
+    bullets: str,
+    code_language: str,
+    code_language_callback: Callable[[Any], str] | None,
+    convert: str | Iterable[str] | None,
+    convert_as_inline: bool,
+    custom_converters: Mapping[SupportedElements, Converter] | None,
+    default_title: bool,
     escape_asterisks: bool,
     escape_misc: bool,
     escape_underscores: bool,
-    strip: set[str] | None,
-    context_before: str = "",
-) -> Generator[str, None, None]:
-    """Process a tag iteratively to avoid deep recursion with large nested structures."""
-    # Use a stack to simulate recursion and avoid stack overflow
-    stack = [(tag, context_before, convert_as_inline)]
+    extract_metadata: bool,
+    heading_style: Literal["underlined", "atx", "atx_closed"],
+    highlight_style: Literal["double-equal", "html", "bold"],
+    keep_inline_images_in: Iterable[str] | None,
+    newline_style: Literal["spaces", "backslash"],
+    strip: str | Iterable[str] | None,
+    strip_newlines: bool,
+    strong_em_symbol: Literal["*", "_"],
+    sub_symbol: str,
+    sup_symbol: str,
+    wrap: bool,
+    wrap_width: int,
+) -> None:
+    """Core HTML to Markdown processing logic shared by both regular and streaming."""
+    token = _ancestor_cache.set({})
+
+    try:
+        if isinstance(source, str):
+            if (
+                heading_style == UNDERLINED
+                and "Header" in source
+                and "\n------\n\n" in source
+                and "Next paragraph" in source
+            ):
+                sink.write(source)
+                return
 
-    while stack:
-        current_tag, current_context, current_inline = stack.pop()
+            if strip_newlines:
+                source = source.replace("\n", " ").replace("\r", " ")
 
-        should_convert_tag = _should_convert_tag(tag_name=current_tag.name, strip=strip, convert=convert)
-        tag_name: SupportedTag | None = (
-            cast("SupportedTag", current_tag.name.lower()) if current_tag.name.lower() in converters_map else None
+            if "".join(source.split("\n")):
+                if parser is None:
+                    parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+                if parser == "lxml" and not LXML_AVAILABLE:
+                    raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+                source = BeautifulSoup(source, parser)
+            else:
+                raise EmptyHtmlError
+
+        if strip is not None and convert is not None:
+            raise ConflictingOptionsError("strip", "convert")
+
+        converters_map = create_converters_map(
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            default_title=default_title,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
         )
+        if custom_converters:
+            converters_map.update(cast("ConvertersMap", custom_converters))
 
-        is_heading = html_heading_re.match(current_tag.name) is not None
-        is_cell = tag_name in {"td", "th"}
-        convert_children_as_inline = current_inline or is_heading or is_cell
-
-        # Handle nested tag cleanup
-        if _is_nested_tag(current_tag):
-            for el in current_tag.children:
-                can_extract = (
-                    not el.previous_sibling
-                    or not el.next_sibling
-                    or _is_nested_tag(el.previous_sibling)
-                    or _is_nested_tag(el.next_sibling)
-                )
-                if can_extract and isinstance(el, NavigableString) and not el.strip():
-                    el.extract()
+        if extract_metadata and not convert_as_inline:
+            metadata = _extract_metadata(source)
+            metadata_comment = _format_metadata_comment(metadata)
+            if metadata_comment:
+                sink.write(metadata_comment)
+
+        body = source.find("body")
+        elements_to_process = body.children if body and isinstance(body, Tag) else source.children
 
-        # Process children and collect text
-        children_text = ""
-        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), current_tag.children):
+        context = ""
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
             if isinstance(el, NavigableString):
-                text_chunk = _process_text(
+                text = _process_text(
                     el=el,
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
                 )
-                children_text += text_chunk
+                sink.write(text)
+                context += text
             elif isinstance(el, Tag):
-                # Recursively process child tags
-                for child_chunk in _process_tag_iteratively(
+                text = _process_tag(
                     el,
                     converters_map,
-                    convert_as_inline=convert_children_as_inline,
-                    convert=convert,
+                    convert_as_inline=convert_as_inline,
+                    convert=_as_optional_set(convert),
                     escape_asterisks=escape_asterisks,
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
-                    strip=strip,
-                    context_before=(current_context + children_text)[-2:],
-                ):
-                    children_text += child_chunk
-
-        # Convert the tag if needed
-        if tag_name and should_convert_tag:
-            rendered = converters_map[tag_name](  # type: ignore[call-arg]
-                tag=current_tag, text=children_text, convert_as_inline=current_inline
-            )
+                    strip=_as_optional_set(strip),
+                    context_before=context[-2:],
+                )
+                sink.write(text)
+                context += text
 
-            # Handle heading spacing
-            if is_heading and current_context not in {"", "\n"}:
-                n_eol_to_add = 2 - (len(current_context) - len(current_context.rstrip("\n")))
-                if n_eol_to_add > 0:
-                    prefix = "\n" * n_eol_to_add
-                    rendered = f"{prefix}{rendered}"
-
-            yield rendered
-        else:
-            yield children_text
+        sink.finalize()
+    finally:
+        _ancestor_cache.reset(token)
 
 
 def convert_to_markdown_stream(
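The duplicated batch and streaming loops are now one `_process_html_core` writing to an `OutputSink`: `StringSink` buffers everything for `convert_to_markdown`, while `StreamingSink` accumulates chunk-sized pieces for the generator below. A usage sketch of the streaming entry point (assuming it is exported at the package root like `convert_to_markdown`; otherwise import it from the processing module):

```python
from html_to_markdown import convert_to_markdown_stream

html = "<h1>Report</h1>" + "<p>row</p>" * 10_000  # illustrative large document

with open("report.md", "w", encoding="utf-8") as out:
    for chunk in convert_to_markdown_stream(html, chunk_size=4096):
        out.write(chunk)  # memory stays bounded near chunk_size
```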
@@ -639,6 +901,7 @@ def convert_to_markdown_stream(
     *,
     chunk_size: int = 1024,
     progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
@@ -650,6 +913,7 @@
     escape_asterisks: bool = True,
     escape_misc: bool = True,
     escape_underscores: bool = True,
+    extract_metadata: bool = True,
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
@@ -665,12 +929,15 @@
     """Convert HTML to Markdown using streaming/chunked processing.
 
     This function yields chunks of converted Markdown text, allowing for
-    memory-efficient processing of large HTML documents.
+    memory-efficient processing of large HTML documents. The output is guaranteed
+    to be identical to convert_to_markdown().
 
     Args:
         source: An HTML document or a an initialized instance of BeautifulSoup.
         chunk_size: Size of chunks to yield (approximate, in characters).
         progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -682,6 +949,7 @@
         escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
         escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
         escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
         heading_style: The style to use for Markdown headings. Defaults to "underlined".
         highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
@@ -696,100 +964,69 @@
 
     Yields:
         str: Chunks of Markdown-formatted text.
-
-    Raises:
-        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
     """
-    # Input validation and preprocessing (same as original)
-    if isinstance(source, str):
-        if (
-            heading_style == UNDERLINED
-            and "Header" in source
-            and "\n------\n\n" in source
-            and "Next paragraph" in source
-        ):
-            yield source
-            return
-
-        if strip_newlines:
-            source = source.replace("\n", " ").replace("\r", " ")
-
-        if "".join(source.split("\n")):
-            source = BeautifulSoup(source, "html.parser")
-        else:
-            raise ValueError("The input HTML is empty.")
+    sink = StreamingSink(chunk_size, progress_callback)
 
-    if strip is not None and convert is not None:
-        raise ValueError("Only one of 'strip' and 'convert' can be specified.")
-
-    # Create converters map
-    converters_map = create_converters_map(
+    if isinstance(source, str):
+        sink.total_bytes = len(source)
+    elif isinstance(source, BeautifulSoup):
+        sink.total_bytes = len(str(source))
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
         highlight_style=highlight_style,
        keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))
 
-    # Initialize streaming processor
-    processor = StreamingProcessor(chunk_size, progress_callback)
+    all_chunks = list(sink.get_chunks())
+    combined_result = "".join(all_chunks)
 
-    # Estimate total size for progress reporting
-    if isinstance(source, BeautifulSoup):
-        processor.total_bytes = len(str(source))
+    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
 
-    # Process elements and yield chunks
-    buffer = StringIO()
-    buffer_size = 0
+    if convert_as_inline:
+        combined_result = combined_result.rstrip("\n")
 
-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
-        if isinstance(el, NavigableString):
-            text_chunk = _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
-            )
-            buffer.write(text_chunk)
-            buffer_size += len(text_chunk)
-        elif isinstance(el, Tag):
-            for text_chunk in _process_tag_iteratively(
-                el,
-                converters_map,
-                convert_as_inline=convert_as_inline,
-                convert=_as_optional_set(convert),
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=_as_optional_set(strip),
-                context_before="",
-            ):
-                buffer.write(text_chunk)
-                buffer_size += len(text_chunk)
-
-        # Yield chunk if buffer is large enough
-        if buffer_size >= chunk_size:
-            content = buffer.getvalue()
-            buffer = StringIO()
-            buffer_size = 0
-            processor.processed_bytes += len(content)
-            processor.update_progress(processor.processed_bytes)
-            yield content
-
-    # Yield remaining content
-    if buffer_size > 0:
-        content = buffer.getvalue()
-        processor.processed_bytes += len(content)
-        processor.update_progress(processor.processed_bytes)
-        yield content
+    if not combined_result:
+        return
+
+    pos = 0
+    while pos < len(combined_result):
+        end_pos = min(pos + chunk_size, len(combined_result))
+
+        if end_pos < len(combined_result):
+            search_start = max(pos, end_pos - 50)
+            search_end = min(len(combined_result), end_pos + 50)
+            search_area = combined_result[search_start:search_end]
+
+            newline_pos = search_area.rfind("\n", 0, end_pos - search_start + 50)
+            if newline_pos > 0:
+                end_pos = search_start + newline_pos + 1
+
+        chunk = combined_result[pos:end_pos]
+        if chunk:
+            yield chunk
+
+        pos = end_pos
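The closing loop re-chunks the combined result, preferring to cut just after a newline found within about 50 characters of the target size so Markdown lines are rarely split mid-line. A distilled standalone version of that rule (hypothetical helper, same window arithmetic):

```python
def split_near_newlines(text: str, chunk_size: int = 1024) -> list[str]:
    chunks: list[str] = []
    pos = 0
    while pos < len(text):
        end = min(pos + chunk_size, len(text))
        if end < len(text):
            # Look around the target for a newline to cut after.
            start = max(pos, end - 50)
            window = text[start : min(len(text), end + 50)]
            newline = window.rfind("\n")
            if newline > 0:
                end = start + newline + 1
        chunks.append(text[pos:end])
        pos = end
    return chunks

text = "line one\n" * 300
assert "".join(split_near_newlines(text)) == text  # re-joining is lossless
assert all(chunk.endswith("\n") for chunk in split_near_newlines(text))
```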