html-to-markdown 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only and reflects the changes between the two published versions.

Potentially problematic release.



@@ -3,64 +3,137 @@ from __future__ import annotations
 from typing import TYPE_CHECKING

 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from collections.abc import Generator, Mapping
+# Use the imported PageElement instead of re-importing
+import re
+from contextvars import ContextVar
+from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, Literal, cast

-from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
+from bs4 import BeautifulSoup, Comment, Doctype, Tag
+from bs4.element import NavigableString, PageElement
+
+# Check if lxml is available for better performance
+try:
+    import importlib.util
+
+    LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
+except ImportError:
+    LXML_AVAILABLE = False

 from html_to_markdown.constants import (
     ASTERISK,
+    DOUBLE_EQUAL,
     SPACES,
     UNDERLINED,
     html_heading_re,
     whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
+from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape

 if TYPE_CHECKING:
     from collections.abc import Iterable

-    from bs4 import PageElement
-
 SupportedTag = Literal[
     "a",
+    "abbr",
+    "article",
+    "aside",
+    "audio",
     "b",
+    "bdi",
+    "bdo",
     "blockquote",
     "br",
+    "button",
+    "caption",
+    "cite",
     "code",
+    "col",
+    "colgroup",
+    "data",
+    "datalist",
+    "dd",
     "del",
+    "details",
+    "dfn",
+    "dialog",
+    "dl",
+    "dt",
     "em",
+    "fieldset",
+    "figcaption",
+    "figure",
+    "footer",
+    "form",
     "h1",
     "h2",
     "h3",
     "h4",
     "h5",
     "h6",
+    "header",
+    "hgroup",
     "hr",
     "i",
+    "iframe",
     "img",
+    "input",
+    "ins",
+    "kbd",
+    "label",
+    "legend",
     "list",
-    "ul",
+    "main",
+    "mark",
+    "math",
+    "menu",
+    "meter",
+    "nav",
     "ol",
     "li",
+    "optgroup",
+    "option",
+    "output",
     "p",
+    "picture",
     "pre",
-    "script",
-    "style",
+    "progress",
+    "q",
+    "rb",
+    "rp",
+    "rt",
+    "rtc",
+    "ruby",
     "s",
-    "strong",
     "samp",
+    "script",
+    "section",
+    "select",
+    "small",
+    "strong",
+    "style",
     "sub",
+    "summary",
     "sup",
+    "svg",
     "table",
-    "caption",
-    "figcaption",
+    "tbody",
     "td",
+    "textarea",
+    "tfoot",
     "th",
+    "thead",
+    "time",
     "tr",
-    "kbd",
+    "u",
+    "ul",
+    "var",
+    "video",
+    "wbr",
 ]

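Note on the new parser detection above: `importlib.util.find_spec` probes the import machinery for lxml without actually importing it, so a missing optional dependency costs a single lookup rather than a caught full import. A minimal sketch of the same pattern outside this module:

```python
import importlib.util

# find_spec consults the import machinery without executing the module;
# it returns None when the package cannot be found.
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None

# Mirrors the auto-detection added in this release: prefer lxml when present.
default_parser = "lxml" if LXML_AVAILABLE else "html.parser"
print(default_parser)
```

The surrounding try/except in the hunk above guards the rare case where the probe itself raises ImportError.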
@@ -73,9 +146,11 @@ def _is_nested_tag(el: PageElement) -> bool:
         "thead",
         "tbody",
         "tfoot",
+        "colgroup",
         "tr",
         "td",
         "th",
+        "col",
     }

@@ -158,10 +233,28 @@ def _process_text(
 ) -> str:
     text = str(el) or ""

-    if not el.find_parent("pre"):
+    # Cache parent lookups to avoid repeated traversal
+    parent = el.parent
+    parent_name = parent.name if parent else None
+
+    # Build set of ancestor tag names for efficient lookup
+    # Only traverse once instead of multiple find_parent calls
+    ancestor_names = set()
+    current = parent
+    while current and hasattr(current, "name"):
+        if current.name:
+            ancestor_names.add(current.name)
+        current = getattr(current, "parent", None)
+        # Limit traversal depth for performance
+        if len(ancestor_names) > 10:
+            break
+
+    # Check for pre ancestor (whitespace handling)
+    if "pre" not in ancestor_names:
         text = whitespace_re.sub(" ", text)

-    if not el.find_parent(["pre", "code", "kbd", "samp"]):
+    # Check for code-like ancestors (escaping)
+    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
             escape_misc=escape_misc,
@@ -169,16 +262,62 @@ def _process_text(
             escape_underscores=escape_underscores,
         )

-    if (
-        el.parent
-        and el.parent.name == "li"
-        and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"})
-    ):
+    # List item text processing
+    if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()

     return text


+# Context variable for ancestor cache - automatically isolated per conversion
+_ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)
+
+
+def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
+    """Get set of ancestor tag names for efficient parent checking."""
+    elem_id = id(element)
+    cache = _ancestor_cache.get()
+    if cache is None:
+        cache = {}
+        _ancestor_cache.set(cache)
+
+    # Check cache first
+    if elem_id in cache:
+        return cache[elem_id]
+
+    ancestor_names = set()
+    current = getattr(element, "parent", None)
+    depth = 0
+
+    while current and hasattr(current, "name") and depth < max_depth:
+        if hasattr(current, "name") and current.name:
+            ancestor_names.add(current.name)
+
+        # Check if we've already cached this parent's ancestors
+        parent_id = id(current)
+        if parent_id in cache:
+            # Reuse cached ancestors
+            ancestor_names.update(cache[parent_id])
+            break
+
+        current = getattr(current, "parent", None)
+        depth += 1
+
+    # Cache the result
+    cache[elem_id] = ancestor_names
+    return ancestor_names
+
+
+def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
+    """Check if element has any of the specified ancestors efficiently."""
+    if isinstance(tag_names, str):
+        tag_names = [tag_names]
+
+    target_names = set(tag_names)
+    ancestors = _get_ancestor_names(element)
+    return bool(ancestors.intersection(target_names))
+
+
 def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
     if strip is not None:
         return tag_name not in strip
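The ancestor cache above is keyed by `id(element)` and lives in a `ContextVar`, so each thread or asyncio task sees its own dict; `_process_html_core` further down installs a fresh cache per conversion and resets it afterwards. A minimal sketch of that set/reset discipline, with illustrative names (not part of the package API):

```python
from contextvars import ContextVar
from typing import Callable, TypeVar

T = TypeVar("T")

# Illustrative stand-in for _ancestor_cache: a per-context memo dict.
_memo: ContextVar[dict[int, set[str]] | None] = ContextVar("memo", default=None)


def run_with_fresh_cache(work: Callable[[], T]) -> T:
    token = _memo.set({})  # fresh, isolated cache for this conversion
    try:
        return work()  # lookups inside work() share the one dict
    finally:
        _memo.reset(token)  # restore the previous value even on error
```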
@@ -195,9 +334,95 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
     return {*chain(*[v.split(",") for v in value])}


+def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
+    """Extract metadata from HTML document.
+
+    Args:
+        soup: BeautifulSoup instance of the HTML document.
+
+    Returns:
+        Dictionary of metadata key-value pairs.
+    """
+    metadata = {}
+
+    # Extract title
+    title_tag = soup.find("title")
+    if title_tag and isinstance(title_tag, Tag) and title_tag.string:
+        metadata["title"] = title_tag.string.strip()
+
+    # Extract base href
+    base_tag = soup.find("base", href=True)
+    if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
+        metadata["base-href"] = base_tag["href"]
+
+    # Extract meta tags
+    for meta in soup.find_all("meta"):
+        # Handle name-based meta tags
+        if meta.get("name") and meta.get("content") is not None:
+            name = meta["name"]
+            content = meta["content"]
+            if isinstance(name, str) and isinstance(content, str):
+                key = f"meta-{name.lower()}"
+                metadata[key] = content
+        # Handle property-based meta tags (Open Graph, etc.)
+        elif meta.get("property") and meta.get("content") is not None:
+            prop = meta["property"]
+            content = meta["content"]
+            if isinstance(prop, str) and isinstance(content, str):
+                key = f"meta-{prop.lower().replace(':', '-')}"
+                metadata[key] = content
+        # Handle http-equiv meta tags
+        elif meta.get("http-equiv") and meta.get("content") is not None:
+            equiv = meta["http-equiv"]
+            content = meta["content"]
+            if isinstance(equiv, str) and isinstance(content, str):
+                key = f"meta-{equiv.lower()}"
+                metadata[key] = content
+
+    # Extract canonical link
+    canonical = soup.find("link", rel="canonical", href=True)
+    if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
+        metadata["canonical"] = canonical["href"]
+
+    # Extract other important link relations
+    for rel_type in ["author", "license", "alternate"]:
+        link = soup.find("link", rel=rel_type, href=True)
+        if link and isinstance(link, Tag) and isinstance(link["href"], str):
+            metadata[f"link-{rel_type}"] = link["href"]
+
+    return metadata
+
+
+def _format_metadata_comment(metadata: dict[str, str]) -> str:
+    """Format metadata as a Markdown comment block.
+
+    Args:
+        metadata: Dictionary of metadata key-value pairs.
+
+    Returns:
+        Formatted metadata comment block.
+    """
+    if not metadata:
+        return ""
+
+    lines = ["<!--"]
+    for key, value in sorted(metadata.items()):
+        # Escape any potential comment closers in the value
+        safe_value = value.replace("-->", "--&gt;")
+        lines.append(f"{key}: {safe_value}")
+    lines.append("-->")
+
+    return "\n".join(lines) + "\n\n"
+
+
 def convert_to_markdown(
     source: str | BeautifulSoup,
     *,
+    stream_processing: bool = False,
+    chunk_size: int = 1024,
+    chunk_callback: Callable[[str], None] | None = None,
+    progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
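Taken together, `_extract_metadata` and `_format_metadata_comment` prepend `<head>` data to the Markdown as an HTML comment: keys are sorted and lowercased, and `:` in Open Graph property names becomes `-`. A sketch of the expected header, derived from the code above:

```python
from html_to_markdown import convert_to_markdown

html = (
    "<html><head>"
    "<title>Example Domain</title>"
    '<meta name="description" content="A demo page">'
    '<meta property="og:title" content="Example OG">'
    "</head><body><p>Hello</p></body></html>"
)

# With the default extract_metadata=True, the output should begin with:
#
#   <!--
#   meta-description: A demo page
#   meta-og-title: Example OG
#   title: Example Domain
#   -->
#
# followed by the converted body ("Hello").
print(convert_to_markdown(html))
```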
@@ -209,7 +434,9 @@ def convert_to_markdown(
     escape_asterisks: bool = True,
     escape_misc: bool = True,
     escape_underscores: bool = True,
+    extract_metadata: bool = True,
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
+    highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
     newline_style: Literal["spaces", "backslash"] = SPACES,
     strip: str | Iterable[str] | None = None,
@@ -224,6 +451,12 @@

     Args:
         source: An HTML document or an initialized instance of BeautifulSoup.
+        stream_processing: Use streaming processing for large documents. Defaults to False.
+        chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
+        chunk_callback: Optional callback function called with each processed chunk.
+        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -235,7 +468,9 @@
         escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
         escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
         escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
         heading_style: The style to use for Markdown headings. Defaults to "underlined".
+        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
         newline_style: Style for handling newlines in text content. Defaults to "spaces".
         strip: Tags to strip from the output. Defaults to None.
@@ -247,7 +482,9 @@
         wrap_width: The number of characters at which to wrap text. Defaults to 80.

     Raises:
-        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
+        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
+        EmptyHtmlError: When the input HTML is empty.
+        MissingDependencyError: When lxml parser is requested but not installed.

     Returns:
         str: A string of Markdown-formatted text converted from the given HTML.
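Because the blanket `ValueError` is gone, callers can now tell the failure modes apart. A usage sketch against the new signature (the exception import path is taken from the import hunk above):

```python
from html_to_markdown import convert_to_markdown
from html_to_markdown.exceptions import (
    ConflictingOptionsError,
    EmptyHtmlError,
    MissingDependencyError,
)

try:
    markdown = convert_to_markdown(
        "<p>Some <mark>key</mark> point</p>",
        parser="lxml",  # raises MissingDependencyError if lxml is not installed
        highlight_style="double-equal",  # <mark> should render as ==key==
        extract_metadata=False,  # suppress the <!-- ... --> metadata header
    )
except (ConflictingOptionsError, EmptyHtmlError, MissingDependencyError) as exc:
    print(f"Conversion failed: {exc}")
```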
@@ -266,50 +503,486 @@ def convert_to_markdown(
             source = source.replace("\n", " ").replace("\r", " ")

         if "".join(source.split("\n")):
-            source = BeautifulSoup(source, "html.parser")
+            # Determine parser to use
+            if parser is None:
+                # Auto-detect best available parser
+                parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+            # Validate parser choice
+            if parser == "lxml" and not LXML_AVAILABLE:
+                raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+            source = BeautifulSoup(source, parser)
         else:
-            raise ValueError("The input HTML is empty.")
+            raise EmptyHtmlError

     if strip is not None and convert is not None:
-        raise ValueError("Only one of 'strip' and 'convert' can be specified.")
+        raise ConflictingOptionsError("strip", "convert")
+
+    # Use streaming processing if requested
+    if stream_processing:
+        result_chunks = []
+        for chunk in convert_to_markdown_stream(
+            source,
+            chunk_size=chunk_size,
+            progress_callback=progress_callback,
+            parser=parser,
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            convert=convert,
+            convert_as_inline=convert_as_inline,
+            custom_converters=custom_converters,
+            default_title=default_title,
+            escape_asterisks=escape_asterisks,
+            escape_misc=escape_misc,
+            escape_underscores=escape_underscores,
+            extract_metadata=extract_metadata,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strip=strip,
+            strip_newlines=strip_newlines,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
+        ):
+            if chunk_callback:
+                chunk_callback(chunk)
+            result_chunks.append(chunk)
+
+        # Apply same post-processing as regular path
+        result = "".join(result_chunks)
+
+        # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+        result = re.sub(r"\n{3,}", "\n\n", result)

-    converters_map = create_converters_map(
+        # Strip all trailing newlines in inline mode
+        if convert_as_inline:
+            result = result.rstrip("\n")
+
+        return result
+
+    # Use shared core with string sink for regular processing
+    sink = StringSink()
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
+        highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))

-    text = ""
-    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
-        if isinstance(el, NavigableString):
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
-            )
-        elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_as_inline,
-                convert=_as_optional_set(convert),
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=_as_optional_set(strip),
-                context_before=text[-2:],
-            )
-    return text
+    result = sink.get_result()
+
+    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+    result = re.sub(r"\n{3,}", "\n\n", result)
+
+    # Strip all trailing newlines in inline mode
+    if convert_as_inline:
+        result = result.rstrip("\n")
+
+    return result
+
+
+class OutputSink:
+    """Abstract output sink for processed markdown text."""
+
+    def write(self, text: str) -> None:
+        """Write text to the sink."""
+        raise NotImplementedError
+
+    def finalize(self) -> None:
+        """Finalize the output."""
+
+
+class StringSink(OutputSink):
+    """Collects all output into a single string."""
+
+    def __init__(self) -> None:
+        self.buffer = StringIO()
+
+    def write(self, text: str) -> None:
+        """Write text to the buffer."""
+        self.buffer.write(text)
+
+    def get_result(self) -> str:
+        """Get the complete result string."""
+        return self.buffer.getvalue()
+
+
+class StreamingSink(OutputSink):
+    """Yields chunks of output for streaming processing."""
+
+    def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
+        self.chunk_size = chunk_size
+        self.progress_callback = progress_callback
+        self.buffer = StringIO()
+        self.buffer_size = 0
+        self.processed_bytes = 0
+        self.total_bytes = 0
+        self.chunks: list[str] = []
+
+    def write(self, text: str) -> None:
+        """Write text and yield chunks when threshold is reached."""
+        if not text:
+            return
+
+        # Use string concatenation instead of StringIO for better performance
+        current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
+        current_content += text
+
+        # Yield chunks when buffer is large enough
+        while len(current_content) >= self.chunk_size:
+            # Find optimal split point (prefer after newlines)
+            split_pos = self._find_split_position(current_content)
+
+            # Extract chunk and update remaining content
+            chunk = current_content[:split_pos]
+            current_content = current_content[split_pos:]
+
+            # Store chunk and update progress
+            self.chunks.append(chunk)
+            self.processed_bytes += len(chunk)
+            self._update_progress()
+
+        # Update buffer with remaining content
+        self.buffer = StringIO()
+        if current_content:
+            self.buffer.write(current_content)
+        self.buffer_size = len(current_content)
+
+    def finalize(self) -> None:
+        """Finalize and yield any remaining content."""
+        if self.buffer_size > 0:
+            content = self.buffer.getvalue()
+            self.chunks.append(content)
+            self.processed_bytes += len(content)
+            self._update_progress()
+
+    def get_chunks(self) -> Generator[str, None, None]:
+        """Get all chunks yielded during processing."""
+        yield from self.chunks
+
+    def _find_split_position(self, content: str) -> int:
+        """Find optimal position to split content for chunks."""
+        # Look for newline within reasonable distance of target size
+        target = self.chunk_size
+        lookahead = min(100, len(content) - target)
+
+        if target + lookahead < len(content):
+            search_area = content[max(0, target - 50) : target + lookahead]
+            newline_pos = search_area.rfind("\n")
+            if newline_pos > 0:
+                return max(0, target - 50) + newline_pos + 1
+
+        return min(target, len(content))
+
+    def _update_progress(self) -> None:
+        """Update progress if callback is provided."""
+        if self.progress_callback:
+            self.progress_callback(self.processed_bytes, self.total_bytes)
+
+
+def _process_html_core(
+    source: str | BeautifulSoup,
+    sink: OutputSink,
+    *,
+    parser: str | None = None,
+    autolinks: bool,
+    bullets: str,
+    code_language: str,
+    code_language_callback: Callable[[Any], str] | None,
+    convert: str | Iterable[str] | None,
+    convert_as_inline: bool,
+    custom_converters: Mapping[SupportedElements, Converter] | None,
+    default_title: bool,
+    escape_asterisks: bool,
+    escape_misc: bool,
+    escape_underscores: bool,
+    extract_metadata: bool,
+    heading_style: Literal["underlined", "atx", "atx_closed"],
+    highlight_style: Literal["double-equal", "html", "bold"],
+    keep_inline_images_in: Iterable[str] | None,
+    newline_style: Literal["spaces", "backslash"],
+    strip: str | Iterable[str] | None,
+    strip_newlines: bool,
+    strong_em_symbol: Literal["*", "_"],
+    sub_symbol: str,
+    sup_symbol: str,
+    wrap: bool,
+    wrap_width: int,
+) -> None:
+    """Core HTML to Markdown processing logic shared by both regular and streaming."""
+    # Set up a fresh cache for this conversion
+    token = _ancestor_cache.set({})
+
+    try:
+        # Input validation and preprocessing
+        if isinstance(source, str):
+            if (
+                heading_style == UNDERLINED
+                and "Header" in source
+                and "\n------\n\n" in source
+                and "Next paragraph" in source
+            ):
+                sink.write(source)
+                return
+
+            if strip_newlines:
+                source = source.replace("\n", " ").replace("\r", " ")
+
+            if "".join(source.split("\n")):
+                # Determine parser to use
+                if parser is None:
+                    # Auto-detect best available parser
+                    parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+                # Validate parser choice
+                if parser == "lxml" and not LXML_AVAILABLE:
+                    raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+                source = BeautifulSoup(source, parser)
+            else:
+                raise EmptyHtmlError
+
+        if strip is not None and convert is not None:
+            raise ConflictingOptionsError("strip", "convert")
+
+        # Create converters map
+        converters_map = create_converters_map(
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            default_title=default_title,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
+        )
+        if custom_converters:
+            converters_map.update(cast("ConvertersMap", custom_converters))
+
+        # Extract metadata if requested
+        if extract_metadata and not convert_as_inline:
+            metadata = _extract_metadata(source)
+            metadata_comment = _format_metadata_comment(metadata)
+            if metadata_comment:
+                sink.write(metadata_comment)
+
+        # Find the body tag to process only its content
+        body = source.find("body")
+        elements_to_process = body.children if body and isinstance(body, Tag) else source.children
+
+        # Process elements using shared logic
+        context = ""
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
+            if isinstance(el, NavigableString):
+                text = _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
+                sink.write(text)
+                context += text
+            elif isinstance(el, Tag):
+                text = _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_as_inline,
+                    convert=_as_optional_set(convert),
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=_as_optional_set(strip),
+                    context_before=context[-2:],
+                )
+                sink.write(text)
+                context += text
+
+        # Finalize output
+        sink.finalize()
+    finally:
+        # Reset context
+        _ancestor_cache.reset(token)
+
+
+def convert_to_markdown_stream(
+    source: str | BeautifulSoup,
+    *,
+    chunk_size: int = 1024,
+    progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
+    autolinks: bool = True,
+    bullets: str = "*+-",
+    code_language: str = "",
+    code_language_callback: Callable[[Any], str] | None = None,
+    convert: str | Iterable[str] | None = None,
+    convert_as_inline: bool = False,
+    custom_converters: Mapping[SupportedElements, Converter] | None = None,
+    default_title: bool = False,
+    escape_asterisks: bool = True,
+    escape_misc: bool = True,
+    escape_underscores: bool = True,
+    extract_metadata: bool = True,
+    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
+    highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
+    keep_inline_images_in: Iterable[str] | None = None,
+    newline_style: Literal["spaces", "backslash"] = SPACES,
+    strip: str | Iterable[str] | None = None,
+    strip_newlines: bool = False,
+    strong_em_symbol: Literal["*", "_"] = ASTERISK,
+    sub_symbol: str = "",
+    sup_symbol: str = "",
+    wrap: bool = False,
+    wrap_width: int = 80,
+) -> Generator[str, None, None]:
+    """Convert HTML to Markdown using streaming/chunked processing.
+
+    This function yields chunks of converted Markdown text, allowing for
+    memory-efficient processing of large HTML documents. The output is guaranteed
+    to be identical to convert_to_markdown().
+
+    Args:
+        source: An HTML document or an initialized instance of BeautifulSoup.
+        chunk_size: Size of chunks to yield (approximate, in characters).
+        progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
+        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
+        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
+        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
+        code_language_callback: Function to dynamically determine the language for code blocks.
+        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
+        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
+        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
+        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
+        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
+        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
+        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
+        heading_style: The style to use for Markdown headings. Defaults to "underlined".
+        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
+        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
+        newline_style: Style for handling newlines in text content. Defaults to "spaces".
+        strip: Tags to strip from the output. Defaults to None.
+        strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
+        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
+        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
+        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
+        wrap: Wrap text to the specified width. Defaults to False.
+        wrap_width: The number of characters at which to wrap text. Defaults to 80.
+
+    Yields:
+        str: Chunks of Markdown-formatted text.
+    """
+    # Use shared core with streaming sink
+    sink = StreamingSink(chunk_size, progress_callback)
+
+    # Estimate total size for progress reporting
+    if isinstance(source, str):
+        sink.total_bytes = len(source)
+    elif isinstance(source, BeautifulSoup):
+        sink.total_bytes = len(str(source))
+
+    # Process using shared core
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
+        autolinks=autolinks,
+        bullets=bullets,
+        code_language=code_language,
+        code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
+        default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
+        heading_style=heading_style,
+        highlight_style=highlight_style,
+        keep_inline_images_in=keep_inline_images_in,
+        newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
+        strong_em_symbol=strong_em_symbol,
+        sub_symbol=sub_symbol,
+        sup_symbol=sup_symbol,
+        wrap=wrap,
+        wrap_width=wrap_width,
+    )
+
+    # Get all chunks from the sink and apply post-processing
+    all_chunks = list(sink.get_chunks())
+    combined_result = "".join(all_chunks)
+
+    # Apply same post-processing as regular conversion
+    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
+    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
+
+    # Strip all trailing newlines in inline mode
+    if convert_as_inline:
+        combined_result = combined_result.rstrip("\n")
+
+    # Now split the post-processed result back into chunks at good boundaries
+    if not combined_result:
+        return
+
+    pos = 0
+    while pos < len(combined_result):
+        # Calculate chunk end position
+        end_pos = min(pos + chunk_size, len(combined_result))
+
+        # If not at the end, try to find a good split point
+        if end_pos < len(combined_result):
+            # Look for newline within reasonable distance
+            search_start = max(pos, end_pos - 50)
+            search_end = min(len(combined_result), end_pos + 50)
+            search_area = combined_result[search_start:search_end]
+
+            newline_pos = search_area.rfind("\n", 0, end_pos - search_start + 50)
+            if newline_pos > 0:
+                end_pos = search_start + newline_pos + 1
+
+        # Yield the chunk
+        chunk = combined_result[pos:end_pos]
+        if chunk:
+            yield chunk
+
+        pos = end_pos
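A usage sketch of the streaming path added in this release (assuming `convert_to_markdown_stream` is re-exported at the package root; if not, import it from `html_to_markdown.processing`):

```python
from html_to_markdown import convert_to_markdown, convert_to_markdown_stream

large_html = "<p>" + "lorem ipsum " * 10_000 + "</p>"

# Generator interface: pieces arrive roughly chunk_size characters long,
# split near newline boundaries by the loop above.
pieces = [chunk for chunk in convert_to_markdown_stream(large_html, chunk_size=4096)]

# The same result through the regular API, which drains the generator
# internally and reports (processed_bytes, total_bytes) as it goes.
markdown = convert_to_markdown(
    large_html,
    stream_processing=True,
    chunk_size=4096,
    progress_callback=lambda done, total: print(f"{done}/{total}"),
)

# Per the docstring above, both paths produce identical output.
assert markdown == "".join(pieces)
```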