html-to-markdown 1.3.3__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -3,14 +3,19 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING
4
4
 
5
5
  if TYPE_CHECKING:
6
- from collections.abc import Mapping
6
+ from collections.abc import Generator, Mapping
7
+ # Use the imported PageElement instead of re-importing
8
+ from io import StringIO
9
+ import re
7
10
  from itertools import chain
8
11
  from typing import TYPE_CHECKING, Any, Callable, Literal, cast
9
12
 
10
- from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
13
+ from bs4 import BeautifulSoup, Comment, Doctype, Tag
14
+ from bs4.element import NavigableString, PageElement
11
15
 
12
16
  from html_to_markdown.constants import (
13
17
  ASTERISK,
18
+ DOUBLE_EQUAL,
14
19
  SPACES,
15
20
  UNDERLINED,
16
21
  html_heading_re,
@@ -22,45 +27,103 @@ from html_to_markdown.utils import escape
22
27
  if TYPE_CHECKING:
23
28
  from collections.abc import Iterable
24
29
 
25
- from bs4 import PageElement
26
-
27
30
  SupportedTag = Literal[
28
31
  "a",
32
+ "abbr",
33
+ "article",
34
+ "aside",
35
+ "audio",
29
36
  "b",
37
+ "bdi",
38
+ "bdo",
30
39
  "blockquote",
31
40
  "br",
41
+ "button",
42
+ "caption",
43
+ "cite",
32
44
  "code",
45
+ "col",
46
+ "colgroup",
47
+ "data",
48
+ "datalist",
49
+ "dd",
33
50
  "del",
51
+ "details",
52
+ "dfn",
53
+ "dialog",
54
+ "dl",
55
+ "dt",
34
56
  "em",
57
+ "fieldset",
58
+ "figcaption",
59
+ "figure",
60
+ "footer",
61
+ "form",
35
62
  "h1",
36
63
  "h2",
37
64
  "h3",
38
65
  "h4",
39
66
  "h5",
40
67
  "h6",
68
+ "header",
69
+ "hgroup",
41
70
  "hr",
42
71
  "i",
72
+ "iframe",
43
73
  "img",
74
+ "input",
75
+ "ins",
76
+ "kbd",
77
+ "label",
78
+ "legend",
44
79
  "list",
45
- "ul",
80
+ "main",
81
+ "mark",
82
+ "math",
83
+ "menu",
84
+ "meter",
85
+ "nav",
46
86
  "ol",
47
87
  "li",
88
+ "optgroup",
89
+ "option",
90
+ "output",
48
91
  "p",
92
+ "picture",
49
93
  "pre",
50
- "script",
51
- "style",
94
+ "progress",
95
+ "q",
96
+ "rb",
97
+ "rp",
98
+ "rt",
99
+ "rtc",
100
+ "ruby",
52
101
  "s",
53
- "strong",
54
102
  "samp",
103
+ "script",
104
+ "section",
105
+ "select",
106
+ "small",
107
+ "strong",
108
+ "style",
55
109
  "sub",
110
+ "summary",
56
111
  "sup",
112
+ "svg",
57
113
  "table",
58
- "caption",
59
- "figcaption",
114
+ "tbody",
60
115
  "td",
116
+ "textarea",
117
+ "tfoot",
61
118
  "th",
119
+ "thead",
120
+ "time",
62
121
  "tr",
63
- "kbd",
122
+ "u",
123
+ "ul",
124
+ "var",
125
+ "video",
126
+ "wbr",
64
127
  ]
65
128
 
66
129
 
@@ -73,9 +136,11 @@ def _is_nested_tag(el: PageElement) -> bool:
73
136
  "thead",
74
137
  "tbody",
75
138
  "tfoot",
139
+ "colgroup",
76
140
  "tr",
77
141
  "td",
78
142
  "th",
143
+ "col",
79
144
  }
80
145
 
81
146
 
@@ -195,9 +260,94 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
195
260
  return {*chain(*[v.split(",") for v in value])}
196
261
 
197
262
 
263
def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
    """Collect document-level metadata from a parsed HTML tree.

    The title, the ``<base href>``, ``<meta>`` tags (keyed by ``name``,
    ``property`` or ``http-equiv``) and a handful of ``<link rel=...>``
    targets are gathered into a flat string-to-string mapping.

    Args:
        soup: BeautifulSoup instance of the HTML document.

    Returns:
        Dictionary of metadata key-value pairs.
    """
    collected: dict[str, str] = {}

    # <title> is only recorded when it exposes a non-empty ``.string``.
    title = soup.find("title")
    if title and isinstance(title, Tag) and title.string:
        collected["title"] = title.string.strip()

    base = soup.find("base", href=True)
    if base and isinstance(base, Tag) and isinstance(base["href"], str):
        collected["base-href"] = base["href"]

    for meta in soup.find_all("meta"):
        # Every supported flavour of <meta> requires a ``content`` attribute.
        if meta.get("content") is None:
            continue
        body = meta["content"]

        # The first identifying attribute that is present wins, in this order.
        for attribute in ("name", "property", "http-equiv"):
            if not meta.get(attribute):
                continue
            label = meta[attribute]
            if isinstance(label, str) and isinstance(body, str):
                suffix = label.lower()
                if attribute == "property":
                    # Open Graph style properties ("og:title") use ':' as a separator.
                    suffix = suffix.replace(":", "-")
                collected[f"meta-{suffix}"] = body
            break

    # (rel value, output key) pairs; the canonical link keeps its bare key.
    link_targets = [
        ("canonical", "canonical"),
        ("author", "link-author"),
        ("license", "link-license"),
        ("alternate", "link-alternate"),
    ]
    for relation, output_key in link_targets:
        link = soup.find("link", rel=relation, href=True)
        if link and isinstance(link, Tag) and isinstance(link["href"], str):
            collected[output_key] = link["href"]

    return collected
320
+
321
+
322
+ def _format_metadata_comment(metadata: dict[str, str]) -> str:
323
+ """Format metadata as a Markdown comment block.
324
+
325
+ Args:
326
+ metadata: Dictionary of metadata key-value pairs.
327
+
328
+ Returns:
329
+ Formatted metadata comment block.
330
+ """
331
+ if not metadata:
332
+ return ""
333
+
334
+ lines = ["<!--"]
335
+ for key, value in sorted(metadata.items()):
336
+ # Escape any potential comment closers in the value
337
+ safe_value = value.replace("-->", "--&gt;")
338
+ lines.append(f"{key}: {safe_value}")
339
+ lines.append("-->")
340
+
341
+ return "\n".join(lines) + "\n\n"
342
+
343
+
198
344
  def convert_to_markdown(
199
345
  source: str | BeautifulSoup,
200
346
  *,
347
+ stream_processing: bool = False,
348
+ chunk_size: int = 1024,
349
+ chunk_callback: Callable[[str], None] | None = None,
350
+ progress_callback: Callable[[int, int], None] | None = None,
201
351
  autolinks: bool = True,
202
352
  bullets: str = "*+-",
203
353
  code_language: str = "",
@@ -209,10 +359,13 @@ def convert_to_markdown(
209
359
  escape_asterisks: bool = True,
210
360
  escape_misc: bool = True,
211
361
  escape_underscores: bool = True,
362
+ extract_metadata: bool = True,
212
363
  heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
364
+ highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
213
365
  keep_inline_images_in: Iterable[str] | None = None,
214
366
  newline_style: Literal["spaces", "backslash"] = SPACES,
215
367
  strip: str | Iterable[str] | None = None,
368
+ strip_newlines: bool = False,
216
369
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
217
370
  sub_symbol: str = "",
218
371
  sup_symbol: str = "",
@@ -223,6 +376,10 @@ def convert_to_markdown(
223
376
 
224
377
  Args:
225
378
  source: An HTML document or an initialized instance of BeautifulSoup.
379
+ stream_processing: Use streaming processing for large documents. Defaults to False.
380
+ chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
381
+ chunk_callback: Optional callback function called with each processed chunk.
382
+ progress_callback: Optional callback function called with (processed_bytes, total_bytes).
226
383
  autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
227
384
  bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
228
385
  code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -234,10 +391,13 @@ def convert_to_markdown(
234
391
  escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
235
392
  escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
236
393
  escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
394
+ extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
237
395
  heading_style: The style to use for Markdown headings. Defaults to "underlined".
396
+ highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
238
397
  keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
239
398
  newline_style: Style for handling newlines in text content. Defaults to "spaces".
240
399
  strip: Tags to strip from the output. Defaults to None.
400
+ strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
241
401
  strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
242
402
  sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
243
403
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +419,10 @@ def convert_to_markdown(
259
419
  ):
260
420
  return source
261
421
 
422
+ if strip_newlines:
423
+ # Replace all newlines with spaces before parsing
424
+ source = source.replace("\n", " ").replace("\r", " ")
425
+
262
426
  if "".join(source.split("\n")):
263
427
  source = BeautifulSoup(source, "html.parser")
264
428
  else:
@@ -267,6 +431,41 @@ def convert_to_markdown(
267
431
  if strip is not None and convert is not None:
268
432
  raise ValueError("Only one of 'strip' and 'convert' can be specified.")
269
433
 
434
+ # Use streaming processing if requested
435
+ if stream_processing:
436
+ result_chunks = []
437
+ for chunk in convert_to_markdown_stream(
438
+ source,
439
+ chunk_size=chunk_size,
440
+ progress_callback=progress_callback,
441
+ autolinks=autolinks,
442
+ bullets=bullets,
443
+ code_language=code_language,
444
+ code_language_callback=code_language_callback,
445
+ convert=convert,
446
+ convert_as_inline=convert_as_inline,
447
+ custom_converters=custom_converters,
448
+ default_title=default_title,
449
+ escape_asterisks=escape_asterisks,
450
+ escape_misc=escape_misc,
451
+ escape_underscores=escape_underscores,
452
+ heading_style=heading_style,
453
+ highlight_style=highlight_style,
454
+ keep_inline_images_in=keep_inline_images_in,
455
+ newline_style=newline_style,
456
+ strip=strip,
457
+ strip_newlines=strip_newlines,
458
+ strong_em_symbol=strong_em_symbol,
459
+ sub_symbol=sub_symbol,
460
+ sup_symbol=sup_symbol,
461
+ wrap=wrap,
462
+ wrap_width=wrap_width,
463
+ ):
464
+ if chunk_callback:
465
+ chunk_callback(chunk)
466
+ result_chunks.append(chunk)
467
+ return "".join(result_chunks)
468
+
270
469
  converters_map = create_converters_map(
271
470
  autolinks=autolinks,
272
471
  bullets=bullets,
@@ -274,6 +473,7 @@ def convert_to_markdown(
274
473
  code_language_callback=code_language_callback,
275
474
  default_title=default_title,
276
475
  heading_style=heading_style,
476
+ highlight_style=highlight_style,
277
477
  keep_inline_images_in=keep_inline_images_in,
278
478
  newline_style=newline_style,
279
479
  strong_em_symbol=strong_em_symbol,
@@ -285,8 +485,18 @@ def convert_to_markdown(
285
485
  if custom_converters:
286
486
  converters_map.update(cast("ConvertersMap", custom_converters))
287
487
 
488
+ # Extract metadata if requested
489
+ metadata_comment = ""
490
+ if extract_metadata and not convert_as_inline:
491
+ metadata = _extract_metadata(source)
492
+ metadata_comment = _format_metadata_comment(metadata)
493
+
494
+ # Find the body tag to process only its content
495
+ body = source.find("body")
496
+ elements_to_process = body.children if body and isinstance(body, Tag) else source.children
497
+
288
498
  text = ""
289
- for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
499
+ for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
290
500
  if isinstance(el, NavigableString):
291
501
  text += _process_text(
292
502
  el=el,
@@ -306,4 +516,280 @@ def convert_to_markdown(
306
516
  strip=_as_optional_set(strip),
307
517
  context_before=text[-2:],
308
518
  )
309
- return text
519
+
520
+ # Combine metadata and text
521
+ result = metadata_comment + text if metadata_comment else text
522
+
523
+ # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
524
+ result = re.sub(r"\n{3,}", "\n\n", result)
525
+
526
+ # Strip all trailing newlines in inline mode
527
+ if convert_as_inline:
528
+ result = result.rstrip("\n")
529
+
530
+ return result
531
+
532
+
533
class StreamingProcessor:
    """Progress bookkeeping for chunked HTML to Markdown conversion.

    Stores the configured chunk size together with running
    ``processed_bytes`` / ``total_bytes`` counters, and forwards counter
    updates to an optional callback.
    """

    def __init__(
        self,
        chunk_size: int = 1024,
        progress_callback: Callable[[int, int], None] | None = None,
    ) -> None:
        # Both counters start at zero; ``total_bytes`` is assigned by the caller.
        self.processed_bytes = 0
        self.total_bytes = 0
        self.progress_callback = progress_callback
        self.chunk_size = chunk_size

    def update_progress(self, processed: int) -> None:
        """Record the new processed count and notify the callback, if any."""
        self.processed_bytes = processed
        if not self.progress_callback:
            return
        self.progress_callback(self.processed_bytes, self.total_bytes)
551
+
552
+
553
def _process_tag_iteratively(
    tag: Tag,
    converters_map: ConvertersMap,
    *,
    convert: set[str] | None,
    convert_as_inline: bool = False,
    escape_asterisks: bool,
    escape_misc: bool,
    escape_underscores: bool,
    strip: set[str] | None,
    context_before: str = "",
) -> Generator[str, None, None]:
    """Convert a tag and all of its descendants to Markdown.

    Despite the name, child tags are handled by recursive calls to this
    function, so the recursion depth follows the nesting depth of the HTML.
    The generator yields exactly one string: the rendered tag, or the
    concatenated text of its children when the tag has no converter or is
    excluded by ``strip`` / ``convert``.

    Args:
        tag: The tag to convert.
        converters_map: Mapping from lower-cased tag names to converter callables.
        convert: Tag names to convert, forwarded to ``_should_convert_tag``.
        convert_as_inline: Whether this tag is rendered in inline mode.
        escape_asterisks: Forwarded to ``_process_text`` for text children.
        escape_misc: Forwarded to ``_process_text`` for text children.
        escape_underscores: Forwarded to ``_process_text`` for text children.
        strip: Tag names to strip, forwarded to ``_should_convert_tag``.
        context_before: Tail of the text emitted before this tag; used to
            decide how many newlines to insert ahead of a heading.

    Yields:
        The Markdown produced for ``tag``.
    """
    lowered_name = tag.name.lower()
    should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
    tag_name: SupportedTag | None = cast("SupportedTag", lowered_name) if lowered_name in converters_map else None

    is_heading = html_heading_re.match(tag.name) is not None
    is_cell = tag_name in {"td", "th"}
    # Headings and table cells always render their children inline.
    convert_children_as_inline = convert_as_inline or is_heading or is_cell

    if _is_nested_tag(tag):
        # Iterate over a snapshot: extract() removes the node from the parent's
        # child list, and mutating that list while iterating it directly would
        # skip the sibling that follows each removed node.
        for el in list(tag.children):
            can_extract = (
                not el.previous_sibling
                or not el.next_sibling
                or _is_nested_tag(el.previous_sibling)
                or _is_nested_tag(el.next_sibling)
            )
            if can_extract and isinstance(el, NavigableString) and not el.strip():
                el.extract()

    children_text = ""
    for el in tag.children:
        # Comment subclasses NavigableString, so it must be excluded first.
        if isinstance(el, (Comment, Doctype)):
            continue
        if isinstance(el, NavigableString):
            children_text += _process_text(
                el=el,
                escape_misc=escape_misc,
                escape_asterisks=escape_asterisks,
                escape_underscores=escape_underscores,
            )
        elif isinstance(el, Tag):
            for child_chunk in _process_tag_iteratively(
                el,
                converters_map,
                convert_as_inline=convert_children_as_inline,
                convert=convert,
                escape_asterisks=escape_asterisks,
                escape_misc=escape_misc,
                escape_underscores=escape_underscores,
                strip=strip,
                # The child only needs the last two characters produced so far.
                context_before=(context_before + children_text)[-2:],
            ):
                children_text += child_chunk

    if not (tag_name and should_convert_tag):
        # No converter applies: pass the children's text through unchanged.
        yield children_text
        return

    rendered = converters_map[tag_name](  # type: ignore[call-arg]
        tag=tag, text=children_text, convert_as_inline=convert_as_inline
    )

    # Top up to two newlines before a heading that follows other content.
    if is_heading and context_before not in {"", "\n"}:
        trailing_newlines = len(context_before) - len(context_before.rstrip("\n"))
        n_eol_to_add = 2 - trailing_newlines
        if n_eol_to_add > 0:
            prefix = "\n" * n_eol_to_add
            rendered = f"{prefix}{rendered}"

    yield rendered
635
+
636
+
637
def convert_to_markdown_stream(
    source: str | BeautifulSoup,
    *,
    chunk_size: int = 1024,
    progress_callback: Callable[[int, int], None] | None = None,
    autolinks: bool = True,
    bullets: str = "*+-",
    code_language: str = "",
    code_language_callback: Callable[[Any], str] | None = None,
    convert: str | Iterable[str] | None = None,
    convert_as_inline: bool = False,
    custom_converters: Mapping[SupportedElements, Converter] | None = None,
    default_title: bool = False,
    escape_asterisks: bool = True,
    escape_misc: bool = True,
    escape_underscores: bool = True,
    heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
    highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
    keep_inline_images_in: Iterable[str] | None = None,
    newline_style: Literal["spaces", "backslash"] = SPACES,
    strip: str | Iterable[str] | None = None,
    strip_newlines: bool = False,
    strong_em_symbol: Literal["*", "_"] = ASTERISK,
    sub_symbol: str = "",
    sup_symbol: str = "",
    wrap: bool = False,
    wrap_width: int = 80,
) -> Generator[str, None, None]:
    """Convert HTML to Markdown, yielding the output in chunks.

    The top-level elements of the document are converted one after another and
    their Markdown is accumulated in a buffer. The buffer is yielded whenever
    it has reached ``chunk_size`` characters at the end of a top-level
    element, so chunks can be larger than ``chunk_size``; whatever is left is
    yielded at the end.

    Because this is a generator, validation errors are raised when iteration
    starts, not when the function is called.

    Args:
        source: An HTML document or an initialized instance of BeautifulSoup.
        chunk_size: Size of chunks to yield (approximate, in characters).
        progress_callback: Optional callback function called with (processed_bytes, total_bytes)
            each time a chunk is yielded. The first value counts characters of Markdown yielded
            so far; the second is the length of the serialized HTML document.
        autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
        bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
        code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
        code_language_callback: Function to dynamically determine the language for code blocks.
        convert: A list of tag names to convert to Markdown. If None, all supported tags are converted.
        convert_as_inline: Treat the content as inline elements (no block elements like paragraphs). Defaults to False.
        custom_converters: A mapping of custom converters for specific HTML tags. Defaults to None.
        default_title: Use the default title when converting certain elements (e.g., links). Defaults to False.
        escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
        escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
        escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
        heading_style: The style to use for Markdown headings. Defaults to "underlined".
        highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
        keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
        newline_style: Style for handling newlines in text content. Defaults to "spaces".
        strip: Tags to strip from the output. Defaults to None.
        strip_newlines: Remove newlines from HTML input before processing. Only applied when
            ``source`` is a string. Defaults to False.
        strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
        sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
        sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
        wrap: Wrap text to the specified width. Defaults to False.
        wrap_width: The number of characters at which to wrap text. Defaults to 80.

    Yields:
        str: Chunks of Markdown-formatted text.

    Raises:
        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
    """
    if isinstance(source, str):
        # NOTE(review): input matching these literal markers is yielded back
        # untouched - presumably text that is already Markdown; confirm intent.
        if (
            heading_style == UNDERLINED
            and "Header" in source
            and "\n------\n\n" in source
            and "Next paragraph" in source
        ):
            yield source
            return

        if strip_newlines:
            source = source.replace("\n", " ").replace("\r", " ")

        if "".join(source.split("\n")):
            source = BeautifulSoup(source, "html.parser")
        else:
            raise ValueError("The input HTML is empty.")

    if strip is not None and convert is not None:
        raise ValueError("Only one of 'strip' and 'convert' can be specified.")

    converters_map = create_converters_map(
        autolinks=autolinks,
        bullets=bullets,
        code_language=code_language,
        code_language_callback=code_language_callback,
        default_title=default_title,
        heading_style=heading_style,
        highlight_style=highlight_style,
        keep_inline_images_in=keep_inline_images_in,
        newline_style=newline_style,
        strong_em_symbol=strong_em_symbol,
        sub_symbol=sub_symbol,
        sup_symbol=sup_symbol,
        wrap=wrap,
        wrap_width=wrap_width,
    )
    if custom_converters:
        converters_map.update(cast("ConvertersMap", custom_converters))

    # Normalize the tag filters once. Doing it per element repeats the work
    # and would see an already-consumed iterable from the second tag onward
    # if the caller passed a one-shot iterator.
    convert_set = _as_optional_set(convert)
    strip_set = _as_optional_set(strip)

    processor = StreamingProcessor(chunk_size, progress_callback)

    # The serialized HTML length serves as the progress total.
    if isinstance(source, BeautifulSoup):
        processor.total_bytes = len(str(source))

    buffer = StringIO()
    buffer_size = 0
    # Last two characters emitted so far, handed to the tag converter so that
    # heading spacing matches what convert_to_markdown produces.
    tail = ""

    for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
        if isinstance(el, NavigableString):
            text_chunk = _process_text(
                el=el,
                escape_misc=escape_misc,
                escape_asterisks=escape_asterisks,
                escape_underscores=escape_underscores,
            )
            buffer.write(text_chunk)
            buffer_size += len(text_chunk)
            tail = (tail + text_chunk)[-2:]
        elif isinstance(el, Tag):
            for text_chunk in _process_tag_iteratively(
                el,
                converters_map,
                convert_as_inline=convert_as_inline,
                convert=convert_set,
                escape_asterisks=escape_asterisks,
                escape_misc=escape_misc,
                escape_underscores=escape_underscores,
                strip=strip_set,
                context_before=tail,
            ):
                buffer.write(text_chunk)
                buffer_size += len(text_chunk)
                tail = (tail + text_chunk)[-2:]

        # Flush once the buffer has reached the requested chunk size.
        if buffer_size >= chunk_size:
            content = buffer.getvalue()
            buffer = StringIO()
            buffer_size = 0
            processor.update_progress(processor.processed_bytes + len(content))
            yield content

    # Emit whatever is left after the last element.
    if buffer_size > 0:
        content = buffer.getvalue()
        processor.update_progress(processor.processed_bytes + len(content))
        yield content