html-to-markdown 1.6.0__tar.gz → 1.8.0__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (21):
  1. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/PKG-INFO +2 -1
  2. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/__init__.py +3 -1
  3. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/cli.py +1 -4
  4. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/converters.py +23 -86
  5. html_to_markdown-1.8.0/html_to_markdown/preprocessor.py +407 -0
  6. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/processing.py +111 -67
  7. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/utils.py +12 -5
  8. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/PKG-INFO +2 -1
  9. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/SOURCES.txt +1 -0
  10. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/requires.txt +1 -0
  11. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/pyproject.toml +2 -2
  12. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/LICENSE +0 -0
  13. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/README.md +0 -0
  14. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/__main__.py +0 -0
  15. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/constants.py +0 -0
  16. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/exceptions.py +0 -0
  17. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown/py.typed +0 -0
  18. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  19. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
  20. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  21. {html_to_markdown-1.6.0 → html_to_markdown-1.8.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.6.0
3
+ Version: 1.8.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -32,6 +32,7 @@ Requires-Python: >=3.9
32
32
  Description-Content-Type: text/markdown
33
33
  License-File: LICENSE
34
34
  Requires-Dist: beautifulsoup4>=4.13.4
35
+ Requires-Dist: nh3>=0.2.21
35
36
  Provides-Extra: lxml
36
37
  Requires-Dist: lxml>=5; extra == "lxml"
37
38
  Dynamic: license-file
@@ -5,9 +5,9 @@ from html_to_markdown.exceptions import (
5
5
  InvalidParserError,
6
6
  MissingDependencyError,
7
7
  )
8
+ from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
8
9
  from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
9
10
 
10
- # For backward compatibility and to maintain the existing API
11
11
  markdownify = convert_to_markdown
12
12
 
13
13
  __all__ = [
@@ -18,5 +18,7 @@ __all__ = [
18
18
  "MissingDependencyError",
19
19
  "convert_to_markdown",
20
20
  "convert_to_markdown_stream",
21
+ "create_preprocessor",
21
22
  "markdownify",
23
+ "preprocess_html",
22
24
  ]
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
191
191
 
192
192
  args = parser.parse_args(argv)
193
193
 
194
- # Prepare base arguments
195
194
  base_args = {
196
195
  "strip": args.strip,
197
196
  "convert": args.convert,
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
216
215
  "highlight_style": args.highlight_style,
217
216
  }
218
217
 
219
- # Add streaming parameters only if streaming is enabled
220
218
  if args.stream_processing:
221
219
  base_args["stream_processing"] = True
222
220
  base_args["chunk_size"] = args.chunk_size
223
221
 
224
- # Progress callback for CLI
225
222
  if args.show_progress:
226
223
 
227
224
  def progress_callback(processed: int, total: int) -> None:
228
225
  if total > 0:
229
226
  percent = (processed / total) * 100
230
- # Use sys.stderr to avoid ruff T201 error for progress output
227
+
231
228
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
232
229
  sys.stderr.flush()
233
230
 
@@ -137,7 +137,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
137
137
  """
138
138
 
139
139
  def implementation(*, tag: Tag, text: str) -> str:
140
- # Check if we're in a code context - if so, don't apply markup
141
140
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
142
141
 
143
142
  if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
@@ -151,7 +150,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
151
150
  markup_suffix = "</" + markup_prefix[1:]
152
151
 
153
152
  prefix, suffix, text = chomp(text)
154
-
155
153
  return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
156
154
 
157
155
  return cast("Callable[[Tag, str], str]", implementation)
@@ -191,7 +189,6 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
191
189
  if not text:
192
190
  return ""
193
191
 
194
- # Handle cite attribute
195
192
  cite_url = tag.get("cite")
196
193
  quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
197
194
 
@@ -202,14 +199,12 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
202
199
 
203
200
 
204
201
  def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
205
- # Convert br to line break, but handle headings specially
206
202
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
207
203
 
208
204
  if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
209
- return " " # Convert to space in headings
205
+ return " "
210
206
 
211
- # Always convert br to line break in other contexts
212
- _ = convert_as_inline # Unused but kept for API consistency
207
+ _ = convert_as_inline
213
208
  return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
214
209
 
215
210
 
@@ -247,7 +242,7 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
247
242
  height = height if isinstance(height, str) else ""
248
243
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
249
244
  parent_name = tag.parent.name if tag.parent else ""
250
- # Always preserve images in table cells (td, th) by default
245
+
251
246
  default_preserve_in = ["td", "th"]
252
247
  preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
253
248
  if convert_as_inline and parent_name not in preserve_in:
@@ -281,12 +276,11 @@ def _convert_list(*, tag: Tag, text: str) -> str:
281
276
 
282
277
 
283
278
  def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
284
- # Check for task list (checkbox input)
285
279
  checkbox = tag.find("input", {"type": "checkbox"})
286
280
  if checkbox and isinstance(checkbox, Tag):
287
281
  checked = checkbox.get("checked") is not None
288
282
  checkbox_symbol = "[x]" if checked else "[ ]"
289
- # Remove the checkbox from the text content
283
+
290
284
  checkbox_text = text
291
285
  if checkbox.string:
292
286
  checkbox_text = text.replace(str(checkbox.string), "").strip()
@@ -676,7 +670,6 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
676
670
  if not text.strip():
677
671
  return ""
678
672
 
679
- # Escape any existing quotes in the text
680
673
  escaped_text = text.strip().replace('"', '\\"')
681
674
  return f'"{escaped_text}"'
682
675
 
@@ -692,23 +685,20 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
692
685
  Returns:
693
686
  The converted markdown text preserving audio element.
694
687
  """
695
- _ = convert_as_inline # Unused but kept for API consistency
688
+ _ = convert_as_inline
696
689
  src = tag.get("src", "")
697
690
 
698
- # Check for source elements if no src attribute
699
691
  if not src:
700
692
  source_tag = tag.find("source")
701
693
  if source_tag and isinstance(source_tag, Tag):
702
694
  src = source_tag.get("src", "")
703
695
 
704
- # Get other attributes
705
696
  controls = "controls" if tag.get("controls") is not None else ""
706
697
  autoplay = "autoplay" if tag.get("autoplay") is not None else ""
707
698
  loop = "loop" if tag.get("loop") is not None else ""
708
699
  muted = "muted" if tag.get("muted") is not None else ""
709
700
  preload = tag.get("preload", "")
710
701
 
711
- # Build attributes string
712
702
  attrs = []
713
703
  if src and isinstance(src, str) and src.strip():
714
704
  attrs.append(f'src="{src}"')
@@ -725,13 +715,11 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
725
715
 
726
716
  attrs_str = " ".join(attrs)
727
717
 
728
- # If there's fallback content, preserve it
729
718
  if text.strip():
730
719
  if attrs_str:
731
720
  return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
732
721
  return f"<audio>\n{text.strip()}\n</audio>\n\n"
733
722
 
734
- # Self-closing for no fallback content
735
723
  if attrs_str:
736
724
  return f"<audio {attrs_str} />\n\n"
737
725
  return "<audio />\n\n"
@@ -748,16 +736,14 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
748
736
  Returns:
749
737
  The converted markdown text preserving video element.
750
738
  """
751
- _ = convert_as_inline # Unused but kept for API consistency
739
+ _ = convert_as_inline
752
740
  src = tag.get("src", "")
753
741
 
754
- # Check for source elements if no src attribute
755
742
  if not src:
756
743
  source_tag = tag.find("source")
757
744
  if source_tag and isinstance(source_tag, Tag):
758
745
  src = source_tag.get("src", "")
759
746
 
760
- # Get other attributes
761
747
  width = tag.get("width", "")
762
748
  height = tag.get("height", "")
763
749
  poster = tag.get("poster", "")
@@ -767,7 +753,6 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
767
753
  muted = "muted" if tag.get("muted") is not None else ""
768
754
  preload = tag.get("preload", "")
769
755
 
770
- # Build attributes string
771
756
  attrs = []
772
757
  if src and isinstance(src, str) and src.strip():
773
758
  attrs.append(f'src="{src}"')
@@ -790,13 +775,11 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
790
775
 
791
776
  attrs_str = " ".join(attrs)
792
777
 
793
- # If there's fallback content, preserve it
794
778
  if text.strip():
795
779
  if attrs_str:
796
780
  return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
797
781
  return f"<video>\n{text.strip()}\n</video>\n\n"
798
782
 
799
- # Self-closing for no fallback content
800
783
  if attrs_str:
801
784
  return f"<video {attrs_str} />\n\n"
802
785
  return "<video />\n\n"
@@ -813,17 +796,16 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
813
796
  Returns:
814
797
  The converted markdown text preserving iframe element.
815
798
  """
816
- _ = text # Unused but kept for API consistency
817
- _ = convert_as_inline # Unused but kept for API consistency
799
+ _ = text
800
+ _ = convert_as_inline
818
801
  src = tag.get("src", "")
819
802
  width = tag.get("width", "")
820
803
  height = tag.get("height", "")
821
804
  title = tag.get("title", "")
822
805
  allow = tag.get("allow", "")
823
- sandbox = tag.get("sandbox") # Don't provide default
806
+ sandbox = tag.get("sandbox")
824
807
  loading = tag.get("loading", "")
825
808
 
826
- # Build attributes string
827
809
  attrs = []
828
810
  if src and isinstance(src, str) and src.strip():
829
811
  attrs.append(f'src="{src}"')
@@ -837,11 +819,9 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
837
819
  attrs.append(f'allow="{allow}"')
838
820
  if sandbox is not None:
839
821
  if isinstance(sandbox, list):
840
- # BeautifulSoup returns AttributeValueList for space-separated values
841
822
  if sandbox:
842
823
  attrs.append(f'sandbox="{" ".join(sandbox)}"')
843
824
  else:
844
- # Empty list means boolean attribute
845
825
  attrs.append("sandbox")
846
826
  elif isinstance(sandbox, str) and sandbox:
847
827
  attrs.append(f'sandbox="{sandbox}"')
@@ -852,7 +832,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
852
832
 
853
833
  attrs_str = " ".join(attrs)
854
834
 
855
- # iframes are typically self-closing in usage
856
835
  if attrs_str:
857
836
  return f"<iframe {attrs_str}></iframe>\n\n"
858
837
  return "<iframe></iframe>\n\n"
@@ -869,13 +848,12 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
869
848
  Returns:
870
849
  The converted markdown text with optional title annotation.
871
850
  """
872
- _ = convert_as_inline # Unused but kept for API consistency
851
+ _ = convert_as_inline
873
852
  if not text.strip():
874
853
  return ""
875
854
 
876
855
  title = tag.get("title")
877
856
  if title and isinstance(title, str) and title.strip():
878
- # Show abbreviation with title in parentheses
879
857
  return f"{text.strip()} ({title.strip()})"
880
858
 
881
859
  return text.strip()
@@ -892,13 +870,12 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
892
870
  Returns:
893
871
  The converted markdown text preserving time information.
894
872
  """
895
- _ = convert_as_inline # Unused but kept for API consistency
873
+ _ = convert_as_inline
896
874
  if not text.strip():
897
875
  return ""
898
876
 
899
877
  datetime_attr = tag.get("datetime")
900
878
  if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
901
- # Preserve machine-readable datetime in HTML
902
879
  return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
903
880
 
904
881
  return text.strip()
@@ -915,13 +892,12 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
915
892
  Returns:
916
893
  The converted markdown text preserving machine-readable data.
917
894
  """
918
- _ = convert_as_inline # Unused but kept for API consistency
895
+ _ = convert_as_inline
919
896
  if not text.strip():
920
897
  return ""
921
898
 
922
899
  value_attr = tag.get("value")
923
900
  if value_attr and isinstance(value_attr, str) and value_attr.strip():
924
- # Preserve machine-readable value in HTML
925
901
  return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
926
902
 
927
903
  return text.strip()
@@ -936,8 +912,8 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
936
912
  Returns:
937
913
  Empty string as wbr is just a break opportunity.
938
914
  """
939
- _ = convert_as_inline # Unused but kept for API consistency
940
- return "" # Word break opportunity doesn't produce visible output
915
+ _ = convert_as_inline
916
+ return ""
941
917
 
942
918
 
943
919
  def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
@@ -1046,8 +1022,6 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
1046
1022
  """
1047
1023
  input_type = tag.get("type", "text")
1048
1024
 
1049
- # Special handling for inputs in list items - let _convert_li handle checkboxes
1050
- # and ignore other input types in list items (legacy behavior)
1051
1025
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
1052
1026
 
1053
1027
  if _has_ancestor(tag, "li"):
@@ -1380,7 +1354,6 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1380
1354
 
1381
1355
  attrs = []
1382
1356
  if for_attr:
1383
- # BeautifulSoup returns space-separated attributes as lists
1384
1357
  for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
1385
1358
  if for_value.strip():
1386
1359
  attrs.append(f'for="{for_value}"')
@@ -1438,7 +1411,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1438
1411
  if not text.strip():
1439
1412
  return ""
1440
1413
 
1441
- # Ruby elements are always inline by nature
1442
1414
  return text.strip()
1443
1415
 
1444
1416
 
@@ -1455,7 +1427,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1455
1427
  if not text.strip():
1456
1428
  return ""
1457
1429
 
1458
- # Ruby base is the main text, pass through as-is
1459
1430
  return text.strip()
1460
1431
 
1461
1432
 
@@ -1470,21 +1441,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
1470
1441
  Returns:
1471
1442
  The converted markdown text with pronunciation in parentheses.
1472
1443
  """
1473
- # Handle empty rt elements - still need parentheses
1474
1444
  content = text.strip()
1475
1445
 
1476
- # Check if this rt is surrounded by rp elements (fallback parentheses)
1477
1446
  prev_sibling = tag.previous_sibling
1478
1447
  next_sibling = tag.next_sibling
1479
1448
 
1480
- # If surrounded by rp elements, don't add extra parentheses
1481
1449
  has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
1482
1450
  has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
1483
1451
 
1484
1452
  if has_rp_before and has_rp_after:
1485
- # Already has rp parentheses, just return the text
1486
1453
  return content
1487
- # Ruby text (pronunciation) shown in parentheses as fallback
1454
+
1488
1455
  return f"({content})"
1489
1456
 
1490
1457
 
@@ -1501,7 +1468,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1501
1468
  if not text.strip():
1502
1469
  return ""
1503
1470
 
1504
- # Ruby parentheses preserved for fallback compatibility
1505
1471
  return text.strip()
1506
1472
 
1507
1473
 
@@ -1518,7 +1484,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1518
1484
  if not text.strip():
1519
1485
  return ""
1520
1486
 
1521
- # Ruby text container, pass through content
1522
1487
  return text.strip()
1523
1488
 
1524
1489
 
@@ -1539,7 +1504,6 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1539
1504
  if not text.strip():
1540
1505
  return ""
1541
1506
 
1542
- # Get dialog attributes for preservation
1543
1507
  attrs = []
1544
1508
  if tag.get("open") is not None:
1545
1509
  attrs.append("open")
@@ -1568,7 +1532,6 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1568
1532
  if not text.strip():
1569
1533
  return ""
1570
1534
 
1571
- # Get menu attributes for preservation
1572
1535
  attrs = []
1573
1536
  if tag.get("type") and tag.get("type") != "list":
1574
1537
  attrs.append(f'type="{tag.get("type")}"')
@@ -1599,12 +1562,10 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1599
1562
  if convert_as_inline:
1600
1563
  return text
1601
1564
 
1602
- # Get figure attributes for preservation
1603
1565
  attrs = []
1604
1566
  if tag.get("id"):
1605
1567
  attrs.append(f'id="{tag.get("id")}"')
1606
1568
  if tag.get("class"):
1607
- # Handle class attribute which might be a list
1608
1569
  class_val = tag.get("class")
1609
1570
  if isinstance(class_val, list):
1610
1571
  class_val = " ".join(class_val)
@@ -1612,11 +1573,8 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1612
1573
 
1613
1574
  attrs_str = " " + " ".join(attrs) if attrs else ""
1614
1575
 
1615
- # Check if the figure contains only an image (common case)
1616
- # In that case, we might want to preserve the figure wrapper
1617
1576
  content = text.strip()
1618
1577
 
1619
- # If content already has proper spacing, don't add extra newlines
1620
1578
  if content.endswith("\n\n"):
1621
1579
  return f"<figure{attrs_str}>\n{content}</figure>\n\n"
1622
1580
 
@@ -1639,12 +1597,8 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1639
1597
  if not text.strip():
1640
1598
  return ""
1641
1599
 
1642
- # Preserve the semantic grouping of headings
1643
- # Add a marker to indicate this is a grouped heading
1644
1600
  content = text.strip()
1645
1601
 
1646
- # Remove excessive newlines between headings in the group
1647
- # Headings in hgroup should be visually closer together
1648
1602
  content = re.sub(r"\n{3,}", "\n\n", content)
1649
1603
 
1650
1604
  return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
@@ -1664,22 +1618,17 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1664
1618
  if not text.strip():
1665
1619
  return ""
1666
1620
 
1667
- # Find all source elements
1668
1621
  sources = tag.find_all("source")
1669
1622
  img = tag.find("img")
1670
1623
 
1671
1624
  if not img:
1672
- # No img fallback, just return the text content
1673
1625
  return text.strip()
1674
1626
 
1675
- # Get the primary image markdown (already converted)
1676
1627
  img_markdown = text.strip()
1677
1628
 
1678
- # If there are no sources, just return the image
1679
1629
  if not sources:
1680
1630
  return img_markdown
1681
1631
 
1682
- # Build a comment with source information for responsive images
1683
1632
  source_info = []
1684
1633
  for source in sources:
1685
1634
  srcset = source.get("srcset")
@@ -1695,14 +1644,12 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1695
1644
  source_info.append(info)
1696
1645
 
1697
1646
  if source_info and not convert_as_inline:
1698
- # Add picture source information as a comment
1699
1647
  sources_comment = "<!-- picture sources:\n"
1700
1648
  for info in source_info:
1701
1649
  sources_comment += f" {info}\n"
1702
1650
  sources_comment += "-->\n"
1703
1651
  return f"{sources_comment}{img_markdown}"
1704
1652
 
1705
- # In inline mode or no sources, just return the image
1706
1653
  return img_markdown
1707
1654
 
1708
1655
 
@@ -1718,23 +1665,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1718
1665
  The converted markdown text as an image reference.
1719
1666
  """
1720
1667
  if convert_as_inline:
1721
- # In inline mode, just return any text content
1722
1668
  return text.strip()
1723
1669
 
1724
- # Get SVG attributes
1725
1670
  title = tag.find("title")
1726
1671
  title_text = title.get_text().strip() if title else ""
1727
1672
 
1728
- # For inline SVG, we'll convert to a data URI
1729
- # First, we need to get the full SVG markup
1730
1673
  svg_markup = str(tag)
1731
1674
 
1732
- # Create a data URI
1733
1675
  svg_bytes = svg_markup.encode("utf-8")
1734
1676
  svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
1735
1677
  data_uri = f"data:image/svg+xml;base64,{svg_base64}"
1736
1678
 
1737
- # Use title as alt text, or "SVG Image" if no title
1738
1679
  alt_text = title_text or "SVG Image"
1739
1680
 
1740
1681
  return f"![{alt_text}]({data_uri})"
@@ -1754,17 +1695,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1754
1695
  if not text.strip():
1755
1696
  return ""
1756
1697
 
1757
- # Check if it's display math vs inline math
1758
1698
  display = tag.get("display") == "block"
1759
1699
 
1760
- # For now, preserve the MathML as a comment with the text representation
1761
- # This allows systems that understand MathML to process it
1762
1700
  math_comment = f"<!-- MathML: {tag!s} -->"
1763
1701
 
1764
1702
  if convert_as_inline or not display:
1765
- # Inline math - just the text with comment
1766
1703
  return f"{math_comment}{text.strip()}"
1767
- # Display math - on its own line
1704
+
1768
1705
  return f"\n\n{math_comment}\n{text.strip()}\n\n"
1769
1706
 
1770
1707
 
@@ -1830,8 +1767,8 @@ def create_converters_map(
1830
1767
  "aside": _wrapper(_convert_semantic_block),
1831
1768
  "audio": _wrapper(_convert_audio),
1832
1769
  "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
1833
- "bdi": _wrapper(_create_inline_converter("")), # Bidirectional isolation - pass through
1834
- "bdo": _wrapper(_create_inline_converter("")), # Bidirectional override - pass through
1770
+ "bdi": _wrapper(_create_inline_converter("")),
1771
+ "bdo": _wrapper(_create_inline_converter("")),
1835
1772
  "blockquote": _wrapper(partial(_convert_blockquote)),
1836
1773
  "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
1837
1774
  "button": _wrapper(_convert_button),
@@ -1845,7 +1782,7 @@ def create_converters_map(
1845
1782
  "dd": _wrapper(_convert_dd),
1846
1783
  "del": _wrapper(_create_inline_converter("~~")),
1847
1784
  "details": _wrapper(_convert_details),
1848
- "dfn": _wrapper(_create_inline_converter("*")), # Definition term - italic
1785
+ "dfn": _wrapper(_create_inline_converter("*")),
1849
1786
  "dialog": _wrapper(_convert_dialog),
1850
1787
  "dl": _wrapper(_convert_dl),
1851
1788
  "dt": _wrapper(_convert_dt),
@@ -1868,7 +1805,7 @@ def create_converters_map(
1868
1805
  "iframe": _wrapper(_convert_iframe),
1869
1806
  "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
1870
1807
  "input": _wrapper(_convert_input_enhanced),
1871
- "ins": _wrapper(_create_inline_converter("==")), # Inserted text - highlight style
1808
+ "ins": _wrapper(_create_inline_converter("==")),
1872
1809
  "kbd": _wrapper(_create_inline_converter("`")),
1873
1810
  "label": _wrapper(_convert_label),
1874
1811
  "legend": _wrapper(_convert_legend),
@@ -1905,7 +1842,7 @@ def create_converters_map(
1905
1842
  "script": _wrapper(lambda _: ""),
1906
1843
  "section": _wrapper(_convert_semantic_block),
1907
1844
  "select": _wrapper(_convert_select),
1908
- "small": _wrapper(_create_inline_converter("")), # Small text - pass through
1845
+ "small": _wrapper(_create_inline_converter("")),
1909
1846
  "strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
1910
1847
  "style": _wrapper(lambda _: ""),
1911
1848
  "sub": _wrapper(_create_inline_converter(sub_symbol)),
@@ -1921,9 +1858,9 @@ def create_converters_map(
1921
1858
  "thead": _wrapper(_convert_thead),
1922
1859
  "time": _wrapper(_convert_time),
1923
1860
  "tr": _wrapper(_convert_tr),
1924
- "u": _wrapper(_create_inline_converter("")), # Underlined text - pass through (no Markdown equivalent)
1861
+ "u": _wrapper(_create_inline_converter("")),
1925
1862
  "ul": _wrapper(_convert_list),
1926
- "var": _wrapper(_create_inline_converter("*")), # Variable - italic
1863
+ "var": _wrapper(_create_inline_converter("*")),
1927
1864
  "video": _wrapper(_convert_video),
1928
1865
  "wbr": _wrapper(_convert_wbr),
1929
1866
  }