html-to-markdown 1.5.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of html-to-markdown might be problematic. Click here for more details.

@@ -1,6 +1,24 @@
1
+ from html_to_markdown.exceptions import (
2
+ ConflictingOptionsError,
3
+ EmptyHtmlError,
4
+ HtmlToMarkdownError,
5
+ InvalidParserError,
6
+ MissingDependencyError,
7
+ )
8
+ from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
1
9
  from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
2
10
 
3
- # For backward compatibility and to maintain the existing API
4
11
  markdownify = convert_to_markdown
5
12
 
6
- __all__ = ["convert_to_markdown", "convert_to_markdown_stream", "markdownify"]
13
+ __all__ = [
14
+ "ConflictingOptionsError",
15
+ "EmptyHtmlError",
16
+ "HtmlToMarkdownError",
17
+ "InvalidParserError",
18
+ "MissingDependencyError",
19
+ "convert_to_markdown",
20
+ "convert_to_markdown_stream",
21
+ "create_preprocessor",
22
+ "markdownify",
23
+ "preprocess_html",
24
+ ]
html_to_markdown/cli.py CHANGED
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
191
191
 
192
192
  args = parser.parse_args(argv)
193
193
 
194
- # Prepare base arguments
195
194
  base_args = {
196
195
  "strip": args.strip,
197
196
  "convert": args.convert,
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
216
215
  "highlight_style": args.highlight_style,
217
216
  }
218
217
 
219
- # Add streaming parameters only if streaming is enabled
220
218
  if args.stream_processing:
221
219
  base_args["stream_processing"] = True
222
220
  base_args["chunk_size"] = args.chunk_size
223
221
 
224
- # Progress callback for CLI
225
222
  if args.show_progress:
226
223
 
227
224
  def progress_callback(processed: int, total: int) -> None:
228
225
  if total > 0:
229
226
  percent = (processed / total) * 100
230
- # Use sys.stderr to avoid ruff T201 error for progress output
227
+
231
228
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
232
229
  sys.stderr.flush()
233
230
 
@@ -137,7 +137,9 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
137
137
  """
138
138
 
139
139
  def implementation(*, tag: Tag, text: str) -> str:
140
- if tag.find_parent(["pre", "code", "kbd", "samp"]):
140
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
141
+
142
+ if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
141
143
  return text
142
144
 
143
145
  if not text.strip():
@@ -148,7 +150,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
148
150
  markup_suffix = "</" + markup_prefix[1:]
149
151
 
150
152
  prefix, suffix, text = chomp(text)
151
-
152
153
  return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
153
154
 
154
155
  return cast("Callable[[Tag, str], str]", implementation)
@@ -188,7 +189,6 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
188
189
  if not text:
189
190
  return ""
190
191
 
191
- # Handle cite attribute
192
192
  cite_url = tag.get("cite")
193
193
  quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
194
194
 
@@ -199,12 +199,12 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
199
199
 
200
200
 
201
201
  def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
202
- # Convert br to line break, but handle headings specially
203
- if tag.find_parent(["h1", "h2", "h3", "h4", "h5", "h6"]):
204
- return " " # Convert to space in headings
202
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
205
203
 
206
- # Always convert br to line break in other contexts
207
- _ = convert_as_inline # Unused but kept for API consistency
204
+ if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
205
+ return " "
206
+
207
+ _ = convert_as_inline
208
208
  return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
209
209
 
210
210
 
@@ -242,7 +242,7 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
242
242
  height = height if isinstance(height, str) else ""
243
243
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
244
244
  parent_name = tag.parent.name if tag.parent else ""
245
- # Always preserve images in table cells (td, th) by default
245
+
246
246
  default_preserve_in = ["td", "th"]
247
247
  preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
248
248
  if convert_as_inline and parent_name not in preserve_in:
@@ -276,12 +276,11 @@ def _convert_list(*, tag: Tag, text: str) -> str:
276
276
 
277
277
 
278
278
  def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
279
- # Check for task list (checkbox input)
280
279
  checkbox = tag.find("input", {"type": "checkbox"})
281
280
  if checkbox and isinstance(checkbox, Tag):
282
281
  checked = checkbox.get("checked") is not None
283
282
  checkbox_symbol = "[x]" if checked else "[ ]"
284
- # Remove the checkbox from the text content
283
+
285
284
  checkbox_text = text
286
285
  if checkbox.string:
287
286
  checkbox_text = text.replace(str(checkbox.string), "").strip()
@@ -671,12 +670,11 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
671
670
  if not text.strip():
672
671
  return ""
673
672
 
674
- # Escape any existing quotes in the text
675
673
  escaped_text = text.strip().replace('"', '\\"')
676
674
  return f'"{escaped_text}"'
677
675
 
678
676
 
679
- def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # noqa: C901
677
+ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
680
678
  """Convert HTML audio element preserving structure with fallback.
681
679
 
682
680
  Args:
@@ -687,23 +685,20 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
687
685
  Returns:
688
686
  The converted markdown text preserving audio element.
689
687
  """
690
- _ = convert_as_inline # Unused but kept for API consistency
688
+ _ = convert_as_inline
691
689
  src = tag.get("src", "")
692
690
 
693
- # Check for source elements if no src attribute
694
691
  if not src:
695
692
  source_tag = tag.find("source")
696
693
  if source_tag and isinstance(source_tag, Tag):
697
694
  src = source_tag.get("src", "")
698
695
 
699
- # Get other attributes
700
696
  controls = "controls" if tag.get("controls") is not None else ""
701
697
  autoplay = "autoplay" if tag.get("autoplay") is not None else ""
702
698
  loop = "loop" if tag.get("loop") is not None else ""
703
699
  muted = "muted" if tag.get("muted") is not None else ""
704
700
  preload = tag.get("preload", "")
705
701
 
706
- # Build attributes string
707
702
  attrs = []
708
703
  if src and isinstance(src, str) and src.strip():
709
704
  attrs.append(f'src="{src}"')
@@ -720,19 +715,17 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
720
715
 
721
716
  attrs_str = " ".join(attrs)
722
717
 
723
- # If there's fallback content, preserve it
724
718
  if text.strip():
725
719
  if attrs_str:
726
720
  return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
727
721
  return f"<audio>\n{text.strip()}\n</audio>\n\n"
728
722
 
729
- # Self-closing for no fallback content
730
723
  if attrs_str:
731
724
  return f"<audio {attrs_str} />\n\n"
732
725
  return "<audio />\n\n"
733
726
 
734
727
 
735
- def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # noqa: C901, PLR0912
728
+ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
736
729
  """Convert HTML video element preserving structure with fallback.
737
730
 
738
731
  Args:
@@ -743,16 +736,14 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
743
736
  Returns:
744
737
  The converted markdown text preserving video element.
745
738
  """
746
- _ = convert_as_inline # Unused but kept for API consistency
739
+ _ = convert_as_inline
747
740
  src = tag.get("src", "")
748
741
 
749
- # Check for source elements if no src attribute
750
742
  if not src:
751
743
  source_tag = tag.find("source")
752
744
  if source_tag and isinstance(source_tag, Tag):
753
745
  src = source_tag.get("src", "")
754
746
 
755
- # Get other attributes
756
747
  width = tag.get("width", "")
757
748
  height = tag.get("height", "")
758
749
  poster = tag.get("poster", "")
@@ -762,7 +753,6 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
762
753
  muted = "muted" if tag.get("muted") is not None else ""
763
754
  preload = tag.get("preload", "")
764
755
 
765
- # Build attributes string
766
756
  attrs = []
767
757
  if src and isinstance(src, str) and src.strip():
768
758
  attrs.append(f'src="{src}"')
@@ -785,19 +775,17 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
785
775
 
786
776
  attrs_str = " ".join(attrs)
787
777
 
788
- # If there's fallback content, preserve it
789
778
  if text.strip():
790
779
  if attrs_str:
791
780
  return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
792
781
  return f"<video>\n{text.strip()}\n</video>\n\n"
793
782
 
794
- # Self-closing for no fallback content
795
783
  if attrs_str:
796
784
  return f"<video {attrs_str} />\n\n"
797
785
  return "<video />\n\n"
798
786
 
799
787
 
800
- def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # noqa: C901, PLR0912
788
+ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
801
789
  """Convert HTML iframe element preserving structure.
802
790
 
803
791
  Args:
@@ -808,17 +796,16 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
808
796
  Returns:
809
797
  The converted markdown text preserving iframe element.
810
798
  """
811
- _ = text # Unused but kept for API consistency
812
- _ = convert_as_inline # Unused but kept for API consistency
799
+ _ = text
800
+ _ = convert_as_inline
813
801
  src = tag.get("src", "")
814
802
  width = tag.get("width", "")
815
803
  height = tag.get("height", "")
816
804
  title = tag.get("title", "")
817
805
  allow = tag.get("allow", "")
818
- sandbox = tag.get("sandbox") # Don't provide default
806
+ sandbox = tag.get("sandbox")
819
807
  loading = tag.get("loading", "")
820
808
 
821
- # Build attributes string
822
809
  attrs = []
823
810
  if src and isinstance(src, str) and src.strip():
824
811
  attrs.append(f'src="{src}"')
@@ -832,11 +819,9 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
832
819
  attrs.append(f'allow="{allow}"')
833
820
  if sandbox is not None:
834
821
  if isinstance(sandbox, list):
835
- # BeautifulSoup returns AttributeValueList for space-separated values
836
822
  if sandbox:
837
823
  attrs.append(f'sandbox="{" ".join(sandbox)}"')
838
824
  else:
839
- # Empty list means boolean attribute
840
825
  attrs.append("sandbox")
841
826
  elif isinstance(sandbox, str) and sandbox:
842
827
  attrs.append(f'sandbox="{sandbox}"')
@@ -847,7 +832,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
847
832
 
848
833
  attrs_str = " ".join(attrs)
849
834
 
850
- # iframes are typically self-closing in usage
851
835
  if attrs_str:
852
836
  return f"<iframe {attrs_str}></iframe>\n\n"
853
837
  return "<iframe></iframe>\n\n"
@@ -864,13 +848,12 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
864
848
  Returns:
865
849
  The converted markdown text with optional title annotation.
866
850
  """
867
- _ = convert_as_inline # Unused but kept for API consistency
851
+ _ = convert_as_inline
868
852
  if not text.strip():
869
853
  return ""
870
854
 
871
855
  title = tag.get("title")
872
856
  if title and isinstance(title, str) and title.strip():
873
- # Show abbreviation with title in parentheses
874
857
  return f"{text.strip()} ({title.strip()})"
875
858
 
876
859
  return text.strip()
@@ -887,13 +870,12 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
887
870
  Returns:
888
871
  The converted markdown text preserving time information.
889
872
  """
890
- _ = convert_as_inline # Unused but kept for API consistency
873
+ _ = convert_as_inline
891
874
  if not text.strip():
892
875
  return ""
893
876
 
894
877
  datetime_attr = tag.get("datetime")
895
878
  if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
896
- # Preserve machine-readable datetime in HTML
897
879
  return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
898
880
 
899
881
  return text.strip()
@@ -910,13 +892,12 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
910
892
  Returns:
911
893
  The converted markdown text preserving machine-readable data.
912
894
  """
913
- _ = convert_as_inline # Unused but kept for API consistency
895
+ _ = convert_as_inline
914
896
  if not text.strip():
915
897
  return ""
916
898
 
917
899
  value_attr = tag.get("value")
918
900
  if value_attr and isinstance(value_attr, str) and value_attr.strip():
919
- # Preserve machine-readable value in HTML
920
901
  return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
921
902
 
922
903
  return text.strip()
@@ -931,8 +912,8 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
931
912
  Returns:
932
913
  Empty string as wbr is just a break opportunity.
933
914
  """
934
- _ = convert_as_inline # Unused but kept for API consistency
935
- return "" # Word break opportunity doesn't produce visible output
915
+ _ = convert_as_inline
916
+ return ""
936
917
 
937
918
 
938
919
  def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
@@ -1029,7 +1010,7 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1029
1010
  return f"<label>{text.strip()}</label>\n\n"
1030
1011
 
1031
1012
 
1032
- def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str: # noqa: C901
1013
+ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
1033
1014
  """Convert HTML input element preserving all relevant attributes.
1034
1015
 
1035
1016
  Args:
@@ -1041,9 +1022,9 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str: # noq
1041
1022
  """
1042
1023
  input_type = tag.get("type", "text")
1043
1024
 
1044
- # Special handling for inputs in list items - let _convert_li handle checkboxes
1045
- # and ignore other input types in list items (legacy behavior)
1046
- if tag.find_parent("li"):
1025
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
1026
+
1027
+ if _has_ancestor(tag, "li"):
1047
1028
  return ""
1048
1029
 
1049
1030
  id_attr = tag.get("id", "")
@@ -1373,7 +1354,6 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1373
1354
 
1374
1355
  attrs = []
1375
1356
  if for_attr:
1376
- # BeautifulSoup returns space-separated attributes as lists
1377
1357
  for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
1378
1358
  if for_value.strip():
1379
1359
  attrs.append(f'for="{for_value}"')
@@ -1431,7 +1411,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1431
1411
  if not text.strip():
1432
1412
  return ""
1433
1413
 
1434
- # Ruby elements are always inline by nature
1435
1414
  return text.strip()
1436
1415
 
1437
1416
 
@@ -1448,7 +1427,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1448
1427
  if not text.strip():
1449
1428
  return ""
1450
1429
 
1451
- # Ruby base is the main text, pass through as-is
1452
1430
  return text.strip()
1453
1431
 
1454
1432
 
@@ -1463,21 +1441,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
1463
1441
  Returns:
1464
1442
  The converted markdown text with pronunciation in parentheses.
1465
1443
  """
1466
- # Handle empty rt elements - still need parentheses
1467
1444
  content = text.strip()
1468
1445
 
1469
- # Check if this rt is surrounded by rp elements (fallback parentheses)
1470
1446
  prev_sibling = tag.previous_sibling
1471
1447
  next_sibling = tag.next_sibling
1472
1448
 
1473
- # If surrounded by rp elements, don't add extra parentheses
1474
1449
  has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
1475
1450
  has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
1476
1451
 
1477
1452
  if has_rp_before and has_rp_after:
1478
- # Already has rp parentheses, just return the text
1479
1453
  return content
1480
- # Ruby text (pronunciation) shown in parentheses as fallback
1454
+
1481
1455
  return f"({content})"
1482
1456
 
1483
1457
 
@@ -1494,7 +1468,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1494
1468
  if not text.strip():
1495
1469
  return ""
1496
1470
 
1497
- # Ruby parentheses preserved for fallback compatibility
1498
1471
  return text.strip()
1499
1472
 
1500
1473
 
@@ -1511,7 +1484,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1511
1484
  if not text.strip():
1512
1485
  return ""
1513
1486
 
1514
- # Ruby text container, pass through content
1515
1487
  return text.strip()
1516
1488
 
1517
1489
 
@@ -1532,7 +1504,6 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1532
1504
  if not text.strip():
1533
1505
  return ""
1534
1506
 
1535
- # Get dialog attributes for preservation
1536
1507
  attrs = []
1537
1508
  if tag.get("open") is not None:
1538
1509
  attrs.append("open")
@@ -1561,7 +1532,6 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1561
1532
  if not text.strip():
1562
1533
  return ""
1563
1534
 
1564
- # Get menu attributes for preservation
1565
1535
  attrs = []
1566
1536
  if tag.get("type") and tag.get("type") != "list":
1567
1537
  attrs.append(f'type="{tag.get("type")}"')
@@ -1592,12 +1562,10 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1592
1562
  if convert_as_inline:
1593
1563
  return text
1594
1564
 
1595
- # Get figure attributes for preservation
1596
1565
  attrs = []
1597
1566
  if tag.get("id"):
1598
1567
  attrs.append(f'id="{tag.get("id")}"')
1599
1568
  if tag.get("class"):
1600
- # Handle class attribute which might be a list
1601
1569
  class_val = tag.get("class")
1602
1570
  if isinstance(class_val, list):
1603
1571
  class_val = " ".join(class_val)
@@ -1605,11 +1573,8 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1605
1573
 
1606
1574
  attrs_str = " " + " ".join(attrs) if attrs else ""
1607
1575
 
1608
- # Check if the figure contains only an image (common case)
1609
- # In that case, we might want to preserve the figure wrapper
1610
1576
  content = text.strip()
1611
1577
 
1612
- # If content already has proper spacing, don't add extra newlines
1613
1578
  if content.endswith("\n\n"):
1614
1579
  return f"<figure{attrs_str}>\n{content}</figure>\n\n"
1615
1580
 
@@ -1632,12 +1597,8 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1632
1597
  if not text.strip():
1633
1598
  return ""
1634
1599
 
1635
- # Preserve the semantic grouping of headings
1636
- # Add a marker to indicate this is a grouped heading
1637
1600
  content = text.strip()
1638
1601
 
1639
- # Remove excessive newlines between headings in the group
1640
- # Headings in hgroup should be visually closer together
1641
1602
  content = re.sub(r"\n{3,}", "\n\n", content)
1642
1603
 
1643
1604
  return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
@@ -1657,22 +1618,17 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1657
1618
  if not text.strip():
1658
1619
  return ""
1659
1620
 
1660
- # Find all source elements
1661
1621
  sources = tag.find_all("source")
1662
1622
  img = tag.find("img")
1663
1623
 
1664
1624
  if not img:
1665
- # No img fallback, just return the text content
1666
1625
  return text.strip()
1667
1626
 
1668
- # Get the primary image markdown (already converted)
1669
1627
  img_markdown = text.strip()
1670
1628
 
1671
- # If there are no sources, just return the image
1672
1629
  if not sources:
1673
1630
  return img_markdown
1674
1631
 
1675
- # Build a comment with source information for responsive images
1676
1632
  source_info = []
1677
1633
  for source in sources:
1678
1634
  srcset = source.get("srcset")
@@ -1688,14 +1644,12 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1688
1644
  source_info.append(info)
1689
1645
 
1690
1646
  if source_info and not convert_as_inline:
1691
- # Add picture source information as a comment
1692
1647
  sources_comment = "<!-- picture sources:\n"
1693
1648
  for info in source_info:
1694
1649
  sources_comment += f" {info}\n"
1695
1650
  sources_comment += "-->\n"
1696
1651
  return f"{sources_comment}{img_markdown}"
1697
1652
 
1698
- # In inline mode or no sources, just return the image
1699
1653
  return img_markdown
1700
1654
 
1701
1655
 
@@ -1711,23 +1665,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1711
1665
  The converted markdown text as an image reference.
1712
1666
  """
1713
1667
  if convert_as_inline:
1714
- # In inline mode, just return any text content
1715
1668
  return text.strip()
1716
1669
 
1717
- # Get SVG attributes
1718
1670
  title = tag.find("title")
1719
1671
  title_text = title.get_text().strip() if title else ""
1720
1672
 
1721
- # For inline SVG, we'll convert to a data URI
1722
- # First, we need to get the full SVG markup
1723
1673
  svg_markup = str(tag)
1724
1674
 
1725
- # Create a data URI
1726
1675
  svg_bytes = svg_markup.encode("utf-8")
1727
1676
  svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
1728
1677
  data_uri = f"data:image/svg+xml;base64,{svg_base64}"
1729
1678
 
1730
- # Use title as alt text, or "SVG Image" if no title
1731
1679
  alt_text = title_text or "SVG Image"
1732
1680
 
1733
1681
  return f"![{alt_text}]({data_uri})"
@@ -1747,17 +1695,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1747
1695
  if not text.strip():
1748
1696
  return ""
1749
1697
 
1750
- # Check if it's display math vs inline math
1751
1698
  display = tag.get("display") == "block"
1752
1699
 
1753
- # For now, preserve the MathML as a comment with the text representation
1754
- # This allows systems that understand MathML to process it
1755
1700
  math_comment = f"<!-- MathML: {tag!s} -->"
1756
1701
 
1757
1702
  if convert_as_inline or not display:
1758
- # Inline math - just the text with comment
1759
1703
  return f"{math_comment}{text.strip()}"
1760
- # Display math - on its own line
1704
+
1761
1705
  return f"\n\n{math_comment}\n{text.strip()}\n\n"
1762
1706
 
1763
1707
 
@@ -1823,8 +1767,8 @@ def create_converters_map(
1823
1767
  "aside": _wrapper(_convert_semantic_block),
1824
1768
  "audio": _wrapper(_convert_audio),
1825
1769
  "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
1826
- "bdi": _wrapper(_create_inline_converter("")), # Bidirectional isolation - pass through
1827
- "bdo": _wrapper(_create_inline_converter("")), # Bidirectional override - pass through
1770
+ "bdi": _wrapper(_create_inline_converter("")),
1771
+ "bdo": _wrapper(_create_inline_converter("")),
1828
1772
  "blockquote": _wrapper(partial(_convert_blockquote)),
1829
1773
  "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
1830
1774
  "button": _wrapper(_convert_button),
@@ -1838,7 +1782,7 @@ def create_converters_map(
1838
1782
  "dd": _wrapper(_convert_dd),
1839
1783
  "del": _wrapper(_create_inline_converter("~~")),
1840
1784
  "details": _wrapper(_convert_details),
1841
- "dfn": _wrapper(_create_inline_converter("*")), # Definition term - italic
1785
+ "dfn": _wrapper(_create_inline_converter("*")),
1842
1786
  "dialog": _wrapper(_convert_dialog),
1843
1787
  "dl": _wrapper(_convert_dl),
1844
1788
  "dt": _wrapper(_convert_dt),
@@ -1861,7 +1805,7 @@ def create_converters_map(
1861
1805
  "iframe": _wrapper(_convert_iframe),
1862
1806
  "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
1863
1807
  "input": _wrapper(_convert_input_enhanced),
1864
- "ins": _wrapper(_create_inline_converter("==")), # Inserted text - highlight style
1808
+ "ins": _wrapper(_create_inline_converter("==")),
1865
1809
  "kbd": _wrapper(_create_inline_converter("`")),
1866
1810
  "label": _wrapper(_convert_label),
1867
1811
  "legend": _wrapper(_convert_legend),
@@ -1898,7 +1842,7 @@ def create_converters_map(
1898
1842
  "script": _wrapper(lambda _: ""),
1899
1843
  "section": _wrapper(_convert_semantic_block),
1900
1844
  "select": _wrapper(_convert_select),
1901
- "small": _wrapper(_create_inline_converter("")), # Small text - pass through
1845
+ "small": _wrapper(_create_inline_converter("")),
1902
1846
  "strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
1903
1847
  "style": _wrapper(lambda _: ""),
1904
1848
  "sub": _wrapper(_create_inline_converter(sub_symbol)),
@@ -1914,9 +1858,9 @@ def create_converters_map(
1914
1858
  "thead": _wrapper(_convert_thead),
1915
1859
  "time": _wrapper(_convert_time),
1916
1860
  "tr": _wrapper(_convert_tr),
1917
- "u": _wrapper(_create_inline_converter("")), # Underlined text - pass through (no Markdown equivalent)
1861
+ "u": _wrapper(_create_inline_converter("")),
1918
1862
  "ul": _wrapper(_convert_list),
1919
- "var": _wrapper(_create_inline_converter("*")), # Variable - italic
1863
+ "var": _wrapper(_create_inline_converter("*")),
1920
1864
  "video": _wrapper(_convert_video),
1921
1865
  "wbr": _wrapper(_convert_wbr),
1922
1866
  }
@@ -0,0 +1,49 @@
1
+ """Custom exceptions for the html-to-markdown library."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
+ class HtmlToMarkdownError(Exception):
7
+ """Base exception for all html-to-markdown errors."""
8
+
9
+
10
+ class MissingDependencyError(HtmlToMarkdownError):
11
+ """Raised when an optional dependency is required but not installed."""
12
+
13
+ def __init__(self, dependency: str, install_command: str | None = None) -> None:
14
+ self.dependency = dependency
15
+ self.install_command = install_command
16
+
17
+ message = f"{dependency} is not installed."
18
+ if install_command:
19
+ message += f" Install with: {install_command}"
20
+
21
+ super().__init__(message)
22
+
23
+
24
+ class InvalidParserError(HtmlToMarkdownError):
25
+ """Raised when an invalid parser is specified."""
26
+
27
+ def __init__(self, parser: str, available_parsers: list[str]) -> None:
28
+ self.parser = parser
29
+ self.available_parsers = available_parsers
30
+
31
+ message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
32
+ super().__init__(message)
33
+
34
+
35
+ class EmptyHtmlError(HtmlToMarkdownError):
36
+ """Raised when the input HTML is empty."""
37
+
38
+ def __init__(self) -> None:
39
+ super().__init__("The input HTML is empty.")
40
+
41
+
42
+ class ConflictingOptionsError(HtmlToMarkdownError):
43
+ """Raised when conflicting options are specified."""
44
+
45
+ def __init__(self, option1: str, option2: str) -> None:
46
+ self.option1 = option1
47
+ self.option2 = option2
48
+
49
+ super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")