html-to-markdown 1.5.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +20 -2
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +36 -92
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +447 -210
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA +50 -13
- html_to_markdown-1.8.0.dist-info/RECORD +16 -0
- html_to_markdown-1.5.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -1,6 +1,24 @@
|
|
|
1
|
+
from html_to_markdown.exceptions import (
|
|
2
|
+
ConflictingOptionsError,
|
|
3
|
+
EmptyHtmlError,
|
|
4
|
+
HtmlToMarkdownError,
|
|
5
|
+
InvalidParserError,
|
|
6
|
+
MissingDependencyError,
|
|
7
|
+
)
|
|
8
|
+
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
|
|
1
9
|
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
2
10
|
|
|
3
|
-
# For backward compatibility and to maintain the existing API
|
|
4
11
|
markdownify = convert_to_markdown
|
|
5
12
|
|
|
6
|
-
__all__ = [
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ConflictingOptionsError",
|
|
15
|
+
"EmptyHtmlError",
|
|
16
|
+
"HtmlToMarkdownError",
|
|
17
|
+
"InvalidParserError",
|
|
18
|
+
"MissingDependencyError",
|
|
19
|
+
"convert_to_markdown",
|
|
20
|
+
"convert_to_markdown_stream",
|
|
21
|
+
"create_preprocessor",
|
|
22
|
+
"markdownify",
|
|
23
|
+
"preprocess_html",
|
|
24
|
+
]
|
html_to_markdown/cli.py
CHANGED
|
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
|
|
|
191
191
|
|
|
192
192
|
args = parser.parse_args(argv)
|
|
193
193
|
|
|
194
|
-
# Prepare base arguments
|
|
195
194
|
base_args = {
|
|
196
195
|
"strip": args.strip,
|
|
197
196
|
"convert": args.convert,
|
|
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
|
|
|
216
215
|
"highlight_style": args.highlight_style,
|
|
217
216
|
}
|
|
218
217
|
|
|
219
|
-
# Add streaming parameters only if streaming is enabled
|
|
220
218
|
if args.stream_processing:
|
|
221
219
|
base_args["stream_processing"] = True
|
|
222
220
|
base_args["chunk_size"] = args.chunk_size
|
|
223
221
|
|
|
224
|
-
# Progress callback for CLI
|
|
225
222
|
if args.show_progress:
|
|
226
223
|
|
|
227
224
|
def progress_callback(processed: int, total: int) -> None:
|
|
228
225
|
if total > 0:
|
|
229
226
|
percent = (processed / total) * 100
|
|
230
|
-
|
|
227
|
+
|
|
231
228
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
229
|
sys.stderr.flush()
|
|
233
230
|
|
html_to_markdown/converters.py
CHANGED
|
@@ -137,7 +137,9 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
139
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
140
|
-
|
|
140
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
141
|
+
|
|
142
|
+
if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
|
|
141
143
|
return text
|
|
142
144
|
|
|
143
145
|
if not text.strip():
|
|
@@ -148,7 +150,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
148
150
|
markup_suffix = "</" + markup_prefix[1:]
|
|
149
151
|
|
|
150
152
|
prefix, suffix, text = chomp(text)
|
|
151
|
-
|
|
152
153
|
return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
|
|
153
154
|
|
|
154
155
|
return cast("Callable[[Tag, str], str]", implementation)
|
|
@@ -188,7 +189,6 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
188
189
|
if not text:
|
|
189
190
|
return ""
|
|
190
191
|
|
|
191
|
-
# Handle cite attribute
|
|
192
192
|
cite_url = tag.get("cite")
|
|
193
193
|
quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
|
|
194
194
|
|
|
@@ -199,12 +199,12 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
199
199
|
|
|
200
200
|
|
|
201
201
|
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
202
|
-
|
|
203
|
-
if tag.find_parent(["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
204
|
-
return " " # Convert to space in headings
|
|
202
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
205
203
|
|
|
206
|
-
|
|
207
|
-
|
|
204
|
+
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
205
|
+
return " "
|
|
206
|
+
|
|
207
|
+
_ = convert_as_inline
|
|
208
208
|
return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
|
|
209
209
|
|
|
210
210
|
|
|
@@ -242,7 +242,7 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
242
242
|
height = height if isinstance(height, str) else ""
|
|
243
243
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
244
244
|
parent_name = tag.parent.name if tag.parent else ""
|
|
245
|
-
|
|
245
|
+
|
|
246
246
|
default_preserve_in = ["td", "th"]
|
|
247
247
|
preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
|
|
248
248
|
if convert_as_inline and parent_name not in preserve_in:
|
|
@@ -276,12 +276,11 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
276
276
|
|
|
277
277
|
|
|
278
278
|
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
279
|
-
# Check for task list (checkbox input)
|
|
280
279
|
checkbox = tag.find("input", {"type": "checkbox"})
|
|
281
280
|
if checkbox and isinstance(checkbox, Tag):
|
|
282
281
|
checked = checkbox.get("checked") is not None
|
|
283
282
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
284
|
-
|
|
283
|
+
|
|
285
284
|
checkbox_text = text
|
|
286
285
|
if checkbox.string:
|
|
287
286
|
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
@@ -671,12 +670,11 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
671
670
|
if not text.strip():
|
|
672
671
|
return ""
|
|
673
672
|
|
|
674
|
-
# Escape any existing quotes in the text
|
|
675
673
|
escaped_text = text.strip().replace('"', '\\"')
|
|
676
674
|
return f'"{escaped_text}"'
|
|
677
675
|
|
|
678
676
|
|
|
679
|
-
def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
677
|
+
def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
680
678
|
"""Convert HTML audio element preserving structure with fallback.
|
|
681
679
|
|
|
682
680
|
Args:
|
|
@@ -687,23 +685,20 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
687
685
|
Returns:
|
|
688
686
|
The converted markdown text preserving audio element.
|
|
689
687
|
"""
|
|
690
|
-
_ = convert_as_inline
|
|
688
|
+
_ = convert_as_inline
|
|
691
689
|
src = tag.get("src", "")
|
|
692
690
|
|
|
693
|
-
# Check for source elements if no src attribute
|
|
694
691
|
if not src:
|
|
695
692
|
source_tag = tag.find("source")
|
|
696
693
|
if source_tag and isinstance(source_tag, Tag):
|
|
697
694
|
src = source_tag.get("src", "")
|
|
698
695
|
|
|
699
|
-
# Get other attributes
|
|
700
696
|
controls = "controls" if tag.get("controls") is not None else ""
|
|
701
697
|
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
702
698
|
loop = "loop" if tag.get("loop") is not None else ""
|
|
703
699
|
muted = "muted" if tag.get("muted") is not None else ""
|
|
704
700
|
preload = tag.get("preload", "")
|
|
705
701
|
|
|
706
|
-
# Build attributes string
|
|
707
702
|
attrs = []
|
|
708
703
|
if src and isinstance(src, str) and src.strip():
|
|
709
704
|
attrs.append(f'src="{src}"')
|
|
@@ -720,19 +715,17 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
720
715
|
|
|
721
716
|
attrs_str = " ".join(attrs)
|
|
722
717
|
|
|
723
|
-
# If there's fallback content, preserve it
|
|
724
718
|
if text.strip():
|
|
725
719
|
if attrs_str:
|
|
726
720
|
return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
|
|
727
721
|
return f"<audio>\n{text.strip()}\n</audio>\n\n"
|
|
728
722
|
|
|
729
|
-
# Self-closing for no fallback content
|
|
730
723
|
if attrs_str:
|
|
731
724
|
return f"<audio {attrs_str} />\n\n"
|
|
732
725
|
return "<audio />\n\n"
|
|
733
726
|
|
|
734
727
|
|
|
735
|
-
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
728
|
+
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
736
729
|
"""Convert HTML video element preserving structure with fallback.
|
|
737
730
|
|
|
738
731
|
Args:
|
|
@@ -743,16 +736,14 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
743
736
|
Returns:
|
|
744
737
|
The converted markdown text preserving video element.
|
|
745
738
|
"""
|
|
746
|
-
_ = convert_as_inline
|
|
739
|
+
_ = convert_as_inline
|
|
747
740
|
src = tag.get("src", "")
|
|
748
741
|
|
|
749
|
-
# Check for source elements if no src attribute
|
|
750
742
|
if not src:
|
|
751
743
|
source_tag = tag.find("source")
|
|
752
744
|
if source_tag and isinstance(source_tag, Tag):
|
|
753
745
|
src = source_tag.get("src", "")
|
|
754
746
|
|
|
755
|
-
# Get other attributes
|
|
756
747
|
width = tag.get("width", "")
|
|
757
748
|
height = tag.get("height", "")
|
|
758
749
|
poster = tag.get("poster", "")
|
|
@@ -762,7 +753,6 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
762
753
|
muted = "muted" if tag.get("muted") is not None else ""
|
|
763
754
|
preload = tag.get("preload", "")
|
|
764
755
|
|
|
765
|
-
# Build attributes string
|
|
766
756
|
attrs = []
|
|
767
757
|
if src and isinstance(src, str) and src.strip():
|
|
768
758
|
attrs.append(f'src="{src}"')
|
|
@@ -785,19 +775,17 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str: # n
|
|
|
785
775
|
|
|
786
776
|
attrs_str = " ".join(attrs)
|
|
787
777
|
|
|
788
|
-
# If there's fallback content, preserve it
|
|
789
778
|
if text.strip():
|
|
790
779
|
if attrs_str:
|
|
791
780
|
return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
|
|
792
781
|
return f"<video>\n{text.strip()}\n</video>\n\n"
|
|
793
782
|
|
|
794
|
-
# Self-closing for no fallback content
|
|
795
783
|
if attrs_str:
|
|
796
784
|
return f"<video {attrs_str} />\n\n"
|
|
797
785
|
return "<video />\n\n"
|
|
798
786
|
|
|
799
787
|
|
|
800
|
-
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
788
|
+
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
801
789
|
"""Convert HTML iframe element preserving structure.
|
|
802
790
|
|
|
803
791
|
Args:
|
|
@@ -808,17 +796,16 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
|
|
|
808
796
|
Returns:
|
|
809
797
|
The converted markdown text preserving iframe element.
|
|
810
798
|
"""
|
|
811
|
-
_ = text
|
|
812
|
-
_ = convert_as_inline
|
|
799
|
+
_ = text
|
|
800
|
+
_ = convert_as_inline
|
|
813
801
|
src = tag.get("src", "")
|
|
814
802
|
width = tag.get("width", "")
|
|
815
803
|
height = tag.get("height", "")
|
|
816
804
|
title = tag.get("title", "")
|
|
817
805
|
allow = tag.get("allow", "")
|
|
818
|
-
sandbox = tag.get("sandbox")
|
|
806
|
+
sandbox = tag.get("sandbox")
|
|
819
807
|
loading = tag.get("loading", "")
|
|
820
808
|
|
|
821
|
-
# Build attributes string
|
|
822
809
|
attrs = []
|
|
823
810
|
if src and isinstance(src, str) and src.strip():
|
|
824
811
|
attrs.append(f'src="{src}"')
|
|
@@ -832,11 +819,9 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
|
|
|
832
819
|
attrs.append(f'allow="{allow}"')
|
|
833
820
|
if sandbox is not None:
|
|
834
821
|
if isinstance(sandbox, list):
|
|
835
|
-
# BeautifulSoup returns AttributeValueList for space-separated values
|
|
836
822
|
if sandbox:
|
|
837
823
|
attrs.append(f'sandbox="{" ".join(sandbox)}"')
|
|
838
824
|
else:
|
|
839
|
-
# Empty list means boolean attribute
|
|
840
825
|
attrs.append("sandbox")
|
|
841
826
|
elif isinstance(sandbox, str) and sandbox:
|
|
842
827
|
attrs.append(f'sandbox="{sandbox}"')
|
|
@@ -847,7 +832,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str: #
|
|
|
847
832
|
|
|
848
833
|
attrs_str = " ".join(attrs)
|
|
849
834
|
|
|
850
|
-
# iframes are typically self-closing in usage
|
|
851
835
|
if attrs_str:
|
|
852
836
|
return f"<iframe {attrs_str}></iframe>\n\n"
|
|
853
837
|
return "<iframe></iframe>\n\n"
|
|
@@ -864,13 +848,12 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
864
848
|
Returns:
|
|
865
849
|
The converted markdown text with optional title annotation.
|
|
866
850
|
"""
|
|
867
|
-
_ = convert_as_inline
|
|
851
|
+
_ = convert_as_inline
|
|
868
852
|
if not text.strip():
|
|
869
853
|
return ""
|
|
870
854
|
|
|
871
855
|
title = tag.get("title")
|
|
872
856
|
if title and isinstance(title, str) and title.strip():
|
|
873
|
-
# Show abbreviation with title in parentheses
|
|
874
857
|
return f"{text.strip()} ({title.strip()})"
|
|
875
858
|
|
|
876
859
|
return text.strip()
|
|
@@ -887,13 +870,12 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
887
870
|
Returns:
|
|
888
871
|
The converted markdown text preserving time information.
|
|
889
872
|
"""
|
|
890
|
-
_ = convert_as_inline
|
|
873
|
+
_ = convert_as_inline
|
|
891
874
|
if not text.strip():
|
|
892
875
|
return ""
|
|
893
876
|
|
|
894
877
|
datetime_attr = tag.get("datetime")
|
|
895
878
|
if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
|
|
896
|
-
# Preserve machine-readable datetime in HTML
|
|
897
879
|
return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
|
|
898
880
|
|
|
899
881
|
return text.strip()
|
|
@@ -910,13 +892,12 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
910
892
|
Returns:
|
|
911
893
|
The converted markdown text preserving machine-readable data.
|
|
912
894
|
"""
|
|
913
|
-
_ = convert_as_inline
|
|
895
|
+
_ = convert_as_inline
|
|
914
896
|
if not text.strip():
|
|
915
897
|
return ""
|
|
916
898
|
|
|
917
899
|
value_attr = tag.get("value")
|
|
918
900
|
if value_attr and isinstance(value_attr, str) and value_attr.strip():
|
|
919
|
-
# Preserve machine-readable value in HTML
|
|
920
901
|
return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
|
|
921
902
|
|
|
922
903
|
return text.strip()
|
|
@@ -931,8 +912,8 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
|
|
|
931
912
|
Returns:
|
|
932
913
|
Empty string as wbr is just a break opportunity.
|
|
933
914
|
"""
|
|
934
|
-
_ = convert_as_inline
|
|
935
|
-
return ""
|
|
915
|
+
_ = convert_as_inline
|
|
916
|
+
return ""
|
|
936
917
|
|
|
937
918
|
|
|
938
919
|
def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
@@ -1029,7 +1010,7 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1029
1010
|
return f"<label>{text.strip()}</label>\n\n"
|
|
1030
1011
|
|
|
1031
1012
|
|
|
1032
|
-
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1013
|
+
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1033
1014
|
"""Convert HTML input element preserving all relevant attributes.
|
|
1034
1015
|
|
|
1035
1016
|
Args:
|
|
@@ -1041,9 +1022,9 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str: # noq
|
|
|
1041
1022
|
"""
|
|
1042
1023
|
input_type = tag.get("type", "text")
|
|
1043
1024
|
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
if tag
|
|
1025
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
1026
|
+
|
|
1027
|
+
if _has_ancestor(tag, "li"):
|
|
1047
1028
|
return ""
|
|
1048
1029
|
|
|
1049
1030
|
id_attr = tag.get("id", "")
|
|
@@ -1373,7 +1354,6 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1373
1354
|
|
|
1374
1355
|
attrs = []
|
|
1375
1356
|
if for_attr:
|
|
1376
|
-
# BeautifulSoup returns space-separated attributes as lists
|
|
1377
1357
|
for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
|
|
1378
1358
|
if for_value.strip():
|
|
1379
1359
|
attrs.append(f'for="{for_value}"')
|
|
@@ -1431,7 +1411,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1431
1411
|
if not text.strip():
|
|
1432
1412
|
return ""
|
|
1433
1413
|
|
|
1434
|
-
# Ruby elements are always inline by nature
|
|
1435
1414
|
return text.strip()
|
|
1436
1415
|
|
|
1437
1416
|
|
|
@@ -1448,7 +1427,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1448
1427
|
if not text.strip():
|
|
1449
1428
|
return ""
|
|
1450
1429
|
|
|
1451
|
-
# Ruby base is the main text, pass through as-is
|
|
1452
1430
|
return text.strip()
|
|
1453
1431
|
|
|
1454
1432
|
|
|
@@ -1463,21 +1441,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
|
|
|
1463
1441
|
Returns:
|
|
1464
1442
|
The converted markdown text with pronunciation in parentheses.
|
|
1465
1443
|
"""
|
|
1466
|
-
# Handle empty rt elements - still need parentheses
|
|
1467
1444
|
content = text.strip()
|
|
1468
1445
|
|
|
1469
|
-
# Check if this rt is surrounded by rp elements (fallback parentheses)
|
|
1470
1446
|
prev_sibling = tag.previous_sibling
|
|
1471
1447
|
next_sibling = tag.next_sibling
|
|
1472
1448
|
|
|
1473
|
-
# If surrounded by rp elements, don't add extra parentheses
|
|
1474
1449
|
has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
|
|
1475
1450
|
has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
|
|
1476
1451
|
|
|
1477
1452
|
if has_rp_before and has_rp_after:
|
|
1478
|
-
# Already has rp parentheses, just return the text
|
|
1479
1453
|
return content
|
|
1480
|
-
|
|
1454
|
+
|
|
1481
1455
|
return f"({content})"
|
|
1482
1456
|
|
|
1483
1457
|
|
|
@@ -1494,7 +1468,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1494
1468
|
if not text.strip():
|
|
1495
1469
|
return ""
|
|
1496
1470
|
|
|
1497
|
-
# Ruby parentheses preserved for fallback compatibility
|
|
1498
1471
|
return text.strip()
|
|
1499
1472
|
|
|
1500
1473
|
|
|
@@ -1511,7 +1484,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1511
1484
|
if not text.strip():
|
|
1512
1485
|
return ""
|
|
1513
1486
|
|
|
1514
|
-
# Ruby text container, pass through content
|
|
1515
1487
|
return text.strip()
|
|
1516
1488
|
|
|
1517
1489
|
|
|
@@ -1532,7 +1504,6 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1532
1504
|
if not text.strip():
|
|
1533
1505
|
return ""
|
|
1534
1506
|
|
|
1535
|
-
# Get dialog attributes for preservation
|
|
1536
1507
|
attrs = []
|
|
1537
1508
|
if tag.get("open") is not None:
|
|
1538
1509
|
attrs.append("open")
|
|
@@ -1561,7 +1532,6 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1561
1532
|
if not text.strip():
|
|
1562
1533
|
return ""
|
|
1563
1534
|
|
|
1564
|
-
# Get menu attributes for preservation
|
|
1565
1535
|
attrs = []
|
|
1566
1536
|
if tag.get("type") and tag.get("type") != "list":
|
|
1567
1537
|
attrs.append(f'type="{tag.get("type")}"')
|
|
@@ -1592,12 +1562,10 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1592
1562
|
if convert_as_inline:
|
|
1593
1563
|
return text
|
|
1594
1564
|
|
|
1595
|
-
# Get figure attributes for preservation
|
|
1596
1565
|
attrs = []
|
|
1597
1566
|
if tag.get("id"):
|
|
1598
1567
|
attrs.append(f'id="{tag.get("id")}"')
|
|
1599
1568
|
if tag.get("class"):
|
|
1600
|
-
# Handle class attribute which might be a list
|
|
1601
1569
|
class_val = tag.get("class")
|
|
1602
1570
|
if isinstance(class_val, list):
|
|
1603
1571
|
class_val = " ".join(class_val)
|
|
@@ -1605,11 +1573,8 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1605
1573
|
|
|
1606
1574
|
attrs_str = " " + " ".join(attrs) if attrs else ""
|
|
1607
1575
|
|
|
1608
|
-
# Check if the figure contains only an image (common case)
|
|
1609
|
-
# In that case, we might want to preserve the figure wrapper
|
|
1610
1576
|
content = text.strip()
|
|
1611
1577
|
|
|
1612
|
-
# If content already has proper spacing, don't add extra newlines
|
|
1613
1578
|
if content.endswith("\n\n"):
|
|
1614
1579
|
return f"<figure{attrs_str}>\n{content}</figure>\n\n"
|
|
1615
1580
|
|
|
@@ -1632,12 +1597,8 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1632
1597
|
if not text.strip():
|
|
1633
1598
|
return ""
|
|
1634
1599
|
|
|
1635
|
-
# Preserve the semantic grouping of headings
|
|
1636
|
-
# Add a marker to indicate this is a grouped heading
|
|
1637
1600
|
content = text.strip()
|
|
1638
1601
|
|
|
1639
|
-
# Remove excessive newlines between headings in the group
|
|
1640
|
-
# Headings in hgroup should be visually closer together
|
|
1641
1602
|
content = re.sub(r"\n{3,}", "\n\n", content)
|
|
1642
1603
|
|
|
1643
1604
|
return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
|
|
@@ -1657,22 +1618,17 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1657
1618
|
if not text.strip():
|
|
1658
1619
|
return ""
|
|
1659
1620
|
|
|
1660
|
-
# Find all source elements
|
|
1661
1621
|
sources = tag.find_all("source")
|
|
1662
1622
|
img = tag.find("img")
|
|
1663
1623
|
|
|
1664
1624
|
if not img:
|
|
1665
|
-
# No img fallback, just return the text content
|
|
1666
1625
|
return text.strip()
|
|
1667
1626
|
|
|
1668
|
-
# Get the primary image markdown (already converted)
|
|
1669
1627
|
img_markdown = text.strip()
|
|
1670
1628
|
|
|
1671
|
-
# If there are no sources, just return the image
|
|
1672
1629
|
if not sources:
|
|
1673
1630
|
return img_markdown
|
|
1674
1631
|
|
|
1675
|
-
# Build a comment with source information for responsive images
|
|
1676
1632
|
source_info = []
|
|
1677
1633
|
for source in sources:
|
|
1678
1634
|
srcset = source.get("srcset")
|
|
@@ -1688,14 +1644,12 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1688
1644
|
source_info.append(info)
|
|
1689
1645
|
|
|
1690
1646
|
if source_info and not convert_as_inline:
|
|
1691
|
-
# Add picture source information as a comment
|
|
1692
1647
|
sources_comment = "<!-- picture sources:\n"
|
|
1693
1648
|
for info in source_info:
|
|
1694
1649
|
sources_comment += f" {info}\n"
|
|
1695
1650
|
sources_comment += "-->\n"
|
|
1696
1651
|
return f"{sources_comment}{img_markdown}"
|
|
1697
1652
|
|
|
1698
|
-
# In inline mode or no sources, just return the image
|
|
1699
1653
|
return img_markdown
|
|
1700
1654
|
|
|
1701
1655
|
|
|
@@ -1711,23 +1665,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1711
1665
|
The converted markdown text as an image reference.
|
|
1712
1666
|
"""
|
|
1713
1667
|
if convert_as_inline:
|
|
1714
|
-
# In inline mode, just return any text content
|
|
1715
1668
|
return text.strip()
|
|
1716
1669
|
|
|
1717
|
-
# Get SVG attributes
|
|
1718
1670
|
title = tag.find("title")
|
|
1719
1671
|
title_text = title.get_text().strip() if title else ""
|
|
1720
1672
|
|
|
1721
|
-
# For inline SVG, we'll convert to a data URI
|
|
1722
|
-
# First, we need to get the full SVG markup
|
|
1723
1673
|
svg_markup = str(tag)
|
|
1724
1674
|
|
|
1725
|
-
# Create a data URI
|
|
1726
1675
|
svg_bytes = svg_markup.encode("utf-8")
|
|
1727
1676
|
svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
|
|
1728
1677
|
data_uri = f"data:image/svg+xml;base64,{svg_base64}"
|
|
1729
1678
|
|
|
1730
|
-
# Use title as alt text, or "SVG Image" if no title
|
|
1731
1679
|
alt_text = title_text or "SVG Image"
|
|
1732
1680
|
|
|
1733
1681
|
return f"![{alt_text}]({data_uri})"
|
|
@@ -1747,17 +1695,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1747
1695
|
if not text.strip():
|
|
1748
1696
|
return ""
|
|
1749
1697
|
|
|
1750
|
-
# Check if it's display math vs inline math
|
|
1751
1698
|
display = tag.get("display") == "block"
|
|
1752
1699
|
|
|
1753
|
-
# For now, preserve the MathML as a comment with the text representation
|
|
1754
|
-
# This allows systems that understand MathML to process it
|
|
1755
1700
|
math_comment = f"<!-- MathML: {tag!s} -->"
|
|
1756
1701
|
|
|
1757
1702
|
if convert_as_inline or not display:
|
|
1758
|
-
# Inline math - just the text with comment
|
|
1759
1703
|
return f"{math_comment}{text.strip()}"
|
|
1760
|
-
|
|
1704
|
+
|
|
1761
1705
|
return f"\n\n{math_comment}\n{text.strip()}\n\n"
|
|
1762
1706
|
|
|
1763
1707
|
|
|
@@ -1823,8 +1767,8 @@ def create_converters_map(
|
|
|
1823
1767
|
"aside": _wrapper(_convert_semantic_block),
|
|
1824
1768
|
"audio": _wrapper(_convert_audio),
|
|
1825
1769
|
"b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
|
|
1826
|
-
"bdi": _wrapper(_create_inline_converter("")),
|
|
1827
|
-
"bdo": _wrapper(_create_inline_converter("")),
|
|
1770
|
+
"bdi": _wrapper(_create_inline_converter("")),
|
|
1771
|
+
"bdo": _wrapper(_create_inline_converter("")),
|
|
1828
1772
|
"blockquote": _wrapper(partial(_convert_blockquote)),
|
|
1829
1773
|
"br": _wrapper(partial(_convert_br, newline_style=newline_style)),
|
|
1830
1774
|
"button": _wrapper(_convert_button),
|
|
@@ -1838,7 +1782,7 @@ def create_converters_map(
|
|
|
1838
1782
|
"dd": _wrapper(_convert_dd),
|
|
1839
1783
|
"del": _wrapper(_create_inline_converter("~~")),
|
|
1840
1784
|
"details": _wrapper(_convert_details),
|
|
1841
|
-
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1785
|
+
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1842
1786
|
"dialog": _wrapper(_convert_dialog),
|
|
1843
1787
|
"dl": _wrapper(_convert_dl),
|
|
1844
1788
|
"dt": _wrapper(_convert_dt),
|
|
@@ -1861,7 +1805,7 @@ def create_converters_map(
|
|
|
1861
1805
|
"iframe": _wrapper(_convert_iframe),
|
|
1862
1806
|
"img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
|
|
1863
1807
|
"input": _wrapper(_convert_input_enhanced),
|
|
1864
|
-
"ins": _wrapper(_create_inline_converter("==")),
|
|
1808
|
+
"ins": _wrapper(_create_inline_converter("==")),
|
|
1865
1809
|
"kbd": _wrapper(_create_inline_converter("`")),
|
|
1866
1810
|
"label": _wrapper(_convert_label),
|
|
1867
1811
|
"legend": _wrapper(_convert_legend),
|
|
@@ -1898,7 +1842,7 @@ def create_converters_map(
|
|
|
1898
1842
|
"script": _wrapper(lambda _: ""),
|
|
1899
1843
|
"section": _wrapper(_convert_semantic_block),
|
|
1900
1844
|
"select": _wrapper(_convert_select),
|
|
1901
|
-
"small": _wrapper(_create_inline_converter("")),
|
|
1845
|
+
"small": _wrapper(_create_inline_converter("")),
|
|
1902
1846
|
"strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
|
|
1903
1847
|
"style": _wrapper(lambda _: ""),
|
|
1904
1848
|
"sub": _wrapper(_create_inline_converter(sub_symbol)),
|
|
@@ -1914,9 +1858,9 @@ def create_converters_map(
|
|
|
1914
1858
|
"thead": _wrapper(_convert_thead),
|
|
1915
1859
|
"time": _wrapper(_convert_time),
|
|
1916
1860
|
"tr": _wrapper(_convert_tr),
|
|
1917
|
-
"u": _wrapper(_create_inline_converter("")),
|
|
1861
|
+
"u": _wrapper(_create_inline_converter("")),
|
|
1918
1862
|
"ul": _wrapper(_convert_list),
|
|
1919
|
-
"var": _wrapper(_create_inline_converter("*")),
|
|
1863
|
+
"var": _wrapper(_create_inline_converter("*")),
|
|
1920
1864
|
"video": _wrapper(_convert_video),
|
|
1921
1865
|
"wbr": _wrapper(_convert_wbr),
|
|
1922
1866
|
}
|
html_to_markdown/exceptions.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Custom exceptions for the html-to-markdown library."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class HtmlToMarkdownError(Exception):
|
|
7
|
+
"""Base exception for all html-to-markdown errors."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MissingDependencyError(HtmlToMarkdownError):
|
|
11
|
+
"""Raised when an optional dependency is required but not installed."""
|
|
12
|
+
|
|
13
|
+
def __init__(self, dependency: str, install_command: str | None = None) -> None:
|
|
14
|
+
self.dependency = dependency
|
|
15
|
+
self.install_command = install_command
|
|
16
|
+
|
|
17
|
+
message = f"{dependency} is not installed."
|
|
18
|
+
if install_command:
|
|
19
|
+
message += f" Install with: {install_command}"
|
|
20
|
+
|
|
21
|
+
super().__init__(message)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class InvalidParserError(HtmlToMarkdownError):
|
|
25
|
+
"""Raised when an invalid parser is specified."""
|
|
26
|
+
|
|
27
|
+
def __init__(self, parser: str, available_parsers: list[str]) -> None:
|
|
28
|
+
self.parser = parser
|
|
29
|
+
self.available_parsers = available_parsers
|
|
30
|
+
|
|
31
|
+
message = f"Invalid parser '{parser}'. Available parsers: {', '.join(available_parsers)}"
|
|
32
|
+
super().__init__(message)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class EmptyHtmlError(HtmlToMarkdownError):
|
|
36
|
+
"""Raised when the input HTML is empty."""
|
|
37
|
+
|
|
38
|
+
def __init__(self) -> None:
|
|
39
|
+
super().__init__("The input HTML is empty.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ConflictingOptionsError(HtmlToMarkdownError):
|
|
43
|
+
"""Raised when conflicting options are specified."""
|
|
44
|
+
|
|
45
|
+
def __init__(self, option1: str, option2: str) -> None:
|
|
46
|
+
self.option1 = option1
|
|
47
|
+
self.option2 = option2
|
|
48
|
+
|
|
49
|
+
super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
|