html-to-markdown 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +23 -86
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +111 -67
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA +2 -1
- html_to_markdown-1.8.0.dist-info/RECORD +16 -0
- html_to_markdown-1.6.0.dist-info/RECORD +0 -15
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -5,9 +5,9 @@ from html_to_markdown.exceptions import (
|
|
|
5
5
|
InvalidParserError,
|
|
6
6
|
MissingDependencyError,
|
|
7
7
|
)
|
|
8
|
+
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
|
|
8
9
|
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
9
10
|
|
|
10
|
-
# For backward compatibility and to maintain the existing API
|
|
11
11
|
markdownify = convert_to_markdown
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
@@ -18,5 +18,7 @@ __all__ = [
|
|
|
18
18
|
"MissingDependencyError",
|
|
19
19
|
"convert_to_markdown",
|
|
20
20
|
"convert_to_markdown_stream",
|
|
21
|
+
"create_preprocessor",
|
|
21
22
|
"markdownify",
|
|
23
|
+
"preprocess_html",
|
|
22
24
|
]
|
html_to_markdown/cli.py
CHANGED
|
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
|
|
|
191
191
|
|
|
192
192
|
args = parser.parse_args(argv)
|
|
193
193
|
|
|
194
|
-
# Prepare base arguments
|
|
195
194
|
base_args = {
|
|
196
195
|
"strip": args.strip,
|
|
197
196
|
"convert": args.convert,
|
|
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
|
|
|
216
215
|
"highlight_style": args.highlight_style,
|
|
217
216
|
}
|
|
218
217
|
|
|
219
|
-
# Add streaming parameters only if streaming is enabled
|
|
220
218
|
if args.stream_processing:
|
|
221
219
|
base_args["stream_processing"] = True
|
|
222
220
|
base_args["chunk_size"] = args.chunk_size
|
|
223
221
|
|
|
224
|
-
# Progress callback for CLI
|
|
225
222
|
if args.show_progress:
|
|
226
223
|
|
|
227
224
|
def progress_callback(processed: int, total: int) -> None:
|
|
228
225
|
if total > 0:
|
|
229
226
|
percent = (processed / total) * 100
|
|
230
|
-
|
|
227
|
+
|
|
231
228
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
229
|
sys.stderr.flush()
|
|
233
230
|
|
html_to_markdown/converters.py
CHANGED
|
@@ -137,7 +137,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
137
137
|
"""
|
|
138
138
|
|
|
139
139
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
140
|
-
# Check if we're in a code context - if so, don't apply markup
|
|
141
140
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
142
141
|
|
|
143
142
|
if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
|
|
@@ -151,7 +150,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
151
150
|
markup_suffix = "</" + markup_prefix[1:]
|
|
152
151
|
|
|
153
152
|
prefix, suffix, text = chomp(text)
|
|
154
|
-
|
|
155
153
|
return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
|
|
156
154
|
|
|
157
155
|
return cast("Callable[[Tag, str], str]", implementation)
|
|
@@ -191,7 +189,6 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
191
189
|
if not text:
|
|
192
190
|
return ""
|
|
193
191
|
|
|
194
|
-
# Handle cite attribute
|
|
195
192
|
cite_url = tag.get("cite")
|
|
196
193
|
quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
|
|
197
194
|
|
|
@@ -202,14 +199,12 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
202
199
|
|
|
203
200
|
|
|
204
201
|
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
205
|
-
# Convert br to line break, but handle headings specially
|
|
206
202
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
207
203
|
|
|
208
204
|
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
209
|
-
return " "
|
|
205
|
+
return " "
|
|
210
206
|
|
|
211
|
-
|
|
212
|
-
_ = convert_as_inline # Unused but kept for API consistency
|
|
207
|
+
_ = convert_as_inline
|
|
213
208
|
return "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
|
|
214
209
|
|
|
215
210
|
|
|
@@ -247,7 +242,7 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
247
242
|
height = height if isinstance(height, str) else ""
|
|
248
243
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
249
244
|
parent_name = tag.parent.name if tag.parent else ""
|
|
250
|
-
|
|
245
|
+
|
|
251
246
|
default_preserve_in = ["td", "th"]
|
|
252
247
|
preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
|
|
253
248
|
if convert_as_inline and parent_name not in preserve_in:
|
|
@@ -281,12 +276,11 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
281
276
|
|
|
282
277
|
|
|
283
278
|
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
284
|
-
# Check for task list (checkbox input)
|
|
285
279
|
checkbox = tag.find("input", {"type": "checkbox"})
|
|
286
280
|
if checkbox and isinstance(checkbox, Tag):
|
|
287
281
|
checked = checkbox.get("checked") is not None
|
|
288
282
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
289
|
-
|
|
283
|
+
|
|
290
284
|
checkbox_text = text
|
|
291
285
|
if checkbox.string:
|
|
292
286
|
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
@@ -676,7 +670,6 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
676
670
|
if not text.strip():
|
|
677
671
|
return ""
|
|
678
672
|
|
|
679
|
-
# Escape any existing quotes in the text
|
|
680
673
|
escaped_text = text.strip().replace('"', '\\"')
|
|
681
674
|
return f'"{escaped_text}"'
|
|
682
675
|
|
|
@@ -692,23 +685,20 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
692
685
|
Returns:
|
|
693
686
|
The converted markdown text preserving audio element.
|
|
694
687
|
"""
|
|
695
|
-
_ = convert_as_inline
|
|
688
|
+
_ = convert_as_inline
|
|
696
689
|
src = tag.get("src", "")
|
|
697
690
|
|
|
698
|
-
# Check for source elements if no src attribute
|
|
699
691
|
if not src:
|
|
700
692
|
source_tag = tag.find("source")
|
|
701
693
|
if source_tag and isinstance(source_tag, Tag):
|
|
702
694
|
src = source_tag.get("src", "")
|
|
703
695
|
|
|
704
|
-
# Get other attributes
|
|
705
696
|
controls = "controls" if tag.get("controls") is not None else ""
|
|
706
697
|
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
707
698
|
loop = "loop" if tag.get("loop") is not None else ""
|
|
708
699
|
muted = "muted" if tag.get("muted") is not None else ""
|
|
709
700
|
preload = tag.get("preload", "")
|
|
710
701
|
|
|
711
|
-
# Build attributes string
|
|
712
702
|
attrs = []
|
|
713
703
|
if src and isinstance(src, str) and src.strip():
|
|
714
704
|
attrs.append(f'src="{src}"')
|
|
@@ -725,13 +715,11 @@ def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
725
715
|
|
|
726
716
|
attrs_str = " ".join(attrs)
|
|
727
717
|
|
|
728
|
-
# If there's fallback content, preserve it
|
|
729
718
|
if text.strip():
|
|
730
719
|
if attrs_str:
|
|
731
720
|
return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
|
|
732
721
|
return f"<audio>\n{text.strip()}\n</audio>\n\n"
|
|
733
722
|
|
|
734
|
-
# Self-closing for no fallback content
|
|
735
723
|
if attrs_str:
|
|
736
724
|
return f"<audio {attrs_str} />\n\n"
|
|
737
725
|
return "<audio />\n\n"
|
|
@@ -748,16 +736,14 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
748
736
|
Returns:
|
|
749
737
|
The converted markdown text preserving video element.
|
|
750
738
|
"""
|
|
751
|
-
_ = convert_as_inline
|
|
739
|
+
_ = convert_as_inline
|
|
752
740
|
src = tag.get("src", "")
|
|
753
741
|
|
|
754
|
-
# Check for source elements if no src attribute
|
|
755
742
|
if not src:
|
|
756
743
|
source_tag = tag.find("source")
|
|
757
744
|
if source_tag and isinstance(source_tag, Tag):
|
|
758
745
|
src = source_tag.get("src", "")
|
|
759
746
|
|
|
760
|
-
# Get other attributes
|
|
761
747
|
width = tag.get("width", "")
|
|
762
748
|
height = tag.get("height", "")
|
|
763
749
|
poster = tag.get("poster", "")
|
|
@@ -767,7 +753,6 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
767
753
|
muted = "muted" if tag.get("muted") is not None else ""
|
|
768
754
|
preload = tag.get("preload", "")
|
|
769
755
|
|
|
770
|
-
# Build attributes string
|
|
771
756
|
attrs = []
|
|
772
757
|
if src and isinstance(src, str) and src.strip():
|
|
773
758
|
attrs.append(f'src="{src}"')
|
|
@@ -790,13 +775,11 @@ def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
790
775
|
|
|
791
776
|
attrs_str = " ".join(attrs)
|
|
792
777
|
|
|
793
|
-
# If there's fallback content, preserve it
|
|
794
778
|
if text.strip():
|
|
795
779
|
if attrs_str:
|
|
796
780
|
return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
|
|
797
781
|
return f"<video>\n{text.strip()}\n</video>\n\n"
|
|
798
782
|
|
|
799
|
-
# Self-closing for no fallback content
|
|
800
783
|
if attrs_str:
|
|
801
784
|
return f"<video {attrs_str} />\n\n"
|
|
802
785
|
return "<video />\n\n"
|
|
@@ -813,17 +796,16 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
813
796
|
Returns:
|
|
814
797
|
The converted markdown text preserving iframe element.
|
|
815
798
|
"""
|
|
816
|
-
_ = text
|
|
817
|
-
_ = convert_as_inline
|
|
799
|
+
_ = text
|
|
800
|
+
_ = convert_as_inline
|
|
818
801
|
src = tag.get("src", "")
|
|
819
802
|
width = tag.get("width", "")
|
|
820
803
|
height = tag.get("height", "")
|
|
821
804
|
title = tag.get("title", "")
|
|
822
805
|
allow = tag.get("allow", "")
|
|
823
|
-
sandbox = tag.get("sandbox")
|
|
806
|
+
sandbox = tag.get("sandbox")
|
|
824
807
|
loading = tag.get("loading", "")
|
|
825
808
|
|
|
826
|
-
# Build attributes string
|
|
827
809
|
attrs = []
|
|
828
810
|
if src and isinstance(src, str) and src.strip():
|
|
829
811
|
attrs.append(f'src="{src}"')
|
|
@@ -837,11 +819,9 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
837
819
|
attrs.append(f'allow="{allow}"')
|
|
838
820
|
if sandbox is not None:
|
|
839
821
|
if isinstance(sandbox, list):
|
|
840
|
-
# BeautifulSoup returns AttributeValueList for space-separated values
|
|
841
822
|
if sandbox:
|
|
842
823
|
attrs.append(f'sandbox="{" ".join(sandbox)}"')
|
|
843
824
|
else:
|
|
844
|
-
# Empty list means boolean attribute
|
|
845
825
|
attrs.append("sandbox")
|
|
846
826
|
elif isinstance(sandbox, str) and sandbox:
|
|
847
827
|
attrs.append(f'sandbox="{sandbox}"')
|
|
@@ -852,7 +832,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
852
832
|
|
|
853
833
|
attrs_str = " ".join(attrs)
|
|
854
834
|
|
|
855
|
-
# iframes are typically self-closing in usage
|
|
856
835
|
if attrs_str:
|
|
857
836
|
return f"<iframe {attrs_str}></iframe>\n\n"
|
|
858
837
|
return "<iframe></iframe>\n\n"
|
|
@@ -869,13 +848,12 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
869
848
|
Returns:
|
|
870
849
|
The converted markdown text with optional title annotation.
|
|
871
850
|
"""
|
|
872
|
-
_ = convert_as_inline
|
|
851
|
+
_ = convert_as_inline
|
|
873
852
|
if not text.strip():
|
|
874
853
|
return ""
|
|
875
854
|
|
|
876
855
|
title = tag.get("title")
|
|
877
856
|
if title and isinstance(title, str) and title.strip():
|
|
878
|
-
# Show abbreviation with title in parentheses
|
|
879
857
|
return f"{text.strip()} ({title.strip()})"
|
|
880
858
|
|
|
881
859
|
return text.strip()
|
|
@@ -892,13 +870,12 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
892
870
|
Returns:
|
|
893
871
|
The converted markdown text preserving time information.
|
|
894
872
|
"""
|
|
895
|
-
_ = convert_as_inline
|
|
873
|
+
_ = convert_as_inline
|
|
896
874
|
if not text.strip():
|
|
897
875
|
return ""
|
|
898
876
|
|
|
899
877
|
datetime_attr = tag.get("datetime")
|
|
900
878
|
if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
|
|
901
|
-
# Preserve machine-readable datetime in HTML
|
|
902
879
|
return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
|
|
903
880
|
|
|
904
881
|
return text.strip()
|
|
@@ -915,13 +892,12 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
915
892
|
Returns:
|
|
916
893
|
The converted markdown text preserving machine-readable data.
|
|
917
894
|
"""
|
|
918
|
-
_ = convert_as_inline
|
|
895
|
+
_ = convert_as_inline
|
|
919
896
|
if not text.strip():
|
|
920
897
|
return ""
|
|
921
898
|
|
|
922
899
|
value_attr = tag.get("value")
|
|
923
900
|
if value_attr and isinstance(value_attr, str) and value_attr.strip():
|
|
924
|
-
# Preserve machine-readable value in HTML
|
|
925
901
|
return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
|
|
926
902
|
|
|
927
903
|
return text.strip()
|
|
@@ -936,8 +912,8 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
|
|
|
936
912
|
Returns:
|
|
937
913
|
Empty string as wbr is just a break opportunity.
|
|
938
914
|
"""
|
|
939
|
-
_ = convert_as_inline
|
|
940
|
-
return ""
|
|
915
|
+
_ = convert_as_inline
|
|
916
|
+
return ""
|
|
941
917
|
|
|
942
918
|
|
|
943
919
|
def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
@@ -1046,8 +1022,6 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
1046
1022
|
"""
|
|
1047
1023
|
input_type = tag.get("type", "text")
|
|
1048
1024
|
|
|
1049
|
-
# Special handling for inputs in list items - let _convert_li handle checkboxes
|
|
1050
|
-
# and ignore other input types in list items (legacy behavior)
|
|
1051
1025
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
1052
1026
|
|
|
1053
1027
|
if _has_ancestor(tag, "li"):
|
|
@@ -1380,7 +1354,6 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1380
1354
|
|
|
1381
1355
|
attrs = []
|
|
1382
1356
|
if for_attr:
|
|
1383
|
-
# BeautifulSoup returns space-separated attributes as lists
|
|
1384
1357
|
for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
|
|
1385
1358
|
if for_value.strip():
|
|
1386
1359
|
attrs.append(f'for="{for_value}"')
|
|
@@ -1438,7 +1411,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1438
1411
|
if not text.strip():
|
|
1439
1412
|
return ""
|
|
1440
1413
|
|
|
1441
|
-
# Ruby elements are always inline by nature
|
|
1442
1414
|
return text.strip()
|
|
1443
1415
|
|
|
1444
1416
|
|
|
@@ -1455,7 +1427,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1455
1427
|
if not text.strip():
|
|
1456
1428
|
return ""
|
|
1457
1429
|
|
|
1458
|
-
# Ruby base is the main text, pass through as-is
|
|
1459
1430
|
return text.strip()
|
|
1460
1431
|
|
|
1461
1432
|
|
|
@@ -1470,21 +1441,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
|
|
|
1470
1441
|
Returns:
|
|
1471
1442
|
The converted markdown text with pronunciation in parentheses.
|
|
1472
1443
|
"""
|
|
1473
|
-
# Handle empty rt elements - still need parentheses
|
|
1474
1444
|
content = text.strip()
|
|
1475
1445
|
|
|
1476
|
-
# Check if this rt is surrounded by rp elements (fallback parentheses)
|
|
1477
1446
|
prev_sibling = tag.previous_sibling
|
|
1478
1447
|
next_sibling = tag.next_sibling
|
|
1479
1448
|
|
|
1480
|
-
# If surrounded by rp elements, don't add extra parentheses
|
|
1481
1449
|
has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
|
|
1482
1450
|
has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
|
|
1483
1451
|
|
|
1484
1452
|
if has_rp_before and has_rp_after:
|
|
1485
|
-
# Already has rp parentheses, just return the text
|
|
1486
1453
|
return content
|
|
1487
|
-
|
|
1454
|
+
|
|
1488
1455
|
return f"({content})"
|
|
1489
1456
|
|
|
1490
1457
|
|
|
@@ -1501,7 +1468,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1501
1468
|
if not text.strip():
|
|
1502
1469
|
return ""
|
|
1503
1470
|
|
|
1504
|
-
# Ruby parentheses preserved for fallback compatibility
|
|
1505
1471
|
return text.strip()
|
|
1506
1472
|
|
|
1507
1473
|
|
|
@@ -1518,7 +1484,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1518
1484
|
if not text.strip():
|
|
1519
1485
|
return ""
|
|
1520
1486
|
|
|
1521
|
-
# Ruby text container, pass through content
|
|
1522
1487
|
return text.strip()
|
|
1523
1488
|
|
|
1524
1489
|
|
|
@@ -1539,7 +1504,6 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1539
1504
|
if not text.strip():
|
|
1540
1505
|
return ""
|
|
1541
1506
|
|
|
1542
|
-
# Get dialog attributes for preservation
|
|
1543
1507
|
attrs = []
|
|
1544
1508
|
if tag.get("open") is not None:
|
|
1545
1509
|
attrs.append("open")
|
|
@@ -1568,7 +1532,6 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1568
1532
|
if not text.strip():
|
|
1569
1533
|
return ""
|
|
1570
1534
|
|
|
1571
|
-
# Get menu attributes for preservation
|
|
1572
1535
|
attrs = []
|
|
1573
1536
|
if tag.get("type") and tag.get("type") != "list":
|
|
1574
1537
|
attrs.append(f'type="{tag.get("type")}"')
|
|
@@ -1599,12 +1562,10 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1599
1562
|
if convert_as_inline:
|
|
1600
1563
|
return text
|
|
1601
1564
|
|
|
1602
|
-
# Get figure attributes for preservation
|
|
1603
1565
|
attrs = []
|
|
1604
1566
|
if tag.get("id"):
|
|
1605
1567
|
attrs.append(f'id="{tag.get("id")}"')
|
|
1606
1568
|
if tag.get("class"):
|
|
1607
|
-
# Handle class attribute which might be a list
|
|
1608
1569
|
class_val = tag.get("class")
|
|
1609
1570
|
if isinstance(class_val, list):
|
|
1610
1571
|
class_val = " ".join(class_val)
|
|
@@ -1612,11 +1573,8 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1612
1573
|
|
|
1613
1574
|
attrs_str = " " + " ".join(attrs) if attrs else ""
|
|
1614
1575
|
|
|
1615
|
-
# Check if the figure contains only an image (common case)
|
|
1616
|
-
# In that case, we might want to preserve the figure wrapper
|
|
1617
1576
|
content = text.strip()
|
|
1618
1577
|
|
|
1619
|
-
# If content already has proper spacing, don't add extra newlines
|
|
1620
1578
|
if content.endswith("\n\n"):
|
|
1621
1579
|
return f"<figure{attrs_str}>\n{content}</figure>\n\n"
|
|
1622
1580
|
|
|
@@ -1639,12 +1597,8 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1639
1597
|
if not text.strip():
|
|
1640
1598
|
return ""
|
|
1641
1599
|
|
|
1642
|
-
# Preserve the semantic grouping of headings
|
|
1643
|
-
# Add a marker to indicate this is a grouped heading
|
|
1644
1600
|
content = text.strip()
|
|
1645
1601
|
|
|
1646
|
-
# Remove excessive newlines between headings in the group
|
|
1647
|
-
# Headings in hgroup should be visually closer together
|
|
1648
1602
|
content = re.sub(r"\n{3,}", "\n\n", content)
|
|
1649
1603
|
|
|
1650
1604
|
return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
|
|
@@ -1664,22 +1618,17 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1664
1618
|
if not text.strip():
|
|
1665
1619
|
return ""
|
|
1666
1620
|
|
|
1667
|
-
# Find all source elements
|
|
1668
1621
|
sources = tag.find_all("source")
|
|
1669
1622
|
img = tag.find("img")
|
|
1670
1623
|
|
|
1671
1624
|
if not img:
|
|
1672
|
-
# No img fallback, just return the text content
|
|
1673
1625
|
return text.strip()
|
|
1674
1626
|
|
|
1675
|
-
# Get the primary image markdown (already converted)
|
|
1676
1627
|
img_markdown = text.strip()
|
|
1677
1628
|
|
|
1678
|
-
# If there are no sources, just return the image
|
|
1679
1629
|
if not sources:
|
|
1680
1630
|
return img_markdown
|
|
1681
1631
|
|
|
1682
|
-
# Build a comment with source information for responsive images
|
|
1683
1632
|
source_info = []
|
|
1684
1633
|
for source in sources:
|
|
1685
1634
|
srcset = source.get("srcset")
|
|
@@ -1695,14 +1644,12 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1695
1644
|
source_info.append(info)
|
|
1696
1645
|
|
|
1697
1646
|
if source_info and not convert_as_inline:
|
|
1698
|
-
# Add picture source information as a comment
|
|
1699
1647
|
sources_comment = "<!-- picture sources:\n"
|
|
1700
1648
|
for info in source_info:
|
|
1701
1649
|
sources_comment += f" {info}\n"
|
|
1702
1650
|
sources_comment += "-->\n"
|
|
1703
1651
|
return f"{sources_comment}{img_markdown}"
|
|
1704
1652
|
|
|
1705
|
-
# In inline mode or no sources, just return the image
|
|
1706
1653
|
return img_markdown
|
|
1707
1654
|
|
|
1708
1655
|
|
|
@@ -1718,23 +1665,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1718
1665
|
The converted markdown text as an image reference.
|
|
1719
1666
|
"""
|
|
1720
1667
|
if convert_as_inline:
|
|
1721
|
-
# In inline mode, just return any text content
|
|
1722
1668
|
return text.strip()
|
|
1723
1669
|
|
|
1724
|
-
# Get SVG attributes
|
|
1725
1670
|
title = tag.find("title")
|
|
1726
1671
|
title_text = title.get_text().strip() if title else ""
|
|
1727
1672
|
|
|
1728
|
-
# For inline SVG, we'll convert to a data URI
|
|
1729
|
-
# First, we need to get the full SVG markup
|
|
1730
1673
|
svg_markup = str(tag)
|
|
1731
1674
|
|
|
1732
|
-
# Create a data URI
|
|
1733
1675
|
svg_bytes = svg_markup.encode("utf-8")
|
|
1734
1676
|
svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
|
|
1735
1677
|
data_uri = f"data:image/svg+xml;base64,{svg_base64}"
|
|
1736
1678
|
|
|
1737
|
-
# Use title as alt text, or "SVG Image" if no title
|
|
1738
1679
|
alt_text = title_text or "SVG Image"
|
|
1739
1680
|
|
|
1740
1681
|
return f"![{alt_text}]({data_uri})"
|
|
@@ -1754,17 +1695,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1754
1695
|
if not text.strip():
|
|
1755
1696
|
return ""
|
|
1756
1697
|
|
|
1757
|
-
# Check if it's display math vs inline math
|
|
1758
1698
|
display = tag.get("display") == "block"
|
|
1759
1699
|
|
|
1760
|
-
# For now, preserve the MathML as a comment with the text representation
|
|
1761
|
-
# This allows systems that understand MathML to process it
|
|
1762
1700
|
math_comment = f"<!-- MathML: {tag!s} -->"
|
|
1763
1701
|
|
|
1764
1702
|
if convert_as_inline or not display:
|
|
1765
|
-
# Inline math - just the text with comment
|
|
1766
1703
|
return f"{math_comment}{text.strip()}"
|
|
1767
|
-
|
|
1704
|
+
|
|
1768
1705
|
return f"\n\n{math_comment}\n{text.strip()}\n\n"
|
|
1769
1706
|
|
|
1770
1707
|
|
|
@@ -1830,8 +1767,8 @@ def create_converters_map(
|
|
|
1830
1767
|
"aside": _wrapper(_convert_semantic_block),
|
|
1831
1768
|
"audio": _wrapper(_convert_audio),
|
|
1832
1769
|
"b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
|
|
1833
|
-
"bdi": _wrapper(_create_inline_converter("")),
|
|
1834
|
-
"bdo": _wrapper(_create_inline_converter("")),
|
|
1770
|
+
"bdi": _wrapper(_create_inline_converter("")),
|
|
1771
|
+
"bdo": _wrapper(_create_inline_converter("")),
|
|
1835
1772
|
"blockquote": _wrapper(partial(_convert_blockquote)),
|
|
1836
1773
|
"br": _wrapper(partial(_convert_br, newline_style=newline_style)),
|
|
1837
1774
|
"button": _wrapper(_convert_button),
|
|
@@ -1845,7 +1782,7 @@ def create_converters_map(
|
|
|
1845
1782
|
"dd": _wrapper(_convert_dd),
|
|
1846
1783
|
"del": _wrapper(_create_inline_converter("~~")),
|
|
1847
1784
|
"details": _wrapper(_convert_details),
|
|
1848
|
-
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1785
|
+
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1849
1786
|
"dialog": _wrapper(_convert_dialog),
|
|
1850
1787
|
"dl": _wrapper(_convert_dl),
|
|
1851
1788
|
"dt": _wrapper(_convert_dt),
|
|
@@ -1868,7 +1805,7 @@ def create_converters_map(
|
|
|
1868
1805
|
"iframe": _wrapper(_convert_iframe),
|
|
1869
1806
|
"img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
|
|
1870
1807
|
"input": _wrapper(_convert_input_enhanced),
|
|
1871
|
-
"ins": _wrapper(_create_inline_converter("==")),
|
|
1808
|
+
"ins": _wrapper(_create_inline_converter("==")),
|
|
1872
1809
|
"kbd": _wrapper(_create_inline_converter("`")),
|
|
1873
1810
|
"label": _wrapper(_convert_label),
|
|
1874
1811
|
"legend": _wrapper(_convert_legend),
|
|
@@ -1905,7 +1842,7 @@ def create_converters_map(
|
|
|
1905
1842
|
"script": _wrapper(lambda _: ""),
|
|
1906
1843
|
"section": _wrapper(_convert_semantic_block),
|
|
1907
1844
|
"select": _wrapper(_convert_select),
|
|
1908
|
-
"small": _wrapper(_create_inline_converter("")),
|
|
1845
|
+
"small": _wrapper(_create_inline_converter("")),
|
|
1909
1846
|
"strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
|
|
1910
1847
|
"style": _wrapper(lambda _: ""),
|
|
1911
1848
|
"sub": _wrapper(_create_inline_converter(sub_symbol)),
|
|
@@ -1921,9 +1858,9 @@ def create_converters_map(
|
|
|
1921
1858
|
"thead": _wrapper(_convert_thead),
|
|
1922
1859
|
"time": _wrapper(_convert_time),
|
|
1923
1860
|
"tr": _wrapper(_convert_tr),
|
|
1924
|
-
"u": _wrapper(_create_inline_converter("")),
|
|
1861
|
+
"u": _wrapper(_create_inline_converter("")),
|
|
1925
1862
|
"ul": _wrapper(_convert_list),
|
|
1926
|
-
"var": _wrapper(_create_inline_converter("*")),
|
|
1863
|
+
"var": _wrapper(_create_inline_converter("*")),
|
|
1927
1864
|
"video": _wrapper(_convert_video),
|
|
1928
1865
|
"wbr": _wrapper(_convert_wbr),
|
|
1929
1866
|
}
|