html-to-markdown 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +19 -2
- html_to_markdown/cli.py +103 -25
- html_to_markdown/constants.py +1 -0
- html_to_markdown/converters.py +1646 -104
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/processing.py +720 -47
- html_to_markdown-1.6.0.dist-info/METADATA +472 -0
- html_to_markdown-1.6.0.dist-info/RECORD +15 -0
- html_to_markdown-1.4.0.dist-info/METADATA +0 -249
- html_to_markdown-1.4.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.4.0.dist-info → html_to_markdown-1.6.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -4,6 +4,8 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Iterable
|
|
7
|
+
import base64
|
|
8
|
+
import re
|
|
7
9
|
from functools import partial
|
|
8
10
|
from inspect import getfullargspec
|
|
9
11
|
from textwrap import fill
|
|
@@ -21,41 +23,101 @@ from html_to_markdown.utils import chomp, indent, underline
|
|
|
21
23
|
|
|
22
24
|
SupportedElements = Literal[
|
|
23
25
|
"a",
|
|
26
|
+
"abbr",
|
|
27
|
+
"article",
|
|
28
|
+
"aside",
|
|
29
|
+
"audio",
|
|
24
30
|
"b",
|
|
31
|
+
"bdi",
|
|
32
|
+
"bdo",
|
|
25
33
|
"blockquote",
|
|
26
34
|
"br",
|
|
35
|
+
"button",
|
|
36
|
+
"caption",
|
|
37
|
+
"cite",
|
|
27
38
|
"code",
|
|
39
|
+
"col",
|
|
40
|
+
"colgroup",
|
|
41
|
+
"data",
|
|
42
|
+
"datalist",
|
|
43
|
+
"dd",
|
|
28
44
|
"del",
|
|
45
|
+
"details",
|
|
46
|
+
"dfn",
|
|
47
|
+
"dialog",
|
|
48
|
+
"dl",
|
|
49
|
+
"dt",
|
|
29
50
|
"em",
|
|
51
|
+
"fieldset",
|
|
52
|
+
"figcaption",
|
|
53
|
+
"figure",
|
|
54
|
+
"footer",
|
|
55
|
+
"form",
|
|
30
56
|
"h1",
|
|
31
57
|
"h2",
|
|
32
58
|
"h3",
|
|
33
59
|
"h4",
|
|
34
60
|
"h5",
|
|
35
61
|
"h6",
|
|
62
|
+
"header",
|
|
63
|
+
"hgroup",
|
|
36
64
|
"hr",
|
|
37
65
|
"i",
|
|
66
|
+
"iframe",
|
|
38
67
|
"img",
|
|
68
|
+
"input",
|
|
69
|
+
"ins",
|
|
70
|
+
"kbd",
|
|
71
|
+
"label",
|
|
72
|
+
"legend",
|
|
39
73
|
"list",
|
|
40
|
-
"
|
|
74
|
+
"main",
|
|
75
|
+
"mark",
|
|
76
|
+
"math",
|
|
77
|
+
"menu",
|
|
78
|
+
"meter",
|
|
79
|
+
"nav",
|
|
41
80
|
"ol",
|
|
42
81
|
"li",
|
|
82
|
+
"optgroup",
|
|
83
|
+
"option",
|
|
84
|
+
"output",
|
|
43
85
|
"p",
|
|
86
|
+
"picture",
|
|
44
87
|
"pre",
|
|
45
|
-
"
|
|
46
|
-
"
|
|
88
|
+
"progress",
|
|
89
|
+
"q",
|
|
90
|
+
"rb",
|
|
91
|
+
"rp",
|
|
92
|
+
"rt",
|
|
93
|
+
"rtc",
|
|
94
|
+
"ruby",
|
|
47
95
|
"s",
|
|
48
|
-
"strong",
|
|
49
96
|
"samp",
|
|
97
|
+
"script",
|
|
98
|
+
"section",
|
|
99
|
+
"select",
|
|
100
|
+
"small",
|
|
101
|
+
"strong",
|
|
102
|
+
"style",
|
|
50
103
|
"sub",
|
|
104
|
+
"summary",
|
|
51
105
|
"sup",
|
|
106
|
+
"svg",
|
|
52
107
|
"table",
|
|
53
|
-
"
|
|
54
|
-
"figcaption",
|
|
108
|
+
"tbody",
|
|
55
109
|
"td",
|
|
110
|
+
"textarea",
|
|
111
|
+
"tfoot",
|
|
56
112
|
"th",
|
|
113
|
+
"thead",
|
|
114
|
+
"time",
|
|
57
115
|
"tr",
|
|
58
|
-
"
|
|
116
|
+
"u",
|
|
117
|
+
"ul",
|
|
118
|
+
"var",
|
|
119
|
+
"video",
|
|
120
|
+
"wbr",
|
|
59
121
|
]
|
|
60
122
|
|
|
61
123
|
Converter = Callable[[str, Tag], str]
|
|
@@ -75,7 +137,10 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
75
137
|
"""
|
|
76
138
|
|
|
77
139
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
78
|
-
if
|
|
140
|
+
# Check if we're in a code context - if so, don't apply markup
|
|
141
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
142
|
+
|
|
143
|
+
if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
|
|
79
144
|
return text
|
|
80
145
|
|
|
81
146
|
if not text.strip():
|
|
@@ -119,15 +184,32 @@ def _convert_a(*, tag: Tag, text: str, autolinks: bool, default_title: bool) ->
|
|
|
119
184
|
return f"{prefix}[{text}]({href}{title_part}){suffix}" if href else text
|
|
120
185
|
|
|
121
186
|
|
|
122
|
-
def _convert_blockquote(*, text: str, convert_as_inline: bool) -> str:
|
|
187
|
+
def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
123
188
|
if convert_as_inline:
|
|
124
189
|
return text
|
|
125
|
-
return f"\n{line_beginning_re.sub('> ', text.strip())}\n\n" if text else ""
|
|
126
|
-
|
|
127
190
|
|
|
128
|
-
|
|
129
|
-
if convert_as_inline:
|
|
191
|
+
if not text:
|
|
130
192
|
return ""
|
|
193
|
+
|
|
194
|
+
# Handle cite attribute
|
|
195
|
+
cite_url = tag.get("cite")
|
|
196
|
+
quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
|
|
197
|
+
|
|
198
|
+
if cite_url:
|
|
199
|
+
quote_text += f"— <{cite_url}>\n\n"
|
|
200
|
+
|
|
201
|
+
return quote_text
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
205
|
+
# Convert br to line break, but handle headings specially
|
|
206
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
207
|
+
|
|
208
|
+
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
209
|
+
return " " # Convert to space in headings
|
|
210
|
+
|
|
211
|
+
# Always convert br to line break in other contexts
|
|
212
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
131
213
|
return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
|
|
132
214
|
|
|
133
215
|
|
|
@@ -154,10 +236,15 @@ def _convert_hn(
|
|
|
154
236
|
|
|
155
237
|
def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str] | None) -> str:
|
|
156
238
|
alt = tag.attrs.get("alt", "")
|
|
239
|
+
alt = alt if isinstance(alt, str) else ""
|
|
157
240
|
src = tag.attrs.get("src", "")
|
|
241
|
+
src = src if isinstance(src, str) else ""
|
|
158
242
|
title = tag.attrs.get("title", "")
|
|
243
|
+
title = title if isinstance(title, str) else ""
|
|
159
244
|
width = tag.attrs.get("width", "")
|
|
245
|
+
width = width if isinstance(width, str) else ""
|
|
160
246
|
height = tag.attrs.get("height", "")
|
|
247
|
+
height = height if isinstance(height, str) else ""
|
|
161
248
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
162
249
|
parent_name = tag.parent.name if tag.parent else ""
|
|
163
250
|
# Always preserve images in table cells (td, th) by default
|
|
@@ -194,6 +281,17 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
194
281
|
|
|
195
282
|
|
|
196
283
|
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
284
|
+
# Check for task list (checkbox input)
|
|
285
|
+
checkbox = tag.find("input", {"type": "checkbox"})
|
|
286
|
+
if checkbox and isinstance(checkbox, Tag):
|
|
287
|
+
checked = checkbox.get("checked") is not None
|
|
288
|
+
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
289
|
+
# Remove the checkbox from the text content
|
|
290
|
+
checkbox_text = text
|
|
291
|
+
if checkbox.string:
|
|
292
|
+
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
293
|
+
return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
|
|
294
|
+
|
|
197
295
|
parent = tag.parent
|
|
198
296
|
if parent is not None and parent.name == "ol":
|
|
199
297
|
start = (
|
|
@@ -231,6 +329,29 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
|
|
|
231
329
|
return f"{text}\n\n" if text else ""
|
|
232
330
|
|
|
233
331
|
|
|
332
|
+
def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -> str:
|
|
333
|
+
"""Convert HTML mark element to Markdown highlighting.
|
|
334
|
+
|
|
335
|
+
Args:
|
|
336
|
+
text: The text content of the mark element.
|
|
337
|
+
convert_as_inline: Whether to convert as inline content.
|
|
338
|
+
highlight_style: The style to use for highlighting ("double-equal", "html", "bold").
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
The converted markdown text.
|
|
342
|
+
"""
|
|
343
|
+
if convert_as_inline:
|
|
344
|
+
return text
|
|
345
|
+
|
|
346
|
+
if highlight_style == "double-equal":
|
|
347
|
+
return f"=={text}=="
|
|
348
|
+
if highlight_style == "bold":
|
|
349
|
+
return f"**{text}**"
|
|
350
|
+
if highlight_style == "html":
|
|
351
|
+
return f"<mark>{text}</mark>"
|
|
352
|
+
return text
|
|
353
|
+
|
|
354
|
+
|
|
234
355
|
def _convert_pre(
|
|
235
356
|
*,
|
|
236
357
|
tag: Tag,
|
|
@@ -259,10 +380,10 @@ def _convert_th(*, tag: Tag, text: str) -> str:
|
|
|
259
380
|
|
|
260
381
|
def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
261
382
|
cells = tag.find_all(["td", "th"])
|
|
262
|
-
parent_name = tag.parent.name if tag.parent else ""
|
|
383
|
+
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
263
384
|
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
264
385
|
is_headrow = (
|
|
265
|
-
all(cell.name == "th" for cell in cells)
|
|
386
|
+
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
266
387
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
267
388
|
or (
|
|
268
389
|
not tag.previous_sibling
|
|
@@ -275,8 +396,12 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
275
396
|
if is_headrow and not tag.previous_sibling:
|
|
276
397
|
full_colspan = 0
|
|
277
398
|
for cell in cells:
|
|
278
|
-
if "colspan" in cell.attrs
|
|
279
|
-
|
|
399
|
+
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
400
|
+
colspan_value = cell.attrs["colspan"]
|
|
401
|
+
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
402
|
+
full_colspan += int(colspan_value)
|
|
403
|
+
else:
|
|
404
|
+
full_colspan += 1
|
|
280
405
|
else:
|
|
281
406
|
full_colspan += 1
|
|
282
407
|
underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
|
|
@@ -288,100 +413,1517 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
288
413
|
return overline + "|" + text + "\n" + underline
|
|
289
414
|
|
|
290
415
|
|
|
291
|
-
def
|
|
292
|
-
|
|
293
|
-
bullets: str,
|
|
294
|
-
code_language: str,
|
|
295
|
-
code_language_callback: Callable[[Tag], str] | None,
|
|
296
|
-
default_title: bool,
|
|
297
|
-
heading_style: Literal["atx", "atx_closed", "underlined"],
|
|
298
|
-
keep_inline_images_in: Iterable[str] | None,
|
|
299
|
-
newline_style: str,
|
|
300
|
-
strong_em_symbol: str,
|
|
301
|
-
sub_symbol: str,
|
|
302
|
-
sup_symbol: str,
|
|
303
|
-
wrap: bool,
|
|
304
|
-
wrap_width: int,
|
|
305
|
-
) -> ConvertersMap:
|
|
306
|
-
"""Create a mapping of HTML elements to their corresponding conversion functions.
|
|
416
|
+
def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
|
|
417
|
+
"""Convert HTML caption element to emphasized text.
|
|
307
418
|
|
|
308
419
|
Args:
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
code_language: The default code language to use.
|
|
312
|
-
code_language_callback: A callback to get the code language.
|
|
313
|
-
default_title: Whether to use the URL as the title for links.
|
|
314
|
-
heading_style: The style of headings.
|
|
315
|
-
keep_inline_images_in: The tags to keep inline images in.
|
|
316
|
-
newline_style: The style of newlines.
|
|
317
|
-
strong_em_symbol: The symbol to use for strong and emphasis text.
|
|
318
|
-
sub_symbol: The symbol to use for subscript text.
|
|
319
|
-
sup_symbol: The symbol to use for superscript text.
|
|
320
|
-
wrap: Whether to wrap text.
|
|
321
|
-
wrap_width: The width to wrap text at.
|
|
420
|
+
text: The text content of the caption element.
|
|
421
|
+
convert_as_inline: Whether to convert as inline content.
|
|
322
422
|
|
|
323
423
|
Returns:
|
|
324
|
-
|
|
424
|
+
The converted markdown text with caption formatting.
|
|
325
425
|
"""
|
|
426
|
+
if convert_as_inline:
|
|
427
|
+
return text
|
|
326
428
|
|
|
327
|
-
|
|
328
|
-
|
|
429
|
+
if not text.strip():
|
|
430
|
+
return ""
|
|
329
431
|
|
|
330
|
-
|
|
331
|
-
if spec.kwonlyargs:
|
|
332
|
-
kwargs: dict[str, Any] = {}
|
|
333
|
-
if "tag" in spec.kwonlyargs:
|
|
334
|
-
kwargs["tag"] = tag
|
|
335
|
-
if "text" in spec.kwonlyargs:
|
|
336
|
-
kwargs["text"] = text
|
|
337
|
-
if "convert_as_inline" in spec.kwonlyargs:
|
|
338
|
-
kwargs["convert_as_inline"] = convert_as_inline
|
|
339
|
-
return func(**kwargs)
|
|
340
|
-
return func(text)
|
|
432
|
+
return f"*{text.strip()}*\n\n"
|
|
341
433
|
|
|
342
|
-
return cast("Callable[[str, Tag], T]", _inner)
|
|
343
434
|
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
435
|
+
def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
|
|
436
|
+
"""Convert HTML thead element preserving table structure.
|
|
437
|
+
|
|
438
|
+
Args:
|
|
439
|
+
text: The text content of the thead element.
|
|
440
|
+
convert_as_inline: Whether to convert as inline content.
|
|
441
|
+
|
|
442
|
+
Returns:
|
|
443
|
+
The converted markdown text preserving table structure.
|
|
444
|
+
"""
|
|
445
|
+
if convert_as_inline:
|
|
446
|
+
return text
|
|
447
|
+
|
|
448
|
+
return text
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _convert_tbody(*, text: str, convert_as_inline: bool) -> str:
|
|
452
|
+
"""Convert HTML tbody element preserving table structure.
|
|
453
|
+
|
|
454
|
+
Args:
|
|
455
|
+
text: The text content of the tbody element.
|
|
456
|
+
convert_as_inline: Whether to convert as inline content.
|
|
457
|
+
|
|
458
|
+
Returns:
|
|
459
|
+
The converted markdown text preserving table structure.
|
|
460
|
+
"""
|
|
461
|
+
if convert_as_inline:
|
|
462
|
+
return text
|
|
463
|
+
|
|
464
|
+
return text
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
|
|
468
|
+
"""Convert HTML tfoot element preserving table structure.
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
text: The text content of the tfoot element.
|
|
472
|
+
convert_as_inline: Whether to convert as inline content.
|
|
473
|
+
|
|
474
|
+
Returns:
|
|
475
|
+
The converted markdown text preserving table structure.
|
|
476
|
+
"""
|
|
477
|
+
if convert_as_inline:
|
|
478
|
+
return text
|
|
479
|
+
|
|
480
|
+
return text
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
484
|
+
"""Convert HTML colgroup element preserving column structure for documentation.
|
|
485
|
+
|
|
486
|
+
Args:
|
|
487
|
+
tag: The colgroup tag element.
|
|
488
|
+
text: The text content of the colgroup element.
|
|
489
|
+
convert_as_inline: Whether to convert as inline content.
|
|
490
|
+
|
|
491
|
+
Returns:
|
|
492
|
+
The converted markdown text preserving colgroup structure.
|
|
493
|
+
"""
|
|
494
|
+
if convert_as_inline:
|
|
495
|
+
return text
|
|
496
|
+
|
|
497
|
+
if not text.strip():
|
|
498
|
+
return ""
|
|
499
|
+
|
|
500
|
+
span = tag.get("span", "")
|
|
501
|
+
attrs = []
|
|
502
|
+
if span and isinstance(span, str) and span.strip():
|
|
503
|
+
attrs.append(f'span="{span}"')
|
|
504
|
+
|
|
505
|
+
attrs_str = " ".join(attrs)
|
|
506
|
+
if attrs_str:
|
|
507
|
+
return f"<colgroup {attrs_str}>\n{text.strip()}\n</colgroup>\n\n"
|
|
508
|
+
return f"<colgroup>\n{text.strip()}\n</colgroup>\n\n"
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
512
|
+
"""Convert HTML col element preserving column attributes for documentation.
|
|
513
|
+
|
|
514
|
+
Args:
|
|
515
|
+
tag: The col tag element.
|
|
516
|
+
convert_as_inline: Whether to convert as inline content.
|
|
517
|
+
|
|
518
|
+
Returns:
|
|
519
|
+
The converted markdown text preserving col structure.
|
|
520
|
+
"""
|
|
521
|
+
if convert_as_inline:
|
|
522
|
+
return ""
|
|
523
|
+
|
|
524
|
+
span = tag.get("span", "")
|
|
525
|
+
width = tag.get("width", "")
|
|
526
|
+
style = tag.get("style", "")
|
|
527
|
+
|
|
528
|
+
attrs = []
|
|
529
|
+
if width and isinstance(width, str) and width.strip():
|
|
530
|
+
attrs.append(f'width="{width}"')
|
|
531
|
+
if style and isinstance(style, str) and style.strip():
|
|
532
|
+
attrs.append(f'style="{style}"')
|
|
533
|
+
if span and isinstance(span, str) and span.strip():
|
|
534
|
+
attrs.append(f'span="{span}"')
|
|
535
|
+
|
|
536
|
+
attrs_str = " ".join(attrs)
|
|
537
|
+
if attrs_str:
|
|
538
|
+
return f"<col {attrs_str} />\n"
|
|
539
|
+
return "<col />\n"
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
543
|
+
"""Convert HTML5 semantic elements to block-level Markdown.
|
|
544
|
+
|
|
545
|
+
Args:
|
|
546
|
+
text: The text content of the semantic element.
|
|
547
|
+
convert_as_inline: Whether to convert as inline content.
|
|
548
|
+
|
|
549
|
+
Returns:
|
|
550
|
+
The converted markdown text with proper block spacing.
|
|
551
|
+
"""
|
|
552
|
+
if convert_as_inline:
|
|
553
|
+
return text
|
|
554
|
+
|
|
555
|
+
return f"{text}\n\n" if text.strip() else ""
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
559
|
+
"""Convert HTML details element preserving HTML structure.
|
|
560
|
+
|
|
561
|
+
Args:
|
|
562
|
+
text: The text content of the details element.
|
|
563
|
+
convert_as_inline: Whether to convert as inline content.
|
|
564
|
+
|
|
565
|
+
Returns:
|
|
566
|
+
The converted markdown text preserving HTML structure.
|
|
567
|
+
"""
|
|
568
|
+
if convert_as_inline:
|
|
569
|
+
return text
|
|
570
|
+
|
|
571
|
+
return f"<details>\n{text.strip()}\n</details>\n\n" if text.strip() else ""
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
|
|
575
|
+
"""Convert HTML summary element preserving HTML structure.
|
|
576
|
+
|
|
577
|
+
Args:
|
|
578
|
+
text: The text content of the summary element.
|
|
579
|
+
convert_as_inline: Whether to convert as inline content.
|
|
580
|
+
|
|
581
|
+
Returns:
|
|
582
|
+
The converted markdown text preserving HTML structure.
|
|
583
|
+
"""
|
|
584
|
+
if convert_as_inline:
|
|
585
|
+
return text
|
|
586
|
+
|
|
587
|
+
return f"<summary>{text.strip()}</summary>\n\n" if text.strip() else ""
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
591
|
+
"""Convert HTML definition list element.
|
|
592
|
+
|
|
593
|
+
Args:
|
|
594
|
+
text: The text content of the definition list.
|
|
595
|
+
convert_as_inline: Whether to convert as inline content.
|
|
596
|
+
|
|
597
|
+
Returns:
|
|
598
|
+
The converted markdown text with proper spacing.
|
|
599
|
+
"""
|
|
600
|
+
if convert_as_inline:
|
|
601
|
+
return text
|
|
602
|
+
|
|
603
|
+
return f"{text}\n" if text.strip() else ""
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
607
|
+
"""Convert HTML definition term element.
|
|
608
|
+
|
|
609
|
+
Args:
|
|
610
|
+
text: The text content of the definition term.
|
|
611
|
+
convert_as_inline: Whether to convert as inline content.
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
The converted markdown text as a definition term.
|
|
615
|
+
"""
|
|
616
|
+
if convert_as_inline:
|
|
617
|
+
return text
|
|
618
|
+
|
|
619
|
+
if not text.strip():
|
|
620
|
+
return ""
|
|
621
|
+
|
|
622
|
+
return f"{text.strip()}\n"
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
626
|
+
"""Convert HTML definition description element.
|
|
627
|
+
|
|
628
|
+
Args:
|
|
629
|
+
text: The text content of the definition description.
|
|
630
|
+
convert_as_inline: Whether to convert as inline content.
|
|
631
|
+
|
|
632
|
+
Returns:
|
|
633
|
+
The converted markdown text as a definition description.
|
|
634
|
+
"""
|
|
635
|
+
if convert_as_inline:
|
|
636
|
+
return text
|
|
637
|
+
|
|
638
|
+
if not text.strip():
|
|
639
|
+
return ""
|
|
640
|
+
|
|
641
|
+
return f": {text.strip()}\n\n"
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
645
|
+
"""Convert HTML cite element to italic text.
|
|
646
|
+
|
|
647
|
+
Args:
|
|
648
|
+
text: The text content of the cite element.
|
|
649
|
+
convert_as_inline: Whether to convert as inline content.
|
|
650
|
+
|
|
651
|
+
Returns:
|
|
652
|
+
The converted markdown text in italic format.
|
|
653
|
+
"""
|
|
654
|
+
if convert_as_inline:
|
|
655
|
+
return text
|
|
656
|
+
|
|
657
|
+
if not text.strip():
|
|
658
|
+
return ""
|
|
659
|
+
|
|
660
|
+
return f"*{text.strip()}*"
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
664
|
+
"""Convert HTML q element to quoted text.
|
|
665
|
+
|
|
666
|
+
Args:
|
|
667
|
+
text: The text content of the q element.
|
|
668
|
+
convert_as_inline: Whether to convert as inline content.
|
|
669
|
+
|
|
670
|
+
Returns:
|
|
671
|
+
The converted markdown text with quotes.
|
|
672
|
+
"""
|
|
673
|
+
if convert_as_inline:
|
|
674
|
+
return text
|
|
675
|
+
|
|
676
|
+
if not text.strip():
|
|
677
|
+
return ""
|
|
678
|
+
|
|
679
|
+
# Escape any existing quotes in the text
|
|
680
|
+
escaped_text = text.strip().replace('"', '\\"')
|
|
681
|
+
return f'"{escaped_text}"'
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
685
|
+
"""Convert HTML audio element preserving structure with fallback.
|
|
686
|
+
|
|
687
|
+
Args:
|
|
688
|
+
tag: The audio tag element.
|
|
689
|
+
text: The text content of the audio element (fallback content).
|
|
690
|
+
convert_as_inline: Whether to convert as inline content.
|
|
691
|
+
|
|
692
|
+
Returns:
|
|
693
|
+
The converted markdown text preserving audio element.
|
|
694
|
+
"""
|
|
695
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
696
|
+
src = tag.get("src", "")
|
|
697
|
+
|
|
698
|
+
# Check for source elements if no src attribute
|
|
699
|
+
if not src:
|
|
700
|
+
source_tag = tag.find("source")
|
|
701
|
+
if source_tag and isinstance(source_tag, Tag):
|
|
702
|
+
src = source_tag.get("src", "")
|
|
703
|
+
|
|
704
|
+
# Get other attributes
|
|
705
|
+
controls = "controls" if tag.get("controls") is not None else ""
|
|
706
|
+
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
707
|
+
loop = "loop" if tag.get("loop") is not None else ""
|
|
708
|
+
muted = "muted" if tag.get("muted") is not None else ""
|
|
709
|
+
preload = tag.get("preload", "")
|
|
710
|
+
|
|
711
|
+
# Build attributes string
|
|
712
|
+
attrs = []
|
|
713
|
+
if src and isinstance(src, str) and src.strip():
|
|
714
|
+
attrs.append(f'src="{src}"')
|
|
715
|
+
if controls:
|
|
716
|
+
attrs.append(controls)
|
|
717
|
+
if autoplay:
|
|
718
|
+
attrs.append(autoplay)
|
|
719
|
+
if loop:
|
|
720
|
+
attrs.append(loop)
|
|
721
|
+
if muted:
|
|
722
|
+
attrs.append(muted)
|
|
723
|
+
if preload and isinstance(preload, str) and preload.strip():
|
|
724
|
+
attrs.append(f'preload="{preload}"')
|
|
725
|
+
|
|
726
|
+
attrs_str = " ".join(attrs)
|
|
727
|
+
|
|
728
|
+
# If there's fallback content, preserve it
|
|
729
|
+
if text.strip():
|
|
730
|
+
if attrs_str:
|
|
731
|
+
return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
|
|
732
|
+
return f"<audio>\n{text.strip()}\n</audio>\n\n"
|
|
733
|
+
|
|
734
|
+
# Self-closing for no fallback content
|
|
735
|
+
if attrs_str:
|
|
736
|
+
return f"<audio {attrs_str} />\n\n"
|
|
737
|
+
return "<audio />\n\n"
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
741
|
+
"""Convert HTML video element preserving structure with fallback.
|
|
742
|
+
|
|
743
|
+
Args:
|
|
744
|
+
tag: The video tag element.
|
|
745
|
+
text: The text content of the video element (fallback content).
|
|
746
|
+
convert_as_inline: Whether to convert as inline content.
|
|
747
|
+
|
|
748
|
+
Returns:
|
|
749
|
+
The converted markdown text preserving video element.
|
|
750
|
+
"""
|
|
751
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
752
|
+
src = tag.get("src", "")
|
|
753
|
+
|
|
754
|
+
# Check for source elements if no src attribute
|
|
755
|
+
if not src:
|
|
756
|
+
source_tag = tag.find("source")
|
|
757
|
+
if source_tag and isinstance(source_tag, Tag):
|
|
758
|
+
src = source_tag.get("src", "")
|
|
759
|
+
|
|
760
|
+
# Get other attributes
|
|
761
|
+
width = tag.get("width", "")
|
|
762
|
+
height = tag.get("height", "")
|
|
763
|
+
poster = tag.get("poster", "")
|
|
764
|
+
controls = "controls" if tag.get("controls") is not None else ""
|
|
765
|
+
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
766
|
+
loop = "loop" if tag.get("loop") is not None else ""
|
|
767
|
+
muted = "muted" if tag.get("muted") is not None else ""
|
|
768
|
+
preload = tag.get("preload", "")
|
|
769
|
+
|
|
770
|
+
# Build attributes string
|
|
771
|
+
attrs = []
|
|
772
|
+
if src and isinstance(src, str) and src.strip():
|
|
773
|
+
attrs.append(f'src="{src}"')
|
|
774
|
+
if width and isinstance(width, str) and width.strip():
|
|
775
|
+
attrs.append(f'width="{width}"')
|
|
776
|
+
if height and isinstance(height, str) and height.strip():
|
|
777
|
+
attrs.append(f'height="{height}"')
|
|
778
|
+
if poster and isinstance(poster, str) and poster.strip():
|
|
779
|
+
attrs.append(f'poster="{poster}"')
|
|
780
|
+
if controls:
|
|
781
|
+
attrs.append(controls)
|
|
782
|
+
if autoplay:
|
|
783
|
+
attrs.append(autoplay)
|
|
784
|
+
if loop:
|
|
785
|
+
attrs.append(loop)
|
|
786
|
+
if muted:
|
|
787
|
+
attrs.append(muted)
|
|
788
|
+
if preload and isinstance(preload, str) and preload.strip():
|
|
789
|
+
attrs.append(f'preload="{preload}"')
|
|
790
|
+
|
|
791
|
+
attrs_str = " ".join(attrs)
|
|
792
|
+
|
|
793
|
+
# If there's fallback content, preserve it
|
|
794
|
+
if text.strip():
|
|
795
|
+
if attrs_str:
|
|
796
|
+
return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
|
|
797
|
+
return f"<video>\n{text.strip()}\n</video>\n\n"
|
|
798
|
+
|
|
799
|
+
# Self-closing for no fallback content
|
|
800
|
+
if attrs_str:
|
|
801
|
+
return f"<video {attrs_str} />\n\n"
|
|
802
|
+
return "<video />\n\n"
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
806
|
+
"""Convert HTML iframe element preserving structure.
|
|
807
|
+
|
|
808
|
+
Args:
|
|
809
|
+
tag: The iframe tag element.
|
|
810
|
+
text: The text content of the iframe element (usually empty).
|
|
811
|
+
convert_as_inline: Whether to convert as inline content.
|
|
812
|
+
|
|
813
|
+
Returns:
|
|
814
|
+
The converted markdown text preserving iframe element.
|
|
815
|
+
"""
|
|
816
|
+
_ = text # Unused but kept for API consistency
|
|
817
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
818
|
+
src = tag.get("src", "")
|
|
819
|
+
width = tag.get("width", "")
|
|
820
|
+
height = tag.get("height", "")
|
|
821
|
+
title = tag.get("title", "")
|
|
822
|
+
allow = tag.get("allow", "")
|
|
823
|
+
sandbox = tag.get("sandbox") # Don't provide default
|
|
824
|
+
loading = tag.get("loading", "")
|
|
825
|
+
|
|
826
|
+
# Build attributes string
|
|
827
|
+
attrs = []
|
|
828
|
+
if src and isinstance(src, str) and src.strip():
|
|
829
|
+
attrs.append(f'src="{src}"')
|
|
830
|
+
if width and isinstance(width, str) and width.strip():
|
|
831
|
+
attrs.append(f'width="{width}"')
|
|
832
|
+
if height and isinstance(height, str) and height.strip():
|
|
833
|
+
attrs.append(f'height="{height}"')
|
|
834
|
+
if title and isinstance(title, str) and title.strip():
|
|
835
|
+
attrs.append(f'title="{title}"')
|
|
836
|
+
if allow and isinstance(allow, str) and allow.strip():
|
|
837
|
+
attrs.append(f'allow="{allow}"')
|
|
838
|
+
if sandbox is not None:
|
|
839
|
+
if isinstance(sandbox, list):
|
|
840
|
+
# BeautifulSoup returns AttributeValueList for space-separated values
|
|
841
|
+
if sandbox:
|
|
842
|
+
attrs.append(f'sandbox="{" ".join(sandbox)}"')
|
|
843
|
+
else:
|
|
844
|
+
# Empty list means boolean attribute
|
|
845
|
+
attrs.append("sandbox")
|
|
846
|
+
elif isinstance(sandbox, str) and sandbox:
|
|
847
|
+
attrs.append(f'sandbox="{sandbox}"')
|
|
848
|
+
else:
|
|
849
|
+
attrs.append("sandbox")
|
|
850
|
+
if loading and isinstance(loading, str) and loading.strip():
|
|
851
|
+
attrs.append(f'loading="{loading}"')
|
|
852
|
+
|
|
853
|
+
attrs_str = " ".join(attrs)
|
|
854
|
+
|
|
855
|
+
# iframes are typically self-closing in usage
|
|
856
|
+
if attrs_str:
|
|
857
|
+
return f"<iframe {attrs_str}></iframe>\n\n"
|
|
858
|
+
return "<iframe></iframe>\n\n"
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
862
|
+
"""Convert HTML abbr element to text with optional title.
|
|
863
|
+
|
|
864
|
+
Args:
|
|
865
|
+
tag: The abbr tag element.
|
|
866
|
+
text: The text content of the abbr element.
|
|
867
|
+
convert_as_inline: Whether to convert as inline content.
|
|
868
|
+
|
|
869
|
+
Returns:
|
|
870
|
+
The converted markdown text with optional title annotation.
|
|
871
|
+
"""
|
|
872
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
873
|
+
if not text.strip():
|
|
874
|
+
return ""
|
|
875
|
+
|
|
876
|
+
title = tag.get("title")
|
|
877
|
+
if title and isinstance(title, str) and title.strip():
|
|
878
|
+
# Show abbreviation with title in parentheses
|
|
879
|
+
return f"{text.strip()} ({title.strip()})"
|
|
880
|
+
|
|
881
|
+
return text.strip()
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
885
|
+
"""Convert HTML time element preserving datetime attribute.
|
|
886
|
+
|
|
887
|
+
Args:
|
|
888
|
+
tag: The time tag element.
|
|
889
|
+
text: The text content of the time element.
|
|
890
|
+
convert_as_inline: Whether to convert as inline content.
|
|
891
|
+
|
|
892
|
+
Returns:
|
|
893
|
+
The converted markdown text preserving time information.
|
|
894
|
+
"""
|
|
895
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
896
|
+
if not text.strip():
|
|
897
|
+
return ""
|
|
898
|
+
|
|
899
|
+
datetime_attr = tag.get("datetime")
|
|
900
|
+
if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
|
|
901
|
+
# Preserve machine-readable datetime in HTML
|
|
902
|
+
return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
|
|
903
|
+
|
|
904
|
+
return text.strip()
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
908
|
+
"""Convert HTML data element preserving value attribute.
|
|
909
|
+
|
|
910
|
+
Args:
|
|
911
|
+
tag: The data tag element.
|
|
912
|
+
text: The text content of the data element.
|
|
913
|
+
convert_as_inline: Whether to convert as inline content.
|
|
914
|
+
|
|
915
|
+
Returns:
|
|
916
|
+
The converted markdown text preserving machine-readable data.
|
|
917
|
+
"""
|
|
918
|
+
_ = convert_as_inline # Unused but kept for API consistency
|
|
919
|
+
if not text.strip():
|
|
920
|
+
return ""
|
|
921
|
+
|
|
922
|
+
value_attr = tag.get("value")
|
|
923
|
+
if value_attr and isinstance(value_attr, str) and value_attr.strip():
|
|
924
|
+
# Preserve machine-readable value in HTML
|
|
925
|
+
return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
|
|
926
|
+
|
|
927
|
+
return text.strip()
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
def _convert_wbr(*, convert_as_inline: bool) -> str:
    """Convert an HTML ``<wbr>`` (word break opportunity) element.

    Args:
        convert_as_inline: Unused; accepted for converter API consistency.

    Returns:
        Always an empty string: a break opportunity has no visible output.
    """
    del convert_as_inline  # intentionally ignored, kept for API consistency
    return ""
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<form>`` element, preserving its structure as HTML.

    Args:
        tag: The form tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<form>`` block that carries the non-blank string
        ``action`` and ``method`` attributes (stripped and HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("action", "method"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so quotes in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw.strip(), quote=True)}"')

    attrs_str = " ".join(attrs)
    opening = f"<form {attrs_str}>" if attrs_str else "<form>"
    return f"{opening}\n{content}\n</form>\n\n"
|
|
973
|
+
|
|
974
|
+
|
|
975
|
+
def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
    """Wrap converted content in a ``<fieldset>`` HTML block.

    Args:
        text: The text content of the fieldset element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode, an empty string for blank text,
        otherwise the stripped text wrapped in ``<fieldset>`` tags.
    """
    if convert_as_inline:
        return text

    body = text.strip()
    return f"<fieldset>\n{body}\n</fieldset>\n\n" if body else ""
|
|
992
|
+
|
|
993
|
+
|
|
994
|
+
def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
    """Wrap converted content in a ``<legend>`` HTML element.

    Args:
        text: The text content of the legend element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode, an empty string for blank text,
        otherwise the stripped text wrapped in ``<legend>`` tags.
    """
    if convert_as_inline:
        return text

    caption = text.strip()
    return f"<legend>{caption}</legend>\n\n" if caption else ""
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<label>`` element, keeping its ``for`` attribute.

    Args:
        tag: The label tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<label>`` element, with a ``for`` attribute when
        the tag carries a non-blank string value for it.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    for_attr = tag.get("for")
    if isinstance(for_attr, str) and for_attr.strip():
        # Escape the source-provided id reference before re-emitting it.
        safe_for = escape(for_attr.strip(), quote=True)
        return f'<label for="{safe_for}">{content}</label>\n\n'

    return f"<label>{content}</label>\n\n"
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
    """Convert an HTML ``<input>`` element, preserving relevant attributes.

    Args:
        tag: The input tag element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        An empty string when the input has an ``li`` ancestor; otherwise an
        ``<input ... />`` HTML fragment, followed by a blank line unless
        converting inline. Attribute values are HTML-escaped.
    """
    from html import escape  # noqa: PLC0415

    input_type = tag.get("type", "text")

    # Special handling for inputs in list items - let _convert_li handle checkboxes
    # and ignore other input types in list items (legacy behavior)
    from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415

    if _has_ancestor(tag, "li"):
        return ""

    attrs = []
    if input_type and isinstance(input_type, str):
        attrs.append(f'type="{escape(input_type, quote=True)}"')

    # Valued attributes are emitted only when they are non-blank strings; the
    # value itself is kept unstripped and escaped so quotes cannot break out.
    for attr_name in ("id", "name", "value", "placeholder", "accept"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')

    # Boolean attributes count as set whenever they are present at all.
    attrs.extend(
        flag
        for flag in ("required", "disabled", "readonly", "checked")
        if tag.get(flag) is not None
    )

    attrs_str = " ".join(attrs)
    result = f"<input {attrs_str} />" if attrs_str else "<input />"

    return result if convert_as_inline else f"{result}\n\n"
|
|
1092
|
+
|
|
1093
|
+
|
|
1094
|
+
def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<textarea>`` element, preserving its attributes.

    Args:
        tag: The textarea tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<textarea>`` element carrying the non-blank
        ``name``, ``placeholder``, ``rows`` and ``cols`` attributes
        (HTML-escaped) and ``required`` when present.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("name", "placeholder", "rows", "cols"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')
    if tag.get("required") is not None:
        attrs.append("required")

    attrs_str = " ".join(attrs)
    opening = f"<textarea {attrs_str}>" if attrs_str else "<textarea>"
    return f"{opening}{content}</textarea>\n\n"
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<select>`` element, preserving its structure.

    Args:
        tag: The select tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<select>`` block carrying the non-blank ``id``
        and ``name`` attributes (HTML-escaped) plus ``multiple`` and
        ``required`` when present.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("id", "name"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')
    attrs.extend(flag for flag in ("multiple", "required") if tag.get(flag) is not None)

    attrs_str = " ".join(attrs)
    opening = f"<select {attrs_str}>" if attrs_str else "<select>"
    return f"{opening}\n{content}\n</select>\n\n"
|
|
1175
|
+
|
|
1176
|
+
|
|
1177
|
+
def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<option>`` element, keeping value and selected state.

    Args:
        tag: The option tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a single-line ``<option>`` element carrying a
        non-blank ``value`` (HTML-escaped) and ``selected`` when present.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    value = tag.get("value", "")
    if isinstance(value, str) and value.strip():
        # Escape so a quote in the source value cannot end the attribute.
        attrs.append(f'value="{escape(value, quote=True)}"')
    if tag.get("selected") is not None:
        attrs.append("selected")

    attrs_str = " ".join(attrs)
    opening = f"<option {attrs_str}>" if attrs_str else "<option>"
    return f"{opening}{content}</option>\n"
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<optgroup>`` element, keeping its ``label``.

    Args:
        tag: The optgroup tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise an ``<optgroup>`` block carrying a non-blank
        ``label`` attribute (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    label = tag.get("label", "")
    if isinstance(label, str) and label.strip():
        # Labels are free text and may contain quotes, so escape them.
        opening = f'<optgroup label="{escape(label, quote=True)}">'
    else:
        opening = "<optgroup>"

    return f"{opening}\n{content}\n</optgroup>\n"
|
|
1240
|
+
|
|
1241
|
+
|
|
1242
|
+
def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<button>`` element, preserving type and attributes.

    Args:
        tag: The button tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<button>`` element carrying the non-blank
        ``type``, ``name`` and ``value`` attributes (HTML-escaped) and
        ``disabled`` when present.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("type", "name", "value"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')
    if tag.get("disabled") is not None:
        attrs.append("disabled")

    attrs_str = " ".join(attrs)
    opening = f"<button {attrs_str}>" if attrs_str else "<button>"
    return f"{opening}{content}</button>\n\n"
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<progress>`` element, keeping ``value`` and ``max``.

    Args:
        tag: The progress tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<progress>`` element carrying the non-blank
        ``value`` and ``max`` attributes (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("value", "max"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')

    attrs_str = " ".join(attrs)
    opening = f"<progress {attrs_str}>" if attrs_str else "<progress>"
    return f"{opening}{content}</progress>\n\n"
|
|
1313
|
+
|
|
1314
|
+
|
|
1315
|
+
def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<meter>`` element, keeping value and range attributes.

    Args:
        tag: The meter tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<meter>`` element carrying the non-blank
        ``value``, ``min``, ``max``, ``low``, ``high`` and ``optimum``
        attributes (HTML-escaped), in that order.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    for attr_name in ("value", "min", "max", "low", "high", "optimum"):
        raw = tag.get(attr_name, "")
        if isinstance(raw, str) and raw.strip():
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(raw, quote=True)}"')

    attrs_str = " ".join(attrs)
    opening = f"<meter {attrs_str}>" if attrs_str else "<meter>"
    return f"{opening}{content}</meter>\n\n"
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<output>`` element, keeping ``for`` and ``name``.

    Args:
        tag: The output tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise an ``<output>`` element carrying the non-blank
        ``for`` and ``name`` attributes (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []

    for_attr = tag.get("for", "")
    if for_attr:
        # BeautifulSoup returns space-separated attributes as lists
        for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
        if for_value.strip():
            attrs.append(f'for="{escape(for_value, quote=True)}"')

    name = tag.get("name", "")
    if isinstance(name, str) and name.strip():
        attrs.append(f'name="{escape(name, quote=True)}"')

    attrs_str = " ".join(attrs)
    opening = f"<output {attrs_str}>" if attrs_str else "<output>"
    return f"{opening}{content}</output>\n\n"
|
|
1395
|
+
|
|
1396
|
+
|
|
1397
|
+
def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<datalist>`` element, preserving its structure.

    Args:
        tag: The datalist tag element.
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<datalist>`` block carrying a non-blank ``id``
        attribute (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    id_attr = tag.get("id", "")
    if isinstance(id_attr, str) and id_attr.strip():
        # Escape so a quote in the source id cannot end the attribute.
        opening = f'<datalist id="{escape(id_attr, quote=True)}">'
    else:
        opening = "<datalist>"

    return f"{opening}\n{content}\n</datalist>\n\n"
|
|
1426
|
+
|
|
1427
|
+
|
|
1428
|
+
def _convert_ruby(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
    """Convert an HTML ``<ruby>`` annotation element to its fallback text.

    Args:
        text: The text content of the ruby element.
        convert_as_inline: Unused; the output is the same in both modes.

    Returns:
        The text with surrounding whitespace removed (empty for blank text).
    """
    # Stripping blank text already yields "", so no separate empty check is needed.
    return text.strip()
|
|
1443
|
+
|
|
1444
|
+
|
|
1445
|
+
def _convert_rb(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
    """Convert an HTML ``<rb>`` (ruby base) element.

    Args:
        text: The text content of the rb element.
        convert_as_inline: Unused; the output is the same in both modes.

    Returns:
        The base text with surrounding whitespace removed (empty for blank
        text).
    """
    # The base text is passed through; stripping blank text already gives "".
    return text.strip()
|
|
1460
|
+
|
|
1461
|
+
|
|
1462
|
+
def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str:  # noqa: ARG001
    """Convert an HTML ``<rt>`` (ruby text) element for pronunciation.

    Args:
        text: The text content of the rt element.
        convert_as_inline: Unused; the output is the same in both modes.
        tag: The rt tag element; its siblings are inspected for ``<rp>``.

    Returns:
        The stripped text as-is when the element is enclosed by ``<rp>``
        elements on both sides (they already supply the parentheses),
        otherwise the stripped text wrapped in parentheses. An empty ``<rt>``
        without ``<rp>`` neighbours yields ``"()"``.
    """

    def _neighbour_is_rp(direction: str) -> bool:
        """Report whether the nearest meaningful sibling in *direction* is ``<rp>``."""
        node = getattr(tag, direction, None)
        # Pretty-printed markup puts whitespace-only text nodes between <rp>
        # and <rt>; skip them so the fallback parentheses are not doubled.
        while node is not None and getattr(node, "name", None) is None and not str(node).strip():
            node = getattr(node, direction, None)
        return node is not None and getattr(node, "name", None) == "rp"

    content = text.strip()

    if _neighbour_is_rp("previous_sibling") and _neighbour_is_rp("next_sibling"):
        # Already has rp parentheses, just return the text
        return content

    # Ruby text (pronunciation) shown in parentheses as fallback
    return f"({content})"
|
|
1489
|
+
|
|
1490
|
+
|
|
1491
|
+
def _convert_rp(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
    """Convert an HTML ``<rp>`` (ruby fallback parenthesis) element.

    Args:
        text: The text content of the rp element.
        convert_as_inline: Unused; the output is the same in both modes.

    Returns:
        The fallback parenthesis text with surrounding whitespace removed
        (empty for blank text).
    """
    # The parenthesis is kept for fallback rendering; blank text strips to "".
    return text.strip()
|
|
1506
|
+
|
|
1507
|
+
|
|
1508
|
+
def _convert_rtc(*, text: str, convert_as_inline: bool) -> str:  # noqa: ARG001
    """Convert an HTML ``<rtc>`` (ruby text container) element.

    Args:
        text: The text content of the rtc element.
        convert_as_inline: Unused; the output is the same in both modes.

    Returns:
        The container's content with surrounding whitespace removed (empty
        for blank text).
    """
    # The container is transparent: pass its content through, stripped.
    return text.strip()
|
|
1523
|
+
|
|
1524
|
+
|
|
1525
|
+
def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
    """Convert an HTML ``<dialog>`` element, preserving it as an HTML block.

    Args:
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.
        tag: The dialog tag element.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<dialog>`` block carrying ``open`` when present
        and a truthy ``id`` (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    if tag.get("open") is not None:
        attrs.append("open")
    dialog_id = tag.get("id")
    if dialog_id:
        # Escape so a quote in the source id cannot end the attribute.
        attrs.append(f'id="{escape(str(dialog_id), quote=True)}"')

    attrs_str = " " + " ".join(attrs) if attrs else ""

    return f"<dialog{attrs_str}>\n{content}\n</dialog>\n\n"
|
|
1552
|
+
|
|
1553
|
+
|
|
1554
|
+
def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
    """Convert an HTML ``<menu>`` element, preserving it as an HTML block.

    Args:
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.
        tag: The menu tag element.

    Returns:
        ``text`` unchanged in inline mode; an empty string when the text is
        blank; otherwise a ``<menu>`` block carrying a truthy ``type``
        (omitted when it equals ``"list"``), ``label`` and ``id``, all
        HTML-escaped.
    """
    from html import escape  # noqa: PLC0415

    if convert_as_inline:
        return text

    content = text.strip()
    if not content:
        return ""

    attrs = []
    menu_type = tag.get("type")
    # type="list" is not re-emitted.
    if menu_type and menu_type != "list":
        attrs.append(f'type="{escape(str(menu_type), quote=True)}"')
    for attr_name in ("label", "id"):
        raw = tag.get(attr_name)
        if raw:
            # Escape so a quote in the source value cannot end the attribute.
            attrs.append(f'{attr_name}="{escape(str(raw), quote=True)}"')

    attrs_str = " " + " ".join(attrs) if attrs else ""

    return f"<menu{attrs_str}>\n{content}\n</menu>\n\n"
|
|
1583
|
+
|
|
1584
|
+
|
|
1585
|
+
def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
    """Convert an HTML ``<figure>`` element, preserving it as an HTML block.

    Args:
        text: The already-converted text content of the element.
        convert_as_inline: Whether to convert as inline content.
        tag: The figure tag element.

    Returns:
        An empty string when the text is blank; ``text`` unchanged in inline
        mode; otherwise a ``<figure>`` block carrying truthy ``id`` and
        ``class`` attributes (HTML-escaped).
    """
    from html import escape  # noqa: PLC0415

    content = text.strip()
    if not content:
        return ""

    if convert_as_inline:
        return text

    attrs = []
    figure_id = tag.get("id")
    if figure_id:
        attrs.append(f'id="{escape(str(figure_id), quote=True)}"')
    class_val = tag.get("class")
    if class_val:
        # Handle class attribute which might be a list
        if isinstance(class_val, list):
            class_val = " ".join(class_val)
        attrs.append(f'class="{escape(str(class_val), quote=True)}"')

    attrs_str = " " + " ".join(attrs) if attrs else ""

    # The content is stripped, so it can never end in a newline; the closing
    # tag therefore always goes on its own line.
    return f"<figure{attrs_str}>\n{content}\n</figure>\n\n"
|
|
1624
|
+
|
|
1625
|
+
|
|
1626
|
+
def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
    """Convert an HTML ``<hgroup>`` element, marking the heading group.

    Args:
        text: The text content of the hgroup element.
        convert_as_inline: Whether to convert as inline content.

    Returns:
        ``text`` unchanged in inline mode, an empty string for blank text,
        otherwise the headings enclosed between ``heading group`` HTML
        comment markers.
    """
    if convert_as_inline:
        return text

    headings = text.strip()
    if not headings:
        return ""

    # Collapse runs of three or more newlines to one blank line so the
    # grouped headings sit closer together.
    tightened = re.sub(r"\n{3,}", "\n\n", headings)

    return f"<!-- heading group -->\n{tightened}\n<!-- end heading group -->\n\n"
|
|
1651
|
+
|
|
1652
|
+
|
|
1653
|
+
def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
    """Convert an HTML ``<picture>`` element with responsive image sources.

    Args:
        text: The text content of the picture element (the converted image).
        convert_as_inline: Whether to convert as inline content.
        tag: The picture tag element.

    Returns:
        An empty string for blank text. In block mode, when the picture has
        an ``img`` and at least one ``source`` with a ``srcset``, the
        stripped text preceded by an HTML comment listing those sources.
        In every other case, just the stripped text.
    """
    picture_md = text.strip()
    if not picture_md:
        return ""

    source_tags = tag.find_all("source")
    fallback_img = tag.find("img")

    # Without an img fallback or without any source there is nothing to add.
    if not fallback_img or not source_tags:
        return picture_md

    # Describe every source that declares a srcset.
    descriptions = []
    for source_tag in source_tags:
        srcset = source_tag.get("srcset")
        media = source_tag.get("media")
        mime_type = source_tag.get("type")
        if not srcset:
            continue
        pieces = [f'srcset="{srcset}"']
        if media:
            pieces.append(f'media="{media}"')
        if mime_type:
            pieces.append(f'type="{mime_type}"')
        descriptions.append(" ".join(pieces))

    if convert_as_inline or not descriptions:
        return picture_md

    # Emit the source list as a comment placed ahead of the image.
    listing = "".join(f" {description}\n" for description in descriptions)
    return f"<!-- picture sources:\n{listing}-->\n{picture_md}"
|
|
1707
|
+
|
|
1708
|
+
|
|
1709
|
+
def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
    """Convert an inline SVG element to a Markdown image reference.

    Args:
        text: The text content of the SVG element.
        convert_as_inline: Whether to convert as inline content.
        tag: The SVG tag element.

    Returns:
        The stripped text in inline mode; otherwise a Markdown image whose
        target is a base64 ``data:image/svg+xml`` URI built from the
        serialized tag, and whose alt text is the SVG ``<title>`` text
        (``"SVG Image"`` when there is no title or it is blank).
    """
    if convert_as_inline:
        # In inline mode, just return any text content
        return text.strip()

    # The first <title> found inside the SVG supplies the alt text.
    title = tag.find("title")
    title_text = title.get_text().strip() if title is not None else ""
    alt_text = title_text or "SVG Image"

    # Embed the complete SVG markup as a base64 data URI.
    svg_base64 = base64.b64encode(str(tag).encode("utf-8")).decode("utf-8")
    data_uri = f"data:image/svg+xml;base64,{svg_base64}"

    return f"![{alt_text}]({data_uri})"
|
|
1741
|
+
|
|
1742
|
+
|
|
1743
|
+
def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1744
|
+
"""Convert MathML math element preserving mathematical notation.
|
|
1745
|
+
|
|
1746
|
+
Args:
|
|
1747
|
+
text: The text content of the math element.
|
|
1748
|
+
convert_as_inline: Whether to convert as inline content.
|
|
1749
|
+
tag: The math tag element.
|
|
1750
|
+
|
|
1751
|
+
Returns:
|
|
1752
|
+
The converted markdown text preserving math structure.
|
|
1753
|
+
"""
|
|
1754
|
+
if not text.strip():
|
|
1755
|
+
return ""
|
|
1756
|
+
|
|
1757
|
+
# Check if it's display math vs inline math
|
|
1758
|
+
display = tag.get("display") == "block"
|
|
1759
|
+
|
|
1760
|
+
# For now, preserve the MathML as a comment with the text representation
|
|
1761
|
+
# This allows systems that understand MathML to process it
|
|
1762
|
+
math_comment = f"<!-- MathML: {tag!s} -->"
|
|
1763
|
+
|
|
1764
|
+
if convert_as_inline or not display:
|
|
1765
|
+
# Inline math - just the text with comment
|
|
1766
|
+
return f"{math_comment}{text.strip()}"
|
|
1767
|
+
# Display math - on its own line
|
|
1768
|
+
return f"\n\n{math_comment}\n{text.strip()}\n\n"
|
|
1769
|
+
|
|
1770
|
+
|
|
1771
|
+
def create_converters_map(
    autolinks: bool,
    bullets: str,
    code_language: str,
    code_language_callback: Callable[[Tag], str] | None,
    default_title: bool,
    heading_style: Literal["atx", "atx_closed", "underlined"],
    highlight_style: Literal["double-equal", "html", "bold"],
    keep_inline_images_in: Iterable[str] | None,
    newline_style: str,
    strong_em_symbol: str,
    sub_symbol: str,
    sup_symbol: str,
    wrap: bool,
    wrap_width: int,
) -> ConvertersMap:
    """Build the table that maps HTML element names to converter callables.

    Every converter in the returned mapping is invoked with the keyword
    arguments ``text``, ``tag`` and ``convert_as_inline``; the wrapper forwards
    only the ones the underlying function declares as keyword-only parameters.

    Args:
        autolinks: Whether to convert URLs into links.
        bullets: The bullet characters to use for unordered lists.
        code_language: The default code language to use.
        code_language_callback: A callback to get the code language.
        default_title: Whether to use the URL as the title for links.
        heading_style: The style of headings.
        highlight_style: The style to use for highlighted text (mark elements).
        keep_inline_images_in: The tags to keep inline images in.
        newline_style: The style of newlines.
        strong_em_symbol: The symbol to use for strong and emphasis text.
        sub_symbol: The symbol to use for subscript text.
        sup_symbol: The symbol to use for superscript text.
        wrap: Whether to wrap text.
        wrap_width: The width to wrap text at.

    Returns:
        A mapping of HTML elements to their corresponding conversion functions
    """

    def _wrapper(func: Callable[..., T]) -> Callable[[str, Tag], T]:
        # Inspect the signature once, when the map is built, not on every call.
        accepted = getfullargspec(func).kwonlyargs

        def _inner(*, text: str, tag: Tag, convert_as_inline: bool) -> T:
            if not accepted:
                # No keyword-only parameters: call with the text positionally.
                return func(text)

            offered: dict[str, Any] = {
                "tag": tag,
                "text": text,
                "convert_as_inline": convert_as_inline,
            }
            # Pass along only the keyword-only parameters the converter declares.
            return func(**{name: value for name, value in offered.items() if name in accepted})

        return cast("Callable[[str, Tag], T]", _inner)

    def _inline(markup: str) -> Callable[[str, Tag], Any]:
        # Wrapped inline converter for the given surrounding markup.
        return _wrapper(_create_inline_converter(markup))

    return {
        "a": _wrapper(partial(_convert_a, autolinks=autolinks, default_title=default_title)),
        "abbr": _wrapper(_convert_abbr),
        "article": _wrapper(_convert_semantic_block),
        "aside": _wrapper(_convert_semantic_block),
        "audio": _wrapper(_convert_audio),
        "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
        "bdi": _inline(""),  # Bidirectional isolation - pass through
        "bdo": _inline(""),  # Bidirectional override - pass through
        "blockquote": _wrapper(partial(_convert_blockquote)),
        "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
        "button": _wrapper(_convert_button),
        "caption": _wrapper(_convert_caption),
        "cite": _wrapper(_convert_cite),
        "code": _inline("`"),
        "col": _wrapper(_convert_col),
        "colgroup": _wrapper(_convert_colgroup),
        "data": _wrapper(_convert_data),
        "datalist": _wrapper(_convert_datalist),
        "dd": _wrapper(_convert_dd),
        "del": _inline("~~"),
        "details": _wrapper(_convert_details),
        "dfn": _inline("*"),  # Definition term - italic
        "dialog": _wrapper(_convert_dialog),
        "dl": _wrapper(_convert_dl),
        "dt": _wrapper(_convert_dt),
        "em": _inline(strong_em_symbol),
        "fieldset": _wrapper(_convert_fieldset),
        "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
        "figure": _wrapper(_convert_figure),
        "footer": _wrapper(_convert_semantic_block),
        "form": _wrapper(_convert_form),
        "h1": _wrapper(partial(_convert_hn, n=1, heading_style=heading_style)),
        "h2": _wrapper(partial(_convert_hn, n=2, heading_style=heading_style)),
        "h3": _wrapper(partial(_convert_hn, n=3, heading_style=heading_style)),
        "h4": _wrapper(partial(_convert_hn, n=4, heading_style=heading_style)),
        "h5": _wrapper(partial(_convert_hn, n=5, heading_style=heading_style)),
        "h6": _wrapper(partial(_convert_hn, n=6, heading_style=heading_style)),
        "header": _wrapper(_convert_semantic_block),
        "hgroup": _wrapper(_convert_hgroup),
        "hr": _wrapper(lambda _: "\n\n---\n\n"),
        "i": _wrapper(partial(_create_inline_converter(strong_em_symbol))),
        "iframe": _wrapper(_convert_iframe),
        "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
        "input": _wrapper(_convert_input_enhanced),
        "ins": _inline("=="),  # Inserted text - highlight style
        "kbd": _inline("`"),
        "label": _wrapper(_convert_label),
        "legend": _wrapper(_convert_legend),
        "li": _wrapper(partial(_convert_li, bullets=bullets)),
        "list": _wrapper(_convert_list),
        "main": _wrapper(_convert_semantic_block),
        "mark": _wrapper(partial(_convert_mark, highlight_style=highlight_style)),
        "math": _wrapper(_convert_math),
        "menu": _wrapper(_convert_menu),
        "meter": _wrapper(_convert_meter),
        "nav": _wrapper(_convert_semantic_block),
        "ol": _wrapper(_convert_list),
        "optgroup": _wrapper(_convert_optgroup),
        "option": _wrapper(_convert_option),
        "output": _wrapper(_convert_output),
        "p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width)),
        "picture": _wrapper(_convert_picture),
        "pre": _wrapper(
            partial(
                _convert_pre,
                code_language=code_language,
                code_language_callback=code_language_callback,
            )
        ),
        "progress": _wrapper(_convert_progress),
        "q": _wrapper(_convert_q),
        "rb": _wrapper(_convert_rb),
        "rp": _wrapper(_convert_rp),
        "rt": _wrapper(_convert_rt),
        "rtc": _wrapper(_convert_rtc),
        "ruby": _wrapper(_convert_ruby),
        "s": _inline("~~"),
        "samp": _inline("`"),
        "script": _wrapper(lambda _: ""),
        "section": _wrapper(_convert_semantic_block),
        "select": _wrapper(_convert_select),
        "small": _inline(""),  # Small text - pass through
        "strong": _inline(strong_em_symbol * 2),
        "style": _wrapper(lambda _: ""),
        "sub": _inline(sub_symbol),
        "summary": _wrapper(_convert_summary),
        "sup": _inline(sup_symbol),
        "svg": _wrapper(_convert_svg),
        "table": _wrapper(lambda text: f"\n\n{text}\n"),
        "tbody": _wrapper(_convert_tbody),
        "td": _wrapper(_convert_td),
        "textarea": _wrapper(_convert_textarea),
        "tfoot": _wrapper(_convert_tfoot),
        "th": _wrapper(_convert_th),
        "thead": _wrapper(_convert_thead),
        "time": _wrapper(_convert_time),
        "tr": _wrapper(_convert_tr),
        "u": _inline(""),  # Underlined text - pass through (no Markdown equivalent)
        "ul": _wrapper(_convert_list),
        "var": _inline("*"),  # Variable - italic
        "video": _wrapper(_convert_video),
        "wbr": _wrapper(_convert_wbr),
    }
|