html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +375 -645
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +227 -87
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/METADATA +87 -14
- html_to_markdown-1.9.0.dist-info/RECORD +16 -0
- html_to_markdown-1.6.0.dist-info/RECORD +0 -15
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -5,11 +5,11 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Iterable
|
|
7
7
|
import base64
|
|
8
|
-
import
|
|
8
|
+
from collections.abc import Callable
|
|
9
9
|
from functools import partial
|
|
10
10
|
from inspect import getfullargspec
|
|
11
11
|
from textwrap import fill
|
|
12
|
-
from typing import Any,
|
|
12
|
+
from typing import Any, Literal, TypeVar, cast
|
|
13
13
|
|
|
14
14
|
from bs4.element import Tag
|
|
15
15
|
|
|
@@ -21,6 +21,24 @@ from html_to_markdown.constants import (
|
|
|
21
21
|
)
|
|
22
22
|
from html_to_markdown.utils import chomp, indent, underline
|
|
23
23
|
|
|
24
|
+
|
|
25
|
+
def _format_block_element(text: str) -> str:
|
|
26
|
+
"""Format text as a block element with trailing newlines."""
|
|
27
|
+
return f"{text.strip()}\n\n" if text.strip() else ""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
|
|
31
|
+
"""Format text as inline or block element based on context."""
|
|
32
|
+
return text.strip() if convert_as_inline else _format_block_element(text)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
|
|
36
|
+
"""Format text wrapped in markers as a block element."""
|
|
37
|
+
if not end_marker:
|
|
38
|
+
end_marker = start_marker
|
|
39
|
+
return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
|
|
40
|
+
|
|
41
|
+
|
|
24
42
|
SupportedElements = Literal[
|
|
25
43
|
"a",
|
|
26
44
|
"abbr",
|
|
@@ -137,7 +155,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
137
155
|
"""
|
|
138
156
|
|
|
139
157
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
140
|
-
# Check if we're in a code context - if so, don't apply markup
|
|
141
158
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
142
159
|
|
|
143
160
|
if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
|
|
@@ -151,7 +168,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
|
151
168
|
markup_suffix = "</" + markup_prefix[1:]
|
|
152
169
|
|
|
153
170
|
prefix, suffix, text = chomp(text)
|
|
154
|
-
|
|
155
171
|
return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
|
|
156
172
|
|
|
157
173
|
return cast("Callable[[Tag, str], str]", implementation)
|
|
@@ -191,25 +207,35 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
191
207
|
if not text:
|
|
192
208
|
return ""
|
|
193
209
|
|
|
194
|
-
#
|
|
210
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
211
|
+
|
|
195
212
|
cite_url = tag.get("cite")
|
|
196
|
-
|
|
213
|
+
|
|
214
|
+
# Check if this blockquote is inside a list item
|
|
215
|
+
if _has_ancestor(tag, "li"):
|
|
216
|
+
# Indent the blockquote by 4 spaces
|
|
217
|
+
lines = text.strip().split("\n")
|
|
218
|
+
indented_lines = [f" > {line}" if line.strip() else "" for line in lines]
|
|
219
|
+
quote_text = "\n".join(indented_lines) + "\n\n"
|
|
220
|
+
else:
|
|
221
|
+
quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
|
|
197
222
|
|
|
198
223
|
if cite_url:
|
|
199
|
-
|
|
224
|
+
if _has_ancestor(tag, "li"):
|
|
225
|
+
quote_text += f" — <{cite_url}>\n\n"
|
|
226
|
+
else:
|
|
227
|
+
quote_text += f"— <{cite_url}>\n\n"
|
|
200
228
|
|
|
201
229
|
return quote_text
|
|
202
230
|
|
|
203
231
|
|
|
204
232
|
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
205
|
-
# Convert br to line break, but handle headings specially
|
|
206
233
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
207
234
|
|
|
208
235
|
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
209
|
-
return " "
|
|
236
|
+
return " "
|
|
210
237
|
|
|
211
|
-
|
|
212
|
-
_ = convert_as_inline # Unused but kept for API consistency
|
|
238
|
+
_ = convert_as_inline
|
|
213
239
|
return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
|
|
214
240
|
|
|
215
241
|
|
|
@@ -247,9 +273,9 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
247
273
|
height = height if isinstance(height, str) else ""
|
|
248
274
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
249
275
|
parent_name = tag.parent.name if tag.parent else ""
|
|
250
|
-
|
|
251
|
-
default_preserve_in =
|
|
252
|
-
preserve_in = set(keep_inline_images_in or []) |
|
|
276
|
+
|
|
277
|
+
default_preserve_in = {"td", "th"}
|
|
278
|
+
preserve_in = set(keep_inline_images_in or []) | default_preserve_in
|
|
253
279
|
if convert_as_inline and parent_name not in preserve_in:
|
|
254
280
|
return alt
|
|
255
281
|
if width or height:
|
|
@@ -258,35 +284,52 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
258
284
|
|
|
259
285
|
|
|
260
286
|
def _convert_list(*, tag: Tag, text: str) -> str:
|
|
261
|
-
|
|
287
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
262
288
|
|
|
263
289
|
before_paragraph = False
|
|
264
290
|
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
265
291
|
before_paragraph = True
|
|
266
292
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
293
|
+
# Check if this list is inside a list item
|
|
294
|
+
if _has_ancestor(tag, "li"):
|
|
295
|
+
# This is a nested list - needs indentation
|
|
296
|
+
# But we need to check if it's the first element after a paragraph
|
|
297
|
+
parent = tag.parent
|
|
298
|
+
while parent and parent.name != "li":
|
|
299
|
+
parent = parent.parent
|
|
300
|
+
|
|
301
|
+
if parent:
|
|
302
|
+
# Check if there's a paragraph before this list
|
|
303
|
+
prev_p = None
|
|
304
|
+
for child in parent.children:
|
|
305
|
+
if hasattr(child, "name"):
|
|
306
|
+
if child == tag:
|
|
307
|
+
break
|
|
308
|
+
if child.name == "p":
|
|
309
|
+
prev_p = child
|
|
310
|
+
|
|
311
|
+
if prev_p:
|
|
312
|
+
# If there's a paragraph before, we need proper indentation
|
|
313
|
+
lines = text.strip().split("\n")
|
|
314
|
+
indented_lines = []
|
|
315
|
+
for line in lines:
|
|
316
|
+
if line.strip():
|
|
317
|
+
indented_lines.append(f" {line}")
|
|
318
|
+
else:
|
|
319
|
+
indented_lines.append("")
|
|
320
|
+
return "\n" + "\n".join(indented_lines) + "\n"
|
|
321
|
+
# Otherwise use the original tab indentation
|
|
322
|
+
return "\n" + indent(text=text, level=1).rstrip()
|
|
279
323
|
|
|
280
324
|
return text + ("\n" if before_paragraph else "")
|
|
281
325
|
|
|
282
326
|
|
|
283
327
|
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
284
|
-
# Check for task list (checkbox input)
|
|
285
328
|
checkbox = tag.find("input", {"type": "checkbox"})
|
|
286
329
|
if checkbox and isinstance(checkbox, Tag):
|
|
287
330
|
checked = checkbox.get("checked") is not None
|
|
288
331
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
289
|
-
|
|
332
|
+
|
|
290
333
|
checkbox_text = text
|
|
291
334
|
if checkbox.string:
|
|
292
335
|
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
@@ -311,10 +354,38 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
311
354
|
tag = tag.parent
|
|
312
355
|
|
|
313
356
|
bullet = bullets[depth % len(bullets)]
|
|
357
|
+
|
|
358
|
+
# Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
|
|
359
|
+
has_block_children = any(
|
|
360
|
+
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
361
|
+
for child in tag.children
|
|
362
|
+
if hasattr(child, "name")
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
if has_block_children:
|
|
366
|
+
# Handle multi-paragraph list items
|
|
367
|
+
# Split by double newlines (paragraph separators)
|
|
368
|
+
paragraphs = text.strip().split("\n\n")
|
|
369
|
+
|
|
370
|
+
if paragraphs:
|
|
371
|
+
# First paragraph goes directly after the bullet
|
|
372
|
+
result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
|
|
373
|
+
|
|
374
|
+
# Subsequent paragraphs need to be indented and separated by blank lines
|
|
375
|
+
for para in paragraphs[1:]:
|
|
376
|
+
if para.strip():
|
|
377
|
+
# Add blank line before the paragraph
|
|
378
|
+
result_parts.append("\n")
|
|
379
|
+
# Indent each line of the paragraph by 4 spaces
|
|
380
|
+
result_parts.extend(f" {line}\n" for line in para.strip().split("\n") if line.strip())
|
|
381
|
+
|
|
382
|
+
return "".join(result_parts)
|
|
383
|
+
|
|
384
|
+
# Simple case: no block elements, just inline content
|
|
314
385
|
return "{} {}\n".format(bullet, (text or "").strip())
|
|
315
386
|
|
|
316
387
|
|
|
317
|
-
def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
|
|
388
|
+
def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
|
|
318
389
|
if convert_as_inline:
|
|
319
390
|
return text
|
|
320
391
|
|
|
@@ -326,6 +397,30 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
|
|
|
326
397
|
break_on_hyphens=False,
|
|
327
398
|
)
|
|
328
399
|
|
|
400
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
401
|
+
|
|
402
|
+
# Check if this paragraph is inside a list item
|
|
403
|
+
if _has_ancestor(tag, "li"):
|
|
404
|
+
# Check if this is the first paragraph in the list item
|
|
405
|
+
parent = tag.parent
|
|
406
|
+
while parent and parent.name != "li":
|
|
407
|
+
parent = parent.parent
|
|
408
|
+
|
|
409
|
+
if parent:
|
|
410
|
+
# Get all direct children that are paragraphs
|
|
411
|
+
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
412
|
+
|
|
413
|
+
# If this is not the first paragraph, indent it
|
|
414
|
+
if p_children and tag != p_children[0]:
|
|
415
|
+
# Indent all lines by 4 spaces
|
|
416
|
+
indented_lines = []
|
|
417
|
+
for line in text.split("\n"):
|
|
418
|
+
if line.strip():
|
|
419
|
+
indented_lines.append(f" {line}")
|
|
420
|
+
else:
|
|
421
|
+
indented_lines.append("")
|
|
422
|
+
text = "\n".join(indented_lines)
|
|
423
|
+
|
|
329
424
|
return f"{text}\n\n" if text else ""
|
|
330
425
|
|
|
331
426
|
|
|
@@ -343,13 +438,15 @@ def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -
|
|
|
343
438
|
if convert_as_inline:
|
|
344
439
|
return text
|
|
345
440
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
441
|
+
match highlight_style:
|
|
442
|
+
case "double-equal":
|
|
443
|
+
return f"=={text}=="
|
|
444
|
+
case "bold":
|
|
445
|
+
return f"**{text}**"
|
|
446
|
+
case "html":
|
|
447
|
+
return f"<mark>{text}</mark>"
|
|
448
|
+
case _:
|
|
449
|
+
return text
|
|
353
450
|
|
|
354
451
|
|
|
355
452
|
def _convert_pre(
|
|
@@ -382,6 +479,58 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
382
479
|
cells = tag.find_all(["td", "th"])
|
|
383
480
|
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
384
481
|
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
482
|
+
|
|
483
|
+
# Simple rowspan handling: if previous row had cells with rowspan, add empty cells
|
|
484
|
+
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
485
|
+
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
486
|
+
rowspan_positions = []
|
|
487
|
+
col_pos = 0
|
|
488
|
+
|
|
489
|
+
# Check which cells in previous row have rowspan > 1
|
|
490
|
+
for prev_cell in prev_cells:
|
|
491
|
+
rowspan = 1
|
|
492
|
+
if (
|
|
493
|
+
"rowspan" in prev_cell.attrs
|
|
494
|
+
and isinstance(prev_cell["rowspan"], str)
|
|
495
|
+
and prev_cell["rowspan"].isdigit()
|
|
496
|
+
):
|
|
497
|
+
rowspan = int(prev_cell["rowspan"])
|
|
498
|
+
|
|
499
|
+
if rowspan > 1:
|
|
500
|
+
# This cell spans into current row
|
|
501
|
+
rowspan_positions.append(col_pos)
|
|
502
|
+
|
|
503
|
+
# Account for colspan
|
|
504
|
+
colspan = 1
|
|
505
|
+
if (
|
|
506
|
+
"colspan" in prev_cell.attrs
|
|
507
|
+
and isinstance(prev_cell["colspan"], str)
|
|
508
|
+
and prev_cell["colspan"].isdigit()
|
|
509
|
+
):
|
|
510
|
+
colspan = int(prev_cell["colspan"])
|
|
511
|
+
col_pos += colspan
|
|
512
|
+
|
|
513
|
+
# If there are rowspan cells from previous row, add empty cells
|
|
514
|
+
if rowspan_positions:
|
|
515
|
+
# Build new text with empty cells inserted
|
|
516
|
+
new_cells = []
|
|
517
|
+
cell_index = 0
|
|
518
|
+
|
|
519
|
+
for pos in range(col_pos): # Total columns
|
|
520
|
+
if pos in rowspan_positions:
|
|
521
|
+
# Add empty cell for rowspan
|
|
522
|
+
new_cells.append(" |")
|
|
523
|
+
elif cell_index < len(cells):
|
|
524
|
+
# Add actual cell content
|
|
525
|
+
cell = cells[cell_index]
|
|
526
|
+
cell_text = cell.get_text().strip().replace("\n", " ")
|
|
527
|
+
colspan = _get_colspan(cell)
|
|
528
|
+
new_cells.append(f" {cell_text} |" * colspan)
|
|
529
|
+
cell_index += 1
|
|
530
|
+
|
|
531
|
+
# Override text with new cell arrangement
|
|
532
|
+
text = "".join(new_cells)
|
|
533
|
+
|
|
385
534
|
is_headrow = (
|
|
386
535
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
387
536
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
@@ -429,7 +578,7 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
|
|
|
429
578
|
if not text.strip():
|
|
430
579
|
return ""
|
|
431
580
|
|
|
432
|
-
return
|
|
581
|
+
return _format_wrapped_block(text, "*")
|
|
433
582
|
|
|
434
583
|
|
|
435
584
|
def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -481,7 +630,10 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
|
|
|
481
630
|
|
|
482
631
|
|
|
483
632
|
def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
484
|
-
"""Convert HTML colgroup element
|
|
633
|
+
"""Convert HTML colgroup element - removes it entirely from Markdown output.
|
|
634
|
+
|
|
635
|
+
Colgroup is a table column grouping element that defines styling for columns.
|
|
636
|
+
It has no representation in Markdown and should be removed.
|
|
485
637
|
|
|
486
638
|
Args:
|
|
487
639
|
tag: The colgroup tag element.
|
|
@@ -489,54 +641,30 @@ def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
489
641
|
convert_as_inline: Whether to convert as inline content.
|
|
490
642
|
|
|
491
643
|
Returns:
|
|
492
|
-
|
|
644
|
+
Empty string as colgroup has no Markdown representation.
|
|
493
645
|
"""
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
return ""
|
|
499
|
-
|
|
500
|
-
span = tag.get("span", "")
|
|
501
|
-
attrs = []
|
|
502
|
-
if span and isinstance(span, str) and span.strip():
|
|
503
|
-
attrs.append(f'span="{span}"')
|
|
504
|
-
|
|
505
|
-
attrs_str = " ".join(attrs)
|
|
506
|
-
if attrs_str:
|
|
507
|
-
return f"<colgroup {attrs_str}>\n{text.strip()}\n</colgroup>\n\n"
|
|
508
|
-
return f"<colgroup>\n{text.strip()}\n</colgroup>\n\n"
|
|
646
|
+
_ = tag, text, convert_as_inline
|
|
647
|
+
# Colgroup and its contents (col elements) are purely presentational
|
|
648
|
+
# and have no equivalent in Markdown tables
|
|
649
|
+
return ""
|
|
509
650
|
|
|
510
651
|
|
|
511
652
|
def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
512
|
-
"""Convert HTML col element
|
|
653
|
+
"""Convert HTML col element - removes it entirely from Markdown output.
|
|
654
|
+
|
|
655
|
+
Col elements define column properties (width, style) in HTML tables.
|
|
656
|
+
They have no representation in Markdown and should be removed.
|
|
513
657
|
|
|
514
658
|
Args:
|
|
515
659
|
tag: The col tag element.
|
|
516
660
|
convert_as_inline: Whether to convert as inline content.
|
|
517
661
|
|
|
518
662
|
Returns:
|
|
519
|
-
|
|
663
|
+
Empty string as col has no Markdown representation.
|
|
520
664
|
"""
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
span = tag.get("span", "")
|
|
525
|
-
width = tag.get("width", "")
|
|
526
|
-
style = tag.get("style", "")
|
|
527
|
-
|
|
528
|
-
attrs = []
|
|
529
|
-
if width and isinstance(width, str) and width.strip():
|
|
530
|
-
attrs.append(f'width="{width}"')
|
|
531
|
-
if style and isinstance(style, str) and style.strip():
|
|
532
|
-
attrs.append(f'style="{style}"')
|
|
533
|
-
if span and isinstance(span, str) and span.strip():
|
|
534
|
-
attrs.append(f'span="{span}"')
|
|
535
|
-
|
|
536
|
-
attrs_str = " ".join(attrs)
|
|
537
|
-
if attrs_str:
|
|
538
|
-
return f"<col {attrs_str} />\n"
|
|
539
|
-
return "<col />\n"
|
|
665
|
+
_ = tag, convert_as_inline
|
|
666
|
+
# Col elements are self-closing and purely presentational
|
|
667
|
+
return ""
|
|
540
668
|
|
|
541
669
|
|
|
542
670
|
def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -556,35 +684,37 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
|
556
684
|
|
|
557
685
|
|
|
558
686
|
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
559
|
-
"""Convert HTML details element
|
|
687
|
+
"""Convert HTML details element to semantic Markdown.
|
|
560
688
|
|
|
561
689
|
Args:
|
|
562
690
|
text: The text content of the details element.
|
|
563
691
|
convert_as_inline: Whether to convert as inline content.
|
|
564
692
|
|
|
565
693
|
Returns:
|
|
566
|
-
The converted markdown text
|
|
694
|
+
The converted markdown text (only content, no HTML tags).
|
|
567
695
|
"""
|
|
568
696
|
if convert_as_inline:
|
|
569
697
|
return text
|
|
570
698
|
|
|
571
|
-
|
|
699
|
+
# Details is a semantic container, return its content
|
|
700
|
+
return _format_block_element(text)
|
|
572
701
|
|
|
573
702
|
|
|
574
703
|
def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
|
|
575
|
-
"""Convert HTML summary element
|
|
704
|
+
"""Convert HTML summary element to emphasized text.
|
|
576
705
|
|
|
577
706
|
Args:
|
|
578
707
|
text: The text content of the summary element.
|
|
579
708
|
convert_as_inline: Whether to convert as inline content.
|
|
580
709
|
|
|
581
710
|
Returns:
|
|
582
|
-
The converted markdown text
|
|
711
|
+
The converted markdown text as bold heading.
|
|
583
712
|
"""
|
|
584
713
|
if convert_as_inline:
|
|
585
714
|
return text
|
|
586
715
|
|
|
587
|
-
|
|
716
|
+
# Summary is like a heading/title
|
|
717
|
+
return _format_wrapped_block(text, "**")
|
|
588
718
|
|
|
589
719
|
|
|
590
720
|
def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -676,134 +806,46 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
676
806
|
if not text.strip():
|
|
677
807
|
return ""
|
|
678
808
|
|
|
679
|
-
# Escape any existing quotes in the text
|
|
680
809
|
escaped_text = text.strip().replace('"', '\\"')
|
|
681
810
|
return f'"{escaped_text}"'
|
|
682
811
|
|
|
683
812
|
|
|
684
|
-
def
|
|
685
|
-
"""Convert HTML
|
|
813
|
+
def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
814
|
+
"""Convert HTML media elements (audio/video) to semantic Markdown.
|
|
686
815
|
|
|
687
816
|
Args:
|
|
688
|
-
tag: The
|
|
689
|
-
text: The text content of the
|
|
817
|
+
tag: The media tag element.
|
|
818
|
+
text: The text content of the media element (fallback content).
|
|
690
819
|
convert_as_inline: Whether to convert as inline content.
|
|
691
820
|
|
|
692
821
|
Returns:
|
|
693
|
-
The converted markdown text
|
|
822
|
+
The converted markdown text (link if src exists, otherwise fallback content).
|
|
694
823
|
"""
|
|
695
|
-
_ = convert_as_inline # Unused but kept for API consistency
|
|
696
824
|
src = tag.get("src", "")
|
|
697
825
|
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
source_tag = tag.find("source")
|
|
701
|
-
if source_tag and isinstance(source_tag, Tag):
|
|
702
|
-
src = source_tag.get("src", "")
|
|
703
|
-
|
|
704
|
-
# Get other attributes
|
|
705
|
-
controls = "controls" if tag.get("controls") is not None else ""
|
|
706
|
-
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
707
|
-
loop = "loop" if tag.get("loop") is not None else ""
|
|
708
|
-
muted = "muted" if tag.get("muted") is not None else ""
|
|
709
|
-
preload = tag.get("preload", "")
|
|
710
|
-
|
|
711
|
-
# Build attributes string
|
|
712
|
-
attrs = []
|
|
713
|
-
if src and isinstance(src, str) and src.strip():
|
|
714
|
-
attrs.append(f'src="{src}"')
|
|
715
|
-
if controls:
|
|
716
|
-
attrs.append(controls)
|
|
717
|
-
if autoplay:
|
|
718
|
-
attrs.append(autoplay)
|
|
719
|
-
if loop:
|
|
720
|
-
attrs.append(loop)
|
|
721
|
-
if muted:
|
|
722
|
-
attrs.append(muted)
|
|
723
|
-
if preload and isinstance(preload, str) and preload.strip():
|
|
724
|
-
attrs.append(f'preload="{preload}"')
|
|
725
|
-
|
|
726
|
-
attrs_str = " ".join(attrs)
|
|
727
|
-
|
|
728
|
-
# If there's fallback content, preserve it
|
|
729
|
-
if text.strip():
|
|
730
|
-
if attrs_str:
|
|
731
|
-
return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
|
|
732
|
-
return f"<audio>\n{text.strip()}\n</audio>\n\n"
|
|
826
|
+
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
827
|
+
src = source_tag.get("src", "")
|
|
733
828
|
|
|
734
|
-
#
|
|
735
|
-
if attrs_str:
|
|
736
|
-
return f"<audio {attrs_str} />\n\n"
|
|
737
|
-
return "<audio />\n\n"
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
741
|
-
"""Convert HTML video element preserving structure with fallback.
|
|
742
|
-
|
|
743
|
-
Args:
|
|
744
|
-
tag: The video tag element.
|
|
745
|
-
text: The text content of the video element (fallback content).
|
|
746
|
-
convert_as_inline: Whether to convert as inline content.
|
|
747
|
-
|
|
748
|
-
Returns:
|
|
749
|
-
The converted markdown text preserving video element.
|
|
750
|
-
"""
|
|
751
|
-
_ = convert_as_inline # Unused but kept for API consistency
|
|
752
|
-
src = tag.get("src", "")
|
|
753
|
-
|
|
754
|
-
# Check for source elements if no src attribute
|
|
755
|
-
if not src:
|
|
756
|
-
source_tag = tag.find("source")
|
|
757
|
-
if source_tag and isinstance(source_tag, Tag):
|
|
758
|
-
src = source_tag.get("src", "")
|
|
759
|
-
|
|
760
|
-
# Get other attributes
|
|
761
|
-
width = tag.get("width", "")
|
|
762
|
-
height = tag.get("height", "")
|
|
763
|
-
poster = tag.get("poster", "")
|
|
764
|
-
controls = "controls" if tag.get("controls") is not None else ""
|
|
765
|
-
autoplay = "autoplay" if tag.get("autoplay") is not None else ""
|
|
766
|
-
loop = "loop" if tag.get("loop") is not None else ""
|
|
767
|
-
muted = "muted" if tag.get("muted") is not None else ""
|
|
768
|
-
preload = tag.get("preload", "")
|
|
769
|
-
|
|
770
|
-
# Build attributes string
|
|
771
|
-
attrs = []
|
|
829
|
+
# If we have a src, convert to a link
|
|
772
830
|
if src and isinstance(src, str) and src.strip():
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
attrs.append(autoplay)
|
|
784
|
-
if loop:
|
|
785
|
-
attrs.append(loop)
|
|
786
|
-
if muted:
|
|
787
|
-
attrs.append(muted)
|
|
788
|
-
if preload and isinstance(preload, str) and preload.strip():
|
|
789
|
-
attrs.append(f'preload="{preload}"')
|
|
790
|
-
|
|
791
|
-
attrs_str = " ".join(attrs)
|
|
792
|
-
|
|
793
|
-
# If there's fallback content, preserve it
|
|
831
|
+
link = f"[{src}]({src})"
|
|
832
|
+
if convert_as_inline:
|
|
833
|
+
return link
|
|
834
|
+
result = f"{link}\n\n"
|
|
835
|
+
# Add fallback content if present
|
|
836
|
+
if text.strip():
|
|
837
|
+
result += f"{text.strip()}\n\n"
|
|
838
|
+
return result
|
|
839
|
+
|
|
840
|
+
# No src, just return fallback content
|
|
794
841
|
if text.strip():
|
|
795
|
-
|
|
796
|
-
return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
|
|
797
|
-
return f"<video>\n{text.strip()}\n</video>\n\n"
|
|
842
|
+
return _format_inline_or_block(text, convert_as_inline)
|
|
798
843
|
|
|
799
|
-
|
|
800
|
-
if attrs_str:
|
|
801
|
-
return f"<video {attrs_str} />\n\n"
|
|
802
|
-
return "<video />\n\n"
|
|
844
|
+
return ""
|
|
803
845
|
|
|
804
846
|
|
|
805
847
|
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
806
|
-
"""Convert HTML iframe element
|
|
848
|
+
"""Convert HTML iframe element to semantic Markdown.
|
|
807
849
|
|
|
808
850
|
Args:
|
|
809
851
|
tag: The iframe tag element.
|
|
@@ -811,51 +853,19 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
811
853
|
convert_as_inline: Whether to convert as inline content.
|
|
812
854
|
|
|
813
855
|
Returns:
|
|
814
|
-
The converted markdown text
|
|
856
|
+
The converted markdown text (link if src exists).
|
|
815
857
|
"""
|
|
816
|
-
_ = text
|
|
817
|
-
_ = convert_as_inline # Unused but kept for API consistency
|
|
858
|
+
_ = text
|
|
818
859
|
src = tag.get("src", "")
|
|
819
|
-
width = tag.get("width", "")
|
|
820
|
-
height = tag.get("height", "")
|
|
821
|
-
title = tag.get("title", "")
|
|
822
|
-
allow = tag.get("allow", "")
|
|
823
|
-
sandbox = tag.get("sandbox") # Don't provide default
|
|
824
|
-
loading = tag.get("loading", "")
|
|
825
|
-
|
|
826
|
-
# Build attributes string
|
|
827
|
-
attrs = []
|
|
828
|
-
if src and isinstance(src, str) and src.strip():
|
|
829
|
-
attrs.append(f'src="{src}"')
|
|
830
|
-
if width and isinstance(width, str) and width.strip():
|
|
831
|
-
attrs.append(f'width="{width}"')
|
|
832
|
-
if height and isinstance(height, str) and height.strip():
|
|
833
|
-
attrs.append(f'height="{height}"')
|
|
834
|
-
if title and isinstance(title, str) and title.strip():
|
|
835
|
-
attrs.append(f'title="{title}"')
|
|
836
|
-
if allow and isinstance(allow, str) and allow.strip():
|
|
837
|
-
attrs.append(f'allow="{allow}"')
|
|
838
|
-
if sandbox is not None:
|
|
839
|
-
if isinstance(sandbox, list):
|
|
840
|
-
# BeautifulSoup returns AttributeValueList for space-separated values
|
|
841
|
-
if sandbox:
|
|
842
|
-
attrs.append(f'sandbox="{" ".join(sandbox)}"')
|
|
843
|
-
else:
|
|
844
|
-
# Empty list means boolean attribute
|
|
845
|
-
attrs.append("sandbox")
|
|
846
|
-
elif isinstance(sandbox, str) and sandbox:
|
|
847
|
-
attrs.append(f'sandbox="{sandbox}"')
|
|
848
|
-
else:
|
|
849
|
-
attrs.append("sandbox")
|
|
850
|
-
if loading and isinstance(loading, str) and loading.strip():
|
|
851
|
-
attrs.append(f'loading="{loading}"')
|
|
852
860
|
|
|
853
|
-
|
|
861
|
+
# If we have a src, convert to a link
|
|
862
|
+
if src and isinstance(src, str) and src.strip():
|
|
863
|
+
link = f"[{src}]({src})"
|
|
864
|
+
if convert_as_inline:
|
|
865
|
+
return link
|
|
866
|
+
return f"{link}\n\n"
|
|
854
867
|
|
|
855
|
-
|
|
856
|
-
if attrs_str:
|
|
857
|
-
return f"<iframe {attrs_str}></iframe>\n\n"
|
|
858
|
-
return "<iframe></iframe>\n\n"
|
|
868
|
+
return ""
|
|
859
869
|
|
|
860
870
|
|
|
861
871
|
def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
@@ -869,20 +879,19 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
869
879
|
Returns:
|
|
870
880
|
The converted markdown text with optional title annotation.
|
|
871
881
|
"""
|
|
872
|
-
_ = convert_as_inline
|
|
882
|
+
_ = convert_as_inline
|
|
873
883
|
if not text.strip():
|
|
874
884
|
return ""
|
|
875
885
|
|
|
876
886
|
title = tag.get("title")
|
|
877
887
|
if title and isinstance(title, str) and title.strip():
|
|
878
|
-
# Show abbreviation with title in parentheses
|
|
879
888
|
return f"{text.strip()} ({title.strip()})"
|
|
880
889
|
|
|
881
890
|
return text.strip()
|
|
882
891
|
|
|
883
892
|
|
|
884
893
|
def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
885
|
-
"""Convert HTML time element
|
|
894
|
+
"""Convert HTML time element to semantic Markdown.
|
|
886
895
|
|
|
887
896
|
Args:
|
|
888
897
|
tag: The time tag element.
|
|
@@ -890,22 +899,19 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
890
899
|
convert_as_inline: Whether to convert as inline content.
|
|
891
900
|
|
|
892
901
|
Returns:
|
|
893
|
-
The converted markdown text
|
|
902
|
+
The converted markdown text (content only, no HTML tags).
|
|
894
903
|
"""
|
|
895
|
-
_ =
|
|
904
|
+
_ = tag
|
|
905
|
+
_ = convert_as_inline
|
|
896
906
|
if not text.strip():
|
|
897
907
|
return ""
|
|
898
908
|
|
|
899
|
-
|
|
900
|
-
if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
|
|
901
|
-
# Preserve machine-readable datetime in HTML
|
|
902
|
-
return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
|
|
903
|
-
|
|
909
|
+
# Time elements are semantic - just return the content
|
|
904
910
|
return text.strip()
|
|
905
911
|
|
|
906
912
|
|
|
907
913
|
def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
908
|
-
"""Convert HTML data element
|
|
914
|
+
"""Convert HTML data element to semantic Markdown.
|
|
909
915
|
|
|
910
916
|
Args:
|
|
911
917
|
tag: The data tag element.
|
|
@@ -913,17 +919,14 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
913
919
|
convert_as_inline: Whether to convert as inline content.
|
|
914
920
|
|
|
915
921
|
Returns:
|
|
916
|
-
The converted markdown text
|
|
922
|
+
The converted markdown text (content only, no HTML tags).
|
|
917
923
|
"""
|
|
918
|
-
_ =
|
|
924
|
+
_ = tag
|
|
925
|
+
_ = convert_as_inline
|
|
919
926
|
if not text.strip():
|
|
920
927
|
return ""
|
|
921
928
|
|
|
922
|
-
|
|
923
|
-
if value_attr and isinstance(value_attr, str) and value_attr.strip():
|
|
924
|
-
# Preserve machine-readable value in HTML
|
|
925
|
-
return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
|
|
926
|
-
|
|
929
|
+
# Data elements are semantic - just return the content
|
|
927
930
|
return text.strip()
|
|
928
931
|
|
|
929
932
|
|
|
@@ -936,12 +939,12 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
|
|
|
936
939
|
Returns:
|
|
937
940
|
Empty string as wbr is just a break opportunity.
|
|
938
941
|
"""
|
|
939
|
-
_ = convert_as_inline
|
|
940
|
-
return ""
|
|
942
|
+
_ = convert_as_inline
|
|
943
|
+
return ""
|
|
941
944
|
|
|
942
945
|
|
|
943
946
|
def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
944
|
-
"""Convert HTML form element
|
|
947
|
+
"""Convert HTML form element to semantic Markdown.
|
|
945
948
|
|
|
946
949
|
Args:
|
|
947
950
|
tag: The form tag element.
|
|
@@ -949,38 +952,28 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
949
952
|
convert_as_inline: Whether to convert as inline content.
|
|
950
953
|
|
|
951
954
|
Returns:
|
|
952
|
-
The converted markdown text
|
|
955
|
+
The converted markdown text (only content, no HTML tags).
|
|
953
956
|
"""
|
|
957
|
+
_ = tag
|
|
954
958
|
if convert_as_inline:
|
|
955
959
|
return text
|
|
956
960
|
|
|
957
961
|
if not text.strip():
|
|
958
962
|
return ""
|
|
959
963
|
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
attrs = []
|
|
963
|
-
|
|
964
|
-
if action and isinstance(action, str) and action.strip():
|
|
965
|
-
attrs.append(f'action="{action.strip()}"')
|
|
966
|
-
if method and isinstance(method, str) and method.strip():
|
|
967
|
-
attrs.append(f'method="{method.strip()}"')
|
|
968
|
-
|
|
969
|
-
attrs_str = " ".join(attrs)
|
|
970
|
-
if attrs_str:
|
|
971
|
-
return f"<form {attrs_str}>\n{text.strip()}\n</form>\n\n"
|
|
972
|
-
return f"<form>\n{text.strip()}\n</form>\n\n"
|
|
964
|
+
# Forms are just containers, return their content
|
|
965
|
+
return text
|
|
973
966
|
|
|
974
967
|
|
|
975
968
|
def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
|
|
976
|
-
"""Convert HTML fieldset element
|
|
969
|
+
"""Convert HTML fieldset element to semantic Markdown.
|
|
977
970
|
|
|
978
971
|
Args:
|
|
979
972
|
text: The text content of the fieldset element.
|
|
980
973
|
convert_as_inline: Whether to convert as inline content.
|
|
981
974
|
|
|
982
975
|
Returns:
|
|
983
|
-
The converted markdown text
|
|
976
|
+
The converted markdown text (only content, no HTML tags).
|
|
984
977
|
"""
|
|
985
978
|
if convert_as_inline:
|
|
986
979
|
return text
|
|
@@ -988,7 +981,8 @@ def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
|
|
|
988
981
|
if not text.strip():
|
|
989
982
|
return ""
|
|
990
983
|
|
|
991
|
-
return
|
|
984
|
+
# Fieldsets are semantic groupings, return their content
|
|
985
|
+
return text
|
|
992
986
|
|
|
993
987
|
|
|
994
988
|
def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -1007,11 +1001,12 @@ def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1007
1001
|
if not text.strip():
|
|
1008
1002
|
return ""
|
|
1009
1003
|
|
|
1010
|
-
|
|
1004
|
+
# Legend is like a heading/title for fieldsets
|
|
1005
|
+
return _format_wrapped_block(text, "**")
|
|
1011
1006
|
|
|
1012
1007
|
|
|
1013
1008
|
def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1014
|
-
"""Convert HTML label element
|
|
1009
|
+
"""Convert HTML label element to Markdown.
|
|
1015
1010
|
|
|
1016
1011
|
Args:
|
|
1017
1012
|
tag: The label tag element.
|
|
@@ -1019,80 +1014,33 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1019
1014
|
convert_as_inline: Whether to convert as inline content.
|
|
1020
1015
|
|
|
1021
1016
|
Returns:
|
|
1022
|
-
The
|
|
1017
|
+
The label text content.
|
|
1023
1018
|
"""
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1019
|
+
_ = tag
|
|
1020
|
+
# Labels are just text, return the content
|
|
1027
1021
|
if not text.strip():
|
|
1028
1022
|
return ""
|
|
1029
1023
|
|
|
1030
|
-
|
|
1031
|
-
if for_attr and isinstance(for_attr, str) and for_attr.strip():
|
|
1032
|
-
return f'<label for="{for_attr.strip()}">{text.strip()}</label>\n\n'
|
|
1033
|
-
|
|
1034
|
-
return f"<label>{text.strip()}</label>\n\n"
|
|
1024
|
+
return _format_inline_or_block(text, convert_as_inline)
|
|
1035
1025
|
|
|
1036
1026
|
|
|
1037
1027
|
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1038
|
-
"""Convert HTML input element
|
|
1028
|
+
"""Convert HTML input element to Markdown.
|
|
1039
1029
|
|
|
1040
1030
|
Args:
|
|
1041
1031
|
tag: The input tag element.
|
|
1042
1032
|
convert_as_inline: Whether to convert as inline content.
|
|
1043
1033
|
|
|
1044
1034
|
Returns:
|
|
1045
|
-
|
|
1035
|
+
Empty string since input elements have no Markdown representation.
|
|
1046
1036
|
"""
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
# and ignore other input types in list items (legacy behavior)
|
|
1051
|
-
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
1052
|
-
|
|
1053
|
-
if _has_ancestor(tag, "li"):
|
|
1054
|
-
return ""
|
|
1055
|
-
|
|
1056
|
-
id_attr = tag.get("id", "")
|
|
1057
|
-
name = tag.get("name", "")
|
|
1058
|
-
value = tag.get("value", "")
|
|
1059
|
-
placeholder = tag.get("placeholder", "")
|
|
1060
|
-
required = tag.get("required") is not None
|
|
1061
|
-
disabled = tag.get("disabled") is not None
|
|
1062
|
-
readonly = tag.get("readonly") is not None
|
|
1063
|
-
checked = tag.get("checked") is not None
|
|
1064
|
-
accept = tag.get("accept", "")
|
|
1065
|
-
|
|
1066
|
-
attrs = []
|
|
1067
|
-
if input_type and isinstance(input_type, str):
|
|
1068
|
-
attrs.append(f'type="{input_type}"')
|
|
1069
|
-
if id_attr and isinstance(id_attr, str) and id_attr.strip():
|
|
1070
|
-
attrs.append(f'id="{id_attr}"')
|
|
1071
|
-
if name and isinstance(name, str) and name.strip():
|
|
1072
|
-
attrs.append(f'name="{name}"')
|
|
1073
|
-
if value and isinstance(value, str) and value.strip():
|
|
1074
|
-
attrs.append(f'value="{value}"')
|
|
1075
|
-
if placeholder and isinstance(placeholder, str) and placeholder.strip():
|
|
1076
|
-
attrs.append(f'placeholder="{placeholder}"')
|
|
1077
|
-
if accept and isinstance(accept, str) and accept.strip():
|
|
1078
|
-
attrs.append(f'accept="{accept}"')
|
|
1079
|
-
if required:
|
|
1080
|
-
attrs.append("required")
|
|
1081
|
-
if disabled:
|
|
1082
|
-
attrs.append("disabled")
|
|
1083
|
-
if readonly:
|
|
1084
|
-
attrs.append("readonly")
|
|
1085
|
-
if checked:
|
|
1086
|
-
attrs.append("checked")
|
|
1087
|
-
|
|
1088
|
-
attrs_str = " ".join(attrs)
|
|
1089
|
-
result = f"<input {attrs_str} />" if attrs_str else "<input />"
|
|
1090
|
-
|
|
1091
|
-
return result if convert_as_inline else f"{result}\n\n"
|
|
1037
|
+
_ = tag, convert_as_inline
|
|
1038
|
+
# Input elements have no content and no Markdown equivalent
|
|
1039
|
+
return ""
|
|
1092
1040
|
|
|
1093
1041
|
|
|
1094
1042
|
def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1095
|
-
"""Convert HTML textarea element
|
|
1043
|
+
"""Convert HTML textarea element to Markdown.
|
|
1096
1044
|
|
|
1097
1045
|
Args:
|
|
1098
1046
|
tag: The textarea tag element.
|
|
@@ -1100,42 +1048,18 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1100
1048
|
convert_as_inline: Whether to convert as inline content.
|
|
1101
1049
|
|
|
1102
1050
|
Returns:
|
|
1103
|
-
The
|
|
1051
|
+
The text content of the textarea.
|
|
1104
1052
|
"""
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1053
|
+
_ = tag
|
|
1054
|
+
# Return the text content, which is what the user entered
|
|
1108
1055
|
if not text.strip():
|
|
1109
1056
|
return ""
|
|
1110
1057
|
|
|
1111
|
-
|
|
1112
|
-
placeholder = tag.get("placeholder", "")
|
|
1113
|
-
rows = tag.get("rows", "")
|
|
1114
|
-
cols = tag.get("cols", "")
|
|
1115
|
-
required = tag.get("required") is not None
|
|
1116
|
-
|
|
1117
|
-
attrs = []
|
|
1118
|
-
if name and isinstance(name, str) and name.strip():
|
|
1119
|
-
attrs.append(f'name="{name}"')
|
|
1120
|
-
if placeholder and isinstance(placeholder, str) and placeholder.strip():
|
|
1121
|
-
attrs.append(f'placeholder="{placeholder}"')
|
|
1122
|
-
if rows and isinstance(rows, str) and rows.strip():
|
|
1123
|
-
attrs.append(f'rows="{rows}"')
|
|
1124
|
-
if cols and isinstance(cols, str) and cols.strip():
|
|
1125
|
-
attrs.append(f'cols="{cols}"')
|
|
1126
|
-
if required:
|
|
1127
|
-
attrs.append("required")
|
|
1128
|
-
|
|
1129
|
-
attrs_str = " ".join(attrs)
|
|
1130
|
-
content = text.strip()
|
|
1131
|
-
|
|
1132
|
-
if attrs_str:
|
|
1133
|
-
return f"<textarea {attrs_str}>{content}</textarea>\n\n"
|
|
1134
|
-
return f"<textarea>{content}</textarea>\n\n"
|
|
1058
|
+
return _format_inline_or_block(text, convert_as_inline)
|
|
1135
1059
|
|
|
1136
1060
|
|
|
1137
1061
|
def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1138
|
-
"""Convert HTML select element
|
|
1062
|
+
"""Convert HTML select element to Markdown.
|
|
1139
1063
|
|
|
1140
1064
|
Args:
|
|
1141
1065
|
tag: The select tag element.
|
|
@@ -1143,39 +1067,25 @@ def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1143
1067
|
convert_as_inline: Whether to convert as inline content.
|
|
1144
1068
|
|
|
1145
1069
|
Returns:
|
|
1146
|
-
The
|
|
1070
|
+
The text content (options) as a comma-separated list.
|
|
1147
1071
|
"""
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1072
|
+
_ = tag
|
|
1073
|
+
# Return the options as text
|
|
1151
1074
|
if not text.strip():
|
|
1152
1075
|
return ""
|
|
1153
1076
|
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
attrs = []
|
|
1160
|
-
if id_attr and isinstance(id_attr, str) and id_attr.strip():
|
|
1161
|
-
attrs.append(f'id="{id_attr}"')
|
|
1162
|
-
if name and isinstance(name, str) and name.strip():
|
|
1163
|
-
attrs.append(f'name="{name}"')
|
|
1164
|
-
if multiple:
|
|
1165
|
-
attrs.append("multiple")
|
|
1166
|
-
if required:
|
|
1167
|
-
attrs.append("required")
|
|
1168
|
-
|
|
1169
|
-
attrs_str = " ".join(attrs)
|
|
1170
|
-
content = text.strip()
|
|
1077
|
+
# In inline mode, show options separated by commas
|
|
1078
|
+
if convert_as_inline:
|
|
1079
|
+
# Remove extra whitespace and join options
|
|
1080
|
+
options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
|
|
1081
|
+
return ", ".join(options)
|
|
1171
1082
|
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
return f"<select>\n{content}\n</select>\n\n"
|
|
1083
|
+
# In block mode, show as a list
|
|
1084
|
+
return _format_block_element(text)
|
|
1175
1085
|
|
|
1176
1086
|
|
|
1177
1087
|
def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1178
|
-
"""Convert HTML option element
|
|
1088
|
+
"""Convert HTML option element to Markdown.
|
|
1179
1089
|
|
|
1180
1090
|
Args:
|
|
1181
1091
|
tag: The option tag element.
|
|
@@ -1183,33 +1093,26 @@ def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1183
1093
|
convert_as_inline: Whether to convert as inline content.
|
|
1184
1094
|
|
|
1185
1095
|
Returns:
|
|
1186
|
-
The
|
|
1096
|
+
The option text, potentially with a marker if selected.
|
|
1187
1097
|
"""
|
|
1188
|
-
if convert_as_inline:
|
|
1189
|
-
return text
|
|
1190
|
-
|
|
1191
1098
|
if not text.strip():
|
|
1192
1099
|
return ""
|
|
1193
1100
|
|
|
1194
|
-
|
|
1101
|
+
# Check if this option is selected
|
|
1195
1102
|
selected = tag.get("selected") is not None
|
|
1196
|
-
|
|
1197
|
-
attrs = []
|
|
1198
|
-
if value and isinstance(value, str) and value.strip():
|
|
1199
|
-
attrs.append(f'value="{value}"')
|
|
1200
|
-
if selected:
|
|
1201
|
-
attrs.append("selected")
|
|
1202
|
-
|
|
1203
|
-
attrs_str = " ".join(attrs)
|
|
1204
1103
|
content = text.strip()
|
|
1205
1104
|
|
|
1206
|
-
if
|
|
1207
|
-
return
|
|
1208
|
-
|
|
1105
|
+
if convert_as_inline:
|
|
1106
|
+
return content
|
|
1107
|
+
|
|
1108
|
+
# In block mode, mark selected options
|
|
1109
|
+
if selected:
|
|
1110
|
+
return f"* {content}\n"
|
|
1111
|
+
return f"{content}\n"
|
|
1209
1112
|
|
|
1210
1113
|
|
|
1211
1114
|
def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1212
|
-
"""Convert HTML optgroup element
|
|
1115
|
+
"""Convert HTML optgroup element to semantic Markdown.
|
|
1213
1116
|
|
|
1214
1117
|
Args:
|
|
1215
1118
|
tag: The optgroup tag element.
|
|
@@ -1217,7 +1120,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1217
1120
|
convert_as_inline: Whether to convert as inline content.
|
|
1218
1121
|
|
|
1219
1122
|
Returns:
|
|
1220
|
-
The converted markdown text
|
|
1123
|
+
The converted markdown text with label as heading.
|
|
1221
1124
|
"""
|
|
1222
1125
|
if convert_as_inline:
|
|
1223
1126
|
return text
|
|
@@ -1226,21 +1129,17 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1226
1129
|
return ""
|
|
1227
1130
|
|
|
1228
1131
|
label = tag.get("label", "")
|
|
1132
|
+
content = text.strip()
|
|
1229
1133
|
|
|
1230
|
-
|
|
1134
|
+
# If there's a label, show it as a heading
|
|
1231
1135
|
if label and isinstance(label, str) and label.strip():
|
|
1232
|
-
|
|
1136
|
+
return f"**{label.strip()}**\n{content}\n"
|
|
1233
1137
|
|
|
1234
|
-
|
|
1235
|
-
content = text.strip()
|
|
1236
|
-
|
|
1237
|
-
if attrs_str:
|
|
1238
|
-
return f"<optgroup {attrs_str}>\n{content}\n</optgroup>\n"
|
|
1239
|
-
return f"<optgroup>\n{content}\n</optgroup>\n"
|
|
1138
|
+
return f"{content}\n"
|
|
1240
1139
|
|
|
1241
1140
|
|
|
1242
1141
|
def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1243
|
-
"""Convert HTML button element
|
|
1142
|
+
"""Convert HTML button element to Markdown.
|
|
1244
1143
|
|
|
1245
1144
|
Args:
|
|
1246
1145
|
tag: The button tag element.
|
|
@@ -1248,38 +1147,18 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1248
1147
|
convert_as_inline: Whether to convert as inline content.
|
|
1249
1148
|
|
|
1250
1149
|
Returns:
|
|
1251
|
-
The
|
|
1150
|
+
The button text content.
|
|
1252
1151
|
"""
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1152
|
+
_ = tag
|
|
1153
|
+
# Buttons are just interactive text, return the text content
|
|
1256
1154
|
if not text.strip():
|
|
1257
1155
|
return ""
|
|
1258
1156
|
|
|
1259
|
-
|
|
1260
|
-
name = tag.get("name", "")
|
|
1261
|
-
value = tag.get("value", "")
|
|
1262
|
-
disabled = tag.get("disabled") is not None
|
|
1263
|
-
|
|
1264
|
-
attrs = []
|
|
1265
|
-
if button_type and isinstance(button_type, str) and button_type.strip():
|
|
1266
|
-
attrs.append(f'type="{button_type}"')
|
|
1267
|
-
if name and isinstance(name, str) and name.strip():
|
|
1268
|
-
attrs.append(f'name="{name}"')
|
|
1269
|
-
if value and isinstance(value, str) and value.strip():
|
|
1270
|
-
attrs.append(f'value="{value}"')
|
|
1271
|
-
if disabled:
|
|
1272
|
-
attrs.append("disabled")
|
|
1273
|
-
|
|
1274
|
-
attrs_str = " ".join(attrs)
|
|
1275
|
-
|
|
1276
|
-
if attrs_str:
|
|
1277
|
-
return f"<button {attrs_str}>{text.strip()}</button>\n\n"
|
|
1278
|
-
return f"<button>{text.strip()}</button>\n\n"
|
|
1157
|
+
return _format_inline_or_block(text, convert_as_inline)
|
|
1279
1158
|
|
|
1280
1159
|
|
|
1281
1160
|
def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1282
|
-
"""Convert HTML progress element
|
|
1161
|
+
"""Convert HTML progress element to semantic text.
|
|
1283
1162
|
|
|
1284
1163
|
Args:
|
|
1285
1164
|
tag: The progress tag element.
|
|
@@ -1287,33 +1166,21 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1287
1166
|
convert_as_inline: Whether to convert as inline content.
|
|
1288
1167
|
|
|
1289
1168
|
Returns:
|
|
1290
|
-
The converted markdown text
|
|
1169
|
+
The converted markdown text (only content, no HTML tags).
|
|
1291
1170
|
"""
|
|
1171
|
+
_ = tag
|
|
1292
1172
|
if convert_as_inline:
|
|
1293
1173
|
return text
|
|
1294
1174
|
|
|
1295
1175
|
if not text.strip():
|
|
1296
1176
|
return ""
|
|
1297
1177
|
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
attrs = []
|
|
1302
|
-
if value and isinstance(value, str) and value.strip():
|
|
1303
|
-
attrs.append(f'value="{value}"')
|
|
1304
|
-
if max_val and isinstance(max_val, str) and max_val.strip():
|
|
1305
|
-
attrs.append(f'max="{max_val}"')
|
|
1306
|
-
|
|
1307
|
-
attrs_str = " ".join(attrs)
|
|
1308
|
-
content = text.strip()
|
|
1309
|
-
|
|
1310
|
-
if attrs_str:
|
|
1311
|
-
return f"<progress {attrs_str}>{content}</progress>\n\n"
|
|
1312
|
-
return f"<progress>{content}</progress>\n\n"
|
|
1178
|
+
# Progress elements convert to their text content
|
|
1179
|
+
return _format_block_element(text)
|
|
1313
1180
|
|
|
1314
1181
|
|
|
1315
1182
|
def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1316
|
-
"""Convert HTML meter element
|
|
1183
|
+
"""Convert HTML meter element to semantic text.
|
|
1317
1184
|
|
|
1318
1185
|
Args:
|
|
1319
1186
|
tag: The meter tag element.
|
|
@@ -1321,45 +1188,21 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1321
1188
|
convert_as_inline: Whether to convert as inline content.
|
|
1322
1189
|
|
|
1323
1190
|
Returns:
|
|
1324
|
-
The converted markdown text
|
|
1191
|
+
The converted markdown text (only content, no HTML tags).
|
|
1325
1192
|
"""
|
|
1193
|
+
_ = tag
|
|
1326
1194
|
if convert_as_inline:
|
|
1327
1195
|
return text
|
|
1328
1196
|
|
|
1329
1197
|
if not text.strip():
|
|
1330
1198
|
return ""
|
|
1331
1199
|
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
max_val = tag.get("max", "")
|
|
1335
|
-
low = tag.get("low", "")
|
|
1336
|
-
high = tag.get("high", "")
|
|
1337
|
-
optimum = tag.get("optimum", "")
|
|
1338
|
-
|
|
1339
|
-
attrs = []
|
|
1340
|
-
if value and isinstance(value, str) and value.strip():
|
|
1341
|
-
attrs.append(f'value="{value}"')
|
|
1342
|
-
if min_val and isinstance(min_val, str) and min_val.strip():
|
|
1343
|
-
attrs.append(f'min="{min_val}"')
|
|
1344
|
-
if max_val and isinstance(max_val, str) and max_val.strip():
|
|
1345
|
-
attrs.append(f'max="{max_val}"')
|
|
1346
|
-
if low and isinstance(low, str) and low.strip():
|
|
1347
|
-
attrs.append(f'low="{low}"')
|
|
1348
|
-
if high and isinstance(high, str) and high.strip():
|
|
1349
|
-
attrs.append(f'high="{high}"')
|
|
1350
|
-
if optimum and isinstance(optimum, str) and optimum.strip():
|
|
1351
|
-
attrs.append(f'optimum="{optimum}"')
|
|
1352
|
-
|
|
1353
|
-
attrs_str = " ".join(attrs)
|
|
1354
|
-
content = text.strip()
|
|
1355
|
-
|
|
1356
|
-
if attrs_str:
|
|
1357
|
-
return f"<meter {attrs_str}>{content}</meter>\n\n"
|
|
1358
|
-
return f"<meter>{content}</meter>\n\n"
|
|
1200
|
+
# Meter elements convert to their text content
|
|
1201
|
+
return _format_block_element(text)
|
|
1359
1202
|
|
|
1360
1203
|
|
|
1361
1204
|
def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1362
|
-
"""Convert HTML output element
|
|
1205
|
+
"""Convert HTML output element to semantic text.
|
|
1363
1206
|
|
|
1364
1207
|
Args:
|
|
1365
1208
|
tag: The output tag element.
|
|
@@ -1367,35 +1210,21 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1367
1210
|
convert_as_inline: Whether to convert as inline content.
|
|
1368
1211
|
|
|
1369
1212
|
Returns:
|
|
1370
|
-
The converted markdown text
|
|
1213
|
+
The converted markdown text (only content, no HTML tags).
|
|
1371
1214
|
"""
|
|
1215
|
+
_ = tag
|
|
1372
1216
|
if convert_as_inline:
|
|
1373
1217
|
return text
|
|
1374
1218
|
|
|
1375
1219
|
if not text.strip():
|
|
1376
1220
|
return ""
|
|
1377
1221
|
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
attrs = []
|
|
1382
|
-
if for_attr:
|
|
1383
|
-
# BeautifulSoup returns space-separated attributes as lists
|
|
1384
|
-
for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
|
|
1385
|
-
if for_value.strip():
|
|
1386
|
-
attrs.append(f'for="{for_value}"')
|
|
1387
|
-
if name and isinstance(name, str) and name.strip():
|
|
1388
|
-
attrs.append(f'name="{name}"')
|
|
1389
|
-
|
|
1390
|
-
attrs_str = " ".join(attrs)
|
|
1391
|
-
|
|
1392
|
-
if attrs_str:
|
|
1393
|
-
return f"<output {attrs_str}>{text.strip()}</output>\n\n"
|
|
1394
|
-
return f"<output>{text.strip()}</output>\n\n"
|
|
1222
|
+
# Output elements convert to their text content
|
|
1223
|
+
return _format_block_element(text)
|
|
1395
1224
|
|
|
1396
1225
|
|
|
1397
1226
|
def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1398
|
-
"""Convert HTML datalist element
|
|
1227
|
+
"""Convert HTML datalist element to semantic Markdown.
|
|
1399
1228
|
|
|
1400
1229
|
Args:
|
|
1401
1230
|
tag: The datalist tag element.
|
|
@@ -1403,26 +1232,17 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1403
1232
|
convert_as_inline: Whether to convert as inline content.
|
|
1404
1233
|
|
|
1405
1234
|
Returns:
|
|
1406
|
-
The converted markdown text
|
|
1235
|
+
The converted markdown text (only content, no HTML tags).
|
|
1407
1236
|
"""
|
|
1237
|
+
_ = tag
|
|
1408
1238
|
if convert_as_inline:
|
|
1409
1239
|
return text
|
|
1410
1240
|
|
|
1411
1241
|
if not text.strip():
|
|
1412
1242
|
return ""
|
|
1413
1243
|
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
attrs = []
|
|
1417
|
-
if id_attr and isinstance(id_attr, str) and id_attr.strip():
|
|
1418
|
-
attrs.append(f'id="{id_attr}"')
|
|
1419
|
-
|
|
1420
|
-
attrs_str = " ".join(attrs)
|
|
1421
|
-
content = text.strip()
|
|
1422
|
-
|
|
1423
|
-
if attrs_str:
|
|
1424
|
-
return f"<datalist {attrs_str}>\n{content}\n</datalist>\n\n"
|
|
1425
|
-
return f"<datalist>\n{content}\n</datalist>\n\n"
|
|
1244
|
+
# Datalist shows options as a list
|
|
1245
|
+
return _format_block_element(text)
|
|
1426
1246
|
|
|
1427
1247
|
|
|
1428
1248
|
def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
@@ -1438,7 +1258,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1438
1258
|
if not text.strip():
|
|
1439
1259
|
return ""
|
|
1440
1260
|
|
|
1441
|
-
# Ruby elements are always inline by nature
|
|
1442
1261
|
return text.strip()
|
|
1443
1262
|
|
|
1444
1263
|
|
|
@@ -1455,7 +1274,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1455
1274
|
if not text.strip():
|
|
1456
1275
|
return ""
|
|
1457
1276
|
|
|
1458
|
-
# Ruby base is the main text, pass through as-is
|
|
1459
1277
|
return text.strip()
|
|
1460
1278
|
|
|
1461
1279
|
|
|
@@ -1470,21 +1288,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
|
|
|
1470
1288
|
Returns:
|
|
1471
1289
|
The converted markdown text with pronunciation in parentheses.
|
|
1472
1290
|
"""
|
|
1473
|
-
# Handle empty rt elements - still need parentheses
|
|
1474
1291
|
content = text.strip()
|
|
1475
1292
|
|
|
1476
|
-
# Check if this rt is surrounded by rp elements (fallback parentheses)
|
|
1477
1293
|
prev_sibling = tag.previous_sibling
|
|
1478
1294
|
next_sibling = tag.next_sibling
|
|
1479
1295
|
|
|
1480
|
-
# If surrounded by rp elements, don't add extra parentheses
|
|
1481
1296
|
has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
|
|
1482
1297
|
has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
|
|
1483
1298
|
|
|
1484
1299
|
if has_rp_before and has_rp_after:
|
|
1485
|
-
# Already has rp parentheses, just return the text
|
|
1486
1300
|
return content
|
|
1487
|
-
|
|
1301
|
+
|
|
1488
1302
|
return f"({content})"
|
|
1489
1303
|
|
|
1490
1304
|
|
|
@@ -1501,7 +1315,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1501
1315
|
if not text.strip():
|
|
1502
1316
|
return ""
|
|
1503
1317
|
|
|
1504
|
-
# Ruby parentheses preserved for fallback compatibility
|
|
1505
1318
|
return text.strip()
|
|
1506
1319
|
|
|
1507
1320
|
|
|
@@ -1518,12 +1331,11 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1518
1331
|
if not text.strip():
|
|
1519
1332
|
return ""
|
|
1520
1333
|
|
|
1521
|
-
# Ruby text container, pass through content
|
|
1522
1334
|
return text.strip()
|
|
1523
1335
|
|
|
1524
1336
|
|
|
1525
1337
|
def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1526
|
-
"""Convert HTML dialog element
|
|
1338
|
+
"""Convert HTML dialog element to semantic Markdown.
|
|
1527
1339
|
|
|
1528
1340
|
Args:
|
|
1529
1341
|
text: The text content of the dialog element.
|
|
@@ -1531,28 +1343,21 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1531
1343
|
tag: The dialog tag element.
|
|
1532
1344
|
|
|
1533
1345
|
Returns:
|
|
1534
|
-
The converted markdown text
|
|
1346
|
+
The converted markdown text (only content, no HTML tags).
|
|
1535
1347
|
"""
|
|
1348
|
+
_ = tag
|
|
1536
1349
|
if convert_as_inline:
|
|
1537
1350
|
return text
|
|
1538
1351
|
|
|
1539
1352
|
if not text.strip():
|
|
1540
1353
|
return ""
|
|
1541
1354
|
|
|
1542
|
-
#
|
|
1543
|
-
|
|
1544
|
-
if tag.get("open") is not None:
|
|
1545
|
-
attrs.append("open")
|
|
1546
|
-
if tag.get("id"):
|
|
1547
|
-
attrs.append(f'id="{tag.get("id")}"')
|
|
1548
|
-
|
|
1549
|
-
attrs_str = " " + " ".join(attrs) if attrs else ""
|
|
1550
|
-
|
|
1551
|
-
return f"<dialog{attrs_str}>\n{text.strip()}\n</dialog>\n\n"
|
|
1355
|
+
# Dialog is a semantic container, return its content
|
|
1356
|
+
return _format_block_element(text)
|
|
1552
1357
|
|
|
1553
1358
|
|
|
1554
1359
|
def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1555
|
-
"""Convert HTML menu element
|
|
1360
|
+
"""Convert HTML menu element to semantic Markdown.
|
|
1556
1361
|
|
|
1557
1362
|
Args:
|
|
1558
1363
|
text: The text content of the menu element.
|
|
@@ -1560,30 +1365,21 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1560
1365
|
tag: The menu tag element.
|
|
1561
1366
|
|
|
1562
1367
|
Returns:
|
|
1563
|
-
The converted markdown text
|
|
1368
|
+
The converted markdown text (only content, no HTML tags).
|
|
1564
1369
|
"""
|
|
1370
|
+
_ = tag
|
|
1565
1371
|
if convert_as_inline:
|
|
1566
1372
|
return text
|
|
1567
1373
|
|
|
1568
1374
|
if not text.strip():
|
|
1569
1375
|
return ""
|
|
1570
1376
|
|
|
1571
|
-
#
|
|
1572
|
-
|
|
1573
|
-
if tag.get("type") and tag.get("type") != "list":
|
|
1574
|
-
attrs.append(f'type="{tag.get("type")}"')
|
|
1575
|
-
if tag.get("label"):
|
|
1576
|
-
attrs.append(f'label="{tag.get("label")}"')
|
|
1577
|
-
if tag.get("id"):
|
|
1578
|
-
attrs.append(f'id="{tag.get("id")}"')
|
|
1579
|
-
|
|
1580
|
-
attrs_str = " " + " ".join(attrs) if attrs else ""
|
|
1581
|
-
|
|
1582
|
-
return f"<menu{attrs_str}>\n{text.strip()}\n</menu>\n\n"
|
|
1377
|
+
# Menu is converted as a list
|
|
1378
|
+
return _format_block_element(text)
|
|
1583
1379
|
|
|
1584
1380
|
|
|
1585
1381
|
def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1586
|
-
"""Convert HTML figure element
|
|
1382
|
+
"""Convert HTML figure element to semantic Markdown.
|
|
1587
1383
|
|
|
1588
1384
|
Args:
|
|
1589
1385
|
text: The text content of the figure element.
|
|
@@ -1591,47 +1387,35 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1591
1387
|
tag: The figure tag element.
|
|
1592
1388
|
|
|
1593
1389
|
Returns:
|
|
1594
|
-
The converted markdown text
|
|
1390
|
+
The converted markdown text (only content, no HTML tags).
|
|
1595
1391
|
"""
|
|
1392
|
+
_ = tag
|
|
1596
1393
|
if not text.strip():
|
|
1597
1394
|
return ""
|
|
1598
1395
|
|
|
1599
1396
|
if convert_as_inline:
|
|
1600
1397
|
return text
|
|
1601
1398
|
|
|
1602
|
-
#
|
|
1603
|
-
|
|
1604
|
-
if tag.get("id"):
|
|
1605
|
-
attrs.append(f'id="{tag.get("id")}"')
|
|
1606
|
-
if tag.get("class"):
|
|
1607
|
-
# Handle class attribute which might be a list
|
|
1608
|
-
class_val = tag.get("class")
|
|
1609
|
-
if isinstance(class_val, list):
|
|
1610
|
-
class_val = " ".join(class_val)
|
|
1611
|
-
attrs.append(f'class="{class_val}"')
|
|
1612
|
-
|
|
1613
|
-
attrs_str = " " + " ".join(attrs) if attrs else ""
|
|
1614
|
-
|
|
1615
|
-
# Check if the figure contains only an image (common case)
|
|
1616
|
-
# In that case, we might want to preserve the figure wrapper
|
|
1399
|
+
# Figure is a semantic container, return its content
|
|
1400
|
+
# Make sure there's proper spacing after the figure content
|
|
1617
1401
|
content = text.strip()
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
return
|
|
1402
|
+
if content and not content.endswith("\n\n"):
|
|
1403
|
+
if content.endswith("\n"):
|
|
1404
|
+
content += "\n"
|
|
1405
|
+
else:
|
|
1406
|
+
content += "\n\n"
|
|
1407
|
+
return content
|
|
1624
1408
|
|
|
1625
1409
|
|
|
1626
1410
|
def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
1627
|
-
"""Convert HTML hgroup element
|
|
1411
|
+
"""Convert HTML hgroup element to semantic Markdown.
|
|
1628
1412
|
|
|
1629
1413
|
Args:
|
|
1630
1414
|
text: The text content of the hgroup element.
|
|
1631
1415
|
convert_as_inline: Whether to convert as inline content.
|
|
1632
1416
|
|
|
1633
1417
|
Returns:
|
|
1634
|
-
The converted markdown text
|
|
1418
|
+
The converted markdown text (only content, no HTML tags).
|
|
1635
1419
|
"""
|
|
1636
1420
|
if convert_as_inline:
|
|
1637
1421
|
return text
|
|
@@ -1639,19 +1423,12 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1639
1423
|
if not text.strip():
|
|
1640
1424
|
return ""
|
|
1641
1425
|
|
|
1642
|
-
#
|
|
1643
|
-
|
|
1644
|
-
content = text.strip()
|
|
1645
|
-
|
|
1646
|
-
# Remove excessive newlines between headings in the group
|
|
1647
|
-
# Headings in hgroup should be visually closer together
|
|
1648
|
-
content = re.sub(r"\n{3,}", "\n\n", content)
|
|
1649
|
-
|
|
1650
|
-
return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
|
|
1426
|
+
# Hgroup is a semantic container for headings, return its content
|
|
1427
|
+
return text
|
|
1651
1428
|
|
|
1652
1429
|
|
|
1653
1430
|
def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1654
|
-
"""Convert HTML picture element
|
|
1431
|
+
"""Convert HTML picture element to semantic Markdown.
|
|
1655
1432
|
|
|
1656
1433
|
Args:
|
|
1657
1434
|
text: The text content of the picture element.
|
|
@@ -1659,51 +1436,14 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1659
1436
|
tag: The picture tag element.
|
|
1660
1437
|
|
|
1661
1438
|
Returns:
|
|
1662
|
-
The converted markdown text
|
|
1439
|
+
The converted markdown text (only the img element).
|
|
1663
1440
|
"""
|
|
1441
|
+
_ = tag, convert_as_inline
|
|
1664
1442
|
if not text.strip():
|
|
1665
1443
|
return ""
|
|
1666
1444
|
|
|
1667
|
-
#
|
|
1668
|
-
|
|
1669
|
-
img = tag.find("img")
|
|
1670
|
-
|
|
1671
|
-
if not img:
|
|
1672
|
-
# No img fallback, just return the text content
|
|
1673
|
-
return text.strip()
|
|
1674
|
-
|
|
1675
|
-
# Get the primary image markdown (already converted)
|
|
1676
|
-
img_markdown = text.strip()
|
|
1677
|
-
|
|
1678
|
-
# If there are no sources, just return the image
|
|
1679
|
-
if not sources:
|
|
1680
|
-
return img_markdown
|
|
1681
|
-
|
|
1682
|
-
# Build a comment with source information for responsive images
|
|
1683
|
-
source_info = []
|
|
1684
|
-
for source in sources:
|
|
1685
|
-
srcset = source.get("srcset")
|
|
1686
|
-
media = source.get("media")
|
|
1687
|
-
mime_type = source.get("type")
|
|
1688
|
-
|
|
1689
|
-
if srcset:
|
|
1690
|
-
info = f'srcset="{srcset}"'
|
|
1691
|
-
if media:
|
|
1692
|
-
info += f' media="{media}"'
|
|
1693
|
-
if mime_type:
|
|
1694
|
-
info += f' type="{mime_type}"'
|
|
1695
|
-
source_info.append(info)
|
|
1696
|
-
|
|
1697
|
-
if source_info and not convert_as_inline:
|
|
1698
|
-
# Add picture source information as a comment
|
|
1699
|
-
sources_comment = "<!-- picture sources:\n"
|
|
1700
|
-
for info in source_info:
|
|
1701
|
-
sources_comment += f" {info}\n"
|
|
1702
|
-
sources_comment += "-->\n"
|
|
1703
|
-
return f"{sources_comment}{img_markdown}"
|
|
1704
|
-
|
|
1705
|
-
# In inline mode or no sources, just return the image
|
|
1706
|
-
return img_markdown
|
|
1445
|
+
# Picture is a container for responsive images, only the img matters for Markdown
|
|
1446
|
+
return text.strip()
|
|
1707
1447
|
|
|
1708
1448
|
|
|
1709
1449
|
def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
@@ -1718,23 +1458,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1718
1458
|
The converted markdown text as an image reference.
|
|
1719
1459
|
"""
|
|
1720
1460
|
if convert_as_inline:
|
|
1721
|
-
# In inline mode, just return any text content
|
|
1722
1461
|
return text.strip()
|
|
1723
1462
|
|
|
1724
|
-
# Get SVG attributes
|
|
1725
1463
|
title = tag.find("title")
|
|
1726
1464
|
title_text = title.get_text().strip() if title else ""
|
|
1727
1465
|
|
|
1728
|
-
# For inline SVG, we'll convert to a data URI
|
|
1729
|
-
# First, we need to get the full SVG markup
|
|
1730
1466
|
svg_markup = str(tag)
|
|
1731
1467
|
|
|
1732
|
-
# Create a data URI
|
|
1733
1468
|
svg_bytes = svg_markup.encode("utf-8")
|
|
1734
1469
|
svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
|
|
1735
1470
|
data_uri = f"data:image/svg+xml;base64,{svg_base64}"
|
|
1736
1471
|
|
|
1737
|
-
# Use title as alt text, or "SVG Image" if no title
|
|
1738
1472
|
alt_text = title_text or "SVG Image"
|
|
1739
1473
|
|
|
1740
1474
|
return f"![{alt_text}]({data_uri})"
|
|
@@ -1754,17 +1488,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1754
1488
|
if not text.strip():
|
|
1755
1489
|
return ""
|
|
1756
1490
|
|
|
1757
|
-
# Check if it's display math vs inline math
|
|
1758
1491
|
display = tag.get("display") == "block"
|
|
1759
1492
|
|
|
1760
|
-
# For now, preserve the MathML as a comment with the text representation
|
|
1761
|
-
# This allows systems that understand MathML to process it
|
|
1762
1493
|
math_comment = f"<!-- MathML: {tag!s} -->"
|
|
1763
1494
|
|
|
1764
1495
|
if convert_as_inline or not display:
|
|
1765
|
-
# Inline math - just the text with comment
|
|
1766
1496
|
return f"{math_comment}{text.strip()}"
|
|
1767
|
-
|
|
1497
|
+
|
|
1768
1498
|
return f"\n\n{math_comment}\n{text.strip()}\n\n"
|
|
1769
1499
|
|
|
1770
1500
|
|
|
@@ -1828,10 +1558,10 @@ def create_converters_map(
|
|
|
1828
1558
|
"abbr": _wrapper(_convert_abbr),
|
|
1829
1559
|
"article": _wrapper(_convert_semantic_block),
|
|
1830
1560
|
"aside": _wrapper(_convert_semantic_block),
|
|
1831
|
-
"audio": _wrapper(
|
|
1561
|
+
"audio": _wrapper(_convert_media_element),
|
|
1832
1562
|
"b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
|
|
1833
|
-
"bdi": _wrapper(_create_inline_converter("")),
|
|
1834
|
-
"bdo": _wrapper(_create_inline_converter("")),
|
|
1563
|
+
"bdi": _wrapper(_create_inline_converter("")),
|
|
1564
|
+
"bdo": _wrapper(_create_inline_converter("")),
|
|
1835
1565
|
"blockquote": _wrapper(partial(_convert_blockquote)),
|
|
1836
1566
|
"br": _wrapper(partial(_convert_br, newline_style=newline_style)),
|
|
1837
1567
|
"button": _wrapper(_convert_button),
|
|
@@ -1845,13 +1575,13 @@ def create_converters_map(
|
|
|
1845
1575
|
"dd": _wrapper(_convert_dd),
|
|
1846
1576
|
"del": _wrapper(_create_inline_converter("~~")),
|
|
1847
1577
|
"details": _wrapper(_convert_details),
|
|
1848
|
-
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1578
|
+
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1849
1579
|
"dialog": _wrapper(_convert_dialog),
|
|
1850
1580
|
"dl": _wrapper(_convert_dl),
|
|
1851
1581
|
"dt": _wrapper(_convert_dt),
|
|
1852
1582
|
"em": _wrapper(_create_inline_converter(strong_em_symbol)),
|
|
1853
1583
|
"fieldset": _wrapper(_convert_fieldset),
|
|
1854
|
-
"figcaption": _wrapper(lambda text: f"\n\n{text}
|
|
1584
|
+
"figcaption": _wrapper(lambda text: f"\n\n*{text.strip()}*\n\n" if text.strip() else ""),
|
|
1855
1585
|
"figure": _wrapper(_convert_figure),
|
|
1856
1586
|
"footer": _wrapper(_convert_semantic_block),
|
|
1857
1587
|
"form": _wrapper(_convert_form),
|
|
@@ -1868,7 +1598,7 @@ def create_converters_map(
|
|
|
1868
1598
|
"iframe": _wrapper(_convert_iframe),
|
|
1869
1599
|
"img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
|
|
1870
1600
|
"input": _wrapper(_convert_input_enhanced),
|
|
1871
|
-
"ins": _wrapper(_create_inline_converter("==")),
|
|
1601
|
+
"ins": _wrapper(_create_inline_converter("==")),
|
|
1872
1602
|
"kbd": _wrapper(_create_inline_converter("`")),
|
|
1873
1603
|
"label": _wrapper(_convert_label),
|
|
1874
1604
|
"legend": _wrapper(_convert_legend),
|
|
@@ -1905,7 +1635,7 @@ def create_converters_map(
|
|
|
1905
1635
|
"script": _wrapper(lambda _: ""),
|
|
1906
1636
|
"section": _wrapper(_convert_semantic_block),
|
|
1907
1637
|
"select": _wrapper(_convert_select),
|
|
1908
|
-
"small": _wrapper(_create_inline_converter("")),
|
|
1638
|
+
"small": _wrapper(_create_inline_converter("")),
|
|
1909
1639
|
"strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
|
|
1910
1640
|
"style": _wrapper(lambda _: ""),
|
|
1911
1641
|
"sub": _wrapper(_create_inline_converter(sub_symbol)),
|
|
@@ -1921,9 +1651,9 @@ def create_converters_map(
|
|
|
1921
1651
|
"thead": _wrapper(_convert_thead),
|
|
1922
1652
|
"time": _wrapper(_convert_time),
|
|
1923
1653
|
"tr": _wrapper(_convert_tr),
|
|
1924
|
-
"u": _wrapper(_create_inline_converter("")),
|
|
1654
|
+
"u": _wrapper(_create_inline_converter("")),
|
|
1925
1655
|
"ul": _wrapper(_convert_list),
|
|
1926
|
-
"var": _wrapper(_create_inline_converter("*")),
|
|
1927
|
-
"video": _wrapper(
|
|
1656
|
+
"var": _wrapper(_create_inline_converter("*")),
|
|
1657
|
+
"video": _wrapper(_convert_media_element),
|
|
1928
1658
|
"wbr": _wrapper(_convert_wbr),
|
|
1929
1659
|
}
|