html-to-markdown 1.8.0__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,11 @@ from typing import TYPE_CHECKING
5
5
  if TYPE_CHECKING:
6
6
  from collections.abc import Iterable
7
7
  import base64
8
- import re
8
+ from collections.abc import Callable
9
9
  from functools import partial
10
10
  from inspect import getfullargspec
11
11
  from textwrap import fill
12
- from typing import Any, Callable, Literal, TypeVar, cast
12
+ from typing import Any, Literal, TypeVar, cast
13
13
 
14
14
  from bs4.element import Tag
15
15
 
@@ -21,6 +21,24 @@ from html_to_markdown.constants import (
21
21
  )
22
22
  from html_to_markdown.utils import chomp, indent, underline
23
23
 
24
+
25
+ def _format_block_element(text: str) -> str:
26
+ """Format text as a block element with trailing newlines."""
27
+ return f"{text.strip()}\n\n" if text.strip() else ""
28
+
29
+
30
+ def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
31
+ """Format text as inline or block element based on context."""
32
+ return text.strip() if convert_as_inline else _format_block_element(text)
33
+
34
+
35
+ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
36
+ """Format text wrapped in markers as a block element."""
37
+ if not end_marker:
38
+ end_marker = start_marker
39
+ return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
40
+
41
+
24
42
  SupportedElements = Literal[
25
43
  "a",
26
44
  "abbr",
@@ -189,11 +207,22 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
189
207
  if not text:
190
208
  return ""
191
209
 
210
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
211
+
192
212
  cite_url = tag.get("cite")
193
- quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
213
+
214
+ if _has_ancestor(tag, "li"):
215
+ lines = text.strip().split("\n")
216
+ indented_lines = [f" > {line}" if line.strip() else "" for line in lines]
217
+ quote_text = "\n".join(indented_lines) + "\n\n"
218
+ else:
219
+ quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
194
220
 
195
221
  if cite_url:
196
- quote_text += f"— <{cite_url}>\n\n"
222
+ if _has_ancestor(tag, "li"):
223
+ quote_text += f" — <{cite_url}>\n\n"
224
+ else:
225
+ quote_text += f"— <{cite_url}>\n\n"
197
226
 
198
227
  return quote_text
199
228
 
@@ -243,8 +272,8 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
243
272
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
244
273
  parent_name = tag.parent.name if tag.parent else ""
245
274
 
246
- default_preserve_in = ["td", "th"]
247
- preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
275
+ default_preserve_in = {"td", "th"}
276
+ preserve_in = set(keep_inline_images_in or []) | default_preserve_in
248
277
  if convert_as_inline and parent_name not in preserve_in:
249
278
  return alt
250
279
  if width or height:
@@ -253,24 +282,49 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
253
282
 
254
283
 
255
284
  def _convert_list(*, tag: Tag, text: str) -> str:
256
- nested = False
285
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
257
286
 
258
287
  before_paragraph = False
259
288
  if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
260
289
  before_paragraph = True
261
290
 
262
- while tag:
263
- if tag.name == "li":
264
- nested = True
265
- break
266
-
267
- if not tag.parent:
268
- break
269
-
270
- tag = tag.parent
271
-
272
- if nested:
273
- return "\n" + indent(text=text, level=1).rstrip()
291
+ if _has_ancestor(tag, "li"):
292
+ parent = tag.parent
293
+ while parent and parent.name != "li":
294
+ parent = parent.parent
295
+
296
+ if parent:
297
+ prev_p = None
298
+ for child in parent.children:
299
+ if hasattr(child, "name"):
300
+ if child == tag:
301
+ break
302
+ if child.name == "p":
303
+ prev_p = child
304
+
305
+ if prev_p:
306
+ lines = text.strip().split("\n")
307
+ indented_lines = []
308
+ for line in lines:
309
+ if line.strip():
310
+ indented_lines.append(f" {line}")
311
+ else:
312
+ indented_lines.append("")
313
+ return "\n" + "\n".join(indented_lines) + "\n"
314
+ return "\n" + indent(text=text, level=1).rstrip()
315
+
316
+ if tag.parent and tag.parent.name in {"ul", "ol"}:
317
+ lines = text.strip().split("\n")
318
+ indented_lines = []
319
+ for line in lines:
320
+ if line.strip():
321
+ indented_lines.append(f" {line}")
322
+ else:
323
+ indented_lines.append("")
324
+ result = "\n".join(indented_lines)
325
+ if not result.endswith("\n"):
326
+ result += "\n"
327
+ return result
274
328
 
275
329
  return text + ("\n" if before_paragraph else "")
276
330
 
@@ -305,10 +359,30 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
305
359
  tag = tag.parent
306
360
 
307
361
  bullet = bullets[depth % len(bullets)]
362
+
363
+ has_block_children = any(
364
+ child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
365
+ for child in tag.children
366
+ if hasattr(child, "name")
367
+ )
368
+
369
+ if has_block_children:
370
+ paragraphs = text.strip().split("\n\n")
371
+
372
+ if paragraphs:
373
+ result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
374
+
375
+ for para in paragraphs[1:]:
376
+ if para.strip():
377
+ result_parts.append("\n")
378
+ result_parts.extend(f" {line}\n" for line in para.strip().split("\n") if line.strip())
379
+
380
+ return "".join(result_parts)
381
+
308
382
  return "{} {}\n".format(bullet, (text or "").strip())
309
383
 
310
384
 
311
- def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
385
+ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
312
386
  if convert_as_inline:
313
387
  return text
314
388
 
@@ -320,6 +394,25 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
320
394
  break_on_hyphens=False,
321
395
  )
322
396
 
397
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
398
+
399
+ if _has_ancestor(tag, "li"):
400
+ parent = tag.parent
401
+ while parent and parent.name != "li":
402
+ parent = parent.parent
403
+
404
+ if parent:
405
+ p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
406
+
407
+ if p_children and tag != p_children[0]:
408
+ indented_lines = []
409
+ for line in text.split("\n"):
410
+ if line.strip():
411
+ indented_lines.append(f" {line}")
412
+ else:
413
+ indented_lines.append("")
414
+ text = "\n".join(indented_lines)
415
+
323
416
  return f"{text}\n\n" if text else ""
324
417
 
325
418
 
@@ -337,13 +430,15 @@ def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -
337
430
  if convert_as_inline:
338
431
  return text
339
432
 
340
- if highlight_style == "double-equal":
341
- return f"=={text}=="
342
- if highlight_style == "bold":
343
- return f"**{text}**"
344
- if highlight_style == "html":
345
- return f"<mark>{text}</mark>"
346
- return text
433
+ match highlight_style:
434
+ case "double-equal":
435
+ return f"=={text}=="
436
+ case "bold":
437
+ return f"**{text}**"
438
+ case "html":
439
+ return f"<mark>{text}</mark>"
440
+ case _:
441
+ return text
347
442
 
348
443
 
349
444
  def _convert_pre(
@@ -376,6 +471,51 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
376
471
  cells = tag.find_all(["td", "th"])
377
472
  parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
378
473
  tag_grand_parent = tag.parent.parent if tag.parent else None
474
+
475
+ if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
476
+ prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
477
+ rowspan_positions = []
478
+ col_pos = 0
479
+
480
+ for prev_cell in prev_cells:
481
+ rowspan = 1
482
+ if (
483
+ "rowspan" in prev_cell.attrs
484
+ and isinstance(prev_cell["rowspan"], str)
485
+ and prev_cell["rowspan"].isdigit()
486
+ ):
487
+ rowspan = int(prev_cell["rowspan"])
488
+
489
+ if rowspan > 1:
490
+ rowspan_positions.append(col_pos)
491
+
492
+ colspan = 1
493
+ if (
494
+ "colspan" in prev_cell.attrs
495
+ and isinstance(prev_cell["colspan"], str)
496
+ and prev_cell["colspan"].isdigit()
497
+ ):
498
+ colspan = int(prev_cell["colspan"])
499
+ col_pos += colspan
500
+
501
+ if rowspan_positions:
502
+ converted_cells: list[str] = []
503
+ if text.strip():
504
+ parts = text.split("|")
505
+ converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
506
+
507
+ new_cells: list[str] = []
508
+ cell_index = 0
509
+
510
+ for pos in range(col_pos):
511
+ if pos in rowspan_positions:
512
+ new_cells.append(" |")
513
+ elif cell_index < len(converted_cells):
514
+ new_cells.append(converted_cells[cell_index])
515
+ cell_index += 1
516
+
517
+ text = "".join(new_cells)
518
+
379
519
  is_headrow = (
380
520
  all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
381
521
  or (not tag.previous_sibling and parent_name != "tbody")
@@ -423,7 +563,7 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
423
563
  if not text.strip():
424
564
  return ""
425
565
 
426
- return f"*{text.strip()}*\n\n"
566
+ return _format_wrapped_block(text, "*")
427
567
 
428
568
 
429
569
  def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
@@ -475,7 +615,10 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
475
615
 
476
616
 
477
617
  def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
478
- """Convert HTML colgroup element preserving column structure for documentation.
618
+ """Convert HTML colgroup element - removes it entirely from Markdown output.
619
+
620
+ Colgroup is a table column grouping element that defines styling for columns.
621
+ It has no representation in Markdown and should be removed.
479
622
 
480
623
  Args:
481
624
  tag: The colgroup tag element.
@@ -483,54 +626,27 @@ def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
483
626
  convert_as_inline: Whether to convert as inline content.
484
627
 
485
628
  Returns:
486
- The converted markdown text preserving colgroup structure.
629
+ Empty string as colgroup has no Markdown representation.
487
630
  """
488
- if convert_as_inline:
489
- return text
490
-
491
- if not text.strip():
492
- return ""
493
-
494
- span = tag.get("span", "")
495
- attrs = []
496
- if span and isinstance(span, str) and span.strip():
497
- attrs.append(f'span="{span}"')
498
-
499
- attrs_str = " ".join(attrs)
500
- if attrs_str:
501
- return f"<colgroup {attrs_str}>\n{text.strip()}\n</colgroup>\n\n"
502
- return f"<colgroup>\n{text.strip()}\n</colgroup>\n\n"
631
+ _ = tag, text, convert_as_inline
632
+ return ""
503
633
 
504
634
 
505
635
  def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
506
- """Convert HTML col element preserving column attributes for documentation.
636
+ """Convert HTML col element - removes it entirely from Markdown output.
637
+
638
+ Col elements define column properties (width, style) in HTML tables.
639
+ They have no representation in Markdown and should be removed.
507
640
 
508
641
  Args:
509
642
  tag: The col tag element.
510
643
  convert_as_inline: Whether to convert as inline content.
511
644
 
512
645
  Returns:
513
- The converted markdown text preserving col structure.
646
+ Empty string as col has no Markdown representation.
514
647
  """
515
- if convert_as_inline:
516
- return ""
517
-
518
- span = tag.get("span", "")
519
- width = tag.get("width", "")
520
- style = tag.get("style", "")
521
-
522
- attrs = []
523
- if width and isinstance(width, str) and width.strip():
524
- attrs.append(f'width="{width}"')
525
- if style and isinstance(style, str) and style.strip():
526
- attrs.append(f'style="{style}"')
527
- if span and isinstance(span, str) and span.strip():
528
- attrs.append(f'span="{span}"')
529
-
530
- attrs_str = " ".join(attrs)
531
- if attrs_str:
532
- return f"<col {attrs_str} />\n"
533
- return "<col />\n"
648
+ _ = tag, convert_as_inline
649
+ return ""
534
650
 
535
651
 
536
652
  def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
@@ -550,35 +666,35 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
550
666
 
551
667
 
552
668
  def _convert_details(*, text: str, convert_as_inline: bool) -> str:
553
- """Convert HTML details element preserving HTML structure.
669
+ """Convert HTML details element to semantic Markdown.
554
670
 
555
671
  Args:
556
672
  text: The text content of the details element.
557
673
  convert_as_inline: Whether to convert as inline content.
558
674
 
559
675
  Returns:
560
- The converted markdown text preserving HTML structure.
676
+ The converted markdown text (only content, no HTML tags).
561
677
  """
562
678
  if convert_as_inline:
563
679
  return text
564
680
 
565
- return f"<details>\n{text.strip()}\n</details>\n\n" if text.strip() else ""
681
+ return _format_block_element(text)
566
682
 
567
683
 
568
684
  def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
569
- """Convert HTML summary element preserving HTML structure.
685
+ """Convert HTML summary element to emphasized text.
570
686
 
571
687
  Args:
572
688
  text: The text content of the summary element.
573
689
  convert_as_inline: Whether to convert as inline content.
574
690
 
575
691
  Returns:
576
- The converted markdown text preserving HTML structure.
692
+ The converted markdown text as bold heading.
577
693
  """
578
694
  if convert_as_inline:
579
695
  return text
580
696
 
581
- return f"<summary>{text.strip()}</summary>\n\n" if text.strip() else ""
697
+ return _format_wrapped_block(text, "**")
582
698
 
583
699
 
584
700
  def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
@@ -674,119 +790,39 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
674
790
  return f'"{escaped_text}"'
675
791
 
676
792
 
677
- def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
678
- """Convert HTML audio element preserving structure with fallback.
793
+ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
794
+ """Convert HTML media elements (audio/video) to semantic Markdown.
679
795
 
680
796
  Args:
681
- tag: The audio tag element.
682
- text: The text content of the audio element (fallback content).
797
+ tag: The media tag element.
798
+ text: The text content of the media element (fallback content).
683
799
  convert_as_inline: Whether to convert as inline content.
684
800
 
685
801
  Returns:
686
- The converted markdown text preserving audio element.
802
+ The converted markdown text (link if src exists, otherwise fallback content).
687
803
  """
688
- _ = convert_as_inline
689
804
  src = tag.get("src", "")
690
805
 
691
- if not src:
692
- source_tag = tag.find("source")
693
- if source_tag and isinstance(source_tag, Tag):
694
- src = source_tag.get("src", "")
695
-
696
- controls = "controls" if tag.get("controls") is not None else ""
697
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
698
- loop = "loop" if tag.get("loop") is not None else ""
699
- muted = "muted" if tag.get("muted") is not None else ""
700
- preload = tag.get("preload", "")
806
+ if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
807
+ src = source_tag.get("src", "")
701
808
 
702
- attrs = []
703
809
  if src and isinstance(src, str) and src.strip():
704
- attrs.append(f'src="{src}"')
705
- if controls:
706
- attrs.append(controls)
707
- if autoplay:
708
- attrs.append(autoplay)
709
- if loop:
710
- attrs.append(loop)
711
- if muted:
712
- attrs.append(muted)
713
- if preload and isinstance(preload, str) and preload.strip():
714
- attrs.append(f'preload="{preload}"')
715
-
716
- attrs_str = " ".join(attrs)
810
+ link = f"[{src}]({src})"
811
+ if convert_as_inline:
812
+ return link
813
+ result = f"{link}\n\n"
814
+ if text.strip():
815
+ result += f"{text.strip()}\n\n"
816
+ return result
717
817
 
718
818
  if text.strip():
719
- if attrs_str:
720
- return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
721
- return f"<audio>\n{text.strip()}\n</audio>\n\n"
722
-
723
- if attrs_str:
724
- return f"<audio {attrs_str} />\n\n"
725
- return "<audio />\n\n"
726
-
819
+ return _format_inline_or_block(text, convert_as_inline)
727
820
 
728
- def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
729
- """Convert HTML video element preserving structure with fallback.
730
-
731
- Args:
732
- tag: The video tag element.
733
- text: The text content of the video element (fallback content).
734
- convert_as_inline: Whether to convert as inline content.
735
-
736
- Returns:
737
- The converted markdown text preserving video element.
738
- """
739
- _ = convert_as_inline
740
- src = tag.get("src", "")
741
-
742
- if not src:
743
- source_tag = tag.find("source")
744
- if source_tag and isinstance(source_tag, Tag):
745
- src = source_tag.get("src", "")
746
-
747
- width = tag.get("width", "")
748
- height = tag.get("height", "")
749
- poster = tag.get("poster", "")
750
- controls = "controls" if tag.get("controls") is not None else ""
751
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
752
- loop = "loop" if tag.get("loop") is not None else ""
753
- muted = "muted" if tag.get("muted") is not None else ""
754
- preload = tag.get("preload", "")
755
-
756
- attrs = []
757
- if src and isinstance(src, str) and src.strip():
758
- attrs.append(f'src="{src}"')
759
- if width and isinstance(width, str) and width.strip():
760
- attrs.append(f'width="{width}"')
761
- if height and isinstance(height, str) and height.strip():
762
- attrs.append(f'height="{height}"')
763
- if poster and isinstance(poster, str) and poster.strip():
764
- attrs.append(f'poster="{poster}"')
765
- if controls:
766
- attrs.append(controls)
767
- if autoplay:
768
- attrs.append(autoplay)
769
- if loop:
770
- attrs.append(loop)
771
- if muted:
772
- attrs.append(muted)
773
- if preload and isinstance(preload, str) and preload.strip():
774
- attrs.append(f'preload="{preload}"')
775
-
776
- attrs_str = " ".join(attrs)
777
-
778
- if text.strip():
779
- if attrs_str:
780
- return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
781
- return f"<video>\n{text.strip()}\n</video>\n\n"
782
-
783
- if attrs_str:
784
- return f"<video {attrs_str} />\n\n"
785
- return "<video />\n\n"
821
+ return ""
786
822
 
787
823
 
788
824
  def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
789
- """Convert HTML iframe element preserving structure.
825
+ """Convert HTML iframe element to semantic Markdown.
790
826
 
791
827
  Args:
792
828
  tag: The iframe tag element.
@@ -794,47 +830,18 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
794
830
  convert_as_inline: Whether to convert as inline content.
795
831
 
796
832
  Returns:
797
- The converted markdown text preserving iframe element.
833
+ The converted markdown text (link if src exists).
798
834
  """
799
835
  _ = text
800
- _ = convert_as_inline
801
836
  src = tag.get("src", "")
802
- width = tag.get("width", "")
803
- height = tag.get("height", "")
804
- title = tag.get("title", "")
805
- allow = tag.get("allow", "")
806
- sandbox = tag.get("sandbox")
807
- loading = tag.get("loading", "")
808
-
809
- attrs = []
810
- if src and isinstance(src, str) and src.strip():
811
- attrs.append(f'src="{src}"')
812
- if width and isinstance(width, str) and width.strip():
813
- attrs.append(f'width="{width}"')
814
- if height and isinstance(height, str) and height.strip():
815
- attrs.append(f'height="{height}"')
816
- if title and isinstance(title, str) and title.strip():
817
- attrs.append(f'title="{title}"')
818
- if allow and isinstance(allow, str) and allow.strip():
819
- attrs.append(f'allow="{allow}"')
820
- if sandbox is not None:
821
- if isinstance(sandbox, list):
822
- if sandbox:
823
- attrs.append(f'sandbox="{" ".join(sandbox)}"')
824
- else:
825
- attrs.append("sandbox")
826
- elif isinstance(sandbox, str) and sandbox:
827
- attrs.append(f'sandbox="{sandbox}"')
828
- else:
829
- attrs.append("sandbox")
830
- if loading and isinstance(loading, str) and loading.strip():
831
- attrs.append(f'loading="{loading}"')
832
837
 
833
- attrs_str = " ".join(attrs)
838
+ if src and isinstance(src, str) and src.strip():
839
+ link = f"[{src}]({src})"
840
+ if convert_as_inline:
841
+ return link
842
+ return f"{link}\n\n"
834
843
 
835
- if attrs_str:
836
- return f"<iframe {attrs_str}></iframe>\n\n"
837
- return "<iframe></iframe>\n\n"
844
+ return ""
838
845
 
839
846
 
840
847
  def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
@@ -860,7 +867,7 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
860
867
 
861
868
 
862
869
  def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
863
- """Convert HTML time element preserving datetime attribute.
870
+ """Convert HTML time element to semantic Markdown.
864
871
 
865
872
  Args:
866
873
  tag: The time tag element.
@@ -868,21 +875,18 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
868
875
  convert_as_inline: Whether to convert as inline content.
869
876
 
870
877
  Returns:
871
- The converted markdown text preserving time information.
878
+ The converted markdown text (content only, no HTML tags).
872
879
  """
880
+ _ = tag
873
881
  _ = convert_as_inline
874
882
  if not text.strip():
875
883
  return ""
876
884
 
877
- datetime_attr = tag.get("datetime")
878
- if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
879
- return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
880
-
881
885
  return text.strip()
882
886
 
883
887
 
884
888
  def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
885
- """Convert HTML data element preserving value attribute.
889
+ """Convert HTML data element to semantic Markdown.
886
890
 
887
891
  Args:
888
892
  tag: The data tag element.
@@ -890,16 +894,13 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
890
894
  convert_as_inline: Whether to convert as inline content.
891
895
 
892
896
  Returns:
893
- The converted markdown text preserving machine-readable data.
897
+ The converted markdown text (content only, no HTML tags).
894
898
  """
899
+ _ = tag
895
900
  _ = convert_as_inline
896
901
  if not text.strip():
897
902
  return ""
898
903
 
899
- value_attr = tag.get("value")
900
- if value_attr and isinstance(value_attr, str) and value_attr.strip():
901
- return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
902
-
903
904
  return text.strip()
904
905
 
905
906
 
@@ -917,7 +918,7 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
917
918
 
918
919
 
919
920
  def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
920
- """Convert HTML form element preserving structure for documentation.
921
+ """Convert HTML form element to semantic Markdown.
921
922
 
922
923
  Args:
923
924
  tag: The form tag element.
@@ -925,38 +926,27 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
925
926
  convert_as_inline: Whether to convert as inline content.
926
927
 
927
928
  Returns:
928
- The converted markdown text preserving form structure.
929
+ The converted markdown text (only content, no HTML tags).
929
930
  """
931
+ _ = tag
930
932
  if convert_as_inline:
931
933
  return text
932
934
 
933
935
  if not text.strip():
934
936
  return ""
935
937
 
936
- action = tag.get("action", "")
937
- method = tag.get("method", "")
938
- attrs = []
939
-
940
- if action and isinstance(action, str) and action.strip():
941
- attrs.append(f'action="{action.strip()}"')
942
- if method and isinstance(method, str) and method.strip():
943
- attrs.append(f'method="{method.strip()}"')
944
-
945
- attrs_str = " ".join(attrs)
946
- if attrs_str:
947
- return f"<form {attrs_str}>\n{text.strip()}\n</form>\n\n"
948
- return f"<form>\n{text.strip()}\n</form>\n\n"
938
+ return text
949
939
 
950
940
 
951
941
  def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
952
- """Convert HTML fieldset element preserving structure.
942
+ """Convert HTML fieldset element to semantic Markdown.
953
943
 
954
944
  Args:
955
945
  text: The text content of the fieldset element.
956
946
  convert_as_inline: Whether to convert as inline content.
957
947
 
958
948
  Returns:
959
- The converted markdown text preserving fieldset structure.
949
+ The converted markdown text (only content, no HTML tags).
960
950
  """
961
951
  if convert_as_inline:
962
952
  return text
@@ -964,7 +954,7 @@ def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
964
954
  if not text.strip():
965
955
  return ""
966
956
 
967
- return f"<fieldset>\n{text.strip()}\n</fieldset>\n\n"
957
+ return text
968
958
 
969
959
 
970
960
  def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
@@ -983,11 +973,11 @@ def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
983
973
  if not text.strip():
984
974
  return ""
985
975
 
986
- return f"<legend>{text.strip()}</legend>\n\n"
976
+ return _format_wrapped_block(text, "**")
987
977
 
988
978
 
989
979
  def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
990
- """Convert HTML label element preserving for attribute.
980
+ """Convert HTML label element to Markdown.
991
981
 
992
982
  Args:
993
983
  tag: The label tag element.
@@ -995,78 +985,31 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
995
985
  convert_as_inline: Whether to convert as inline content.
996
986
 
997
987
  Returns:
998
- The converted markdown text preserving label structure.
988
+ The label text content.
999
989
  """
1000
- if convert_as_inline:
1001
- return text
1002
-
990
+ _ = tag
1003
991
  if not text.strip():
1004
992
  return ""
1005
993
 
1006
- for_attr = tag.get("for")
1007
- if for_attr and isinstance(for_attr, str) and for_attr.strip():
1008
- return f'<label for="{for_attr.strip()}">{text.strip()}</label>\n\n'
1009
-
1010
- return f"<label>{text.strip()}</label>\n\n"
994
+ return _format_inline_or_block(text, convert_as_inline)
1011
995
 
1012
996
 
1013
997
  def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
1014
- """Convert HTML input element preserving all relevant attributes.
998
+ """Convert HTML input element to Markdown.
1015
999
 
1016
1000
  Args:
1017
1001
  tag: The input tag element.
1018
1002
  convert_as_inline: Whether to convert as inline content.
1019
1003
 
1020
1004
  Returns:
1021
- The converted markdown text preserving input structure.
1005
+ Empty string since input elements have no Markdown representation.
1022
1006
  """
1023
- input_type = tag.get("type", "text")
1024
-
1025
- from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
1026
-
1027
- if _has_ancestor(tag, "li"):
1028
- return ""
1029
-
1030
- id_attr = tag.get("id", "")
1031
- name = tag.get("name", "")
1032
- value = tag.get("value", "")
1033
- placeholder = tag.get("placeholder", "")
1034
- required = tag.get("required") is not None
1035
- disabled = tag.get("disabled") is not None
1036
- readonly = tag.get("readonly") is not None
1037
- checked = tag.get("checked") is not None
1038
- accept = tag.get("accept", "")
1039
-
1040
- attrs = []
1041
- if input_type and isinstance(input_type, str):
1042
- attrs.append(f'type="{input_type}"')
1043
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1044
- attrs.append(f'id="{id_attr}"')
1045
- if name and isinstance(name, str) and name.strip():
1046
- attrs.append(f'name="{name}"')
1047
- if value and isinstance(value, str) and value.strip():
1048
- attrs.append(f'value="{value}"')
1049
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1050
- attrs.append(f'placeholder="{placeholder}"')
1051
- if accept and isinstance(accept, str) and accept.strip():
1052
- attrs.append(f'accept="{accept}"')
1053
- if required:
1054
- attrs.append("required")
1055
- if disabled:
1056
- attrs.append("disabled")
1057
- if readonly:
1058
- attrs.append("readonly")
1059
- if checked:
1060
- attrs.append("checked")
1061
-
1062
- attrs_str = " ".join(attrs)
1063
- result = f"<input {attrs_str} />" if attrs_str else "<input />"
1064
-
1065
- return result if convert_as_inline else f"{result}\n\n"
1007
+ _ = tag, convert_as_inline
1008
+ return ""
1066
1009
 
1067
1010
 
1068
1011
  def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1069
- """Convert HTML textarea element preserving attributes.
1012
+ """Convert HTML textarea element to Markdown.
1070
1013
 
1071
1014
  Args:
1072
1015
  tag: The textarea tag element.
@@ -1074,42 +1017,17 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1074
1017
  convert_as_inline: Whether to convert as inline content.
1075
1018
 
1076
1019
  Returns:
1077
- The converted markdown text preserving textarea structure.
1020
+ The text content of the textarea.
1078
1021
  """
1079
- if convert_as_inline:
1080
- return text
1081
-
1022
+ _ = tag
1082
1023
  if not text.strip():
1083
1024
  return ""
1084
1025
 
1085
- name = tag.get("name", "")
1086
- placeholder = tag.get("placeholder", "")
1087
- rows = tag.get("rows", "")
1088
- cols = tag.get("cols", "")
1089
- required = tag.get("required") is not None
1090
-
1091
- attrs = []
1092
- if name and isinstance(name, str) and name.strip():
1093
- attrs.append(f'name="{name}"')
1094
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1095
- attrs.append(f'placeholder="{placeholder}"')
1096
- if rows and isinstance(rows, str) and rows.strip():
1097
- attrs.append(f'rows="{rows}"')
1098
- if cols and isinstance(cols, str) and cols.strip():
1099
- attrs.append(f'cols="{cols}"')
1100
- if required:
1101
- attrs.append("required")
1102
-
1103
- attrs_str = " ".join(attrs)
1104
- content = text.strip()
1105
-
1106
- if attrs_str:
1107
- return f"<textarea {attrs_str}>{content}</textarea>\n\n"
1108
- return f"<textarea>{content}</textarea>\n\n"
1026
+ return _format_inline_or_block(text, convert_as_inline)
1109
1027
 
1110
1028
 
1111
1029
  def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1112
- """Convert HTML select element preserving structure.
1030
+ """Convert HTML select element to Markdown.
1113
1031
 
1114
1032
  Args:
1115
1033
  tag: The select tag element.
@@ -1117,39 +1035,21 @@ def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1117
1035
  convert_as_inline: Whether to convert as inline content.
1118
1036
 
1119
1037
  Returns:
1120
- The converted markdown text preserving select structure.
1038
+ The options as a comma-separated list when converting inline, otherwise the stripped text followed by a blank line.
1121
1039
  """
1122
- if convert_as_inline:
1123
- return text
1124
-
1040
+ _ = tag
1125
1041
  if not text.strip():
1126
1042
  return ""
1127
1043
 
1128
- id_attr = tag.get("id", "")
1129
- name = tag.get("name", "")
1130
- multiple = tag.get("multiple") is not None
1131
- required = tag.get("required") is not None
1132
-
1133
- attrs = []
1134
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1135
- attrs.append(f'id="{id_attr}"')
1136
- if name and isinstance(name, str) and name.strip():
1137
- attrs.append(f'name="{name}"')
1138
- if multiple:
1139
- attrs.append("multiple")
1140
- if required:
1141
- attrs.append("required")
1142
-
1143
- attrs_str = " ".join(attrs)
1144
- content = text.strip()
1044
+ if convert_as_inline:
1045
+ options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
1046
+ return ", ".join(options)
1145
1047
 
1146
- if attrs_str:
1147
- return f"<select {attrs_str}>\n{content}\n</select>\n\n"
1148
- return f"<select>\n{content}\n</select>\n\n"
1048
+ return _format_block_element(text)
1149
1049
 
1150
1050
 
1151
1051
  def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1152
- """Convert HTML option element preserving value and selected state.
1052
+ """Convert HTML option element to Markdown.
1153
1053
 
1154
1054
  Args:
1155
1055
  tag: The option tag element.
@@ -1157,33 +1057,24 @@ def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1157
1057
  convert_as_inline: Whether to convert as inline content.
1158
1058
 
1159
1059
  Returns:
1160
- The converted markdown text preserving option structure.
1060
+ The stripped option text; outside inline mode it ends with a newline and is prefixed with "* " when the option is selected.
1161
1061
  """
1162
- if convert_as_inline:
1163
- return text
1164
-
1165
1062
  if not text.strip():
1166
1063
  return ""
1167
1064
 
1168
- value = tag.get("value", "")
1169
1065
  selected = tag.get("selected") is not None
1170
-
1171
- attrs = []
1172
- if value and isinstance(value, str) and value.strip():
1173
- attrs.append(f'value="{value}"')
1174
- if selected:
1175
- attrs.append("selected")
1176
-
1177
- attrs_str = " ".join(attrs)
1178
1066
  content = text.strip()
1179
1067
 
1180
- if attrs_str:
1181
- return f"<option {attrs_str}>{content}</option>\n"
1182
- return f"<option>{content}</option>\n"
1068
+ if convert_as_inline:
1069
+ return content
1070
+
1071
+ if selected:
1072
+ return f"* {content}\n"
1073
+ return f"{content}\n"
1183
1074
 
1184
1075
 
1185
1076
  def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1186
- """Convert HTML optgroup element preserving label.
1077
+ """Convert HTML optgroup element to semantic Markdown.
1187
1078
 
1188
1079
  Args:
1189
1080
  tag: The optgroup tag element.
@@ -1191,7 +1082,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1191
1082
  convert_as_inline: Whether to convert as inline content.
1192
1083
 
1193
1084
  Returns:
1194
- The converted markdown text preserving optgroup structure.
1085
+ The converted markdown text, preceded by the label in bold on its own line when a label is present.
1195
1086
  """
1196
1087
  if convert_as_inline:
1197
1088
  return text
@@ -1200,21 +1091,16 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1200
1091
  return ""
1201
1092
 
1202
1093
  label = tag.get("label", "")
1094
+ content = text.strip()
1203
1095
 
1204
- attrs = []
1205
1096
  if label and isinstance(label, str) and label.strip():
1206
- attrs.append(f'label="{label}"')
1097
+ return f"**{label.strip()}**\n{content}\n"
1207
1098
 
1208
- attrs_str = " ".join(attrs)
1209
- content = text.strip()
1210
-
1211
- if attrs_str:
1212
- return f"<optgroup {attrs_str}>\n{content}\n</optgroup>\n"
1213
- return f"<optgroup>\n{content}\n</optgroup>\n"
1099
+ return f"{content}\n"
1214
1100
 
1215
1101
 
1216
1102
  def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1217
- """Convert HTML button element preserving type and attributes.
1103
+ """Convert HTML button element to Markdown.
1218
1104
 
1219
1105
  Args:
1220
1106
  tag: The button tag element.
@@ -1222,38 +1108,17 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1222
1108
  convert_as_inline: Whether to convert as inline content.
1223
1109
 
1224
1110
  Returns:
1225
- The converted markdown text preserving button structure.
1111
+ The button text content.
1226
1112
  """
1227
- if convert_as_inline:
1228
- return text
1229
-
1113
+ _ = tag
1230
1114
  if not text.strip():
1231
1115
  return ""
1232
1116
 
1233
- button_type = tag.get("type", "")
1234
- name = tag.get("name", "")
1235
- value = tag.get("value", "")
1236
- disabled = tag.get("disabled") is not None
1237
-
1238
- attrs = []
1239
- if button_type and isinstance(button_type, str) and button_type.strip():
1240
- attrs.append(f'type="{button_type}"')
1241
- if name and isinstance(name, str) and name.strip():
1242
- attrs.append(f'name="{name}"')
1243
- if value and isinstance(value, str) and value.strip():
1244
- attrs.append(f'value="{value}"')
1245
- if disabled:
1246
- attrs.append("disabled")
1247
-
1248
- attrs_str = " ".join(attrs)
1249
-
1250
- if attrs_str:
1251
- return f"<button {attrs_str}>{text.strip()}</button>\n\n"
1252
- return f"<button>{text.strip()}</button>\n\n"
1117
+ return _format_inline_or_block(text, convert_as_inline)
1253
1118
 
1254
1119
 
1255
1120
  def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1256
- """Convert HTML progress element preserving value and max.
1121
+ """Convert HTML progress element to semantic text.
1257
1122
 
1258
1123
  Args:
1259
1124
  tag: The progress tag element.
@@ -1261,33 +1126,20 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1261
1126
  convert_as_inline: Whether to convert as inline content.
1262
1127
 
1263
1128
  Returns:
1264
- The converted markdown text preserving progress structure.
1129
+ The converted markdown text (only content, no HTML tags).
1265
1130
  """
1131
+ _ = tag
1266
1132
  if convert_as_inline:
1267
1133
  return text
1268
1134
 
1269
1135
  if not text.strip():
1270
1136
  return ""
1271
1137
 
1272
- value = tag.get("value", "")
1273
- max_val = tag.get("max", "")
1274
-
1275
- attrs = []
1276
- if value and isinstance(value, str) and value.strip():
1277
- attrs.append(f'value="{value}"')
1278
- if max_val and isinstance(max_val, str) and max_val.strip():
1279
- attrs.append(f'max="{max_val}"')
1280
-
1281
- attrs_str = " ".join(attrs)
1282
- content = text.strip()
1283
-
1284
- if attrs_str:
1285
- return f"<progress {attrs_str}>{content}</progress>\n\n"
1286
- return f"<progress>{content}</progress>\n\n"
1138
+ return _format_block_element(text)
1287
1139
 
1288
1140
 
1289
1141
  def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1290
- """Convert HTML meter element preserving value and range attributes.
1142
+ """Convert HTML meter element to semantic text.
1291
1143
 
1292
1144
  Args:
1293
1145
  tag: The meter tag element.
@@ -1295,45 +1147,20 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1295
1147
  convert_as_inline: Whether to convert as inline content.
1296
1148
 
1297
1149
  Returns:
1298
- The converted markdown text preserving meter structure.
1150
+ The converted markdown text (only content, no HTML tags).
1299
1151
  """
1152
+ _ = tag
1300
1153
  if convert_as_inline:
1301
1154
  return text
1302
1155
 
1303
1156
  if not text.strip():
1304
1157
  return ""
1305
1158
 
1306
- value = tag.get("value", "")
1307
- min_val = tag.get("min", "")
1308
- max_val = tag.get("max", "")
1309
- low = tag.get("low", "")
1310
- high = tag.get("high", "")
1311
- optimum = tag.get("optimum", "")
1312
-
1313
- attrs = []
1314
- if value and isinstance(value, str) and value.strip():
1315
- attrs.append(f'value="{value}"')
1316
- if min_val and isinstance(min_val, str) and min_val.strip():
1317
- attrs.append(f'min="{min_val}"')
1318
- if max_val and isinstance(max_val, str) and max_val.strip():
1319
- attrs.append(f'max="{max_val}"')
1320
- if low and isinstance(low, str) and low.strip():
1321
- attrs.append(f'low="{low}"')
1322
- if high and isinstance(high, str) and high.strip():
1323
- attrs.append(f'high="{high}"')
1324
- if optimum and isinstance(optimum, str) and optimum.strip():
1325
- attrs.append(f'optimum="{optimum}"')
1326
-
1327
- attrs_str = " ".join(attrs)
1328
- content = text.strip()
1329
-
1330
- if attrs_str:
1331
- return f"<meter {attrs_str}>{content}</meter>\n\n"
1332
- return f"<meter>{content}</meter>\n\n"
1159
+ return _format_block_element(text)
1333
1160
 
1334
1161
 
1335
1162
  def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1336
- """Convert HTML output element preserving for and name attributes.
1163
+ """Convert HTML output element to semantic text.
1337
1164
 
1338
1165
  Args:
1339
1166
  tag: The output tag element.
@@ -1341,34 +1168,20 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1341
1168
  convert_as_inline: Whether to convert as inline content.
1342
1169
 
1343
1170
  Returns:
1344
- The converted markdown text preserving output structure.
1171
+ The converted markdown text (only content, no HTML tags).
1345
1172
  """
1173
+ _ = tag
1346
1174
  if convert_as_inline:
1347
1175
  return text
1348
1176
 
1349
1177
  if not text.strip():
1350
1178
  return ""
1351
1179
 
1352
- for_attr = tag.get("for", "")
1353
- name = tag.get("name", "")
1354
-
1355
- attrs = []
1356
- if for_attr:
1357
- for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
1358
- if for_value.strip():
1359
- attrs.append(f'for="{for_value}"')
1360
- if name and isinstance(name, str) and name.strip():
1361
- attrs.append(f'name="{name}"')
1362
-
1363
- attrs_str = " ".join(attrs)
1364
-
1365
- if attrs_str:
1366
- return f"<output {attrs_str}>{text.strip()}</output>\n\n"
1367
- return f"<output>{text.strip()}</output>\n\n"
1180
+ return _format_block_element(text)
1368
1181
 
1369
1182
 
1370
1183
  def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1371
- """Convert HTML datalist element preserving structure.
1184
+ """Convert HTML datalist element to semantic Markdown.
1372
1185
 
1373
1186
  Args:
1374
1187
  tag: The datalist tag element.
@@ -1376,26 +1189,16 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1376
1189
  convert_as_inline: Whether to convert as inline content.
1377
1190
 
1378
1191
  Returns:
1379
- The converted markdown text preserving datalist structure.
1192
+ The converted markdown text (only content, no HTML tags).
1380
1193
  """
1194
+ _ = tag
1381
1195
  if convert_as_inline:
1382
1196
  return text
1383
1197
 
1384
1198
  if not text.strip():
1385
1199
  return ""
1386
1200
 
1387
- id_attr = tag.get("id", "")
1388
-
1389
- attrs = []
1390
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1391
- attrs.append(f'id="{id_attr}"')
1392
-
1393
- attrs_str = " ".join(attrs)
1394
- content = text.strip()
1395
-
1396
- if attrs_str:
1397
- return f"<datalist {attrs_str}>\n{content}\n</datalist>\n\n"
1398
- return f"<datalist>\n{content}\n</datalist>\n\n"
1201
+ return _format_block_element(text)
1399
1202
 
1400
1203
 
1401
1204
  def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
@@ -1488,7 +1291,7 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1488
1291
 
1489
1292
 
1490
1293
  def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1491
- """Convert HTML dialog element preserving structure with attributes.
1294
+ """Convert HTML dialog element to semantic Markdown.
1492
1295
 
1493
1296
  Args:
1494
1297
  text: The text content of the dialog element.
@@ -1496,27 +1299,20 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1496
1299
  tag: The dialog tag element.
1497
1300
 
1498
1301
  Returns:
1499
- The converted markdown text preserving dialog structure.
1302
+ The converted markdown text (only content, no HTML tags).
1500
1303
  """
1304
+ _ = tag
1501
1305
  if convert_as_inline:
1502
1306
  return text
1503
1307
 
1504
1308
  if not text.strip():
1505
1309
  return ""
1506
1310
 
1507
- attrs = []
1508
- if tag.get("open") is not None:
1509
- attrs.append("open")
1510
- if tag.get("id"):
1511
- attrs.append(f'id="{tag.get("id")}"')
1512
-
1513
- attrs_str = " " + " ".join(attrs) if attrs else ""
1514
-
1515
- return f"<dialog{attrs_str}>\n{text.strip()}\n</dialog>\n\n"
1311
+ return _format_block_element(text)
1516
1312
 
1517
1313
 
1518
1314
  def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1519
- """Convert HTML menu element preserving structure with attributes.
1315
+ """Convert HTML menu element to semantic Markdown.
1520
1316
 
1521
1317
  Args:
1522
1318
  text: The text content of the menu element.
@@ -1524,29 +1320,20 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1524
1320
  tag: The menu tag element.
1525
1321
 
1526
1322
  Returns:
1527
- The converted markdown text preserving menu structure.
1323
+ The converted markdown text (only content, no HTML tags).
1528
1324
  """
1325
+ _ = tag
1529
1326
  if convert_as_inline:
1530
1327
  return text
1531
1328
 
1532
1329
  if not text.strip():
1533
1330
  return ""
1534
1331
 
1535
- attrs = []
1536
- if tag.get("type") and tag.get("type") != "list":
1537
- attrs.append(f'type="{tag.get("type")}"')
1538
- if tag.get("label"):
1539
- attrs.append(f'label="{tag.get("label")}"')
1540
- if tag.get("id"):
1541
- attrs.append(f'id="{tag.get("id")}"')
1542
-
1543
- attrs_str = " " + " ".join(attrs) if attrs else ""
1544
-
1545
- return f"<menu{attrs_str}>\n{text.strip()}\n</menu>\n\n"
1332
+ return _format_block_element(text)
1546
1333
 
1547
1334
 
1548
1335
  def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1549
- """Convert HTML figure element preserving semantic structure.
1336
+ """Convert HTML figure element to semantic Markdown.
1550
1337
 
1551
1338
  Args:
1552
1339
  text: The text content of the figure element.
@@ -1554,42 +1341,33 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1554
1341
  tag: The figure tag element.
1555
1342
 
1556
1343
  Returns:
1557
- The converted markdown text preserving figure structure.
1344
+ The converted markdown text (only content, no HTML tags).
1558
1345
  """
1346
+ _ = tag
1559
1347
  if not text.strip():
1560
1348
  return ""
1561
1349
 
1562
1350
  if convert_as_inline:
1563
1351
  return text
1564
1352
 
1565
- attrs = []
1566
- if tag.get("id"):
1567
- attrs.append(f'id="{tag.get("id")}"')
1568
- if tag.get("class"):
1569
- class_val = tag.get("class")
1570
- if isinstance(class_val, list):
1571
- class_val = " ".join(class_val)
1572
- attrs.append(f'class="{class_val}"')
1573
-
1574
- attrs_str = " " + " ".join(attrs) if attrs else ""
1575
-
1576
1353
  content = text.strip()
1577
-
1578
- if content.endswith("\n\n"):
1579
- return f"<figure{attrs_str}>\n{content}</figure>\n\n"
1580
-
1581
- return f"<figure{attrs_str}>\n{content}\n</figure>\n\n"
1354
+ if content and not content.endswith("\n\n"):
1355
+ if content.endswith("\n"):
1356
+ content += "\n"
1357
+ else:
1358
+ content += "\n\n"
1359
+ return content
1582
1360
 
1583
1361
 
1584
1362
  def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1585
- """Convert HTML hgroup element preserving heading group semantics.
1363
+ """Convert HTML hgroup element to semantic Markdown.
1586
1364
 
1587
1365
  Args:
1588
1366
  text: The text content of the hgroup element.
1589
1367
  convert_as_inline: Whether to convert as inline content.
1590
1368
 
1591
1369
  Returns:
1592
- The converted markdown text preserving heading group structure.
1370
+ The converted markdown text (only content, no HTML tags).
1593
1371
  """
1594
1372
  if convert_as_inline:
1595
1373
  return text
@@ -1597,15 +1375,11 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1597
1375
  if not text.strip():
1598
1376
  return ""
1599
1377
 
1600
- content = text.strip()
1601
-
1602
- content = re.sub(r"\n{3,}", "\n\n", content)
1603
-
1604
- return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
1378
+ return text
1605
1379
 
1606
1380
 
1607
1381
  def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1608
- """Convert HTML picture element with responsive image sources.
1382
+ """Convert HTML picture element to semantic Markdown.
1609
1383
 
1610
1384
  Args:
1611
1385
  text: The text content of the picture element.
@@ -1613,44 +1387,13 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1613
1387
  tag: The picture tag element.
1614
1388
 
1615
1389
  Returns:
1616
- The converted markdown text with picture information preserved.
1390
+ The converted markdown text (only the img element).
1617
1391
  """
1392
+ _ = tag, convert_as_inline
1618
1393
  if not text.strip():
1619
1394
  return ""
1620
1395
 
1621
- sources = tag.find_all("source")
1622
- img = tag.find("img")
1623
-
1624
- if not img:
1625
- return text.strip()
1626
-
1627
- img_markdown = text.strip()
1628
-
1629
- if not sources:
1630
- return img_markdown
1631
-
1632
- source_info = []
1633
- for source in sources:
1634
- srcset = source.get("srcset")
1635
- media = source.get("media")
1636
- mime_type = source.get("type")
1637
-
1638
- if srcset:
1639
- info = f'srcset="{srcset}"'
1640
- if media:
1641
- info += f' media="{media}"'
1642
- if mime_type:
1643
- info += f' type="{mime_type}"'
1644
- source_info.append(info)
1645
-
1646
- if source_info and not convert_as_inline:
1647
- sources_comment = "<!-- picture sources:\n"
1648
- for info in source_info:
1649
- sources_comment += f" {info}\n"
1650
- sources_comment += "-->\n"
1651
- return f"{sources_comment}{img_markdown}"
1652
-
1653
- return img_markdown
1396
+ return text.strip()
1654
1397
 
1655
1398
 
1656
1399
  def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
@@ -1765,7 +1508,7 @@ def create_converters_map(
1765
1508
  "abbr": _wrapper(_convert_abbr),
1766
1509
  "article": _wrapper(_convert_semantic_block),
1767
1510
  "aside": _wrapper(_convert_semantic_block),
1768
- "audio": _wrapper(_convert_audio),
1511
+ "audio": _wrapper(_convert_media_element),
1769
1512
  "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
1770
1513
  "bdi": _wrapper(_create_inline_converter("")),
1771
1514
  "bdo": _wrapper(_create_inline_converter("")),
@@ -1788,7 +1531,7 @@ def create_converters_map(
1788
1531
  "dt": _wrapper(_convert_dt),
1789
1532
  "em": _wrapper(_create_inline_converter(strong_em_symbol)),
1790
1533
  "fieldset": _wrapper(_convert_fieldset),
1791
- "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
1534
+ "figcaption": _wrapper(lambda text: f"\n\n*{text.strip()}*\n\n" if text.strip() else ""),
1792
1535
  "figure": _wrapper(_convert_figure),
1793
1536
  "footer": _wrapper(_convert_semantic_block),
1794
1537
  "form": _wrapper(_convert_form),
@@ -1861,6 +1604,6 @@ def create_converters_map(
1861
1604
  "u": _wrapper(_create_inline_converter("")),
1862
1605
  "ul": _wrapper(_convert_list),
1863
1606
  "var": _wrapper(_create_inline_converter("*")),
1864
- "video": _wrapper(_convert_video),
1607
+ "video": _wrapper(_convert_media_element),
1865
1608
  "wbr": _wrapper(_convert_wbr),
1866
1609
  }