html-to-markdown 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -5,11 +5,11 @@ from typing import TYPE_CHECKING
5
5
  if TYPE_CHECKING:
6
6
  from collections.abc import Iterable
7
7
  import base64
8
- import re
8
+ from collections.abc import Callable
9
9
  from functools import partial
10
10
  from inspect import getfullargspec
11
11
  from textwrap import fill
12
- from typing import Any, Callable, Literal, TypeVar, cast
12
+ from typing import Any, Literal, TypeVar, cast
13
13
 
14
14
  from bs4.element import Tag
15
15
 
@@ -21,6 +21,24 @@ from html_to_markdown.constants import (
21
21
  )
22
22
  from html_to_markdown.utils import chomp, indent, underline
23
23
 
24
+
25
+ def _format_block_element(text: str) -> str:
26
+ """Format text as a block element with trailing newlines."""
27
+ return f"{text.strip()}\n\n" if text.strip() else ""
28
+
29
+
30
+ def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
31
+ """Format text as inline or block element based on context."""
32
+ return text.strip() if convert_as_inline else _format_block_element(text)
33
+
34
+
35
+ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
36
+ """Format text wrapped in markers as a block element."""
37
+ if not end_marker:
38
+ end_marker = start_marker
39
+ return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
40
+
41
+
24
42
  SupportedElements = Literal[
25
43
  "a",
26
44
  "abbr",
@@ -189,11 +207,24 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
189
207
  if not text:
190
208
  return ""
191
209
 
210
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
211
+
192
212
  cite_url = tag.get("cite")
193
- quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
213
+
214
+ # Check if this blockquote is inside a list item
215
+ if _has_ancestor(tag, "li"):
216
+ # Indent the blockquote by 4 spaces
217
+ lines = text.strip().split("\n")
218
+ indented_lines = [f" > {line}" if line.strip() else "" for line in lines]
219
+ quote_text = "\n".join(indented_lines) + "\n\n"
220
+ else:
221
+ quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
194
222
 
195
223
  if cite_url:
196
- quote_text += f"— <{cite_url}>\n\n"
224
+ if _has_ancestor(tag, "li"):
225
+ quote_text += f" — <{cite_url}>\n\n"
226
+ else:
227
+ quote_text += f"— <{cite_url}>\n\n"
197
228
 
198
229
  return quote_text
199
230
 
@@ -243,8 +274,8 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
243
274
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
244
275
  parent_name = tag.parent.name if tag.parent else ""
245
276
 
246
- default_preserve_in = ["td", "th"]
247
- preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
277
+ default_preserve_in = {"td", "th"}
278
+ preserve_in = set(keep_inline_images_in or []) | default_preserve_in
248
279
  if convert_as_inline and parent_name not in preserve_in:
249
280
  return alt
250
281
  if width or height:
@@ -253,24 +284,42 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
253
284
 
254
285
 
255
286
  def _convert_list(*, tag: Tag, text: str) -> str:
256
- nested = False
287
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
257
288
 
258
289
  before_paragraph = False
259
290
  if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
260
291
  before_paragraph = True
261
292
 
262
- while tag:
263
- if tag.name == "li":
264
- nested = True
265
- break
266
-
267
- if not tag.parent:
268
- break
269
-
270
- tag = tag.parent
271
-
272
- if nested:
273
- return "\n" + indent(text=text, level=1).rstrip()
293
+ # Check if this list is inside a list item
294
+ if _has_ancestor(tag, "li"):
295
+ # This is a nested list - needs indentation
296
+ # But we need to check if it's the first element after a paragraph
297
+ parent = tag.parent
298
+ while parent and parent.name != "li":
299
+ parent = parent.parent
300
+
301
+ if parent:
302
+ # Check if there's a paragraph before this list
303
+ prev_p = None
304
+ for child in parent.children:
305
+ if hasattr(child, "name"):
306
+ if child == tag:
307
+ break
308
+ if child.name == "p":
309
+ prev_p = child
310
+
311
+ if prev_p:
312
+ # If there's a paragraph before, we need proper indentation
313
+ lines = text.strip().split("\n")
314
+ indented_lines = []
315
+ for line in lines:
316
+ if line.strip():
317
+ indented_lines.append(f" {line}")
318
+ else:
319
+ indented_lines.append("")
320
+ return "\n" + "\n".join(indented_lines) + "\n"
321
+ # Otherwise use the original tab indentation
322
+ return "\n" + indent(text=text, level=1).rstrip()
274
323
 
275
324
  return text + ("\n" if before_paragraph else "")
276
325
 
@@ -305,10 +354,38 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
305
354
  tag = tag.parent
306
355
 
307
356
  bullet = bullets[depth % len(bullets)]
357
+
358
+ # Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
359
+ has_block_children = any(
360
+ child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
361
+ for child in tag.children
362
+ if hasattr(child, "name")
363
+ )
364
+
365
+ if has_block_children:
366
+ # Handle multi-paragraph list items
367
+ # Split by double newlines (paragraph separators)
368
+ paragraphs = text.strip().split("\n\n")
369
+
370
+ if paragraphs:
371
+ # First paragraph goes directly after the bullet
372
+ result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
373
+
374
+ # Subsequent paragraphs need to be indented and separated by blank lines
375
+ for para in paragraphs[1:]:
376
+ if para.strip():
377
+ # Add blank line before the paragraph
378
+ result_parts.append("\n")
379
+ # Indent each line of the paragraph by 4 spaces
380
+ result_parts.extend(f" {line}\n" for line in para.strip().split("\n") if line.strip())
381
+
382
+ return "".join(result_parts)
383
+
384
+ # Simple case: no block elements, just inline content
308
385
  return "{} {}\n".format(bullet, (text or "").strip())
309
386
 
310
387
 
311
- def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
388
+ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
312
389
  if convert_as_inline:
313
390
  return text
314
391
 
@@ -320,6 +397,30 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
320
397
  break_on_hyphens=False,
321
398
  )
322
399
 
400
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
401
+
402
+ # Check if this paragraph is inside a list item
403
+ if _has_ancestor(tag, "li"):
404
+ # Check if this is the first paragraph in the list item
405
+ parent = tag.parent
406
+ while parent and parent.name != "li":
407
+ parent = parent.parent
408
+
409
+ if parent:
410
+ # Get all direct children that are paragraphs
411
+ p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
412
+
413
+ # If this is not the first paragraph, indent it
414
+ if p_children and tag != p_children[0]:
415
+ # Indent all lines by 4 spaces
416
+ indented_lines = []
417
+ for line in text.split("\n"):
418
+ if line.strip():
419
+ indented_lines.append(f" {line}")
420
+ else:
421
+ indented_lines.append("")
422
+ text = "\n".join(indented_lines)
423
+
323
424
  return f"{text}\n\n" if text else ""
324
425
 
325
426
 
@@ -337,13 +438,15 @@ def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -
337
438
  if convert_as_inline:
338
439
  return text
339
440
 
340
- if highlight_style == "double-equal":
341
- return f"=={text}=="
342
- if highlight_style == "bold":
343
- return f"**{text}**"
344
- if highlight_style == "html":
345
- return f"<mark>{text}</mark>"
346
- return text
441
+ match highlight_style:
442
+ case "double-equal":
443
+ return f"=={text}=="
444
+ case "bold":
445
+ return f"**{text}**"
446
+ case "html":
447
+ return f"<mark>{text}</mark>"
448
+ case _:
449
+ return text
347
450
 
348
451
 
349
452
  def _convert_pre(
@@ -376,6 +479,58 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
376
479
  cells = tag.find_all(["td", "th"])
377
480
  parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
378
481
  tag_grand_parent = tag.parent.parent if tag.parent else None
482
+
483
+ # Simple rowspan handling: if previous row had cells with rowspan, add empty cells
484
+ if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
485
+ prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
486
+ rowspan_positions = []
487
+ col_pos = 0
488
+
489
+ # Check which cells in previous row have rowspan > 1
490
+ for prev_cell in prev_cells:
491
+ rowspan = 1
492
+ if (
493
+ "rowspan" in prev_cell.attrs
494
+ and isinstance(prev_cell["rowspan"], str)
495
+ and prev_cell["rowspan"].isdigit()
496
+ ):
497
+ rowspan = int(prev_cell["rowspan"])
498
+
499
+ if rowspan > 1:
500
+ # This cell spans into current row
501
+ rowspan_positions.append(col_pos)
502
+
503
+ # Account for colspan
504
+ colspan = 1
505
+ if (
506
+ "colspan" in prev_cell.attrs
507
+ and isinstance(prev_cell["colspan"], str)
508
+ and prev_cell["colspan"].isdigit()
509
+ ):
510
+ colspan = int(prev_cell["colspan"])
511
+ col_pos += colspan
512
+
513
+ # If there are rowspan cells from previous row, add empty cells
514
+ if rowspan_positions:
515
+ # Build new text with empty cells inserted
516
+ new_cells = []
517
+ cell_index = 0
518
+
519
+ for pos in range(col_pos): # Total columns
520
+ if pos in rowspan_positions:
521
+ # Add empty cell for rowspan
522
+ new_cells.append(" |")
523
+ elif cell_index < len(cells):
524
+ # Add actual cell content
525
+ cell = cells[cell_index]
526
+ cell_text = cell.get_text().strip().replace("\n", " ")
527
+ colspan = _get_colspan(cell)
528
+ new_cells.append(f" {cell_text} |" * colspan)
529
+ cell_index += 1
530
+
531
+ # Override text with new cell arrangement
532
+ text = "".join(new_cells)
533
+
379
534
  is_headrow = (
380
535
  all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
381
536
  or (not tag.previous_sibling and parent_name != "tbody")
@@ -423,7 +578,7 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
423
578
  if not text.strip():
424
579
  return ""
425
580
 
426
- return f"*{text.strip()}*\n\n"
581
+ return _format_wrapped_block(text, "*")
427
582
 
428
583
 
429
584
  def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
@@ -475,7 +630,10 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
475
630
 
476
631
 
477
632
  def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
478
- """Convert HTML colgroup element preserving column structure for documentation.
633
+ """Convert HTML colgroup element - removes it entirely from Markdown output.
634
+
635
+ Colgroup is a table column grouping element that defines styling for columns.
636
+ It has no representation in Markdown and should be removed.
479
637
 
480
638
  Args:
481
639
  tag: The colgroup tag element.
@@ -483,54 +641,30 @@ def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
483
641
  convert_as_inline: Whether to convert as inline content.
484
642
 
485
643
  Returns:
486
- The converted markdown text preserving colgroup structure.
644
+ Empty string as colgroup has no Markdown representation.
487
645
  """
488
- if convert_as_inline:
489
- return text
490
-
491
- if not text.strip():
492
- return ""
493
-
494
- span = tag.get("span", "")
495
- attrs = []
496
- if span and isinstance(span, str) and span.strip():
497
- attrs.append(f'span="{span}"')
498
-
499
- attrs_str = " ".join(attrs)
500
- if attrs_str:
501
- return f"<colgroup {attrs_str}>\n{text.strip()}\n</colgroup>\n\n"
502
- return f"<colgroup>\n{text.strip()}\n</colgroup>\n\n"
646
+ _ = tag, text, convert_as_inline
647
+ # Colgroup and its contents (col elements) are purely presentational
648
+ # and have no equivalent in Markdown tables
649
+ return ""
503
650
 
504
651
 
505
652
  def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
506
- """Convert HTML col element preserving column attributes for documentation.
653
+ """Convert HTML col element - removes it entirely from Markdown output.
654
+
655
+ Col elements define column properties (width, style) in HTML tables.
656
+ They have no representation in Markdown and should be removed.
507
657
 
508
658
  Args:
509
659
  tag: The col tag element.
510
660
  convert_as_inline: Whether to convert as inline content.
511
661
 
512
662
  Returns:
513
- The converted markdown text preserving col structure.
663
+ Empty string as col has no Markdown representation.
514
664
  """
515
- if convert_as_inline:
516
- return ""
517
-
518
- span = tag.get("span", "")
519
- width = tag.get("width", "")
520
- style = tag.get("style", "")
521
-
522
- attrs = []
523
- if width and isinstance(width, str) and width.strip():
524
- attrs.append(f'width="{width}"')
525
- if style and isinstance(style, str) and style.strip():
526
- attrs.append(f'style="{style}"')
527
- if span and isinstance(span, str) and span.strip():
528
- attrs.append(f'span="{span}"')
529
-
530
- attrs_str = " ".join(attrs)
531
- if attrs_str:
532
- return f"<col {attrs_str} />\n"
533
- return "<col />\n"
665
+ _ = tag, convert_as_inline
666
+ # Col elements are self-closing and purely presentational
667
+ return ""
534
668
 
535
669
 
536
670
  def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
@@ -550,35 +684,37 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
550
684
 
551
685
 
552
686
  def _convert_details(*, text: str, convert_as_inline: bool) -> str:
553
- """Convert HTML details element preserving HTML structure.
687
+ """Convert HTML details element to semantic Markdown.
554
688
 
555
689
  Args:
556
690
  text: The text content of the details element.
557
691
  convert_as_inline: Whether to convert as inline content.
558
692
 
559
693
  Returns:
560
- The converted markdown text preserving HTML structure.
694
+ The converted markdown text (only content, no HTML tags).
561
695
  """
562
696
  if convert_as_inline:
563
697
  return text
564
698
 
565
- return f"<details>\n{text.strip()}\n</details>\n\n" if text.strip() else ""
699
+ # Details is a semantic container, return its content
700
+ return _format_block_element(text)
566
701
 
567
702
 
568
703
  def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
569
- """Convert HTML summary element preserving HTML structure.
704
+ """Convert HTML summary element to emphasized text.
570
705
 
571
706
  Args:
572
707
  text: The text content of the summary element.
573
708
  convert_as_inline: Whether to convert as inline content.
574
709
 
575
710
  Returns:
576
- The converted markdown text preserving HTML structure.
711
+ The converted markdown text as bold heading.
577
712
  """
578
713
  if convert_as_inline:
579
714
  return text
580
715
 
581
- return f"<summary>{text.strip()}</summary>\n\n" if text.strip() else ""
716
+ # Summary is like a heading/title
717
+ return _format_wrapped_block(text, "**")
582
718
 
583
719
 
584
720
  def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
@@ -674,119 +810,42 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
674
810
  return f'"{escaped_text}"'
675
811
 
676
812
 
677
- def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
678
- """Convert HTML audio element preserving structure with fallback.
813
+ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
814
+ """Convert HTML media elements (audio/video) to semantic Markdown.
679
815
 
680
816
  Args:
681
- tag: The audio tag element.
682
- text: The text content of the audio element (fallback content).
817
+ tag: The media tag element.
818
+ text: The text content of the media element (fallback content).
683
819
  convert_as_inline: Whether to convert as inline content.
684
820
 
685
821
  Returns:
686
- The converted markdown text preserving audio element.
822
+ The converted markdown text (link if src exists, otherwise fallback content).
687
823
  """
688
- _ = convert_as_inline
689
824
  src = tag.get("src", "")
690
825
 
691
- if not src:
692
- source_tag = tag.find("source")
693
- if source_tag and isinstance(source_tag, Tag):
694
- src = source_tag.get("src", "")
695
-
696
- controls = "controls" if tag.get("controls") is not None else ""
697
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
698
- loop = "loop" if tag.get("loop") is not None else ""
699
- muted = "muted" if tag.get("muted") is not None else ""
700
- preload = tag.get("preload", "")
826
+ if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
827
+ src = source_tag.get("src", "")
701
828
 
702
- attrs = []
829
+ # If we have a src, convert to a link
703
830
  if src and isinstance(src, str) and src.strip():
704
- attrs.append(f'src="{src}"')
705
- if controls:
706
- attrs.append(controls)
707
- if autoplay:
708
- attrs.append(autoplay)
709
- if loop:
710
- attrs.append(loop)
711
- if muted:
712
- attrs.append(muted)
713
- if preload and isinstance(preload, str) and preload.strip():
714
- attrs.append(f'preload="{preload}"')
715
-
716
- attrs_str = " ".join(attrs)
717
-
831
+ link = f"[{src}]({src})"
832
+ if convert_as_inline:
833
+ return link
834
+ result = f"{link}\n\n"
835
+ # Add fallback content if present
836
+ if text.strip():
837
+ result += f"{text.strip()}\n\n"
838
+ return result
839
+
840
+ # No src, just return fallback content
718
841
  if text.strip():
719
- if attrs_str:
720
- return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
721
- return f"<audio>\n{text.strip()}\n</audio>\n\n"
722
-
723
- if attrs_str:
724
- return f"<audio {attrs_str} />\n\n"
725
- return "<audio />\n\n"
726
-
842
+ return _format_inline_or_block(text, convert_as_inline)
727
843
 
728
- def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
729
- """Convert HTML video element preserving structure with fallback.
730
-
731
- Args:
732
- tag: The video tag element.
733
- text: The text content of the video element (fallback content).
734
- convert_as_inline: Whether to convert as inline content.
735
-
736
- Returns:
737
- The converted markdown text preserving video element.
738
- """
739
- _ = convert_as_inline
740
- src = tag.get("src", "")
741
-
742
- if not src:
743
- source_tag = tag.find("source")
744
- if source_tag and isinstance(source_tag, Tag):
745
- src = source_tag.get("src", "")
746
-
747
- width = tag.get("width", "")
748
- height = tag.get("height", "")
749
- poster = tag.get("poster", "")
750
- controls = "controls" if tag.get("controls") is not None else ""
751
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
752
- loop = "loop" if tag.get("loop") is not None else ""
753
- muted = "muted" if tag.get("muted") is not None else ""
754
- preload = tag.get("preload", "")
755
-
756
- attrs = []
757
- if src and isinstance(src, str) and src.strip():
758
- attrs.append(f'src="{src}"')
759
- if width and isinstance(width, str) and width.strip():
760
- attrs.append(f'width="{width}"')
761
- if height and isinstance(height, str) and height.strip():
762
- attrs.append(f'height="{height}"')
763
- if poster and isinstance(poster, str) and poster.strip():
764
- attrs.append(f'poster="{poster}"')
765
- if controls:
766
- attrs.append(controls)
767
- if autoplay:
768
- attrs.append(autoplay)
769
- if loop:
770
- attrs.append(loop)
771
- if muted:
772
- attrs.append(muted)
773
- if preload and isinstance(preload, str) and preload.strip():
774
- attrs.append(f'preload="{preload}"')
775
-
776
- attrs_str = " ".join(attrs)
777
-
778
- if text.strip():
779
- if attrs_str:
780
- return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
781
- return f"<video>\n{text.strip()}\n</video>\n\n"
782
-
783
- if attrs_str:
784
- return f"<video {attrs_str} />\n\n"
785
- return "<video />\n\n"
844
+ return ""
786
845
 
787
846
 
788
847
  def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
789
- """Convert HTML iframe element preserving structure.
848
+ """Convert HTML iframe element to semantic Markdown.
790
849
 
791
850
  Args:
792
851
  tag: The iframe tag element.
@@ -794,47 +853,19 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
794
853
  convert_as_inline: Whether to convert as inline content.
795
854
 
796
855
  Returns:
797
- The converted markdown text preserving iframe element.
856
+ The converted markdown text (link if src exists).
798
857
  """
799
858
  _ = text
800
- _ = convert_as_inline
801
859
  src = tag.get("src", "")
802
- width = tag.get("width", "")
803
- height = tag.get("height", "")
804
- title = tag.get("title", "")
805
- allow = tag.get("allow", "")
806
- sandbox = tag.get("sandbox")
807
- loading = tag.get("loading", "")
808
-
809
- attrs = []
810
- if src and isinstance(src, str) and src.strip():
811
- attrs.append(f'src="{src}"')
812
- if width and isinstance(width, str) and width.strip():
813
- attrs.append(f'width="{width}"')
814
- if height and isinstance(height, str) and height.strip():
815
- attrs.append(f'height="{height}"')
816
- if title and isinstance(title, str) and title.strip():
817
- attrs.append(f'title="{title}"')
818
- if allow and isinstance(allow, str) and allow.strip():
819
- attrs.append(f'allow="{allow}"')
820
- if sandbox is not None:
821
- if isinstance(sandbox, list):
822
- if sandbox:
823
- attrs.append(f'sandbox="{" ".join(sandbox)}"')
824
- else:
825
- attrs.append("sandbox")
826
- elif isinstance(sandbox, str) and sandbox:
827
- attrs.append(f'sandbox="{sandbox}"')
828
- else:
829
- attrs.append("sandbox")
830
- if loading and isinstance(loading, str) and loading.strip():
831
- attrs.append(f'loading="{loading}"')
832
860
 
833
- attrs_str = " ".join(attrs)
861
+ # If we have a src, convert to a link
862
+ if src and isinstance(src, str) and src.strip():
863
+ link = f"[{src}]({src})"
864
+ if convert_as_inline:
865
+ return link
866
+ return f"{link}\n\n"
834
867
 
835
- if attrs_str:
836
- return f"<iframe {attrs_str}></iframe>\n\n"
837
- return "<iframe></iframe>\n\n"
868
+ return ""
838
869
 
839
870
 
840
871
  def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
@@ -860,7 +891,7 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
860
891
 
861
892
 
862
893
  def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
863
- """Convert HTML time element preserving datetime attribute.
894
+ """Convert HTML time element to semantic Markdown.
864
895
 
865
896
  Args:
866
897
  tag: The time tag element.
@@ -868,21 +899,19 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
868
899
  convert_as_inline: Whether to convert as inline content.
869
900
 
870
901
  Returns:
871
- The converted markdown text preserving time information.
902
+ The converted markdown text (content only, no HTML tags).
872
903
  """
904
+ _ = tag
873
905
  _ = convert_as_inline
874
906
  if not text.strip():
875
907
  return ""
876
908
 
877
- datetime_attr = tag.get("datetime")
878
- if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
879
- return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
880
-
909
+ # Time elements are semantic - just return the content
881
910
  return text.strip()
882
911
 
883
912
 
884
913
  def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
885
- """Convert HTML data element preserving value attribute.
914
+ """Convert HTML data element to semantic Markdown.
886
915
 
887
916
  Args:
888
917
  tag: The data tag element.
@@ -890,16 +919,14 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
890
919
  convert_as_inline: Whether to convert as inline content.
891
920
 
892
921
  Returns:
893
- The converted markdown text preserving machine-readable data.
922
+ The converted markdown text (content only, no HTML tags).
894
923
  """
924
+ _ = tag
895
925
  _ = convert_as_inline
896
926
  if not text.strip():
897
927
  return ""
898
928
 
899
- value_attr = tag.get("value")
900
- if value_attr and isinstance(value_attr, str) and value_attr.strip():
901
- return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
902
-
929
+ # Data elements are semantic - just return the content
903
930
  return text.strip()
904
931
 
905
932
 
@@ -917,7 +944,7 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
917
944
 
918
945
 
919
946
  def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
920
- """Convert HTML form element preserving structure for documentation.
947
+ """Convert HTML form element to semantic Markdown.
921
948
 
922
949
  Args:
923
950
  tag: The form tag element.
@@ -925,38 +952,28 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
925
952
  convert_as_inline: Whether to convert as inline content.
926
953
 
927
954
  Returns:
928
- The converted markdown text preserving form structure.
955
+ The converted markdown text (only content, no HTML tags).
929
956
  """
957
+ _ = tag
930
958
  if convert_as_inline:
931
959
  return text
932
960
 
933
961
  if not text.strip():
934
962
  return ""
935
963
 
936
- action = tag.get("action", "")
937
- method = tag.get("method", "")
938
- attrs = []
939
-
940
- if action and isinstance(action, str) and action.strip():
941
- attrs.append(f'action="{action.strip()}"')
942
- if method and isinstance(method, str) and method.strip():
943
- attrs.append(f'method="{method.strip()}"')
944
-
945
- attrs_str = " ".join(attrs)
946
- if attrs_str:
947
- return f"<form {attrs_str}>\n{text.strip()}\n</form>\n\n"
948
- return f"<form>\n{text.strip()}\n</form>\n\n"
964
+ # Forms are just containers, return their content
965
+ return text
949
966
 
950
967
 
951
968
  def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
952
- """Convert HTML fieldset element preserving structure.
969
+ """Convert HTML fieldset element to semantic Markdown.
953
970
 
954
971
  Args:
955
972
  text: The text content of the fieldset element.
956
973
  convert_as_inline: Whether to convert as inline content.
957
974
 
958
975
  Returns:
959
- The converted markdown text preserving fieldset structure.
976
+ The converted markdown text (only content, no HTML tags).
960
977
  """
961
978
  if convert_as_inline:
962
979
  return text
@@ -964,7 +981,8 @@ def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
964
981
  if not text.strip():
965
982
  return ""
966
983
 
967
- return f"<fieldset>\n{text.strip()}\n</fieldset>\n\n"
984
+ # Fieldsets are semantic groupings, return their content
985
+ return text
968
986
 
969
987
 
970
988
  def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
@@ -983,11 +1001,12 @@ def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
983
1001
  if not text.strip():
984
1002
  return ""
985
1003
 
986
- return f"<legend>{text.strip()}</legend>\n\n"
1004
+ # Legend is like a heading/title for fieldsets
1005
+ return _format_wrapped_block(text, "**")
987
1006
 
988
1007
 
989
1008
  def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
990
- """Convert HTML label element preserving for attribute.
1009
+ """Convert HTML label element to Markdown.
991
1010
 
992
1011
  Args:
993
1012
  tag: The label tag element.
@@ -995,78 +1014,33 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
995
1014
  convert_as_inline: Whether to convert as inline content.
996
1015
 
997
1016
  Returns:
998
- The converted markdown text preserving label structure.
1017
+ The label text content.
999
1018
  """
1000
- if convert_as_inline:
1001
- return text
1002
-
1019
+ _ = tag
1020
+ # Labels are just text, return the content
1003
1021
  if not text.strip():
1004
1022
  return ""
1005
1023
 
1006
- for_attr = tag.get("for")
1007
- if for_attr and isinstance(for_attr, str) and for_attr.strip():
1008
- return f'<label for="{for_attr.strip()}">{text.strip()}</label>\n\n'
1009
-
1010
- return f"<label>{text.strip()}</label>\n\n"
1024
+ return _format_inline_or_block(text, convert_as_inline)
1011
1025
 
1012
1026
 
1013
1027
  def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
1014
- """Convert HTML input element preserving all relevant attributes.
1028
+ """Convert HTML input element to Markdown.
1015
1029
 
1016
1030
  Args:
1017
1031
  tag: The input tag element.
1018
1032
  convert_as_inline: Whether to convert as inline content.
1019
1033
 
1020
1034
  Returns:
1021
- The converted markdown text preserving input structure.
1035
+ Empty string since input elements have no Markdown representation.
1022
1036
  """
1023
- input_type = tag.get("type", "text")
1024
-
1025
- from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
1026
-
1027
- if _has_ancestor(tag, "li"):
1028
- return ""
1029
-
1030
- id_attr = tag.get("id", "")
1031
- name = tag.get("name", "")
1032
- value = tag.get("value", "")
1033
- placeholder = tag.get("placeholder", "")
1034
- required = tag.get("required") is not None
1035
- disabled = tag.get("disabled") is not None
1036
- readonly = tag.get("readonly") is not None
1037
- checked = tag.get("checked") is not None
1038
- accept = tag.get("accept", "")
1039
-
1040
- attrs = []
1041
- if input_type and isinstance(input_type, str):
1042
- attrs.append(f'type="{input_type}"')
1043
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1044
- attrs.append(f'id="{id_attr}"')
1045
- if name and isinstance(name, str) and name.strip():
1046
- attrs.append(f'name="{name}"')
1047
- if value and isinstance(value, str) and value.strip():
1048
- attrs.append(f'value="{value}"')
1049
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1050
- attrs.append(f'placeholder="{placeholder}"')
1051
- if accept and isinstance(accept, str) and accept.strip():
1052
- attrs.append(f'accept="{accept}"')
1053
- if required:
1054
- attrs.append("required")
1055
- if disabled:
1056
- attrs.append("disabled")
1057
- if readonly:
1058
- attrs.append("readonly")
1059
- if checked:
1060
- attrs.append("checked")
1061
-
1062
- attrs_str = " ".join(attrs)
1063
- result = f"<input {attrs_str} />" if attrs_str else "<input />"
1064
-
1065
- return result if convert_as_inline else f"{result}\n\n"
1037
+ _ = tag, convert_as_inline
1038
+ # Input elements have no content and no Markdown equivalent
1039
+ return ""
1066
1040
 
1067
1041
 
1068
1042
  def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1069
- """Convert HTML textarea element preserving attributes.
1043
+ """Convert HTML textarea element to Markdown.
1070
1044
 
1071
1045
  Args:
1072
1046
  tag: The textarea tag element.
@@ -1074,42 +1048,18 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1074
1048
  convert_as_inline: Whether to convert as inline content.
1075
1049
 
1076
1050
  Returns:
1077
- The converted markdown text preserving textarea structure.
1051
+ The text content of the textarea.
1078
1052
  """
1079
- if convert_as_inline:
1080
- return text
1081
-
1053
+ _ = tag
1054
+ # Return the text content, which is what the user entered
1082
1055
  if not text.strip():
1083
1056
  return ""
1084
1057
 
1085
- name = tag.get("name", "")
1086
- placeholder = tag.get("placeholder", "")
1087
- rows = tag.get("rows", "")
1088
- cols = tag.get("cols", "")
1089
- required = tag.get("required") is not None
1090
-
1091
- attrs = []
1092
- if name and isinstance(name, str) and name.strip():
1093
- attrs.append(f'name="{name}"')
1094
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1095
- attrs.append(f'placeholder="{placeholder}"')
1096
- if rows and isinstance(rows, str) and rows.strip():
1097
- attrs.append(f'rows="{rows}"')
1098
- if cols and isinstance(cols, str) and cols.strip():
1099
- attrs.append(f'cols="{cols}"')
1100
- if required:
1101
- attrs.append("required")
1102
-
1103
- attrs_str = " ".join(attrs)
1104
- content = text.strip()
1105
-
1106
- if attrs_str:
1107
- return f"<textarea {attrs_str}>{content}</textarea>\n\n"
1108
- return f"<textarea>{content}</textarea>\n\n"
1058
+ return _format_inline_or_block(text, convert_as_inline)
1109
1059
 
1110
1060
 
1111
1061
  def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1112
- """Convert HTML select element preserving structure.
1062
+ """Convert HTML select element to Markdown.
1113
1063
 
1114
1064
  Args:
1115
1065
  tag: The select tag element.
@@ -1117,39 +1067,25 @@ def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1117
1067
  convert_as_inline: Whether to convert as inline content.
1118
1068
 
1119
1069
  Returns:
1120
- The converted markdown text preserving select structure.
1070
+ The text content (options) as a comma-separated list.
1121
1071
  """
1122
- if convert_as_inline:
1123
- return text
1124
-
1072
+ _ = tag
1073
+ # Return the options as text
1125
1074
  if not text.strip():
1126
1075
  return ""
1127
1076
 
1128
- id_attr = tag.get("id", "")
1129
- name = tag.get("name", "")
1130
- multiple = tag.get("multiple") is not None
1131
- required = tag.get("required") is not None
1132
-
1133
- attrs = []
1134
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1135
- attrs.append(f'id="{id_attr}"')
1136
- if name and isinstance(name, str) and name.strip():
1137
- attrs.append(f'name="{name}"')
1138
- if multiple:
1139
- attrs.append("multiple")
1140
- if required:
1141
- attrs.append("required")
1142
-
1143
- attrs_str = " ".join(attrs)
1144
- content = text.strip()
1077
+ # In inline mode, show options separated by commas
1078
+ if convert_as_inline:
1079
+ # Remove extra whitespace and join options
1080
+ options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
1081
+ return ", ".join(options)
1145
1082
 
1146
- if attrs_str:
1147
- return f"<select {attrs_str}>\n{content}\n</select>\n\n"
1148
- return f"<select>\n{content}\n</select>\n\n"
1083
+ # In block mode, show as a list
1084
+ return _format_block_element(text)
1149
1085
 
1150
1086
 
1151
1087
  def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1152
- """Convert HTML option element preserving value and selected state.
1088
+ """Convert HTML option element to Markdown.
1153
1089
 
1154
1090
  Args:
1155
1091
  tag: The option tag element.
@@ -1157,33 +1093,26 @@ def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1157
1093
  convert_as_inline: Whether to convert as inline content.
1158
1094
 
1159
1095
  Returns:
1160
- The converted markdown text preserving option structure.
1096
+ The option text, potentially with a marker if selected.
1161
1097
  """
1162
- if convert_as_inline:
1163
- return text
1164
-
1165
1098
  if not text.strip():
1166
1099
  return ""
1167
1100
 
1168
- value = tag.get("value", "")
1101
+ # Check if this option is selected
1169
1102
  selected = tag.get("selected") is not None
1170
-
1171
- attrs = []
1172
- if value and isinstance(value, str) and value.strip():
1173
- attrs.append(f'value="{value}"')
1174
- if selected:
1175
- attrs.append("selected")
1176
-
1177
- attrs_str = " ".join(attrs)
1178
1103
  content = text.strip()
1179
1104
 
1180
- if attrs_str:
1181
- return f"<option {attrs_str}>{content}</option>\n"
1182
- return f"<option>{content}</option>\n"
1105
+ if convert_as_inline:
1106
+ return content
1107
+
1108
+ # In block mode, mark selected options
1109
+ if selected:
1110
+ return f"* {content}\n"
1111
+ return f"{content}\n"
1183
1112
 
1184
1113
 
1185
1114
  def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1186
- """Convert HTML optgroup element preserving label.
1115
+ """Convert HTML optgroup element to semantic Markdown.
1187
1116
 
1188
1117
  Args:
1189
1118
  tag: The optgroup tag element.
@@ -1191,7 +1120,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1191
1120
  convert_as_inline: Whether to convert as inline content.
1192
1121
 
1193
1122
  Returns:
1194
- The converted markdown text preserving optgroup structure.
1123
+ The converted markdown text with label as heading.
1195
1124
  """
1196
1125
  if convert_as_inline:
1197
1126
  return text
@@ -1200,21 +1129,17 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1200
1129
  return ""
1201
1130
 
1202
1131
  label = tag.get("label", "")
1132
+ content = text.strip()
1203
1133
 
1204
- attrs = []
1134
+ # If there's a label, show it as a heading
1205
1135
  if label and isinstance(label, str) and label.strip():
1206
- attrs.append(f'label="{label}"')
1136
+ return f"**{label.strip()}**\n{content}\n"
1207
1137
 
1208
- attrs_str = " ".join(attrs)
1209
- content = text.strip()
1210
-
1211
- if attrs_str:
1212
- return f"<optgroup {attrs_str}>\n{content}\n</optgroup>\n"
1213
- return f"<optgroup>\n{content}\n</optgroup>\n"
1138
+ return f"{content}\n"
1214
1139
 
1215
1140
 
1216
1141
  def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1217
- """Convert HTML button element preserving type and attributes.
1142
+ """Convert HTML button element to Markdown.
1218
1143
 
1219
1144
  Args:
1220
1145
  tag: The button tag element.
@@ -1222,38 +1147,18 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1222
1147
  convert_as_inline: Whether to convert as inline content.
1223
1148
 
1224
1149
  Returns:
1225
- The converted markdown text preserving button structure.
1150
+ The button text content.
1226
1151
  """
1227
- if convert_as_inline:
1228
- return text
1229
-
1152
+ _ = tag
1153
+ # Buttons are just interactive text, return the text content
1230
1154
  if not text.strip():
1231
1155
  return ""
1232
1156
 
1233
- button_type = tag.get("type", "")
1234
- name = tag.get("name", "")
1235
- value = tag.get("value", "")
1236
- disabled = tag.get("disabled") is not None
1237
-
1238
- attrs = []
1239
- if button_type and isinstance(button_type, str) and button_type.strip():
1240
- attrs.append(f'type="{button_type}"')
1241
- if name and isinstance(name, str) and name.strip():
1242
- attrs.append(f'name="{name}"')
1243
- if value and isinstance(value, str) and value.strip():
1244
- attrs.append(f'value="{value}"')
1245
- if disabled:
1246
- attrs.append("disabled")
1247
-
1248
- attrs_str = " ".join(attrs)
1249
-
1250
- if attrs_str:
1251
- return f"<button {attrs_str}>{text.strip()}</button>\n\n"
1252
- return f"<button>{text.strip()}</button>\n\n"
1157
+ return _format_inline_or_block(text, convert_as_inline)
1253
1158
 
1254
1159
 
1255
1160
  def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1256
- """Convert HTML progress element preserving value and max.
1161
+ """Convert HTML progress element to semantic text.
1257
1162
 
1258
1163
  Args:
1259
1164
  tag: The progress tag element.
@@ -1261,33 +1166,21 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1261
1166
  convert_as_inline: Whether to convert as inline content.
1262
1167
 
1263
1168
  Returns:
1264
- The converted markdown text preserving progress structure.
1169
+ The converted markdown text (only content, no HTML tags).
1265
1170
  """
1171
+ _ = tag
1266
1172
  if convert_as_inline:
1267
1173
  return text
1268
1174
 
1269
1175
  if not text.strip():
1270
1176
  return ""
1271
1177
 
1272
- value = tag.get("value", "")
1273
- max_val = tag.get("max", "")
1274
-
1275
- attrs = []
1276
- if value and isinstance(value, str) and value.strip():
1277
- attrs.append(f'value="{value}"')
1278
- if max_val and isinstance(max_val, str) and max_val.strip():
1279
- attrs.append(f'max="{max_val}"')
1280
-
1281
- attrs_str = " ".join(attrs)
1282
- content = text.strip()
1283
-
1284
- if attrs_str:
1285
- return f"<progress {attrs_str}>{content}</progress>\n\n"
1286
- return f"<progress>{content}</progress>\n\n"
1178
+ # Progress elements convert to their text content
1179
+ return _format_block_element(text)
1287
1180
 
1288
1181
 
1289
1182
  def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1290
- """Convert HTML meter element preserving value and range attributes.
1183
+ """Convert HTML meter element to semantic text.
1291
1184
 
1292
1185
  Args:
1293
1186
  tag: The meter tag element.
@@ -1295,45 +1188,21 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1295
1188
  convert_as_inline: Whether to convert as inline content.
1296
1189
 
1297
1190
  Returns:
1298
- The converted markdown text preserving meter structure.
1191
+ The converted markdown text (only content, no HTML tags).
1299
1192
  """
1193
+ _ = tag
1300
1194
  if convert_as_inline:
1301
1195
  return text
1302
1196
 
1303
1197
  if not text.strip():
1304
1198
  return ""
1305
1199
 
1306
- value = tag.get("value", "")
1307
- min_val = tag.get("min", "")
1308
- max_val = tag.get("max", "")
1309
- low = tag.get("low", "")
1310
- high = tag.get("high", "")
1311
- optimum = tag.get("optimum", "")
1312
-
1313
- attrs = []
1314
- if value and isinstance(value, str) and value.strip():
1315
- attrs.append(f'value="{value}"')
1316
- if min_val and isinstance(min_val, str) and min_val.strip():
1317
- attrs.append(f'min="{min_val}"')
1318
- if max_val and isinstance(max_val, str) and max_val.strip():
1319
- attrs.append(f'max="{max_val}"')
1320
- if low and isinstance(low, str) and low.strip():
1321
- attrs.append(f'low="{low}"')
1322
- if high and isinstance(high, str) and high.strip():
1323
- attrs.append(f'high="{high}"')
1324
- if optimum and isinstance(optimum, str) and optimum.strip():
1325
- attrs.append(f'optimum="{optimum}"')
1326
-
1327
- attrs_str = " ".join(attrs)
1328
- content = text.strip()
1329
-
1330
- if attrs_str:
1331
- return f"<meter {attrs_str}>{content}</meter>\n\n"
1332
- return f"<meter>{content}</meter>\n\n"
1200
+ # Meter elements convert to their text content
1201
+ return _format_block_element(text)
1333
1202
 
1334
1203
 
1335
1204
  def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1336
- """Convert HTML output element preserving for and name attributes.
1205
+ """Convert HTML output element to semantic text.
1337
1206
 
1338
1207
  Args:
1339
1208
  tag: The output tag element.
@@ -1341,34 +1210,21 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1341
1210
  convert_as_inline: Whether to convert as inline content.
1342
1211
 
1343
1212
  Returns:
1344
- The converted markdown text preserving output structure.
1213
+ The converted markdown text (only content, no HTML tags).
1345
1214
  """
1215
+ _ = tag
1346
1216
  if convert_as_inline:
1347
1217
  return text
1348
1218
 
1349
1219
  if not text.strip():
1350
1220
  return ""
1351
1221
 
1352
- for_attr = tag.get("for", "")
1353
- name = tag.get("name", "")
1354
-
1355
- attrs = []
1356
- if for_attr:
1357
- for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
1358
- if for_value.strip():
1359
- attrs.append(f'for="{for_value}"')
1360
- if name and isinstance(name, str) and name.strip():
1361
- attrs.append(f'name="{name}"')
1362
-
1363
- attrs_str = " ".join(attrs)
1364
-
1365
- if attrs_str:
1366
- return f"<output {attrs_str}>{text.strip()}</output>\n\n"
1367
- return f"<output>{text.strip()}</output>\n\n"
1222
+ # Output elements convert to their text content
1223
+ return _format_block_element(text)
1368
1224
 
1369
1225
 
1370
1226
  def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1371
- """Convert HTML datalist element preserving structure.
1227
+ """Convert HTML datalist element to semantic Markdown.
1372
1228
 
1373
1229
  Args:
1374
1230
  tag: The datalist tag element.
@@ -1376,26 +1232,17 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1376
1232
  convert_as_inline: Whether to convert as inline content.
1377
1233
 
1378
1234
  Returns:
1379
- The converted markdown text preserving datalist structure.
1235
+ The converted markdown text (only content, no HTML tags).
1380
1236
  """
1237
+ _ = tag
1381
1238
  if convert_as_inline:
1382
1239
  return text
1383
1240
 
1384
1241
  if not text.strip():
1385
1242
  return ""
1386
1243
 
1387
- id_attr = tag.get("id", "")
1388
-
1389
- attrs = []
1390
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1391
- attrs.append(f'id="{id_attr}"')
1392
-
1393
- attrs_str = " ".join(attrs)
1394
- content = text.strip()
1395
-
1396
- if attrs_str:
1397
- return f"<datalist {attrs_str}>\n{content}\n</datalist>\n\n"
1398
- return f"<datalist>\n{content}\n</datalist>\n\n"
1244
+ # Datalist shows options as a list
1245
+ return _format_block_element(text)
1399
1246
 
1400
1247
 
1401
1248
  def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
@@ -1488,7 +1335,7 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1488
1335
 
1489
1336
 
1490
1337
  def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1491
- """Convert HTML dialog element preserving structure with attributes.
1338
+ """Convert HTML dialog element to semantic Markdown.
1492
1339
 
1493
1340
  Args:
1494
1341
  text: The text content of the dialog element.
@@ -1496,27 +1343,21 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1496
1343
  tag: The dialog tag element.
1497
1344
 
1498
1345
  Returns:
1499
- The converted markdown text preserving dialog structure.
1346
+ The converted markdown text (only content, no HTML tags).
1500
1347
  """
1348
+ _ = tag
1501
1349
  if convert_as_inline:
1502
1350
  return text
1503
1351
 
1504
1352
  if not text.strip():
1505
1353
  return ""
1506
1354
 
1507
- attrs = []
1508
- if tag.get("open") is not None:
1509
- attrs.append("open")
1510
- if tag.get("id"):
1511
- attrs.append(f'id="{tag.get("id")}"')
1512
-
1513
- attrs_str = " " + " ".join(attrs) if attrs else ""
1514
-
1515
- return f"<dialog{attrs_str}>\n{text.strip()}\n</dialog>\n\n"
1355
+ # Dialog is a semantic container, return its content
1356
+ return _format_block_element(text)
1516
1357
 
1517
1358
 
1518
1359
  def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1519
- """Convert HTML menu element preserving structure with attributes.
1360
+ """Convert HTML menu element to semantic Markdown.
1520
1361
 
1521
1362
  Args:
1522
1363
  text: The text content of the menu element.
@@ -1524,29 +1365,21 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1524
1365
  tag: The menu tag element.
1525
1366
 
1526
1367
  Returns:
1527
- The converted markdown text preserving menu structure.
1368
+ The converted markdown text (only content, no HTML tags).
1528
1369
  """
1370
+ _ = tag
1529
1371
  if convert_as_inline:
1530
1372
  return text
1531
1373
 
1532
1374
  if not text.strip():
1533
1375
  return ""
1534
1376
 
1535
- attrs = []
1536
- if tag.get("type") and tag.get("type") != "list":
1537
- attrs.append(f'type="{tag.get("type")}"')
1538
- if tag.get("label"):
1539
- attrs.append(f'label="{tag.get("label")}"')
1540
- if tag.get("id"):
1541
- attrs.append(f'id="{tag.get("id")}"')
1542
-
1543
- attrs_str = " " + " ".join(attrs) if attrs else ""
1544
-
1545
- return f"<menu{attrs_str}>\n{text.strip()}\n</menu>\n\n"
1377
+ # Menu is converted as a list
1378
+ return _format_block_element(text)
1546
1379
 
1547
1380
 
1548
1381
  def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1549
- """Convert HTML figure element preserving semantic structure.
1382
+ """Convert HTML figure element to semantic Markdown.
1550
1383
 
1551
1384
  Args:
1552
1385
  text: The text content of the figure element.
@@ -1554,42 +1387,35 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1554
1387
  tag: The figure tag element.
1555
1388
 
1556
1389
  Returns:
1557
- The converted markdown text preserving figure structure.
1390
+ The converted markdown text (only content, no HTML tags).
1558
1391
  """
1392
+ _ = tag
1559
1393
  if not text.strip():
1560
1394
  return ""
1561
1395
 
1562
1396
  if convert_as_inline:
1563
1397
  return text
1564
1398
 
1565
- attrs = []
1566
- if tag.get("id"):
1567
- attrs.append(f'id="{tag.get("id")}"')
1568
- if tag.get("class"):
1569
- class_val = tag.get("class")
1570
- if isinstance(class_val, list):
1571
- class_val = " ".join(class_val)
1572
- attrs.append(f'class="{class_val}"')
1573
-
1574
- attrs_str = " " + " ".join(attrs) if attrs else ""
1575
-
1399
+ # Figure is a semantic container, return its content
1400
+ # Make sure there's proper spacing after the figure content
1576
1401
  content = text.strip()
1577
-
1578
- if content.endswith("\n\n"):
1579
- return f"<figure{attrs_str}>\n{content}</figure>\n\n"
1580
-
1581
- return f"<figure{attrs_str}>\n{content}\n</figure>\n\n"
1402
+ if content and not content.endswith("\n\n"):
1403
+ if content.endswith("\n"):
1404
+ content += "\n"
1405
+ else:
1406
+ content += "\n\n"
1407
+ return content
1582
1408
 
1583
1409
 
1584
1410
  def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1585
- """Convert HTML hgroup element preserving heading group semantics.
1411
+ """Convert HTML hgroup element to semantic Markdown.
1586
1412
 
1587
1413
  Args:
1588
1414
  text: The text content of the hgroup element.
1589
1415
  convert_as_inline: Whether to convert as inline content.
1590
1416
 
1591
1417
  Returns:
1592
- The converted markdown text preserving heading group structure.
1418
+ The converted markdown text (only content, no HTML tags).
1593
1419
  """
1594
1420
  if convert_as_inline:
1595
1421
  return text
@@ -1597,15 +1423,12 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1597
1423
  if not text.strip():
1598
1424
  return ""
1599
1425
 
1600
- content = text.strip()
1601
-
1602
- content = re.sub(r"\n{3,}", "\n\n", content)
1603
-
1604
- return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
1426
+ # Hgroup is a semantic container for headings, return its content
1427
+ return text
1605
1428
 
1606
1429
 
1607
1430
  def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1608
- """Convert HTML picture element with responsive image sources.
1431
+ """Convert HTML picture element to semantic Markdown.
1609
1432
 
1610
1433
  Args:
1611
1434
  text: The text content of the picture element.
@@ -1613,44 +1436,14 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1613
1436
  tag: The picture tag element.
1614
1437
 
1615
1438
  Returns:
1616
- The converted markdown text with picture information preserved.
1439
+ The converted markdown text (only the img element).
1617
1440
  """
1441
+ _ = tag, convert_as_inline
1618
1442
  if not text.strip():
1619
1443
  return ""
1620
1444
 
1621
- sources = tag.find_all("source")
1622
- img = tag.find("img")
1623
-
1624
- if not img:
1625
- return text.strip()
1626
-
1627
- img_markdown = text.strip()
1628
-
1629
- if not sources:
1630
- return img_markdown
1631
-
1632
- source_info = []
1633
- for source in sources:
1634
- srcset = source.get("srcset")
1635
- media = source.get("media")
1636
- mime_type = source.get("type")
1637
-
1638
- if srcset:
1639
- info = f'srcset="{srcset}"'
1640
- if media:
1641
- info += f' media="{media}"'
1642
- if mime_type:
1643
- info += f' type="{mime_type}"'
1644
- source_info.append(info)
1645
-
1646
- if source_info and not convert_as_inline:
1647
- sources_comment = "<!-- picture sources:\n"
1648
- for info in source_info:
1649
- sources_comment += f" {info}\n"
1650
- sources_comment += "-->\n"
1651
- return f"{sources_comment}{img_markdown}"
1652
-
1653
- return img_markdown
1445
+ # Picture is a container for responsive images, only the img matters for Markdown
1446
+ return text.strip()
1654
1447
 
1655
1448
 
1656
1449
  def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
@@ -1765,7 +1558,7 @@ def create_converters_map(
1765
1558
  "abbr": _wrapper(_convert_abbr),
1766
1559
  "article": _wrapper(_convert_semantic_block),
1767
1560
  "aside": _wrapper(_convert_semantic_block),
1768
- "audio": _wrapper(_convert_audio),
1561
+ "audio": _wrapper(_convert_media_element),
1769
1562
  "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
1770
1563
  "bdi": _wrapper(_create_inline_converter("")),
1771
1564
  "bdo": _wrapper(_create_inline_converter("")),
@@ -1788,7 +1581,7 @@ def create_converters_map(
1788
1581
  "dt": _wrapper(_convert_dt),
1789
1582
  "em": _wrapper(_create_inline_converter(strong_em_symbol)),
1790
1583
  "fieldset": _wrapper(_convert_fieldset),
1791
- "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
1584
+ "figcaption": _wrapper(lambda text: f"\n\n*{text.strip()}*\n\n" if text.strip() else ""),
1792
1585
  "figure": _wrapper(_convert_figure),
1793
1586
  "footer": _wrapper(_convert_semantic_block),
1794
1587
  "form": _wrapper(_convert_form),
@@ -1861,6 +1654,6 @@ def create_converters_map(
1861
1654
  "u": _wrapper(_create_inline_converter("")),
1862
1655
  "ul": _wrapper(_convert_list),
1863
1656
  "var": _wrapper(_create_inline_converter("*")),
1864
- "video": _wrapper(_convert_video),
1657
+ "video": _wrapper(_convert_media_element),
1865
1658
  "wbr": _wrapper(_convert_wbr),
1866
1659
  }