html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -5,11 +5,11 @@ from typing import TYPE_CHECKING
5
5
  if TYPE_CHECKING:
6
6
  from collections.abc import Iterable
7
7
  import base64
8
- import re
8
+ from collections.abc import Callable
9
9
  from functools import partial
10
10
  from inspect import getfullargspec
11
11
  from textwrap import fill
12
- from typing import Any, Callable, Literal, TypeVar, cast
12
+ from typing import Any, Literal, TypeVar, cast
13
13
 
14
14
  from bs4.element import Tag
15
15
 
@@ -21,6 +21,24 @@ from html_to_markdown.constants import (
21
21
  )
22
22
  from html_to_markdown.utils import chomp, indent, underline
23
23
 
24
+
25
+ def _format_block_element(text: str) -> str:
26
+ """Format text as a block element with trailing newlines."""
27
+ return f"{text.strip()}\n\n" if text.strip() else ""
28
+
29
+
30
+ def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
31
+ """Format text as inline or block element based on context."""
32
+ return text.strip() if convert_as_inline else _format_block_element(text)
33
+
34
+
35
+ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
36
+ """Format text wrapped in markers as a block element."""
37
+ if not end_marker:
38
+ end_marker = start_marker
39
+ return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
40
+
41
+
24
42
  SupportedElements = Literal[
25
43
  "a",
26
44
  "abbr",
@@ -137,7 +155,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
137
155
  """
138
156
 
139
157
  def implementation(*, tag: Tag, text: str) -> str:
140
- # Check if we're in a code context - if so, don't apply markup
141
158
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
142
159
 
143
160
  if _has_ancestor(tag, ["pre", "code", "kbd", "samp"]):
@@ -151,7 +168,6 @@ def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
151
168
  markup_suffix = "</" + markup_prefix[1:]
152
169
 
153
170
  prefix, suffix, text = chomp(text)
154
-
155
171
  return f"{prefix}{markup_prefix}{text}{markup_suffix}{suffix}"
156
172
 
157
173
  return cast("Callable[[Tag, str], str]", implementation)
@@ -191,25 +207,35 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
191
207
  if not text:
192
208
  return ""
193
209
 
194
- # Handle cite attribute
210
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
211
+
195
212
  cite_url = tag.get("cite")
196
- quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
213
+
214
+ # Check if this blockquote is inside a list item
215
+ if _has_ancestor(tag, "li"):
216
+ # Indent the blockquote by 4 spaces
217
+ lines = text.strip().split("\n")
218
+ indented_lines = [f" > {line}" if line.strip() else "" for line in lines]
219
+ quote_text = "\n".join(indented_lines) + "\n\n"
220
+ else:
221
+ quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
197
222
 
198
223
  if cite_url:
199
- quote_text += f"— <{cite_url}>\n\n"
224
+ if _has_ancestor(tag, "li"):
225
+ quote_text += f" — <{cite_url}>\n\n"
226
+ else:
227
+ quote_text += f"— <{cite_url}>\n\n"
200
228
 
201
229
  return quote_text
202
230
 
203
231
 
204
232
  def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
205
- # Convert br to line break, but handle headings specially
206
233
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
207
234
 
208
235
  if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
209
- return " " # Convert to space in headings
236
+ return " "
210
237
 
211
- # Always convert br to line break in other contexts
212
- _ = convert_as_inline # Unused but kept for API consistency
238
+ _ = convert_as_inline
213
239
  return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
214
240
 
215
241
 
@@ -247,9 +273,9 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
247
273
  height = height if isinstance(height, str) else ""
248
274
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
249
275
  parent_name = tag.parent.name if tag.parent else ""
250
- # Always preserve images in table cells (td, th) by default
251
- default_preserve_in = ["td", "th"]
252
- preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
276
+
277
+ default_preserve_in = {"td", "th"}
278
+ preserve_in = set(keep_inline_images_in or []) | default_preserve_in
253
279
  if convert_as_inline and parent_name not in preserve_in:
254
280
  return alt
255
281
  if width or height:
@@ -258,35 +284,52 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
258
284
 
259
285
 
260
286
  def _convert_list(*, tag: Tag, text: str) -> str:
261
- nested = False
287
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
262
288
 
263
289
  before_paragraph = False
264
290
  if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
265
291
  before_paragraph = True
266
292
 
267
- while tag:
268
- if tag.name == "li":
269
- nested = True
270
- break
271
-
272
- if not tag.parent:
273
- break
274
-
275
- tag = tag.parent
276
-
277
- if nested:
278
- return "\n" + indent(text=text, level=1).rstrip()
293
+ # Check if this list is inside a list item
294
+ if _has_ancestor(tag, "li"):
295
+ # This is a nested list - needs indentation
296
+ # But we need to check if it's the first element after a paragraph
297
+ parent = tag.parent
298
+ while parent and parent.name != "li":
299
+ parent = parent.parent
300
+
301
+ if parent:
302
+ # Check if there's a paragraph before this list
303
+ prev_p = None
304
+ for child in parent.children:
305
+ if hasattr(child, "name"):
306
+ if child == tag:
307
+ break
308
+ if child.name == "p":
309
+ prev_p = child
310
+
311
+ if prev_p:
312
+ # If there's a paragraph before, we need proper indentation
313
+ lines = text.strip().split("\n")
314
+ indented_lines = []
315
+ for line in lines:
316
+ if line.strip():
317
+ indented_lines.append(f" {line}")
318
+ else:
319
+ indented_lines.append("")
320
+ return "\n" + "\n".join(indented_lines) + "\n"
321
+ # Otherwise use the original tab indentation
322
+ return "\n" + indent(text=text, level=1).rstrip()
279
323
 
280
324
  return text + ("\n" if before_paragraph else "")
281
325
 
282
326
 
283
327
  def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
284
- # Check for task list (checkbox input)
285
328
  checkbox = tag.find("input", {"type": "checkbox"})
286
329
  if checkbox and isinstance(checkbox, Tag):
287
330
  checked = checkbox.get("checked") is not None
288
331
  checkbox_symbol = "[x]" if checked else "[ ]"
289
- # Remove the checkbox from the text content
332
+
290
333
  checkbox_text = text
291
334
  if checkbox.string:
292
335
  checkbox_text = text.replace(str(checkbox.string), "").strip()
@@ -311,10 +354,38 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
311
354
  tag = tag.parent
312
355
 
313
356
  bullet = bullets[depth % len(bullets)]
357
+
358
+ # Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
359
+ has_block_children = any(
360
+ child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
361
+ for child in tag.children
362
+ if hasattr(child, "name")
363
+ )
364
+
365
+ if has_block_children:
366
+ # Handle multi-paragraph list items
367
+ # Split by double newlines (paragraph separators)
368
+ paragraphs = text.strip().split("\n\n")
369
+
370
+ if paragraphs:
371
+ # First paragraph goes directly after the bullet
372
+ result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
373
+
374
+ # Subsequent paragraphs need to be indented and separated by blank lines
375
+ for para in paragraphs[1:]:
376
+ if para.strip():
377
+ # Add blank line before the paragraph
378
+ result_parts.append("\n")
379
+ # Indent each line of the paragraph by 4 spaces
380
+ result_parts.extend(f" {line}\n" for line in para.strip().split("\n") if line.strip())
381
+
382
+ return "".join(result_parts)
383
+
384
+ # Simple case: no block elements, just inline content
314
385
  return "{} {}\n".format(bullet, (text or "").strip())
315
386
 
316
387
 
317
- def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int) -> str:
388
+ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
318
389
  if convert_as_inline:
319
390
  return text
320
391
 
@@ -326,6 +397,30 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
326
397
  break_on_hyphens=False,
327
398
  )
328
399
 
400
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
401
+
402
+ # Check if this paragraph is inside a list item
403
+ if _has_ancestor(tag, "li"):
404
+ # Check if this is the first paragraph in the list item
405
+ parent = tag.parent
406
+ while parent and parent.name != "li":
407
+ parent = parent.parent
408
+
409
+ if parent:
410
+ # Get all direct children that are paragraphs
411
+ p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
412
+
413
+ # If this is not the first paragraph, indent it
414
+ if p_children and tag != p_children[0]:
415
+ # Indent all lines by 4 spaces
416
+ indented_lines = []
417
+ for line in text.split("\n"):
418
+ if line.strip():
419
+ indented_lines.append(f" {line}")
420
+ else:
421
+ indented_lines.append("")
422
+ text = "\n".join(indented_lines)
423
+
329
424
  return f"{text}\n\n" if text else ""
330
425
 
331
426
 
@@ -343,13 +438,15 @@ def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -
343
438
  if convert_as_inline:
344
439
  return text
345
440
 
346
- if highlight_style == "double-equal":
347
- return f"=={text}=="
348
- if highlight_style == "bold":
349
- return f"**{text}**"
350
- if highlight_style == "html":
351
- return f"<mark>{text}</mark>"
352
- return text
441
+ match highlight_style:
442
+ case "double-equal":
443
+ return f"=={text}=="
444
+ case "bold":
445
+ return f"**{text}**"
446
+ case "html":
447
+ return f"<mark>{text}</mark>"
448
+ case _:
449
+ return text
353
450
 
354
451
 
355
452
  def _convert_pre(
@@ -382,6 +479,58 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
382
479
  cells = tag.find_all(["td", "th"])
383
480
  parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
384
481
  tag_grand_parent = tag.parent.parent if tag.parent else None
482
+
483
+ # Simple rowspan handling: if previous row had cells with rowspan, add empty cells
484
+ if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
485
+ prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
486
+ rowspan_positions = []
487
+ col_pos = 0
488
+
489
+ # Check which cells in previous row have rowspan > 1
490
+ for prev_cell in prev_cells:
491
+ rowspan = 1
492
+ if (
493
+ "rowspan" in prev_cell.attrs
494
+ and isinstance(prev_cell["rowspan"], str)
495
+ and prev_cell["rowspan"].isdigit()
496
+ ):
497
+ rowspan = int(prev_cell["rowspan"])
498
+
499
+ if rowspan > 1:
500
+ # This cell spans into current row
501
+ rowspan_positions.append(col_pos)
502
+
503
+ # Account for colspan
504
+ colspan = 1
505
+ if (
506
+ "colspan" in prev_cell.attrs
507
+ and isinstance(prev_cell["colspan"], str)
508
+ and prev_cell["colspan"].isdigit()
509
+ ):
510
+ colspan = int(prev_cell["colspan"])
511
+ col_pos += colspan
512
+
513
+ # If there are rowspan cells from previous row, add empty cells
514
+ if rowspan_positions:
515
+ # Build new text with empty cells inserted
516
+ new_cells = []
517
+ cell_index = 0
518
+
519
+ for pos in range(col_pos): # Total columns
520
+ if pos in rowspan_positions:
521
+ # Add empty cell for rowspan
522
+ new_cells.append(" |")
523
+ elif cell_index < len(cells):
524
+ # Add actual cell content
525
+ cell = cells[cell_index]
526
+ cell_text = cell.get_text().strip().replace("\n", " ")
527
+ colspan = _get_colspan(cell)
528
+ new_cells.append(f" {cell_text} |" * colspan)
529
+ cell_index += 1
530
+
531
+ # Override text with new cell arrangement
532
+ text = "".join(new_cells)
533
+
385
534
  is_headrow = (
386
535
  all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
387
536
  or (not tag.previous_sibling and parent_name != "tbody")
@@ -429,7 +578,7 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
429
578
  if not text.strip():
430
579
  return ""
431
580
 
432
- return f"*{text.strip()}*\n\n"
581
+ return _format_wrapped_block(text, "*")
433
582
 
434
583
 
435
584
  def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
@@ -481,7 +630,10 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
481
630
 
482
631
 
483
632
  def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
484
- """Convert HTML colgroup element preserving column structure for documentation.
633
+ """Convert HTML colgroup element - removes it entirely from Markdown output.
634
+
635
+ Colgroup is a table column grouping element that defines styling for columns.
636
+ It has no representation in Markdown and should be removed.
485
637
 
486
638
  Args:
487
639
  tag: The colgroup tag element.
@@ -489,54 +641,30 @@ def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
489
641
  convert_as_inline: Whether to convert as inline content.
490
642
 
491
643
  Returns:
492
- The converted markdown text preserving colgroup structure.
644
+ Empty string as colgroup has no Markdown representation.
493
645
  """
494
- if convert_as_inline:
495
- return text
496
-
497
- if not text.strip():
498
- return ""
499
-
500
- span = tag.get("span", "")
501
- attrs = []
502
- if span and isinstance(span, str) and span.strip():
503
- attrs.append(f'span="{span}"')
504
-
505
- attrs_str = " ".join(attrs)
506
- if attrs_str:
507
- return f"<colgroup {attrs_str}>\n{text.strip()}\n</colgroup>\n\n"
508
- return f"<colgroup>\n{text.strip()}\n</colgroup>\n\n"
646
+ _ = tag, text, convert_as_inline
647
+ # Colgroup and its contents (col elements) are purely presentational
648
+ # and have no equivalent in Markdown tables
649
+ return ""
509
650
 
510
651
 
511
652
  def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
512
- """Convert HTML col element preserving column attributes for documentation.
653
+ """Convert HTML col element - removes it entirely from Markdown output.
654
+
655
+ Col elements define column properties (width, style) in HTML tables.
656
+ They have no representation in Markdown and should be removed.
513
657
 
514
658
  Args:
515
659
  tag: The col tag element.
516
660
  convert_as_inline: Whether to convert as inline content.
517
661
 
518
662
  Returns:
519
- The converted markdown text preserving col structure.
663
+ Empty string as col has no Markdown representation.
520
664
  """
521
- if convert_as_inline:
522
- return ""
523
-
524
- span = tag.get("span", "")
525
- width = tag.get("width", "")
526
- style = tag.get("style", "")
527
-
528
- attrs = []
529
- if width and isinstance(width, str) and width.strip():
530
- attrs.append(f'width="{width}"')
531
- if style and isinstance(style, str) and style.strip():
532
- attrs.append(f'style="{style}"')
533
- if span and isinstance(span, str) and span.strip():
534
- attrs.append(f'span="{span}"')
535
-
536
- attrs_str = " ".join(attrs)
537
- if attrs_str:
538
- return f"<col {attrs_str} />\n"
539
- return "<col />\n"
665
+ _ = tag, convert_as_inline
666
+ # Col elements are self-closing and purely presentational
667
+ return ""
540
668
 
541
669
 
542
670
  def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
@@ -556,35 +684,37 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
556
684
 
557
685
 
558
686
  def _convert_details(*, text: str, convert_as_inline: bool) -> str:
559
- """Convert HTML details element preserving HTML structure.
687
+ """Convert HTML details element to semantic Markdown.
560
688
 
561
689
  Args:
562
690
  text: The text content of the details element.
563
691
  convert_as_inline: Whether to convert as inline content.
564
692
 
565
693
  Returns:
566
- The converted markdown text preserving HTML structure.
694
+ The converted markdown text (only content, no HTML tags).
567
695
  """
568
696
  if convert_as_inline:
569
697
  return text
570
698
 
571
- return f"<details>\n{text.strip()}\n</details>\n\n" if text.strip() else ""
699
+ # Details is a semantic container, return its content
700
+ return _format_block_element(text)
572
701
 
573
702
 
574
703
  def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
575
- """Convert HTML summary element preserving HTML structure.
704
+ """Convert HTML summary element to emphasized text.
576
705
 
577
706
  Args:
578
707
  text: The text content of the summary element.
579
708
  convert_as_inline: Whether to convert as inline content.
580
709
 
581
710
  Returns:
582
- The converted markdown text preserving HTML structure.
711
+ The converted markdown text as bold heading.
583
712
  """
584
713
  if convert_as_inline:
585
714
  return text
586
715
 
587
- return f"<summary>{text.strip()}</summary>\n\n" if text.strip() else ""
716
+ # Summary is like a heading/title
717
+ return _format_wrapped_block(text, "**")
588
718
 
589
719
 
590
720
  def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
@@ -676,134 +806,46 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
676
806
  if not text.strip():
677
807
  return ""
678
808
 
679
- # Escape any existing quotes in the text
680
809
  escaped_text = text.strip().replace('"', '\\"')
681
810
  return f'"{escaped_text}"'
682
811
 
683
812
 
684
- def _convert_audio(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
685
- """Convert HTML audio element preserving structure with fallback.
813
+ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
814
+ """Convert HTML media elements (audio/video) to semantic Markdown.
686
815
 
687
816
  Args:
688
- tag: The audio tag element.
689
- text: The text content of the audio element (fallback content).
817
+ tag: The media tag element.
818
+ text: The text content of the media element (fallback content).
690
819
  convert_as_inline: Whether to convert as inline content.
691
820
 
692
821
  Returns:
693
- The converted markdown text preserving audio element.
822
+ The converted markdown text (link if src exists, otherwise fallback content).
694
823
  """
695
- _ = convert_as_inline # Unused but kept for API consistency
696
824
  src = tag.get("src", "")
697
825
 
698
- # Check for source elements if no src attribute
699
- if not src:
700
- source_tag = tag.find("source")
701
- if source_tag and isinstance(source_tag, Tag):
702
- src = source_tag.get("src", "")
703
-
704
- # Get other attributes
705
- controls = "controls" if tag.get("controls") is not None else ""
706
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
707
- loop = "loop" if tag.get("loop") is not None else ""
708
- muted = "muted" if tag.get("muted") is not None else ""
709
- preload = tag.get("preload", "")
710
-
711
- # Build attributes string
712
- attrs = []
713
- if src and isinstance(src, str) and src.strip():
714
- attrs.append(f'src="{src}"')
715
- if controls:
716
- attrs.append(controls)
717
- if autoplay:
718
- attrs.append(autoplay)
719
- if loop:
720
- attrs.append(loop)
721
- if muted:
722
- attrs.append(muted)
723
- if preload and isinstance(preload, str) and preload.strip():
724
- attrs.append(f'preload="{preload}"')
725
-
726
- attrs_str = " ".join(attrs)
727
-
728
- # If there's fallback content, preserve it
729
- if text.strip():
730
- if attrs_str:
731
- return f"<audio {attrs_str}>\n{text.strip()}\n</audio>\n\n"
732
- return f"<audio>\n{text.strip()}\n</audio>\n\n"
826
+ if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
827
+ src = source_tag.get("src", "")
733
828
 
734
- # Self-closing for no fallback content
735
- if attrs_str:
736
- return f"<audio {attrs_str} />\n\n"
737
- return "<audio />\n\n"
738
-
739
-
740
- def _convert_video(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
741
- """Convert HTML video element preserving structure with fallback.
742
-
743
- Args:
744
- tag: The video tag element.
745
- text: The text content of the video element (fallback content).
746
- convert_as_inline: Whether to convert as inline content.
747
-
748
- Returns:
749
- The converted markdown text preserving video element.
750
- """
751
- _ = convert_as_inline # Unused but kept for API consistency
752
- src = tag.get("src", "")
753
-
754
- # Check for source elements if no src attribute
755
- if not src:
756
- source_tag = tag.find("source")
757
- if source_tag and isinstance(source_tag, Tag):
758
- src = source_tag.get("src", "")
759
-
760
- # Get other attributes
761
- width = tag.get("width", "")
762
- height = tag.get("height", "")
763
- poster = tag.get("poster", "")
764
- controls = "controls" if tag.get("controls") is not None else ""
765
- autoplay = "autoplay" if tag.get("autoplay") is not None else ""
766
- loop = "loop" if tag.get("loop") is not None else ""
767
- muted = "muted" if tag.get("muted") is not None else ""
768
- preload = tag.get("preload", "")
769
-
770
- # Build attributes string
771
- attrs = []
829
+ # If we have a src, convert to a link
772
830
  if src and isinstance(src, str) and src.strip():
773
- attrs.append(f'src="{src}"')
774
- if width and isinstance(width, str) and width.strip():
775
- attrs.append(f'width="{width}"')
776
- if height and isinstance(height, str) and height.strip():
777
- attrs.append(f'height="{height}"')
778
- if poster and isinstance(poster, str) and poster.strip():
779
- attrs.append(f'poster="{poster}"')
780
- if controls:
781
- attrs.append(controls)
782
- if autoplay:
783
- attrs.append(autoplay)
784
- if loop:
785
- attrs.append(loop)
786
- if muted:
787
- attrs.append(muted)
788
- if preload and isinstance(preload, str) and preload.strip():
789
- attrs.append(f'preload="{preload}"')
790
-
791
- attrs_str = " ".join(attrs)
792
-
793
- # If there's fallback content, preserve it
831
+ link = f"[{src}]({src})"
832
+ if convert_as_inline:
833
+ return link
834
+ result = f"{link}\n\n"
835
+ # Add fallback content if present
836
+ if text.strip():
837
+ result += f"{text.strip()}\n\n"
838
+ return result
839
+
840
+ # No src, just return fallback content
794
841
  if text.strip():
795
- if attrs_str:
796
- return f"<video {attrs_str}>\n{text.strip()}\n</video>\n\n"
797
- return f"<video>\n{text.strip()}\n</video>\n\n"
842
+ return _format_inline_or_block(text, convert_as_inline)
798
843
 
799
- # Self-closing for no fallback content
800
- if attrs_str:
801
- return f"<video {attrs_str} />\n\n"
802
- return "<video />\n\n"
844
+ return ""
803
845
 
804
846
 
805
847
  def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
806
- """Convert HTML iframe element preserving structure.
848
+ """Convert HTML iframe element to semantic Markdown.
807
849
 
808
850
  Args:
809
851
  tag: The iframe tag element.
@@ -811,51 +853,19 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
811
853
  convert_as_inline: Whether to convert as inline content.
812
854
 
813
855
  Returns:
814
- The converted markdown text preserving iframe element.
856
+ The converted markdown text (link if src exists).
815
857
  """
816
- _ = text # Unused but kept for API consistency
817
- _ = convert_as_inline # Unused but kept for API consistency
858
+ _ = text
818
859
  src = tag.get("src", "")
819
- width = tag.get("width", "")
820
- height = tag.get("height", "")
821
- title = tag.get("title", "")
822
- allow = tag.get("allow", "")
823
- sandbox = tag.get("sandbox") # Don't provide default
824
- loading = tag.get("loading", "")
825
-
826
- # Build attributes string
827
- attrs = []
828
- if src and isinstance(src, str) and src.strip():
829
- attrs.append(f'src="{src}"')
830
- if width and isinstance(width, str) and width.strip():
831
- attrs.append(f'width="{width}"')
832
- if height and isinstance(height, str) and height.strip():
833
- attrs.append(f'height="{height}"')
834
- if title and isinstance(title, str) and title.strip():
835
- attrs.append(f'title="{title}"')
836
- if allow and isinstance(allow, str) and allow.strip():
837
- attrs.append(f'allow="{allow}"')
838
- if sandbox is not None:
839
- if isinstance(sandbox, list):
840
- # BeautifulSoup returns AttributeValueList for space-separated values
841
- if sandbox:
842
- attrs.append(f'sandbox="{" ".join(sandbox)}"')
843
- else:
844
- # Empty list means boolean attribute
845
- attrs.append("sandbox")
846
- elif isinstance(sandbox, str) and sandbox:
847
- attrs.append(f'sandbox="{sandbox}"')
848
- else:
849
- attrs.append("sandbox")
850
- if loading and isinstance(loading, str) and loading.strip():
851
- attrs.append(f'loading="{loading}"')
852
860
 
853
- attrs_str = " ".join(attrs)
861
+ # If we have a src, convert to a link
862
+ if src and isinstance(src, str) and src.strip():
863
+ link = f"[{src}]({src})"
864
+ if convert_as_inline:
865
+ return link
866
+ return f"{link}\n\n"
854
867
 
855
- # iframes are typically self-closing in usage
856
- if attrs_str:
857
- return f"<iframe {attrs_str}></iframe>\n\n"
858
- return "<iframe></iframe>\n\n"
868
+ return ""
859
869
 
860
870
 
861
871
  def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
@@ -869,20 +879,19 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
869
879
  Returns:
870
880
  The converted markdown text with optional title annotation.
871
881
  """
872
- _ = convert_as_inline # Unused but kept for API consistency
882
+ _ = convert_as_inline
873
883
  if not text.strip():
874
884
  return ""
875
885
 
876
886
  title = tag.get("title")
877
887
  if title and isinstance(title, str) and title.strip():
878
- # Show abbreviation with title in parentheses
879
888
  return f"{text.strip()} ({title.strip()})"
880
889
 
881
890
  return text.strip()
882
891
 
883
892
 
884
893
  def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
885
- """Convert HTML time element preserving datetime attribute.
894
+ """Convert HTML time element to semantic Markdown.
886
895
 
887
896
  Args:
888
897
  tag: The time tag element.
@@ -890,22 +899,19 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
890
899
  convert_as_inline: Whether to convert as inline content.
891
900
 
892
901
  Returns:
893
- The converted markdown text preserving time information.
902
+ The converted markdown text (content only, no HTML tags).
894
903
  """
895
- _ = convert_as_inline # Unused but kept for API consistency
904
+ _ = tag
905
+ _ = convert_as_inline
896
906
  if not text.strip():
897
907
  return ""
898
908
 
899
- datetime_attr = tag.get("datetime")
900
- if datetime_attr and isinstance(datetime_attr, str) and datetime_attr.strip():
901
- # Preserve machine-readable datetime in HTML
902
- return f'<time datetime="{datetime_attr.strip()}">{text.strip()}</time>'
903
-
909
+ # Time elements are semantic - just return the content
904
910
  return text.strip()
905
911
 
906
912
 
907
913
  def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
908
- """Convert HTML data element preserving value attribute.
914
+ """Convert HTML data element to semantic Markdown.
909
915
 
910
916
  Args:
911
917
  tag: The data tag element.
@@ -913,17 +919,14 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
913
919
  convert_as_inline: Whether to convert as inline content.
914
920
 
915
921
  Returns:
916
- The converted markdown text preserving machine-readable data.
922
+ The converted markdown text (content only, no HTML tags).
917
923
  """
918
- _ = convert_as_inline # Unused but kept for API consistency
924
+ _ = tag
925
+ _ = convert_as_inline
919
926
  if not text.strip():
920
927
  return ""
921
928
 
922
- value_attr = tag.get("value")
923
- if value_attr and isinstance(value_attr, str) and value_attr.strip():
924
- # Preserve machine-readable value in HTML
925
- return f'<data value="{value_attr.strip()}">{text.strip()}</data>'
926
-
929
+ # Data elements are semantic - just return the content
927
930
  return text.strip()
928
931
 
929
932
 
@@ -936,12 +939,12 @@ def _convert_wbr(*, convert_as_inline: bool) -> str:
936
939
  Returns:
937
940
  Empty string as wbr is just a break opportunity.
938
941
  """
939
- _ = convert_as_inline # Unused but kept for API consistency
940
- return "" # Word break opportunity doesn't produce visible output
942
+ _ = convert_as_inline
943
+ return ""
941
944
 
942
945
 
943
946
  def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
944
- """Convert HTML form element preserving structure for documentation.
947
+ """Convert HTML form element to semantic Markdown.
945
948
 
946
949
  Args:
947
950
  tag: The form tag element.
@@ -949,38 +952,28 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
949
952
  convert_as_inline: Whether to convert as inline content.
950
953
 
951
954
  Returns:
952
- The converted markdown text preserving form structure.
955
+ The converted markdown text (only content, no HTML tags).
953
956
  """
957
+ _ = tag
954
958
  if convert_as_inline:
955
959
  return text
956
960
 
957
961
  if not text.strip():
958
962
  return ""
959
963
 
960
- action = tag.get("action", "")
961
- method = tag.get("method", "")
962
- attrs = []
963
-
964
- if action and isinstance(action, str) and action.strip():
965
- attrs.append(f'action="{action.strip()}"')
966
- if method and isinstance(method, str) and method.strip():
967
- attrs.append(f'method="{method.strip()}"')
968
-
969
- attrs_str = " ".join(attrs)
970
- if attrs_str:
971
- return f"<form {attrs_str}>\n{text.strip()}\n</form>\n\n"
972
- return f"<form>\n{text.strip()}\n</form>\n\n"
964
+ # Forms are just containers, return their content
965
+ return text
973
966
 
974
967
 
975
968
  def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
976
- """Convert HTML fieldset element preserving structure.
969
+ """Convert HTML fieldset element to semantic Markdown.
977
970
 
978
971
  Args:
979
972
  text: The text content of the fieldset element.
980
973
  convert_as_inline: Whether to convert as inline content.
981
974
 
982
975
  Returns:
983
- The converted markdown text preserving fieldset structure.
976
+ The converted markdown text (only content, no HTML tags).
984
977
  """
985
978
  if convert_as_inline:
986
979
  return text
@@ -988,7 +981,8 @@ def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
988
981
  if not text.strip():
989
982
  return ""
990
983
 
991
- return f"<fieldset>\n{text.strip()}\n</fieldset>\n\n"
984
+ # Fieldsets are semantic groupings, return their content
985
+ return text
992
986
 
993
987
 
994
988
  def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
@@ -1007,11 +1001,12 @@ def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
1007
1001
  if not text.strip():
1008
1002
  return ""
1009
1003
 
1010
- return f"<legend>{text.strip()}</legend>\n\n"
1004
+ # Legend is like a heading/title for fieldsets
1005
+ return _format_wrapped_block(text, "**")
1011
1006
 
1012
1007
 
1013
1008
  def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1014
- """Convert HTML label element preserving for attribute.
1009
+ """Convert HTML label element to Markdown.
1015
1010
 
1016
1011
  Args:
1017
1012
  tag: The label tag element.
@@ -1019,80 +1014,33 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1019
1014
  convert_as_inline: Whether to convert as inline content.
1020
1015
 
1021
1016
  Returns:
1022
- The converted markdown text preserving label structure.
1017
+ The label text content.
1023
1018
  """
1024
- if convert_as_inline:
1025
- return text
1026
-
1019
+ _ = tag
1020
+ # Labels are just text, return the content
1027
1021
  if not text.strip():
1028
1022
  return ""
1029
1023
 
1030
- for_attr = tag.get("for")
1031
- if for_attr and isinstance(for_attr, str) and for_attr.strip():
1032
- return f'<label for="{for_attr.strip()}">{text.strip()}</label>\n\n'
1033
-
1034
- return f"<label>{text.strip()}</label>\n\n"
1024
+ return _format_inline_or_block(text, convert_as_inline)
1035
1025
 
1036
1026
 
1037
1027
  def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
1038
- """Convert HTML input element preserving all relevant attributes.
1028
+ """Convert HTML input element to Markdown.
1039
1029
 
1040
1030
  Args:
1041
1031
  tag: The input tag element.
1042
1032
  convert_as_inline: Whether to convert as inline content.
1043
1033
 
1044
1034
  Returns:
1045
- The converted markdown text preserving input structure.
1035
+ Empty string since input elements have no Markdown representation.
1046
1036
  """
1047
- input_type = tag.get("type", "text")
1048
-
1049
- # Special handling for inputs in list items - let _convert_li handle checkboxes
1050
- # and ignore other input types in list items (legacy behavior)
1051
- from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
1052
-
1053
- if _has_ancestor(tag, "li"):
1054
- return ""
1055
-
1056
- id_attr = tag.get("id", "")
1057
- name = tag.get("name", "")
1058
- value = tag.get("value", "")
1059
- placeholder = tag.get("placeholder", "")
1060
- required = tag.get("required") is not None
1061
- disabled = tag.get("disabled") is not None
1062
- readonly = tag.get("readonly") is not None
1063
- checked = tag.get("checked") is not None
1064
- accept = tag.get("accept", "")
1065
-
1066
- attrs = []
1067
- if input_type and isinstance(input_type, str):
1068
- attrs.append(f'type="{input_type}"')
1069
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1070
- attrs.append(f'id="{id_attr}"')
1071
- if name and isinstance(name, str) and name.strip():
1072
- attrs.append(f'name="{name}"')
1073
- if value and isinstance(value, str) and value.strip():
1074
- attrs.append(f'value="{value}"')
1075
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1076
- attrs.append(f'placeholder="{placeholder}"')
1077
- if accept and isinstance(accept, str) and accept.strip():
1078
- attrs.append(f'accept="{accept}"')
1079
- if required:
1080
- attrs.append("required")
1081
- if disabled:
1082
- attrs.append("disabled")
1083
- if readonly:
1084
- attrs.append("readonly")
1085
- if checked:
1086
- attrs.append("checked")
1087
-
1088
- attrs_str = " ".join(attrs)
1089
- result = f"<input {attrs_str} />" if attrs_str else "<input />"
1090
-
1091
- return result if convert_as_inline else f"{result}\n\n"
1037
+ _ = tag, convert_as_inline
1038
+ # Input elements have no content and no Markdown equivalent
1039
+ return ""
1092
1040
 
1093
1041
 
1094
1042
  def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1095
- """Convert HTML textarea element preserving attributes.
1043
+ """Convert HTML textarea element to Markdown.
1096
1044
 
1097
1045
  Args:
1098
1046
  tag: The textarea tag element.
@@ -1100,42 +1048,18 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1100
1048
  convert_as_inline: Whether to convert as inline content.
1101
1049
 
1102
1050
  Returns:
1103
- The converted markdown text preserving textarea structure.
1051
+ The text content of the textarea.
1104
1052
  """
1105
- if convert_as_inline:
1106
- return text
1107
-
1053
+ _ = tag
1054
+ # Return the text content, which is what the user entered
1108
1055
  if not text.strip():
1109
1056
  return ""
1110
1057
 
1111
- name = tag.get("name", "")
1112
- placeholder = tag.get("placeholder", "")
1113
- rows = tag.get("rows", "")
1114
- cols = tag.get("cols", "")
1115
- required = tag.get("required") is not None
1116
-
1117
- attrs = []
1118
- if name and isinstance(name, str) and name.strip():
1119
- attrs.append(f'name="{name}"')
1120
- if placeholder and isinstance(placeholder, str) and placeholder.strip():
1121
- attrs.append(f'placeholder="{placeholder}"')
1122
- if rows and isinstance(rows, str) and rows.strip():
1123
- attrs.append(f'rows="{rows}"')
1124
- if cols and isinstance(cols, str) and cols.strip():
1125
- attrs.append(f'cols="{cols}"')
1126
- if required:
1127
- attrs.append("required")
1128
-
1129
- attrs_str = " ".join(attrs)
1130
- content = text.strip()
1131
-
1132
- if attrs_str:
1133
- return f"<textarea {attrs_str}>{content}</textarea>\n\n"
1134
- return f"<textarea>{content}</textarea>\n\n"
1058
+ return _format_inline_or_block(text, convert_as_inline)
1135
1059
 
1136
1060
 
1137
1061
  def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1138
- """Convert HTML select element preserving structure.
1062
+ """Convert HTML select element to Markdown.
1139
1063
 
1140
1064
  Args:
1141
1065
  tag: The select tag element.
@@ -1143,39 +1067,25 @@ def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1143
1067
  convert_as_inline: Whether to convert as inline content.
1144
1068
 
1145
1069
  Returns:
1146
- The converted markdown text preserving select structure.
1070
+ The text content (options) as a comma-separated list.
1147
1071
  """
1148
- if convert_as_inline:
1149
- return text
1150
-
1072
+ _ = tag
1073
+ # Return the options as text
1151
1074
  if not text.strip():
1152
1075
  return ""
1153
1076
 
1154
- id_attr = tag.get("id", "")
1155
- name = tag.get("name", "")
1156
- multiple = tag.get("multiple") is not None
1157
- required = tag.get("required") is not None
1158
-
1159
- attrs = []
1160
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1161
- attrs.append(f'id="{id_attr}"')
1162
- if name and isinstance(name, str) and name.strip():
1163
- attrs.append(f'name="{name}"')
1164
- if multiple:
1165
- attrs.append("multiple")
1166
- if required:
1167
- attrs.append("required")
1168
-
1169
- attrs_str = " ".join(attrs)
1170
- content = text.strip()
1077
+ # In inline mode, show options separated by commas
1078
+ if convert_as_inline:
1079
+ # Remove extra whitespace and join options
1080
+ options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
1081
+ return ", ".join(options)
1171
1082
 
1172
- if attrs_str:
1173
- return f"<select {attrs_str}>\n{content}\n</select>\n\n"
1174
- return f"<select>\n{content}\n</select>\n\n"
1083
+ # In block mode, show as a list
1084
+ return _format_block_element(text)
1175
1085
 
1176
1086
 
1177
1087
  def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1178
- """Convert HTML option element preserving value and selected state.
1088
+ """Convert HTML option element to Markdown.
1179
1089
 
1180
1090
  Args:
1181
1091
  tag: The option tag element.
@@ -1183,33 +1093,26 @@ def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1183
1093
  convert_as_inline: Whether to convert as inline content.
1184
1094
 
1185
1095
  Returns:
1186
- The converted markdown text preserving option structure.
1096
+ The option text, potentially with a marker if selected.
1187
1097
  """
1188
- if convert_as_inline:
1189
- return text
1190
-
1191
1098
  if not text.strip():
1192
1099
  return ""
1193
1100
 
1194
- value = tag.get("value", "")
1101
+ # Check if this option is selected
1195
1102
  selected = tag.get("selected") is not None
1196
-
1197
- attrs = []
1198
- if value and isinstance(value, str) and value.strip():
1199
- attrs.append(f'value="{value}"')
1200
- if selected:
1201
- attrs.append("selected")
1202
-
1203
- attrs_str = " ".join(attrs)
1204
1103
  content = text.strip()
1205
1104
 
1206
- if attrs_str:
1207
- return f"<option {attrs_str}>{content}</option>\n"
1208
- return f"<option>{content}</option>\n"
1105
+ if convert_as_inline:
1106
+ return content
1107
+
1108
+ # In block mode, mark selected options
1109
+ if selected:
1110
+ return f"* {content}\n"
1111
+ return f"{content}\n"
1209
1112
 
1210
1113
 
1211
1114
  def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1212
- """Convert HTML optgroup element preserving label.
1115
+ """Convert HTML optgroup element to semantic Markdown.
1213
1116
 
1214
1117
  Args:
1215
1118
  tag: The optgroup tag element.
@@ -1217,7 +1120,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1217
1120
  convert_as_inline: Whether to convert as inline content.
1218
1121
 
1219
1122
  Returns:
1220
- The converted markdown text preserving optgroup structure.
1123
+ The converted markdown text with label as heading.
1221
1124
  """
1222
1125
  if convert_as_inline:
1223
1126
  return text
@@ -1226,21 +1129,17 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1226
1129
  return ""
1227
1130
 
1228
1131
  label = tag.get("label", "")
1132
+ content = text.strip()
1229
1133
 
1230
- attrs = []
1134
+ # If there's a label, show it as a heading
1231
1135
  if label and isinstance(label, str) and label.strip():
1232
- attrs.append(f'label="{label}"')
1136
+ return f"**{label.strip()}**\n{content}\n"
1233
1137
 
1234
- attrs_str = " ".join(attrs)
1235
- content = text.strip()
1236
-
1237
- if attrs_str:
1238
- return f"<optgroup {attrs_str}>\n{content}\n</optgroup>\n"
1239
- return f"<optgroup>\n{content}\n</optgroup>\n"
1138
+ return f"{content}\n"
1240
1139
 
1241
1140
 
1242
1141
  def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1243
- """Convert HTML button element preserving type and attributes.
1142
+ """Convert HTML button element to Markdown.
1244
1143
 
1245
1144
  Args:
1246
1145
  tag: The button tag element.
@@ -1248,38 +1147,18 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1248
1147
  convert_as_inline: Whether to convert as inline content.
1249
1148
 
1250
1149
  Returns:
1251
- The converted markdown text preserving button structure.
1150
+ The button text content.
1252
1151
  """
1253
- if convert_as_inline:
1254
- return text
1255
-
1152
+ _ = tag
1153
+ # Buttons are just interactive text, return the text content
1256
1154
  if not text.strip():
1257
1155
  return ""
1258
1156
 
1259
- button_type = tag.get("type", "")
1260
- name = tag.get("name", "")
1261
- value = tag.get("value", "")
1262
- disabled = tag.get("disabled") is not None
1263
-
1264
- attrs = []
1265
- if button_type and isinstance(button_type, str) and button_type.strip():
1266
- attrs.append(f'type="{button_type}"')
1267
- if name and isinstance(name, str) and name.strip():
1268
- attrs.append(f'name="{name}"')
1269
- if value and isinstance(value, str) and value.strip():
1270
- attrs.append(f'value="{value}"')
1271
- if disabled:
1272
- attrs.append("disabled")
1273
-
1274
- attrs_str = " ".join(attrs)
1275
-
1276
- if attrs_str:
1277
- return f"<button {attrs_str}>{text.strip()}</button>\n\n"
1278
- return f"<button>{text.strip()}</button>\n\n"
1157
+ return _format_inline_or_block(text, convert_as_inline)
1279
1158
 
1280
1159
 
1281
1160
  def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1282
- """Convert HTML progress element preserving value and max.
1161
+ """Convert HTML progress element to semantic text.
1283
1162
 
1284
1163
  Args:
1285
1164
  tag: The progress tag element.
@@ -1287,33 +1166,21 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1287
1166
  convert_as_inline: Whether to convert as inline content.
1288
1167
 
1289
1168
  Returns:
1290
- The converted markdown text preserving progress structure.
1169
+ The converted markdown text (only content, no HTML tags).
1291
1170
  """
1171
+ _ = tag
1292
1172
  if convert_as_inline:
1293
1173
  return text
1294
1174
 
1295
1175
  if not text.strip():
1296
1176
  return ""
1297
1177
 
1298
- value = tag.get("value", "")
1299
- max_val = tag.get("max", "")
1300
-
1301
- attrs = []
1302
- if value and isinstance(value, str) and value.strip():
1303
- attrs.append(f'value="{value}"')
1304
- if max_val and isinstance(max_val, str) and max_val.strip():
1305
- attrs.append(f'max="{max_val}"')
1306
-
1307
- attrs_str = " ".join(attrs)
1308
- content = text.strip()
1309
-
1310
- if attrs_str:
1311
- return f"<progress {attrs_str}>{content}</progress>\n\n"
1312
- return f"<progress>{content}</progress>\n\n"
1178
+ # Progress elements convert to their text content
1179
+ return _format_block_element(text)
1313
1180
 
1314
1181
 
1315
1182
  def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1316
- """Convert HTML meter element preserving value and range attributes.
1183
+ """Convert HTML meter element to semantic text.
1317
1184
 
1318
1185
  Args:
1319
1186
  tag: The meter tag element.
@@ -1321,45 +1188,21 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1321
1188
  convert_as_inline: Whether to convert as inline content.
1322
1189
 
1323
1190
  Returns:
1324
- The converted markdown text preserving meter structure.
1191
+ The converted markdown text (only content, no HTML tags).
1325
1192
  """
1193
+ _ = tag
1326
1194
  if convert_as_inline:
1327
1195
  return text
1328
1196
 
1329
1197
  if not text.strip():
1330
1198
  return ""
1331
1199
 
1332
- value = tag.get("value", "")
1333
- min_val = tag.get("min", "")
1334
- max_val = tag.get("max", "")
1335
- low = tag.get("low", "")
1336
- high = tag.get("high", "")
1337
- optimum = tag.get("optimum", "")
1338
-
1339
- attrs = []
1340
- if value and isinstance(value, str) and value.strip():
1341
- attrs.append(f'value="{value}"')
1342
- if min_val and isinstance(min_val, str) and min_val.strip():
1343
- attrs.append(f'min="{min_val}"')
1344
- if max_val and isinstance(max_val, str) and max_val.strip():
1345
- attrs.append(f'max="{max_val}"')
1346
- if low and isinstance(low, str) and low.strip():
1347
- attrs.append(f'low="{low}"')
1348
- if high and isinstance(high, str) and high.strip():
1349
- attrs.append(f'high="{high}"')
1350
- if optimum and isinstance(optimum, str) and optimum.strip():
1351
- attrs.append(f'optimum="{optimum}"')
1352
-
1353
- attrs_str = " ".join(attrs)
1354
- content = text.strip()
1355
-
1356
- if attrs_str:
1357
- return f"<meter {attrs_str}>{content}</meter>\n\n"
1358
- return f"<meter>{content}</meter>\n\n"
1200
+ # Meter elements convert to their text content
1201
+ return _format_block_element(text)
1359
1202
 
1360
1203
 
1361
1204
  def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1362
- """Convert HTML output element preserving for and name attributes.
1205
+ """Convert HTML output element to semantic text.
1363
1206
 
1364
1207
  Args:
1365
1208
  tag: The output tag element.
@@ -1367,35 +1210,21 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1367
1210
  convert_as_inline: Whether to convert as inline content.
1368
1211
 
1369
1212
  Returns:
1370
- The converted markdown text preserving output structure.
1213
+ The converted markdown text (only content, no HTML tags).
1371
1214
  """
1215
+ _ = tag
1372
1216
  if convert_as_inline:
1373
1217
  return text
1374
1218
 
1375
1219
  if not text.strip():
1376
1220
  return ""
1377
1221
 
1378
- for_attr = tag.get("for", "")
1379
- name = tag.get("name", "")
1380
-
1381
- attrs = []
1382
- if for_attr:
1383
- # BeautifulSoup returns space-separated attributes as lists
1384
- for_value = " ".join(for_attr) if isinstance(for_attr, list) else str(for_attr)
1385
- if for_value.strip():
1386
- attrs.append(f'for="{for_value}"')
1387
- if name and isinstance(name, str) and name.strip():
1388
- attrs.append(f'name="{name}"')
1389
-
1390
- attrs_str = " ".join(attrs)
1391
-
1392
- if attrs_str:
1393
- return f"<output {attrs_str}>{text.strip()}</output>\n\n"
1394
- return f"<output>{text.strip()}</output>\n\n"
1222
+ # Output elements convert to their text content
1223
+ return _format_block_element(text)
1395
1224
 
1396
1225
 
1397
1226
  def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1398
- """Convert HTML datalist element preserving structure.
1227
+ """Convert HTML datalist element to semantic Markdown.
1399
1228
 
1400
1229
  Args:
1401
1230
  tag: The datalist tag element.
@@ -1403,26 +1232,17 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
1403
1232
  convert_as_inline: Whether to convert as inline content.
1404
1233
 
1405
1234
  Returns:
1406
- The converted markdown text preserving datalist structure.
1235
+ The converted markdown text (only content, no HTML tags).
1407
1236
  """
1237
+ _ = tag
1408
1238
  if convert_as_inline:
1409
1239
  return text
1410
1240
 
1411
1241
  if not text.strip():
1412
1242
  return ""
1413
1243
 
1414
- id_attr = tag.get("id", "")
1415
-
1416
- attrs = []
1417
- if id_attr and isinstance(id_attr, str) and id_attr.strip():
1418
- attrs.append(f'id="{id_attr}"')
1419
-
1420
- attrs_str = " ".join(attrs)
1421
- content = text.strip()
1422
-
1423
- if attrs_str:
1424
- return f"<datalist {attrs_str}>\n{content}\n</datalist>\n\n"
1425
- return f"<datalist>\n{content}\n</datalist>\n\n"
1244
+ # Datalist shows options as a list
1245
+ return _format_block_element(text)
1426
1246
 
1427
1247
 
1428
1248
  def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
@@ -1438,7 +1258,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1438
1258
  if not text.strip():
1439
1259
  return ""
1440
1260
 
1441
- # Ruby elements are always inline by nature
1442
1261
  return text.strip()
1443
1262
 
1444
1263
 
@@ -1455,7 +1274,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1455
1274
  if not text.strip():
1456
1275
  return ""
1457
1276
 
1458
- # Ruby base is the main text, pass through as-is
1459
1277
  return text.strip()
1460
1278
 
1461
1279
 
@@ -1470,21 +1288,17 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
1470
1288
  Returns:
1471
1289
  The converted markdown text with pronunciation in parentheses.
1472
1290
  """
1473
- # Handle empty rt elements - still need parentheses
1474
1291
  content = text.strip()
1475
1292
 
1476
- # Check if this rt is surrounded by rp elements (fallback parentheses)
1477
1293
  prev_sibling = tag.previous_sibling
1478
1294
  next_sibling = tag.next_sibling
1479
1295
 
1480
- # If surrounded by rp elements, don't add extra parentheses
1481
1296
  has_rp_before = prev_sibling and getattr(prev_sibling, "name", None) == "rp"
1482
1297
  has_rp_after = next_sibling and getattr(next_sibling, "name", None) == "rp"
1483
1298
 
1484
1299
  if has_rp_before and has_rp_after:
1485
- # Already has rp parentheses, just return the text
1486
1300
  return content
1487
- # Ruby text (pronunciation) shown in parentheses as fallback
1301
+
1488
1302
  return f"({content})"
1489
1303
 
1490
1304
 
@@ -1501,7 +1315,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1501
1315
  if not text.strip():
1502
1316
  return ""
1503
1317
 
1504
- # Ruby parentheses preserved for fallback compatibility
1505
1318
  return text.strip()
1506
1319
 
1507
1320
 
@@ -1518,12 +1331,11 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
1518
1331
  if not text.strip():
1519
1332
  return ""
1520
1333
 
1521
- # Ruby text container, pass through content
1522
1334
  return text.strip()
1523
1335
 
1524
1336
 
1525
1337
  def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1526
- """Convert HTML dialog element preserving structure with attributes.
1338
+ """Convert HTML dialog element to semantic Markdown.
1527
1339
 
1528
1340
  Args:
1529
1341
  text: The text content of the dialog element.
@@ -1531,28 +1343,21 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1531
1343
  tag: The dialog tag element.
1532
1344
 
1533
1345
  Returns:
1534
- The converted markdown text preserving dialog structure.
1346
+ The converted markdown text (only content, no HTML tags).
1535
1347
  """
1348
+ _ = tag
1536
1349
  if convert_as_inline:
1537
1350
  return text
1538
1351
 
1539
1352
  if not text.strip():
1540
1353
  return ""
1541
1354
 
1542
- # Get dialog attributes for preservation
1543
- attrs = []
1544
- if tag.get("open") is not None:
1545
- attrs.append("open")
1546
- if tag.get("id"):
1547
- attrs.append(f'id="{tag.get("id")}"')
1548
-
1549
- attrs_str = " " + " ".join(attrs) if attrs else ""
1550
-
1551
- return f"<dialog{attrs_str}>\n{text.strip()}\n</dialog>\n\n"
1355
+ # Dialog is a semantic container, return its content
1356
+ return _format_block_element(text)
1552
1357
 
1553
1358
 
1554
1359
  def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1555
- """Convert HTML menu element preserving structure with attributes.
1360
+ """Convert HTML menu element to semantic Markdown.
1556
1361
 
1557
1362
  Args:
1558
1363
  text: The text content of the menu element.
@@ -1560,30 +1365,21 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1560
1365
  tag: The menu tag element.
1561
1366
 
1562
1367
  Returns:
1563
- The converted markdown text preserving menu structure.
1368
+ The converted markdown text (only content, no HTML tags).
1564
1369
  """
1370
+ _ = tag
1565
1371
  if convert_as_inline:
1566
1372
  return text
1567
1373
 
1568
1374
  if not text.strip():
1569
1375
  return ""
1570
1376
 
1571
- # Get menu attributes for preservation
1572
- attrs = []
1573
- if tag.get("type") and tag.get("type") != "list":
1574
- attrs.append(f'type="{tag.get("type")}"')
1575
- if tag.get("label"):
1576
- attrs.append(f'label="{tag.get("label")}"')
1577
- if tag.get("id"):
1578
- attrs.append(f'id="{tag.get("id")}"')
1579
-
1580
- attrs_str = " " + " ".join(attrs) if attrs else ""
1581
-
1582
- return f"<menu{attrs_str}>\n{text.strip()}\n</menu>\n\n"
1377
+ # Menu is converted as a list
1378
+ return _format_block_element(text)
1583
1379
 
1584
1380
 
1585
1381
  def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1586
- """Convert HTML figure element preserving semantic structure.
1382
+ """Convert HTML figure element to semantic Markdown.
1587
1383
 
1588
1384
  Args:
1589
1385
  text: The text content of the figure element.
@@ -1591,47 +1387,35 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1591
1387
  tag: The figure tag element.
1592
1388
 
1593
1389
  Returns:
1594
- The converted markdown text preserving figure structure.
1390
+ The converted markdown text (only content, no HTML tags).
1595
1391
  """
1392
+ _ = tag
1596
1393
  if not text.strip():
1597
1394
  return ""
1598
1395
 
1599
1396
  if convert_as_inline:
1600
1397
  return text
1601
1398
 
1602
- # Get figure attributes for preservation
1603
- attrs = []
1604
- if tag.get("id"):
1605
- attrs.append(f'id="{tag.get("id")}"')
1606
- if tag.get("class"):
1607
- # Handle class attribute which might be a list
1608
- class_val = tag.get("class")
1609
- if isinstance(class_val, list):
1610
- class_val = " ".join(class_val)
1611
- attrs.append(f'class="{class_val}"')
1612
-
1613
- attrs_str = " " + " ".join(attrs) if attrs else ""
1614
-
1615
- # Check if the figure contains only an image (common case)
1616
- # In that case, we might want to preserve the figure wrapper
1399
+ # Figure is a semantic container, return its content
1400
+ # Make sure there's proper spacing after the figure content
1617
1401
  content = text.strip()
1618
-
1619
- # If content already has proper spacing, don't add extra newlines
1620
- if content.endswith("\n\n"):
1621
- return f"<figure{attrs_str}>\n{content}</figure>\n\n"
1622
-
1623
- return f"<figure{attrs_str}>\n{content}\n</figure>\n\n"
1402
+ if content and not content.endswith("\n\n"):
1403
+ if content.endswith("\n"):
1404
+ content += "\n"
1405
+ else:
1406
+ content += "\n\n"
1407
+ return content
1624
1408
 
1625
1409
 
1626
1410
  def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1627
- """Convert HTML hgroup element preserving heading group semantics.
1411
+ """Convert HTML hgroup element to semantic Markdown.
1628
1412
 
1629
1413
  Args:
1630
1414
  text: The text content of the hgroup element.
1631
1415
  convert_as_inline: Whether to convert as inline content.
1632
1416
 
1633
1417
  Returns:
1634
- The converted markdown text preserving heading group structure.
1418
+ The converted markdown text (only content, no HTML tags).
1635
1419
  """
1636
1420
  if convert_as_inline:
1637
1421
  return text
@@ -1639,19 +1423,12 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
1639
1423
  if not text.strip():
1640
1424
  return ""
1641
1425
 
1642
- # Preserve the semantic grouping of headings
1643
- # Add a marker to indicate this is a grouped heading
1644
- content = text.strip()
1645
-
1646
- # Remove excessive newlines between headings in the group
1647
- # Headings in hgroup should be visually closer together
1648
- content = re.sub(r"\n{3,}", "\n\n", content)
1649
-
1650
- return f"<!-- heading group -->\n{content}\n<!-- end heading group -->\n\n"
1426
+ # Hgroup is a semantic container for headings, return its content
1427
+ return text
1651
1428
 
1652
1429
 
1653
1430
  def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1654
- """Convert HTML picture element with responsive image sources.
1431
+ """Convert HTML picture element to semantic Markdown.
1655
1432
 
1656
1433
  Args:
1657
1434
  text: The text content of the picture element.
@@ -1659,51 +1436,14 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1659
1436
  tag: The picture tag element.
1660
1437
 
1661
1438
  Returns:
1662
- The converted markdown text with picture information preserved.
1439
+ The converted markdown text (only the img element).
1663
1440
  """
1441
+ _ = tag, convert_as_inline
1664
1442
  if not text.strip():
1665
1443
  return ""
1666
1444
 
1667
- # Find all source elements
1668
- sources = tag.find_all("source")
1669
- img = tag.find("img")
1670
-
1671
- if not img:
1672
- # No img fallback, just return the text content
1673
- return text.strip()
1674
-
1675
- # Get the primary image markdown (already converted)
1676
- img_markdown = text.strip()
1677
-
1678
- # If there are no sources, just return the image
1679
- if not sources:
1680
- return img_markdown
1681
-
1682
- # Build a comment with source information for responsive images
1683
- source_info = []
1684
- for source in sources:
1685
- srcset = source.get("srcset")
1686
- media = source.get("media")
1687
- mime_type = source.get("type")
1688
-
1689
- if srcset:
1690
- info = f'srcset="{srcset}"'
1691
- if media:
1692
- info += f' media="{media}"'
1693
- if mime_type:
1694
- info += f' type="{mime_type}"'
1695
- source_info.append(info)
1696
-
1697
- if source_info and not convert_as_inline:
1698
- # Add picture source information as a comment
1699
- sources_comment = "<!-- picture sources:\n"
1700
- for info in source_info:
1701
- sources_comment += f" {info}\n"
1702
- sources_comment += "-->\n"
1703
- return f"{sources_comment}{img_markdown}"
1704
-
1705
- # In inline mode or no sources, just return the image
1706
- return img_markdown
1445
+ # Picture is a container for responsive images, only the img matters for Markdown
1446
+ return text.strip()
1707
1447
 
1708
1448
 
1709
1449
  def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
@@ -1718,23 +1458,17 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1718
1458
  The converted markdown text as an image reference.
1719
1459
  """
1720
1460
  if convert_as_inline:
1721
- # In inline mode, just return any text content
1722
1461
  return text.strip()
1723
1462
 
1724
- # Get SVG attributes
1725
1463
  title = tag.find("title")
1726
1464
  title_text = title.get_text().strip() if title else ""
1727
1465
 
1728
- # For inline SVG, we'll convert to a data URI
1729
- # First, we need to get the full SVG markup
1730
1466
  svg_markup = str(tag)
1731
1467
 
1732
- # Create a data URI
1733
1468
  svg_bytes = svg_markup.encode("utf-8")
1734
1469
  svg_base64 = base64.b64encode(svg_bytes).decode("utf-8")
1735
1470
  data_uri = f"data:image/svg+xml;base64,{svg_base64}"
1736
1471
 
1737
- # Use title as alt text, or "SVG Image" if no title
1738
1472
  alt_text = title_text or "SVG Image"
1739
1473
 
1740
1474
  return f"![{alt_text}]({data_uri})"
@@ -1754,17 +1488,13 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
1754
1488
  if not text.strip():
1755
1489
  return ""
1756
1490
 
1757
- # Check if it's display math vs inline math
1758
1491
  display = tag.get("display") == "block"
1759
1492
 
1760
- # For now, preserve the MathML as a comment with the text representation
1761
- # This allows systems that understand MathML to process it
1762
1493
  math_comment = f"<!-- MathML: {tag!s} -->"
1763
1494
 
1764
1495
  if convert_as_inline or not display:
1765
- # Inline math - just the text with comment
1766
1496
  return f"{math_comment}{text.strip()}"
1767
- # Display math - on its own line
1497
+
1768
1498
  return f"\n\n{math_comment}\n{text.strip()}\n\n"
1769
1499
 
1770
1500
 
@@ -1828,10 +1558,10 @@ def create_converters_map(
1828
1558
  "abbr": _wrapper(_convert_abbr),
1829
1559
  "article": _wrapper(_convert_semantic_block),
1830
1560
  "aside": _wrapper(_convert_semantic_block),
1831
- "audio": _wrapper(_convert_audio),
1561
+ "audio": _wrapper(_convert_media_element),
1832
1562
  "b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
1833
- "bdi": _wrapper(_create_inline_converter("")), # Bidirectional isolation - pass through
1834
- "bdo": _wrapper(_create_inline_converter("")), # Bidirectional override - pass through
1563
+ "bdi": _wrapper(_create_inline_converter("")),
1564
+ "bdo": _wrapper(_create_inline_converter("")),
1835
1565
  "blockquote": _wrapper(partial(_convert_blockquote)),
1836
1566
  "br": _wrapper(partial(_convert_br, newline_style=newline_style)),
1837
1567
  "button": _wrapper(_convert_button),
@@ -1845,13 +1575,13 @@ def create_converters_map(
1845
1575
  "dd": _wrapper(_convert_dd),
1846
1576
  "del": _wrapper(_create_inline_converter("~~")),
1847
1577
  "details": _wrapper(_convert_details),
1848
- "dfn": _wrapper(_create_inline_converter("*")), # Definition term - italic
1578
+ "dfn": _wrapper(_create_inline_converter("*")),
1849
1579
  "dialog": _wrapper(_convert_dialog),
1850
1580
  "dl": _wrapper(_convert_dl),
1851
1581
  "dt": _wrapper(_convert_dt),
1852
1582
  "em": _wrapper(_create_inline_converter(strong_em_symbol)),
1853
1583
  "fieldset": _wrapper(_convert_fieldset),
1854
- "figcaption": _wrapper(lambda text: f"\n\n{text}\n\n"),
1584
+ "figcaption": _wrapper(lambda text: f"\n\n*{text.strip()}*\n\n" if text.strip() else ""),
1855
1585
  "figure": _wrapper(_convert_figure),
1856
1586
  "footer": _wrapper(_convert_semantic_block),
1857
1587
  "form": _wrapper(_convert_form),
@@ -1868,7 +1598,7 @@ def create_converters_map(
1868
1598
  "iframe": _wrapper(_convert_iframe),
1869
1599
  "img": _wrapper(partial(_convert_img, keep_inline_images_in=keep_inline_images_in)),
1870
1600
  "input": _wrapper(_convert_input_enhanced),
1871
- "ins": _wrapper(_create_inline_converter("==")), # Inserted text - highlight style
1601
+ "ins": _wrapper(_create_inline_converter("==")),
1872
1602
  "kbd": _wrapper(_create_inline_converter("`")),
1873
1603
  "label": _wrapper(_convert_label),
1874
1604
  "legend": _wrapper(_convert_legend),
@@ -1905,7 +1635,7 @@ def create_converters_map(
1905
1635
  "script": _wrapper(lambda _: ""),
1906
1636
  "section": _wrapper(_convert_semantic_block),
1907
1637
  "select": _wrapper(_convert_select),
1908
- "small": _wrapper(_create_inline_converter("")), # Small text - pass through
1638
+ "small": _wrapper(_create_inline_converter("")),
1909
1639
  "strong": _wrapper(_create_inline_converter(strong_em_symbol * 2)),
1910
1640
  "style": _wrapper(lambda _: ""),
1911
1641
  "sub": _wrapper(_create_inline_converter(sub_symbol)),
@@ -1921,9 +1651,9 @@ def create_converters_map(
1921
1651
  "thead": _wrapper(_convert_thead),
1922
1652
  "time": _wrapper(_convert_time),
1923
1653
  "tr": _wrapper(_convert_tr),
1924
- "u": _wrapper(_create_inline_converter("")), # Underlined text - pass through (no Markdown equivalent)
1654
+ "u": _wrapper(_create_inline_converter("")),
1925
1655
  "ul": _wrapper(_convert_list),
1926
- "var": _wrapper(_create_inline_converter("*")), # Variable - italic
1927
- "video": _wrapper(_convert_video),
1656
+ "var": _wrapper(_create_inline_converter("*")),
1657
+ "video": _wrapper(_convert_media_element),
1928
1658
  "wbr": _wrapper(_convert_wbr),
1929
1659
  }