html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +52 -573
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -202
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.0.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -23,17 +23,14 @@ from html_to_markdown.utils import chomp, indent, underline
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
def _format_block_element(text: str) -> str:
|
|
26
|
-
"""Format text as a block element with trailing newlines."""
|
|
27
26
|
return f"{text.strip()}\n\n" if text.strip() else ""
|
|
28
27
|
|
|
29
28
|
|
|
30
29
|
def _format_inline_or_block(text: str, convert_as_inline: bool) -> str:
|
|
31
|
-
"""Format text as inline or block element based on context."""
|
|
32
30
|
return text.strip() if convert_as_inline else _format_block_element(text)
|
|
33
31
|
|
|
34
32
|
|
|
35
33
|
def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
|
|
36
|
-
"""Format text wrapped in markers as a block element."""
|
|
37
34
|
if not end_marker:
|
|
38
35
|
end_marker = start_marker
|
|
39
36
|
return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
|
|
@@ -63,6 +60,7 @@ SupportedElements = Literal[
|
|
|
63
60
|
"details",
|
|
64
61
|
"dfn",
|
|
65
62
|
"dialog",
|
|
63
|
+
"div",
|
|
66
64
|
"dl",
|
|
67
65
|
"dt",
|
|
68
66
|
"em",
|
|
@@ -145,15 +143,6 @@ T = TypeVar("T")
|
|
|
145
143
|
|
|
146
144
|
|
|
147
145
|
def _create_inline_converter(markup_prefix: str) -> Callable[[Tag, str], str]:
|
|
148
|
-
"""Create an inline converter for a markup pattern or tag.
|
|
149
|
-
|
|
150
|
-
Args:
|
|
151
|
-
markup_prefix: The markup prefix to insert.
|
|
152
|
-
|
|
153
|
-
Returns:
|
|
154
|
-
A function that can be used to convert HTML to Markdown.
|
|
155
|
-
"""
|
|
156
|
-
|
|
157
146
|
def implementation(*, tag: Tag, text: str) -> str:
|
|
158
147
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
159
148
|
|
|
@@ -200,7 +189,7 @@ def _convert_a(*, tag: Tag, text: str, autolinks: bool, default_title: bool) ->
|
|
|
200
189
|
return f"{prefix}[{text}]({href}{title_part}){suffix}" if href else text
|
|
201
190
|
|
|
202
191
|
|
|
203
|
-
def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
192
|
+
def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_indent_str: str) -> str:
|
|
204
193
|
if convert_as_inline:
|
|
205
194
|
return text
|
|
206
195
|
|
|
@@ -211,18 +200,16 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
211
200
|
|
|
212
201
|
cite_url = tag.get("cite")
|
|
213
202
|
|
|
214
|
-
# Check if this blockquote is inside a list item
|
|
215
203
|
if _has_ancestor(tag, "li"):
|
|
216
|
-
# Indent the blockquote by 4 spaces
|
|
217
204
|
lines = text.strip().split("\n")
|
|
218
|
-
indented_lines = [f"    > {line}" if line.strip() else "" for line in lines]
|
|
205
|
+
indented_lines = [f"{list_indent_str}> {line}" if line.strip() else "" for line in lines]
|
|
219
206
|
quote_text = "\n".join(indented_lines) + "\n\n"
|
|
220
207
|
else:
|
|
221
208
|
quote_text = f"\n{line_beginning_re.sub('> ', text.strip())}\n\n"
|
|
222
209
|
|
|
223
210
|
if cite_url:
|
|
224
211
|
if _has_ancestor(tag, "li"):
|
|
225
|
-
quote_text += f"    — <{cite_url}>\n\n"
|
|
212
|
+
quote_text += f"{list_indent_str}— <{cite_url}>\n\n"
|
|
226
213
|
else:
|
|
227
214
|
quote_text += f"— <{cite_url}>\n\n"
|
|
228
215
|
|
|
@@ -283,23 +270,19 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
283
270
|
return f""
|
|
284
271
|
|
|
285
272
|
|
|
286
|
-
def _convert_list(*, tag: Tag, text: str) -> str:
|
|
273
|
+
def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
|
|
287
274
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
288
275
|
|
|
289
276
|
before_paragraph = False
|
|
290
277
|
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
291
278
|
before_paragraph = True
|
|
292
279
|
|
|
293
|
-
# Check if this list is inside a list item
|
|
294
280
|
if _has_ancestor(tag, "li"):
|
|
295
|
-
# This is a nested list - needs indentation
|
|
296
|
-
# But we need to check if it's the first element after a paragraph
|
|
297
281
|
parent = tag.parent
|
|
298
282
|
while parent and parent.name != "li":
|
|
299
283
|
parent = parent.parent
|
|
300
284
|
|
|
301
285
|
if parent:
|
|
302
|
-
# Check if there's a paragraph before this list
|
|
303
286
|
prev_p = None
|
|
304
287
|
for child in parent.children:
|
|
305
288
|
if hasattr(child, "name"):
|
|
@@ -309,22 +292,33 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
309
292
|
prev_p = child
|
|
310
293
|
|
|
311
294
|
if prev_p:
|
|
312
|
-
# If there's a paragraph before, we need proper indentation
|
|
313
295
|
lines = text.strip().split("\n")
|
|
314
296
|
indented_lines = []
|
|
315
297
|
for line in lines:
|
|
316
298
|
if line.strip():
|
|
317
|
-
indented_lines.append(f"    {line}")
|
|
299
|
+
indented_lines.append(f"{list_indent_str}{line}")
|
|
318
300
|
else:
|
|
319
301
|
indented_lines.append("")
|
|
320
302
|
return "\n" + "\n".join(indented_lines) + "\n"
|
|
321
|
-
|
|
322
|
-
|
|
303
|
+
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
304
|
+
|
|
305
|
+
if tag.parent and tag.parent.name in {"ul", "ol"}:
|
|
306
|
+
lines = text.strip().split("\n")
|
|
307
|
+
indented_lines = []
|
|
308
|
+
for line in lines:
|
|
309
|
+
if line.strip():
|
|
310
|
+
indented_lines.append(f"{list_indent_str}{line}")
|
|
311
|
+
else:
|
|
312
|
+
indented_lines.append("")
|
|
313
|
+
result = "\n".join(indented_lines)
|
|
314
|
+
if not result.endswith("\n"):
|
|
315
|
+
result += "\n"
|
|
316
|
+
return result
|
|
323
317
|
|
|
324
318
|
return text + ("\n" if before_paragraph else "")
|
|
325
319
|
|
|
326
320
|
|
|
327
|
-
def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
321
|
+
def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
|
|
328
322
|
checkbox = tag.find("input", {"type": "checkbox"})
|
|
329
323
|
if checkbox and isinstance(checkbox, Tag):
|
|
330
324
|
checked = checkbox.get("checked") is not None
|
|
@@ -355,7 +349,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
355
349
|
|
|
356
350
|
bullet = bullets[depth % len(bullets)]
|
|
357
351
|
|
|
358
|
-
# Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
|
|
359
352
|
has_block_children = any(
|
|
360
353
|
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
361
354
|
for child in tag.children
|
|
@@ -363,29 +356,26 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
363
356
|
)
|
|
364
357
|
|
|
365
358
|
if has_block_children:
|
|
366
|
-
# Handle multi-paragraph list items
|
|
367
|
-
# Split by double newlines (paragraph separators)
|
|
368
359
|
paragraphs = text.strip().split("\n\n")
|
|
369
360
|
|
|
370
361
|
if paragraphs:
|
|
371
|
-
# First paragraph goes directly after the bullet
|
|
372
362
|
result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
|
|
373
363
|
|
|
374
|
-
# Subsequent paragraphs need to be indented and separated by blank lines
|
|
375
364
|
for para in paragraphs[1:]:
|
|
376
365
|
if para.strip():
|
|
377
|
-
# Add blank line before the paragraph
|
|
378
366
|
result_parts.append("\n")
|
|
379
|
-
|
|
380
|
-
|
|
367
|
+
result_parts.extend(
|
|
368
|
+
f"{list_indent_str}{line}\n" for line in para.strip().split("\n") if line.strip()
|
|
369
|
+
)
|
|
381
370
|
|
|
382
371
|
return "".join(result_parts)
|
|
383
372
|
|
|
384
|
-
# Simple case: no block elements, just inline content
|
|
385
373
|
return "{} {}\n".format(bullet, (text or "").strip())
|
|
386
374
|
|
|
387
375
|
|
|
388
|
-
def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag) -> str:
|
|
376
|
+
def _convert_p(
|
|
377
|
+
*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: int, tag: Tag, list_indent_str: str
|
|
378
|
+
) -> str:
|
|
389
379
|
if convert_as_inline:
|
|
390
380
|
return text
|
|
391
381
|
|
|
@@ -399,24 +389,19 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
|
|
|
399
389
|
|
|
400
390
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
401
391
|
|
|
402
|
-
# Check if this paragraph is inside a list item
|
|
403
392
|
if _has_ancestor(tag, "li"):
|
|
404
|
-
# Check if this is the first paragraph in the list item
|
|
405
393
|
parent = tag.parent
|
|
406
394
|
while parent and parent.name != "li":
|
|
407
395
|
parent = parent.parent
|
|
408
396
|
|
|
409
397
|
if parent:
|
|
410
|
-
# Get all direct children that are paragraphs
|
|
411
398
|
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
412
399
|
|
|
413
|
-
# If this is not the first paragraph, indent it
|
|
414
400
|
if p_children and tag != p_children[0]:
|
|
415
|
-
# Indent all lines by 4 spaces
|
|
416
401
|
indented_lines = []
|
|
417
402
|
for line in text.split("\n"):
|
|
418
403
|
if line.strip():
|
|
419
|
-
indented_lines.append(f"    {line}")
|
|
404
|
+
indented_lines.append(f"{list_indent_str}{line}")
|
|
420
405
|
else:
|
|
421
406
|
indented_lines.append("")
|
|
422
407
|
text = "\n".join(indented_lines)
|
|
@@ -425,16 +410,6 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
|
|
|
425
410
|
|
|
426
411
|
|
|
427
412
|
def _convert_mark(*, text: str, convert_as_inline: bool, highlight_style: str) -> str:
|
|
428
|
-
"""Convert HTML mark element to Markdown highlighting.
|
|
429
|
-
|
|
430
|
-
Args:
|
|
431
|
-
text: The text content of the mark element.
|
|
432
|
-
convert_as_inline: Whether to convert as inline content.
|
|
433
|
-
highlight_style: The style to use for highlighting ("double-equal", "html", "bold").
|
|
434
|
-
|
|
435
|
-
Returns:
|
|
436
|
-
The converted markdown text.
|
|
437
|
-
"""
|
|
438
413
|
if convert_as_inline:
|
|
439
414
|
return text
|
|
440
415
|
|
|
@@ -480,13 +455,11 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
480
455
|
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
481
456
|
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
482
457
|
|
|
483
|
-
# Simple rowspan handling: if previous row had cells with rowspan, add empty cells
|
|
484
458
|
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
485
459
|
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
486
460
|
rowspan_positions = []
|
|
487
461
|
col_pos = 0
|
|
488
462
|
|
|
489
|
-
# Check which cells in previous row have rowspan > 1
|
|
490
463
|
for prev_cell in prev_cells:
|
|
491
464
|
rowspan = 1
|
|
492
465
|
if (
|
|
@@ -497,10 +470,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
497
470
|
rowspan = int(prev_cell["rowspan"])
|
|
498
471
|
|
|
499
472
|
if rowspan > 1:
|
|
500
|
-
# This cell spans into current row
|
|
501
473
|
rowspan_positions.append(col_pos)
|
|
502
474
|
|
|
503
|
-
# Account for colspan
|
|
504
475
|
colspan = 1
|
|
505
476
|
if (
|
|
506
477
|
"colspan" in prev_cell.attrs
|
|
@@ -510,25 +481,22 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
510
481
|
colspan = int(prev_cell["colspan"])
|
|
511
482
|
col_pos += colspan
|
|
512
483
|
|
|
513
|
-
# If there are rowspan cells from previous row, add empty cells
|
|
514
484
|
if rowspan_positions:
|
|
515
|
-
|
|
516
|
-
|
|
485
|
+
converted_cells: list[str] = []
|
|
486
|
+
if text.strip():
|
|
487
|
+
parts = text.split("|")
|
|
488
|
+
converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
|
|
489
|
+
|
|
490
|
+
new_cells: list[str] = []
|
|
517
491
|
cell_index = 0
|
|
518
492
|
|
|
519
|
-
for pos in range(col_pos):
|
|
493
|
+
for pos in range(col_pos):
|
|
520
494
|
if pos in rowspan_positions:
|
|
521
|
-
# Add empty cell for rowspan
|
|
522
495
|
new_cells.append(" |")
|
|
523
|
-
elif cell_index < len(cells):
|
|
524
|
-
|
|
525
|
-
cell = cells[cell_index]
|
|
526
|
-
cell_text = cell.get_text().strip().replace("\n", " ")
|
|
527
|
-
colspan = _get_colspan(cell)
|
|
528
|
-
new_cells.append(f" {cell_text} |" * colspan)
|
|
496
|
+
elif cell_index < len(converted_cells):
|
|
497
|
+
new_cells.append(converted_cells[cell_index])
|
|
529
498
|
cell_index += 1
|
|
530
499
|
|
|
531
|
-
# Override text with new cell arrangement
|
|
532
500
|
text = "".join(new_cells)
|
|
533
501
|
|
|
534
502
|
is_headrow = (
|
|
@@ -563,15 +531,6 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
563
531
|
|
|
564
532
|
|
|
565
533
|
def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
|
|
566
|
-
"""Convert HTML caption element to emphasized text.
|
|
567
|
-
|
|
568
|
-
Args:
|
|
569
|
-
text: The text content of the caption element.
|
|
570
|
-
convert_as_inline: Whether to convert as inline content.
|
|
571
|
-
|
|
572
|
-
Returns:
|
|
573
|
-
The converted markdown text with caption formatting.
|
|
574
|
-
"""
|
|
575
534
|
if convert_as_inline:
|
|
576
535
|
return text
|
|
577
536
|
|
|
@@ -582,15 +541,6 @@ def _convert_caption(*, text: str, convert_as_inline: bool) -> str:
|
|
|
582
541
|
|
|
583
542
|
|
|
584
543
|
def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
|
|
585
|
-
"""Convert HTML thead element preserving table structure.
|
|
586
|
-
|
|
587
|
-
Args:
|
|
588
|
-
text: The text content of the thead element.
|
|
589
|
-
convert_as_inline: Whether to convert as inline content.
|
|
590
|
-
|
|
591
|
-
Returns:
|
|
592
|
-
The converted markdown text preserving table structure.
|
|
593
|
-
"""
|
|
594
544
|
if convert_as_inline:
|
|
595
545
|
return text
|
|
596
546
|
|
|
@@ -598,15 +548,6 @@ def _convert_thead(*, text: str, convert_as_inline: bool) -> str:
|
|
|
598
548
|
|
|
599
549
|
|
|
600
550
|
def _convert_tbody(*, text: str, convert_as_inline: bool) -> str:
|
|
601
|
-
"""Convert HTML tbody element preserving table structure.
|
|
602
|
-
|
|
603
|
-
Args:
|
|
604
|
-
text: The text content of the tbody element.
|
|
605
|
-
convert_as_inline: Whether to convert as inline content.
|
|
606
|
-
|
|
607
|
-
Returns:
|
|
608
|
-
The converted markdown text preserving table structure.
|
|
609
|
-
"""
|
|
610
551
|
if convert_as_inline:
|
|
611
552
|
return text
|
|
612
553
|
|
|
@@ -614,15 +555,6 @@ def _convert_tbody(*, text: str, convert_as_inline: bool) -> str:
|
|
|
614
555
|
|
|
615
556
|
|
|
616
557
|
def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
|
|
617
|
-
"""Convert HTML tfoot element preserving table structure.
|
|
618
|
-
|
|
619
|
-
Args:
|
|
620
|
-
text: The text content of the tfoot element.
|
|
621
|
-
convert_as_inline: Whether to convert as inline content.
|
|
622
|
-
|
|
623
|
-
Returns:
|
|
624
|
-
The converted markdown text preserving table structure.
|
|
625
|
-
"""
|
|
626
558
|
if convert_as_inline:
|
|
627
559
|
return text
|
|
628
560
|
|
|
@@ -630,103 +562,41 @@ def _convert_tfoot(*, text: str, convert_as_inline: bool) -> str:
|
|
|
630
562
|
|
|
631
563
|
|
|
632
564
|
def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
633
|
-
"""Convert HTML colgroup element - removes it entirely from Markdown output.
|
|
634
|
-
|
|
635
|
-
Colgroup is a table column grouping element that defines styling for columns.
|
|
636
|
-
It has no representation in Markdown and should be removed.
|
|
637
|
-
|
|
638
|
-
Args:
|
|
639
|
-
tag: The colgroup tag element.
|
|
640
|
-
text: The text content of the colgroup element.
|
|
641
|
-
convert_as_inline: Whether to convert as inline content.
|
|
642
|
-
|
|
643
|
-
Returns:
|
|
644
|
-
Empty string as colgroup has no Markdown representation.
|
|
645
|
-
"""
|
|
646
565
|
_ = tag, text, convert_as_inline
|
|
647
|
-
# Colgroup and its contents (col elements) are purely presentational
|
|
648
|
-
# and have no equivalent in Markdown tables
|
|
649
566
|
return ""
|
|
650
567
|
|
|
651
568
|
|
|
652
569
|
def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
653
|
-
"""Convert HTML col element - removes it entirely from Markdown output.
|
|
654
|
-
|
|
655
|
-
Col elements define column properties (width, style) in HTML tables.
|
|
656
|
-
They have no representation in Markdown and should be removed.
|
|
657
|
-
|
|
658
|
-
Args:
|
|
659
|
-
tag: The col tag element.
|
|
660
|
-
convert_as_inline: Whether to convert as inline content.
|
|
661
|
-
|
|
662
|
-
Returns:
|
|
663
|
-
Empty string as col has no Markdown representation.
|
|
664
|
-
"""
|
|
665
570
|
_ = tag, convert_as_inline
|
|
666
|
-
# Col elements are self-closing and purely presentational
|
|
667
571
|
return ""
|
|
668
572
|
|
|
669
573
|
|
|
670
574
|
def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
671
|
-
"""Convert HTML5 semantic elements to block-level Markdown.
|
|
672
|
-
|
|
673
|
-
Args:
|
|
674
|
-
text: The text content of the semantic element.
|
|
675
|
-
convert_as_inline: Whether to convert as inline content.
|
|
676
|
-
|
|
677
|
-
Returns:
|
|
678
|
-
The converted markdown text with proper block spacing.
|
|
679
|
-
"""
|
|
680
575
|
if convert_as_inline:
|
|
681
576
|
return text
|
|
682
577
|
|
|
683
578
|
return f"{text}\n\n" if text.strip() else ""
|
|
684
579
|
|
|
685
580
|
|
|
686
|
-
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
687
|
-
|
|
581
|
+
def _convert_div(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
582
|
+
return text
|
|
688
583
|
|
|
689
|
-
Args:
|
|
690
|
-
text: The text content of the details element.
|
|
691
|
-
convert_as_inline: Whether to convert as inline content.
|
|
692
584
|
|
|
693
|
-
|
|
694
|
-
The converted markdown text (only content, no HTML tags).
|
|
695
|
-
"""
|
|
585
|
+
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
696
586
|
if convert_as_inline:
|
|
697
587
|
return text
|
|
698
588
|
|
|
699
|
-
# Details is a semantic container, return its content
|
|
700
589
|
return _format_block_element(text)
|
|
701
590
|
|
|
702
591
|
|
|
703
592
|
def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
|
|
704
|
-
"""Convert HTML summary element to emphasized text.
|
|
705
|
-
|
|
706
|
-
Args:
|
|
707
|
-
text: The text content of the summary element.
|
|
708
|
-
convert_as_inline: Whether to convert as inline content.
|
|
709
|
-
|
|
710
|
-
Returns:
|
|
711
|
-
The converted markdown text as bold heading.
|
|
712
|
-
"""
|
|
713
593
|
if convert_as_inline:
|
|
714
594
|
return text
|
|
715
595
|
|
|
716
|
-
# Summary is like a heading/title
|
|
717
596
|
return _format_wrapped_block(text, "**")
|
|
718
597
|
|
|
719
598
|
|
|
720
599
|
def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
721
|
-
"""Convert HTML definition list element.
|
|
722
|
-
|
|
723
|
-
Args:
|
|
724
|
-
text: The text content of the definition list.
|
|
725
|
-
convert_as_inline: Whether to convert as inline content.
|
|
726
|
-
|
|
727
|
-
Returns:
|
|
728
|
-
The converted markdown text with proper spacing.
|
|
729
|
-
"""
|
|
730
600
|
if convert_as_inline:
|
|
731
601
|
return text
|
|
732
602
|
|
|
@@ -734,15 +604,6 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
|
734
604
|
|
|
735
605
|
|
|
736
606
|
def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
737
|
-
"""Convert HTML definition term element.
|
|
738
|
-
|
|
739
|
-
Args:
|
|
740
|
-
text: The text content of the definition term.
|
|
741
|
-
convert_as_inline: Whether to convert as inline content.
|
|
742
|
-
|
|
743
|
-
Returns:
|
|
744
|
-
The converted markdown text as a definition term.
|
|
745
|
-
"""
|
|
746
607
|
if convert_as_inline:
|
|
747
608
|
return text
|
|
748
609
|
|
|
@@ -753,15 +614,6 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
|
753
614
|
|
|
754
615
|
|
|
755
616
|
def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
756
|
-
"""Convert HTML definition description element.
|
|
757
|
-
|
|
758
|
-
Args:
|
|
759
|
-
text: The text content of the definition description.
|
|
760
|
-
convert_as_inline: Whether to convert as inline content.
|
|
761
|
-
|
|
762
|
-
Returns:
|
|
763
|
-
The converted markdown text as a definition description.
|
|
764
|
-
"""
|
|
765
617
|
if convert_as_inline:
|
|
766
618
|
return text
|
|
767
619
|
|
|
@@ -772,15 +624,6 @@ def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
|
772
624
|
|
|
773
625
|
|
|
774
626
|
def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
775
|
-
"""Convert HTML cite element to italic text.
|
|
776
|
-
|
|
777
|
-
Args:
|
|
778
|
-
text: The text content of the cite element.
|
|
779
|
-
convert_as_inline: Whether to convert as inline content.
|
|
780
|
-
|
|
781
|
-
Returns:
|
|
782
|
-
The converted markdown text in italic format.
|
|
783
|
-
"""
|
|
784
627
|
if convert_as_inline:
|
|
785
628
|
return text
|
|
786
629
|
|
|
@@ -791,15 +634,6 @@ def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
|
791
634
|
|
|
792
635
|
|
|
793
636
|
def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
794
|
-
"""Convert HTML q element to quoted text.
|
|
795
|
-
|
|
796
|
-
Args:
|
|
797
|
-
text: The text content of the q element.
|
|
798
|
-
convert_as_inline: Whether to convert as inline content.
|
|
799
|
-
|
|
800
|
-
Returns:
|
|
801
|
-
The converted markdown text with quotes.
|
|
802
|
-
"""
|
|
803
637
|
if convert_as_inline:
|
|
804
638
|
return text
|
|
805
639
|
|
|
@@ -811,33 +645,20 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
811
645
|
|
|
812
646
|
|
|
813
647
|
def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
814
|
-
"""Convert HTML media elements (audio/video) to semantic Markdown.
|
|
815
|
-
|
|
816
|
-
Args:
|
|
817
|
-
tag: The media tag element.
|
|
818
|
-
text: The text content of the media element (fallback content).
|
|
819
|
-
convert_as_inline: Whether to convert as inline content.
|
|
820
|
-
|
|
821
|
-
Returns:
|
|
822
|
-
The converted markdown text (link if src exists, otherwise fallback content).
|
|
823
|
-
"""
|
|
824
648
|
src = tag.get("src", "")
|
|
825
649
|
|
|
826
650
|
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
827
651
|
src = source_tag.get("src", "")
|
|
828
652
|
|
|
829
|
-
# If we have a src, convert to a link
|
|
830
653
|
if src and isinstance(src, str) and src.strip():
|
|
831
654
|
link = f"[{src}]({src})"
|
|
832
655
|
if convert_as_inline:
|
|
833
656
|
return link
|
|
834
657
|
result = f"{link}\n\n"
|
|
835
|
-
# Add fallback content if present
|
|
836
658
|
if text.strip():
|
|
837
659
|
result += f"{text.strip()}\n\n"
|
|
838
660
|
return result
|
|
839
661
|
|
|
840
|
-
# No src, just return fallback content
|
|
841
662
|
if text.strip():
|
|
842
663
|
return _format_inline_or_block(text, convert_as_inline)
|
|
843
664
|
|
|
@@ -845,20 +666,9 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
|
|
|
845
666
|
|
|
846
667
|
|
|
847
668
|
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
848
|
-
"""Convert HTML iframe element to semantic Markdown.
|
|
849
|
-
|
|
850
|
-
Args:
|
|
851
|
-
tag: The iframe tag element.
|
|
852
|
-
text: The text content of the iframe element (usually empty).
|
|
853
|
-
convert_as_inline: Whether to convert as inline content.
|
|
854
|
-
|
|
855
|
-
Returns:
|
|
856
|
-
The converted markdown text (link if src exists).
|
|
857
|
-
"""
|
|
858
669
|
_ = text
|
|
859
670
|
src = tag.get("src", "")
|
|
860
671
|
|
|
861
|
-
# If we have a src, convert to a link
|
|
862
672
|
if src and isinstance(src, str) and src.strip():
|
|
863
673
|
link = f"[{src}]({src})"
|
|
864
674
|
if convert_as_inline:
|
|
@@ -869,16 +679,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
869
679
|
|
|
870
680
|
|
|
871
681
|
def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
872
|
-
"""Convert HTML abbr element to text with optional title.
|
|
873
|
-
|
|
874
|
-
Args:
|
|
875
|
-
tag: The abbr tag element.
|
|
876
|
-
text: The text content of the abbr element.
|
|
877
|
-
convert_as_inline: Whether to convert as inline content.
|
|
878
|
-
|
|
879
|
-
Returns:
|
|
880
|
-
The converted markdown text with optional title annotation.
|
|
881
|
-
"""
|
|
882
682
|
_ = convert_as_inline
|
|
883
683
|
if not text.strip():
|
|
884
684
|
return ""
|
|
@@ -891,69 +691,29 @@ def _convert_abbr(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
891
691
|
|
|
892
692
|
|
|
893
693
|
def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
894
|
-
"""Convert HTML time element to semantic Markdown.
|
|
895
|
-
|
|
896
|
-
Args:
|
|
897
|
-
tag: The time tag element.
|
|
898
|
-
text: The text content of the time element.
|
|
899
|
-
convert_as_inline: Whether to convert as inline content.
|
|
900
|
-
|
|
901
|
-
Returns:
|
|
902
|
-
The converted markdown text (content only, no HTML tags).
|
|
903
|
-
"""
|
|
904
694
|
_ = tag
|
|
905
695
|
_ = convert_as_inline
|
|
906
696
|
if not text.strip():
|
|
907
697
|
return ""
|
|
908
698
|
|
|
909
|
-
# Time elements are semantic - just return the content
|
|
910
699
|
return text.strip()
|
|
911
700
|
|
|
912
701
|
|
|
913
702
|
def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
914
|
-
"""Convert HTML data element to semantic Markdown.
|
|
915
|
-
|
|
916
|
-
Args:
|
|
917
|
-
tag: The data tag element.
|
|
918
|
-
text: The text content of the data element.
|
|
919
|
-
convert_as_inline: Whether to convert as inline content.
|
|
920
|
-
|
|
921
|
-
Returns:
|
|
922
|
-
The converted markdown text (content only, no HTML tags).
|
|
923
|
-
"""
|
|
924
703
|
_ = tag
|
|
925
704
|
_ = convert_as_inline
|
|
926
705
|
if not text.strip():
|
|
927
706
|
return ""
|
|
928
707
|
|
|
929
|
-
# Data elements are semantic - just return the content
|
|
930
708
|
return text.strip()
|
|
931
709
|
|
|
932
710
|
|
|
933
711
|
def _convert_wbr(*, convert_as_inline: bool) -> str:
|
|
934
|
-
"""Convert HTML wbr (word break opportunity) element.
|
|
935
|
-
|
|
936
|
-
Args:
|
|
937
|
-
convert_as_inline: Whether to convert as inline content.
|
|
938
|
-
|
|
939
|
-
Returns:
|
|
940
|
-
Empty string as wbr is just a break opportunity.
|
|
941
|
-
"""
|
|
942
712
|
_ = convert_as_inline
|
|
943
713
|
return ""
|
|
944
714
|
|
|
945
715
|
|
|
946
716
|
def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
947
|
-
"""Convert HTML form element to semantic Markdown.
|
|
948
|
-
|
|
949
|
-
Args:
|
|
950
|
-
tag: The form tag element.
|
|
951
|
-
text: The text content of the form element.
|
|
952
|
-
convert_as_inline: Whether to convert as inline content.
|
|
953
|
-
|
|
954
|
-
Returns:
|
|
955
|
-
The converted markdown text (only content, no HTML tags).
|
|
956
|
-
"""
|
|
957
717
|
_ = tag
|
|
958
718
|
if convert_as_inline:
|
|
959
719
|
return text
|
|
@@ -961,63 +721,31 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
961
721
|
if not text.strip():
|
|
962
722
|
return ""
|
|
963
723
|
|
|
964
|
-
# Forms are just containers, return their content
|
|
965
724
|
return text
|
|
966
725
|
|
|
967
726
|
|
|
968
727
|
def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
|
|
969
|
-
"""Convert HTML fieldset element to semantic Markdown.
|
|
970
|
-
|
|
971
|
-
Args:
|
|
972
|
-
text: The text content of the fieldset element.
|
|
973
|
-
convert_as_inline: Whether to convert as inline content.
|
|
974
|
-
|
|
975
|
-
Returns:
|
|
976
|
-
The converted markdown text (only content, no HTML tags).
|
|
977
|
-
"""
|
|
978
728
|
if convert_as_inline:
|
|
979
729
|
return text
|
|
980
730
|
|
|
981
731
|
if not text.strip():
|
|
982
732
|
return ""
|
|
983
733
|
|
|
984
|
-
# Fieldsets are semantic groupings, return their content
|
|
985
734
|
return text
|
|
986
735
|
|
|
987
736
|
|
|
988
737
|
def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
|
|
989
|
-
"""Convert HTML legend element to emphasized text.
|
|
990
|
-
|
|
991
|
-
Args:
|
|
992
|
-
text: The text content of the legend element.
|
|
993
|
-
convert_as_inline: Whether to convert as inline content.
|
|
994
|
-
|
|
995
|
-
Returns:
|
|
996
|
-
The converted markdown text as emphasized legend.
|
|
997
|
-
"""
|
|
998
738
|
if convert_as_inline:
|
|
999
739
|
return text
|
|
1000
740
|
|
|
1001
741
|
if not text.strip():
|
|
1002
742
|
return ""
|
|
1003
743
|
|
|
1004
|
-
# Legend is like a heading/title for fieldsets
|
|
1005
744
|
return _format_wrapped_block(text, "**")
|
|
1006
745
|
|
|
1007
746
|
|
|
1008
747
|
def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1009
|
-
"""Convert HTML label element to Markdown.
|
|
1010
|
-
|
|
1011
|
-
Args:
|
|
1012
|
-
tag: The label tag element.
|
|
1013
|
-
text: The text content of the label element.
|
|
1014
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1015
|
-
|
|
1016
|
-
Returns:
|
|
1017
|
-
The label text content.
|
|
1018
|
-
"""
|
|
1019
748
|
_ = tag
|
|
1020
|
-
# Labels are just text, return the content
|
|
1021
749
|
if not text.strip():
|
|
1022
750
|
return ""
|
|
1023
751
|
|
|
@@ -1025,33 +753,12 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1025
753
|
|
|
1026
754
|
|
|
1027
755
|
def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
1028
|
-
"""Convert HTML input element to Markdown.
|
|
1029
|
-
|
|
1030
|
-
Args:
|
|
1031
|
-
tag: The input tag element.
|
|
1032
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1033
|
-
|
|
1034
|
-
Returns:
|
|
1035
|
-
Empty string since input elements have no Markdown representation.
|
|
1036
|
-
"""
|
|
1037
756
|
_ = tag, convert_as_inline
|
|
1038
|
-
# Input elements have no content and no Markdown equivalent
|
|
1039
757
|
return ""
|
|
1040
758
|
|
|
1041
759
|
|
|
1042
760
|
def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1043
|
-
"""Convert HTML textarea element to Markdown.
|
|
1044
|
-
|
|
1045
|
-
Args:
|
|
1046
|
-
tag: The textarea tag element.
|
|
1047
|
-
text: The text content of the textarea element.
|
|
1048
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1049
|
-
|
|
1050
|
-
Returns:
|
|
1051
|
-
The text content of the textarea.
|
|
1052
|
-
"""
|
|
1053
761
|
_ = tag
|
|
1054
|
-
# Return the text content, which is what the user entered
|
|
1055
762
|
if not text.strip():
|
|
1056
763
|
return ""
|
|
1057
764
|
|
|
@@ -1059,69 +766,33 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1059
766
|
|
|
1060
767
|
|
|
1061
768
|
def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1062
|
-
"""Convert HTML select element to Markdown.
|
|
1063
|
-
|
|
1064
|
-
Args:
|
|
1065
|
-
tag: The select tag element.
|
|
1066
|
-
text: The text content of the select element.
|
|
1067
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1068
|
-
|
|
1069
|
-
Returns:
|
|
1070
|
-
The text content (options) as a comma-separated list.
|
|
1071
|
-
"""
|
|
1072
769
|
_ = tag
|
|
1073
|
-
# Return the options as text
|
|
1074
770
|
if not text.strip():
|
|
1075
771
|
return ""
|
|
1076
772
|
|
|
1077
|
-
# In inline mode, show options separated by commas
|
|
1078
773
|
if convert_as_inline:
|
|
1079
|
-
# Remove extra whitespace and join options
|
|
1080
774
|
options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
|
|
1081
775
|
return ", ".join(options)
|
|
1082
776
|
|
|
1083
|
-
# In block mode, show as a list
|
|
1084
777
|
return _format_block_element(text)
|
|
1085
778
|
|
|
1086
779
|
|
|
1087
780
|
def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1088
|
-
"""Convert HTML option element to Markdown.
|
|
1089
|
-
|
|
1090
|
-
Args:
|
|
1091
|
-
tag: The option tag element.
|
|
1092
|
-
text: The text content of the option element.
|
|
1093
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1094
|
-
|
|
1095
|
-
Returns:
|
|
1096
|
-
The option text, potentially with a marker if selected.
|
|
1097
|
-
"""
|
|
1098
781
|
if not text.strip():
|
|
1099
782
|
return ""
|
|
1100
783
|
|
|
1101
|
-
# Check if this option is selected
|
|
1102
784
|
selected = tag.get("selected") is not None
|
|
1103
785
|
content = text.strip()
|
|
1104
786
|
|
|
1105
787
|
if convert_as_inline:
|
|
1106
788
|
return content
|
|
1107
789
|
|
|
1108
|
-
# In block mode, mark selected options
|
|
1109
790
|
if selected:
|
|
1110
791
|
return f"* {content}\n"
|
|
1111
792
|
return f"{content}\n"
|
|
1112
793
|
|
|
1113
794
|
|
|
1114
795
|
def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1115
|
-
"""Convert HTML optgroup element to semantic Markdown.
|
|
1116
|
-
|
|
1117
|
-
Args:
|
|
1118
|
-
tag: The optgroup tag element.
|
|
1119
|
-
text: The text content of the optgroup element.
|
|
1120
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1121
|
-
|
|
1122
|
-
Returns:
|
|
1123
|
-
The converted markdown text with label as heading.
|
|
1124
|
-
"""
|
|
1125
796
|
if convert_as_inline:
|
|
1126
797
|
return text
|
|
1127
798
|
|
|
@@ -1131,7 +802,6 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1131
802
|
label = tag.get("label", "")
|
|
1132
803
|
content = text.strip()
|
|
1133
804
|
|
|
1134
|
-
# If there's a label, show it as a heading
|
|
1135
805
|
if label and isinstance(label, str) and label.strip():
|
|
1136
806
|
return f"**{label.strip()}**\n{content}\n"
|
|
1137
807
|
|
|
@@ -1139,18 +809,7 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1139
809
|
|
|
1140
810
|
|
|
1141
811
|
def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1142
|
-
"""Convert HTML button element to Markdown.
|
|
1143
|
-
|
|
1144
|
-
Args:
|
|
1145
|
-
tag: The button tag element.
|
|
1146
|
-
text: The text content of the button element.
|
|
1147
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1148
|
-
|
|
1149
|
-
Returns:
|
|
1150
|
-
The button text content.
|
|
1151
|
-
"""
|
|
1152
812
|
_ = tag
|
|
1153
|
-
# Buttons are just interactive text, return the text content
|
|
1154
813
|
if not text.strip():
|
|
1155
814
|
return ""
|
|
1156
815
|
|
|
@@ -1158,16 +817,6 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1158
817
|
|
|
1159
818
|
|
|
1160
819
|
def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1161
|
-
"""Convert HTML progress element to semantic text.
|
|
1162
|
-
|
|
1163
|
-
Args:
|
|
1164
|
-
tag: The progress tag element.
|
|
1165
|
-
text: The text content of the progress element.
|
|
1166
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1167
|
-
|
|
1168
|
-
Returns:
|
|
1169
|
-
The converted markdown text (only content, no HTML tags).
|
|
1170
|
-
"""
|
|
1171
820
|
_ = tag
|
|
1172
821
|
if convert_as_inline:
|
|
1173
822
|
return text
|
|
@@ -1175,21 +824,10 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1175
824
|
if not text.strip():
|
|
1176
825
|
return ""
|
|
1177
826
|
|
|
1178
|
-
# Progress elements convert to their text content
|
|
1179
827
|
return _format_block_element(text)
|
|
1180
828
|
|
|
1181
829
|
|
|
1182
830
|
def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1183
|
-
"""Convert HTML meter element to semantic text.
|
|
1184
|
-
|
|
1185
|
-
Args:
|
|
1186
|
-
tag: The meter tag element.
|
|
1187
|
-
text: The text content of the meter element.
|
|
1188
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1189
|
-
|
|
1190
|
-
Returns:
|
|
1191
|
-
The converted markdown text (only content, no HTML tags).
|
|
1192
|
-
"""
|
|
1193
831
|
_ = tag
|
|
1194
832
|
if convert_as_inline:
|
|
1195
833
|
return text
|
|
@@ -1197,21 +835,10 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1197
835
|
if not text.strip():
|
|
1198
836
|
return ""
|
|
1199
837
|
|
|
1200
|
-
# Meter elements convert to their text content
|
|
1201
838
|
return _format_block_element(text)
|
|
1202
839
|
|
|
1203
840
|
|
|
1204
841
|
def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1205
|
-
"""Convert HTML output element to semantic text.
|
|
1206
|
-
|
|
1207
|
-
Args:
|
|
1208
|
-
tag: The output tag element.
|
|
1209
|
-
text: The text content of the output element.
|
|
1210
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1211
|
-
|
|
1212
|
-
Returns:
|
|
1213
|
-
The converted markdown text (only content, no HTML tags).
|
|
1214
|
-
"""
|
|
1215
842
|
_ = tag
|
|
1216
843
|
if convert_as_inline:
|
|
1217
844
|
return text
|
|
@@ -1219,21 +846,10 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1219
846
|
if not text.strip():
|
|
1220
847
|
return ""
|
|
1221
848
|
|
|
1222
|
-
# Output elements convert to their text content
|
|
1223
849
|
return _format_block_element(text)
|
|
1224
850
|
|
|
1225
851
|
|
|
1226
852
|
def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
1227
|
-
"""Convert HTML datalist element to semantic Markdown.
|
|
1228
|
-
|
|
1229
|
-
Args:
|
|
1230
|
-
tag: The datalist tag element.
|
|
1231
|
-
text: The text content of the datalist element.
|
|
1232
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1233
|
-
|
|
1234
|
-
Returns:
|
|
1235
|
-
The converted markdown text (only content, no HTML tags).
|
|
1236
|
-
"""
|
|
1237
853
|
_ = tag
|
|
1238
854
|
if convert_as_inline:
|
|
1239
855
|
return text
|
|
@@ -1241,20 +857,10 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1241
857
|
if not text.strip():
|
|
1242
858
|
return ""
|
|
1243
859
|
|
|
1244
|
-
# Datalist shows options as a list
|
|
1245
860
|
return _format_block_element(text)
|
|
1246
861
|
|
|
1247
862
|
|
|
1248
863
|
def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
1249
|
-
"""Convert HTML ruby element providing pronunciation annotation.
|
|
1250
|
-
|
|
1251
|
-
Args:
|
|
1252
|
-
text: The text content of the ruby element.
|
|
1253
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1254
|
-
|
|
1255
|
-
Returns:
|
|
1256
|
-
The converted markdown text with ruby annotation as fallback text.
|
|
1257
|
-
"""
|
|
1258
864
|
if not text.strip():
|
|
1259
865
|
return ""
|
|
1260
866
|
|
|
@@ -1262,15 +868,6 @@ def _convert_ruby(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1262
868
|
|
|
1263
869
|
|
|
1264
870
|
def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
1265
|
-
"""Convert HTML rb (ruby base) element.
|
|
1266
|
-
|
|
1267
|
-
Args:
|
|
1268
|
-
text: The text content of the rb element.
|
|
1269
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1270
|
-
|
|
1271
|
-
Returns:
|
|
1272
|
-
The converted markdown text (ruby base text).
|
|
1273
|
-
"""
|
|
1274
871
|
if not text.strip():
|
|
1275
872
|
return ""
|
|
1276
873
|
|
|
@@ -1278,16 +875,6 @@ def _convert_rb(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1278
875
|
|
|
1279
876
|
|
|
1280
877
|
def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa: ARG001
|
|
1281
|
-
"""Convert HTML rt (ruby text) element for pronunciation.
|
|
1282
|
-
|
|
1283
|
-
Args:
|
|
1284
|
-
text: The text content of the rt element.
|
|
1285
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1286
|
-
tag: The rt tag element.
|
|
1287
|
-
|
|
1288
|
-
Returns:
|
|
1289
|
-
The converted markdown text with pronunciation in parentheses.
|
|
1290
|
-
"""
|
|
1291
878
|
content = text.strip()
|
|
1292
879
|
|
|
1293
880
|
prev_sibling = tag.previous_sibling
|
|
@@ -1303,15 +890,6 @@ def _convert_rt(*, text: str, convert_as_inline: bool, tag: Tag) -> str: # noqa
|
|
|
1303
890
|
|
|
1304
891
|
|
|
1305
892
|
def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
1306
|
-
"""Convert HTML rp (ruby parentheses) element for fallback.
|
|
1307
|
-
|
|
1308
|
-
Args:
|
|
1309
|
-
text: The text content of the rp element.
|
|
1310
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1311
|
-
|
|
1312
|
-
Returns:
|
|
1313
|
-
The converted markdown text (parentheses for ruby fallback).
|
|
1314
|
-
"""
|
|
1315
893
|
if not text.strip():
|
|
1316
894
|
return ""
|
|
1317
895
|
|
|
@@ -1319,15 +897,6 @@ def _convert_rp(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1319
897
|
|
|
1320
898
|
|
|
1321
899
|
def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
1322
|
-
"""Convert HTML rtc (ruby text container) element.
|
|
1323
|
-
|
|
1324
|
-
Args:
|
|
1325
|
-
text: The text content of the rtc element.
|
|
1326
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1327
|
-
|
|
1328
|
-
Returns:
|
|
1329
|
-
The converted markdown text (ruby text container).
|
|
1330
|
-
"""
|
|
1331
900
|
if not text.strip():
|
|
1332
901
|
return ""
|
|
1333
902
|
|
|
@@ -1335,16 +904,6 @@ def _convert_rtc(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
|
|
|
1335
904
|
|
|
1336
905
|
|
|
1337
906
|
def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1338
|
-
"""Convert HTML dialog element to semantic Markdown.
|
|
1339
|
-
|
|
1340
|
-
Args:
|
|
1341
|
-
text: The text content of the dialog element.
|
|
1342
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1343
|
-
tag: The dialog tag element.
|
|
1344
|
-
|
|
1345
|
-
Returns:
|
|
1346
|
-
The converted markdown text (only content, no HTML tags).
|
|
1347
|
-
"""
|
|
1348
907
|
_ = tag
|
|
1349
908
|
if convert_as_inline:
|
|
1350
909
|
return text
|
|
@@ -1352,21 +911,10 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1352
911
|
if not text.strip():
|
|
1353
912
|
return ""
|
|
1354
913
|
|
|
1355
|
-
# Dialog is a semantic container, return its content
|
|
1356
914
|
return _format_block_element(text)
|
|
1357
915
|
|
|
1358
916
|
|
|
1359
917
|
def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1360
|
-
"""Convert HTML menu element to semantic Markdown.
|
|
1361
|
-
|
|
1362
|
-
Args:
|
|
1363
|
-
text: The text content of the menu element.
|
|
1364
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1365
|
-
tag: The menu tag element.
|
|
1366
|
-
|
|
1367
|
-
Returns:
|
|
1368
|
-
The converted markdown text (only content, no HTML tags).
|
|
1369
|
-
"""
|
|
1370
918
|
_ = tag
|
|
1371
919
|
if convert_as_inline:
|
|
1372
920
|
return text
|
|
@@ -1374,21 +922,10 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1374
922
|
if not text.strip():
|
|
1375
923
|
return ""
|
|
1376
924
|
|
|
1377
|
-
# Menu is converted as a list
|
|
1378
925
|
return _format_block_element(text)
|
|
1379
926
|
|
|
1380
927
|
|
|
1381
928
|
def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1382
|
-
"""Convert HTML figure element to semantic Markdown.
|
|
1383
|
-
|
|
1384
|
-
Args:
|
|
1385
|
-
text: The text content of the figure element.
|
|
1386
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1387
|
-
tag: The figure tag element.
|
|
1388
|
-
|
|
1389
|
-
Returns:
|
|
1390
|
-
The converted markdown text (only content, no HTML tags).
|
|
1391
|
-
"""
|
|
1392
929
|
_ = tag
|
|
1393
930
|
if not text.strip():
|
|
1394
931
|
return ""
|
|
@@ -1396,8 +933,6 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1396
933
|
if convert_as_inline:
|
|
1397
934
|
return text
|
|
1398
935
|
|
|
1399
|
-
# Figure is a semantic container, return its content
|
|
1400
|
-
# Make sure there's proper spacing after the figure content
|
|
1401
936
|
content = text.strip()
|
|
1402
937
|
if content and not content.endswith("\n\n"):
|
|
1403
938
|
if content.endswith("\n"):
|
|
@@ -1408,55 +943,24 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1408
943
|
|
|
1409
944
|
|
|
1410
945
|
def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
1411
|
-
"""Convert HTML hgroup element to semantic Markdown.
|
|
1412
|
-
|
|
1413
|
-
Args:
|
|
1414
|
-
text: The text content of the hgroup element.
|
|
1415
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1416
|
-
|
|
1417
|
-
Returns:
|
|
1418
|
-
The converted markdown text (only content, no HTML tags).
|
|
1419
|
-
"""
|
|
1420
946
|
if convert_as_inline:
|
|
1421
947
|
return text
|
|
1422
948
|
|
|
1423
949
|
if not text.strip():
|
|
1424
950
|
return ""
|
|
1425
951
|
|
|
1426
|
-
# Hgroup is a semantic container for headings, return its content
|
|
1427
952
|
return text
|
|
1428
953
|
|
|
1429
954
|
|
|
1430
955
|
def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1431
|
-
"""Convert HTML picture element to semantic Markdown.
|
|
1432
|
-
|
|
1433
|
-
Args:
|
|
1434
|
-
text: The text content of the picture element.
|
|
1435
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1436
|
-
tag: The picture tag element.
|
|
1437
|
-
|
|
1438
|
-
Returns:
|
|
1439
|
-
The converted markdown text (only the img element).
|
|
1440
|
-
"""
|
|
1441
956
|
_ = tag, convert_as_inline
|
|
1442
957
|
if not text.strip():
|
|
1443
958
|
return ""
|
|
1444
959
|
|
|
1445
|
-
# Picture is a container for responsive images, only the img matters for Markdown
|
|
1446
960
|
return text.strip()
|
|
1447
961
|
|
|
1448
962
|
|
|
1449
963
|
def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1450
|
-
"""Convert SVG element to Markdown image reference.
|
|
1451
|
-
|
|
1452
|
-
Args:
|
|
1453
|
-
text: The text content of the SVG element.
|
|
1454
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1455
|
-
tag: The SVG tag element.
|
|
1456
|
-
|
|
1457
|
-
Returns:
|
|
1458
|
-
The converted markdown text as an image reference.
|
|
1459
|
-
"""
|
|
1460
964
|
if convert_as_inline:
|
|
1461
965
|
return text.strip()
|
|
1462
966
|
|
|
@@ -1475,16 +979,6 @@ def _convert_svg(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1475
979
|
|
|
1476
980
|
|
|
1477
981
|
def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
1478
|
-
"""Convert MathML math element preserving mathematical notation.
|
|
1479
|
-
|
|
1480
|
-
Args:
|
|
1481
|
-
text: The text content of the math element.
|
|
1482
|
-
convert_as_inline: Whether to convert as inline content.
|
|
1483
|
-
tag: The math tag element.
|
|
1484
|
-
|
|
1485
|
-
Returns:
|
|
1486
|
-
The converted markdown text preserving math structure.
|
|
1487
|
-
"""
|
|
1488
982
|
if not text.strip():
|
|
1489
983
|
return ""
|
|
1490
984
|
|
|
@@ -1507,6 +1001,8 @@ def create_converters_map(
|
|
|
1507
1001
|
heading_style: Literal["atx", "atx_closed", "underlined"],
|
|
1508
1002
|
highlight_style: Literal["double-equal", "html", "bold"],
|
|
1509
1003
|
keep_inline_images_in: Iterable[str] | None,
|
|
1004
|
+
list_indent_type: str,
|
|
1005
|
+
list_indent_width: int,
|
|
1510
1006
|
newline_style: str,
|
|
1511
1007
|
strong_em_symbol: str,
|
|
1512
1008
|
sub_symbol: str,
|
|
@@ -1514,27 +1010,7 @@ def create_converters_map(
|
|
|
1514
1010
|
wrap: bool,
|
|
1515
1011
|
wrap_width: int,
|
|
1516
1012
|
) -> ConvertersMap:
|
|
1517
|
-
""
|
|
1518
|
-
|
|
1519
|
-
Args:
|
|
1520
|
-
autolinks: Whether to convert URLs into links.
|
|
1521
|
-
bullets: The bullet characters to use for unordered lists.
|
|
1522
|
-
code_language: The default code language to use.
|
|
1523
|
-
code_language_callback: A callback to get the code language.
|
|
1524
|
-
default_title: Whether to use the URL as the title for links.
|
|
1525
|
-
heading_style: The style of headings.
|
|
1526
|
-
highlight_style: The style to use for highlighted text (mark elements).
|
|
1527
|
-
keep_inline_images_in: The tags to keep inline images in.
|
|
1528
|
-
newline_style: The style of newlines.
|
|
1529
|
-
strong_em_symbol: The symbol to use for strong and emphasis text.
|
|
1530
|
-
sub_symbol: The symbol to use for subscript text.
|
|
1531
|
-
sup_symbol: The symbol to use for superscript text.
|
|
1532
|
-
wrap: Whether to wrap text.
|
|
1533
|
-
wrap_width: The width to wrap text at.
|
|
1534
|
-
|
|
1535
|
-
Returns:
|
|
1536
|
-
A mapping of HTML elements to their corresponding conversion functions
|
|
1537
|
-
"""
|
|
1013
|
+
list_indent_str = "\t" if list_indent_type == "tabs" else " " * list_indent_width
|
|
1538
1014
|
|
|
1539
1015
|
def _wrapper(func: Callable[..., T]) -> Callable[[str, Tag], T]:
|
|
1540
1016
|
spec = getfullargspec(func)
|
|
@@ -1548,6 +1024,8 @@ def create_converters_map(
|
|
|
1548
1024
|
kwargs["text"] = text
|
|
1549
1025
|
if "convert_as_inline" in spec.kwonlyargs:
|
|
1550
1026
|
kwargs["convert_as_inline"] = convert_as_inline
|
|
1027
|
+
if "list_indent_str" in spec.kwonlyargs:
|
|
1028
|
+
kwargs["list_indent_str"] = list_indent_str
|
|
1551
1029
|
return func(**kwargs)
|
|
1552
1030
|
return func(text)
|
|
1553
1031
|
|
|
@@ -1562,7 +1040,7 @@ def create_converters_map(
|
|
|
1562
1040
|
"b": _wrapper(partial(_create_inline_converter(2 * strong_em_symbol))),
|
|
1563
1041
|
"bdi": _wrapper(_create_inline_converter("")),
|
|
1564
1042
|
"bdo": _wrapper(_create_inline_converter("")),
|
|
1565
|
-
"blockquote": _wrapper(partial(_convert_blockquote)),
|
|
1043
|
+
"blockquote": _wrapper(partial(_convert_blockquote, list_indent_str=list_indent_str)),
|
|
1566
1044
|
"br": _wrapper(partial(_convert_br, newline_style=newline_style)),
|
|
1567
1045
|
"button": _wrapper(_convert_button),
|
|
1568
1046
|
"caption": _wrapper(_convert_caption),
|
|
@@ -1577,6 +1055,7 @@ def create_converters_map(
|
|
|
1577
1055
|
"details": _wrapper(_convert_details),
|
|
1578
1056
|
"dfn": _wrapper(_create_inline_converter("*")),
|
|
1579
1057
|
"dialog": _wrapper(_convert_dialog),
|
|
1058
|
+
"div": _wrapper(_convert_div),
|
|
1580
1059
|
"dl": _wrapper(_convert_dl),
|
|
1581
1060
|
"dt": _wrapper(_convert_dt),
|
|
1582
1061
|
"em": _wrapper(_create_inline_converter(strong_em_symbol)),
|
|
@@ -1602,19 +1081,19 @@ def create_converters_map(
|
|
|
1602
1081
|
"kbd": _wrapper(_create_inline_converter("`")),
|
|
1603
1082
|
"label": _wrapper(_convert_label),
|
|
1604
1083
|
"legend": _wrapper(_convert_legend),
|
|
1605
|
-
"li": _wrapper(partial(_convert_li, bullets=bullets)),
|
|
1606
|
-
"list": _wrapper(_convert_list),
|
|
1084
|
+
"li": _wrapper(partial(_convert_li, bullets=bullets, list_indent_str=list_indent_str)),
|
|
1085
|
+
"list": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
|
|
1607
1086
|
"main": _wrapper(_convert_semantic_block),
|
|
1608
1087
|
"mark": _wrapper(partial(_convert_mark, highlight_style=highlight_style)),
|
|
1609
1088
|
"math": _wrapper(_convert_math),
|
|
1610
1089
|
"menu": _wrapper(_convert_menu),
|
|
1611
1090
|
"meter": _wrapper(_convert_meter),
|
|
1612
1091
|
"nav": _wrapper(_convert_semantic_block),
|
|
1613
|
-
"ol": _wrapper(_convert_list),
|
|
1092
|
+
"ol": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
|
|
1614
1093
|
"optgroup": _wrapper(_convert_optgroup),
|
|
1615
1094
|
"option": _wrapper(_convert_option),
|
|
1616
1095
|
"output": _wrapper(_convert_output),
|
|
1617
|
-
"p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width)),
|
|
1096
|
+
"p": _wrapper(partial(_convert_p, wrap=wrap, wrap_width=wrap_width, list_indent_str=list_indent_str)),
|
|
1618
1097
|
"picture": _wrapper(_convert_picture),
|
|
1619
1098
|
"pre": _wrapper(
|
|
1620
1099
|
partial(
|
|
@@ -1652,7 +1131,7 @@ def create_converters_map(
|
|
|
1652
1131
|
"time": _wrapper(_convert_time),
|
|
1653
1132
|
"tr": _wrapper(_convert_tr),
|
|
1654
1133
|
"u": _wrapper(_create_inline_converter("")),
|
|
1655
|
-
"ul": _wrapper(_convert_list),
|
|
1134
|
+
"ul": _wrapper(partial(_convert_list, list_indent_str=list_indent_str)),
|
|
1656
1135
|
"var": _wrapper(_create_inline_converter("*")),
|
|
1657
1136
|
"video": _wrapper(_convert_media_element),
|
|
1658
1137
|
"wbr": _wrapper(_convert_wbr),
|