html-to-markdown 1.9.0__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/PKG-INFO +11 -3
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/README.md +8 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/converters.py +22 -72
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/processing.py +0 -21
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/PKG-INFO +11 -3
- html_to_markdown-1.9.1/html_to_markdown.egg-info/requires.txt +5 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/pyproject.toml +8 -8
- html_to_markdown-1.9.0/html_to_markdown.egg-info/requires.txt +0 -5
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/LICENSE +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/preprocessor.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.9.0
|
|
3
|
+
Version: 1.9.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -30,10 +30,10 @@ Classifier: Typing :: Typed
|
|
|
30
30
|
Requires-Python: >=3.10
|
|
31
31
|
Description-Content-Type: text/markdown
|
|
32
32
|
License-File: LICENSE
|
|
33
|
-
Requires-Dist: beautifulsoup4>=4.13.
|
|
33
|
+
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
35
|
Provides-Extra: lxml
|
|
36
|
-
Requires-Dist: lxml>=6; extra == "lxml"
|
|
36
|
+
Requires-Dist: lxml>=6.0.1; extra == "lxml"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# html-to-markdown
|
|
@@ -42,6 +42,14 @@ A modern, fully typed Python library for converting HTML to Markdown. This libra
|
|
|
42
42
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
43
|
Python 3.9+.
|
|
44
44
|
|
|
45
|
+
## Support This Project
|
|
46
|
+
|
|
47
|
+
If you find html-to-markdown useful, please consider sponsoring the development:
|
|
48
|
+
|
|
49
|
+
<a href="https://github.com/sponsors/Goldziher"><img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor on GitHub" height="32"></a>
|
|
50
|
+
|
|
51
|
+
Your support helps maintain and improve this library for the community! 🚀
|
|
52
|
+
|
|
45
53
|
## Features
|
|
46
54
|
|
|
47
55
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
@@ -4,6 +4,14 @@ A modern, fully typed Python library for converting HTML to Markdown. This libra
|
|
|
4
4
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
5
5
|
Python 3.9+.
|
|
6
6
|
|
|
7
|
+
## Support This Project
|
|
8
|
+
|
|
9
|
+
If you find html-to-markdown useful, please consider sponsoring the development:
|
|
10
|
+
|
|
11
|
+
<a href="https://github.com/sponsors/Goldziher"><img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor on GitHub" height="32"></a>
|
|
12
|
+
|
|
13
|
+
Your support helps maintain and improve this library for the community! 🚀
|
|
14
|
+
|
|
7
15
|
## Features
|
|
8
16
|
|
|
9
17
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
@@ -211,9 +211,7 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
211
211
|
|
|
212
212
|
cite_url = tag.get("cite")
|
|
213
213
|
|
|
214
|
-
# Check if this blockquote is inside a list item
|
|
215
214
|
if _has_ancestor(tag, "li"):
|
|
216
|
-
# Indent the blockquote by 4 spaces
|
|
217
215
|
lines = text.strip().split("\n")
|
|
218
216
|
indented_lines = [f" > {line}" if line.strip() else "" for line in lines]
|
|
219
217
|
quote_text = "\n".join(indented_lines) + "\n\n"
|
|
@@ -290,16 +288,12 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
290
288
|
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
291
289
|
before_paragraph = True
|
|
292
290
|
|
|
293
|
-
# Check if this list is inside a list item
|
|
294
291
|
if _has_ancestor(tag, "li"):
|
|
295
|
-
# This is a nested list - needs indentation
|
|
296
|
-
# But we need to check if it's the first element after a paragraph
|
|
297
292
|
parent = tag.parent
|
|
298
293
|
while parent and parent.name != "li":
|
|
299
294
|
parent = parent.parent
|
|
300
295
|
|
|
301
296
|
if parent:
|
|
302
|
-
# Check if there's a paragraph before this list
|
|
303
297
|
prev_p = None
|
|
304
298
|
for child in parent.children:
|
|
305
299
|
if hasattr(child, "name"):
|
|
@@ -309,7 +303,6 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
309
303
|
prev_p = child
|
|
310
304
|
|
|
311
305
|
if prev_p:
|
|
312
|
-
# If there's a paragraph before, we need proper indentation
|
|
313
306
|
lines = text.strip().split("\n")
|
|
314
307
|
indented_lines = []
|
|
315
308
|
for line in lines:
|
|
@@ -318,9 +311,21 @@ def _convert_list(*, tag: Tag, text: str) -> str:
|
|
|
318
311
|
else:
|
|
319
312
|
indented_lines.append("")
|
|
320
313
|
return "\n" + "\n".join(indented_lines) + "\n"
|
|
321
|
-
# Otherwise use the original tab indentation
|
|
322
314
|
return "\n" + indent(text=text, level=1).rstrip()
|
|
323
315
|
|
|
316
|
+
if tag.parent and tag.parent.name in {"ul", "ol"}:
|
|
317
|
+
lines = text.strip().split("\n")
|
|
318
|
+
indented_lines = []
|
|
319
|
+
for line in lines:
|
|
320
|
+
if line.strip():
|
|
321
|
+
indented_lines.append(f" {line}")
|
|
322
|
+
else:
|
|
323
|
+
indented_lines.append("")
|
|
324
|
+
result = "\n".join(indented_lines)
|
|
325
|
+
if not result.endswith("\n"):
|
|
326
|
+
result += "\n"
|
|
327
|
+
return result
|
|
328
|
+
|
|
324
329
|
return text + ("\n" if before_paragraph else "")
|
|
325
330
|
|
|
326
331
|
|
|
@@ -355,7 +360,6 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
355
360
|
|
|
356
361
|
bullet = bullets[depth % len(bullets)]
|
|
357
362
|
|
|
358
|
-
# Check if the list item contains block-level elements (like <p>, <blockquote>, etc.)
|
|
359
363
|
has_block_children = any(
|
|
360
364
|
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
361
365
|
for child in tag.children
|
|
@@ -363,25 +367,18 @@ def _convert_li(*, tag: Tag, text: str, bullets: str) -> str:
|
|
|
363
367
|
)
|
|
364
368
|
|
|
365
369
|
if has_block_children:
|
|
366
|
-
# Handle multi-paragraph list items
|
|
367
|
-
# Split by double newlines (paragraph separators)
|
|
368
370
|
paragraphs = text.strip().split("\n\n")
|
|
369
371
|
|
|
370
372
|
if paragraphs:
|
|
371
|
-
# First paragraph goes directly after the bullet
|
|
372
373
|
result_parts = [f"{bullet} {paragraphs[0].strip()}\n"]
|
|
373
374
|
|
|
374
|
-
# Subsequent paragraphs need to be indented and separated by blank lines
|
|
375
375
|
for para in paragraphs[1:]:
|
|
376
376
|
if para.strip():
|
|
377
|
-
# Add blank line before the paragraph
|
|
378
377
|
result_parts.append("\n")
|
|
379
|
-
# Indent each line of the paragraph by 4 spaces
|
|
380
378
|
result_parts.extend(f" {line}\n" for line in para.strip().split("\n") if line.strip())
|
|
381
379
|
|
|
382
380
|
return "".join(result_parts)
|
|
383
381
|
|
|
384
|
-
# Simple case: no block elements, just inline content
|
|
385
382
|
return "{} {}\n".format(bullet, (text or "").strip())
|
|
386
383
|
|
|
387
384
|
|
|
@@ -399,20 +396,15 @@ def _convert_p(*, wrap: bool, text: str, convert_as_inline: bool, wrap_width: in
|
|
|
399
396
|
|
|
400
397
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
401
398
|
|
|
402
|
-
# Check if this paragraph is inside a list item
|
|
403
399
|
if _has_ancestor(tag, "li"):
|
|
404
|
-
# Check if this is the first paragraph in the list item
|
|
405
400
|
parent = tag.parent
|
|
406
401
|
while parent and parent.name != "li":
|
|
407
402
|
parent = parent.parent
|
|
408
403
|
|
|
409
404
|
if parent:
|
|
410
|
-
# Get all direct children that are paragraphs
|
|
411
405
|
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
412
406
|
|
|
413
|
-
# If this is not the first paragraph, indent it
|
|
414
407
|
if p_children and tag != p_children[0]:
|
|
415
|
-
# Indent all lines by 4 spaces
|
|
416
408
|
indented_lines = []
|
|
417
409
|
for line in text.split("\n"):
|
|
418
410
|
if line.strip():
|
|
@@ -480,13 +472,11 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
480
472
|
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
481
473
|
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
482
474
|
|
|
483
|
-
# Simple rowspan handling: if previous row had cells with rowspan, add empty cells
|
|
484
475
|
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
485
476
|
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
486
477
|
rowspan_positions = []
|
|
487
478
|
col_pos = 0
|
|
488
479
|
|
|
489
|
-
# Check which cells in previous row have rowspan > 1
|
|
490
480
|
for prev_cell in prev_cells:
|
|
491
481
|
rowspan = 1
|
|
492
482
|
if (
|
|
@@ -497,10 +487,8 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
497
487
|
rowspan = int(prev_cell["rowspan"])
|
|
498
488
|
|
|
499
489
|
if rowspan > 1:
|
|
500
|
-
# This cell spans into current row
|
|
501
490
|
rowspan_positions.append(col_pos)
|
|
502
491
|
|
|
503
|
-
# Account for colspan
|
|
504
492
|
colspan = 1
|
|
505
493
|
if (
|
|
506
494
|
"colspan" in prev_cell.attrs
|
|
@@ -510,25 +498,22 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
510
498
|
colspan = int(prev_cell["colspan"])
|
|
511
499
|
col_pos += colspan
|
|
512
500
|
|
|
513
|
-
# If there are rowspan cells from previous row, add empty cells
|
|
514
501
|
if rowspan_positions:
|
|
515
|
-
|
|
516
|
-
|
|
502
|
+
converted_cells: list[str] = []
|
|
503
|
+
if text.strip():
|
|
504
|
+
parts = text.split("|")
|
|
505
|
+
converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
|
|
506
|
+
|
|
507
|
+
new_cells: list[str] = []
|
|
517
508
|
cell_index = 0
|
|
518
509
|
|
|
519
|
-
for pos in range(col_pos):
|
|
510
|
+
for pos in range(col_pos):
|
|
520
511
|
if pos in rowspan_positions:
|
|
521
|
-
# Add empty cell for rowspan
|
|
522
512
|
new_cells.append(" |")
|
|
523
|
-
elif cell_index < len(
|
|
524
|
-
|
|
525
|
-
cell = cells[cell_index]
|
|
526
|
-
cell_text = cell.get_text().strip().replace("\n", " ")
|
|
527
|
-
colspan = _get_colspan(cell)
|
|
528
|
-
new_cells.append(f" {cell_text} |" * colspan)
|
|
513
|
+
elif cell_index < len(converted_cells):
|
|
514
|
+
new_cells.append(converted_cells[cell_index])
|
|
529
515
|
cell_index += 1
|
|
530
516
|
|
|
531
|
-
# Override text with new cell arrangement
|
|
532
517
|
text = "".join(new_cells)
|
|
533
518
|
|
|
534
519
|
is_headrow = (
|
|
@@ -644,8 +629,6 @@ def _convert_colgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
644
629
|
Empty string as colgroup has no Markdown representation.
|
|
645
630
|
"""
|
|
646
631
|
_ = tag, text, convert_as_inline
|
|
647
|
-
# Colgroup and its contents (col elements) are purely presentational
|
|
648
|
-
# and have no equivalent in Markdown tables
|
|
649
632
|
return ""
|
|
650
633
|
|
|
651
634
|
|
|
@@ -663,7 +646,6 @@ def _convert_col(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
663
646
|
Empty string as col has no Markdown representation.
|
|
664
647
|
"""
|
|
665
648
|
_ = tag, convert_as_inline
|
|
666
|
-
# Col elements are self-closing and purely presentational
|
|
667
649
|
return ""
|
|
668
650
|
|
|
669
651
|
|
|
@@ -696,7 +678,6 @@ def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
|
696
678
|
if convert_as_inline:
|
|
697
679
|
return text
|
|
698
680
|
|
|
699
|
-
# Details is a semantic container, return its content
|
|
700
681
|
return _format_block_element(text)
|
|
701
682
|
|
|
702
683
|
|
|
@@ -713,7 +694,6 @@ def _convert_summary(*, text: str, convert_as_inline: bool) -> str:
|
|
|
713
694
|
if convert_as_inline:
|
|
714
695
|
return text
|
|
715
696
|
|
|
716
|
-
# Summary is like a heading/title
|
|
717
697
|
return _format_wrapped_block(text, "**")
|
|
718
698
|
|
|
719
699
|
|
|
@@ -826,18 +806,15 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
|
|
|
826
806
|
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
827
807
|
src = source_tag.get("src", "")
|
|
828
808
|
|
|
829
|
-
# If we have a src, convert to a link
|
|
830
809
|
if src and isinstance(src, str) and src.strip():
|
|
831
810
|
link = f"[{src}]({src})"
|
|
832
811
|
if convert_as_inline:
|
|
833
812
|
return link
|
|
834
813
|
result = f"{link}\n\n"
|
|
835
|
-
# Add fallback content if present
|
|
836
814
|
if text.strip():
|
|
837
815
|
result += f"{text.strip()}\n\n"
|
|
838
816
|
return result
|
|
839
817
|
|
|
840
|
-
# No src, just return fallback content
|
|
841
818
|
if text.strip():
|
|
842
819
|
return _format_inline_or_block(text, convert_as_inline)
|
|
843
820
|
|
|
@@ -858,7 +835,6 @@ def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
858
835
|
_ = text
|
|
859
836
|
src = tag.get("src", "")
|
|
860
837
|
|
|
861
|
-
# If we have a src, convert to a link
|
|
862
838
|
if src and isinstance(src, str) and src.strip():
|
|
863
839
|
link = f"[{src}]({src})"
|
|
864
840
|
if convert_as_inline:
|
|
@@ -906,7 +882,6 @@ def _convert_time(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
906
882
|
if not text.strip():
|
|
907
883
|
return ""
|
|
908
884
|
|
|
909
|
-
# Time elements are semantic - just return the content
|
|
910
885
|
return text.strip()
|
|
911
886
|
|
|
912
887
|
|
|
@@ -926,7 +901,6 @@ def _convert_data(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
926
901
|
if not text.strip():
|
|
927
902
|
return ""
|
|
928
903
|
|
|
929
|
-
# Data elements are semantic - just return the content
|
|
930
904
|
return text.strip()
|
|
931
905
|
|
|
932
906
|
|
|
@@ -961,7 +935,6 @@ def _convert_form(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
961
935
|
if not text.strip():
|
|
962
936
|
return ""
|
|
963
937
|
|
|
964
|
-
# Forms are just containers, return their content
|
|
965
938
|
return text
|
|
966
939
|
|
|
967
940
|
|
|
@@ -981,7 +954,6 @@ def _convert_fieldset(*, text: str, convert_as_inline: bool) -> str:
|
|
|
981
954
|
if not text.strip():
|
|
982
955
|
return ""
|
|
983
956
|
|
|
984
|
-
# Fieldsets are semantic groupings, return their content
|
|
985
957
|
return text
|
|
986
958
|
|
|
987
959
|
|
|
@@ -1001,7 +973,6 @@ def _convert_legend(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1001
973
|
if not text.strip():
|
|
1002
974
|
return ""
|
|
1003
975
|
|
|
1004
|
-
# Legend is like a heading/title for fieldsets
|
|
1005
976
|
return _format_wrapped_block(text, "**")
|
|
1006
977
|
|
|
1007
978
|
|
|
@@ -1017,7 +988,6 @@ def _convert_label(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1017
988
|
The label text content.
|
|
1018
989
|
"""
|
|
1019
990
|
_ = tag
|
|
1020
|
-
# Labels are just text, return the content
|
|
1021
991
|
if not text.strip():
|
|
1022
992
|
return ""
|
|
1023
993
|
|
|
@@ -1035,7 +1005,6 @@ def _convert_input_enhanced(*, tag: Tag, convert_as_inline: bool) -> str:
|
|
|
1035
1005
|
Empty string since input elements have no Markdown representation.
|
|
1036
1006
|
"""
|
|
1037
1007
|
_ = tag, convert_as_inline
|
|
1038
|
-
# Input elements have no content and no Markdown equivalent
|
|
1039
1008
|
return ""
|
|
1040
1009
|
|
|
1041
1010
|
|
|
@@ -1051,7 +1020,6 @@ def _convert_textarea(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1051
1020
|
The text content of the textarea.
|
|
1052
1021
|
"""
|
|
1053
1022
|
_ = tag
|
|
1054
|
-
# Return the text content, which is what the user entered
|
|
1055
1023
|
if not text.strip():
|
|
1056
1024
|
return ""
|
|
1057
1025
|
|
|
@@ -1070,17 +1038,13 @@ def _convert_select(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1070
1038
|
The text content (options) as a comma-separated list.
|
|
1071
1039
|
"""
|
|
1072
1040
|
_ = tag
|
|
1073
|
-
# Return the options as text
|
|
1074
1041
|
if not text.strip():
|
|
1075
1042
|
return ""
|
|
1076
1043
|
|
|
1077
|
-
# In inline mode, show options separated by commas
|
|
1078
1044
|
if convert_as_inline:
|
|
1079
|
-
# Remove extra whitespace and join options
|
|
1080
1045
|
options = [opt.strip() for opt in text.strip().split("\n") if opt.strip()]
|
|
1081
1046
|
return ", ".join(options)
|
|
1082
1047
|
|
|
1083
|
-
# In block mode, show as a list
|
|
1084
1048
|
return _format_block_element(text)
|
|
1085
1049
|
|
|
1086
1050
|
|
|
@@ -1098,14 +1062,12 @@ def _convert_option(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1098
1062
|
if not text.strip():
|
|
1099
1063
|
return ""
|
|
1100
1064
|
|
|
1101
|
-
# Check if this option is selected
|
|
1102
1065
|
selected = tag.get("selected") is not None
|
|
1103
1066
|
content = text.strip()
|
|
1104
1067
|
|
|
1105
1068
|
if convert_as_inline:
|
|
1106
1069
|
return content
|
|
1107
1070
|
|
|
1108
|
-
# In block mode, mark selected options
|
|
1109
1071
|
if selected:
|
|
1110
1072
|
return f"* {content}\n"
|
|
1111
1073
|
return f"{content}\n"
|
|
@@ -1131,7 +1093,6 @@ def _convert_optgroup(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1131
1093
|
label = tag.get("label", "")
|
|
1132
1094
|
content = text.strip()
|
|
1133
1095
|
|
|
1134
|
-
# If there's a label, show it as a heading
|
|
1135
1096
|
if label and isinstance(label, str) and label.strip():
|
|
1136
1097
|
return f"**{label.strip()}**\n{content}\n"
|
|
1137
1098
|
|
|
@@ -1150,7 +1111,6 @@ def _convert_button(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1150
1111
|
The button text content.
|
|
1151
1112
|
"""
|
|
1152
1113
|
_ = tag
|
|
1153
|
-
# Buttons are just interactive text, return the text content
|
|
1154
1114
|
if not text.strip():
|
|
1155
1115
|
return ""
|
|
1156
1116
|
|
|
@@ -1175,7 +1135,6 @@ def _convert_progress(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1175
1135
|
if not text.strip():
|
|
1176
1136
|
return ""
|
|
1177
1137
|
|
|
1178
|
-
# Progress elements convert to their text content
|
|
1179
1138
|
return _format_block_element(text)
|
|
1180
1139
|
|
|
1181
1140
|
|
|
@@ -1197,7 +1156,6 @@ def _convert_meter(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1197
1156
|
if not text.strip():
|
|
1198
1157
|
return ""
|
|
1199
1158
|
|
|
1200
|
-
# Meter elements convert to their text content
|
|
1201
1159
|
return _format_block_element(text)
|
|
1202
1160
|
|
|
1203
1161
|
|
|
@@ -1219,7 +1177,6 @@ def _convert_output(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1219
1177
|
if not text.strip():
|
|
1220
1178
|
return ""
|
|
1221
1179
|
|
|
1222
|
-
# Output elements convert to their text content
|
|
1223
1180
|
return _format_block_element(text)
|
|
1224
1181
|
|
|
1225
1182
|
|
|
@@ -1241,7 +1198,6 @@ def _convert_datalist(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
|
1241
1198
|
if not text.strip():
|
|
1242
1199
|
return ""
|
|
1243
1200
|
|
|
1244
|
-
# Datalist shows options as a list
|
|
1245
1201
|
return _format_block_element(text)
|
|
1246
1202
|
|
|
1247
1203
|
|
|
@@ -1352,7 +1308,6 @@ def _convert_dialog(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1352
1308
|
if not text.strip():
|
|
1353
1309
|
return ""
|
|
1354
1310
|
|
|
1355
|
-
# Dialog is a semantic container, return its content
|
|
1356
1311
|
return _format_block_element(text)
|
|
1357
1312
|
|
|
1358
1313
|
|
|
@@ -1374,7 +1329,6 @@ def _convert_menu(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1374
1329
|
if not text.strip():
|
|
1375
1330
|
return ""
|
|
1376
1331
|
|
|
1377
|
-
# Menu is converted as a list
|
|
1378
1332
|
return _format_block_element(text)
|
|
1379
1333
|
|
|
1380
1334
|
|
|
@@ -1396,8 +1350,6 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1396
1350
|
if convert_as_inline:
|
|
1397
1351
|
return text
|
|
1398
1352
|
|
|
1399
|
-
# Figure is a semantic container, return its content
|
|
1400
|
-
# Make sure there's proper spacing after the figure content
|
|
1401
1353
|
content = text.strip()
|
|
1402
1354
|
if content and not content.endswith("\n\n"):
|
|
1403
1355
|
if content.endswith("\n"):
|
|
@@ -1423,7 +1375,6 @@ def _convert_hgroup(*, text: str, convert_as_inline: bool) -> str:
|
|
|
1423
1375
|
if not text.strip():
|
|
1424
1376
|
return ""
|
|
1425
1377
|
|
|
1426
|
-
# Hgroup is a semantic container for headings, return its content
|
|
1427
1378
|
return text
|
|
1428
1379
|
|
|
1429
1380
|
|
|
@@ -1442,7 +1393,6 @@ def _convert_picture(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
1442
1393
|
if not text.strip():
|
|
1443
1394
|
return ""
|
|
1444
1395
|
|
|
1445
|
-
# Picture is a container for responsive images, only the img matters for Markdown
|
|
1446
1396
|
return text.strip()
|
|
1447
1397
|
|
|
1448
1398
|
|
|
@@ -195,18 +195,14 @@ def _process_tag(
|
|
|
195
195
|
|
|
196
196
|
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
|
|
197
197
|
|
|
198
|
-
# List of tags that return empty string when they have no content
|
|
199
198
|
empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
|
|
200
199
|
|
|
201
200
|
for i, el in enumerate(children):
|
|
202
201
|
if isinstance(el, NavigableString):
|
|
203
|
-
# Check if this is whitespace between empty elements
|
|
204
202
|
if el.strip() == "" and i > 0 and i < len(children) - 1:
|
|
205
203
|
prev_el = children[i - 1]
|
|
206
204
|
next_el = children[i + 1]
|
|
207
205
|
|
|
208
|
-
# If previous element was a tag that produced empty output
|
|
209
|
-
# and next element is also a tag that could be empty, skip this whitespace
|
|
210
206
|
if (
|
|
211
207
|
isinstance(prev_el, Tag)
|
|
212
208
|
and isinstance(next_el, Tag)
|
|
@@ -214,7 +210,6 @@ def _process_tag(
|
|
|
214
210
|
and next_el.name.lower() in empty_when_no_content_tags
|
|
215
211
|
and not prev_el.get_text().strip()
|
|
216
212
|
):
|
|
217
|
-
# Previous tag is empty and next could be empty too, skip this whitespace
|
|
218
213
|
continue
|
|
219
214
|
|
|
220
215
|
text_parts.append(
|
|
@@ -281,14 +276,10 @@ def _process_text(
|
|
|
281
276
|
break
|
|
282
277
|
|
|
283
278
|
if "pre" not in ancestor_names:
|
|
284
|
-
# Special case: if the text is only whitespace
|
|
285
279
|
if text.strip() == "":
|
|
286
|
-
# If it contains newlines, it's probably indentation whitespace, return empty
|
|
287
280
|
if "\n" in text:
|
|
288
281
|
text = ""
|
|
289
282
|
else:
|
|
290
|
-
# Check if this whitespace is between block elements
|
|
291
|
-
# Define block elements that should not have whitespace between them
|
|
292
283
|
block_elements = {
|
|
293
284
|
"p",
|
|
294
285
|
"ul",
|
|
@@ -320,7 +311,6 @@ def _process_text(
|
|
|
320
311
|
prev_sibling = el.previous_sibling
|
|
321
312
|
next_sibling = el.next_sibling
|
|
322
313
|
|
|
323
|
-
# Check if whitespace is between block elements
|
|
324
314
|
if (
|
|
325
315
|
prev_sibling
|
|
326
316
|
and hasattr(prev_sibling, "name")
|
|
@@ -329,10 +319,8 @@ def _process_text(
|
|
|
329
319
|
and hasattr(next_sibling, "name")
|
|
330
320
|
and next_sibling.name in block_elements
|
|
331
321
|
):
|
|
332
|
-
# Remove whitespace between block elements
|
|
333
322
|
text = ""
|
|
334
323
|
else:
|
|
335
|
-
# Otherwise it's inline whitespace, normalize to single space
|
|
336
324
|
text = " " if text else ""
|
|
337
325
|
else:
|
|
338
326
|
has_leading_space = text.startswith((" ", "\t"))
|
|
@@ -470,7 +458,6 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
470
458
|
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
471
459
|
metadata["canonical"] = canonical["href"]
|
|
472
460
|
|
|
473
|
-
# Extract link relations
|
|
474
461
|
link_relations = {"author", "license", "alternate"}
|
|
475
462
|
for rel_type in link_relations:
|
|
476
463
|
link = soup.find("link", rel=rel_type, href=True)
|
|
@@ -595,8 +582,6 @@ def convert_to_markdown(
|
|
|
595
582
|
if strip_newlines:
|
|
596
583
|
source = source.replace("\n", " ").replace("\r", " ")
|
|
597
584
|
|
|
598
|
-
# Fix lxml parsing of void elements like <wbr>
|
|
599
|
-
# lxml incorrectly treats them as container tags
|
|
600
585
|
source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
|
|
601
586
|
|
|
602
587
|
if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
|
|
@@ -737,7 +722,6 @@ def convert_to_markdown(
|
|
|
737
722
|
if leading_whitespace_match:
|
|
738
723
|
leading_whitespace = leading_whitespace_match.group(0)
|
|
739
724
|
|
|
740
|
-
# Check if input contains list or heading tags
|
|
741
725
|
list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
|
|
742
726
|
if any(tag in original_input for tag in list_heading_tags):
|
|
743
727
|
leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
|
|
@@ -751,19 +735,14 @@ def convert_to_markdown(
|
|
|
751
735
|
def normalize_spaces_outside_code(text: str) -> str:
|
|
752
736
|
parts = text.split("```")
|
|
753
737
|
for i in range(0, len(parts), 2):
|
|
754
|
-
# Process each line separately to preserve leading spaces
|
|
755
738
|
lines = parts[i].split("\n")
|
|
756
739
|
processed_lines = []
|
|
757
740
|
for line in lines:
|
|
758
|
-
# Preserve definition list formatting (: followed by 3 spaces)
|
|
759
741
|
def_parts = re.split(r"(:\s{3})", line)
|
|
760
742
|
for j in range(0, len(def_parts), 2):
|
|
761
|
-
# Only normalize non-definition-list parts
|
|
762
|
-
# Also preserve leading spaces (for list indentation)
|
|
763
743
|
match = re.match(r"^(\s*)(.*)", def_parts[j])
|
|
764
744
|
if match:
|
|
765
745
|
leading_spaces, rest = match.groups()
|
|
766
|
-
# Only normalize multiple spaces that are not at the beginning
|
|
767
746
|
rest = re.sub(r" {3,}", " ", rest)
|
|
768
747
|
def_parts[j] = leading_spaces + rest
|
|
769
748
|
processed_lines.append("".join(def_parts))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.9.0
|
|
3
|
+
Version: 1.9.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -30,10 +30,10 @@ Classifier: Typing :: Typed
|
|
|
30
30
|
Requires-Python: >=3.10
|
|
31
31
|
Description-Content-Type: text/markdown
|
|
32
32
|
License-File: LICENSE
|
|
33
|
-
Requires-Dist: beautifulsoup4>=4.13.
|
|
33
|
+
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
35
|
Provides-Extra: lxml
|
|
36
|
-
Requires-Dist: lxml>=6; extra == "lxml"
|
|
36
|
+
Requires-Dist: lxml>=6.0.1; extra == "lxml"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# html-to-markdown
|
|
@@ -42,6 +42,14 @@ A modern, fully typed Python library for converting HTML to Markdown. This libra
|
|
|
42
42
|
of [markdownify](https://pypi.org/project/markdownify/) with a modernized codebase, strict type safety and support for
|
|
43
43
|
Python 3.9+.
|
|
44
44
|
|
|
45
|
+
## Support This Project
|
|
46
|
+
|
|
47
|
+
If you find html-to-markdown useful, please consider sponsoring the development:
|
|
48
|
+
|
|
49
|
+
<a href="https://github.com/sponsors/Goldziher"><img src="https://img.shields.io/badge/Sponsor-%E2%9D%A4-pink?logo=github-sponsors" alt="Sponsor on GitHub" height="32"></a>
|
|
50
|
+
|
|
51
|
+
Your support helps maintain and improve this library for the community! 🚀
|
|
52
|
+
|
|
45
53
|
## Features
|
|
46
54
|
|
|
47
55
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.9.0"
|
|
8
|
+
version = "1.9.1"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -42,9 +42,9 @@ classifiers = [
|
|
|
42
42
|
"Topic :: Utilities",
|
|
43
43
|
"Typing :: Typed",
|
|
44
44
|
]
|
|
45
|
-
dependencies = [ "beautifulsoup4>=4.13.
|
|
45
|
+
dependencies = [ "beautifulsoup4>=4.13.5", "nh3>=0.3" ]
|
|
46
46
|
|
|
47
|
-
optional-dependencies.lxml = [ "lxml>=6" ]
|
|
47
|
+
optional-dependencies.lxml = [ "lxml>=6.0.1" ]
|
|
48
48
|
urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
|
|
49
49
|
urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
50
50
|
urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
|
|
@@ -54,16 +54,16 @@ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
|
|
|
54
54
|
|
|
55
55
|
[dependency-groups]
|
|
56
56
|
dev = [
|
|
57
|
-
"ai-rulez>=
|
|
57
|
+
"ai-rulez>=1.6.1",
|
|
58
58
|
"covdefaults>=2.3",
|
|
59
|
-
"mypy>=1.17",
|
|
60
|
-
"pre-commit>=4.
|
|
59
|
+
"mypy>=1.17.1",
|
|
60
|
+
"pre-commit>=4.3",
|
|
61
61
|
"pytest>=8.4.1",
|
|
62
62
|
"pytest-cov>=6.2.1",
|
|
63
63
|
"pytest-mock>=3.14.1",
|
|
64
|
-
"ruff>=0.12.
|
|
64
|
+
"ruff>=0.12.11",
|
|
65
65
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
66
|
-
"types-psutil>=7.0.0.
|
|
66
|
+
"types-psutil>=7.0.0.20250822",
|
|
67
67
|
"uv-bump",
|
|
68
68
|
]
|
|
69
69
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.9.0 → html_to_markdown-1.9.1}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|