html-to-markdown 1.12.1__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/converters.py +4 -1
- html_to_markdown/processing.py +71 -15
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/METADATA +1 -1
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/RECORD +8 -8
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.12.1.dist-info → html_to_markdown-1.13.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -414,7 +414,10 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
414
414
|
|
|
415
415
|
return "".join(result_parts)
|
|
416
416
|
|
|
417
|
-
|
|
417
|
+
# Ensure consistent whitespace handling for list items, especially with strip_newlines=True
|
|
418
|
+
# Strip any leading whitespace that may have been inherited from parent containers
|
|
419
|
+
clean_text = (text or "").strip()
|
|
420
|
+
return f"{bullet} {clean_text}\n"
|
|
418
421
|
|
|
419
422
|
|
|
420
423
|
def _convert_p(
|
html_to_markdown/processing.py
CHANGED
|
@@ -548,7 +548,13 @@ def convert_to_markdown(
|
|
|
548
548
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
549
549
|
'* Item 1\\n* Item 2\\n\\n'
|
|
550
550
|
"""
|
|
551
|
+
# Initialize original input string for Windows lxml fix
|
|
552
|
+
original_input_str = None
|
|
553
|
+
|
|
551
554
|
if isinstance(source, str):
|
|
555
|
+
# Store original string for plain text detection (Windows lxml fix)
|
|
556
|
+
original_input_str = source
|
|
557
|
+
|
|
552
558
|
if (
|
|
553
559
|
heading_style == UNDERLINED
|
|
554
560
|
and "Header" in source
|
|
@@ -697,23 +703,33 @@ def convert_to_markdown(
|
|
|
697
703
|
|
|
698
704
|
result = sink.get_result()
|
|
699
705
|
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
and not result.startswith((" ", "\t", "\n", "\r"))
|
|
704
|
-
):
|
|
706
|
+
# Parser-agnostic behavior: handle leading whitespace differences between parsers
|
|
707
|
+
# lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
|
|
708
|
+
if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
|
|
705
709
|
original_input = sink.original_source if hasattr(sink, "original_source") else original_source
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
if any(tag in original_input for tag in list_heading_tags):
|
|
712
|
-
leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
|
|
713
|
-
leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
|
|
710
|
+
if isinstance(original_input, str):
|
|
711
|
+
original_leading_whitespace_match = re.match(r"^[\s]*", original_input)
|
|
712
|
+
original_leading_whitespace = (
|
|
713
|
+
original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
|
|
714
|
+
)
|
|
714
715
|
|
|
715
|
-
|
|
716
|
-
|
|
716
|
+
# Case 1: lxml added leading newlines (like "\n<figure>") - strip them
|
|
717
|
+
if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
|
|
718
|
+
result = result.lstrip("\n\r")
|
|
719
|
+
|
|
720
|
+
# Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
|
|
721
|
+
# However, don't restore whitespace if strip_newlines=True was used, as the user
|
|
722
|
+
# explicitly requested to remove formatting whitespace
|
|
723
|
+
elif (
|
|
724
|
+
not strip_newlines
|
|
725
|
+
and not result.startswith((" ", "\t"))
|
|
726
|
+
and original_leading_whitespace.startswith((" ", "\t"))
|
|
727
|
+
):
|
|
728
|
+
# Only restore spaces/tabs, not newlines (which are usually formatting)
|
|
729
|
+
leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
|
|
730
|
+
leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
|
|
731
|
+
if leading_spaces_tabs:
|
|
732
|
+
result = leading_spaces_tabs + result
|
|
717
733
|
|
|
718
734
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
719
735
|
|
|
@@ -742,6 +758,46 @@ def convert_to_markdown(
|
|
|
742
758
|
if convert_as_inline:
|
|
743
759
|
result = result.rstrip("\n")
|
|
744
760
|
|
|
761
|
+
# Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
|
|
762
|
+
# This ensures consistent behavior across platforms when processing plain text
|
|
763
|
+
# Only apply to cases where lxml adds extra newlines (\n\n) at the end
|
|
764
|
+
if (
|
|
765
|
+
"original_input_str" in locals()
|
|
766
|
+
and original_input_str
|
|
767
|
+
and not original_input_str.strip().startswith("<")
|
|
768
|
+
and not original_input_str.strip().endswith(">")
|
|
769
|
+
and result.endswith("\n\n")
|
|
770
|
+
):
|
|
771
|
+
# Input appears to be plain text, not HTML - normalize trailing newlines only
|
|
772
|
+
result = result.rstrip("\n")
|
|
773
|
+
|
|
774
|
+
# If the original input contained no block-level elements, normalize any
|
|
775
|
+
# accidental trailing newlines for cross-platform consistency.
|
|
776
|
+
# This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
|
|
777
|
+
# and head-only documents (e.g., "<head>head</head>") where output should
|
|
778
|
+
# not end with extra blank lines.
|
|
779
|
+
if "original_input_str" in locals() and original_input_str:
|
|
780
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
781
|
+
|
|
782
|
+
# Treat additional tags as block-producing for trailing newline purposes.
|
|
783
|
+
# These may be inline in HTML spec but produce block output in our Markdown conversion.
|
|
784
|
+
blockish = set(BLOCK_ELEMENTS) | {
|
|
785
|
+
"textarea",
|
|
786
|
+
"dialog",
|
|
787
|
+
"label",
|
|
788
|
+
"button",
|
|
789
|
+
"progress",
|
|
790
|
+
"meter",
|
|
791
|
+
"output",
|
|
792
|
+
"math",
|
|
793
|
+
"audio",
|
|
794
|
+
"video",
|
|
795
|
+
"iframe",
|
|
796
|
+
}
|
|
797
|
+
block_pattern = r"<(?:" + "|".join(sorted(blockish)) + r")\b"
|
|
798
|
+
if not re.search(block_pattern, original_input_str, flags=re.IGNORECASE):
|
|
799
|
+
result = result.rstrip("\n")
|
|
800
|
+
|
|
745
801
|
return result
|
|
746
802
|
|
|
747
803
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.12.1
|
|
3
|
+
Version: 1.13.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -2,16 +2,16 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
|
|
|
2
2
|
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
3
|
html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=
|
|
5
|
+
html_to_markdown/converters.py,sha256=l4ZtIhfOdemvaApRjH7qmzHrWNF3PDlBzsT1LRw3n0Y,36022
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
7
|
html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
8
|
+
html_to_markdown/processing.py,sha256=SjVStbriaOb24ZwCcRp8eqOJ1p5bIVxpCXSMW3vQojs,38059
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
11
|
html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
|
|
12
|
-
html_to_markdown-1.
|
|
13
|
-
html_to_markdown-1.
|
|
14
|
-
html_to_markdown-1.
|
|
15
|
-
html_to_markdown-1.
|
|
16
|
-
html_to_markdown-1.
|
|
17
|
-
html_to_markdown-1.12.1.dist-info/RECORD,,
|
|
12
|
+
html_to_markdown-1.13.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.13.0.dist-info/METADATA,sha256=CIfFx5C69D3lFg3wgajZnMRmQV-7C78ga2zbXKcxcsc,22694
|
|
14
|
+
html_to_markdown-1.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.13.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.13.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.13.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|