html-to-markdown 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +5 -2
- html_to_markdown/cli.py +14 -6
- html_to_markdown/converters.py +8 -2
- html_to_markdown/processing.py +6 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/METADATA +11 -4
- html_to_markdown-1.4.0.dist-info/RECORD +14 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/entry_points.txt +1 -0
- html_to_markdown-1.3.3.dist-info/RECORD +0 -14
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.3.3.dist-info → html_to_markdown-1.4.0.dist-info}/top_level.txt +0 -0
html_to_markdown/__main__.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
from html_to_markdown.cli import main
|
|
3
|
+
from html_to_markdown.cli import main
|
|
5
4
|
|
|
5
|
+
|
|
6
|
+
def cli() -> None:
|
|
7
|
+
"""Main CLI entrypoint."""
|
|
6
8
|
try:
|
|
7
9
|
result = main(sys.argv[1:])
|
|
8
10
|
print(result) # noqa: T201
|
|
@@ -10,5 +12,6 @@ def cli():
|
|
|
10
12
|
print(str(e), file=sys.stderr) # noqa: T201
|
|
11
13
|
sys.exit(1)
|
|
12
14
|
|
|
15
|
+
|
|
13
16
|
if __name__ == "__main__":
|
|
14
17
|
cli()
|
html_to_markdown/cli.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
from argparse import ArgumentParser, FileType
|
|
2
|
+
from sys import stdin
|
|
3
|
+
|
|
4
|
+
from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
|
|
5
|
+
from html_to_markdown.processing import convert_to_markdown
|
|
5
6
|
|
|
6
|
-
from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
|
|
7
|
-
from html_to_markdown.processing import convert_to_markdown
|
|
8
7
|
|
|
8
|
+
def main(argv: list[str]) -> str:
|
|
9
|
+
"""Command-line entry point."""
|
|
9
10
|
parser = ArgumentParser(
|
|
10
11
|
prog="html_to_markdown",
|
|
11
12
|
description="Converts HTML to Markdown.",
|
|
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
|
|
|
127
128
|
help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
|
|
128
129
|
)
|
|
129
130
|
|
|
131
|
+
parser.add_argument(
|
|
132
|
+
"--strip-newlines",
|
|
133
|
+
action="store_true",
|
|
134
|
+
help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
|
|
135
|
+
)
|
|
136
|
+
|
|
130
137
|
args = parser.parse_args(argv)
|
|
131
138
|
|
|
132
139
|
return convert_to_markdown(
|
|
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
|
|
|
147
154
|
keep_inline_images_in=args.keep_inline_images_in,
|
|
148
155
|
wrap=args.wrap,
|
|
149
156
|
wrap_width=args.wrap_width,
|
|
157
|
+
strip_newlines=args.strip_newlines,
|
|
150
158
|
)
|
html_to_markdown/converters.py
CHANGED
|
@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
156
156
|
alt = tag.attrs.get("alt", "")
|
|
157
157
|
src = tag.attrs.get("src", "")
|
|
158
158
|
title = tag.attrs.get("title", "")
|
|
159
|
+
width = tag.attrs.get("width", "")
|
|
160
|
+
height = tag.attrs.get("height", "")
|
|
159
161
|
title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
|
|
160
162
|
parent_name = tag.parent.name if tag.parent else ""
|
|
161
|
-
|
|
163
|
+
# Always preserve images in table cells (td, th) by default
|
|
164
|
+
default_preserve_in = ["td", "th"]
|
|
165
|
+
preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
|
|
166
|
+
if convert_as_inline and parent_name not in preserve_in:
|
|
162
167
|
return alt
|
|
163
|
-
|
|
168
|
+
if width or height:
|
|
169
|
+
return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
|
|
164
170
|
return f"![{alt}]({src}{title_part})"
|
|
165
171
|
|
|
166
172
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -213,6 +213,7 @@ def convert_to_markdown(
|
|
|
213
213
|
keep_inline_images_in: Iterable[str] | None = None,
|
|
214
214
|
newline_style: Literal["spaces", "backslash"] = SPACES,
|
|
215
215
|
strip: str | Iterable[str] | None = None,
|
|
216
|
+
strip_newlines: bool = False,
|
|
216
217
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
217
218
|
sub_symbol: str = "",
|
|
218
219
|
sup_symbol: str = "",
|
|
@@ -238,6 +239,7 @@ def convert_to_markdown(
|
|
|
238
239
|
keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
|
|
239
240
|
newline_style: Style for handling newlines in text content. Defaults to "spaces".
|
|
240
241
|
strip: Tags to strip from the output. Defaults to None.
|
|
242
|
+
strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
|
|
241
243
|
strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
|
|
242
244
|
sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
|
|
243
245
|
sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
|
|
@@ -259,6 +261,10 @@ def convert_to_markdown(
|
|
|
259
261
|
):
|
|
260
262
|
return source
|
|
261
263
|
|
|
264
|
+
if strip_newlines:
|
|
265
|
+
# Replace all newlines with spaces before parsing
|
|
266
|
+
source = source.replace("\n", " ").replace("\r", " ")
|
|
267
|
+
|
|
262
268
|
if "".join(source.split("\n")):
|
|
263
269
|
source = BeautifulSoup(source, "html.parser")
|
|
264
270
|
else:
|
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.3.3
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.4.0
|
|
4
|
+
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
7
|
-
Project-URL:
|
|
8
|
-
|
|
7
|
+
Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
|
|
8
|
+
Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
|
|
9
|
+
Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
|
|
10
|
+
Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
|
|
11
|
+
Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Environment :: Console
|
|
9
14
|
Classifier: Intended Audience :: Developers
|
|
10
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
16
|
Classifier: Operating System :: OS Independent
|
|
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
20
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
21
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
22
|
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
25
|
Classifier: Topic :: Text Processing
|
|
19
26
|
Classifier: Topic :: Text Processing :: Markup
|
|
20
27
|
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
|
|
2
|
+
html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
|
|
3
|
+
html_to_markdown/cli.py,sha256=Kfh2sF_ySE_fQ0qdwvUZ5Rqx-P4Y12uTpG8xF60gAq0,4789
|
|
4
|
+
html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
|
|
5
|
+
html_to_markdown/converters.py,sha256=SHRAV1qIFQQdXSD_TToR_F_t8hw3-amz8rIs2Q84YbQ,12276
|
|
6
|
+
html_to_markdown/processing.py,sha256=mzF6YNqhj2VoRN6_TafnZ4ZndOyFglsZXTNnOl4uvWM,10564
|
|
7
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
9
|
+
html_to_markdown-1.4.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
10
|
+
html_to_markdown-1.4.0.dist-info/METADATA,sha256=LmjDer-QQkH6kSlkua1NBhnaKpIu4sVvyJZPPX5PgLk,8229
|
|
11
|
+
html_to_markdown-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
html_to_markdown-1.4.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
13
|
+
html_to_markdown-1.4.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
14
|
+
html_to_markdown-1.4.0.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
|
|
2
|
-
html_to_markdown/__main__.py,sha256=_EeKI8veMWZO7xsl-mBHBE-OmH1vnkVyXuExsOfduFI,286
|
|
3
|
-
html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
|
|
4
|
-
html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
|
|
5
|
-
html_to_markdown/converters.py,sha256=p8arBdejEeuAp9_wIYvp5PuWNBB0M699CgLSEkW3v88,11910
|
|
6
|
-
html_to_markdown/processing.py,sha256=JNCjDgbfuW3YI7mfsj9aHlk2-KriQXJHU8Eo5D9Qj1E,10280
|
|
7
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
9
|
-
html_to_markdown-1.3.3.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
10
|
-
html_to_markdown-1.3.3.dist-info/METADATA,sha256=BcfsHs0cSG8Y1BScMsGFOGfN5mrxiu-HA_fJC6DrtFg,7653
|
|
11
|
-
html_to_markdown-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
html_to_markdown-1.3.3.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
|
|
13
|
-
html_to_markdown-1.3.3.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
14
|
-
html_to_markdown-1.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|