html-to-markdown 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,8 +1,10 @@
1
1
  import sys
2
2
 
3
- def cli():
4
- from html_to_markdown.cli import main
3
+ from html_to_markdown.cli import main
5
4
 
5
+
6
+ def cli() -> None:
7
+ """Main CLI entrypoint."""
6
8
  try:
7
9
  result = main(sys.argv[1:])
8
10
  print(result) # noqa: T201
@@ -10,5 +12,6 @@ def cli():
10
12
  print(str(e), file=sys.stderr) # noqa: T201
11
13
  sys.exit(1)
12
14
 
15
+
13
16
  if __name__ == "__main__":
14
17
  cli()
html_to_markdown/cli.py CHANGED
@@ -1,11 +1,12 @@
1
- def main(argv: list[str]) -> str:
2
- """Command-line entry point."""
3
- from argparse import ArgumentParser, FileType
4
- from sys import stdin
1
+ from argparse import ArgumentParser, FileType
2
+ from sys import stdin
3
+
4
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
5
+ from html_to_markdown.processing import convert_to_markdown
5
6
 
6
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
7
- from html_to_markdown.processing import convert_to_markdown
8
7
 
8
+ def main(argv: list[str]) -> str:
9
+ """Command-line entry point."""
9
10
  parser = ArgumentParser(
10
11
  prog="html_to_markdown",
11
12
  description="Converts HTML to Markdown.",
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
127
128
  help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
128
129
  )
129
130
 
131
+ parser.add_argument(
132
+ "--strip-newlines",
133
+ action="store_true",
134
+ help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
135
+ )
136
+
130
137
  args = parser.parse_args(argv)
131
138
 
132
139
  return convert_to_markdown(
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
147
154
  keep_inline_images_in=args.keep_inline_images_in,
148
155
  wrap=args.wrap,
149
156
  wrap_width=args.wrap_width,
157
+ strip_newlines=args.strip_newlines,
150
158
  )
@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
156
156
  alt = tag.attrs.get("alt", "")
157
157
  src = tag.attrs.get("src", "")
158
158
  title = tag.attrs.get("title", "")
159
+ width = tag.attrs.get("width", "")
160
+ height = tag.attrs.get("height", "")
159
161
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
160
162
  parent_name = tag.parent.name if tag.parent else ""
161
- if convert_as_inline and parent_name not in (keep_inline_images_in or []):
163
+ # Always preserve images in table cells (td, th) by default
164
+ default_preserve_in = ["td", "th"]
165
+ preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
166
+ if convert_as_inline and parent_name not in preserve_in:
162
167
  return alt
163
-
168
+ if width or height:
169
+ return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
164
170
  return f"![{alt}]({src}{title_part})"
165
171
 
166
172
 
@@ -213,6 +213,7 @@ def convert_to_markdown(
213
213
  keep_inline_images_in: Iterable[str] | None = None,
214
214
  newline_style: Literal["spaces", "backslash"] = SPACES,
215
215
  strip: str | Iterable[str] | None = None,
216
+ strip_newlines: bool = False,
216
217
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
217
218
  sub_symbol: str = "",
218
219
  sup_symbol: str = "",
@@ -238,6 +239,7 @@ def convert_to_markdown(
238
239
  keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
239
240
  newline_style: Style for handling newlines in text content. Defaults to "spaces".
240
241
  strip: Tags to strip from the output. Defaults to None.
242
+ strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
241
243
  strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
242
244
  sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
243
245
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +261,10 @@ def convert_to_markdown(
259
261
  ):
260
262
  return source
261
263
 
264
+ if strip_newlines:
265
+ # Replace all newlines with spaces before parsing
266
+ source = source.replace("\n", " ").replace("\r", " ")
267
+
262
268
  if "".join(source.split("\n")):
263
269
  source = BeautifulSoup(source, "html.parser")
264
270
  else:
@@ -1,11 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.3
4
- Summary: Convert HTML to markdown
3
+ Version: 1.4.0
4
+ Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
- Keywords: converter,html,markdown,text-extraction,text-processing
7
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
8
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
9
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
10
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
11
+ Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Environment :: Console
9
14
  Classifier: Intended Audience :: Developers
10
15
  Classifier: License :: OSI Approved :: MIT License
11
16
  Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
15
20
  Classifier: Programming Language :: Python :: 3.11
16
21
  Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
25
  Classifier: Topic :: Text Processing
19
26
  Classifier: Topic :: Text Processing :: Markup
20
27
  Classifier: Topic :: Text Processing :: Markup :: HTML
@@ -0,0 +1,14 @@
1
+ html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
2
+ html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
3
+ html_to_markdown/cli.py,sha256=Kfh2sF_ySE_fQ0qdwvUZ5Rqx-P4Y12uTpG8xF60gAq0,4789
4
+ html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
+ html_to_markdown/converters.py,sha256=SHRAV1qIFQQdXSD_TToR_F_t8hw3-amz8rIs2Q84YbQ,12276
6
+ html_to_markdown/processing.py,sha256=mzF6YNqhj2VoRN6_TafnZ4ZndOyFglsZXTNnOl4uvWM,10564
7
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
9
+ html_to_markdown-1.4.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
10
+ html_to_markdown-1.4.0.dist-info/METADATA,sha256=LmjDer-QQkH6kSlkua1NBhnaKpIu4sVvyJZPPX5PgLk,8229
11
+ html_to_markdown-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
+ html_to_markdown-1.4.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
13
+ html_to_markdown-1.4.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
14
+ html_to_markdown-1.4.0.dist-info/RECORD,,
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
+ html-to-markdown = html_to_markdown.__main__:cli
2
3
  html_to_markdown = html_to_markdown.__main__:cli
@@ -1,14 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=95S7_7mR_g88uTnFI0FaRNykrtAaSKb6sJbwSea2zjk,145
2
- html_to_markdown/__main__.py,sha256=_EeKI8veMWZO7xsl-mBHBE-OmH1vnkVyXuExsOfduFI,286
3
- html_to_markdown/cli.py,sha256=HVnzmcyrYwah_yWhZ87mZcG0VgnKYp6y89fJh2R-Rlw,4532
4
- html_to_markdown/constants.py,sha256=Usk67k18tuRovJpKDsiEXdgH20KgqI9KOnK4Fbx-M5c,547
5
- html_to_markdown/converters.py,sha256=p8arBdejEeuAp9_wIYvp5PuWNBB0M699CgLSEkW3v88,11910
6
- html_to_markdown/processing.py,sha256=JNCjDgbfuW3YI7mfsj9aHlk2-KriQXJHU8Eo5D9Qj1E,10280
7
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
9
- html_to_markdown-1.3.3.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
10
- html_to_markdown-1.3.3.dist-info/METADATA,sha256=BcfsHs0cSG8Y1BScMsGFOGfN5mrxiu-HA_fJC6DrtFg,7653
11
- html_to_markdown-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- html_to_markdown-1.3.3.dist-info/entry_points.txt,sha256=jhMqXDYvIyzQDLKjCn4xCyzCCbAMl94tzQx_HiG5Qi0,67
13
- html_to_markdown-1.3.3.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
14
- html_to_markdown-1.3.3.dist-info/RECORD,,