html-to-markdown 1.3.3__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19)
  1. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/PKG-INFO +11 -4
  2. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/__main__.py +5 -2
  3. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/cli.py +14 -6
  4. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/converters.py +8 -2
  5. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/processing.py +6 -0
  6. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/PKG-INFO +11 -4
  7. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/entry_points.txt +1 -0
  8. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/pyproject.toml +45 -18
  9. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/LICENSE +0 -0
  10. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/README.md +0 -0
  11. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/__init__.py +0 -0
  12. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/constants.py +0 -0
  13. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/py.typed +0 -0
  14. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown/utils.py +0 -0
  15. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  16. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  17. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/requires.txt +0 -0
  18. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  19. {html_to_markdown-1.3.3 → html_to_markdown-1.4.0}/setup.cfg +0 -0
@@ -1,11 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.3
4
- Summary: Convert HTML to markdown
3
+ Version: 1.4.0
4
+ Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
- Keywords: converter,html,markdown,text-extraction,text-processing
7
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
8
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
9
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
10
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
11
+ Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Environment :: Console
9
14
  Classifier: Intended Audience :: Developers
10
15
  Classifier: License :: OSI Approved :: MIT License
11
16
  Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
15
20
  Classifier: Programming Language :: Python :: 3.11
16
21
  Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
25
  Classifier: Topic :: Text Processing
19
26
  Classifier: Topic :: Text Processing :: Markup
20
27
  Classifier: Topic :: Text Processing :: Markup :: HTML
@@ -1,8 +1,10 @@
1
1
  import sys
2
2
 
3
- def cli():
4
- from html_to_markdown.cli import main
3
+ from html_to_markdown.cli import main
5
4
 
5
+
6
+ def cli() -> None:
7
+ """Main CLI entrypoint."""
6
8
  try:
7
9
  result = main(sys.argv[1:])
8
10
  print(result) # noqa: T201
@@ -10,5 +12,6 @@ def cli():
10
12
  print(str(e), file=sys.stderr) # noqa: T201
11
13
  sys.exit(1)
12
14
 
15
+
13
16
  if __name__ == "__main__":
14
17
  cli()
@@ -1,11 +1,12 @@
1
- def main(argv: list[str]) -> str:
2
- """Command-line entry point."""
3
- from argparse import ArgumentParser, FileType
4
- from sys import stdin
1
+ from argparse import ArgumentParser, FileType
2
+ from sys import stdin
3
+
4
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
5
+ from html_to_markdown.processing import convert_to_markdown
5
6
 
6
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
7
- from html_to_markdown.processing import convert_to_markdown
8
7
 
8
+ def main(argv: list[str]) -> str:
9
+ """Command-line entry point."""
9
10
  parser = ArgumentParser(
10
11
  prog="html_to_markdown",
11
12
  description="Converts HTML to Markdown.",
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
127
128
  help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
128
129
  )
129
130
 
131
+ parser.add_argument(
132
+ "--strip-newlines",
133
+ action="store_true",
134
+ help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
135
+ )
136
+
130
137
  args = parser.parse_args(argv)
131
138
 
132
139
  return convert_to_markdown(
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
147
154
  keep_inline_images_in=args.keep_inline_images_in,
148
155
  wrap=args.wrap,
149
156
  wrap_width=args.wrap_width,
157
+ strip_newlines=args.strip_newlines,
150
158
  )
@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
156
156
  alt = tag.attrs.get("alt", "")
157
157
  src = tag.attrs.get("src", "")
158
158
  title = tag.attrs.get("title", "")
159
+ width = tag.attrs.get("width", "")
160
+ height = tag.attrs.get("height", "")
159
161
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
160
162
  parent_name = tag.parent.name if tag.parent else ""
161
- if convert_as_inline and parent_name not in (keep_inline_images_in or []):
163
+ # Always preserve images in table cells (td, th) by default
164
+ default_preserve_in = ["td", "th"]
165
+ preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
166
+ if convert_as_inline and parent_name not in preserve_in:
162
167
  return alt
163
-
168
+ if width or height:
169
+ return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
164
170
  return f"![{alt}]({src}{title_part})"
165
171
 
166
172
 
@@ -213,6 +213,7 @@ def convert_to_markdown(
213
213
  keep_inline_images_in: Iterable[str] | None = None,
214
214
  newline_style: Literal["spaces", "backslash"] = SPACES,
215
215
  strip: str | Iterable[str] | None = None,
216
+ strip_newlines: bool = False,
216
217
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
217
218
  sub_symbol: str = "",
218
219
  sup_symbol: str = "",
@@ -238,6 +239,7 @@ def convert_to_markdown(
238
239
  keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
239
240
  newline_style: Style for handling newlines in text content. Defaults to "spaces".
240
241
  strip: Tags to strip from the output. Defaults to None.
242
+ strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
241
243
  strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
242
244
  sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
243
245
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +261,10 @@ def convert_to_markdown(
259
261
  ):
260
262
  return source
261
263
 
264
+ if strip_newlines:
265
+ # Replace all newlines with spaces before parsing
266
+ source = source.replace("\n", " ").replace("\r", " ")
267
+
262
268
  if "".join(source.split("\n")):
263
269
  source = BeautifulSoup(source, "html.parser")
264
270
  else:
@@ -1,11 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.3
4
- Summary: Convert HTML to markdown
3
+ Version: 1.4.0
4
+ Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
- Keywords: converter,html,markdown,text-extraction,text-processing
7
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
8
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
9
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
10
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
11
+ Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Environment :: Console
9
14
  Classifier: Intended Audience :: Developers
10
15
  Classifier: License :: OSI Approved :: MIT License
11
16
  Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
15
20
  Classifier: Programming Language :: Python :: 3.11
16
21
  Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
25
  Classifier: Topic :: Text Processing
19
26
  Classifier: Topic :: Text Processing :: Markup
20
27
  Classifier: Topic :: Text Processing :: Markup :: HTML
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
+ html-to-markdown = html_to_markdown.__main__:cli
2
3
  html_to_markdown = html_to_markdown.__main__:cli
@@ -5,14 +5,26 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.3.3"
9
- description = "Convert HTML to markdown"
8
+ version = "1.4.0"
9
+ description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
10
10
  readme = "README.md"
11
- keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
11
+ keywords = [
12
+ "beautifulsoup",
13
+ "cli-tool",
14
+ "converter",
15
+ "html",
16
+ "html2markdown",
17
+ "markdown",
18
+ "markup",
19
+ "text-extraction",
20
+ "text-processing",
21
+ ]
12
22
  license = { text = "MIT" }
13
23
  authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
14
24
  requires-python = ">=3.9"
15
25
  classifiers = [
26
+ "Development Status :: 5 - Production/Stable",
27
+ "Environment :: Console",
16
28
  "Intended Audience :: Developers",
17
29
  "License :: OSI Approved :: MIT License",
18
30
  "Operating System :: OS Independent",
@@ -22,6 +34,8 @@ classifiers = [
22
34
  "Programming Language :: Python :: 3.11",
23
35
  "Programming Language :: Python :: 3.12",
24
36
  "Programming Language :: Python :: 3.13",
37
+ "Topic :: Internet :: WWW/HTTP",
38
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
39
  "Topic :: Text Processing",
26
40
  "Topic :: Text Processing :: Markup",
27
41
  "Topic :: Text Processing :: Markup :: HTML",
@@ -29,24 +43,27 @@ classifiers = [
29
43
  "Topic :: Utilities",
30
44
  "Typing :: Typed",
31
45
  ]
32
- scripts.html_to_markdown = "html_to_markdown.__main__:cli"
33
-
34
46
  dependencies = [
35
47
  "beautifulsoup4>=4.13.4",
36
48
  ]
37
49
 
38
- urls.homepage = "https://github.com/Goldziher/html-to-markdown"
50
+ urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
51
+ urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
52
+ urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
53
+ urls.Repository = "https://github.com/Goldziher/html-to-markdown.git"
54
+ scripts.html-to-markdown = "html_to_markdown.__main__:cli"
55
+ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
39
56
 
40
57
  [dependency-groups]
41
58
  dev = [
42
59
  "covdefaults>=2.3",
43
- "mypy>=1.14.1",
60
+ "mypy>=1.16.1",
44
61
  "pre-commit>=4.1",
45
- "pytest>=8.3.4",
46
- "pytest-cov>=6.1.1",
47
- "pytest-mock>=3.14",
48
- "ruff>=0.11.6",
49
- "types-beautifulsoup4>=4.12.0.20241020",
62
+ "pytest>=8.4.1",
63
+ "pytest-cov>=6.2.1",
64
+ "pytest-mock>=3.14.1",
65
+ "ruff>=0.12",
66
+ "types-beautifulsoup4>=4.12.0.20250516",
50
67
  "uv-bump",
51
68
  ]
52
69
 
@@ -70,16 +87,16 @@ lint.select = [ "ALL" ]
70
87
  lint.ignore = [
71
88
  "ANN401", # dynamically typed ANY
72
89
  "COM812", # Conflicts with formatter
73
- "D100", # # pydocstyle - missing docstring in public module
90
+ "D100", # pydocstyle - missing docstring in public module
74
91
  "D104", # pydocstyle - missing docstring in public package
75
92
  "D107", # pydocstyle - missing docstring in __init__
76
93
  "D205", # pydocstyle - 1 blank line required between summary line and description
77
94
  "E501", # pycodestyle line too long, handled by ruff format
78
- "EM", # Exception messages,
95
+ "EM", # Exception messages
79
96
  "FBT", # Boolean Args
80
97
  "FIX", # we allow todo and fixme comments
81
98
  "ISC001", # Conflicts with formatter
82
- "PLR0913", # Pylint - too many arguments.
99
+ "PLR0913", # Pylint - too many arguments
83
100
  "PLR2004", # Magic variables, we allow them
84
101
  "TD", # we allow todo and fixme comments
85
102
  "TRY", # Try except block, rules are too strict
@@ -91,22 +108,32 @@ lint.pydocstyle.convention = "google"
91
108
  [tool.pytest.ini_options]
92
109
  asyncio_mode = "auto"
93
110
  asyncio_default_fixture_loop_scope = "function"
111
+ testpaths = [ "tests" ]
112
+ filterwarnings = [
113
+ "error",
114
+ "ignore::pytest.PytestConfigWarning",
115
+ "ignore::pytest.PytestUnraisableExceptionWarning",
116
+ "ignore::ResourceWarning",
117
+ ]
94
118
 
95
119
  [tool.coverage.run]
120
+ source = [ "html_to_markdown" ]
96
121
  omit = [ "tests/*" ]
97
122
  plugins = [ "covdefaults" ]
98
- source = [ "html_to_markdown" ]
99
123
 
100
124
  [tool.coverage.report]
101
- exclude_lines = [ 'if TYPE_CHECKING:' ]
125
+ exclude_lines = [ "if TYPE_CHECKING:" ]
102
126
  fail_under = 100
127
+ show_missing = true
103
128
 
104
129
  [tool.mypy]
105
130
  packages = [ "html_to_markdown", "tests" ]
106
131
  python_version = "3.9"
132
+ strict = true
107
133
  implicit_reexport = false
108
134
  show_error_codes = true
109
- strict = true
135
+ warn_return_any = true
136
+ warn_unused_configs = true
110
137
 
111
138
  [[tool.mypy.overrides]]
112
139
  module = "tests.*"