html-to-markdown 1.3.2__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19)
  1. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/PKG-INFO +11 -4
  2. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/__main__.py +8 -2
  3. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/cli.py +14 -6
  4. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/converters.py +8 -2
  5. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/processing.py +6 -0
  6. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/PKG-INFO +11 -4
  7. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/SOURCES.txt +1 -0
  8. html_to_markdown-1.4.0/html_to_markdown.egg-info/entry_points.txt +3 -0
  9. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/pyproject.toml +45 -19
  10. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/LICENSE +0 -0
  11. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/README.md +0 -0
  12. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/__init__.py +0 -0
  13. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/constants.py +0 -0
  14. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/py.typed +0 -0
  15. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown/utils.py +0 -0
  16. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  17. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/requires.txt +0 -0
  18. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  19. {html_to_markdown-1.3.2 → html_to_markdown-1.4.0}/setup.cfg +0 -0
@@ -1,11 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.2
4
- Summary: Convert HTML to markdown
3
+ Version: 1.4.0
4
+ Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
- Keywords: converter,html,markdown,text-extraction,text-processing
7
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
8
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
9
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
10
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
11
+ Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Environment :: Console
9
14
  Classifier: Intended Audience :: Developers
10
15
  Classifier: License :: OSI Approved :: MIT License
11
16
  Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
15
20
  Classifier: Programming Language :: Python :: 3.11
16
21
  Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
25
  Classifier: Topic :: Text Processing
19
26
  Classifier: Topic :: Text Processing :: Markup
20
27
  Classifier: Topic :: Text Processing :: Markup :: HTML
@@ -1,11 +1,17 @@
1
1
  import sys
2
2
 
3
- if __name__ == "__main__":
4
- from html_to_markdown.cli import main
3
+ from html_to_markdown.cli import main
4
+
5
5
 
6
+ def cli() -> None:
7
+ """Main CLI entrypoint."""
6
8
  try:
7
9
  result = main(sys.argv[1:])
8
10
  print(result) # noqa: T201
9
11
  except ValueError as e:
10
12
  print(str(e), file=sys.stderr) # noqa: T201
11
13
  sys.exit(1)
14
+
15
+
16
+ if __name__ == "__main__":
17
+ cli()
@@ -1,11 +1,12 @@
1
- def main(argv: list[str]) -> str:
2
- """Command-line entry point."""
3
- from argparse import ArgumentParser, FileType
4
- from sys import stdin
1
+ from argparse import ArgumentParser, FileType
2
+ from sys import stdin
3
+
4
+ from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
5
+ from html_to_markdown.processing import convert_to_markdown
5
6
 
6
- from html_to_markdown.constants import ASTERISK, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERLINED, UNDERSCORE
7
- from html_to_markdown.processing import convert_to_markdown
8
7
 
8
+ def main(argv: list[str]) -> str:
9
+ """Command-line entry point."""
9
10
  parser = ArgumentParser(
10
11
  prog="html_to_markdown",
11
12
  description="Converts HTML to Markdown.",
@@ -127,6 +128,12 @@ def main(argv: list[str]) -> str:
127
128
  help="The number of characters at which text paragraphs should wrap. Defaults to 80.",
128
129
  )
129
130
 
131
+ parser.add_argument(
132
+ "--strip-newlines",
133
+ action="store_true",
134
+ help="Remove newlines from HTML input before processing. This helps flatten janky output from HTML with unnecessary line breaks.",
135
+ )
136
+
130
137
  args = parser.parse_args(argv)
131
138
 
132
139
  return convert_to_markdown(
@@ -147,4 +154,5 @@ def main(argv: list[str]) -> str:
147
154
  keep_inline_images_in=args.keep_inline_images_in,
148
155
  wrap=args.wrap,
149
156
  wrap_width=args.wrap_width,
157
+ strip_newlines=args.strip_newlines,
150
158
  )
@@ -156,11 +156,17 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
156
156
  alt = tag.attrs.get("alt", "")
157
157
  src = tag.attrs.get("src", "")
158
158
  title = tag.attrs.get("title", "")
159
+ width = tag.attrs.get("width", "")
160
+ height = tag.attrs.get("height", "")
159
161
  title_part = ' "{}"'.format(title.replace('"', r"\"")) if title else ""
160
162
  parent_name = tag.parent.name if tag.parent else ""
161
- if convert_as_inline and parent_name not in (keep_inline_images_in or []):
163
+ # Always preserve images in table cells (td, th) by default
164
+ default_preserve_in = ["td", "th"]
165
+ preserve_in = set(keep_inline_images_in or []) | set(default_preserve_in)
166
+ if convert_as_inline and parent_name not in preserve_in:
162
167
  return alt
163
-
168
+ if width or height:
169
+ return f"<img src='{src}' alt='{alt}' title='{title}' width='{width}' height='{height}' />"
164
170
  return f"![{alt}]({src}{title_part})"
165
171
 
166
172
 
@@ -213,6 +213,7 @@ def convert_to_markdown(
213
213
  keep_inline_images_in: Iterable[str] | None = None,
214
214
  newline_style: Literal["spaces", "backslash"] = SPACES,
215
215
  strip: str | Iterable[str] | None = None,
216
+ strip_newlines: bool = False,
216
217
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
217
218
  sub_symbol: str = "",
218
219
  sup_symbol: str = "",
@@ -238,6 +239,7 @@ def convert_to_markdown(
238
239
  keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
239
240
  newline_style: Style for handling newlines in text content. Defaults to "spaces".
240
241
  strip: Tags to strip from the output. Defaults to None.
242
+ strip_newlines: Remove newlines from HTML input before processing. Defaults to False.
241
243
  strong_em_symbol: Symbol to use for strong/emphasized text. Defaults to "*".
242
244
  sub_symbol: Custom symbol for subscript text. Defaults to an empty string.
243
245
  sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
@@ -259,6 +261,10 @@ def convert_to_markdown(
259
261
  ):
260
262
  return source
261
263
 
264
+ if strip_newlines:
265
+ # Replace all newlines with spaces before parsing
266
+ source = source.replace("\n", " ").replace("\r", " ")
267
+
262
268
  if "".join(source.split("\n")):
263
269
  source = BeautifulSoup(source, "html.parser")
264
270
  else:
@@ -1,11 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.2
4
- Summary: Convert HTML to markdown
3
+ Version: 1.4.0
4
+ Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/html-to-markdown
8
- Keywords: converter,html,markdown,text-extraction,text-processing
7
+ Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
8
+ Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
9
+ Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
10
+ Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
11
+ Keywords: beautifulsoup,cli-tool,converter,html,html2markdown,markdown,markup,text-extraction,text-processing
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Environment :: Console
9
14
  Classifier: Intended Audience :: Developers
10
15
  Classifier: License :: OSI Approved :: MIT License
11
16
  Classifier: Operating System :: OS Independent
@@ -15,6 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
15
20
  Classifier: Programming Language :: Python :: 3.11
16
21
  Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
25
  Classifier: Topic :: Text Processing
19
26
  Classifier: Topic :: Text Processing :: Markup
20
27
  Classifier: Topic :: Text Processing :: Markup :: HTML
@@ -12,5 +12,6 @@ html_to_markdown/utils.py
12
12
  html_to_markdown.egg-info/PKG-INFO
13
13
  html_to_markdown.egg-info/SOURCES.txt
14
14
  html_to_markdown.egg-info/dependency_links.txt
15
+ html_to_markdown.egg-info/entry_points.txt
15
16
  html_to_markdown.egg-info/requires.txt
16
17
  html_to_markdown.egg-info/top_level.txt
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ html-to-markdown = html_to_markdown.__main__:cli
3
+ html_to_markdown = html_to_markdown.__main__:cli
@@ -5,14 +5,26 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.3.2"
9
- description = "Convert HTML to markdown"
8
+ version = "1.4.0"
9
+ description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
10
10
  readme = "README.md"
11
- keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
11
+ keywords = [
12
+ "beautifulsoup",
13
+ "cli-tool",
14
+ "converter",
15
+ "html",
16
+ "html2markdown",
17
+ "markdown",
18
+ "markup",
19
+ "text-extraction",
20
+ "text-processing",
21
+ ]
12
22
  license = { text = "MIT" }
13
23
  authors = [ { name = "Na'aman Hirschfeld", email = "nhirschfeld@gmail.com" } ]
14
24
  requires-python = ">=3.9"
15
25
  classifiers = [
26
+ "Development Status :: 5 - Production/Stable",
27
+ "Environment :: Console",
16
28
  "Intended Audience :: Developers",
17
29
  "License :: OSI Approved :: MIT License",
18
30
  "Operating System :: OS Independent",
@@ -22,6 +34,8 @@ classifiers = [
22
34
  "Programming Language :: Python :: 3.11",
23
35
  "Programming Language :: Python :: 3.12",
24
36
  "Programming Language :: Python :: 3.13",
37
+ "Topic :: Internet :: WWW/HTTP",
38
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
39
  "Topic :: Text Processing",
26
40
  "Topic :: Text Processing :: Markup",
27
41
  "Topic :: Text Processing :: Markup :: HTML",
@@ -29,23 +43,27 @@ classifiers = [
29
43
  "Topic :: Utilities",
30
44
  "Typing :: Typed",
31
45
  ]
32
-
33
46
  dependencies = [
34
47
  "beautifulsoup4>=4.13.4",
35
48
  ]
36
49
 
37
- urls.homepage = "https://github.com/Goldziher/html-to-markdown"
50
+ urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
51
+ urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
52
+ urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
53
+ urls.Repository = "https://github.com/Goldziher/html-to-markdown.git"
54
+ scripts.html-to-markdown = "html_to_markdown.__main__:cli"
55
+ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
38
56
 
39
57
  [dependency-groups]
40
58
  dev = [
41
59
  "covdefaults>=2.3",
42
- "mypy>=1.14.1",
60
+ "mypy>=1.16.1",
43
61
  "pre-commit>=4.1",
44
- "pytest>=8.3.4",
45
- "pytest-cov>=6.1.1",
46
- "pytest-mock>=3.14",
47
- "ruff>=0.11.6",
48
- "types-beautifulsoup4>=4.12.0.20241020",
62
+ "pytest>=8.4.1",
63
+ "pytest-cov>=6.2.1",
64
+ "pytest-mock>=3.14.1",
65
+ "ruff>=0.12",
66
+ "types-beautifulsoup4>=4.12.0.20250516",
49
67
  "uv-bump",
50
68
  ]
51
69
 
@@ -58,8 +76,6 @@ html_to_markdown = [ "py.typed" ]
58
76
  [tool.hatch.build]
59
77
  skip-excluded-dirs = true
60
78
 
61
- scripts.html_to_markdown = "html_to_markdown.__main__:cli"
62
-
63
79
  [tool.ruff]
64
80
  target-version = "py39"
65
81
  line-length = 120
@@ -71,16 +87,16 @@ lint.select = [ "ALL" ]
71
87
  lint.ignore = [
72
88
  "ANN401", # dynamically typed ANY
73
89
  "COM812", # Conflicts with formatter
74
- "D100", # # pydocstyle - missing docstring in public module
90
+ "D100", # pydocstyle - missing docstring in public module
75
91
  "D104", # pydocstyle - missing docstring in public package
76
92
  "D107", # pydocstyle - missing docstring in __init__
77
93
  "D205", # pydocstyle - 1 blank line required between summary line and description
78
94
  "E501", # pycodestyle line too long, handled by ruff format
79
- "EM", # Exception messages,
95
+ "EM", # Exception messages
80
96
  "FBT", # Boolean Args
81
97
  "FIX", # we allow todo and fixme comments
82
98
  "ISC001", # Conflicts with formatter
83
- "PLR0913", # Pylint - too many arguments.
99
+ "PLR0913", # Pylint - too many arguments
84
100
  "PLR2004", # Magic variables, we allow them
85
101
  "TD", # we allow todo and fixme comments
86
102
  "TRY", # Try except block, rules are too strict
@@ -92,22 +108,32 @@ lint.pydocstyle.convention = "google"
92
108
  [tool.pytest.ini_options]
93
109
  asyncio_mode = "auto"
94
110
  asyncio_default_fixture_loop_scope = "function"
111
+ testpaths = [ "tests" ]
112
+ filterwarnings = [
113
+ "error",
114
+ "ignore::pytest.PytestConfigWarning",
115
+ "ignore::pytest.PytestUnraisableExceptionWarning",
116
+ "ignore::ResourceWarning",
117
+ ]
95
118
 
96
119
  [tool.coverage.run]
120
+ source = [ "html_to_markdown" ]
97
121
  omit = [ "tests/*" ]
98
122
  plugins = [ "covdefaults" ]
99
- source = [ "html_to_markdown" ]
100
123
 
101
124
  [tool.coverage.report]
102
- exclude_lines = [ 'if TYPE_CHECKING:' ]
125
+ exclude_lines = [ "if TYPE_CHECKING:" ]
103
126
  fail_under = 100
127
+ show_missing = true
104
128
 
105
129
  [tool.mypy]
106
130
  packages = [ "html_to_markdown", "tests" ]
107
131
  python_version = "3.9"
132
+ strict = true
108
133
  implicit_reexport = false
109
134
  show_error_codes = true
110
- strict = true
135
+ warn_return_any = true
136
+ warn_unused_configs = true
111
137
 
112
138
  [[tool.mypy.overrides]]
113
139
  module = "tests.*"