html-to-markdown 1.10.0__tar.gz → 1.11.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (22)
  1. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/PKG-INFO +2 -2
  2. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/converters.py +5 -2
  3. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/processing.py +13 -10
  4. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/whitespace.py +14 -3
  5. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/PKG-INFO +2 -2
  6. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/requires.txt +1 -1
  7. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/pyproject.toml +5 -6
  8. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/LICENSE +0 -0
  9. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/README.md +0 -0
  10. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/__init__.py +0 -0
  11. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/__main__.py +0 -0
  12. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/cli.py +0 -0
  13. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/constants.py +0 -0
  14. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/exceptions.py +0 -0
  15. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/preprocessor.py +0 -0
  16. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/py.typed +0 -0
  17. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown/utils.py +0 -0
  18. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  19. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  20. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
  21. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  22. {html_to_markdown-1.10.0 → html_to_markdown-1.11.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.10.0
3
+ Version: 1.11.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -33,7 +33,7 @@ License-File: LICENSE
33
33
  Requires-Dist: beautifulsoup4>=4.13.5
34
34
  Requires-Dist: nh3>=0.3
35
35
  Provides-Extra: lxml
36
- Requires-Dist: lxml>=6.0.1; extra == "lxml"
36
+ Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
37
37
  Dynamic: license-file
38
38
 
39
39
  # html-to-markdown
@@ -578,8 +578,11 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
578
578
  return f"{text}\n\n" if text.strip() else ""
579
579
 
580
580
 
581
- def _convert_div(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
582
- return text
581
+ def _convert_div(*, text: str, convert_as_inline: bool) -> str:
582
+ if convert_as_inline:
583
+ return text
584
+
585
+ return _format_block_element(text)
583
586
 
584
587
 
585
588
  def _convert_details(*, text: str, convert_as_inline: bool) -> str:
@@ -258,6 +258,18 @@ def _process_tag(
258
258
  if n_eol_to_add > 0:
259
259
  prefix = "\n" * n_eol_to_add
260
260
  return f"{prefix}{rendered}"
261
+
262
+ from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
263
+
264
+ is_block_element = tag.name.lower() in BLOCK_ELEMENTS
265
+ if (
266
+ is_block_element
267
+ and not convert_as_inline
268
+ and context_before
269
+ and not context_before.endswith("\n")
270
+ and rendered.strip()
271
+ ):
272
+ return f"\n\n{rendered}"
261
273
  return rendered
262
274
 
263
275
  return text
@@ -358,7 +370,7 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
358
370
  if value is None:
359
371
  return None
360
372
  if isinstance(value, str):
361
- return set(",".split(value))
373
+ return set(value.split(","))
362
374
  return {*chain(*[v.split(",") for v in value])}
363
375
 
364
376
 
@@ -836,15 +848,6 @@ def _process_html_core(
836
848
 
837
849
  try:
838
850
  if isinstance(source, str):
839
- if (
840
- heading_style == UNDERLINED
841
- and "Header" in source
842
- and "\n------\n\n" in source
843
- and "Next paragraph" in source
844
- ):
845
- sink.write(source)
846
- return
847
-
848
851
  if strip_newlines:
849
852
  source = source.replace("\n", " ").replace("\r", " ")
850
853
 
@@ -171,13 +171,13 @@ class WhitespaceHandler:
171
171
  if not text:
172
172
  return ""
173
173
 
174
- text = self.normalize_unicode_spaces(text)
175
-
176
174
  if in_pre or self.should_preserve_whitespace(element):
177
175
  return text
178
176
 
179
177
  if self.mode == "strict":
180
178
  return text
179
+
180
+ text = self.normalize_unicode_spaces(text)
181
181
  return self._process_normalized(text, element)
182
182
 
183
183
  def _process_normalized(self, text: str, element: NavigableString) -> str:
@@ -242,6 +242,14 @@ class WhitespaceHandler:
242
242
  prev_sibling = element.previous_sibling
243
243
  next_sibling = element.next_sibling
244
244
 
245
+ multiple_newlines_before_block = (
246
+ original
247
+ and original.count("\n") >= 2
248
+ and self.is_block_element(next_sibling)
249
+ and text.strip()
250
+ and (self.is_inline_element(prev_sibling) or prev_sibling is None)
251
+ )
252
+
245
253
  has_leading = (
246
254
  has_lead_space
247
255
  and original[0] == " "
@@ -268,6 +276,9 @@ class WhitespaceHandler:
268
276
  if has_trailing and not (original and original[-1] in "\n\t"):
269
277
  text = text + " "
270
278
 
279
+ if multiple_newlines_before_block:
280
+ text = text + "\n\n"
281
+
271
282
  return text
272
283
 
273
284
  def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
@@ -286,7 +297,7 @@ class WhitespaceHandler:
286
297
  return "\n"
287
298
  if tag_name in single_newline_elements:
288
299
  return "\n"
289
- if tag_name.startswith("h") and len(tag_name) == 2:
300
+ if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
290
301
  return "\n\n"
291
302
 
292
303
  return ""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.10.0
3
+ Version: 1.11.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -33,7 +33,7 @@ License-File: LICENSE
33
33
  Requires-Dist: beautifulsoup4>=4.13.5
34
34
  Requires-Dist: nh3>=0.3
35
35
  Provides-Extra: lxml
36
- Requires-Dist: lxml>=6.0.1; extra == "lxml"
36
+ Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
37
37
  Dynamic: license-file
38
38
 
39
39
  # html-to-markdown
@@ -2,4 +2,4 @@ beautifulsoup4>=4.13.5
2
2
  nh3>=0.3
3
3
 
4
4
  [lxml]
5
- lxml>=6.0.1
5
+ beautifulsoup4[lxml]>=4.13.5
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.10.0"
8
+ version = "1.11.0"
9
9
  description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -43,8 +43,8 @@ classifiers = [
43
43
  "Typing :: Typed",
44
44
  ]
45
45
  dependencies = [ "beautifulsoup4>=4.13.5", "nh3>=0.3" ]
46
+ optional-dependencies.lxml = [ "beautifulsoup4[lxml]>=4.13.5" ]
46
47
 
47
- optional-dependencies.lxml = [ "lxml>=6.0.1" ]
48
48
  urls.Changelog = "https://github.com/Goldziher/html-to-markdown/releases"
49
49
  urls.Homepage = "https://github.com/Goldziher/html-to-markdown"
50
50
  urls.Issues = "https://github.com/Goldziher/html-to-markdown/issues"
@@ -54,14 +54,13 @@ scripts.html_to_markdown = "html_to_markdown.__main__:cli"
54
54
 
55
55
  [dependency-groups]
56
56
  dev = [
57
- "ai-rulez>=2.0.1",
58
57
  "covdefaults>=2.3",
59
- "mypy>=1.17.1",
58
+ "mypy>=1.18.1",
60
59
  "pre-commit>=4.3",
61
60
  "pytest>=8.4.2",
62
- "pytest-cov>=6.3",
61
+ "pytest-cov>=7",
63
62
  "pytest-mock>=3.15",
64
- "ruff>=0.12.12",
63
+ "ruff>=0.13",
65
64
  "types-beautifulsoup4>=4.12.0.20250516",
66
65
  "types-psutil>=7.0.0.20250822",
67
66
  "uv-bump",