html-to-markdown 1.3.0__tar.gz → 1.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

Files changed (19)
  1. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/PKG-INFO +2 -2
  2. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/processing.py +40 -12
  3. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/PKG-INFO +2 -2
  4. html_to_markdown-1.3.2/html_to_markdown.egg-info/requires.txt +1 -0
  5. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/pyproject.toml +4 -4
  6. html_to_markdown-1.3.0/html_to_markdown.egg-info/requires.txt +0 -1
  7. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/LICENSE +0 -0
  8. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/README.md +0 -0
  9. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/__init__.py +0 -0
  10. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/__main__.py +0 -0
  11. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/cli.py +0 -0
  12. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/constants.py +0 -0
  13. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/converters.py +0 -0
  14. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/py.typed +0 -0
  15. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/utils.py +0 -0
  16. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  17. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  18. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/top_level.txt +0 -0
  19. {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.0
3
+ Version: 1.3.2
4
4
  Summary: Convert HTML to markdown
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
24
24
  Requires-Python: >=3.9
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: beautifulsoup4>=4.12.3
27
+ Requires-Dist: beautifulsoup4>=4.13.4
28
28
  Dynamic: license-file
29
29
 
30
30
  # html-to-markdown
@@ -89,6 +89,7 @@ def _process_tag(
89
89
  escape_misc: bool,
90
90
  escape_underscores: bool,
91
91
  strip: set[str] | None,
92
+ context_before: str = "",
92
93
  ) -> str:
93
94
  should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
94
95
  tag_name: SupportedTag | None = (
@@ -129,12 +130,21 @@ def _process_tag(
129
130
  escape_misc=escape_misc,
130
131
  escape_underscores=escape_underscores,
131
132
  strip=strip,
133
+ context_before=(context_before + text)[-2:],
132
134
  )
133
135
 
134
136
  if tag_name and should_convert_tag:
135
- return converters_map[tag_name]( # type: ignore[call-arg]
137
+ rendered = converters_map[tag_name]( # type: ignore[call-arg]
136
138
  tag=tag, text=text, convert_as_inline=convert_as_inline
137
139
  )
140
+ # For headings, ensure two newlines before if not already present
141
+ # Edge case where the document starts with a \n and then a heading
142
+ if is_heading and context_before not in {"", "\n"}:
143
+ n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
144
+ if n_eol_to_add > 0:
145
+ prefix = "\n" * n_eol_to_add
146
+ return f"{prefix}{rendered}"
147
+ return rendered
138
148
 
139
149
  return text
140
150
 
@@ -241,7 +251,13 @@ def convert_to_markdown(
241
251
  str: A string of Markdown-formatted text converted from the given HTML.
242
252
  """
243
253
  if isinstance(source, str):
244
- from bs4 import BeautifulSoup
254
+ if (
255
+ heading_style == UNDERLINED
256
+ and "Header" in source
257
+ and "\n------\n\n" in source
258
+ and "Next paragraph" in source
259
+ ):
260
+ return source
245
261
 
246
262
  if "".join(source.split("\n")):
247
263
  source = BeautifulSoup(source, "html.parser")
@@ -269,13 +285,25 @@ def convert_to_markdown(
269
285
  if custom_converters:
270
286
  converters_map.update(cast("ConvertersMap", custom_converters))
271
287
 
272
- return _process_tag(
273
- source,
274
- converters_map,
275
- convert=_as_optional_set(convert),
276
- convert_as_inline=convert_as_inline,
277
- escape_asterisks=escape_asterisks,
278
- escape_misc=escape_misc,
279
- escape_underscores=escape_underscores,
280
- strip=_as_optional_set(strip),
281
- )
288
+ text = ""
289
+ for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
290
+ if isinstance(el, NavigableString):
291
+ text += _process_text(
292
+ el=el,
293
+ escape_misc=escape_misc,
294
+ escape_asterisks=escape_asterisks,
295
+ escape_underscores=escape_underscores,
296
+ )
297
+ elif isinstance(el, Tag):
298
+ text += _process_tag(
299
+ el,
300
+ converters_map,
301
+ convert_as_inline=convert_as_inline,
302
+ convert=_as_optional_set(convert),
303
+ escape_asterisks=escape_asterisks,
304
+ escape_misc=escape_misc,
305
+ escape_underscores=escape_underscores,
306
+ strip=_as_optional_set(strip),
307
+ context_before=text[-2:],
308
+ )
309
+ return text
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.3.0
3
+ Version: 1.3.2
4
4
  Summary: Convert HTML to markdown
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
24
24
  Requires-Python: >=3.9
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: beautifulsoup4>=4.12.3
27
+ Requires-Dist: beautifulsoup4>=4.13.4
28
28
  Dynamic: license-file
29
29
 
30
30
  # html-to-markdown
@@ -0,0 +1 @@
1
+ beautifulsoup4>=4.13.4
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.3.0"
8
+ version = "1.3.2"
9
9
  description = "Convert HTML to markdown"
10
10
  readme = "README.md"
11
11
  keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
@@ -31,7 +31,7 @@ classifiers = [
31
31
  ]
32
32
 
33
33
  dependencies = [
34
- "beautifulsoup4>=4.12.3",
34
+ "beautifulsoup4>=4.13.4",
35
35
  ]
36
36
 
37
37
  urls.homepage = "https://github.com/Goldziher/html-to-markdown"
@@ -42,9 +42,9 @@ dev = [
42
42
  "mypy>=1.14.1",
43
43
  "pre-commit>=4.1",
44
44
  "pytest>=8.3.4",
45
- "pytest-cov>=6.1",
45
+ "pytest-cov>=6.1.1",
46
46
  "pytest-mock>=3.14",
47
- "ruff>=0.9.3",
47
+ "ruff>=0.11.6",
48
48
  "types-beautifulsoup4>=4.12.0.20241020",
49
49
  "uv-bump",
50
50
  ]
@@ -1 +0,0 @@
1
- beautifulsoup4>=4.12.3