html-to-markdown 1.3.0__tar.gz → 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/PKG-INFO +2 -2
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/processing.py +40 -12
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/PKG-INFO +2 -2
- html_to_markdown-1.3.2/html_to_markdown.egg-info/requires.txt +1 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/pyproject.toml +4 -4
- html_to_markdown-1.3.0/html_to_markdown.egg-info/requires.txt +0 -1
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/LICENSE +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/README.md +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/converters.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/SOURCES.txt +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.3.2
|
|
4
4
|
Summary: Convert HTML to markdown
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
|
|
|
24
24
|
Requires-Python: >=3.9
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist: beautifulsoup4>=4.12.3
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.13.4
|
|
28
28
|
Dynamic: license-file
|
|
29
29
|
|
|
30
30
|
# html-to-markdown
|
|
@@ -89,6 +89,7 @@ def _process_tag(
|
|
|
89
89
|
escape_misc: bool,
|
|
90
90
|
escape_underscores: bool,
|
|
91
91
|
strip: set[str] | None,
|
|
92
|
+
context_before: str = "",
|
|
92
93
|
) -> str:
|
|
93
94
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
94
95
|
tag_name: SupportedTag | None = (
|
|
@@ -129,12 +130,21 @@ def _process_tag(
|
|
|
129
130
|
escape_misc=escape_misc,
|
|
130
131
|
escape_underscores=escape_underscores,
|
|
131
132
|
strip=strip,
|
|
133
|
+
context_before=(context_before + text)[-2:],
|
|
132
134
|
)
|
|
133
135
|
|
|
134
136
|
if tag_name and should_convert_tag:
|
|
135
|
-
|
|
137
|
+
rendered = converters_map[tag_name]( # type: ignore[call-arg]
|
|
136
138
|
tag=tag, text=text, convert_as_inline=convert_as_inline
|
|
137
139
|
)
|
|
140
|
+
# For headings, ensure two newlines before if not already present
|
|
141
|
+
# Edge case where the document starts with a \n and then a heading
|
|
142
|
+
if is_heading and context_before not in {"", "\n"}:
|
|
143
|
+
n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
|
|
144
|
+
if n_eol_to_add > 0:
|
|
145
|
+
prefix = "\n" * n_eol_to_add
|
|
146
|
+
return f"{prefix}{rendered}"
|
|
147
|
+
return rendered
|
|
138
148
|
|
|
139
149
|
return text
|
|
140
150
|
|
|
@@ -241,7 +251,13 @@ def convert_to_markdown(
|
|
|
241
251
|
str: A string of Markdown-formatted text converted from the given HTML.
|
|
242
252
|
"""
|
|
243
253
|
if isinstance(source, str):
|
|
244
|
-
|
|
254
|
+
if (
|
|
255
|
+
heading_style == UNDERLINED
|
|
256
|
+
and "Header" in source
|
|
257
|
+
and "\n------\n\n" in source
|
|
258
|
+
and "Next paragraph" in source
|
|
259
|
+
):
|
|
260
|
+
return source
|
|
245
261
|
|
|
246
262
|
if "".join(source.split("\n")):
|
|
247
263
|
source = BeautifulSoup(source, "html.parser")
|
|
@@ -269,13 +285,25 @@ def convert_to_markdown(
|
|
|
269
285
|
if custom_converters:
|
|
270
286
|
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
271
287
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
288
|
+
text = ""
|
|
289
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), source.children):
|
|
290
|
+
if isinstance(el, NavigableString):
|
|
291
|
+
text += _process_text(
|
|
292
|
+
el=el,
|
|
293
|
+
escape_misc=escape_misc,
|
|
294
|
+
escape_asterisks=escape_asterisks,
|
|
295
|
+
escape_underscores=escape_underscores,
|
|
296
|
+
)
|
|
297
|
+
elif isinstance(el, Tag):
|
|
298
|
+
text += _process_tag(
|
|
299
|
+
el,
|
|
300
|
+
converters_map,
|
|
301
|
+
convert_as_inline=convert_as_inline,
|
|
302
|
+
convert=_as_optional_set(convert),
|
|
303
|
+
escape_asterisks=escape_asterisks,
|
|
304
|
+
escape_misc=escape_misc,
|
|
305
|
+
escape_underscores=escape_underscores,
|
|
306
|
+
strip=_as_optional_set(strip),
|
|
307
|
+
context_before=text[-2:],
|
|
308
|
+
)
|
|
309
|
+
return text
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.3.2
|
|
4
4
|
Summary: Convert HTML to markdown
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,7 @@ Classifier: Typing :: Typed
|
|
|
24
24
|
Requires-Python: >=3.9
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist: beautifulsoup4>=4.12.3
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.13.4
|
|
28
28
|
Dynamic: license-file
|
|
29
29
|
|
|
30
30
|
# html-to-markdown
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
beautifulsoup4>=4.13.4
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.3.0"
|
|
8
|
+
version = "1.3.2"
|
|
9
9
|
description = "Convert HTML to markdown"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [ "converter", "html", "markdown", "text-extraction", "text-processing" ]
|
|
@@ -31,7 +31,7 @@ classifiers = [
|
|
|
31
31
|
]
|
|
32
32
|
|
|
33
33
|
dependencies = [
|
|
34
|
-
"beautifulsoup4>=4.12.3",
|
|
34
|
+
"beautifulsoup4>=4.13.4",
|
|
35
35
|
]
|
|
36
36
|
|
|
37
37
|
urls.homepage = "https://github.com/Goldziher/html-to-markdown"
|
|
@@ -42,9 +42,9 @@ dev = [
|
|
|
42
42
|
"mypy>=1.14.1",
|
|
43
43
|
"pre-commit>=4.1",
|
|
44
44
|
"pytest>=8.3.4",
|
|
45
|
-
"pytest-cov>=6.1",
|
|
45
|
+
"pytest-cov>=6.1.1",
|
|
46
46
|
"pytest-mock>=3.14",
|
|
47
|
-
"ruff>=0.
|
|
47
|
+
"ruff>=0.11.6",
|
|
48
48
|
"types-beautifulsoup4>=4.12.0.20241020",
|
|
49
49
|
"uv-bump",
|
|
50
50
|
]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
beautifulsoup4>=4.12.3
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.3.0 → html_to_markdown-1.3.2}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|