html-to-markdown 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/converters.py +5 -2
- html_to_markdown/processing.py +13 -10
- html_to_markdown/whitespace.py +14 -3
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/METADATA +2 -2
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/RECORD +9 -9
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.11.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -578,8 +578,11 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
|
578
578
|
return f"{text}\n\n" if text.strip() else ""
|
|
579
579
|
|
|
580
580
|
|
|
581
|
-
def _convert_div(*, text: str, convert_as_inline: bool) -> str:
|
|
582
|
-
|
|
581
|
+
def _convert_div(*, text: str, convert_as_inline: bool) -> str:
|
|
582
|
+
if convert_as_inline:
|
|
583
|
+
return text
|
|
584
|
+
|
|
585
|
+
return _format_block_element(text)
|
|
583
586
|
|
|
584
587
|
|
|
585
588
|
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
html_to_markdown/processing.py
CHANGED
|
@@ -258,6 +258,18 @@ def _process_tag(
|
|
|
258
258
|
if n_eol_to_add > 0:
|
|
259
259
|
prefix = "\n" * n_eol_to_add
|
|
260
260
|
return f"{prefix}{rendered}"
|
|
261
|
+
|
|
262
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
263
|
+
|
|
264
|
+
is_block_element = tag.name.lower() in BLOCK_ELEMENTS
|
|
265
|
+
if (
|
|
266
|
+
is_block_element
|
|
267
|
+
and not convert_as_inline
|
|
268
|
+
and context_before
|
|
269
|
+
and not context_before.endswith("\n")
|
|
270
|
+
and rendered.strip()
|
|
271
|
+
):
|
|
272
|
+
return f"\n\n{rendered}"
|
|
261
273
|
return rendered
|
|
262
274
|
|
|
263
275
|
return text
|
|
@@ -358,7 +370,7 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
358
370
|
if value is None:
|
|
359
371
|
return None
|
|
360
372
|
if isinstance(value, str):
|
|
361
|
-
return set(","
|
|
373
|
+
return set(value.split(","))
|
|
362
374
|
return {*chain(*[v.split(",") for v in value])}
|
|
363
375
|
|
|
364
376
|
|
|
@@ -836,15 +848,6 @@ def _process_html_core(
|
|
|
836
848
|
|
|
837
849
|
try:
|
|
838
850
|
if isinstance(source, str):
|
|
839
|
-
if (
|
|
840
|
-
heading_style == UNDERLINED
|
|
841
|
-
and "Header" in source
|
|
842
|
-
and "\n------\n\n" in source
|
|
843
|
-
and "Next paragraph" in source
|
|
844
|
-
):
|
|
845
|
-
sink.write(source)
|
|
846
|
-
return
|
|
847
|
-
|
|
848
851
|
if strip_newlines:
|
|
849
852
|
source = source.replace("\n", " ").replace("\r", " ")
|
|
850
853
|
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -171,13 +171,13 @@ class WhitespaceHandler:
|
|
|
171
171
|
if not text:
|
|
172
172
|
return ""
|
|
173
173
|
|
|
174
|
-
text = self.normalize_unicode_spaces(text)
|
|
175
|
-
|
|
176
174
|
if in_pre or self.should_preserve_whitespace(element):
|
|
177
175
|
return text
|
|
178
176
|
|
|
179
177
|
if self.mode == "strict":
|
|
180
178
|
return text
|
|
179
|
+
|
|
180
|
+
text = self.normalize_unicode_spaces(text)
|
|
181
181
|
return self._process_normalized(text, element)
|
|
182
182
|
|
|
183
183
|
def _process_normalized(self, text: str, element: NavigableString) -> str:
|
|
@@ -242,6 +242,14 @@ class WhitespaceHandler:
|
|
|
242
242
|
prev_sibling = element.previous_sibling
|
|
243
243
|
next_sibling = element.next_sibling
|
|
244
244
|
|
|
245
|
+
multiple_newlines_before_block = (
|
|
246
|
+
original
|
|
247
|
+
and original.count("\n") >= 2
|
|
248
|
+
and self.is_block_element(next_sibling)
|
|
249
|
+
and text.strip()
|
|
250
|
+
and (self.is_inline_element(prev_sibling) or prev_sibling is None)
|
|
251
|
+
)
|
|
252
|
+
|
|
245
253
|
has_leading = (
|
|
246
254
|
has_lead_space
|
|
247
255
|
and original[0] == " "
|
|
@@ -268,6 +276,9 @@ class WhitespaceHandler:
|
|
|
268
276
|
if has_trailing and not (original and original[-1] in "\n\t"):
|
|
269
277
|
text = text + " "
|
|
270
278
|
|
|
279
|
+
if multiple_newlines_before_block:
|
|
280
|
+
text = text + "\n\n"
|
|
281
|
+
|
|
271
282
|
return text
|
|
272
283
|
|
|
273
284
|
def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
|
|
@@ -286,7 +297,7 @@ class WhitespaceHandler:
|
|
|
286
297
|
return "\n"
|
|
287
298
|
if tag_name in single_newline_elements:
|
|
288
299
|
return "\n"
|
|
289
|
-
if tag_name.startswith("h") and len(tag_name) == 2:
|
|
300
|
+
if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
|
|
290
301
|
return "\n\n"
|
|
291
302
|
|
|
292
303
|
return ""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.10.0
|
|
3
|
+
Version: 1.11.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -33,7 +33,7 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
35
|
Provides-Extra: lxml
|
|
36
|
-
Requires-Dist: lxml>=
|
|
36
|
+
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# html-to-markdown
|
|
@@ -2,16 +2,16 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
|
|
|
2
2
|
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
3
|
html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=
|
|
5
|
+
html_to_markdown/converters.py,sha256=CbChkRIlOPe0d1MK5-txDE56IG4Ea_dcCV6KRCTjeKY,32497
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
|
|
7
7
|
html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
8
|
+
html_to_markdown/processing.py,sha256=sOIIFNyRkRYAH8Q4ehrh66RY71bkvttSuqzXYsMC5JM,34334
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
|
|
11
|
-
html_to_markdown/whitespace.py,sha256=
|
|
12
|
-
html_to_markdown-1.
|
|
13
|
-
html_to_markdown-1.
|
|
14
|
-
html_to_markdown-1.
|
|
15
|
-
html_to_markdown-1.
|
|
16
|
-
html_to_markdown-1.
|
|
17
|
-
html_to_markdown-1.
|
|
11
|
+
html_to_markdown/whitespace.py,sha256=EJ0gEsfLB_wZAk5d5qP4UPhPg0pJJ8LZLRRr_QoL01o,8186
|
|
12
|
+
html_to_markdown-1.11.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.11.0.dist-info/METADATA,sha256=Cej6bnqT9JVFzACZvND6Z5-kD0QoabiLi46opAaC11U,17814
|
|
14
|
+
html_to_markdown-1.11.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.11.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.11.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.11.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|