html-to-markdown 1.5.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +20 -2
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +36 -92
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +447 -210
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA +50 -13
- html_to_markdown-1.8.0.dist-info/RECORD +16 -0
- html_to_markdown-1.5.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt +0 -0
html_to_markdown/utils.py
CHANGED
|
@@ -6,18 +6,25 @@ from html_to_markdown.constants import line_beginning_re
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def chomp(text: str) -> tuple[str, str, str]:
|
|
9
|
-
"""
|
|
10
|
-
|
|
9
|
+
"""Simplified whitespace handling for inline elements.
|
|
10
|
+
|
|
11
|
+
For semantic markdown output, preserves leading/trailing spaces as single spaces
|
|
12
|
+
and normalizes internal whitespace.
|
|
11
13
|
|
|
12
14
|
Args:
|
|
13
15
|
text: The text to chomp.
|
|
14
16
|
|
|
15
17
|
Returns:
|
|
16
|
-
A tuple containing the prefix, suffix, and the stripped text.
|
|
18
|
+
A tuple containing the prefix, suffix, and the normalized text.
|
|
17
19
|
"""
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
if not text:
|
|
21
|
+
return "", "", ""
|
|
22
|
+
|
|
23
|
+
prefix = " " if text.startswith((" ", "\t")) else ""
|
|
24
|
+
suffix = " " if text.endswith((" ", "\t")) else ""
|
|
25
|
+
|
|
20
26
|
text = text.strip()
|
|
27
|
+
|
|
21
28
|
return prefix, suffix, text
|
|
22
29
|
|
|
23
30
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.8.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,9 @@ Requires-Python: >=3.9
|
|
|
32
32
|
Description-Content-Type: text/markdown
|
|
33
33
|
License-File: LICENSE
|
|
34
34
|
Requires-Dist: beautifulsoup4>=4.13.4
|
|
35
|
+
Requires-Dist: nh3>=0.2.21
|
|
36
|
+
Provides-Extra: lxml
|
|
37
|
+
Requires-Dist: lxml>=5; extra == "lxml"
|
|
35
38
|
Dynamic: license-file
|
|
36
39
|
|
|
37
40
|
# html-to-markdown
|
|
@@ -60,6 +63,28 @@ Python 3.9+.
|
|
|
60
63
|
pip install html-to-markdown
|
|
61
64
|
```
|
|
62
65
|
|
|
66
|
+
### Optional lxml Parser
|
|
67
|
+
|
|
68
|
+
For improved performance, you can install with the optional lxml parser:
|
|
69
|
+
|
|
70
|
+
```shell
|
|
71
|
+
pip install html-to-markdown[lxml]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
The lxml parser offers:
|
|
75
|
+
|
|
76
|
+
- **~30% faster HTML parsing** compared to the default html.parser
|
|
77
|
+
- Better handling of malformed HTML
|
|
78
|
+
- More robust parsing for complex documents
|
|
79
|
+
|
|
80
|
+
Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
|
|
84
|
+
result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
|
|
85
|
+
result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
|
|
86
|
+
```
|
|
87
|
+
|
|
63
88
|
## Quick Start
|
|
64
89
|
|
|
65
90
|
Convert HTML to Markdown with a single function call:
|
|
@@ -180,18 +205,19 @@ Custom converters take precedence over the built-in converters and can be used a
|
|
|
180
205
|
|
|
181
206
|
### Key Configuration Options
|
|
182
207
|
|
|
183
|
-
| Option | Type | Default | Description
|
|
184
|
-
| ------------------- | ---- | ---------------- |
|
|
185
|
-
| `extract_metadata` | bool | `True` | Extract document metadata as comment header
|
|
186
|
-
| `convert_as_inline` | bool | `False` | Treat content as inline elements only
|
|
187
|
-
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`)
|
|
188
|
-
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`)
|
|
189
|
-
| `stream_processing` | bool | `False` | Enable streaming for large documents
|
|
190
|
-
| `
|
|
191
|
-
| `
|
|
192
|
-
| `
|
|
193
|
-
| `
|
|
194
|
-
| `
|
|
208
|
+
| Option | Type | Default | Description |
|
|
209
|
+
| ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
|
|
210
|
+
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
211
|
+
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
212
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
213
|
+
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
214
|
+
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
215
|
+
| `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
|
|
216
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
217
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
218
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
219
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
220
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
195
221
|
|
|
196
222
|
For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
|
|
197
223
|
|
|
@@ -379,6 +405,17 @@ uv run python -m html_to_markdown input.html
|
|
|
379
405
|
uv build
|
|
380
406
|
```
|
|
381
407
|
|
|
408
|
+
## Performance
|
|
409
|
+
|
|
410
|
+
The library is optimized for performance with several key features:
|
|
411
|
+
|
|
412
|
+
- **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
|
|
413
|
+
- **Streaming support**: Process large documents in chunks to minimize memory usage
|
|
414
|
+
- **Optional lxml parser**: ~30% faster parsing for complex HTML documents
|
|
415
|
+
- **Optimized string operations**: Minimizes string concatenations in hot paths
|
|
416
|
+
|
|
417
|
+
Typical throughput: ~2 MB/s for regular processing on modern hardware.
|
|
418
|
+
|
|
382
419
|
## License
|
|
383
420
|
|
|
384
421
|
This library uses the MIT license.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
|
|
2
|
+
html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
|
|
3
|
+
html_to_markdown/cli.py,sha256=8xlgSEcnqsSM_dr1TCSgPDAo09YvUtO78PvDFivFFdg,6973
|
|
4
|
+
html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
|
|
5
|
+
html_to_markdown/converters.py,sha256=COC2KqPelJlMCY5eXUS5gdiPOG8Yzx0U719FeXPw3GA,55514
|
|
6
|
+
html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
|
|
7
|
+
html_to_markdown/preprocessor.py,sha256=S4S1ZfLC_hkJVgmA5atImTyWQDOxfHctPbaep2QtyrQ,11248
|
|
8
|
+
html_to_markdown/processing.py,sha256=wkbhLg42U3aeVQSZFuzGt5irtN037XzRKpCE71QYZXI,36520
|
|
9
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
html_to_markdown/utils.py,sha256=QgWPzmpZKFd6wDTe8IY3gbVT3xNzoGV3PBgd17J0O-w,2066
|
|
11
|
+
html_to_markdown-1.8.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
12
|
+
html_to_markdown-1.8.0.dist-info/METADATA,sha256=6pgiK4p0A77axLfD8MH1EGgzifP06koVV8KWS_5-iYk,17175
|
|
13
|
+
html_to_markdown-1.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
14
|
+
html_to_markdown-1.8.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
15
|
+
html_to_markdown-1.8.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
16
|
+
html_to_markdown-1.8.0.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=ZfPBBhhxQJTFQiOX-5OtgSMP2xFs5UUJeYmLL-AawoQ,265
|
|
2
|
-
html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
|
|
3
|
-
html_to_markdown/cli.py,sha256=WzQVr97jKECEZwW-xIJofSl3v4EhqU-De7XRQjmgc08,7179
|
|
4
|
-
html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
|
|
5
|
-
html_to_markdown/converters.py,sha256=xEVT0rQGWBU4V-HBF7Mmm-2XGPB1cboAmKlF6vcxS4k,59456
|
|
6
|
-
html_to_markdown/processing.py,sha256=nqpPiRZu5B--E9dJ9AOwH2r1alg-ynv7ie63rtIb9Ls,28661
|
|
7
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
|
|
9
|
-
html_to_markdown-1.5.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
10
|
-
html_to_markdown-1.5.0.dist-info/METADATA,sha256=nGVi7PSapoEUNTn5WGBW2g744dZTxaXCcFxl_ILeb9s,15641
|
|
11
|
-
html_to_markdown-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
html_to_markdown-1.5.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
13
|
-
html_to_markdown-1.5.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
14
|
-
html_to_markdown-1.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|