html-to-markdown 1.5.0__tar.gz → 1.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/PKG-INFO +50 -13
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/README.md +46 -12
- html_to_markdown-1.8.0/html_to_markdown/__init__.py +24 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/cli.py +1 -4
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/converters.py +36 -92
- html_to_markdown-1.8.0/html_to_markdown/exceptions.py +49 -0
- html_to_markdown-1.8.0/html_to_markdown/preprocessor.py +407 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/processing.py +447 -210
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/PKG-INFO +50 -13
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/SOURCES.txt +2 -0
- html_to_markdown-1.8.0/html_to_markdown.egg-info/requires.txt +5 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/pyproject.toml +3 -2
- html_to_markdown-1.5.0/html_to_markdown/__init__.py +0 -6
- html_to_markdown-1.5.0/html_to_markdown.egg-info/requires.txt +0 -1
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/LICENSE +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.8.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -32,6 +32,9 @@ Requires-Python: >=3.9
|
|
|
32
32
|
Description-Content-Type: text/markdown
|
|
33
33
|
License-File: LICENSE
|
|
34
34
|
Requires-Dist: beautifulsoup4>=4.13.4
|
|
35
|
+
Requires-Dist: nh3>=0.2.21
|
|
36
|
+
Provides-Extra: lxml
|
|
37
|
+
Requires-Dist: lxml>=5; extra == "lxml"
|
|
35
38
|
Dynamic: license-file
|
|
36
39
|
|
|
37
40
|
# html-to-markdown
|
|
@@ -60,6 +63,28 @@ Python 3.9+.
|
|
|
60
63
|
pip install html-to-markdown
|
|
61
64
|
```
|
|
62
65
|
|
|
66
|
+
### Optional lxml Parser
|
|
67
|
+
|
|
68
|
+
For improved performance, you can install with the optional lxml parser:
|
|
69
|
+
|
|
70
|
+
```shell
|
|
71
|
+
pip install html-to-markdown[lxml]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
The lxml parser offers:
|
|
75
|
+
|
|
76
|
+
- **~30% faster HTML parsing** compared to the default html.parser
|
|
77
|
+
- Better handling of malformed HTML
|
|
78
|
+
- More robust parsing for complex documents
|
|
79
|
+
|
|
80
|
+
Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
|
|
84
|
+
result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
|
|
85
|
+
result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
|
|
86
|
+
```
|
|
87
|
+
|
|
63
88
|
## Quick Start
|
|
64
89
|
|
|
65
90
|
Convert HTML to Markdown with a single function call:
|
|
@@ -180,18 +205,19 @@ Custom converters take precedence over the built-in converters and can be used a
|
|
|
180
205
|
|
|
181
206
|
### Key Configuration Options
|
|
182
207
|
|
|
183
|
-
| Option | Type | Default | Description |
|
|
184
|
-
| ------------------- | ---- | ---------------- | ------------------------------------------------------ |
|
|
185
|
-
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
186
|
-
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
187
|
-
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
188
|
-
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
189
|
-
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
190
|
-
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
191
|
-
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
192
|
-
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
193
|
-
| `wrap` | bool | `False` | Enable text wrapping |
|
|
194
|
-
| `wrap_width` | int | `80` | Text wrap width |
|
|
208
|
+
| Option | Type | Default | Description |
|
|
209
|
+
| ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
|
|
210
|
+
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
211
|
+
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
212
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
213
|
+
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
214
|
+
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
215
|
+
| `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
|
|
216
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
217
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
218
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
219
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
220
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
195
221
|
|
|
196
222
|
For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
|
|
197
223
|
|
|
@@ -379,6 +405,17 @@ uv run python -m html_to_markdown input.html
|
|
|
379
405
|
uv build
|
|
380
406
|
```
|
|
381
407
|
|
|
408
|
+
## Performance
|
|
409
|
+
|
|
410
|
+
The library is optimized for performance with several key features:
|
|
411
|
+
|
|
412
|
+
- **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
|
|
413
|
+
- **Streaming support**: Process large documents in chunks to minimize memory usage
|
|
414
|
+
- **Optional lxml parser**: ~30% faster parsing for complex HTML documents
|
|
415
|
+
- **Optimized string operations**: Minimizes string concatenations in hot paths
|
|
416
|
+
|
|
417
|
+
Typical throughput: ~2 MB/s for regular processing on modern hardware.
|
|
418
|
+
|
|
382
419
|
## License
|
|
383
420
|
|
|
384
421
|
This library uses the MIT license.
|
|
@@ -24,6 +24,28 @@ Python 3.9+.
|
|
|
24
24
|
pip install html-to-markdown
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
+
### Optional lxml Parser
|
|
28
|
+
|
|
29
|
+
For improved performance, you can install with the optional lxml parser:
|
|
30
|
+
|
|
31
|
+
```shell
|
|
32
|
+
pip install html-to-markdown[lxml]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
The lxml parser offers:
|
|
36
|
+
|
|
37
|
+
- **~30% faster HTML parsing** compared to the default html.parser
|
|
38
|
+
- Better handling of malformed HTML
|
|
39
|
+
- More robust parsing for complex documents
|
|
40
|
+
|
|
41
|
+
Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
|
|
45
|
+
result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
|
|
46
|
+
result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
|
|
47
|
+
```
|
|
48
|
+
|
|
27
49
|
## Quick Start
|
|
28
50
|
|
|
29
51
|
Convert HTML to Markdown with a single function call:
|
|
@@ -144,18 +166,19 @@ Custom converters take precedence over the built-in converters and can be used a
|
|
|
144
166
|
|
|
145
167
|
### Key Configuration Options
|
|
146
168
|
|
|
147
|
-
| Option | Type | Default | Description |
|
|
148
|
-
| ------------------- | ---- | ---------------- | ------------------------------------------------------ |
|
|
149
|
-
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
150
|
-
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
151
|
-
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
152
|
-
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
153
|
-
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
154
|
-
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
155
|
-
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
156
|
-
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
157
|
-
| `wrap` | bool | `False` | Enable text wrapping |
|
|
158
|
-
| `wrap_width` | int | `80` | Text wrap width |
|
|
169
|
+
| Option | Type | Default | Description |
|
|
170
|
+
| ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
|
|
171
|
+
| `extract_metadata` | bool | `True` | Extract document metadata as comment header |
|
|
172
|
+
| `convert_as_inline` | bool | `False` | Treat content as inline elements only |
|
|
173
|
+
| `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
|
|
174
|
+
| `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
|
|
175
|
+
| `stream_processing` | bool | `False` | Enable streaming for large documents |
|
|
176
|
+
| `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
|
|
177
|
+
| `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
|
|
178
|
+
| `bullets` | str | `'*+-'` | Characters to use for bullet points |
|
|
179
|
+
| `escape_asterisks` | bool | `True` | Escape * characters |
|
|
180
|
+
| `wrap` | bool | `False` | Enable text wrapping |
|
|
181
|
+
| `wrap_width` | int | `80` | Text wrap width |
|
|
159
182
|
|
|
160
183
|
For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
|
|
161
184
|
|
|
@@ -343,6 +366,17 @@ uv run python -m html_to_markdown input.html
|
|
|
343
366
|
uv build
|
|
344
367
|
```
|
|
345
368
|
|
|
369
|
+
## Performance
|
|
370
|
+
|
|
371
|
+
The library is optimized for performance with several key features:
|
|
372
|
+
|
|
373
|
+
- **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
|
|
374
|
+
- **Streaming support**: Process large documents in chunks to minimize memory usage
|
|
375
|
+
- **Optional lxml parser**: ~30% faster parsing for complex HTML documents
|
|
376
|
+
- **Optimized string operations**: Minimizes string concatenations in hot paths
|
|
377
|
+
|
|
378
|
+
Typical throughput: ~2 MB/s for regular processing on modern hardware.
|
|
379
|
+
|
|
346
380
|
## License
|
|
347
381
|
|
|
348
382
|
This library uses the MIT license.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from html_to_markdown.exceptions import (
|
|
2
|
+
ConflictingOptionsError,
|
|
3
|
+
EmptyHtmlError,
|
|
4
|
+
HtmlToMarkdownError,
|
|
5
|
+
InvalidParserError,
|
|
6
|
+
MissingDependencyError,
|
|
7
|
+
)
|
|
8
|
+
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
|
|
9
|
+
from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
|
|
10
|
+
|
|
11
|
+
markdownify = convert_to_markdown
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ConflictingOptionsError",
|
|
15
|
+
"EmptyHtmlError",
|
|
16
|
+
"HtmlToMarkdownError",
|
|
17
|
+
"InvalidParserError",
|
|
18
|
+
"MissingDependencyError",
|
|
19
|
+
"convert_to_markdown",
|
|
20
|
+
"convert_to_markdown_stream",
|
|
21
|
+
"create_preprocessor",
|
|
22
|
+
"markdownify",
|
|
23
|
+
"preprocess_html",
|
|
24
|
+
]
|
|
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
|
|
|
191
191
|
|
|
192
192
|
args = parser.parse_args(argv)
|
|
193
193
|
|
|
194
|
-
# Prepare base arguments
|
|
195
194
|
base_args = {
|
|
196
195
|
"strip": args.strip,
|
|
197
196
|
"convert": args.convert,
|
|
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
|
|
|
216
215
|
"highlight_style": args.highlight_style,
|
|
217
216
|
}
|
|
218
217
|
|
|
219
|
-
# Add streaming parameters only if streaming is enabled
|
|
220
218
|
if args.stream_processing:
|
|
221
219
|
base_args["stream_processing"] = True
|
|
222
220
|
base_args["chunk_size"] = args.chunk_size
|
|
223
221
|
|
|
224
|
-
# Progress callback for CLI
|
|
225
222
|
if args.show_progress:
|
|
226
223
|
|
|
227
224
|
def progress_callback(processed: int, total: int) -> None:
|
|
228
225
|
if total > 0:
|
|
229
226
|
percent = (processed / total) * 100
|
|
230
|
-
|
|
227
|
+
|
|
231
228
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
232
229
|
sys.stderr.flush()
|
|
233
230
|
|