html-to-markdown 2.2.0__cp310-abi3-macosx_11_0_arm64.whl → 2.3.3__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +12 -2
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/api.py +95 -26
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/cli_proxy.py +1 -1
- {html_to_markdown-2.2.0.data → html_to_markdown-2.3.3.data}/scripts/html-to-markdown +0 -0
- {html_to_markdown-2.2.0.dist-info → html_to_markdown-2.3.3.dist-info}/METADATA +47 -8
- html_to_markdown-2.3.3.dist-info/RECORD +17 -0
- html_to_markdown-2.2.0.dist-info/RECORD +0 -17
- {html_to_markdown-2.2.0.dist-info → html_to_markdown-2.3.3.dist-info}/WHEEL +0 -0
- {html_to_markdown-2.2.0.dist-info → html_to_markdown-2.3.3.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/__init__.py
CHANGED
|
@@ -15,7 +15,13 @@ V1 API (backward compatibility):
|
|
|
15
15
|
markdown = convert_to_markdown(html, heading_style="atx")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
from html_to_markdown.api import convert
|
|
18
|
+
from html_to_markdown.api import (
|
|
19
|
+
InlineImage,
|
|
20
|
+
InlineImageConfig,
|
|
21
|
+
InlineImageWarning,
|
|
22
|
+
convert,
|
|
23
|
+
convert_with_inline_images,
|
|
24
|
+
)
|
|
19
25
|
from html_to_markdown.exceptions import (
|
|
20
26
|
ConflictingOptionsError,
|
|
21
27
|
EmptyHtmlError,
|
|
@@ -31,12 +37,16 @@ __all__ = [
|
|
|
31
37
|
"ConversionOptions",
|
|
32
38
|
"EmptyHtmlError",
|
|
33
39
|
"HtmlToMarkdownError",
|
|
40
|
+
"InlineImage",
|
|
41
|
+
"InlineImageConfig",
|
|
42
|
+
"InlineImageWarning",
|
|
34
43
|
"InvalidParserError",
|
|
35
44
|
"MissingDependencyError",
|
|
36
45
|
"PreprocessingOptions",
|
|
37
46
|
"convert",
|
|
38
47
|
"convert_to_markdown",
|
|
48
|
+
"convert_with_inline_images",
|
|
39
49
|
"markdownify",
|
|
40
50
|
]
|
|
41
51
|
|
|
42
|
-
__version__ = "2.2.0"
|
|
52
|
+
__version__ = "2.3.3"
|
|
Binary file
|
html_to_markdown/api.py
CHANGED
|
@@ -6,38 +6,52 @@ using the Rust backend for conversion.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
from typing import TYPE_CHECKING, Literal, TypedDict, cast
|
|
10
|
+
|
|
9
11
|
import html_to_markdown._html_to_markdown as _rust # type: ignore[import-not-found]
|
|
10
12
|
from html_to_markdown.options import ConversionOptions, PreprocessingOptions
|
|
11
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from html_to_markdown._html_to_markdown import InlineImageConfig
|
|
16
|
+
else:
|
|
17
|
+
InlineImageConfig = _rust.InlineImageConfig # type: ignore[misc, assignment]
|
|
12
18
|
|
|
13
|
-
def convert(
|
|
14
|
-
html: str,
|
|
15
|
-
options: ConversionOptions | None = None,
|
|
16
|
-
preprocessing: PreprocessingOptions | None = None,
|
|
17
|
-
) -> str:
|
|
18
|
-
"""Convert HTML to Markdown using the Rust backend.
|
|
19
19
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
20
|
+
class InlineImage(TypedDict):
|
|
21
|
+
"""Inline image extracted during conversion."""
|
|
22
|
+
|
|
23
|
+
data: bytes
|
|
24
|
+
format: str
|
|
25
|
+
filename: str | None
|
|
26
|
+
description: str | None
|
|
27
|
+
dimensions: tuple[int, int] | None
|
|
28
|
+
source: Literal["img_data_uri", "svg_element"]
|
|
29
|
+
attributes: dict[str, str]
|
|
24
30
|
|
|
25
|
-
Returns:
|
|
26
|
-
Converted Markdown string.
|
|
27
|
-
"""
|
|
28
|
-
if options is None:
|
|
29
|
-
options = ConversionOptions()
|
|
30
|
-
if preprocessing is None:
|
|
31
|
-
preprocessing = PreprocessingOptions()
|
|
32
31
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
class InlineImageWarning(TypedDict):
|
|
33
|
+
"""Warning produced during inline image extraction."""
|
|
34
|
+
|
|
35
|
+
index: int
|
|
36
|
+
message: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _to_rust_preprocessing(options: PreprocessingOptions) -> _rust.PreprocessingOptions:
|
|
40
|
+
"""Convert high-level preprocessing options to the Rust bindings."""
|
|
41
|
+
return _rust.PreprocessingOptions(
|
|
42
|
+
enabled=options.enabled,
|
|
43
|
+
preset=options.preset,
|
|
44
|
+
remove_navigation=options.remove_navigation,
|
|
45
|
+
remove_forms=options.remove_forms,
|
|
38
46
|
)
|
|
39
47
|
|
|
40
|
-
|
|
48
|
+
|
|
49
|
+
def _to_rust_options(
|
|
50
|
+
options: ConversionOptions,
|
|
51
|
+
preprocessing: PreprocessingOptions,
|
|
52
|
+
) -> _rust.ConversionOptions:
|
|
53
|
+
"""Convert high-level conversion options to the Rust bindings."""
|
|
54
|
+
return _rust.ConversionOptions(
|
|
41
55
|
heading_style=options.heading_style,
|
|
42
56
|
list_indent_type=options.list_indent_type,
|
|
43
57
|
list_indent_width=options.list_indent_width,
|
|
@@ -64,11 +78,66 @@ def convert(
|
|
|
64
78
|
newline_style=options.newline_style,
|
|
65
79
|
code_block_style=options.code_block_style,
|
|
66
80
|
keep_inline_images_in=list(options.keep_inline_images_in) if options.keep_inline_images_in else [],
|
|
67
|
-
preprocessing=
|
|
81
|
+
preprocessing=_to_rust_preprocessing(preprocessing),
|
|
68
82
|
encoding=options.encoding,
|
|
69
83
|
debug=options.debug,
|
|
70
84
|
strip_tags=list(options.strip_tags) if options.strip_tags else [],
|
|
71
85
|
)
|
|
72
86
|
|
|
73
|
-
|
|
74
|
-
|
|
87
|
+
|
|
88
|
+
def convert(
|
|
89
|
+
html: str,
|
|
90
|
+
options: ConversionOptions | None = None,
|
|
91
|
+
preprocessing: PreprocessingOptions | None = None,
|
|
92
|
+
) -> str:
|
|
93
|
+
"""Convert HTML to Markdown using the Rust backend.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
html: HTML string to convert.
|
|
97
|
+
options: Conversion configuration options (defaults to ConversionOptions()).
|
|
98
|
+
preprocessing: HTML preprocessing options (defaults to PreprocessingOptions()).
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
Converted Markdown string.
|
|
102
|
+
"""
|
|
103
|
+
if options is None:
|
|
104
|
+
options = ConversionOptions()
|
|
105
|
+
if preprocessing is None:
|
|
106
|
+
preprocessing = PreprocessingOptions()
|
|
107
|
+
|
|
108
|
+
rust_options = _to_rust_options(options, preprocessing)
|
|
109
|
+
return cast("str", _rust.convert(html, rust_options))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def convert_with_inline_images(
|
|
113
|
+
html: str,
|
|
114
|
+
options: ConversionOptions | None = None,
|
|
115
|
+
preprocessing: PreprocessingOptions | None = None,
|
|
116
|
+
image_config: InlineImageConfig | None = None,
|
|
117
|
+
) -> tuple[str, list[InlineImage], list[InlineImageWarning]]:
|
|
118
|
+
"""Convert HTML and extract inline images.
|
|
119
|
+
|
|
120
|
+
Returns Markdown along with extracted inline images and any warnings.
|
|
121
|
+
"""
|
|
122
|
+
if options is None:
|
|
123
|
+
options = ConversionOptions()
|
|
124
|
+
if preprocessing is None:
|
|
125
|
+
preprocessing = PreprocessingOptions()
|
|
126
|
+
if image_config is None:
|
|
127
|
+
image_config = InlineImageConfig()
|
|
128
|
+
|
|
129
|
+
rust_options = _to_rust_options(options, preprocessing)
|
|
130
|
+
markdown, images, warnings = cast(
|
|
131
|
+
"tuple[str, list[InlineImage], list[InlineImageWarning]]",
|
|
132
|
+
_rust.convert_with_inline_images(html, rust_options, image_config),
|
|
133
|
+
)
|
|
134
|
+
return markdown, list(images), list(warnings)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
__all__ = [
|
|
138
|
+
"InlineImage",
|
|
139
|
+
"InlineImageConfig",
|
|
140
|
+
"InlineImageWarning",
|
|
141
|
+
"convert",
|
|
142
|
+
"convert_with_inline_images",
|
|
143
|
+
]
|
|
Binary file
|
html_to_markdown/cli_proxy.py
CHANGED
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.3.3
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -52,11 +52,11 @@ Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
|
52
52
|
|
|
53
53
|
| Document | Size | Latency | Throughput | Docs/sec |
|
|
54
54
|
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
55
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208
|
|
56
|
-
| Tables (Countries) | 360KB | 2.02ms | 178
|
|
57
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144
|
|
55
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
56
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
57
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
58
58
|
|
|
59
|
-
> V1 averaged ~2.5
|
|
59
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
60
60
|
|
|
61
61
|
## Quick Start
|
|
62
62
|
|
|
@@ -173,11 +173,50 @@ Key fields (see docstring for full matrix):
|
|
|
173
173
|
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
174
174
|
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
175
175
|
|
|
176
|
+
## Performance: V2 vs V1 Compatibility Layer
|
|
177
|
+
|
|
178
|
+
### ⚠️ Important: Always Use V2 API
|
|
179
|
+
|
|
180
|
+
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
184
|
+
from html_to_markdown import convert, ConversionOptions
|
|
185
|
+
|
|
186
|
+
markdown = convert(html) # Simple conversion - FAST
|
|
187
|
+
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
188
|
+
|
|
189
|
+
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
190
|
+
from html_to_markdown import convert_to_markdown
|
|
191
|
+
|
|
192
|
+
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Performance Comparison
|
|
196
|
+
|
|
197
|
+
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
198
|
+
|
|
199
|
+
| API | ops/sec | Relative Performance | Recommendation |
|
|
200
|
+
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
201
|
+
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
202
|
+
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
203
|
+
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
204
|
+
|
|
205
|
+
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
206
|
+
|
|
207
|
+
### When to Use Each
|
|
208
|
+
|
|
209
|
+
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
210
|
+
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
211
|
+
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
212
|
+
|
|
176
213
|
## v1 Compatibility
|
|
177
214
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
- **
|
|
215
|
+
A compatibility layer is provided to ease migration from v1.x:
|
|
216
|
+
|
|
217
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
218
|
+
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
219
|
+
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
181
220
|
- **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
182
221
|
|
|
183
222
|
## Links
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown-2.3.3.dist-info/RECORD,,
|
|
2
|
+
html_to_markdown-2.3.3.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
3
|
+
html_to_markdown-2.3.3.dist-info/METADATA,sha256=oyNmsZG9JQEEaxycvu3C6Xlj7ZnuzUoYW_A3U2c7P4c,8811
|
|
4
|
+
html_to_markdown-2.3.3.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
5
|
+
html_to_markdown-2.3.3.data/scripts/html-to-markdown,sha256=Xba9k0_dGOyIhO3rVoVPjxZ3H1Mig9cwNGwSJ6tkcZA,3834352
|
|
6
|
+
html_to_markdown/options.py,sha256=N1orEtzXiqgV88Y7eqRjw3ilxDi7N97FBkBa79hdfzo,4913
|
|
7
|
+
html_to_markdown/_html_to_markdown.abi3.so,sha256=_4-WXBZHE5NikMeg37k9Eh5mXc0mD2pzk25_qLWwJls,3668304
|
|
8
|
+
html_to_markdown/__init__.py,sha256=QjyTIjeujelR5cVDnq8xE6e-ADD8pwijntK4wFEl4rE,1358
|
|
9
|
+
html_to_markdown/api.py,sha256=sXezV2iZb42s-za0hWWHFhihKjaMo6j_tx7e-0EyQPg,4860
|
|
10
|
+
html_to_markdown/_rust.pyi,sha256=pi6C_qAdB81qUlC89Dy1ZKC1JrpdnqVce-caJx3ekPA,2098
|
|
11
|
+
html_to_markdown/v1_compat.py,sha256=5DZA-fPMqZ5hYiA43rFaOAqshLS8MScbBnivDXuvQII,8034
|
|
12
|
+
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
+
html_to_markdown/cli_proxy.py,sha256=8Byrnok5-WkDWToaUeMcKi1xVr62PhZKygHymdrSfFE,3682
|
|
16
|
+
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
+
html_to_markdown/bin/html-to-markdown,sha256=Xba9k0_dGOyIhO3rVoVPjxZ3H1Mig9cwNGwSJ6tkcZA,3834352
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown-2.2.0.data/scripts/html-to-markdown,sha256=_jgp22NNhPsmx9MgSb11wrTWiYpwnzZ7qZtpdXrudls,3817824
|
|
2
|
-
html_to_markdown-2.2.0.dist-info/RECORD,,
|
|
3
|
-
html_to_markdown-2.2.0.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
4
|
-
html_to_markdown-2.2.0.dist-info/METADATA,sha256=v8jDHZrq6_Vuwry7Dn3e8r9QlAZDj3-8kU_mFrSA9og,7071
|
|
5
|
-
html_to_markdown-2.2.0.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
-
html_to_markdown/options.py,sha256=N1orEtzXiqgV88Y7eqRjw3ilxDi7N97FBkBa79hdfzo,4913
|
|
7
|
-
html_to_markdown/_html_to_markdown.abi3.so,sha256=xI0Wr26xhNoPGzlrlngo49vWvg61X_4d0xdMd9de8Rk,3668288
|
|
8
|
-
html_to_markdown/__init__.py,sha256=QZ314Edod-PD-v5CvRtJ-Lw7lqXqHP6o36UtyzUQrg8,1149
|
|
9
|
-
html_to_markdown/api.py,sha256=f0jhD003p8Kz5jHe0BdaKN8Uh9mvGDo2Sl9eNw06VAY,2784
|
|
10
|
-
html_to_markdown/_rust.pyi,sha256=pi6C_qAdB81qUlC89Dy1ZKC1JrpdnqVce-caJx3ekPA,2098
|
|
11
|
-
html_to_markdown/v1_compat.py,sha256=5DZA-fPMqZ5hYiA43rFaOAqshLS8MScbBnivDXuvQII,8034
|
|
12
|
-
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
-
html_to_markdown/cli_proxy.py,sha256=MbDRZdmQMCDI9cruy1vifc__FsjNPRdvBXKFU9GaAZE,3695
|
|
16
|
-
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
-
html_to_markdown/bin/html-to-markdown,sha256=_jgp22NNhPsmx9MgSb11wrTWiYpwnzZ7qZtpdXrudls,3817824
|
|
File without changes
|
|
File without changes
|