html-to-markdown 2.6.3__cp310-abi3-macosx_11_0_arm64.whl → 2.14.2__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html_to_markdown/__init__.py +14 -1
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/_html_to_markdown.pyi +196 -0
- html_to_markdown/api.py +83 -31
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/v1_compat.py +11 -13
- html_to_markdown-2.14.2.data/scripts/html-to-markdown +0 -0
- html_to_markdown-2.14.2.dist-info/METADATA +634 -0
- html_to_markdown-2.14.2.dist-info/RECORD +17 -0
- {html_to_markdown-2.6.3.dist-info → html_to_markdown-2.14.2.dist-info}/WHEEL +1 -1
- html_to_markdown/_rust.pyi +0 -73
- html_to_markdown-2.6.3.data/scripts/html-to-markdown +0 -0
- html_to_markdown-2.6.3.dist-info/METADATA +0 -242
- html_to_markdown-2.6.3.dist-info/RECORD +0 -17
- {html_to_markdown-2.6.3.dist-info → html_to_markdown-2.14.2.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/_rust.pyi
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
class ConversionOptions:
|
|
2
|
-
heading_style: str
|
|
3
|
-
list_indent_type: str
|
|
4
|
-
list_indent_width: int
|
|
5
|
-
bullets: str
|
|
6
|
-
strong_em_symbol: str
|
|
7
|
-
escape_asterisks: bool
|
|
8
|
-
escape_underscores: bool
|
|
9
|
-
escape_misc: bool
|
|
10
|
-
code_language: str
|
|
11
|
-
autolinks: bool
|
|
12
|
-
default_title: bool
|
|
13
|
-
br_in_tables: bool
|
|
14
|
-
hocr_spatial_tables: bool
|
|
15
|
-
highlight_style: str
|
|
16
|
-
extract_metadata: bool
|
|
17
|
-
whitespace_mode: str
|
|
18
|
-
strip_newlines: bool
|
|
19
|
-
wrap: bool
|
|
20
|
-
wrap_width: int
|
|
21
|
-
convert_as_inline: bool
|
|
22
|
-
sub_symbol: str
|
|
23
|
-
sup_symbol: str
|
|
24
|
-
newline_style: str
|
|
25
|
-
keep_inline_images_in: list[str]
|
|
26
|
-
preprocessing: PreprocessingOptions
|
|
27
|
-
encoding: str
|
|
28
|
-
|
|
29
|
-
def __init__(
|
|
30
|
-
self,
|
|
31
|
-
heading_style: str = "underlined",
|
|
32
|
-
list_indent_type: str = "spaces",
|
|
33
|
-
list_indent_width: int = 4,
|
|
34
|
-
bullets: str = "*+-",
|
|
35
|
-
strong_em_symbol: str = "*",
|
|
36
|
-
escape_asterisks: bool = True,
|
|
37
|
-
escape_underscores: bool = True,
|
|
38
|
-
escape_misc: bool = True,
|
|
39
|
-
code_language: str = "",
|
|
40
|
-
autolinks: bool = True,
|
|
41
|
-
default_title: bool = False,
|
|
42
|
-
br_in_tables: bool = False,
|
|
43
|
-
hocr_spatial_tables: bool = True,
|
|
44
|
-
highlight_style: str = "double-equal",
|
|
45
|
-
extract_metadata: bool = True,
|
|
46
|
-
whitespace_mode: str = "normalized",
|
|
47
|
-
strip_newlines: bool = False,
|
|
48
|
-
wrap: bool = False,
|
|
49
|
-
wrap_width: int = 80,
|
|
50
|
-
convert_as_inline: bool = False,
|
|
51
|
-
sub_symbol: str = "",
|
|
52
|
-
sup_symbol: str = "",
|
|
53
|
-
newline_style: str = "spaces",
|
|
54
|
-
keep_inline_images_in: list[str] | None = None,
|
|
55
|
-
preprocessing: PreprocessingOptions | None = None,
|
|
56
|
-
encoding: str = "utf-8",
|
|
57
|
-
) -> None: ...
|
|
58
|
-
|
|
59
|
-
class PreprocessingOptions:
|
|
60
|
-
enabled: bool
|
|
61
|
-
preset: str
|
|
62
|
-
remove_navigation: bool
|
|
63
|
-
remove_forms: bool
|
|
64
|
-
|
|
65
|
-
def __init__(
|
|
66
|
-
self,
|
|
67
|
-
enabled: bool = False,
|
|
68
|
-
preset: str = "standard",
|
|
69
|
-
remove_navigation: bool = True,
|
|
70
|
-
remove_forms: bool = True,
|
|
71
|
-
) -> None: ...
|
|
72
|
-
|
|
73
|
-
def convert(html: str, options: ConversionOptions | None = None) -> str: ...
|
|
Binary file
|
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: html-to-markdown
|
|
3
|
-
Version: 2.6.3
|
|
4
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
5
|
-
Classifier: Environment :: Console
|
|
6
|
-
Classifier: Intended Audience :: Developers
|
|
7
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
-
Classifier: Operating System :: OS Independent
|
|
9
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
-
Classifier: Programming Language :: Rust
|
|
16
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
-
Classifier: Topic :: Text Processing
|
|
18
|
-
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
-
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
-
Classifier: Typing :: Typed
|
|
22
|
-
License-File: LICENSE
|
|
23
|
-
Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
|
|
24
|
-
Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
|
|
25
|
-
Home-Page: https://github.com/Goldziher/html-to-markdown
|
|
26
|
-
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
27
|
-
Requires-Python: >=3.10
|
|
28
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
29
|
-
Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
|
|
30
|
-
Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
|
|
31
|
-
Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
|
|
32
|
-
Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
|
|
33
|
-
|
|
34
|
-
# html-to-markdown
|
|
35
|
-
|
|
36
|
-
High-performance HTML to Markdown converter with a clean Python API (powered by a Rust core). The same engine also drives the Node.js, Ruby, and WebAssembly bindings, so rendered Markdown stays identical across runtimes. Wheels are published for Linux, macOS, and Windows.
|
|
37
|
-
|
|
38
|
-
[crates.io: html-to-markdown-rs](https://crates.io/crates/html-to-markdown-rs)
|
|
39
|
-
[npm: html-to-markdown-node](https://www.npmjs.com/package/html-to-markdown-node)
|
|
40
|
-
[npm: html-to-markdown-wasm](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
41
|
-
[npm: html-to-markdown](https://www.npmjs.com/package/html-to-markdown)
|
|
42
|
-
[PyPI: html-to-markdown](https://pypi.org/project/html-to-markdown/)
|
|
43
|
-
[Packagist: goldziher/html-to-markdown](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
44
|
-
[RubyGems: html-to-markdown](https://rubygems.org/gems/html-to-markdown)
|
|
45
|
-
[License: MIT](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
46
|
-
|
|
47
|
-
## Installation
|
|
48
|
-
|
|
49
|
-
```bash
|
|
50
|
-
pip install html-to-markdown
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
## Performance Snapshot
|
|
54
|
-
|
|
55
|
-
Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
56
|
-
|
|
57
|
-
| Document | Size | Latency | Throughput | Docs/sec |
|
|
58
|
-
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
59
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
60
|
-
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
61
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
62
|
-
|
|
63
|
-
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
64
|
-
|
|
65
|
-
## Quick Start
|
|
66
|
-
|
|
67
|
-
```python
|
|
68
|
-
from html_to_markdown import convert
|
|
69
|
-
|
|
70
|
-
html = """
|
|
71
|
-
<h1>Welcome</h1>
|
|
72
|
-
<p>This is <strong>fast</strong> Rust-powered conversion!</p>
|
|
73
|
-
<ul>
|
|
74
|
-
<li>Blazing fast</li>
|
|
75
|
-
<li>Type safe</li>
|
|
76
|
-
<li>Easy to use</li>
|
|
77
|
-
</ul>
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
markdown = convert(html)
|
|
81
|
-
print(markdown)
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
## Configuration (v2 API)
|
|
85
|
-
|
|
86
|
-
```python
|
|
87
|
-
from html_to_markdown import ConversionOptions, convert
|
|
88
|
-
|
|
89
|
-
options = ConversionOptions(
|
|
90
|
-
heading_style="atx",
|
|
91
|
-
list_indent_width=2,
|
|
92
|
-
bullets="*+-",
|
|
93
|
-
)
|
|
94
|
-
options.escape_asterisks = True
|
|
95
|
-
options.code_language = "python"
|
|
96
|
-
options.extract_metadata = True
|
|
97
|
-
|
|
98
|
-
markdown = convert(html, options)
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
### HTML Preprocessing
|
|
102
|
-
|
|
103
|
-
```python
|
|
104
|
-
from html_to_markdown import ConversionOptions, PreprocessingOptions, convert
|
|
105
|
-
|
|
106
|
-
options = ConversionOptions(
|
|
107
|
-
preprocessing=PreprocessingOptions(enabled=True, preset="aggressive"),
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
markdown = convert(scraped_html, options)
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
### Inline Image Extraction
|
|
114
|
-
|
|
115
|
-
```python
|
|
116
|
-
from html_to_markdown import InlineImageConfig, convert_with_inline_images
|
|
117
|
-
|
|
118
|
-
markdown, inline_images, warnings = convert_with_inline_images(
|
|
119
|
-
'<p><img src="data:image/png;base64,...==" alt="Pixel" width="1" height="1"></p>',
|
|
120
|
-
image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
|
|
121
|
-
)
|
|
122
|
-
|
|
123
|
-
if inline_images:
|
|
124
|
-
first = inline_images[0]
|
|
125
|
-
print(first["format"], first["dimensions"], first["attributes"]) # e.g. "png", (1, 1), {"width": "1"}
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
Each inline image is returned as a typed dictionary (`bytes` payload, metadata, and relevant HTML attributes). Warnings are human-readable skip reasons.
|
|
129
|
-
|
|
130
|
-
### hOCR (HTML OCR) Support
|
|
131
|
-
|
|
132
|
-
```python
|
|
133
|
-
from html_to_markdown import ConversionOptions, convert
|
|
134
|
-
|
|
135
|
-
# Default: emit structured Markdown directly
|
|
136
|
-
markdown = convert(hocr_html)
|
|
137
|
-
|
|
138
|
-
# hOCR documents are detected automatically; tables are reconstructed without extra configuration.
|
|
139
|
-
markdown = convert(hocr_html)
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
## CLI (same engine)
|
|
143
|
-
|
|
144
|
-
```bash
|
|
145
|
-
pipx install html-to-markdown # or: pip install html-to-markdown
|
|
146
|
-
|
|
147
|
-
html-to-markdown page.html > page.md
|
|
148
|
-
cat page.html | html-to-markdown --heading-style atx > page.md
|
|
149
|
-
```
|
|
150
|
-
|
|
151
|
-
## API Surface
|
|
152
|
-
|
|
153
|
-
### `ConversionOptions`
|
|
154
|
-
|
|
155
|
-
Key fields (see docstring for full matrix):
|
|
156
|
-
|
|
157
|
-
- `heading_style`: `"underlined" | "atx" | "atx_closed"`
|
|
158
|
-
- `list_indent_width`: spaces per indent level (default 2)
|
|
159
|
-
- `bullets`: cycle of bullet characters (`"*+-"`)
|
|
160
|
-
- `strong_em_symbol`: `"*"` or `"_"`
|
|
161
|
-
- `code_language`: default fenced code block language
|
|
162
|
-
- `wrap`, `wrap_width`: wrap Markdown output
|
|
163
|
-
- `strip_tags`: remove specific HTML tags
|
|
164
|
-
- `preprocessing`: `PreprocessingOptions`
|
|
165
|
-
- `encoding`: input character encoding (informational)
|
|
166
|
-
|
|
167
|
-
### `PreprocessingOptions`
|
|
168
|
-
|
|
169
|
-
- `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
|
|
170
|
-
- `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
|
|
171
|
-
- `remove_navigation`: remove navigation elements (default: `True`)
|
|
172
|
-
- `remove_forms`: remove form elements (default: `True`)
|
|
173
|
-
|
|
174
|
-
**Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
|
|
175
|
-
|
|
176
|
-
### `InlineImageConfig`
|
|
177
|
-
|
|
178
|
-
- `max_decoded_size_bytes`: reject larger payloads
|
|
179
|
-
- `filename_prefix`: generated name prefix (`embedded_image` default)
|
|
180
|
-
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
181
|
-
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
182
|
-
|
|
183
|
-
## Performance: V2 vs V1 Compatibility Layer
|
|
184
|
-
|
|
185
|
-
### ⚠️ Important: Always Use V2 API
|
|
186
|
-
|
|
187
|
-
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
188
|
-
|
|
189
|
-
```python
|
|
190
|
-
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
191
|
-
from html_to_markdown import convert, ConversionOptions
|
|
192
|
-
|
|
193
|
-
markdown = convert(html) # Simple conversion - FAST
|
|
194
|
-
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
195
|
-
|
|
196
|
-
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
197
|
-
from html_to_markdown import convert_to_markdown
|
|
198
|
-
|
|
199
|
-
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
### Performance Comparison
|
|
203
|
-
|
|
204
|
-
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
205
|
-
|
|
206
|
-
| API | ops/sec | Relative Performance | Recommendation |
|
|
207
|
-
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
208
|
-
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
209
|
-
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
210
|
-
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
211
|
-
|
|
212
|
-
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
213
|
-
|
|
214
|
-
### When to Use Each
|
|
215
|
-
|
|
216
|
-
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
217
|
-
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
218
|
-
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
219
|
-
|
|
220
|
-
## v1 Compatibility
|
|
221
|
-
|
|
222
|
-
A compatibility layer is provided to ease migration from v1.x:
|
|
223
|
-
|
|
224
|
-
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
225
|
-
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
226
|
-
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
227
|
-
- **Removed options**: `code_language_callback`, `strip`, and the streaming APIs are no longer available; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
228
|
-
|
|
229
|
-
## Links
|
|
230
|
-
|
|
231
|
-
- GitHub: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
|
|
232
|
-
- Discord: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
|
233
|
-
- Kreuzberg ecosystem: [https://kreuzberg.dev](https://kreuzberg.dev)
|
|
234
|
-
|
|
235
|
-
## License
|
|
236
|
-
|
|
237
|
-
MIT License – see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE).
|
|
238
|
-
|
|
239
|
-
## Support
|
|
240
|
-
|
|
241
|
-
If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
|
|
242
|
-
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown-2.6.3.dist-info/RECORD,,
|
|
2
|
-
html_to_markdown-2.6.3.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
3
|
-
html_to_markdown-2.6.3.dist-info/METADATA,sha256=2jZEsKKkOxN4Bn8ID11oyONT7_hvHm23pmVvpgyAVK0,9789
|
|
4
|
-
html_to_markdown-2.6.3.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
5
|
-
html_to_markdown-2.6.3.data/scripts/html-to-markdown,sha256=bRYI80buxtoHkQ5BaaFLRnONic_sG0ODud44rqh7TX0,3884128
|
|
6
|
-
html_to_markdown/options.py,sha256=vImRfeHAeyAy0Lnt6cTPHGbj7mTdw8AEUgo19u7MAA0,5080
|
|
7
|
-
html_to_markdown/_html_to_markdown.abi3.so,sha256=5nhsd3DFA0t6C_UbeZrT99KNUSTkRwrBEsR6ySQV5eA,3701616
|
|
8
|
-
html_to_markdown/__init__.py,sha256=bXngQAyZfyVClaa1YyVAUlgLYXcOFki_eaEpD42yuvM,1358
|
|
9
|
-
html_to_markdown/api.py,sha256=uiNoieNXrcXTJI2_vV7ruDv9HKD7XFuosCAeqZL-C_Q,4944
|
|
10
|
-
html_to_markdown/_rust.pyi,sha256=pi6C_qAdB81qUlC89Dy1ZKC1JrpdnqVce-caJx3ekPA,2098
|
|
11
|
-
html_to_markdown/v1_compat.py,sha256=5DZA-fPMqZ5hYiA43rFaOAqshLS8MScbBnivDXuvQII,8034
|
|
12
|
-
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
-
html_to_markdown/cli_proxy.py,sha256=HPYKH5Mf5OUvkbEQISJvAkxrbjWKxE5GokA44HoQ6z8,3858
|
|
16
|
-
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
-
html_to_markdown/bin/html-to-markdown,sha256=bRYI80buxtoHkQ5BaaFLRnONic_sG0ODud44rqh7TX0,3884128
|
|
File without changes
|