html-to-markdown 2.1.2__cp310-abi3-macosx_11_0_arm64.whl → 2.3.0__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__init__.py +1 -1
- html_to_markdown/_html_to_markdown.abi3.so +0 -0
- html_to_markdown/_rust.pyi +2 -0
- html_to_markdown/api.py +1 -0
- html_to_markdown/bin/html-to-markdown +0 -0
- html_to_markdown/cli_proxy.py +1 -1
- html_to_markdown/options.py +3 -0
- html_to_markdown/v1_compat.py +6 -2
- {html_to_markdown-2.1.2.data → html_to_markdown-2.3.0.data}/scripts/html-to-markdown +0 -0
- {html_to_markdown-2.1.2.dist-info → html_to_markdown-2.3.0.dist-info}/METADATA +47 -8
- html_to_markdown-2.3.0.dist-info/RECORD +17 -0
- html_to_markdown-2.1.2.dist-info/RECORD +0 -17
- {html_to_markdown-2.1.2.dist-info → html_to_markdown-2.3.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-2.1.2.dist-info → html_to_markdown-2.3.0.dist-info}/licenses/LICENSE +0 -0
html_to_markdown/__init__.py
CHANGED
|
Binary file
|
html_to_markdown/_rust.pyi
CHANGED
|
@@ -11,6 +11,7 @@ class ConversionOptions:
|
|
|
11
11
|
autolinks: bool
|
|
12
12
|
default_title: bool
|
|
13
13
|
br_in_tables: bool
|
|
14
|
+
hocr_spatial_tables: bool
|
|
14
15
|
highlight_style: str
|
|
15
16
|
extract_metadata: bool
|
|
16
17
|
whitespace_mode: str
|
|
@@ -39,6 +40,7 @@ class ConversionOptions:
|
|
|
39
40
|
autolinks: bool = True,
|
|
40
41
|
default_title: bool = False,
|
|
41
42
|
br_in_tables: bool = False,
|
|
43
|
+
hocr_spatial_tables: bool = True,
|
|
42
44
|
highlight_style: str = "double-equal",
|
|
43
45
|
extract_metadata: bool = True,
|
|
44
46
|
whitespace_mode: str = "normalized",
|
html_to_markdown/api.py
CHANGED
|
@@ -51,6 +51,7 @@ def convert(
|
|
|
51
51
|
autolinks=options.autolinks,
|
|
52
52
|
default_title=options.default_title,
|
|
53
53
|
br_in_tables=options.br_in_tables,
|
|
54
|
+
hocr_spatial_tables=options.hocr_spatial_tables,
|
|
54
55
|
highlight_style=options.highlight_style,
|
|
55
56
|
extract_metadata=options.extract_metadata,
|
|
56
57
|
whitespace_mode=options.whitespace_mode,
|
|
Binary file
|
html_to_markdown/cli_proxy.py
CHANGED
html_to_markdown/options.py
CHANGED
|
@@ -71,6 +71,9 @@ class ConversionOptions:
|
|
|
71
71
|
br_in_tables: bool = False
|
|
72
72
|
"""Use <br> tags for line breaks in table cells instead of spaces."""
|
|
73
73
|
|
|
74
|
+
hocr_spatial_tables: bool = True
|
|
75
|
+
"""Reconstruct tables in hOCR documents using spatial heuristics."""
|
|
76
|
+
|
|
74
77
|
highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
|
|
75
78
|
"""Style for highlighting <mark> elements."""
|
|
76
79
|
|
html_to_markdown/v1_compat.py
CHANGED
|
@@ -120,8 +120,11 @@ def convert_to_markdown(
|
|
|
120
120
|
if custom_converters is not None:
|
|
121
121
|
raise NotImplementedError("custom_converters is not yet implemented in v2")
|
|
122
122
|
if not hocr_extract_tables:
|
|
123
|
-
|
|
124
|
-
"hocr_extract_tables
|
|
123
|
+
warnings.warn(
|
|
124
|
+
"hocr_extract_tables is deprecated and will be removed in a future release. "
|
|
125
|
+
"Use ConversionOptions(hocr_spatial_tables=False) to disable spatial table reconstruction.",
|
|
126
|
+
DeprecationWarning,
|
|
127
|
+
stacklevel=2,
|
|
125
128
|
)
|
|
126
129
|
if hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5:
|
|
127
130
|
raise NotImplementedError(
|
|
@@ -146,6 +149,7 @@ def convert_to_markdown(
|
|
|
146
149
|
autolinks=autolinks,
|
|
147
150
|
default_title=default_title,
|
|
148
151
|
br_in_tables=br_in_tables,
|
|
152
|
+
hocr_spatial_tables=hocr_extract_tables,
|
|
149
153
|
highlight_style=highlight_style, # type: ignore[arg-type]
|
|
150
154
|
extract_metadata=extract_metadata,
|
|
151
155
|
whitespace_mode=whitespace_mode, # type: ignore[arg-type]
|
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Classifier: Development Status :: 5 - Production/Stable
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -52,11 +52,11 @@ Apple M4 • Real Wikipedia documents • `convert()` (Python)
|
|
|
52
52
|
|
|
53
53
|
| Document | Size | Latency | Throughput | Docs/sec |
|
|
54
54
|
| ------------------- | ----- | ------- | ---------- | -------- |
|
|
55
|
-
| Lists (Timeline) | 129KB | 0.62ms | 208
|
|
56
|
-
| Tables (Countries) | 360KB | 2.02ms | 178
|
|
57
|
-
| Mixed (Python wiki) | 656KB | 4.56ms | 144
|
|
55
|
+
| Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
|
|
56
|
+
| Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
|
|
57
|
+
| Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
|
|
58
58
|
|
|
59
|
-
> V1 averaged ~2.5
|
|
59
|
+
> V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
|
|
60
60
|
|
|
61
61
|
## Quick Start
|
|
62
62
|
|
|
@@ -173,11 +173,50 @@ Key fields (see docstring for full matrix):
|
|
|
173
173
|
- `capture_svg`: collect inline `<svg>` (default `True`)
|
|
174
174
|
- `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
|
|
175
175
|
|
|
176
|
+
## Performance: V2 vs V1 Compatibility Layer
|
|
177
|
+
|
|
178
|
+
### ⚠️ Important: Always Use V2 API
|
|
179
|
+
|
|
180
|
+
The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
# ✅ RECOMMENDED - V2 Direct API (Fast)
|
|
184
|
+
from html_to_markdown import convert, ConversionOptions
|
|
185
|
+
|
|
186
|
+
markdown = convert(html) # Simple conversion - FAST
|
|
187
|
+
markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
|
|
188
|
+
|
|
189
|
+
# ❌ AVOID - V1 Compatibility Layer (Slow)
|
|
190
|
+
from html_to_markdown import convert_to_markdown
|
|
191
|
+
|
|
192
|
+
markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Performance Comparison
|
|
196
|
+
|
|
197
|
+
Benchmarked on Apple M4 with 25-paragraph HTML document:
|
|
198
|
+
|
|
199
|
+
| API | ops/sec | Relative Performance | Recommendation |
|
|
200
|
+
| ------------------------ | ---------------- | -------------------- | ------------------- |
|
|
201
|
+
| **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
|
|
202
|
+
| **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
|
|
203
|
+
| **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
|
|
204
|
+
|
|
205
|
+
The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
|
|
206
|
+
|
|
207
|
+
### When to Use Each
|
|
208
|
+
|
|
209
|
+
- **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
|
|
210
|
+
- **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
|
|
211
|
+
- **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
|
|
212
|
+
|
|
176
213
|
## v1 Compatibility
|
|
177
214
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
- **
|
|
215
|
+
A compatibility layer is provided to ease migration from v1.x:
|
|
216
|
+
|
|
217
|
+
- **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
|
|
218
|
+
- **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to v2 API as soon as possible.
|
|
219
|
+
- **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
|
|
181
220
|
- **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
|
|
182
221
|
|
|
183
222
|
## Links
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown-2.3.0.data/scripts/html-to-markdown,sha256=Pq5ne6u45stGE7uabdTs-zMJ9iyVNur55DEqWpxEnqA,3817824
|
|
2
|
+
html_to_markdown-2.3.0.dist-info/RECORD,,
|
|
3
|
+
html_to_markdown-2.3.0.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
4
|
+
html_to_markdown-2.3.0.dist-info/METADATA,sha256=tt8NYBDARFjWP7kXkCDuiBOGqhMVNoC-h3yv93qbG5Y,8811
|
|
5
|
+
html_to_markdown-2.3.0.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
+
html_to_markdown/options.py,sha256=N1orEtzXiqgV88Y7eqRjw3ilxDi7N97FBkBa79hdfzo,4913
|
|
7
|
+
html_to_markdown/_html_to_markdown.abi3.so,sha256=yo-M4pugwmV5qNdpeq3Hm1So_4JLPilqkYjV1O9mWSQ,3668288
|
|
8
|
+
html_to_markdown/__init__.py,sha256=f-xST7KCaVMfqGiTcwAdWDRdYOOLmsXF3Ivgpn5MR70,1149
|
|
9
|
+
html_to_markdown/api.py,sha256=f0jhD003p8Kz5jHe0BdaKN8Uh9mvGDo2Sl9eNw06VAY,2784
|
|
10
|
+
html_to_markdown/_rust.pyi,sha256=pi6C_qAdB81qUlC89Dy1ZKC1JrpdnqVce-caJx3ekPA,2098
|
|
11
|
+
html_to_markdown/v1_compat.py,sha256=5DZA-fPMqZ5hYiA43rFaOAqshLS8MScbBnivDXuvQII,8034
|
|
12
|
+
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
+
html_to_markdown/cli_proxy.py,sha256=8Byrnok5-WkDWToaUeMcKi1xVr62PhZKygHymdrSfFE,3682
|
|
16
|
+
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
+
html_to_markdown/bin/html-to-markdown,sha256=Pq5ne6u45stGE7uabdTs-zMJ9iyVNur55DEqWpxEnqA,3817824
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown-2.1.2.data/scripts/html-to-markdown,sha256=7PFfHn91sQQL-AWpzh5gBRz0xVbizJBLYQs4izr24yc,3784640
|
|
2
|
-
html_to_markdown-2.1.2.dist-info/RECORD,,
|
|
3
|
-
html_to_markdown-2.1.2.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
|
|
4
|
-
html_to_markdown-2.1.2.dist-info/METADATA,sha256=W-yMCoN32dNA4ggXeZLY1RazbepXMmuE2yNVxkmUizQ,7071
|
|
5
|
-
html_to_markdown-2.1.2.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
|
|
6
|
-
html_to_markdown/options.py,sha256=oV-_GFEKuL3RBu27RD1AhVruEh-bNuY3c8ATRbzcos0,4802
|
|
7
|
-
html_to_markdown/_html_to_markdown.abi3.so,sha256=igb9iD5dR4jf2qpiaLG0IYvNz_gzQ7d0yx5ofdNK6Sg,3618704
|
|
8
|
-
html_to_markdown/__init__.py,sha256=-HSsEKPPjp08ksh9aZi3xwdTE9-kNvplMlG2npMPVuI,1149
|
|
9
|
-
html_to_markdown/api.py,sha256=HuM6RZg064VxrTvwcY-OmraS-hsGM9Bt1tIaM0_w7F8,2727
|
|
10
|
-
html_to_markdown/_rust.pyi,sha256=An3Wlvedlr_2XgzqmXulLi5AzMx3HTqOJWH11M5cgcY,2026
|
|
11
|
-
html_to_markdown/v1_compat.py,sha256=VQq1wv8OedkESpCFaUpaSUBh6vJNkByylVUbY6EPIZ8,7856
|
|
12
|
-
html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
|
|
13
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
|
|
15
|
-
html_to_markdown/cli_proxy.py,sha256=MbDRZdmQMCDI9cruy1vifc__FsjNPRdvBXKFU9GaAZE,3695
|
|
16
|
-
html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
|
|
17
|
-
html_to_markdown/bin/html-to-markdown,sha256=7PFfHn91sQQL-AWpzh5gBRz0xVbizJBLYQs4izr24yc,3784640
|
|
File without changes
|
|
File without changes
|