html-to-markdown 2.1.2__cp310-abi3-win_amd64.whl → 2.3.0__cp310-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -39,4 +39,4 @@ __all__ = [
39
39
  "markdownify",
40
40
  ]
41
41
 
42
- __version__ = "2.1.1"
42
+ __version__ = "2.3.0"
Binary file
@@ -11,6 +11,7 @@ class ConversionOptions:
11
11
  autolinks: bool
12
12
  default_title: bool
13
13
  br_in_tables: bool
14
+ hocr_spatial_tables: bool
14
15
  highlight_style: str
15
16
  extract_metadata: bool
16
17
  whitespace_mode: str
@@ -39,6 +40,7 @@ class ConversionOptions:
39
40
  autolinks: bool = True,
40
41
  default_title: bool = False,
41
42
  br_in_tables: bool = False,
43
+ hocr_spatial_tables: bool = True,
42
44
  highlight_style: str = "double-equal",
43
45
  extract_metadata: bool = True,
44
46
  whitespace_mode: str = "normalized",
html_to_markdown/api.py CHANGED
@@ -51,6 +51,7 @@ def convert(
51
51
  autolinks=options.autolinks,
52
52
  default_title=options.default_title,
53
53
  br_in_tables=options.br_in_tables,
54
+ hocr_spatial_tables=options.hocr_spatial_tables,
54
55
  highlight_style=options.highlight_style,
55
56
  extract_metadata=options.extract_metadata,
56
57
  whitespace_mode=options.whitespace_mode,
Binary file
@@ -97,7 +97,7 @@ def translate_v1_args_to_v2(argv: list[str]) -> list[str]:
97
97
 
98
98
 
99
99
  def main(argv: list[str]) -> str:
100
- """Main entry point for the CLI proxy.
100
+ """Execute the CLI proxy.
101
101
 
102
102
  Translates v1 arguments to v2 and invokes the native Rust CLI binary.
103
103
 
@@ -71,6 +71,9 @@ class ConversionOptions:
71
71
  br_in_tables: bool = False
72
72
  """Use <br> tags for line breaks in table cells instead of spaces."""
73
73
 
74
+ hocr_spatial_tables: bool = True
75
+ """Reconstruct tables in hOCR documents using spatial heuristics."""
76
+
74
77
  highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
75
78
  """Style for highlighting <mark> elements."""
76
79
 
@@ -120,8 +120,11 @@ def convert_to_markdown(
120
120
  if custom_converters is not None:
121
121
  raise NotImplementedError("custom_converters is not yet implemented in v2")
122
122
  if not hocr_extract_tables:
123
- raise NotImplementedError(
124
- "hocr_extract_tables toggle was removed in v2. hOCR tables are always reconstructed when detected."
123
+ warnings.warn(
124
+ "hocr_extract_tables is deprecated and will be removed in a future release. "
125
+ "Use ConversionOptions(hocr_spatial_tables=False) to disable spatial table reconstruction.",
126
+ DeprecationWarning,
127
+ stacklevel=2,
125
128
  )
126
129
  if hocr_table_column_threshold != 50 or hocr_table_row_threshold_ratio != 0.5:
127
130
  raise NotImplementedError(
@@ -146,6 +149,7 @@ def convert_to_markdown(
146
149
  autolinks=autolinks,
147
150
  default_title=default_title,
148
151
  br_in_tables=br_in_tables,
152
+ hocr_spatial_tables=hocr_extract_tables,
149
153
  highlight_style=highlight_style, # type: ignore[arg-type]
150
154
  extract_metadata=extract_metadata,
151
155
  whitespace_mode=whitespace_mode, # type: ignore[arg-type]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 2.1.2
3
+ Version: 2.3.0
4
4
  Classifier: Development Status :: 5 - Production/Stable
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -52,11 +52,11 @@ Apple M4 • Real Wikipedia documents • `convert()` (Python)
52
52
 
53
53
  | Document | Size | Latency | Throughput | Docs/sec |
54
54
  | ------------------- | ----- | ------- | ---------- | -------- |
55
- | Lists (Timeline) | 129KB | 0.62ms | 208MB/s | 1,613 |
56
- | Tables (Countries) | 360KB | 2.02ms | 178MB/s | 495 |
57
- | Mixed (Python wiki) | 656KB | 4.56ms | 144MB/s | 219 |
55
+ | Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
56
+ | Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
57
+ | Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
58
58
 
59
- > V1 averaged ~2.5MB/s (Python/BeautifulSoup). V2s Rust engine delivers 60–80× higher throughput.
59
+ > V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
60
60
 
61
61
  ## Quick Start
62
62
 
@@ -173,11 +173,50 @@ Key fields (see docstring for full matrix):
173
173
  - `capture_svg`: collect inline `<svg>` (default `True`)
174
174
  - `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
175
175
 
176
+ ## Performance: V2 vs V1 Compatibility Layer
177
+
178
+ ### ⚠️ Important: Always Use V2 API
179
+
180
+ The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
181
+
182
+ ```python
183
+ # ✅ RECOMMENDED - V2 Direct API (Fast)
184
+ from html_to_markdown import convert, ConversionOptions
185
+
186
+ markdown = convert(html) # Simple conversion - FAST
187
+ markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
188
+
189
+ # ❌ AVOID - V1 Compatibility Layer (Slow)
190
+ from html_to_markdown import convert_to_markdown
191
+
192
+ markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
193
+ ```
194
+
195
+ ### Performance Comparison
196
+
197
+ Benchmarked on Apple M4 with a 25-paragraph HTML document:
198
+
199
+ | API | ops/sec | Relative Performance | Recommendation |
200
+ | ------------------------ | ---------------- | -------------------- | ------------------- |
201
+ | **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
202
+ | **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
203
+ | **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
204
+
205
+ The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
206
+
207
+ ### When to Use Each
208
+
209
+ - **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
210
+ - **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
211
+ - **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
212
+
176
213
  ## v1 Compatibility
177
214
 
178
- - **Performance**: V1 averaged ~2.5 MB/s; V2 sustains 150–210 MB/s with identical Markdown output.
179
- - **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify` to ease migration. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
180
- - **CLI**: The Rust CLI replaces the Python script. New flags are documented via `html-to-markdown --help`.
215
+ A compatibility layer is provided to ease migration from v1.x:
216
+
217
+ - **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
218
+ - **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to the v2 API as soon as possible.
219
+ - **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
181
220
  - **Removed options**: `code_language_callback`, `strip`, and streaming APIs were removed; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
182
221
 
183
222
  ## Links
@@ -0,0 +1,17 @@
1
+ html_to_markdown-2.3.0.data/scripts/html-to-markdown.exe,sha256=NoG5btr57ihL85-URLcreFRZTiAUfO7km_FiEUMa7xI,4469760
2
+ html_to_markdown-2.3.0.dist-info/METADATA,sha256=wcmX8lAc-dZZp8ETrpbXqu58Kf19n3fE6ubBPd0OfKU,9012
3
+ html_to_markdown-2.3.0.dist-info/WHEEL,sha256=4EDp_7DiFfWl1yYv5M4wSosAn5L_xgD1dyrQxQxfCx8,95
4
+ html_to_markdown-2.3.0.dist-info/licenses/LICENSE,sha256=QhKFMkQLa4mSUlOsyG9VElzC7GYbAKtiS_EwOCyH-b4,1107
5
+ html_to_markdown/__init__.py,sha256=KgR9V82EqdL5S7dzK_USOv6STjyhVRJubDYScHxOJS0,1191
6
+ html_to_markdown/__main__.py,sha256=5objj9lB7hhpSpZsDok5tv9o9yztVR63Ccww-pXsAyY,343
7
+ html_to_markdown/_html_to_markdown.pyd,sha256=Peez6o-WS_O6Cc3YqbxjJzRVw0mLY5HsbJyfeoYj7BY,4214784
8
+ html_to_markdown/_rust.pyi,sha256=JP8tvcjYDfFJeJkbLpQ4qeK-5jl0hzIVT3Sa0daTkyo,2171
9
+ html_to_markdown/api.py,sha256=U7-Tu8TaVa32vveCtiOhTwoEojklkDV2e-6ItAiP3d4,2858
10
+ html_to_markdown/bin/html-to-markdown.exe,sha256=NoG5btr57ihL85-URLcreFRZTiAUfO7km_FiEUMa7xI,4469760
11
+ html_to_markdown/cli.py,sha256=z59l8sF8wIRRzJtUd-tXgqiC0WTqkTjzl-df8Ey_oQ0,67
12
+ html_to_markdown/cli_proxy.py,sha256=JGOuINBI8OMYLxojXGz8DdzMHo8eqgdINstOZWrdw-8,3816
13
+ html_to_markdown/exceptions.py,sha256=31VqpPi4JLGv7lI2481Z4f2s5ejYmq97c3s-WFFkXVU,2443
14
+ html_to_markdown/options.py,sha256=jna7fx9bHhx8N7u5IYtMXganFFzdJSVVgLZW0tYk3GA,5054
15
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ html_to_markdown/v1_compat.py,sha256=aVt9cVTBfYcrS8EfBsrC6HQwWc3Kz9-65-LB9foN6Jk,8227
17
+ html_to_markdown-2.3.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- html_to_markdown-2.1.2.data/scripts/html-to-markdown.exe,sha256=SJCBlZp0uTo6_wzS7KEgXariZtABUVc64o5TVxOwVZo,4414976
2
- html_to_markdown-2.1.2.dist-info/METADATA,sha256=TgFH9djK4HzJF_vDFVZCm7EDXYscA4v9t31DuXCujIE,7233
3
- html_to_markdown-2.1.2.dist-info/WHEEL,sha256=4EDp_7DiFfWl1yYv5M4wSosAn5L_xgD1dyrQxQxfCx8,95
4
- html_to_markdown-2.1.2.dist-info/licenses/LICENSE,sha256=QhKFMkQLa4mSUlOsyG9VElzC7GYbAKtiS_EwOCyH-b4,1107
5
- html_to_markdown/__init__.py,sha256=3_Egcf46oNcEam7rc7zAHx8lfOj1eVNO1p0kErVf_fs,1191
6
- html_to_markdown/__main__.py,sha256=5objj9lB7hhpSpZsDok5tv9o9yztVR63Ccww-pXsAyY,343
7
- html_to_markdown/_html_to_markdown.pyd,sha256=ES7QEe9lTb2ZK3yvC2-vNHng__U7HB3CY5p2wJ0IuNQ,4159488
8
- html_to_markdown/_rust.pyi,sha256=SHrrT8opJd5kcRYycooR4AS9is5tr1beSGtpoUWqzNc,2097
9
- html_to_markdown/api.py,sha256=YQQuJoO1OQnXpuOLk8TbdQDTARcKYFbf_zSA44BeHCM,2800
10
- html_to_markdown/bin/html-to-markdown.exe,sha256=SJCBlZp0uTo6_wzS7KEgXariZtABUVc64o5TVxOwVZo,4414976
11
- html_to_markdown/cli.py,sha256=z59l8sF8wIRRzJtUd-tXgqiC0WTqkTjzl-df8Ey_oQ0,67
12
- html_to_markdown/cli_proxy.py,sha256=J2Qk9MnnkFKIroxc0wn79nzI0dXqXDDNEAF9o9hth9Y,3829
13
- html_to_markdown/exceptions.py,sha256=31VqpPi4JLGv7lI2481Z4f2s5ejYmq97c3s-WFFkXVU,2443
14
- html_to_markdown/options.py,sha256=ijjRBTwrbESbwmYTOXV_ZO1A1GAmOzzILiFoPeC-jZk,4940
15
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- html_to_markdown/v1_compat.py,sha256=Lb3pppLfVH9EyAYGbOfpcO3vYkof4SIYDMI-CBEbh-A,8045
17
- html_to_markdown-2.1.2.dist-info/RECORD,,