html-to-markdown 2.6.3__cp310-abi3-macosx_11_0_arm64.whl → 2.14.2__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,73 +0,0 @@
1
- class ConversionOptions:
2
- heading_style: str
3
- list_indent_type: str
4
- list_indent_width: int
5
- bullets: str
6
- strong_em_symbol: str
7
- escape_asterisks: bool
8
- escape_underscores: bool
9
- escape_misc: bool
10
- code_language: str
11
- autolinks: bool
12
- default_title: bool
13
- br_in_tables: bool
14
- hocr_spatial_tables: bool
15
- highlight_style: str
16
- extract_metadata: bool
17
- whitespace_mode: str
18
- strip_newlines: bool
19
- wrap: bool
20
- wrap_width: int
21
- convert_as_inline: bool
22
- sub_symbol: str
23
- sup_symbol: str
24
- newline_style: str
25
- keep_inline_images_in: list[str]
26
- preprocessing: PreprocessingOptions
27
- encoding: str
28
-
29
- def __init__(
30
- self,
31
- heading_style: str = "underlined",
32
- list_indent_type: str = "spaces",
33
- list_indent_width: int = 4,
34
- bullets: str = "*+-",
35
- strong_em_symbol: str = "*",
36
- escape_asterisks: bool = True,
37
- escape_underscores: bool = True,
38
- escape_misc: bool = True,
39
- code_language: str = "",
40
- autolinks: bool = True,
41
- default_title: bool = False,
42
- br_in_tables: bool = False,
43
- hocr_spatial_tables: bool = True,
44
- highlight_style: str = "double-equal",
45
- extract_metadata: bool = True,
46
- whitespace_mode: str = "normalized",
47
- strip_newlines: bool = False,
48
- wrap: bool = False,
49
- wrap_width: int = 80,
50
- convert_as_inline: bool = False,
51
- sub_symbol: str = "",
52
- sup_symbol: str = "",
53
- newline_style: str = "spaces",
54
- keep_inline_images_in: list[str] | None = None,
55
- preprocessing: PreprocessingOptions | None = None,
56
- encoding: str = "utf-8",
57
- ) -> None: ...
58
-
59
- class PreprocessingOptions:
60
- enabled: bool
61
- preset: str
62
- remove_navigation: bool
63
- remove_forms: bool
64
-
65
- def __init__(
66
- self,
67
- enabled: bool = False,
68
- preset: str = "standard",
69
- remove_navigation: bool = True,
70
- remove_forms: bool = True,
71
- ) -> None: ...
72
-
73
- def convert(html: str, options: ConversionOptions | None = None) -> str: ...
@@ -1,242 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: html-to-markdown
3
- Version: 2.6.3
4
- Classifier: Development Status :: 5 - Production/Stable
5
- Classifier: Environment :: Console
6
- Classifier: Intended Audience :: Developers
7
- Classifier: License :: OSI Approved :: MIT License
8
- Classifier: Operating System :: OS Independent
9
- Classifier: Programming Language :: Python :: 3 :: Only
10
- Classifier: Programming Language :: Python :: 3.10
11
- Classifier: Programming Language :: Python :: 3.11
12
- Classifier: Programming Language :: Python :: 3.12
13
- Classifier: Programming Language :: Python :: 3.13
14
- Classifier: Programming Language :: Python :: 3.14
15
- Classifier: Programming Language :: Rust
16
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
- Classifier: Topic :: Text Processing
18
- Classifier: Topic :: Text Processing :: Markup
19
- Classifier: Topic :: Text Processing :: Markup :: HTML
20
- Classifier: Topic :: Text Processing :: Markup :: Markdown
21
- Classifier: Typing :: Typed
22
- License-File: LICENSE
23
- Summary: High-performance HTML to Markdown converter powered by Rust with a clean Python API
24
- Keywords: cli-tool,converter,html,html2markdown,html5,markdown,markup,parser,rust,text-processing
25
- Home-Page: https://github.com/Goldziher/html-to-markdown
26
- Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
27
- Requires-Python: >=3.10
28
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
29
- Project-URL: Changelog, https://github.com/Goldziher/html-to-markdown/releases
30
- Project-URL: Homepage, https://github.com/Goldziher/html-to-markdown
31
- Project-URL: Issues, https://github.com/Goldziher/html-to-markdown/issues
32
- Project-URL: Repository, https://github.com/Goldziher/html-to-markdown.git
33
-
34
- # html-to-markdown
35
-
36
- High-performance HTML to Markdown converter with a clean Python API (powered by a Rust core). The same engine also drives the Node.js, Ruby, and WebAssembly bindings, so rendered Markdown stays identical across runtimes. Wheels are published for Linux, macOS, and Windows.
37
-
38
- [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
39
- [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
40
- [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
41
- [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
42
- [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
43
- [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
44
- [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
45
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
46
-
47
- ## Installation
48
-
49
- ```bash
50
- pip install html-to-markdown
51
- ```
52
-
53
- ## Performance Snapshot
54
-
55
- Apple M4 • Real Wikipedia documents • `convert()` (Python)
56
-
57
- | Document | Size | Latency | Throughput | Docs/sec |
58
- | ------------------- | ----- | ------- | ---------- | -------- |
59
- | Lists (Timeline) | 129KB | 0.62ms | 208 MB/s | 1,613 |
60
- | Tables (Countries) | 360KB | 2.02ms | 178 MB/s | 495 |
61
- | Mixed (Python wiki) | 656KB | 4.56ms | 144 MB/s | 219 |
62
-
63
- > V1 averaged ~2.5 MB/s (Python/BeautifulSoup). V2's Rust engine delivers 60–80× higher throughput.
64
-
65
- ## Quick Start
66
-
67
- ```python
68
- from html_to_markdown import convert
69
-
70
- html = """
71
- <h1>Welcome</h1>
72
- <p>This is <strong>fast</strong> Rust-powered conversion!</p>
73
- <ul>
74
- <li>Blazing fast</li>
75
- <li>Type safe</li>
76
- <li>Easy to use</li>
77
- </ul>
78
- """
79
-
80
- markdown = convert(html)
81
- print(markdown)
82
- ```
83
-
84
- ## Configuration (v2 API)
85
-
86
- ```python
87
- from html_to_markdown import ConversionOptions, convert
88
-
89
- options = ConversionOptions(
90
- heading_style="atx",
91
- list_indent_width=2,
92
- bullets="*+-",
93
- )
94
- options.escape_asterisks = True
95
- options.code_language = "python"
96
- options.extract_metadata = True
97
-
98
- markdown = convert(html, options)
99
- ```
100
-
101
- ### HTML Preprocessing
102
-
103
- ```python
104
- from html_to_markdown import ConversionOptions, PreprocessingOptions, convert
105
-
106
- options = ConversionOptions(
107
- preprocessing=PreprocessingOptions(enabled=True, preset="aggressive"),
108
- )
109
-
110
- markdown = convert(scraped_html, options)
111
- ```
112
-
113
- ### Inline Image Extraction
114
-
115
- ```python
116
- from html_to_markdown import InlineImageConfig, convert_with_inline_images
117
-
118
- markdown, inline_images, warnings = convert_with_inline_images(
119
- '<p><img src="data:image/png;base64,...==" alt="Pixel" width="1" height="1"></p>',
120
- image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
121
- )
122
-
123
- if inline_images:
124
- first = inline_images[0]
125
- print(first["format"], first["dimensions"], first["attributes"]) # e.g. "png", (1, 1), {"width": "1"}
126
- ```
127
-
128
- Each inline image is returned as a typed dictionary (`bytes` payload, metadata, and relevant HTML attributes). Warnings are human-readable skip reasons.
129
-
130
- ### hOCR (HTML OCR) Support
131
-
132
- ```python
133
- from html_to_markdown import ConversionOptions, convert
134
-
135
- # Default: emit structured Markdown directly
136
- markdown = convert(hocr_html)
137
-
138
- # hOCR documents are detected automatically; tables are reconstructed without extra configuration.
139
- markdown = convert(hocr_html)
140
- ```
141
-
142
- ## CLI (same engine)
143
-
144
- ```bash
145
- pipx install html-to-markdown # or: pip install html-to-markdown
146
-
147
- html-to-markdown page.html > page.md
148
- cat page.html | html-to-markdown --heading-style atx > page.md
149
- ```
150
-
151
- ## API Surface
152
-
153
- ### `ConversionOptions`
154
-
155
- Key fields (see docstring for full matrix):
156
-
157
- - `heading_style`: `"underlined" | "atx" | "atx_closed"`
158
- - `list_indent_width`: spaces per indent level (default 2)
159
- - `bullets`: cycle of bullet characters (`"*+-"`)
160
- - `strong_em_symbol`: `"*"` or `"_"`
161
- - `code_language`: default fenced code block language
162
- - `wrap`, `wrap_width`: wrap Markdown output
163
- - `strip_tags`: remove specific HTML tags
164
- - `preprocessing`: `PreprocessingOptions`
165
- - `encoding`: input character encoding (informational)
166
-
167
- ### `PreprocessingOptions`
168
-
169
- - `enabled`: enable HTML sanitisation (default: `True` since v2.4.2 for robust malformed HTML handling)
170
- - `preset`: `"minimal" | "standard" | "aggressive"` (default: `"standard"`)
171
- - `remove_navigation`: remove navigation elements (default: `True`)
172
- - `remove_forms`: remove form elements (default: `True`)
173
-
174
- **Note:** As of v2.4.2, preprocessing is enabled by default to ensure robust handling of malformed HTML (e.g., bare angle brackets like `1<2` in content). Set `enabled=False` if you need minimal preprocessing.
175
-
176
- ### `InlineImageConfig`
177
-
178
- - `max_decoded_size_bytes`: reject larger payloads
179
- - `filename_prefix`: generated name prefix (`embedded_image` default)
180
- - `capture_svg`: collect inline `<svg>` (default `True`)
181
- - `infer_dimensions`: decode raster images to obtain dimensions (default `False`)
182
-
183
- ## Performance: V2 vs V1 Compatibility Layer
184
-
185
- ### ⚠️ Important: Always Use V2 API
186
-
187
- The v2 API (`convert()`) is **strongly recommended** for all code. The v1 compatibility layer adds significant overhead and should only be used for gradual migration:
188
-
189
- ```python
190
- # ✅ RECOMMENDED - V2 Direct API (Fast)
191
- from html_to_markdown import convert, ConversionOptions
192
-
193
- markdown = convert(html) # Simple conversion - FAST
194
- markdown = convert(html, ConversionOptions(heading_style="atx")) # With options - FAST
195
-
196
- # ❌ AVOID - V1 Compatibility Layer (Slow)
197
- from html_to_markdown import convert_to_markdown
198
-
199
- markdown = convert_to_markdown(html, heading_style="atx") # Adds 77% overhead
200
- ```
201
-
202
- ### Performance Comparison
203
-
204
- Benchmarked on an Apple M4 with a 25-paragraph HTML document:
205
-
206
- | API | ops/sec | Relative Performance | Recommendation |
207
- | ------------------------ | ---------------- | -------------------- | ------------------- |
208
- | **V2 API** (`convert()`) | **129,822** | baseline | ✅ **Use this** |
209
- | **V1 Compat Layer** | **67,673** | **77% slower** | ⚠️ Migration only |
210
- | **CLI** | **150-210 MB/s** | Fastest | ✅ Batch processing |
211
-
212
- The v1 compatibility layer creates extra Python objects and performs additional conversions, significantly impacting performance.
213
-
214
- ### When to Use Each
215
-
216
- - **V2 API (`convert()`)**: All new code, production systems, performance-critical applications ← **Use this**
217
- - **V1 Compat (`convert_to_markdown()`)**: Only for gradual migration from legacy codebases
218
- - **CLI (`html-to-markdown`)**: Batch processing, shell scripts, maximum throughput
219
-
220
- ## v1 Compatibility
221
-
222
- A compatibility layer is provided to ease migration from v1.x:
223
-
224
- - **Compat shim**: `html_to_markdown.v1_compat` exposes `convert_to_markdown`, `convert_to_markdown_stream`, and `markdownify`. Keyword mappings are listed in the [changelog](CHANGELOG.md#v200).
225
- - **⚠️ Performance warning**: These compatibility functions add 77% overhead. Migrate to the v2 API as soon as possible.
226
- - **CLI**: The Rust CLI replaces the old Python script. New flags are documented via `html-to-markdown --help`.
227
- - **Removed options**: `code_language_callback`, `strip`, and the streaming APIs are no longer available; use `ConversionOptions`, `PreprocessingOptions`, and the inline-image helpers instead.
228
-
229
- ## Links
230
-
231
- - GitHub: [https://github.com/Goldziher/html-to-markdown](https://github.com/Goldziher/html-to-markdown)
232
- - Discord: [https://discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
233
- - Kreuzberg ecosystem: [https://kreuzberg.dev](https://kreuzberg.dev)
234
-
235
- ## License
236
-
237
- MIT License – see [LICENSE](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE).
238
-
239
- ## Support
240
-
241
- If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/Goldziher).
242
-
@@ -1,17 +0,0 @@
1
- html_to_markdown-2.6.3.dist-info/RECORD,,
2
- html_to_markdown-2.6.3.dist-info/WHEEL,sha256=HtAbUhtjhH1WdiDuIy2CapdoAiKCwe6bij_Tlxr1lEg,131
3
- html_to_markdown-2.6.3.dist-info/METADATA,sha256=2jZEsKKkOxN4Bn8ID11oyONT7_hvHm23pmVvpgyAVK0,9789
4
- html_to_markdown-2.6.3.dist-info/licenses/LICENSE,sha256=oQvPC-0UWvfg0WaeUBe11OJMtX60An-TW1ev_oaAA0k,1086
5
- html_to_markdown-2.6.3.data/scripts/html-to-markdown,sha256=bRYI80buxtoHkQ5BaaFLRnONic_sG0ODud44rqh7TX0,3884128
6
- html_to_markdown/options.py,sha256=vImRfeHAeyAy0Lnt6cTPHGbj7mTdw8AEUgo19u7MAA0,5080
7
- html_to_markdown/_html_to_markdown.abi3.so,sha256=5nhsd3DFA0t6C_UbeZrT99KNUSTkRwrBEsR6ySQV5eA,3701616
8
- html_to_markdown/__init__.py,sha256=bXngQAyZfyVClaa1YyVAUlgLYXcOFki_eaEpD42yuvM,1358
9
- html_to_markdown/api.py,sha256=uiNoieNXrcXTJI2_vV7ruDv9HKD7XFuosCAeqZL-C_Q,4944
10
- html_to_markdown/_rust.pyi,sha256=pi6C_qAdB81qUlC89Dy1ZKC1JrpdnqVce-caJx3ekPA,2098
11
- html_to_markdown/v1_compat.py,sha256=5DZA-fPMqZ5hYiA43rFaOAqshLS8MScbBnivDXuvQII,8034
12
- html_to_markdown/cli.py,sha256=Rn-s3FZPea1jgCJtDzH_TFvOEiA_uZFVfgjhr6xyL_g,64
13
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- html_to_markdown/exceptions.py,sha256=aTASOzbywgfqOYjlw18ZkOWSxKff4EbUbmMua_73TGA,2370
15
- html_to_markdown/cli_proxy.py,sha256=HPYKH5Mf5OUvkbEQISJvAkxrbjWKxE5GokA44HoQ6z8,3858
16
- html_to_markdown/__main__.py,sha256=3Ic_EbOt2h6W88q084pkz5IKU6iY5z_woBygH6u9aw0,327
17
- html_to_markdown/bin/html-to-markdown,sha256=bRYI80buxtoHkQ5BaaFLRnONic_sG0ODud44rqh7TX0,3884128