html-to-markdown 1.5.0__tar.gz → 1.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown has been flagged as potentially problematic. See the package's registry page for more details.

Files changed (23)
  1. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/PKG-INFO +50 -13
  2. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/README.md +46 -12
  3. html_to_markdown-1.8.0/html_to_markdown/__init__.py +24 -0
  4. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/cli.py +1 -4
  5. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/converters.py +36 -92
  6. html_to_markdown-1.8.0/html_to_markdown/exceptions.py +49 -0
  7. html_to_markdown-1.8.0/html_to_markdown/preprocessor.py +407 -0
  8. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/processing.py +447 -210
  9. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/utils.py +12 -5
  10. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/PKG-INFO +50 -13
  11. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/SOURCES.txt +2 -0
  12. html_to_markdown-1.8.0/html_to_markdown.egg-info/requires.txt +5 -0
  13. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/pyproject.toml +3 -2
  14. html_to_markdown-1.5.0/html_to_markdown/__init__.py +0 -6
  15. html_to_markdown-1.5.0/html_to_markdown.egg-info/requires.txt +0 -1
  16. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/LICENSE +0 -0
  17. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/__main__.py +0 -0
  18. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/constants.py +0 -0
  19. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown/py.typed +0 -0
  20. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  21. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
  22. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  23. {html_to_markdown-1.5.0 → html_to_markdown-1.8.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.5.0
3
+ Version: 1.8.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -32,6 +32,9 @@ Requires-Python: >=3.9
32
32
  Description-Content-Type: text/markdown
33
33
  License-File: LICENSE
34
34
  Requires-Dist: beautifulsoup4>=4.13.4
35
+ Requires-Dist: nh3>=0.2.21
36
+ Provides-Extra: lxml
37
+ Requires-Dist: lxml>=5; extra == "lxml"
35
38
  Dynamic: license-file
36
39
 
37
40
  # html-to-markdown
@@ -60,6 +63,28 @@ Python 3.9+.
60
63
  pip install html-to-markdown
61
64
  ```
62
65
 
66
+ ### Optional lxml Parser
67
+
68
+ For improved performance, you can install with the optional lxml parser:
69
+
70
+ ```shell
71
+ pip install html-to-markdown[lxml]
72
+ ```
73
+
74
+ The lxml parser offers:
75
+
76
+ - **~30% faster HTML parsing** compared to the default html.parser
77
+ - Better handling of malformed HTML
78
+ - More robust parsing for complex documents
79
+
80
+ Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
81
+
82
+ ```python
83
+ result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
84
+ result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
85
+ result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
86
+ ```
87
+
63
88
  ## Quick Start
64
89
 
65
90
  Convert HTML to Markdown with a single function call:
@@ -180,18 +205,19 @@ Custom converters take precedence over the built-in converters and can be used a
180
205
 
181
206
  ### Key Configuration Options
182
207
 
183
- | Option | Type | Default | Description |
184
- | ------------------- | ---- | ---------------- | ------------------------------------------------------ |
185
- | `extract_metadata` | bool | `True` | Extract document metadata as comment header |
186
- | `convert_as_inline` | bool | `False` | Treat content as inline elements only |
187
- | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
188
- | `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
189
- | `stream_processing` | bool | `False` | Enable streaming for large documents |
190
- | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
191
- | `bullets` | str | `'*+-'` | Characters to use for bullet points |
192
- | `escape_asterisks` | bool | `True` | Escape * characters |
193
- | `wrap` | bool | `False` | Enable text wrapping |
194
- | `wrap_width` | int | `80` | Text wrap width |
208
+ | Option | Type | Default | Description |
209
+ | ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
210
+ | `extract_metadata` | bool | `True` | Extract document metadata as comment header |
211
+ | `convert_as_inline` | bool | `False` | Treat content as inline elements only |
212
+ | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
213
+ | `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
214
+ | `stream_processing` | bool | `False` | Enable streaming for large documents |
215
+ | `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
216
+ | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
217
+ | `bullets` | str | `'*+-'` | Characters to use for bullet points |
218
+ | `escape_asterisks` | bool | `True` | Escape * characters |
219
+ | `wrap` | bool | `False` | Enable text wrapping |
220
+ | `wrap_width` | int | `80` | Text wrap width |
195
221
 
196
222
  For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
197
223
 
@@ -379,6 +405,17 @@ uv run python -m html_to_markdown input.html
379
405
  uv build
380
406
  ```
381
407
 
408
+ ## Performance
409
+
410
+ The library is optimized for performance with several key features:
411
+
412
+ - **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
413
+ - **Streaming support**: Process large documents in chunks to minimize memory usage
414
+ - **Optional lxml parser**: ~30% faster parsing for complex HTML documents
415
+ - **Optimized string operations**: Minimizes string concatenations in hot paths
416
+
417
+ Typical throughput: ~2 MB/s for regular processing on modern hardware.
418
+
382
419
  ## License
383
420
 
384
421
  This library uses the MIT license.
@@ -24,6 +24,28 @@ Python 3.9+.
24
24
  pip install html-to-markdown
25
25
  ```
26
26
 
27
+ ### Optional lxml Parser
28
+
29
+ For improved performance, you can install with the optional lxml parser:
30
+
31
+ ```shell
32
+ pip install html-to-markdown[lxml]
33
+ ```
34
+
35
+ The lxml parser offers:
36
+
37
+ - **~30% faster HTML parsing** compared to the default html.parser
38
+ - Better handling of malformed HTML
39
+ - More robust parsing for complex documents
40
+
41
+ Once installed, lxml is automatically used by default for better performance. You can explicitly specify a parser if needed:
42
+
43
+ ```python
44
+ result = convert_to_markdown(html) # Auto-detects: uses lxml if available, otherwise html.parser
45
+ result = convert_to_markdown(html, parser="lxml") # Force lxml (requires installation)
46
+ result = convert_to_markdown(html, parser="html.parser") # Force built-in parser
47
+ ```
48
+
27
49
  ## Quick Start
28
50
 
29
51
  Convert HTML to Markdown with a single function call:
@@ -144,18 +166,19 @@ Custom converters take precedence over the built-in converters and can be used a
144
166
 
145
167
  ### Key Configuration Options
146
168
 
147
- | Option | Type | Default | Description |
148
- | ------------------- | ---- | ---------------- | ------------------------------------------------------ |
149
- | `extract_metadata` | bool | `True` | Extract document metadata as comment header |
150
- | `convert_as_inline` | bool | `False` | Treat content as inline elements only |
151
- | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
152
- | `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
153
- | `stream_processing` | bool | `False` | Enable streaming for large documents |
154
- | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
155
- | `bullets` | str | `'*+-'` | Characters to use for bullet points |
156
- | `escape_asterisks` | bool | `True` | Escape * characters |
157
- | `wrap` | bool | `False` | Enable text wrapping |
158
- | `wrap_width` | int | `80` | Text wrap width |
169
+ | Option | Type | Default | Description |
170
+ | ------------------- | ---- | ---------------- | --------------------------------------------------------------- |
171
+ | `extract_metadata` | bool | `True` | Extract document metadata as comment header |
172
+ | `convert_as_inline` | bool | `False` | Treat content as inline elements only |
173
+ | `heading_style` | str | `'underlined'` | Header style (`'underlined'`, `'atx'`, `'atx_closed'`) |
174
+ | `highlight_style` | str | `'double-equal'` | Highlight style (`'double-equal'`, `'html'`, `'bold'`) |
175
+ | `stream_processing` | bool | `False` | Enable streaming for large documents |
176
+ | `parser` | str | auto-detect | BeautifulSoup parser (auto-detects `'lxml'` or `'html.parser'`) |
177
+ | `autolinks` | bool | `True` | Auto-convert URLs to Markdown links |
178
+ | `bullets` | str | `'*+-'` | Characters to use for bullet points |
179
+ | `escape_asterisks` | bool | `True` | Escape * characters |
180
+ | `wrap` | bool | `False` | Enable text wrapping |
181
+ | `wrap_width` | int | `80` | Text wrap width |
159
182
 
160
183
  For a complete list of all 20+ options, see the [Configuration Reference](#configuration-reference) section below.
161
184
 
@@ -343,6 +366,17 @@ uv run python -m html_to_markdown input.html
343
366
  uv build
344
367
  ```
345
368
 
369
+ ## Performance
370
+
371
+ The library is optimized for performance with several key features:
372
+
373
+ - **Efficient ancestor caching**: Reduces repeated DOM traversals using context-aware caching
374
+ - **Streaming support**: Process large documents in chunks to minimize memory usage
375
+ - **Optional lxml parser**: ~30% faster parsing for complex HTML documents
376
+ - **Optimized string operations**: Minimizes string concatenations in hot paths
377
+
378
+ Typical throughput: ~2 MB/s for regular processing on modern hardware.
379
+
346
380
  ## License
347
381
 
348
382
  This library uses the MIT license.
@@ -0,0 +1,24 @@
1
+ from html_to_markdown.exceptions import (
2
+ ConflictingOptionsError,
3
+ EmptyHtmlError,
4
+ HtmlToMarkdownError,
5
+ InvalidParserError,
6
+ MissingDependencyError,
7
+ )
8
+ from html_to_markdown.preprocessor import create_preprocessor, preprocess_html
9
+ from html_to_markdown.processing import convert_to_markdown, convert_to_markdown_stream
10
+
11
+ markdownify = convert_to_markdown
12
+
13
+ __all__ = [
14
+ "ConflictingOptionsError",
15
+ "EmptyHtmlError",
16
+ "HtmlToMarkdownError",
17
+ "InvalidParserError",
18
+ "MissingDependencyError",
19
+ "convert_to_markdown",
20
+ "convert_to_markdown_stream",
21
+ "create_preprocessor",
22
+ "markdownify",
23
+ "preprocess_html",
24
+ ]
@@ -191,7 +191,6 @@ def main(argv: list[str]) -> str:
191
191
 
192
192
  args = parser.parse_args(argv)
193
193
 
194
- # Prepare base arguments
195
194
  base_args = {
196
195
  "strip": args.strip,
197
196
  "convert": args.convert,
@@ -216,18 +215,16 @@ def main(argv: list[str]) -> str:
216
215
  "highlight_style": args.highlight_style,
217
216
  }
218
217
 
219
- # Add streaming parameters only if streaming is enabled
220
218
  if args.stream_processing:
221
219
  base_args["stream_processing"] = True
222
220
  base_args["chunk_size"] = args.chunk_size
223
221
 
224
- # Progress callback for CLI
225
222
  if args.show_progress:
226
223
 
227
224
  def progress_callback(processed: int, total: int) -> None:
228
225
  if total > 0:
229
226
  percent = (processed / total) * 100
230
- # Use sys.stderr to avoid ruff T201 error for progress output
227
+
231
228
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
232
229
  sys.stderr.flush()
233
230